diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6f50aca0797cb2d3a2b61796d43137fb417f76ee..584debd75676962df90c73b941fa14d6ed1e641f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,7 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
 IF (CMAKE_VERSION VERSION_LESS 3.0)
   PROJECT(TDengine CXX)
   SET(PROJECT_VERSION_MAJOR "${LIB_MAJOR_VERSION}")
-  SET(PROJECT_VERSION_MINOR "${LIB_MINOR_VERSION}")
+  SET(PROJECT_VERSION_MINOR "${LIB_MINOR_VERSION}")
   SET(PROJECT_VERSION_PATCH "${LIB_PATCH_VERSION}")
   SET(PROJECT_VERSION "${LIB_VERSION_STRING}")
 ELSE ()
@@ -43,11 +43,14 @@ INCLUDE(cmake/version.inc)
 INCLUDE(cmake/install.inc)
 
 IF (CMAKE_SYSTEM_NAME MATCHES "Linux")
-  SET(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -pipe -Wall -Wshadow -Werror")
-  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pipe -Wall -Wshadow -Werror")
+  SET(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -pipe -Wall ")
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pipe -Wall")
 ENDIF ()
 MESSAGE(STATUS "CMAKE_C_FLAGS:   ${CMAKE_C_FLAGS}")
 MESSAGE(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+MESSAGE(STATUS "COMMON_FLAGS:    ${COMMON_FLAGS}")
+
+
 
 ADD_SUBDIRECTORY(deps)
 ADD_SUBDIRECTORY(src)
diff --git a/cmake/define.inc b/cmake/define.inc
index a15e0aecbb2d30ad2ec7aa1c5761c9d2a40f3323..26c12aeb7660f8364fd826fcd9a6c161c26964a0 100755
--- a/cmake/define.inc
+++ b/cmake/define.inc
@@ -57,7 +57,7 @@ IF (TD_LINUX_64)
   ADD_DEFINITIONS(-D_M_X64)
   ADD_DEFINITIONS(-D_TD_LINUX_64)
   MESSAGE(STATUS "linux64 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -fPIC -gdwarf-2 -msse4.2 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -fPIC -gdwarf-2 -msse4.2 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
   ADD_DEFINITIONS(-DUSE_LIBICONV)
 
   IF (JEMALLOC_ENABLED)
@@ -70,7 +70,7 @@ IF (TD_LINUX_32)
   ADD_DEFINITIONS(-D_TD_LINUX_32)
   ADD_DEFINITIONS(-DUSE_LIBICONV)
   MESSAGE(STATUS "linux32 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -fPIC -fsigned-char -munaligned-access -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -fPIC -fsigned-char -munaligned-access -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
 ENDIF ()
 
 IF (TD_ARM_64)
@@ -78,7 +78,7 @@ IF (TD_ARM_64)
   ADD_DEFINITIONS(-D_TD_ARM_)
   ADD_DEFINITIONS(-DUSE_LIBICONV)
   MESSAGE(STATUS "arm64 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -fPIC -fsigned-char -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -fPIC -fsigned-char -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
 ENDIF ()
 
 IF (TD_ARM_32)
@@ -86,7 +86,7 @@ IF (TD_ARM_32)
   ADD_DEFINITIONS(-D_TD_ARM_)
   ADD_DEFINITIONS(-DUSE_LIBICONV)
   MESSAGE(STATUS "arm32 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -fPIC -fsigned-char -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast -Wno-incompatible-pointer-types ")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -fPIC -fsigned-char -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast -Wno-incompatible-pointer-types ")
 ENDIF ()
 
 IF (TD_MIPS_64)
@@ -94,7 +94,7 @@ IF (TD_MIPS_64)
   ADD_DEFINITIONS(-D_TD_MIPS_64)
   ADD_DEFINITIONS(-DUSE_LIBICONV)
   MESSAGE(STATUS "mips64 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -fPIC -fsigned-char -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -fPIC -fsigned-char -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
 ENDIF ()
 
 IF (TD_MIPS_32)
@@ -102,7 +102,7 @@ IF (TD_MIPS_32)
   ADD_DEFINITIONS(-D_TD_MIPS_32)
   ADD_DEFINITIONS(-DUSE_LIBICONV)
   MESSAGE(STATUS "mips32 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -fPIC -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -fPIC -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
 ENDIF ()
 
 IF (TD_APLHINE)
@@ -147,7 +147,7 @@ IF (TD_DARWIN_64)
   ADD_DEFINITIONS(-D_REENTRANT -D__USE_POSIX -D_LIBC_REENTRANT)
   ADD_DEFINITIONS(-DUSE_LIBICONV)
   MESSAGE(STATUS "darwin64 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -Wno-missing-braces -fPIC -msse4.2 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -Wno-missing-braces -fPIC -msse4.2 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
   IF (TD_MEMORY_SANITIZER)
     SET(DEBUG_FLAGS "-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment -O0 -g3 -DDEBUG")
   ELSE ()
diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt
index 99152c6ce365768b3b782809cca5aacbec1ef7fd..c6d186dd1fc17d2a73b95ad64171a3d2070c3fba 100644
--- a/deps/CMakeLists.txt
+++ b/deps/CMakeLists.txt
@@ -10,6 +10,8 @@ ADD_SUBDIRECTORY(cJson)
 ADD_SUBDIRECTORY(wepoll)
 ADD_SUBDIRECTORY(MsvcLibX)
 ADD_SUBDIRECTORY(rmonotonic)
+ADD_SUBDIRECTORY(SZ)
+
 
 IF (TD_LINUX AND TD_MQTT)
   ADD_SUBDIRECTORY(MQTT-C)
diff --git a/deps/SZ/.dockerignore b/deps/SZ/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..a31aedb8b96ec0b35b11793897a2e845ec1e3bb9
--- /dev/null
+++ b/deps/SZ/.dockerignore
@@ -0,0 +1 @@
+build.*
diff --git a/deps/SZ/.gitignore b/deps/SZ/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..c20ca922a33b81c8ef63aa0e1d5af134be5cdea3
--- /dev/null
+++ b/deps/SZ/.gitignore
@@ -0,0 +1,6 @@
+build
+compile_commands.json
+tags
+CMakeCache.txt
+cmake-build-debug/
+CMakeFiles/
diff --git a/deps/SZ/.travis.yml b/deps/SZ/.travis.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0f058c74b20980adbeab1dbb87149b278957fdc7
--- /dev/null
+++ b/deps/SZ/.travis.yml
@@ -0,0 +1,45 @@
+sudo: false
+
+language: c
+
+before_install:
+- cd test/travis-ci && ./getData.sh && cd -
+
+matrix:
+  include:
+    - dist: xenial
+      os: linux
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test     # For gcc 4.9, 5 and 7
+          packages:
+            - gcc-7
+            - gfortran-7
+            - zstd
+            - libzstd1-dev
+            - exuberant-ctags
+            - libcunit1-dev 
+            - libnetcdf-dev 
+    - osx_image: xcode11
+      os: osx
+      env: PATH=/usr/local/bin:$PATH
+install:
+    - mkdir build
+    - cd build
+    - |
+      if [[ "${TRAVIS_OS_NAME}" != "linux" ]]; then
+        brew install ctags
+        brew install cunit
+        brew upgrade pkg-config
+      fi
+    - cmake -DCMAKE_INSTALL_PREFIX=$HOME -DBUILD_TESTS=ON -DBUILD_INTEGRATION_TESTS=ON ..
+    - make 
+    - make install
+    - make test
+
+script:
+- cd ..
+- ./configure && make
+- cd example && ./test.sh && cd -
+- cd test/travis-ci && ./test.sh && cd -
diff --git a/deps/SZ/CMakeLists.txt b/deps/SZ/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a265ad1728e0b8166a11a962657434cc4f0c7de7
--- /dev/null
+++ b/deps/SZ/CMakeLists.txt
@@ -0,0 +1,23 @@
+cmake_minimum_required(VERSION 2.8)
+project(TDengine)
+
+# Header search paths: SZ headers plus the zlib/ and zstd/ directories next to this file.
+include_directories(
+  sz/include
+  zlib/
+  zstd/)
+# Gather the source files of each component directory into SRC1..SRC8.
+aux_source_directory(sz/src           SRC1)
+aux_source_directory(zlib/            SRC2)
+aux_source_directory(zstd/common      SRC3)
+aux_source_directory(zstd/compress    SRC4)
+aux_source_directory(zstd/decompress  SRC5)
+aux_source_directory(zstd/deprecated  SRC6)
+aux_source_directory(zstd/legacy      SRC7)
+aux_source_directory(zstd/dictBuilder SRC8)
+
+# Combine all collected sources into one static library target named SZ.
+add_library(SZ STATIC ${SRC1} ${SRC2} ${SRC3} ${SRC4} ${SRC5} ${SRC6} ${SRC7} ${SRC8})
+
+
+
diff --git a/deps/SZ/sz/CMakeLists.txt b/deps/SZ/sz/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2c5a9f909922da8221ecaed37707c5fd90a8aaf0
--- /dev/null
+++ b/deps/SZ/sz/CMakeLists.txt
@@ -0,0 +1,91 @@
+add_library (SZ
+  src/ArithmeticCoding.c
+  src/ByteToolkit.c
+  src/CacheTable.c
+  src/callZlib.c
+  src/CompressElement.c
+  src/conf.c
+  src/dataCompression.c
+  src/dictionary.c
+  src/DynamicByteArray.c
+  src/DynamicDoubleArray.c
+  src/DynamicFloatArray.c
+  src/DynamicIntArray.c
+  src/Huffman.c
+  src/iniparser.c
+  src/MultiLevelCacheTable.c
+  src/MultiLevelCacheTableWideInterval.c
+  src/pastri.c
+  src/exafelSZ.c
+  src/rw.c
+  src/rwf.c
+  src/sz.c
+  src/szd_double.c
+  src/szd_double_pwr.c
+  src/szd_double_ts.c
+  src/szd_float.c
+  src/szd_float_pwr.c
+  src/szd_float_ts.c
+  src/szd_int16.c
+  src/szd_int32.c
+  src/szd_int64.c
+  src/szd_int8.c
+  src/sz_double.c
+  src/sz_double_pwr.c
+  src/sz_double_ts.c
+  src/szd_uint16.c
+  src/szd_uint32.c
+  src/szd_uint64.c
+  src/szd_uint8.c
+  src/szf.c
+  src/sz_float.c
+  src/sz_float_pwr.c
+  src/sz_float_ts.c
+  src/sz_int16.c
+  src/sz_int32.c
+  src/sz_int64.c
+  src/sz_int8.c
+  src/sz_omp.c
+  src/sz_uint16.c
+  src/sz_uint32.c
+  src/sz_uint64.c
+  src/sz_uint8.c
+  src/TightDataPointStorageD.c
+  src/TightDataPointStorageF.c
+  src/TightDataPointStorageI.c
+  src/TypeManager.c
+  src/utility.c
+  src/VarSet.c
+  src/sz_stats.c
+)
+
+target_include_directories(SZ 
+  PUBLIC 
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/sz>
+  )
+
+
+target_compile_options(SZ  # Debug-only warnings; genex is quoted so it stays a single argument
+	PRIVATE "$<$<CONFIG:Debug>:-Wall;-Wextra;-Wpedantic;-Wno-unused-parameter>"
+	)
+
+if(BUILD_PASTRI)
+  target_compile_definitions(SZ PUBLIC HAVE_PASTRI)
+endif()
+if(BUILD_TIMECMPR)
+  target_compile_definitions(SZ PUBLIC HAVE_TIMECMPR)
+endif()
+if(BUILD_RANDOMACCESS)
+  target_compile_definitions(SZ PUBLIC HAVE_RANDOMACCESS)
+endif()
+if(BUILD_FORTRAN)
+  enable_language(Fortran)
+  target_sources(SZ PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/rw_interface.F90
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/sz_interface.F90
+  )
+endif()
+if(BUILD_STATS)
+  target_compile_definitions(SZ PUBLIC HAVE_WRITESTATS)
+endif()
diff --git a/deps/SZ/sz/Makefile.am b/deps/SZ/sz/Makefile.am
new file mode 100644
index 0000000000000000000000000000000000000000..fa784116589529632fb9e8f1f8bd1e68dd64a222
--- /dev/null
+++ b/deps/SZ/sz/Makefile.am
@@ -0,0 +1,93 @@
+#AM_CFLAGS = -I./include -I../zlib
+#LDFLAGS=-fPIC -shared
+
+AUTOMAKE_OPTIONS=foreign
+if FORTRAN
+include_HEADERS=include/MultiLevelCacheTable.h include/MultiLevelCacheTableWideInterval.h include/CacheTable.h include/defines.h\
+		include/CompressElement.h include/DynamicDoubleArray.h include/rw.h include/conf.h include/dataCompression.h\
+		include/dictionary.h include/DynamicFloatArray.h include/VarSet.h include/sz.h include/Huffman.h include/ByteToolkit.h include/szf.h\
+		include/sz_float.h include/sz_double.h include/callZlib.h include/iniparser.h include/TypeManager.h\
+		include/sz_int8.h include/sz_int16.h include/sz_int32.h include/sz_int64.h include/szd_int8.h include/szd_int16.h include/szd_int32.h include/szd_int64.h\
+		include/sz_uint8.h include/sz_uint16.h include/sz_uint32.h include/sz_uint64.h include/szd_uint8.h include/szd_uint16.h include/szd_uint32.h include/szd_uint64.h\
+		include/sz_float_pwr.h include/sz_double_pwr.h include/szd_float.h include/szd_double.h include/szd_float_pwr.h include/szd_double_pwr.h\
+		include/sz_float_ts.h include/szd_float_ts.h include/sz_double_ts.h include/szd_double_ts.h include/utility.h include/sz_opencl.h\
+		include/DynamicByteArray.h include/DynamicIntArray.h include/TightDataPointStorageI.h include/TightDataPointStorageD.h include/TightDataPointStorageF.h\
+		include/pastriD.h include/pastriF.h include/pastriGeneral.h include/pastri.h include/exafelSZ.h include/ArithmeticCoding.h include/sz_omp.h include/sz_stats.h sz.mod rw.mod
+lib_LTLIBRARIES=libSZ.la
+libSZ_la_CFLAGS=-I./include -I../zlib/ -I../zstd/
+if TIMECMPR
+libSZ_la_CFLAGS+=-DHAVE_TIMECMPR
+endif
+if RANDOMACCESS
+libSZ_la_CFLAGS+=-DHAVE_RANDOMACCESS
+endif
+if OPENMP
+libSZ_la_CFLAGS+=-fopenmp
+endif
+libSZ_la_LDFLAGS = -version-info  2:1:0
+libSZ_la_LIBADD=../zlib/.libs/libzlib.a ../zstd/.libs/libzstd.a
+libSZ_la_SOURCES=src/MultiLevelCacheTable.c src/MultiLevelCacheTableWideInterval.c \
+		src/ByteToolkit.c src/dataCompression.c src/DynamicIntArray.c src/iniparser.c src/szf.c \
+		src/CompressElement.c src/DynamicByteArray.c src/rw.c src/utility.c\
+		src/TightDataPointStorageI.c src/TightDataPointStorageD.c src/TightDataPointStorageF.c \
+		src/conf.c src/DynamicDoubleArray.c src/rwf.c src/TypeManager.c \
+		src/dictionary.c src/DynamicFloatArray.c src/VarSet.c src/callZlib.c src/Huffman.c \
+		src/sz_float.c src/sz_double.c src/sz_int8.c src/sz_int16.c src/sz_int32.c src/sz_int64.c\
+		src/sz_uint8.c src/sz_uint16.c src/sz_uint32.c src/sz_uint64.c src/szd_uint8.c src/szd_uint16.c src/szd_uint32.c src/szd_uint64.c\
+		src/szd_float.c src/szd_double.c src/szd_int8.c src/szd_int16.c src/szd_int32.c src/szd_int64.c src/sz.c\
+		src/sz_float_pwr.c src/sz_double_pwr.c src/szd_float_pwr.c src/szd_double_pwr.c src/ArithmeticCoding.c src/CacheTable.c\
+		src/sz_interface.F90 src/rw_interface.F90 src/exafelSZ.c
+libSZ_la_LINK=$(AM_V_CC)$(LIBTOOL) --tag=FC --mode=link $(FCLD) $(libSZ_la_CFLAGS) -O3 $(libSZ_la_LDFLAGS) -o $(lib_LTLIBRARIES)
+else
+include_HEADERS=include/MultiLevelCacheTable.h include/MultiLevelCacheTableWideInterval.h include/CacheTable.h include/defines.h\
+		include/CompressElement.h include/DynamicDoubleArray.h include/rw.h include/conf.h include/dataCompression.h\
+		include/dictionary.h include/DynamicFloatArray.h include/VarSet.h include/sz.h include/Huffman.h include/ByteToolkit.h\
+		include/sz_float.h include/sz_double.h include/callZlib.h include/iniparser.h include/TypeManager.h\
+		include/sz_int8.h include/sz_int16.h include/sz_int32.h include/sz_int64.h include/szd_int8.h include/szd_int16.h include/szd_int32.h include/szd_int64.h\
+		include/sz_uint8.h include/sz_uint16.h include/sz_uint32.h include/sz_uint64.h include/szd_uint8.h include/szd_uint16.h include/szd_uint32.h include/szd_uint64.h\
+		include/sz_float_pwr.h include/sz_double_pwr.h include/szd_float.h include/szd_double.h include/szd_float_pwr.h include/szd_double_pwr.h\
+		include/sz_float_ts.h include/szd_float_ts.h include/sz_double_ts.h include/szd_double_ts.h include/utility.h include/sz_opencl.h\
+		include/DynamicByteArray.h include/DynamicIntArray.h include/TightDataPointStorageI.h include/TightDataPointStorageD.h include/TightDataPointStorageF.h\
+		include/pastriD.h include/pastriF.h include/pastriGeneral.h include/pastri.h include/exafelSZ.h include/ArithmeticCoding.h include/sz_omp.h include/sz_stats.h
+
+lib_LTLIBRARIES=libSZ.la
+libSZ_la_CFLAGS=-I./include -I../zlib -I../zstd/ 
+if WRITESTATS
+libSZ_la_CFLAGS+=-DHAVE_WRITESTATS
+endif
+if TIMECMPR
+libSZ_la_CFLAGS+=-DHAVE_TIMECMPR
+endif
+if RANDOMACCESS
+libSZ_la_CFLAGS+=-DHAVE_RANDOMACCESS
+endif
+if OPENMP
+libSZ_la_CFLAGS+=-fopenmp
+endif
+libSZ_la_LDFLAGS = -version-info  1:4:0
+libSZ_la_LIBADD=../zlib/.libs/libzlib.a ../zstd/.libs/libzstd.a
+libSZ_la_SOURCES=src/MultiLevelCacheTable.c src/MultiLevelCacheTableWideInterval.c \
+		src/ByteToolkit.c src/dataCompression.c src/DynamicIntArray.c src/iniparser.c\
+		src/CompressElement.c src/DynamicByteArray.c src/rw.c src/utility.c\
+		src/TightDataPointStorageI.c src/TightDataPointStorageD.c src/TightDataPointStorageF.c \
+		src/conf.c src/DynamicDoubleArray.c src/TypeManager.c \
+		src/dictionary.c src/DynamicFloatArray.c src/VarSet.c src/callZlib.c src/Huffman.c \
+		src/sz_float.c src/sz_double.c src/sz_int8.c src/sz_int16.c src/sz_int32.c src/sz_int64.c\
+		src/sz_uint8.c src/sz_uint16.c src/sz_uint32.c src/sz_uint64.c src/szd_uint8.c src/szd_uint16.c src/szd_uint32.c src/szd_uint64.c\
+		src/szd_float.c src/szd_double.c src/szd_int8.c src/szd_int16.c src/szd_int32.c src/szd_int64.c src/sz.c\
+		src/sz_float_pwr.c src/sz_double_pwr.c src/szd_float_pwr.c src/szd_double_pwr.c src/ArithmeticCoding.c src/exafelSZ.c src/CacheTable.c
+if PASTRI
+libSZ_la_SOURCES+=src/pastri.c
+endif
+if OPENMP
+libSZ_la_SOURCES+=src/sz_omp.c
+endif
+if TIMECMPR
+libSZ_la_SOURCES+=src/sz_float_ts.c src/szd_float_ts.c src/sz_double_ts.c src/szd_double_ts.c
+endif
+if WRITESTATS
+libSZ_la_SOURCES+=src/sz_stats.c
+endif
+
+libSZ_la_LINK= $(AM_V_CC)$(LIBTOOL) --tag=CC --mode=link $(CCLD) $(libSZ_la_CFLAGS) -O3 $(libSZ_la_LDFLAGS) -o $(lib_LTLIBRARIES)
+endif
diff --git a/deps/SZ/sz/Makefile.in b/deps/SZ/sz/Makefile.in
new file mode 100644
index 0000000000000000000000000000000000000000..8209df4a6f40865ef987ce9c0b64710ebfd9e98f
--- /dev/null
+++ b/deps/SZ/sz/Makefile.in
@@ -0,0 +1,1729 @@
+# Makefile.in generated by automake 1.16.2 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2020 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+#AM_CFLAGS = -I./include -I../zlib
+#LDFLAGS=-fPIC -shared
+
+
+VPATH = @srcdir@
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+@FORTRAN_TRUE@@TIMECMPR_TRUE@am__append_1 = -DHAVE_TIMECMPR
+@FORTRAN_TRUE@@RANDOMACCESS_TRUE@am__append_2 = -DHAVE_RANDOMACCESS
+@FORTRAN_TRUE@@OPENMP_TRUE@am__append_3 = -fopenmp
+@FORTRAN_FALSE@@WRITESTATS_TRUE@am__append_4 = -DHAVE_WRITESTATS
+@FORTRAN_FALSE@@TIMECMPR_TRUE@am__append_5 = -DHAVE_TIMECMPR
+@FORTRAN_FALSE@@RANDOMACCESS_TRUE@am__append_6 = -DHAVE_RANDOMACCESS
+@FORTRAN_FALSE@@OPENMP_TRUE@am__append_7 = -fopenmp
+@FORTRAN_FALSE@@PASTRI_TRUE@am__append_8 = src/pastri.c
+@FORTRAN_FALSE@@OPENMP_TRUE@am__append_9 = src/sz_omp.c
+@FORTRAN_FALSE@@TIMECMPR_TRUE@am__append_10 = src/sz_float_ts.c src/szd_float_ts.c src/sz_double_ts.c src/szd_double_ts.c
+@FORTRAN_FALSE@@WRITESTATS_TRUE@am__append_11 = src/sz_stats.c
+subdir = sz
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(am__include_HEADERS_DIST) \
+	$(am__DIST_COMMON)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+  for p in $$list; do echo "$$p $$p"; done | \
+  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+    if (++n[$$2] == $(am__install_max)) \
+      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+    END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
+am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)"
+LTLIBRARIES = $(lib_LTLIBRARIES)
+libSZ_la_LIBADD =
+am__libSZ_la_SOURCES_DIST = src/MultiLevelCacheTable.c \
+	src/MultiLevelCacheTableWideInterval.c src/ByteToolkit.c \
+	src/dataCompression.c src/DynamicIntArray.c src/iniparser.c \
+	src/CompressElement.c src/DynamicByteArray.c src/rw.c \
+	src/utility.c src/TightDataPointStorageI.c \
+	src/TightDataPointStorageD.c src/TightDataPointStorageF.c \
+	src/conf.c src/DynamicDoubleArray.c src/TypeManager.c \
+	src/dictionary.c src/DynamicFloatArray.c src/VarSet.c \
+	src/callZlib.c src/Huffman.c src/sz_float.c src/sz_double.c \
+	src/sz_int8.c src/sz_int16.c src/sz_int32.c src/sz_int64.c \
+	src/sz_uint8.c src/sz_uint16.c src/sz_uint32.c src/sz_uint64.c \
+	src/szd_uint8.c src/szd_uint16.c src/szd_uint32.c \
+	src/szd_uint64.c src/szd_float.c src/szd_double.c \
+	src/szd_int8.c src/szd_int16.c src/szd_int32.c src/szd_int64.c \
+	src/sz.c src/sz_float_pwr.c src/sz_double_pwr.c \
+	src/szd_float_pwr.c src/szd_double_pwr.c \
+	src/ArithmeticCoding.c src/exafelSZ.c src/CacheTable.c \
+	src/pastri.c src/sz_omp.c src/sz_float_ts.c src/szd_float_ts.c \
+	src/sz_double_ts.c src/szd_double_ts.c src/sz_stats.c \
+	src/szf.c src/rwf.c src/sz_interface.F90 src/rw_interface.F90
+am__dirstamp = $(am__leading_dot)dirstamp
+@FORTRAN_FALSE@@PASTRI_TRUE@am__objects_1 = src/libSZ_la-pastri.lo
+@FORTRAN_FALSE@@OPENMP_TRUE@am__objects_2 = src/libSZ_la-sz_omp.lo
+@FORTRAN_FALSE@@TIMECMPR_TRUE@am__objects_3 =  \
+@FORTRAN_FALSE@@TIMECMPR_TRUE@	src/libSZ_la-sz_float_ts.lo \
+@FORTRAN_FALSE@@TIMECMPR_TRUE@	src/libSZ_la-szd_float_ts.lo \
+@FORTRAN_FALSE@@TIMECMPR_TRUE@	src/libSZ_la-sz_double_ts.lo \
+@FORTRAN_FALSE@@TIMECMPR_TRUE@	src/libSZ_la-szd_double_ts.lo
+@FORTRAN_FALSE@@WRITESTATS_TRUE@am__objects_4 =  \
+@FORTRAN_FALSE@@WRITESTATS_TRUE@	src/libSZ_la-sz_stats.lo
+@FORTRAN_FALSE@am_libSZ_la_OBJECTS =  \
+@FORTRAN_FALSE@	src/libSZ_la-MultiLevelCacheTable.lo \
+@FORTRAN_FALSE@	src/libSZ_la-MultiLevelCacheTableWideInterval.lo \
+@FORTRAN_FALSE@	src/libSZ_la-ByteToolkit.lo \
+@FORTRAN_FALSE@	src/libSZ_la-dataCompression.lo \
+@FORTRAN_FALSE@	src/libSZ_la-DynamicIntArray.lo \
+@FORTRAN_FALSE@	src/libSZ_la-iniparser.lo \
+@FORTRAN_FALSE@	src/libSZ_la-CompressElement.lo \
+@FORTRAN_FALSE@	src/libSZ_la-DynamicByteArray.lo \
+@FORTRAN_FALSE@	src/libSZ_la-rw.lo src/libSZ_la-utility.lo \
+@FORTRAN_FALSE@	src/libSZ_la-TightDataPointStorageI.lo \
+@FORTRAN_FALSE@	src/libSZ_la-TightDataPointStorageD.lo \
+@FORTRAN_FALSE@	src/libSZ_la-TightDataPointStorageF.lo \
+@FORTRAN_FALSE@	src/libSZ_la-conf.lo \
+@FORTRAN_FALSE@	src/libSZ_la-DynamicDoubleArray.lo \
+@FORTRAN_FALSE@	src/libSZ_la-TypeManager.lo \
+@FORTRAN_FALSE@	src/libSZ_la-dictionary.lo \
+@FORTRAN_FALSE@	src/libSZ_la-DynamicFloatArray.lo \
+@FORTRAN_FALSE@	src/libSZ_la-VarSet.lo src/libSZ_la-callZlib.lo \
+@FORTRAN_FALSE@	src/libSZ_la-Huffman.lo \
+@FORTRAN_FALSE@	src/libSZ_la-sz_float.lo \
+@FORTRAN_FALSE@	src/libSZ_la-sz_double.lo \
+@FORTRAN_FALSE@	src/libSZ_la-sz_int8.lo \
+@FORTRAN_FALSE@	src/libSZ_la-sz_int16.lo \
+@FORTRAN_FALSE@	src/libSZ_la-sz_int32.lo \
+@FORTRAN_FALSE@	src/libSZ_la-sz_int64.lo \
+@FORTRAN_FALSE@	src/libSZ_la-sz_uint8.lo \
+@FORTRAN_FALSE@	src/libSZ_la-sz_uint16.lo \
+@FORTRAN_FALSE@	src/libSZ_la-sz_uint32.lo \
+@FORTRAN_FALSE@	src/libSZ_la-sz_uint64.lo \
+@FORTRAN_FALSE@	src/libSZ_la-szd_uint8.lo \
+@FORTRAN_FALSE@	src/libSZ_la-szd_uint16.lo \
+@FORTRAN_FALSE@	src/libSZ_la-szd_uint32.lo \
+@FORTRAN_FALSE@	src/libSZ_la-szd_uint64.lo \
+@FORTRAN_FALSE@	src/libSZ_la-szd_float.lo \
+@FORTRAN_FALSE@	src/libSZ_la-szd_double.lo \
+@FORTRAN_FALSE@	src/libSZ_la-szd_int8.lo \
+@FORTRAN_FALSE@	src/libSZ_la-szd_int16.lo \
+@FORTRAN_FALSE@	src/libSZ_la-szd_int32.lo \
+@FORTRAN_FALSE@	src/libSZ_la-szd_int64.lo src/libSZ_la-sz.lo \
+@FORTRAN_FALSE@	src/libSZ_la-sz_float_pwr.lo \
+@FORTRAN_FALSE@	src/libSZ_la-sz_double_pwr.lo \
+@FORTRAN_FALSE@	src/libSZ_la-szd_float_pwr.lo \
+@FORTRAN_FALSE@	src/libSZ_la-szd_double_pwr.lo \
+@FORTRAN_FALSE@	src/libSZ_la-ArithmeticCoding.lo \
+@FORTRAN_FALSE@	src/libSZ_la-exafelSZ.lo \
+@FORTRAN_FALSE@	src/libSZ_la-CacheTable.lo $(am__objects_1) \
+@FORTRAN_FALSE@	$(am__objects_2) $(am__objects_3) \
+@FORTRAN_FALSE@	$(am__objects_4)
+@FORTRAN_TRUE@am_libSZ_la_OBJECTS =  \
+@FORTRAN_TRUE@	src/libSZ_la-MultiLevelCacheTable.lo \
+@FORTRAN_TRUE@	src/libSZ_la-MultiLevelCacheTableWideInterval.lo \
+@FORTRAN_TRUE@	src/libSZ_la-ByteToolkit.lo \
+@FORTRAN_TRUE@	src/libSZ_la-dataCompression.lo \
+@FORTRAN_TRUE@	src/libSZ_la-DynamicIntArray.lo \
+@FORTRAN_TRUE@	src/libSZ_la-iniparser.lo src/libSZ_la-szf.lo \
+@FORTRAN_TRUE@	src/libSZ_la-CompressElement.lo \
+@FORTRAN_TRUE@	src/libSZ_la-DynamicByteArray.lo \
+@FORTRAN_TRUE@	src/libSZ_la-rw.lo src/libSZ_la-utility.lo \
+@FORTRAN_TRUE@	src/libSZ_la-TightDataPointStorageI.lo \
+@FORTRAN_TRUE@	src/libSZ_la-TightDataPointStorageD.lo \
+@FORTRAN_TRUE@	src/libSZ_la-TightDataPointStorageF.lo \
+@FORTRAN_TRUE@	src/libSZ_la-conf.lo \
+@FORTRAN_TRUE@	src/libSZ_la-DynamicDoubleArray.lo \
+@FORTRAN_TRUE@	src/libSZ_la-rwf.lo src/libSZ_la-TypeManager.lo \
+@FORTRAN_TRUE@	src/libSZ_la-dictionary.lo \
+@FORTRAN_TRUE@	src/libSZ_la-DynamicFloatArray.lo \
+@FORTRAN_TRUE@	src/libSZ_la-VarSet.lo src/libSZ_la-callZlib.lo \
+@FORTRAN_TRUE@	src/libSZ_la-Huffman.lo src/libSZ_la-sz_float.lo \
+@FORTRAN_TRUE@	src/libSZ_la-sz_double.lo \
+@FORTRAN_TRUE@	src/libSZ_la-sz_int8.lo src/libSZ_la-sz_int16.lo \
+@FORTRAN_TRUE@	src/libSZ_la-sz_int32.lo \
+@FORTRAN_TRUE@	src/libSZ_la-sz_int64.lo \
+@FORTRAN_TRUE@	src/libSZ_la-sz_uint8.lo \
+@FORTRAN_TRUE@	src/libSZ_la-sz_uint16.lo \
+@FORTRAN_TRUE@	src/libSZ_la-sz_uint32.lo \
+@FORTRAN_TRUE@	src/libSZ_la-sz_uint64.lo \
+@FORTRAN_TRUE@	src/libSZ_la-szd_uint8.lo \
+@FORTRAN_TRUE@	src/libSZ_la-szd_uint16.lo \
+@FORTRAN_TRUE@	src/libSZ_la-szd_uint32.lo \
+@FORTRAN_TRUE@	src/libSZ_la-szd_uint64.lo \
+@FORTRAN_TRUE@	src/libSZ_la-szd_float.lo \
+@FORTRAN_TRUE@	src/libSZ_la-szd_double.lo \
+@FORTRAN_TRUE@	src/libSZ_la-szd_int8.lo \
+@FORTRAN_TRUE@	src/libSZ_la-szd_int16.lo \
+@FORTRAN_TRUE@	src/libSZ_la-szd_int32.lo \
+@FORTRAN_TRUE@	src/libSZ_la-szd_int64.lo src/libSZ_la-sz.lo \
+@FORTRAN_TRUE@	src/libSZ_la-sz_float_pwr.lo \
+@FORTRAN_TRUE@	src/libSZ_la-sz_double_pwr.lo \
+@FORTRAN_TRUE@	src/libSZ_la-szd_float_pwr.lo \
+@FORTRAN_TRUE@	src/libSZ_la-szd_double_pwr.lo \
+@FORTRAN_TRUE@	src/libSZ_la-ArithmeticCoding.lo \
+@FORTRAN_TRUE@	src/libSZ_la-CacheTable.lo src/sz_interface.lo \
+@FORTRAN_TRUE@	src/rw_interface.lo src/libSZ_la-exafelSZ.lo \
+@FORTRAN_TRUE@	$(am__objects_1) $(am__objects_2) \
+@FORTRAN_TRUE@	$(am__objects_3) $(am__objects_4)
+libSZ_la_OBJECTS = $(am_libSZ_la_OBJECTS)
+@FORTRAN_FALSE@am_libSZ_la_rpath = -rpath $(libdir)
+@FORTRAN_TRUE@am_libSZ_la_rpath = -rpath $(libdir)
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__maybe_remake_depfiles = depfiles
+am__depfiles_remade = src/$(DEPDIR)/libSZ_la-ArithmeticCoding.Plo \
+	src/$(DEPDIR)/libSZ_la-ByteToolkit.Plo \
+	src/$(DEPDIR)/libSZ_la-CacheTable.Plo \
+	src/$(DEPDIR)/libSZ_la-CompressElement.Plo \
+	src/$(DEPDIR)/libSZ_la-DynamicByteArray.Plo \
+	src/$(DEPDIR)/libSZ_la-DynamicDoubleArray.Plo \
+	src/$(DEPDIR)/libSZ_la-DynamicFloatArray.Plo \
+	src/$(DEPDIR)/libSZ_la-DynamicIntArray.Plo \
+	src/$(DEPDIR)/libSZ_la-Huffman.Plo \
+	src/$(DEPDIR)/libSZ_la-MultiLevelCacheTable.Plo \
+	src/$(DEPDIR)/libSZ_la-MultiLevelCacheTableWideInterval.Plo \
+	src/$(DEPDIR)/libSZ_la-TightDataPointStorageD.Plo \
+	src/$(DEPDIR)/libSZ_la-TightDataPointStorageF.Plo \
+	src/$(DEPDIR)/libSZ_la-TightDataPointStorageI.Plo \
+	src/$(DEPDIR)/libSZ_la-TypeManager.Plo \
+	src/$(DEPDIR)/libSZ_la-VarSet.Plo \
+	src/$(DEPDIR)/libSZ_la-callZlib.Plo \
+	src/$(DEPDIR)/libSZ_la-conf.Plo \
+	src/$(DEPDIR)/libSZ_la-dataCompression.Plo \
+	src/$(DEPDIR)/libSZ_la-dictionary.Plo \
+	src/$(DEPDIR)/libSZ_la-exafelSZ.Plo \
+	src/$(DEPDIR)/libSZ_la-iniparser.Plo \
+	src/$(DEPDIR)/libSZ_la-pastri.Plo \
+	src/$(DEPDIR)/libSZ_la-rw.Plo src/$(DEPDIR)/libSZ_la-rwf.Plo \
+	src/$(DEPDIR)/libSZ_la-sz.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_double.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_double_pwr.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_double_ts.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_float.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_float_pwr.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_float_ts.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_int16.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_int32.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_int64.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_int8.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_omp.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_stats.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_uint16.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_uint32.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_uint64.Plo \
+	src/$(DEPDIR)/libSZ_la-sz_uint8.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_double.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_double_pwr.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_double_ts.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_float.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_float_pwr.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_float_ts.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_int16.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_int32.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_int64.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_int8.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_uint16.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_uint32.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_uint64.Plo \
+	src/$(DEPDIR)/libSZ_la-szd_uint8.Plo \
+	src/$(DEPDIR)/libSZ_la-szf.Plo \
+	src/$(DEPDIR)/libSZ_la-utility.Plo
+# Rename command used by the dependency-tracking recipes in this file to move
+# a temporary depfile into place.
+am__mv = mv -f
+# Command line for compiling a preprocessed-Fortran source directly with $(FC).
+PPFCCOMPILE = $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_FCFLAGS) $(FCFLAGS)
+# Libtool verbosity switch: resolves to --silent for level 0 and to nothing
+# for level 1; an empty level falls back to the default verbosity.
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 = 
+# Same compile as PPFCCOMPILE, but driven through libtool (--tag=FC,
+# --mode=compile).
+LTPPFCCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) \
+	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+	$(AM_FCFLAGS) $(FCFLAGS)
+# Recipe prefix for Fortran compiles: at level 0 it echoes "  PPFC    <target>"
+# instead of the full command; at level 1 it is empty.
+AM_V_PPFC = $(am__v_PPFC_@AM_V@)
+am__v_PPFC_ = $(am__v_PPFC_@AM_DEFAULT_V@)
+am__v_PPFC_0 = @echo "  PPFC    " $@;
+am__v_PPFC_1 = 
+# Fortran link driver and the libtool link command built on top of it.
+FCLD = $(FC)
+FCLINK = $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(FCLD) $(AM_FCFLAGS) $(FCFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+# Recipe prefix for Fortran links; same scheme as AM_V_PPFC.
+AM_V_FCLD = $(am__v_FCLD_@AM_V@)
+am__v_FCLD_ = $(am__v_FCLD_@AM_DEFAULT_V@)
+am__v_FCLD_0 = @echo "  FCLD    " $@;
+am__v_FCLD_1 = 
+# Command line for compiling a C source directly with $(CC).
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+# Same compile as COMPILE, but driven through libtool (--tag=CC,
+# --mode=compile).
+LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
+	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+	$(AM_CFLAGS) $(CFLAGS)
+# Recipe prefix for C compiles: at level 0 it echoes "  CC      <target>"
+# instead of the full command; at level 1 it is empty.
+AM_V_CC = $(am__v_CC_@AM_V@)
+am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
+am__v_CC_0 = @echo "  CC      " $@;
+am__v_CC_1 = 
+# C link driver and the libtool link command built on top of it.
+CCLD = $(CC)
+LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+# Recipe prefix for C links; same scheme as AM_V_CC.
+AM_V_CCLD = $(am__v_CCLD_@AM_V@)
+am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
+am__v_CCLD_0 = @echo "  CCLD    " $@;
+am__v_CCLD_1 = 
+# Source lists: SOURCES follows the active libSZ_la_SOURCES definition, while
+# DIST_SOURCES refers to a separately defined *_DIST list.
+SOURCES = $(libSZ_la_SOURCES)
+DIST_SOURCES = $(am__libSZ_la_SOURCES_DIST)
+# Shell snippet that succeeds only when install-info may be run: it is false
+# when AM_UPDATE_INFO_DIR is n/no/NO, otherwise it probes
+# `install-info --version`.
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+# Header list that is not guarded by a FORTRAN conditional.  It contains the
+# entries that appear in either include_HEADERS definition further down in
+# this file, including include/szf.h, sz.mod and rw.mod.
+am__include_HEADERS_DIST = include/MultiLevelCacheTable.h \
+	include/MultiLevelCacheTableWideInterval.h \
+	include/CacheTable.h include/defines.h \
+	include/CompressElement.h include/DynamicDoubleArray.h \
+	include/rw.h include/conf.h include/dataCompression.h \
+	include/dictionary.h include/DynamicFloatArray.h \
+	include/VarSet.h include/sz.h include/Huffman.h \
+	include/ByteToolkit.h include/sz_float.h include/sz_double.h \
+	include/callZlib.h include/iniparser.h include/TypeManager.h \
+	include/sz_int8.h include/sz_int16.h include/sz_int32.h \
+	include/sz_int64.h include/szd_int8.h include/szd_int16.h \
+	include/szd_int32.h include/szd_int64.h include/sz_uint8.h \
+	include/sz_uint16.h include/sz_uint32.h include/sz_uint64.h \
+	include/szd_uint8.h include/szd_uint16.h include/szd_uint32.h \
+	include/szd_uint64.h include/sz_float_pwr.h \
+	include/sz_double_pwr.h include/szd_float.h \
+	include/szd_double.h include/szd_float_pwr.h \
+	include/szd_double_pwr.h include/sz_float_ts.h \
+	include/szd_float_ts.h include/sz_double_ts.h \
+	include/szd_double_ts.h include/utility.h include/sz_opencl.h \
+	include/DynamicByteArray.h include/DynamicIntArray.h \
+	include/TightDataPointStorageI.h \
+	include/TightDataPointStorageD.h \
+	include/TightDataPointStorageF.h include/pastriD.h \
+	include/pastriF.h include/pastriGeneral.h include/pastri.h \
+	include/exafelSZ.h include/ArithmeticCoding.h include/sz_omp.h \
+	include/sz_stats.h include/szf.h sz.mod rw.mod
+# HEADERS follows the active include_HEADERS definition.
+HEADERS = $(include_HEADERS)
+# Files fed to the tag-generation helpers below.
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates.  Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+  BEGIN { nonempty = 0; } \
+  { items[$$0] = 1; nonempty = 1; } \
+  END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique.  This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+# Each entry is used as-is when it exists relative to the current directory
+# and is prefixed with $(srcdir)/ otherwise.
+am__define_uniq_tagged_files = \
+  list='$(am__tagged_files)'; \
+  unique=`for i in $$list; do \
+    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+  done | $(am__uniquify_input)`
+# Tag generator programs.
+ETAGS = etags
+CTAGS = ctags
+# File lists that make up DISTFILES.
+am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+# Substitution variables.  Every value below is an @NAME@ placeholder; the
+# "Makefile" rule in this file runs config.status to produce the Makefile,
+# which is presumably where the placeholders are replaced with configured
+# values (tools, flags, feature switches, and installation/build directories).
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+FC = @FC@
+FCFLAGS = @FCFLAGS@
+FGREP = @FGREP@
+GREP = @GREP@
+GSL_CFLAGS = @GSL_CFLAGS@
+GSL_CONFIG = @GSL_CONFIG@
+GSL_HDR = @GSL_HDR@
+GSL_LIB = @GSL_LIB@
+GSL_LIBS = @GSL_LIBS@
+GSL_STATIC = @GSL_STATIC@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OPENMP_FLAGS = @OPENMP_FLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PASTRI_FLAGS = @PASTRI_FLAGS@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+RANDOMACCESS_FLAGS = @RANDOMACCESS_FLAGS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+STRIP = @STRIP@
+TIMECMPR_FLAGS = @TIMECMPR_FLAGS@
+VERSION = @VERSION@
+WRITESTATS_FLAGS = @WRITESTATS_FLAGS@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_FC = @ac_ct_FC@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AUTOMAKE_OPTIONS = foreign
+# Headers listed for installation when the FORTRAN conditional is false.
+@FORTRAN_FALSE@include_HEADERS = include/MultiLevelCacheTable.h include/MultiLevelCacheTableWideInterval.h include/CacheTable.h include/defines.h\
+@FORTRAN_FALSE@		include/CompressElement.h include/DynamicDoubleArray.h include/rw.h include/conf.h include/dataCompression.h\
+@FORTRAN_FALSE@		include/dictionary.h include/DynamicFloatArray.h include/VarSet.h include/sz.h include/Huffman.h include/ByteToolkit.h\
+@FORTRAN_FALSE@		include/sz_float.h include/sz_double.h include/callZlib.h include/iniparser.h include/TypeManager.h\
+@FORTRAN_FALSE@		include/sz_int8.h include/sz_int16.h include/sz_int32.h include/sz_int64.h include/szd_int8.h include/szd_int16.h include/szd_int32.h include/szd_int64.h\
+@FORTRAN_FALSE@		include/sz_uint8.h include/sz_uint16.h include/sz_uint32.h include/sz_uint64.h include/szd_uint8.h include/szd_uint16.h include/szd_uint32.h include/szd_uint64.h\
+@FORTRAN_FALSE@		include/sz_float_pwr.h include/sz_double_pwr.h include/szd_float.h include/szd_double.h include/szd_float_pwr.h include/szd_double_pwr.h\
+@FORTRAN_FALSE@		include/sz_float_ts.h include/szd_float_ts.h include/sz_double_ts.h include/szd_double_ts.h include/utility.h include/sz_opencl.h\
+@FORTRAN_FALSE@		include/DynamicByteArray.h include/DynamicIntArray.h include/TightDataPointStorageI.h include/TightDataPointStorageD.h include/TightDataPointStorageF.h\
+@FORTRAN_FALSE@		include/pastriD.h include/pastriF.h include/pastriGeneral.h include/pastri.h include/exafelSZ.h include/ArithmeticCoding.h include/sz_omp.h include/sz_stats.h
+
+# Headers listed when the FORTRAN conditional is true: the list above plus
+# include/szf.h, sz.mod and rw.mod.
+@FORTRAN_TRUE@include_HEADERS = include/MultiLevelCacheTable.h include/MultiLevelCacheTableWideInterval.h include/CacheTable.h include/defines.h\
+@FORTRAN_TRUE@		include/CompressElement.h include/DynamicDoubleArray.h include/rw.h include/conf.h include/dataCompression.h\
+@FORTRAN_TRUE@		include/dictionary.h include/DynamicFloatArray.h include/VarSet.h include/sz.h include/Huffman.h include/ByteToolkit.h include/szf.h\
+@FORTRAN_TRUE@		include/sz_float.h include/sz_double.h include/callZlib.h include/iniparser.h include/TypeManager.h\
+@FORTRAN_TRUE@		include/sz_int8.h include/sz_int16.h include/sz_int32.h include/sz_int64.h include/szd_int8.h include/szd_int16.h include/szd_int32.h include/szd_int64.h\
+@FORTRAN_TRUE@		include/sz_uint8.h include/sz_uint16.h include/sz_uint32.h include/sz_uint64.h include/szd_uint8.h include/szd_uint16.h include/szd_uint32.h include/szd_uint64.h\
+@FORTRAN_TRUE@		include/sz_float_pwr.h include/sz_double_pwr.h include/szd_float.h include/szd_double.h include/szd_float_pwr.h include/szd_double_pwr.h\
+@FORTRAN_TRUE@		include/sz_float_ts.h include/szd_float_ts.h include/sz_double_ts.h include/szd_double_ts.h include/utility.h include/sz_opencl.h\
+@FORTRAN_TRUE@		include/DynamicByteArray.h include/DynamicIntArray.h include/TightDataPointStorageI.h include/TightDataPointStorageD.h include/TightDataPointStorageF.h\
+@FORTRAN_TRUE@		include/pastriD.h include/pastriF.h include/pastriGeneral.h include/pastri.h include/exafelSZ.h include/ArithmeticCoding.h include/sz_omp.h include/sz_stats.h sz.mod rw.mod
+
+# The single libtool library built here; the name is the same in both
+# FORTRAN branches.
+@FORTRAN_FALSE@lib_LTLIBRARIES = libSZ.la
+@FORTRAN_TRUE@lib_LTLIBRARIES = libSZ.la
+# Per-target C flags: local and sibling include paths plus optional
+# $(am__append_N) additions.  The FORTRAN_TRUE branch additionally pulls in
+# $(am__append_1) through $(am__append_3).
+@FORTRAN_FALSE@libSZ_la_CFLAGS = -I./include -I../zlib -I../zstd/ \
+@FORTRAN_FALSE@	$(am__append_4) $(am__append_5) $(am__append_6) \
+@FORTRAN_FALSE@	$(am__append_7)
+@FORTRAN_TRUE@libSZ_la_CFLAGS = -I./include -I../zlib/ -I../zstd/ \
+@FORTRAN_TRUE@	$(am__append_1) $(am__append_2) $(am__append_3) \
+@FORTRAN_TRUE@	$(am__append_4) $(am__append_5) $(am__append_6) \
+@FORTRAN_TRUE@	$(am__append_7)
+# Libtool -version-info triple; note that the two branches use different
+# values.
+@FORTRAN_FALSE@libSZ_la_LDFLAGS = -version-info  1:4:0
+@FORTRAN_TRUE@libSZ_la_LDFLAGS = -version-info  2:1:0
+# NOTE(review): the variable name below is spelled "LIDADD", whereas the
+# libSZ.la link rule in this file expands $(libSZ_la_LIBADD).  As written,
+# these archives are therefore not passed to the linker.  The spelling is kept
+# so that what gets linked does not change; if linking them is intended, fix
+# the name in Makefile.am and regenerate.
+# Both branches now name the zstd archive under ../zstd/, matching the
+# -I../zstd/ include path used for this target.
+@FORTRAN_FALSE@libSZ_la_LIDADD = ../zlib/.libs/libzlib.a ../zstd/.libs/libzstd.a
+@FORTRAN_TRUE@libSZ_la_LIDADD = ../zlib/.libs/libzlib.a ../zstd/.libs/libzstd.a
+# Sources of libSZ.la when the FORTRAN conditional is false: C sources only,
+# followed by optional $(am__append_8) .. $(am__append_11) additions.
+@FORTRAN_FALSE@libSZ_la_SOURCES = src/MultiLevelCacheTable.c \
+@FORTRAN_FALSE@	src/MultiLevelCacheTableWideInterval.c \
+@FORTRAN_FALSE@	src/ByteToolkit.c src/dataCompression.c \
+@FORTRAN_FALSE@	src/DynamicIntArray.c src/iniparser.c \
+@FORTRAN_FALSE@	src/CompressElement.c src/DynamicByteArray.c \
+@FORTRAN_FALSE@	src/rw.c src/utility.c \
+@FORTRAN_FALSE@	src/TightDataPointStorageI.c \
+@FORTRAN_FALSE@	src/TightDataPointStorageD.c \
+@FORTRAN_FALSE@	src/TightDataPointStorageF.c src/conf.c \
+@FORTRAN_FALSE@	src/DynamicDoubleArray.c src/TypeManager.c \
+@FORTRAN_FALSE@	src/dictionary.c src/DynamicFloatArray.c \
+@FORTRAN_FALSE@	src/VarSet.c src/callZlib.c src/Huffman.c \
+@FORTRAN_FALSE@	src/sz_float.c src/sz_double.c src/sz_int8.c \
+@FORTRAN_FALSE@	src/sz_int16.c src/sz_int32.c src/sz_int64.c \
+@FORTRAN_FALSE@	src/sz_uint8.c src/sz_uint16.c src/sz_uint32.c \
+@FORTRAN_FALSE@	src/sz_uint64.c src/szd_uint8.c \
+@FORTRAN_FALSE@	src/szd_uint16.c src/szd_uint32.c \
+@FORTRAN_FALSE@	src/szd_uint64.c src/szd_float.c \
+@FORTRAN_FALSE@	src/szd_double.c src/szd_int8.c src/szd_int16.c \
+@FORTRAN_FALSE@	src/szd_int32.c src/szd_int64.c src/sz.c \
+@FORTRAN_FALSE@	src/sz_float_pwr.c src/sz_double_pwr.c \
+@FORTRAN_FALSE@	src/szd_float_pwr.c src/szd_double_pwr.c \
+@FORTRAN_FALSE@	src/ArithmeticCoding.c src/exafelSZ.c \
+@FORTRAN_FALSE@	src/CacheTable.c $(am__append_8) \
+@FORTRAN_FALSE@	$(am__append_9) $(am__append_10) \
+@FORTRAN_FALSE@	$(am__append_11)
+# Sources when the FORTRAN conditional is true: the same C sources plus
+# src/szf.c, src/rwf.c and the two .F90 interface files.
+@FORTRAN_TRUE@libSZ_la_SOURCES = src/MultiLevelCacheTable.c \
+@FORTRAN_TRUE@	src/MultiLevelCacheTableWideInterval.c \
+@FORTRAN_TRUE@	src/ByteToolkit.c src/dataCompression.c \
+@FORTRAN_TRUE@	src/DynamicIntArray.c src/iniparser.c src/szf.c \
+@FORTRAN_TRUE@	src/CompressElement.c src/DynamicByteArray.c \
+@FORTRAN_TRUE@	src/rw.c src/utility.c \
+@FORTRAN_TRUE@	src/TightDataPointStorageI.c \
+@FORTRAN_TRUE@	src/TightDataPointStorageD.c \
+@FORTRAN_TRUE@	src/TightDataPointStorageF.c src/conf.c \
+@FORTRAN_TRUE@	src/DynamicDoubleArray.c src/rwf.c \
+@FORTRAN_TRUE@	src/TypeManager.c src/dictionary.c \
+@FORTRAN_TRUE@	src/DynamicFloatArray.c src/VarSet.c \
+@FORTRAN_TRUE@	src/callZlib.c src/Huffman.c src/sz_float.c \
+@FORTRAN_TRUE@	src/sz_double.c src/sz_int8.c src/sz_int16.c \
+@FORTRAN_TRUE@	src/sz_int32.c src/sz_int64.c src/sz_uint8.c \
+@FORTRAN_TRUE@	src/sz_uint16.c src/sz_uint32.c src/sz_uint64.c \
+@FORTRAN_TRUE@	src/szd_uint8.c src/szd_uint16.c \
+@FORTRAN_TRUE@	src/szd_uint32.c src/szd_uint64.c \
+@FORTRAN_TRUE@	src/szd_float.c src/szd_double.c src/szd_int8.c \
+@FORTRAN_TRUE@	src/szd_int16.c src/szd_int32.c src/szd_int64.c \
+@FORTRAN_TRUE@	src/sz.c src/sz_float_pwr.c src/sz_double_pwr.c \
+@FORTRAN_TRUE@	src/szd_float_pwr.c src/szd_double_pwr.c \
+@FORTRAN_TRUE@	src/ArithmeticCoding.c src/CacheTable.c \
+@FORTRAN_TRUE@	src/sz_interface.F90 src/rw_interface.F90 \
+@FORTRAN_TRUE@	src/exafelSZ.c $(am__append_8) $(am__append_9) \
+@FORTRAN_TRUE@	$(am__append_10) $(am__append_11)
+# Custom link command for libSZ.la (C driver without Fortran, Fortran driver
+# with it).  It deliberately carries no $(AM_V_CC)-style echo prefix: the
+# libSZ.la rule already prepends $(AM_V_GEN), and a second '@echo ...;' in the
+# middle of the recipe line is handed to the shell as a command named "@echo"
+# when verbosity is 0.  $(AM_V_lt) passes --silent to libtool at verbosity 0,
+# as the stock LINK/FCLINK commands do; at verbosity 1 it expands to nothing.
+@FORTRAN_FALSE@libSZ_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC --mode=link $(CCLD) $(libSZ_la_CFLAGS) -O3 $(libSZ_la_LDFLAGS) -o $(lib_LTLIBRARIES)
+@FORTRAN_TRUE@libSZ_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=FC --mode=link $(FCLD) $(libSZ_la_CFLAGS) -O3 $(libSZ_la_LDFLAGS) -o $(lib_LTLIBRARIES)
+# Default goal; delegates to all-am.
+all: all-am
+
+# Clear the suffix list, then register only the suffixes this file has rules
+# for.
+.SUFFIXES:
+.SUFFIXES: .F90 .c .lo .o .obj
+# Rebuild Makefile.in.  If any changed prerequisite is one of the configure
+# dependencies, run the top-level am--refresh target instead and stop;
+# otherwise rerun automake for sz/Makefile from $(top_srcdir).
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign sz/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --foreign sz/Makefile
+# Rebuild Makefile: a changed config.status triggers am--refresh at the top
+# level; otherwise config.status is asked to regenerate just this Makefile.
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
+	esac;
+
+# The remaining configure-level files are all refreshed through the top-level
+# am--refresh target.
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+# Install the libtool libraries into $(DESTDIR)$(libdir).  Only entries that
+# exist as files are installed; nothing happens when $(libdir) is empty or no
+# library file is present.
+install-libLTLIBRARIES: $(lib_LTLIBRARIES)
+	@$(NORMAL_INSTALL)
+	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+	list2=; for p in $$list; do \
+	  if test -f $$p; then \
+	    list2="$$list2 $$p"; \
+	  else :; fi; \
+	done; \
+	test -z "$$list2" || { \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \
+	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \
+	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \
+	}
+
+# Remove each installed library from $(DESTDIR)$(libdir) through libtool's
+# uninstall mode.  $$f is expected to be set by $(am__strip_dir), which is
+# defined outside this part of the file.
+uninstall-libLTLIBRARIES:
+	@$(NORMAL_UNINSTALL)
+	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+	for p in $$list; do \
+	  $(am__strip_dir) \
+	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \
+	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \
+	done
+
+# Delete the built libraries, then delete an so_locations file from each
+# directory that holds one of them ("." for entries without a directory part).
+clean-libLTLIBRARIES:
+	-test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
+	@list='$(lib_LTLIBRARIES)'; \
+	locs=`for p in $$list; do echo $$p; done | \
+	      sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
+	      sort -u`; \
+	test -z "$$locs" || { \
+	  echo rm -f $${locs}; \
+	  rm -f $${locs}; \
+	}
+# Stamp files: each rule creates its directory (src/ and src/$(DEPDIR)/) and
+# then writes an empty stamp file inside it.
+src/$(am__dirstamp):
+	@$(MKDIR_P) src
+	@: > src/$(am__dirstamp)
+src/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) src/$(DEPDIR)
+	@: > src/$(DEPDIR)/$(am__dirstamp)
+# Every libtool object below lists both stamps as prerequisites, so the two
+# directories are created before any object is compiled into them.
+src/libSZ_la-MultiLevelCacheTable.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-MultiLevelCacheTableWideInterval.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-ByteToolkit.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-dataCompression.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-DynamicIntArray.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-iniparser.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-CompressElement.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-DynamicByteArray.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-rw.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-utility.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-TightDataPointStorageI.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-TightDataPointStorageD.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-TightDataPointStorageF.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-conf.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-DynamicDoubleArray.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-TypeManager.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-dictionary.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-DynamicFloatArray.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-VarSet.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-callZlib.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-Huffman.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_float.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_double.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_int8.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_int16.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_int32.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_int64.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_uint8.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_uint16.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_uint32.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_uint64.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_uint8.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_uint16.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_uint32.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_uint64.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_float.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_double.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_int8.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_int16.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_int32.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_int64.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_float_pwr.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_double_pwr.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_float_pwr.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_double_pwr.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-ArithmeticCoding.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-exafelSZ.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-CacheTable.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-pastri.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_omp.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_float_ts.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_float_ts.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_double_ts.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szd_double_ts.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-sz_stats.lo: src/$(am__dirstamp) \
+	src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-szf.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp)
+src/libSZ_la-rwf.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp)
+src/sz_interface.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp)
+src/rw_interface.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp)
+
+# Link the library from its objects using the custom $(libSZ_la_LINK) command,
+# followed by $(libSZ_la_LIBADD) and $(LIBS).
+libSZ.la: $(libSZ_la_OBJECTS) $(libSZ_la_DEPENDENCIES) $(EXTRA_libSZ_la_DEPENDENCIES) 
+	$(AM_V_GEN)$(libSZ_la_LINK) $(am_libSZ_la_rpath) $(libSZ_la_OBJECTS) $(libSZ_la_LIBADD) $(LIBS)
+
+# Remove compiled objects from the current directory and from src/, plus the
+# libtool objects in src/.  Failures are ignored (leading '-').
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+	-rm -f src/*.$(OBJEXT)
+	-rm -f src/*.lo
+
+# Remove *.tab.c files from the current directory.
+distclean-compile:
+	-rm -f *.tab.c
+
+# Include one dependency file per object when the AMDEP conditional is true.
+# The include keyword and quoting style come from the @am__include@ and
+# @am__quote@ substitutions.  Each line carries a trailing
+# "# am--include-marker" comment; presumably the generating tools use it to
+# find these lines, so keep it intact.
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-ArithmeticCoding.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-ByteToolkit.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-CacheTable.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-CompressElement.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-DynamicByteArray.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-DynamicDoubleArray.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-DynamicFloatArray.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-DynamicIntArray.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-Huffman.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-MultiLevelCacheTable.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-MultiLevelCacheTableWideInterval.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-TightDataPointStorageD.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-TightDataPointStorageF.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-TightDataPointStorageI.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-TypeManager.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-VarSet.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-callZlib.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-conf.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-dataCompression.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-dictionary.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-exafelSZ.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-iniparser.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-pastri.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-rw.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-rwf.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_double.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_double_pwr.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_double_ts.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_float.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_float_pwr.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_float_ts.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_int16.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_int32.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_int64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_int8.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_omp.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_stats.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_uint16.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_uint32.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_uint64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-sz_uint8.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_double.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_double_pwr.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_double_ts.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_float.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_float_pwr.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_float_ts.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_int16.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_int32.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_int64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_int8.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_uint16.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_uint32.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_uint64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szd_uint8.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-szf.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/libSZ_la-utility.Plo@am__quote@ # am--include-marker
+
+# Create a placeholder for each listed dependency file: make its directory,
+# write "# dummy" to a temporary file, then rename it into place.
+$(am__depfiles_remade):
+	@$(MKDIR_P) $(@D)
+	@echo '# dummy' >$@-t && $(am__mv) $@-t $@
+
+# Convenience target that (re)creates all placeholder dependency files.
+am--depfiles: $(am__depfiles_remade)
+
+# Suffix rules for preprocessed Fortran: plain object, object via
+# $(CYGPATH_W) path conversion (.obj), and libtool object.  No dependency
+# tracking is done for these.
+.F90.o:
+	$(AM_V_PPFC)$(PPFCCOMPILE) -c -o $@ $<
+
+.F90.obj:
+	$(AM_V_PPFC)$(PPFCCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.F90.lo:
+	$(AM_V_PPFC)$(LTPPFCCOMPILE) -c -o $@ $<
+
+# Suffix rules for C.  Each rule has three conditional variants:
+#  - am__fastdepCC true: compile with -MT/-MD/-MP/-MF so the compiler writes
+#    a .Tpo file under $(DEPDIR), then rename it to the final depfile;
+#  - AMDEP true without fastdep: hand the compile to $(depcomp);
+#  - otherwise: plain compile.
+.c.o:
+@am__fastdepCC_TRUE@	$(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\
+@am__fastdepCC_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\
+@am__fastdepCC_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\
+@am__fastdepCC_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
+
+# Explicit per-object rules.  They follow the same three-variant scheme as the
+# .c.lo suffix rule, but compile with the per-target $(libSZ_la_CFLAGS) and
+# write objects named libSZ_la-<source>.lo.  The source path is used as-is
+# when it exists relative to the current directory and is prefixed with
+# $(srcdir)/ otherwise.
+src/libSZ_la-MultiLevelCacheTable.lo: src/MultiLevelCacheTable.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-MultiLevelCacheTable.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-MultiLevelCacheTable.Tpo -c -o src/libSZ_la-MultiLevelCacheTable.lo `test -f 'src/MultiLevelCacheTable.c' || echo '$(srcdir)/'`src/MultiLevelCacheTable.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-MultiLevelCacheTable.Tpo src/$(DEPDIR)/libSZ_la-MultiLevelCacheTable.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/MultiLevelCacheTable.c' object='src/libSZ_la-MultiLevelCacheTable.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-MultiLevelCacheTable.lo `test -f 'src/MultiLevelCacheTable.c' || echo '$(srcdir)/'`src/MultiLevelCacheTable.c
+
+src/libSZ_la-MultiLevelCacheTableWideInterval.lo: src/MultiLevelCacheTableWideInterval.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-MultiLevelCacheTableWideInterval.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-MultiLevelCacheTableWideInterval.Tpo -c -o src/libSZ_la-MultiLevelCacheTableWideInterval.lo `test -f 'src/MultiLevelCacheTableWideInterval.c' || echo '$(srcdir)/'`src/MultiLevelCacheTableWideInterval.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-MultiLevelCacheTableWideInterval.Tpo src/$(DEPDIR)/libSZ_la-MultiLevelCacheTableWideInterval.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/MultiLevelCacheTableWideInterval.c' object='src/libSZ_la-MultiLevelCacheTableWideInterval.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-MultiLevelCacheTableWideInterval.lo `test -f 'src/MultiLevelCacheTableWideInterval.c' || echo '$(srcdir)/'`src/MultiLevelCacheTableWideInterval.c
+
+src/libSZ_la-ByteToolkit.lo: src/ByteToolkit.c # libtool compile of src/ByteToolkit.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-ByteToolkit.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-ByteToolkit.Tpo -c -o src/libSZ_la-ByteToolkit.lo `test -f 'src/ByteToolkit.c' || echo '$(srcdir)/'`src/ByteToolkit.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-ByteToolkit.Tpo src/$(DEPDIR)/libSZ_la-ByteToolkit.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/ByteToolkit.c' object='src/libSZ_la-ByteToolkit.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-ByteToolkit.lo `test -f 'src/ByteToolkit.c' || echo '$(srcdir)/'`src/ByteToolkit.c
+
+src/libSZ_la-dataCompression.lo: src/dataCompression.c # libtool compile of src/dataCompression.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-dataCompression.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-dataCompression.Tpo -c -o src/libSZ_la-dataCompression.lo `test -f 'src/dataCompression.c' || echo '$(srcdir)/'`src/dataCompression.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-dataCompression.Tpo src/$(DEPDIR)/libSZ_la-dataCompression.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/dataCompression.c' object='src/libSZ_la-dataCompression.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-dataCompression.lo `test -f 'src/dataCompression.c' || echo '$(srcdir)/'`src/dataCompression.c
+
+src/libSZ_la-DynamicIntArray.lo: src/DynamicIntArray.c # libtool compile of src/DynamicIntArray.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-DynamicIntArray.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-DynamicIntArray.Tpo -c -o src/libSZ_la-DynamicIntArray.lo `test -f 'src/DynamicIntArray.c' || echo '$(srcdir)/'`src/DynamicIntArray.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-DynamicIntArray.Tpo src/$(DEPDIR)/libSZ_la-DynamicIntArray.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/DynamicIntArray.c' object='src/libSZ_la-DynamicIntArray.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-DynamicIntArray.lo `test -f 'src/DynamicIntArray.c' || echo '$(srcdir)/'`src/DynamicIntArray.c
+
+src/libSZ_la-iniparser.lo: src/iniparser.c # libtool compile of src/iniparser.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-iniparser.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-iniparser.Tpo -c -o src/libSZ_la-iniparser.lo `test -f 'src/iniparser.c' || echo '$(srcdir)/'`src/iniparser.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-iniparser.Tpo src/$(DEPDIR)/libSZ_la-iniparser.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/iniparser.c' object='src/libSZ_la-iniparser.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-iniparser.lo `test -f 'src/iniparser.c' || echo '$(srcdir)/'`src/iniparser.c
+
+src/libSZ_la-CompressElement.lo: src/CompressElement.c # libtool compile of src/CompressElement.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-CompressElement.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-CompressElement.Tpo -c -o src/libSZ_la-CompressElement.lo `test -f 'src/CompressElement.c' || echo '$(srcdir)/'`src/CompressElement.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-CompressElement.Tpo src/$(DEPDIR)/libSZ_la-CompressElement.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/CompressElement.c' object='src/libSZ_la-CompressElement.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-CompressElement.lo `test -f 'src/CompressElement.c' || echo '$(srcdir)/'`src/CompressElement.c
+
+src/libSZ_la-DynamicByteArray.lo: src/DynamicByteArray.c # libtool compile of src/DynamicByteArray.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-DynamicByteArray.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-DynamicByteArray.Tpo -c -o src/libSZ_la-DynamicByteArray.lo `test -f 'src/DynamicByteArray.c' || echo '$(srcdir)/'`src/DynamicByteArray.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-DynamicByteArray.Tpo src/$(DEPDIR)/libSZ_la-DynamicByteArray.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/DynamicByteArray.c' object='src/libSZ_la-DynamicByteArray.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-DynamicByteArray.lo `test -f 'src/DynamicByteArray.c' || echo '$(srcdir)/'`src/DynamicByteArray.c
+
+src/libSZ_la-rw.lo: src/rw.c # libtool compile of src/rw.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-rw.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-rw.Tpo -c -o src/libSZ_la-rw.lo `test -f 'src/rw.c' || echo '$(srcdir)/'`src/rw.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-rw.Tpo src/$(DEPDIR)/libSZ_la-rw.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/rw.c' object='src/libSZ_la-rw.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-rw.lo `test -f 'src/rw.c' || echo '$(srcdir)/'`src/rw.c
+
+src/libSZ_la-utility.lo: src/utility.c # libtool compile of src/utility.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-utility.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-utility.Tpo -c -o src/libSZ_la-utility.lo `test -f 'src/utility.c' || echo '$(srcdir)/'`src/utility.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-utility.Tpo src/$(DEPDIR)/libSZ_la-utility.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/utility.c' object='src/libSZ_la-utility.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-utility.lo `test -f 'src/utility.c' || echo '$(srcdir)/'`src/utility.c
+
+src/libSZ_la-TightDataPointStorageI.lo: src/TightDataPointStorageI.c # libtool compile of src/TightDataPointStorageI.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-TightDataPointStorageI.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-TightDataPointStorageI.Tpo -c -o src/libSZ_la-TightDataPointStorageI.lo `test -f 'src/TightDataPointStorageI.c' || echo '$(srcdir)/'`src/TightDataPointStorageI.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-TightDataPointStorageI.Tpo src/$(DEPDIR)/libSZ_la-TightDataPointStorageI.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/TightDataPointStorageI.c' object='src/libSZ_la-TightDataPointStorageI.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-TightDataPointStorageI.lo `test -f 'src/TightDataPointStorageI.c' || echo '$(srcdir)/'`src/TightDataPointStorageI.c
+
+src/libSZ_la-TightDataPointStorageD.lo: src/TightDataPointStorageD.c # libtool compile of src/TightDataPointStorageD.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-TightDataPointStorageD.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-TightDataPointStorageD.Tpo -c -o src/libSZ_la-TightDataPointStorageD.lo `test -f 'src/TightDataPointStorageD.c' || echo '$(srcdir)/'`src/TightDataPointStorageD.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-TightDataPointStorageD.Tpo src/$(DEPDIR)/libSZ_la-TightDataPointStorageD.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/TightDataPointStorageD.c' object='src/libSZ_la-TightDataPointStorageD.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-TightDataPointStorageD.lo `test -f 'src/TightDataPointStorageD.c' || echo '$(srcdir)/'`src/TightDataPointStorageD.c
+
+src/libSZ_la-TightDataPointStorageF.lo: src/TightDataPointStorageF.c # libtool compile of src/TightDataPointStorageF.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-TightDataPointStorageF.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-TightDataPointStorageF.Tpo -c -o src/libSZ_la-TightDataPointStorageF.lo `test -f 'src/TightDataPointStorageF.c' || echo '$(srcdir)/'`src/TightDataPointStorageF.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-TightDataPointStorageF.Tpo src/$(DEPDIR)/libSZ_la-TightDataPointStorageF.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/TightDataPointStorageF.c' object='src/libSZ_la-TightDataPointStorageF.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-TightDataPointStorageF.lo `test -f 'src/TightDataPointStorageF.c' || echo '$(srcdir)/'`src/TightDataPointStorageF.c
+
+src/libSZ_la-conf.lo: src/conf.c # libtool compile of src/conf.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-conf.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-conf.Tpo -c -o src/libSZ_la-conf.lo `test -f 'src/conf.c' || echo '$(srcdir)/'`src/conf.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-conf.Tpo src/$(DEPDIR)/libSZ_la-conf.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/conf.c' object='src/libSZ_la-conf.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-conf.lo `test -f 'src/conf.c' || echo '$(srcdir)/'`src/conf.c
+
+src/libSZ_la-DynamicDoubleArray.lo: src/DynamicDoubleArray.c # libtool compile of src/DynamicDoubleArray.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-DynamicDoubleArray.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-DynamicDoubleArray.Tpo -c -o src/libSZ_la-DynamicDoubleArray.lo `test -f 'src/DynamicDoubleArray.c' || echo '$(srcdir)/'`src/DynamicDoubleArray.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-DynamicDoubleArray.Tpo src/$(DEPDIR)/libSZ_la-DynamicDoubleArray.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/DynamicDoubleArray.c' object='src/libSZ_la-DynamicDoubleArray.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-DynamicDoubleArray.lo `test -f 'src/DynamicDoubleArray.c' || echo '$(srcdir)/'`src/DynamicDoubleArray.c
+
+src/libSZ_la-TypeManager.lo: src/TypeManager.c # libtool compile of src/TypeManager.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-TypeManager.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-TypeManager.Tpo -c -o src/libSZ_la-TypeManager.lo `test -f 'src/TypeManager.c' || echo '$(srcdir)/'`src/TypeManager.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-TypeManager.Tpo src/$(DEPDIR)/libSZ_la-TypeManager.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/TypeManager.c' object='src/libSZ_la-TypeManager.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-TypeManager.lo `test -f 'src/TypeManager.c' || echo '$(srcdir)/'`src/TypeManager.c
+
+src/libSZ_la-dictionary.lo: src/dictionary.c # libtool compile of src/dictionary.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-dictionary.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-dictionary.Tpo -c -o src/libSZ_la-dictionary.lo `test -f 'src/dictionary.c' || echo '$(srcdir)/'`src/dictionary.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-dictionary.Tpo src/$(DEPDIR)/libSZ_la-dictionary.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/dictionary.c' object='src/libSZ_la-dictionary.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-dictionary.lo `test -f 'src/dictionary.c' || echo '$(srcdir)/'`src/dictionary.c
+
+src/libSZ_la-DynamicFloatArray.lo: src/DynamicFloatArray.c # libtool compile of src/DynamicFloatArray.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-DynamicFloatArray.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-DynamicFloatArray.Tpo -c -o src/libSZ_la-DynamicFloatArray.lo `test -f 'src/DynamicFloatArray.c' || echo '$(srcdir)/'`src/DynamicFloatArray.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-DynamicFloatArray.Tpo src/$(DEPDIR)/libSZ_la-DynamicFloatArray.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/DynamicFloatArray.c' object='src/libSZ_la-DynamicFloatArray.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-DynamicFloatArray.lo `test -f 'src/DynamicFloatArray.c' || echo '$(srcdir)/'`src/DynamicFloatArray.c
+
+src/libSZ_la-VarSet.lo: src/VarSet.c # libtool compile of src/VarSet.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-VarSet.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-VarSet.Tpo -c -o src/libSZ_la-VarSet.lo `test -f 'src/VarSet.c' || echo '$(srcdir)/'`src/VarSet.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-VarSet.Tpo src/$(DEPDIR)/libSZ_la-VarSet.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/VarSet.c' object='src/libSZ_la-VarSet.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-VarSet.lo `test -f 'src/VarSet.c' || echo '$(srcdir)/'`src/VarSet.c
+
+src/libSZ_la-callZlib.lo: src/callZlib.c # libtool compile of src/callZlib.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-callZlib.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-callZlib.Tpo -c -o src/libSZ_la-callZlib.lo `test -f 'src/callZlib.c' || echo '$(srcdir)/'`src/callZlib.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-callZlib.Tpo src/$(DEPDIR)/libSZ_la-callZlib.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/callZlib.c' object='src/libSZ_la-callZlib.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-callZlib.lo `test -f 'src/callZlib.c' || echo '$(srcdir)/'`src/callZlib.c
+
+src/libSZ_la-Huffman.lo: src/Huffman.c # libtool compile of src/Huffman.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-Huffman.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-Huffman.Tpo -c -o src/libSZ_la-Huffman.lo `test -f 'src/Huffman.c' || echo '$(srcdir)/'`src/Huffman.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-Huffman.Tpo src/$(DEPDIR)/libSZ_la-Huffman.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/Huffman.c' object='src/libSZ_la-Huffman.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-Huffman.lo `test -f 'src/Huffman.c' || echo '$(srcdir)/'`src/Huffman.c
+
+src/libSZ_la-sz_float.lo: src/sz_float.c # libtool compile of src/sz_float.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_float.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_float.Tpo -c -o src/libSZ_la-sz_float.lo `test -f 'src/sz_float.c' || echo '$(srcdir)/'`src/sz_float.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_float.Tpo src/$(DEPDIR)/libSZ_la-sz_float.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_float.c' object='src/libSZ_la-sz_float.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_float.lo `test -f 'src/sz_float.c' || echo '$(srcdir)/'`src/sz_float.c
+
+src/libSZ_la-sz_double.lo: src/sz_double.c # libtool compile of src/sz_double.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_double.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_double.Tpo -c -o src/libSZ_la-sz_double.lo `test -f 'src/sz_double.c' || echo '$(srcdir)/'`src/sz_double.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_double.Tpo src/$(DEPDIR)/libSZ_la-sz_double.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_double.c' object='src/libSZ_la-sz_double.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_double.lo `test -f 'src/sz_double.c' || echo '$(srcdir)/'`src/sz_double.c
+
+src/libSZ_la-sz_int8.lo: src/sz_int8.c # libtool compile of src/sz_int8.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_int8.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_int8.Tpo -c -o src/libSZ_la-sz_int8.lo `test -f 'src/sz_int8.c' || echo '$(srcdir)/'`src/sz_int8.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_int8.Tpo src/$(DEPDIR)/libSZ_la-sz_int8.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_int8.c' object='src/libSZ_la-sz_int8.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_int8.lo `test -f 'src/sz_int8.c' || echo '$(srcdir)/'`src/sz_int8.c
+
+src/libSZ_la-sz_int16.lo: src/sz_int16.c # libtool compile of src/sz_int16.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_int16.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_int16.Tpo -c -o src/libSZ_la-sz_int16.lo `test -f 'src/sz_int16.c' || echo '$(srcdir)/'`src/sz_int16.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_int16.Tpo src/$(DEPDIR)/libSZ_la-sz_int16.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_int16.c' object='src/libSZ_la-sz_int16.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_int16.lo `test -f 'src/sz_int16.c' || echo '$(srcdir)/'`src/sz_int16.c
+
+src/libSZ_la-sz_int32.lo: src/sz_int32.c # libtool compile of src/sz_int32.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_int32.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_int32.Tpo -c -o src/libSZ_la-sz_int32.lo `test -f 'src/sz_int32.c' || echo '$(srcdir)/'`src/sz_int32.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_int32.Tpo src/$(DEPDIR)/libSZ_la-sz_int32.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_int32.c' object='src/libSZ_la-sz_int32.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_int32.lo `test -f 'src/sz_int32.c' || echo '$(srcdir)/'`src/sz_int32.c
+
+src/libSZ_la-sz_int64.lo: src/sz_int64.c # libtool compile of src/sz_int64.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_int64.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_int64.Tpo -c -o src/libSZ_la-sz_int64.lo `test -f 'src/sz_int64.c' || echo '$(srcdir)/'`src/sz_int64.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_int64.Tpo src/$(DEPDIR)/libSZ_la-sz_int64.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_int64.c' object='src/libSZ_la-sz_int64.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_int64.lo `test -f 'src/sz_int64.c' || echo '$(srcdir)/'`src/sz_int64.c
+
+src/libSZ_la-sz_uint8.lo: src/sz_uint8.c # libtool compile of src/sz_uint8.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_uint8.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_uint8.Tpo -c -o src/libSZ_la-sz_uint8.lo `test -f 'src/sz_uint8.c' || echo '$(srcdir)/'`src/sz_uint8.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_uint8.Tpo src/$(DEPDIR)/libSZ_la-sz_uint8.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_uint8.c' object='src/libSZ_la-sz_uint8.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_uint8.lo `test -f 'src/sz_uint8.c' || echo '$(srcdir)/'`src/sz_uint8.c
+
+src/libSZ_la-sz_uint16.lo: src/sz_uint16.c # libtool compile of src/sz_uint16.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_uint16.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_uint16.Tpo -c -o src/libSZ_la-sz_uint16.lo `test -f 'src/sz_uint16.c' || echo '$(srcdir)/'`src/sz_uint16.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_uint16.Tpo src/$(DEPDIR)/libSZ_la-sz_uint16.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_uint16.c' object='src/libSZ_la-sz_uint16.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_uint16.lo `test -f 'src/sz_uint16.c' || echo '$(srcdir)/'`src/sz_uint16.c
+
+src/libSZ_la-sz_uint32.lo: src/sz_uint32.c # libtool compile of src/sz_uint32.c with libSZ_la_CFLAGS. fastdepCC_TRUE lines also write a .Tpo dependency file and move it to .Plo; fastdepCC_FALSE lines run the same compile without -MT/-MD/-MP/-MF, with the depcomp prefix on the AMDEP_TRUE lines.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_uint32.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_uint32.Tpo -c -o src/libSZ_la-sz_uint32.lo `test -f 'src/sz_uint32.c' || echo '$(srcdir)/'`src/sz_uint32.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_uint32.Tpo src/$(DEPDIR)/libSZ_la-sz_uint32.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_uint32.c' object='src/libSZ_la-sz_uint32.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_uint32.lo `test -f 'src/sz_uint32.c' || echo '$(srcdir)/'`src/sz_uint32.c
+
+src/libSZ_la-sz_uint64.lo: src/sz_uint64.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_uint64.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_uint64.Tpo -c -o src/libSZ_la-sz_uint64.lo `test -f 'src/sz_uint64.c' || echo '$(srcdir)/'`src/sz_uint64.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_uint64.Tpo src/$(DEPDIR)/libSZ_la-sz_uint64.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_uint64.c' object='src/libSZ_la-sz_uint64.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_uint64.lo `test -f 'src/sz_uint64.c' || echo '$(srcdir)/'`src/sz_uint64.c
+
+src/libSZ_la-szd_uint8.lo: src/szd_uint8.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_uint8.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_uint8.Tpo -c -o src/libSZ_la-szd_uint8.lo `test -f 'src/szd_uint8.c' || echo '$(srcdir)/'`src/szd_uint8.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_uint8.Tpo src/$(DEPDIR)/libSZ_la-szd_uint8.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_uint8.c' object='src/libSZ_la-szd_uint8.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_uint8.lo `test -f 'src/szd_uint8.c' || echo '$(srcdir)/'`src/szd_uint8.c
+
+src/libSZ_la-szd_uint16.lo: src/szd_uint16.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_uint16.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_uint16.Tpo -c -o src/libSZ_la-szd_uint16.lo `test -f 'src/szd_uint16.c' || echo '$(srcdir)/'`src/szd_uint16.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_uint16.Tpo src/$(DEPDIR)/libSZ_la-szd_uint16.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_uint16.c' object='src/libSZ_la-szd_uint16.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_uint16.lo `test -f 'src/szd_uint16.c' || echo '$(srcdir)/'`src/szd_uint16.c
+
+src/libSZ_la-szd_uint32.lo: src/szd_uint32.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_uint32.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_uint32.Tpo -c -o src/libSZ_la-szd_uint32.lo `test -f 'src/szd_uint32.c' || echo '$(srcdir)/'`src/szd_uint32.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_uint32.Tpo src/$(DEPDIR)/libSZ_la-szd_uint32.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_uint32.c' object='src/libSZ_la-szd_uint32.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_uint32.lo `test -f 'src/szd_uint32.c' || echo '$(srcdir)/'`src/szd_uint32.c
+
+src/libSZ_la-szd_uint64.lo: src/szd_uint64.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_uint64.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_uint64.Tpo -c -o src/libSZ_la-szd_uint64.lo `test -f 'src/szd_uint64.c' || echo '$(srcdir)/'`src/szd_uint64.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_uint64.Tpo src/$(DEPDIR)/libSZ_la-szd_uint64.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_uint64.c' object='src/libSZ_la-szd_uint64.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_uint64.lo `test -f 'src/szd_uint64.c' || echo '$(srcdir)/'`src/szd_uint64.c
+
+src/libSZ_la-szd_float.lo: src/szd_float.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_float.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_float.Tpo -c -o src/libSZ_la-szd_float.lo `test -f 'src/szd_float.c' || echo '$(srcdir)/'`src/szd_float.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_float.Tpo src/$(DEPDIR)/libSZ_la-szd_float.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_float.c' object='src/libSZ_la-szd_float.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_float.lo `test -f 'src/szd_float.c' || echo '$(srcdir)/'`src/szd_float.c
+
+src/libSZ_la-szd_double.lo: src/szd_double.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_double.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_double.Tpo -c -o src/libSZ_la-szd_double.lo `test -f 'src/szd_double.c' || echo '$(srcdir)/'`src/szd_double.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_double.Tpo src/$(DEPDIR)/libSZ_la-szd_double.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_double.c' object='src/libSZ_la-szd_double.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_double.lo `test -f 'src/szd_double.c' || echo '$(srcdir)/'`src/szd_double.c
+
+src/libSZ_la-szd_int8.lo: src/szd_int8.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_int8.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_int8.Tpo -c -o src/libSZ_la-szd_int8.lo `test -f 'src/szd_int8.c' || echo '$(srcdir)/'`src/szd_int8.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_int8.Tpo src/$(DEPDIR)/libSZ_la-szd_int8.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_int8.c' object='src/libSZ_la-szd_int8.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_int8.lo `test -f 'src/szd_int8.c' || echo '$(srcdir)/'`src/szd_int8.c
+
+src/libSZ_la-szd_int16.lo: src/szd_int16.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_int16.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_int16.Tpo -c -o src/libSZ_la-szd_int16.lo `test -f 'src/szd_int16.c' || echo '$(srcdir)/'`src/szd_int16.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_int16.Tpo src/$(DEPDIR)/libSZ_la-szd_int16.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_int16.c' object='src/libSZ_la-szd_int16.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_int16.lo `test -f 'src/szd_int16.c' || echo '$(srcdir)/'`src/szd_int16.c
+
+src/libSZ_la-szd_int32.lo: src/szd_int32.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_int32.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_int32.Tpo -c -o src/libSZ_la-szd_int32.lo `test -f 'src/szd_int32.c' || echo '$(srcdir)/'`src/szd_int32.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_int32.Tpo src/$(DEPDIR)/libSZ_la-szd_int32.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_int32.c' object='src/libSZ_la-szd_int32.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_int32.lo `test -f 'src/szd_int32.c' || echo '$(srcdir)/'`src/szd_int32.c
+
+src/libSZ_la-szd_int64.lo: src/szd_int64.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_int64.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_int64.Tpo -c -o src/libSZ_la-szd_int64.lo `test -f 'src/szd_int64.c' || echo '$(srcdir)/'`src/szd_int64.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_int64.Tpo src/$(DEPDIR)/libSZ_la-szd_int64.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_int64.c' object='src/libSZ_la-szd_int64.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_int64.lo `test -f 'src/szd_int64.c' || echo '$(srcdir)/'`src/szd_int64.c
+
+src/libSZ_la-sz.lo: src/sz.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz.Tpo -c -o src/libSZ_la-sz.lo `test -f 'src/sz.c' || echo '$(srcdir)/'`src/sz.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz.Tpo src/$(DEPDIR)/libSZ_la-sz.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz.c' object='src/libSZ_la-sz.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz.lo `test -f 'src/sz.c' || echo '$(srcdir)/'`src/sz.c
+
+src/libSZ_la-sz_float_pwr.lo: src/sz_float_pwr.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_float_pwr.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_float_pwr.Tpo -c -o src/libSZ_la-sz_float_pwr.lo `test -f 'src/sz_float_pwr.c' || echo '$(srcdir)/'`src/sz_float_pwr.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_float_pwr.Tpo src/$(DEPDIR)/libSZ_la-sz_float_pwr.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_float_pwr.c' object='src/libSZ_la-sz_float_pwr.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_float_pwr.lo `test -f 'src/sz_float_pwr.c' || echo '$(srcdir)/'`src/sz_float_pwr.c
+
+src/libSZ_la-sz_double_pwr.lo: src/sz_double_pwr.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_double_pwr.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_double_pwr.Tpo -c -o src/libSZ_la-sz_double_pwr.lo `test -f 'src/sz_double_pwr.c' || echo '$(srcdir)/'`src/sz_double_pwr.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_double_pwr.Tpo src/$(DEPDIR)/libSZ_la-sz_double_pwr.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_double_pwr.c' object='src/libSZ_la-sz_double_pwr.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_double_pwr.lo `test -f 'src/sz_double_pwr.c' || echo '$(srcdir)/'`src/sz_double_pwr.c
+
+src/libSZ_la-szd_float_pwr.lo: src/szd_float_pwr.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_float_pwr.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_float_pwr.Tpo -c -o src/libSZ_la-szd_float_pwr.lo `test -f 'src/szd_float_pwr.c' || echo '$(srcdir)/'`src/szd_float_pwr.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_float_pwr.Tpo src/$(DEPDIR)/libSZ_la-szd_float_pwr.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_float_pwr.c' object='src/libSZ_la-szd_float_pwr.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_float_pwr.lo `test -f 'src/szd_float_pwr.c' || echo '$(srcdir)/'`src/szd_float_pwr.c
+
+src/libSZ_la-szd_double_pwr.lo: src/szd_double_pwr.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_double_pwr.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_double_pwr.Tpo -c -o src/libSZ_la-szd_double_pwr.lo `test -f 'src/szd_double_pwr.c' || echo '$(srcdir)/'`src/szd_double_pwr.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_double_pwr.Tpo src/$(DEPDIR)/libSZ_la-szd_double_pwr.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_double_pwr.c' object='src/libSZ_la-szd_double_pwr.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_double_pwr.lo `test -f 'src/szd_double_pwr.c' || echo '$(srcdir)/'`src/szd_double_pwr.c
+
+src/libSZ_la-ArithmeticCoding.lo: src/ArithmeticCoding.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-ArithmeticCoding.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-ArithmeticCoding.Tpo -c -o src/libSZ_la-ArithmeticCoding.lo `test -f 'src/ArithmeticCoding.c' || echo '$(srcdir)/'`src/ArithmeticCoding.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-ArithmeticCoding.Tpo src/$(DEPDIR)/libSZ_la-ArithmeticCoding.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/ArithmeticCoding.c' object='src/libSZ_la-ArithmeticCoding.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-ArithmeticCoding.lo `test -f 'src/ArithmeticCoding.c' || echo '$(srcdir)/'`src/ArithmeticCoding.c
+
+src/libSZ_la-exafelSZ.lo: src/exafelSZ.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-exafelSZ.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-exafelSZ.Tpo -c -o src/libSZ_la-exafelSZ.lo `test -f 'src/exafelSZ.c' || echo '$(srcdir)/'`src/exafelSZ.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-exafelSZ.Tpo src/$(DEPDIR)/libSZ_la-exafelSZ.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/exafelSZ.c' object='src/libSZ_la-exafelSZ.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-exafelSZ.lo `test -f 'src/exafelSZ.c' || echo '$(srcdir)/'`src/exafelSZ.c
+
+src/libSZ_la-CacheTable.lo: src/CacheTable.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-CacheTable.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-CacheTable.Tpo -c -o src/libSZ_la-CacheTable.lo `test -f 'src/CacheTable.c' || echo '$(srcdir)/'`src/CacheTable.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-CacheTable.Tpo src/$(DEPDIR)/libSZ_la-CacheTable.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/CacheTable.c' object='src/libSZ_la-CacheTable.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-CacheTable.lo `test -f 'src/CacheTable.c' || echo '$(srcdir)/'`src/CacheTable.c
+
+src/libSZ_la-pastri.lo: src/pastri.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-pastri.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-pastri.Tpo -c -o src/libSZ_la-pastri.lo `test -f 'src/pastri.c' || echo '$(srcdir)/'`src/pastri.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-pastri.Tpo src/$(DEPDIR)/libSZ_la-pastri.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/pastri.c' object='src/libSZ_la-pastri.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-pastri.lo `test -f 'src/pastri.c' || echo '$(srcdir)/'`src/pastri.c
+
+src/libSZ_la-sz_omp.lo: src/sz_omp.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_omp.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_omp.Tpo -c -o src/libSZ_la-sz_omp.lo `test -f 'src/sz_omp.c' || echo '$(srcdir)/'`src/sz_omp.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_omp.Tpo src/$(DEPDIR)/libSZ_la-sz_omp.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_omp.c' object='src/libSZ_la-sz_omp.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_omp.lo `test -f 'src/sz_omp.c' || echo '$(srcdir)/'`src/sz_omp.c
+
+src/libSZ_la-sz_float_ts.lo: src/sz_float_ts.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_float_ts.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_float_ts.Tpo -c -o src/libSZ_la-sz_float_ts.lo `test -f 'src/sz_float_ts.c' || echo '$(srcdir)/'`src/sz_float_ts.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_float_ts.Tpo src/$(DEPDIR)/libSZ_la-sz_float_ts.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_float_ts.c' object='src/libSZ_la-sz_float_ts.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_float_ts.lo `test -f 'src/sz_float_ts.c' || echo '$(srcdir)/'`src/sz_float_ts.c
+
+src/libSZ_la-szd_float_ts.lo: src/szd_float_ts.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_float_ts.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_float_ts.Tpo -c -o src/libSZ_la-szd_float_ts.lo `test -f 'src/szd_float_ts.c' || echo '$(srcdir)/'`src/szd_float_ts.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_float_ts.Tpo src/$(DEPDIR)/libSZ_la-szd_float_ts.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_float_ts.c' object='src/libSZ_la-szd_float_ts.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_float_ts.lo `test -f 'src/szd_float_ts.c' || echo '$(srcdir)/'`src/szd_float_ts.c
+
+src/libSZ_la-sz_double_ts.lo: src/sz_double_ts.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_double_ts.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_double_ts.Tpo -c -o src/libSZ_la-sz_double_ts.lo `test -f 'src/sz_double_ts.c' || echo '$(srcdir)/'`src/sz_double_ts.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_double_ts.Tpo src/$(DEPDIR)/libSZ_la-sz_double_ts.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_double_ts.c' object='src/libSZ_la-sz_double_ts.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_double_ts.lo `test -f 'src/sz_double_ts.c' || echo '$(srcdir)/'`src/sz_double_ts.c
+
+src/libSZ_la-szd_double_ts.lo: src/szd_double_ts.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szd_double_ts.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szd_double_ts.Tpo -c -o src/libSZ_la-szd_double_ts.lo `test -f 'src/szd_double_ts.c' || echo '$(srcdir)/'`src/szd_double_ts.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szd_double_ts.Tpo src/$(DEPDIR)/libSZ_la-szd_double_ts.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szd_double_ts.c' object='src/libSZ_la-szd_double_ts.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szd_double_ts.lo `test -f 'src/szd_double_ts.c' || echo '$(srcdir)/'`src/szd_double_ts.c
+
+src/libSZ_la-sz_stats.lo: src/sz_stats.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-sz_stats.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-sz_stats.Tpo -c -o src/libSZ_la-sz_stats.lo `test -f 'src/sz_stats.c' || echo '$(srcdir)/'`src/sz_stats.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-sz_stats.Tpo src/$(DEPDIR)/libSZ_la-sz_stats.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/sz_stats.c' object='src/libSZ_la-sz_stats.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-sz_stats.lo `test -f 'src/sz_stats.c' || echo '$(srcdir)/'`src/sz_stats.c
+
+src/libSZ_la-szf.lo: src/szf.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-szf.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-szf.Tpo -c -o src/libSZ_la-szf.lo `test -f 'src/szf.c' || echo '$(srcdir)/'`src/szf.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-szf.Tpo src/$(DEPDIR)/libSZ_la-szf.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/szf.c' object='src/libSZ_la-szf.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-szf.lo `test -f 'src/szf.c' || echo '$(srcdir)/'`src/szf.c
+
+src/libSZ_la-rwf.lo: src/rwf.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -MT src/libSZ_la-rwf.lo -MD -MP -MF src/$(DEPDIR)/libSZ_la-rwf.Tpo -c -o src/libSZ_la-rwf.lo `test -f 'src/rwf.c' || echo '$(srcdir)/'`src/rwf.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) src/$(DEPDIR)/libSZ_la-rwf.Tpo src/$(DEPDIR)/libSZ_la-rwf.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='src/rwf.c' object='src/libSZ_la-rwf.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libSZ_la_CFLAGS) $(CFLAGS) -c -o src/libSZ_la-rwf.lo `test -f 'src/rwf.c' || echo '$(srcdir)/'`src/rwf.c
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+	-rm -rf src/.libs src/_libs
+install-includeHEADERS: $(include_HEADERS)
+	@$(NORMAL_INSTALL)
+	@list='$(include_HEADERS)'; test -n "$(includedir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(includedir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(includedir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(includedir)'"; \
+	  $(INSTALL_HEADER) $$files "$(DESTDIR)$(includedir)" || exit $$?; \
+	done
+
+uninstall-includeHEADERS:
+	@$(NORMAL_UNINSTALL)
+	@list='$(include_HEADERS)'; test -n "$(includedir)" || list=; \
+	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
+	dir='$(DESTDIR)$(includedir)'; $(am__uninstall_files_from_dir)
+
+ID: $(am__tagged_files)
+	$(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-am
+TAGS: tags
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	set x; \
+	here=`pwd`; \
+	$(am__define_uniq_tagged_files); \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: ctags-am
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	$(am__define_uniq_tagged_files); \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-am
+
+cscopelist-am: $(am__tagged_files)
+	list='$(am__tagged_files)'; \
+	case "$(srcdir)" in \
+	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+	  *) sdir=$(subdir)/$(srcdir) ;; \
+	esac; \
+	for i in $$list; do \
+	  if test -f "$$i"; then \
+	    echo "$(subdir)/$$i"; \
+	  else \
+	    echo "$$sdir/$$i"; \
+	  fi; \
+	done >> $(top_builddir)/cscope.files
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) distdir-am
+
+distdir-am: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LTLIBRARIES) $(HEADERS)
+installdirs:
+	for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)"; do \
+	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+	done
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+	-rm -f src/$(DEPDIR)/$(am__dirstamp)
+	-rm -f src/$(am__dirstamp)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \
+	mostlyclean-am
+
+distclean: distclean-am
+		-rm -f src/$(DEPDIR)/libSZ_la-ArithmeticCoding.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-ByteToolkit.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-CacheTable.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-CompressElement.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-DynamicByteArray.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-DynamicDoubleArray.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-DynamicFloatArray.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-DynamicIntArray.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-Huffman.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-MultiLevelCacheTable.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-MultiLevelCacheTableWideInterval.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-TightDataPointStorageD.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-TightDataPointStorageF.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-TightDataPointStorageI.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-TypeManager.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-VarSet.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-callZlib.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-conf.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-dataCompression.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-dictionary.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-exafelSZ.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-iniparser.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-pastri.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-rw.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-rwf.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_double.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_double_pwr.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_double_ts.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_float.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_float_pwr.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_float_ts.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_int16.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_int32.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_int64.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_int8.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_omp.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_stats.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_uint16.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_uint32.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_uint64.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_uint8.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_double.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_double_pwr.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_double_ts.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_float.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_float_pwr.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_float_ts.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_int16.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_int32.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_int64.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_int8.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_uint16.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_uint32.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_uint64.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_uint8.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szf.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-utility.Plo
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am: install-includeHEADERS
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am: install-libLTLIBRARIES
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+		-rm -f src/$(DEPDIR)/libSZ_la-ArithmeticCoding.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-ByteToolkit.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-CacheTable.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-CompressElement.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-DynamicByteArray.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-DynamicDoubleArray.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-DynamicFloatArray.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-DynamicIntArray.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-Huffman.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-MultiLevelCacheTable.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-MultiLevelCacheTableWideInterval.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-TightDataPointStorageD.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-TightDataPointStorageF.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-TightDataPointStorageI.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-TypeManager.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-VarSet.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-callZlib.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-conf.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-dataCompression.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-dictionary.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-exafelSZ.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-iniparser.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-pastri.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-rw.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-rwf.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_double.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_double_pwr.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_double_ts.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_float.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_float_pwr.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_float_ts.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_int16.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_int32.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_int64.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_int8.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_omp.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_stats.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_uint16.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_uint32.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_uint64.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-sz_uint8.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_double.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_double_pwr.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_double_ts.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_float.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_float_pwr.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_float_ts.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_int16.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_int32.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_int64.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_int8.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_uint16.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_uint32.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_uint64.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szd_uint8.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-szf.Plo
+	-rm -f src/$(DEPDIR)/libSZ_la-utility.Plo
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-includeHEADERS uninstall-libLTLIBRARIES
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \
+	clean-generic clean-libLTLIBRARIES clean-libtool cscopelist-am \
+	ctags ctags-am distclean distclean-compile distclean-generic \
+	distclean-libtool distclean-tags distdir dvi dvi-am html \
+	html-am info info-am install install-am install-data \
+	install-data-am install-dvi install-dvi-am install-exec \
+	install-exec-am install-html install-html-am \
+	install-includeHEADERS install-info install-info-am \
+	install-libLTLIBRARIES install-man install-pdf install-pdf-am \
+	install-ps install-ps-am install-strip installcheck \
+	installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags tags-am uninstall uninstall-am uninstall-includeHEADERS \
+	uninstall-libLTLIBRARIES
+
+.PRECIOUS: Makefile
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/deps/SZ/sz/include/ArithmeticCoding.h b/deps/SZ/sz/include/ArithmeticCoding.h
new file mode 100644
index 0000000000000000000000000000000000000000..010a2513215982e239cb0b421697985fd67f6a71
--- /dev/null
+++ b/deps/SZ/sz/include/ArithmeticCoding.h
@@ -0,0 +1,67 @@
+/**
+ *  @file ArithmeticCoding.h
+ *  @author Sheng Di
+ *  @date Dec, 2018
+ *  @brief Header file for the ArithmeticCoding.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _ArithmeticCoding_H
+#define _ArithmeticCoding_H
+
+/* Standard headers are pulled in before the extern "C" block so that a C++
+ * translation unit never sees library declarations under forced C linkage. */
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ONE_FOURTH (0x40000000000) //44 bits are absolutely enough to deal with a large dataset (support at most 16TB per process)
+#define ONE_HALF (0x80000000000)
+#define THREE_FOURTHS (0xC0000000000)
+#define MAX_CODE (0xFFFFFFFFFFF)
+#define MAX_INTERVALS 1048576 //the limit to the arithmetic coding (at most 2^(20) intervals)
+
+/* NOTE(review): the code constants above need 44 bits, while this API passes
+ * values as size_t; if those values can reach MAX_CODE, a 32-bit size_t is too
+ * narrow -- confirm for 32-bit targets. */
+typedef struct Prob {
+    size_t low;
+    size_t high;
+    int state;
+} Prob;
+
+typedef struct AriCoder
+{
+	int numOfRealStates; //the # real states means the number of states after the optimization of # intervals
+	int numOfValidStates; //the # valid states means the number of non-zero frequency cells (some states/codes actually didn't appear)
+	size_t total_frequency;
+	Prob* cumulative_frequency; //used to encode data more efficiently
+} AriCoder;
+
+void output_bit_1(unsigned int* buf);
+void output_bit_0(unsigned int* buf);
+unsigned int output_bit_1_plus_pending(int pending_bits);
+unsigned int output_bit_0_plus_pending(int pending_bits);
+
+AriCoder *createAriCoder(int numOfStates, int *s, size_t length);
+void freeAriCoder(AriCoder *ariCoder);
+void ari_init(AriCoder *ariCoder, int *s, size_t length);
+unsigned int pad_ariCoder(AriCoder* ariCoder, unsigned char** out);
+int unpad_ariCoder(AriCoder** ariCoder, unsigned char* bytes);
+
+unsigned char get_bit(unsigned char* p, int offset);
+
+void ari_encode(AriCoder *ariCoder, int *s, size_t length, unsigned char *out, size_t *outSize);
+void ari_decode(AriCoder *ariCoder, unsigned char *s, size_t s_len, size_t targetLength, int *out);
+
+Prob* getCode(AriCoder *ariCoder, size_t scaled_value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _ArithmeticCoding_H  ----- */
+
diff --git a/deps/SZ/sz/include/ByteToolkit.h b/deps/SZ/sz/include/ByteToolkit.h
new file mode 100644
index 0000000000000000000000000000000000000000..e88bf020f56c92c36777e58b199f9f0450e3461b
--- /dev/null
+++ b/deps/SZ/sz/include/ByteToolkit.h
@@ -0,0 +1,87 @@
+/**
+ *  @file ByteToolkit.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the ByteToolkit.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _ByteToolkit_H
+#define _ByteToolkit_H
+
+/* Standard headers stay outside the extern "C" block; <stdint.h> supplies the
+ * fixed-width integer types that the prototypes below rely on. */
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//ByteToolkit.c
+/* NOTE(review): several 64-bit helpers below are typed long / unsigned long,
+ * which is only 32 bits wide on ILP32 and LLP64 platforms -- confirm the
+ * supported targets before relying on them for 8-byte values. */
+
+unsigned short bytesToUInt16_bigEndian(unsigned char* bytes);
+unsigned int bytesToUInt32_bigEndian(unsigned char* bytes);
+unsigned long bytesToUInt64_bigEndian(unsigned char* b);
+
+short bytesToInt16_bigEndian(unsigned char* bytes);
+int bytesToInt32_bigEndian(unsigned char* bytes);
+long bytesToInt64_bigEndian(unsigned char* b);
+int bytesToInt_bigEndian(unsigned char* bytes);
+
+void intToBytes_bigEndian(unsigned char *b, unsigned int num);
+
+void int64ToBytes_bigEndian(unsigned char *b, uint64_t num);
+void int32ToBytes_bigEndian(unsigned char *b, uint32_t num);
+void int16ToBytes_bigEndian(unsigned char *b, uint16_t num);
+
+long bytesToLong_bigEndian(unsigned char* b);
+void longToBytes_bigEndian(unsigned char *b, unsigned long num);
+long doubleToOSEndianLong(double value);
+int floatToOSEndianInt(float value);
+short getExponent_float(float value);
+short getPrecisionReqLength_float(float precision);
+short getExponent_double(double value);
+short getPrecisionReqLength_double(double precision);
+unsigned char numberOfLeadingZeros_Int(int i);
+unsigned char numberOfLeadingZeros_Long(long i);
+unsigned char getLeadingNumbers_Int(int v1, int v2);
+unsigned char getLeadingNumbers_Long(long v1, long v2);
+short bytesToShort(unsigned char* bytes);
+void shortToBytes(unsigned char* b, short value);
+int bytesToInt(unsigned char* bytes);
+long bytesToLong(unsigned char* bytes);
+float bytesToFloat(unsigned char* bytes);
+void floatToBytes(unsigned char *b, float num);
+double bytesToDouble(unsigned char* bytes);
+void doubleToBytes(unsigned char *b, double num);
+int extractBytes(unsigned char* byteArray, size_t k, int validLength);
+int getMaskRightCode(int m);
+int getLeftMovingCode(int kMod8);
+int getRightMovingSteps(int kMod8, int resiBitLength);
+int getRightMovingCode(int kMod8, int resiBitLength);
+short* convertByteDataToShortArray(unsigned char* bytes, size_t byteLength);
+unsigned short* convertByteDataToUShortArray(unsigned char* bytes, size_t byteLength);
+
+void convertShortArrayToBytes(short* states, size_t stateLength, unsigned char* bytes);
+void convertUShortArrayToBytes(unsigned short* states, size_t stateLength, unsigned char* bytes);
+void convertIntArrayToBytes(int* states, size_t stateLength, unsigned char* bytes);
+void convertUIntArrayToBytes(unsigned int* states, size_t stateLength, unsigned char* bytes);
+void convertLongArrayToBytes(int64_t* states, size_t stateLength, unsigned char* bytes);
+void convertULongArrayToBytes(uint64_t* states, size_t stateLength, unsigned char* bytes);
+
+size_t bytesToSize(unsigned char* bytes);
+void sizeToBytes(unsigned char* outBytes, size_t size);
+
+void put_codes_to_output(unsigned int buf, int bitSize, unsigned char** p, int* lackBits, size_t *outSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _ByteToolkit_H  ----- */
+
diff --git a/deps/SZ/sz/include/CacheTable.h b/deps/SZ/sz/include/CacheTable.h
new file mode 100644
index 0000000000000000000000000000000000000000..f98c8bc75a49eb8e0e275cb3d8938b11f9653dc9
--- /dev/null
+++ b/deps/SZ/sz/include/CacheTable.h
@@ -0,0 +1,45 @@
+/**
+ *  @file CacheTable.h
+ *  @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao
+ *  @date Jan, 2019
+ *  @brief Header file.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef SZ_MASTER_CACHETABLE_H
+#define SZ_MASTER_CACHETABLE_H
+
+/* Standard headers are included before the extern "C" block: in C++ <math.h>
+ * may declare overloaded functions, which are not allowed under C linkage. */
+#include <stdio.h>
+#include <stdint.h>
+#include <math.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Globals declared here; their definitions live outside this header.
+ * NOTE(review): baseIndex, topIndex and bits are very generic names for
+ * external-linkage globals -- check for clashes with other modules. */
+extern double* g_CacheTable;
+extern uint32_t * g_InverseTable;
+extern uint32_t baseIndex;
+extern uint32_t topIndex;
+extern int bits;
+
+int doubleGetExpo(double d);
+int CacheTableGetRequiredBits(double precision, int quantization_intervals);
+uint32_t CacheTableGetIndex(float value, int bits);
+uint64_t CacheTableGetIndexDouble(double value, int bits);
+int CacheTableIsInBoundary(uint32_t index);
+void CacheTableBuild(double * table, int count, double smallest, double largest, double precision, int quantization_intervals);
+uint32_t CacheTableFind(uint32_t index);
+void CacheTableFree(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //SZ_MASTER_CACHETABLE_H
diff --git a/deps/SZ/sz/include/CompressElement.h b/deps/SZ/sz/include/CompressElement.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d388c1218bc7928d80aad8ffb7ef506d95b04f9
--- /dev/null
+++ b/deps/SZ/sz/include/CompressElement.h
@@ -0,0 +1,81 @@
+/**
+ *  @file CompressElement.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Compress Elements such as DoubleCompressELement.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _CompressElement_H
+#define _CompressElement_H
+
+/* <stddef.h> provides size_t and <stdint.h> the fixed-width integer types used
+ * by the prototypes below; both stay outside the extern "C" block. */
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* NOTE(review): curValue is a long while curBytes holds 8 bytes; long is only
+ * 4 bytes wide on ILP32 and LLP64 platforms -- confirm the supported targets. */
+typedef struct DoubleValueCompressElement
+{
+	double data;
+	long curValue;
+	unsigned char curBytes[8]; //big_endian
+	int reqBytesLength;
+	int resiBitsLength;
+} DoubleValueCompressElement;
+
+typedef struct FloatValueCompressElement
+{
+	float data;
+	int curValue;
+	unsigned char curBytes[4]; //big_endian
+	int reqBytesLength;
+	int resiBitsLength;
+} FloatValueCompressElement;
+
+typedef struct LossyCompressionElement
+{
+	int leadingZeroBytes; //0,1,2,or 3
+	unsigned char integerMidBytes[8];
+	int integerMidBytes_Length; //they are mid_bits actually
+	//char curBytes[8];
+	//int curBytes_Length; //4 for single_precision or 8 for double_precision
+	int resMidBitsLength;
+	int residualMidBits;
+} LossyCompressionElement;
+
+char* decompressGroupIDArray(unsigned char* bytes, size_t dataLength);
+
+short computeGroupNum_float(float value);
+short computeGroupNum_double(double value);
+
+void listAdd_double(double last3CmprsData[3], double value);
+void listAdd_float(float last3CmprsData[3], float value);
+void listAdd_int(int64_t last3CmprsData[3], int64_t value);
+void listAdd_int32(int32_t last3CmprsData[3], int32_t value);
+void listAdd_float_group(float *groups, int *flags, char groupNum, float oriValue, float decValue, char* curGroupID);
+void listAdd_double_group(double *groups, int *flags, char groupNum, double oriValue, double decValue, char* curGroupID);
+
+int validPrediction_double(double minErr, double precision);
+int validPrediction_float(float minErr, float precision);
+double* generateGroupErrBounds(int errorBoundMode, double realPrecision, double pwrErrBound);
+int generateGroupMaxIntervalCount(double* groupErrBounds);
+
+void new_LossyCompressionElement(LossyCompressionElement *lce, int leadingNum, unsigned char* intMidBytes,
+		int intMidBytes_Length, int resiMidBitsLength, int resiBits);
+void updateLossyCompElement_Double(unsigned char* curBytes, unsigned char* preBytes,
+		int reqBytesLength, int resiBitsLength,  LossyCompressionElement *lce);
+void updateLossyCompElement_Float(unsigned char* curBytes, unsigned char* preBytes,
+		int reqBytesLength, int resiBitsLength,  LossyCompressionElement *lce);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _CompressElement_H  ----- */
diff --git a/deps/SZ/sz/include/DynamicByteArray.h b/deps/SZ/sz/include/DynamicByteArray.h
new file mode 100644
index 0000000000000000000000000000000000000000..717097940fc9f7772382d7b7ebc8934b697df34b
--- /dev/null
+++ b/deps/SZ/sz/include/DynamicByteArray.h
@@ -0,0 +1,39 @@
+/**
+ *  @file DynamicByteArray.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Dynamic Byte Array.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DynamicByteArray_H
+#define _DynamicByteArray_H
+
+/* Standard header kept outside the extern "C" block; it provides size_t. */
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Byte buffer descriptor: data pointer plus size and capacity counters. */
+typedef struct DynamicByteArray
+{
+	unsigned char* array;
+	size_t size;
+	size_t capacity;
+} DynamicByteArray;
+
+void new_DBA(DynamicByteArray **dba, size_t cap);
+void convertDBAtoBytes(DynamicByteArray *dba, unsigned char** bytes);
+void free_DBA(DynamicByteArray *dba);
+unsigned char getDBA_Data(DynamicByteArray *dba, size_t pos);
+void addDBA_Data(DynamicByteArray *dba, unsigned char value);
+void memcpyDBA_Data(DynamicByteArray *dba, unsigned char* data, size_t length);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DynamicByteArray_H  ----- */
diff --git a/deps/SZ/sz/include/DynamicDoubleArray.h b/deps/SZ/sz/include/DynamicDoubleArray.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a3ef4b6a4d74fa2a54f6b6ea5ceb82b2bed6e53
--- /dev/null
+++ b/deps/SZ/sz/include/DynamicDoubleArray.h
@@ -0,0 +1,39 @@
+/**
+ *  @file DynamicDoubleArray.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Dynamic Double Array.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DynamicDoubleArray_H
+#define _DynamicDoubleArray_H
+
+/* Standard header kept outside the extern "C" block; it provides size_t. */
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* capacity is size_t (it was declared double) so that it matches the size_t
+ * `cap` accepted by new_DDA and the type of the neighbouring size field. */
+typedef struct DynamicDoubleArray
+{
+	double* array;
+	size_t size;
+	size_t capacity;
+} DynamicDoubleArray;
+
+void new_DDA(DynamicDoubleArray **dda, size_t cap);
+void convertDDAtoDoubles(DynamicDoubleArray *dda, double **data);
+void free_DDA(DynamicDoubleArray *dda);
+double getDDA_Data(DynamicDoubleArray *dda, size_t pos);
+void addDDA_Data(DynamicDoubleArray *dda, double value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DynamicDoubleArray_H  ----- */
diff --git a/deps/SZ/sz/include/DynamicFloatArray.h b/deps/SZ/sz/include/DynamicFloatArray.h
new file mode 100644
index 0000000000000000000000000000000000000000..2770f7860bc1da9eab4a478599537e7c29ec4a7f
--- /dev/null
+++ b/deps/SZ/sz/include/DynamicFloatArray.h
@@ -0,0 +1,38 @@
+/**
+ *  @file DynamicFloatArray.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Dynamic Float Array.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DynamicFloatArray_H
+#define _DynamicFloatArray_H
+
+/* Standard header kept outside the extern "C" block; it provides size_t. */
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Float buffer descriptor: data pointer plus size and capacity counters. */
+typedef struct DynamicFloatArray
+{
+	float* array;
+	size_t size;
+	size_t capacity;
+} DynamicFloatArray;
+
+void new_DFA(DynamicFloatArray **dfa, size_t cap);
+void convertDFAtoFloats(DynamicFloatArray *dfa, float **data);
+void free_DFA(DynamicFloatArray *dfa);
+float getDFA_Data(DynamicFloatArray *dfa, size_t pos);
+void addDFA_Data(DynamicFloatArray *dfa, float value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DynamicFloatArray_H  ----- */
diff --git a/deps/SZ/sz/include/DynamicIntArray.h b/deps/SZ/sz/include/DynamicIntArray.h
new file mode 100644
index 0000000000000000000000000000000000000000..c821c5712728ff816ad0643d9bd1a2bbfaff8e85
--- /dev/null
+++ b/deps/SZ/sz/include/DynamicIntArray.h
@@ -0,0 +1,38 @@
+/**
+ *  @file DynamicIntArray.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Dynamic Int Array.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DynamicIntArray_H
+#define _DynamicIntArray_H
+
+/* Standard header kept outside the extern "C" block; it provides size_t. */
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Elements are stored as unsigned char although the accessors below use int. */
+typedef struct DynamicIntArray
+{
+	unsigned char* array; //char* (one byte) is enough, don't have to be int*
+	size_t size;
+	size_t capacity;
+} DynamicIntArray;
+
+void new_DIA(DynamicIntArray **dia, size_t cap);
+void convertDIAtoInts(DynamicIntArray *dia, unsigned char **data);
+void free_DIA(DynamicIntArray *dia);
+int getDIA_Data(DynamicIntArray *dia, size_t pos);
+void addDIA_Data(DynamicIntArray *dia, int value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DynamicIntArray_H  ----- */
diff --git a/deps/SZ/sz/include/Huffman.h b/deps/SZ/sz/include/Huffman.h
new file mode 100644
index 0000000000000000000000000000000000000000..650d6dd7e9956e0f53b51dee41154ff5bda2229e
--- /dev/null
+++ b/deps/SZ/sz/include/Huffman.h
@@ -0,0 +1,82 @@
+/**
+ *  @file Huffman.h
+ *  @author Sheng Di
+ *  @date Aug., 2016
+ *  @brief Header file for the exponential segment constructor.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _Huffman_H
+#define _Huffman_H
+
+/* size_t is used throughout this header, so pull in its definition here
+ * instead of depending on whatever the including file happened to include. */
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//Note: when changing the following settings, intvCapacity in sz.h should be changed as well.
+//#define allNodes 131072
+//#define stateNum 65536
+
+typedef struct node_t {
+	struct node_t *left, *right;
+	size_t freq;
+	char t; //in_node:0; otherwise:1
+	unsigned int c;
+} *node;
+
+typedef struct HuffmanTree {
+	unsigned int stateNum;
+	unsigned int allNodes;
+	struct node_t* pool;
+	node *qqq, *qq; //the root node of the HuffmanTree is qq[1]
+	int n_nodes; //n_nodes is for compression
+	int qend;
+	unsigned long **code;
+	unsigned char *cout;
+	int n_inode; //n_inode is for decompression
+	int maxBitCount;
+} HuffmanTree;
+
+/* NOTE(review): node, init, encode and decode are very generic global
+ * identifiers -- check for clashes with other code built into the same
+ * program. They are kept unchanged here because callers depend on them. */
+HuffmanTree* createHuffmanTree(int stateNum);
+HuffmanTree* createDefaultHuffmanTree(void);
+
+node new_node(HuffmanTree *huffmanTree, size_t freq, unsigned int c, node a, node b);
+node new_node2(HuffmanTree *huffmanTree, unsigned int c, unsigned char t);
+void qinsert(HuffmanTree *huffmanTree, node n);
+node qremove(HuffmanTree *huffmanTree);
+void build_code(HuffmanTree *huffmanTree, node n, int len, unsigned long out1, unsigned long out2);
+void init(HuffmanTree *huffmanTree, int *s, size_t length);
+void init_static(HuffmanTree *huffmanTree, int *s, size_t length);
+void encode(HuffmanTree *huffmanTree, int *s, size_t length, unsigned char *out, size_t *outSize);
+
+void decode(unsigned char *s, size_t targetLength, node t, int *out);
+void decode_MSST19(unsigned char *s, size_t targetLength, node t, int *out, int maxBits);
+
+void pad_tree_uchar(HuffmanTree* huffmanTree, unsigned char* L, unsigned char* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+void pad_tree_ushort(HuffmanTree* huffmanTree, unsigned short* L, unsigned short* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+void pad_tree_uint(HuffmanTree* huffmanTree, unsigned int* L, unsigned int* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+unsigned int convert_HuffTree_to_bytes_anyStates(HuffmanTree* huffmanTree, int nodeCount, unsigned char** out);
+void unpad_tree_uchar(HuffmanTree* huffmanTree, unsigned char* L, unsigned char* R, unsigned int* C, unsigned char *t, unsigned int i, node root);
+void unpad_tree_ushort(HuffmanTree* huffmanTree, unsigned short* L, unsigned short* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+void unpad_tree_uint(HuffmanTree* huffmanTree, unsigned int* L, unsigned int* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+node reconstruct_HuffTree_from_bytes_anyStates(HuffmanTree *huffmanTree, unsigned char* bytes, int nodeCount);
+
+void encode_withTree(HuffmanTree* huffmanTree, int *s, size_t length, unsigned char **out, size_t *outSize);
+int encode_withTree_MSST19(HuffmanTree* huffmanTree, int *s, size_t length, unsigned char **out, size_t *outSize);
+void decode_withTree(HuffmanTree* huffmanTree, unsigned char *s, size_t targetLength, int *out);
+void decode_withTree_MSST19(HuffmanTree* huffmanTree, unsigned char *s, size_t targetLength, int *out, int maxBits);
+void SZ_ReleaseHuffman(HuffmanTree* huffmanTree);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/deps/SZ/sz/include/MultiLevelCacheTable.h b/deps/SZ/sz/include/MultiLevelCacheTable.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ecc931fdeafc8ceaa8085bcc4e062b047ddbf1c
--- /dev/null
+++ b/deps/SZ/sz/include/MultiLevelCacheTable.h
@@ -0,0 +1,55 @@
+/**
+ *  @file MultiLevelCacheTable.h
+ *  @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao
+ *  @date Jan, 2019
+ *  @brief Header file.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _MULTILEVELCACHETABLE_H
+#define _MULTILEVELCACHETABLE_H
+
+/* Standard headers are included before the extern "C" block so that C++ never
+ * sees library declarations under forced C linkage. <string.h> replaces the
+ * non-standard <memory.h> that was listed here. */
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct SubLevelTable{
+    uint32_t baseIndex;
+    uint32_t topIndex;
+    uint32_t* table;
+    uint8_t expoIndex;
+} SubLevelTable;
+
+typedef struct TopLevelTable{
+    uint8_t bits;
+    uint8_t baseIndex;
+    uint8_t topIndex;
+    struct SubLevelTable* subTables;
+    float bottomBoundary;
+    float topBoundary;
+} TopLevelTable;
+
+/* MLTC_RebuildFloat is spelled MLTC_ while its siblings use MLCT_; the name is
+ * kept unchanged because renaming it would break existing callers. */
+uint8_t MLCT_GetExpoIndex(float value);
+uint8_t MLCT_GetRequiredBits(float precision);
+uint32_t MLCT_GetMantiIndex(float value, int bits);
+float MLTC_RebuildFloat(uint8_t expo, uint32_t manti, int bits);
+void MultiLevelCacheTableBuild(struct TopLevelTable* topTable, float* precisionTable, int count, float precision);
+uint32_t MultiLevelCacheTableGetIndex(float value, struct TopLevelTable* topLevelTable);
+void MultiLevelCacheTableFree(struct TopLevelTable* table);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_MULTILEVELCACHETABLE_H
diff --git a/deps/SZ/sz/include/MultiLevelCacheTableWideInterval.h b/deps/SZ/sz/include/MultiLevelCacheTableWideInterval.h
new file mode 100644
index 0000000000000000000000000000000000000000..853d14bcee6f3b6f97998891149103292ce0ef9e
--- /dev/null
+++ b/deps/SZ/sz/include/MultiLevelCacheTableWideInterval.h
@@ -0,0 +1,54 @@
+/**
+ *  @file MultiLevelCacheTableWideInterval.h
+ *  @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao
+ *  @date Jan, 2019
+ *  @brief Header file for MultiLevelCacheTableWideInterval.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#ifndef _MULTILEVELCACHETABLEWIDEINTERVAL_H
+#define _MULTILEVELCACHETABLEWIDEINTERVAL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <memory.h>
+#include <stdlib.h>
+#include "stdio.h"
+
+typedef struct SubLevelTableWideInterval{ // second-level table with 64-bit index bounds and 16-bit entries
+    uint64_t baseIndex; // presumably the lowest index covered by table — TODO confirm
+    uint64_t topIndex; // presumably the highest index covered by table — TODO confirm
+    uint16_t* table; // entries are uint16_t, while MultiLevelCacheTableWideIntervalGetIndex() returns uint32_t
+    uint16_t expoIndex; // same 16-bit type that MLCTWI_GetExpoIndex() returns
+} SubLevelTableWideInterval;
+
+typedef struct TopLevelTableWideInterval{ // root object taken by the MultiLevelCacheTableWideInterval* functions
+    uint16_t bits; // same 16-bit type that MLCTWI_GetRequiredBits() returns — presumably its result, TODO confirm
+    uint16_t baseIndex; // presumably the first exponent index that has a sub-table — TODO confirm
+    uint16_t topIndex; // presumably the last exponent index that has a sub-table — TODO confirm
+    struct SubLevelTableWideInterval* subTables; // pointer to the second-level tables; this struct has no explicit length field
+    double bottomBoundary; // NOTE(review): looks like the smallest value the table handles — confirm
+    double topBoundary; // NOTE(review): looks like the largest value the table handles — confirm
+} TopLevelTableWideInterval;
+
+void freeTopLevelTableWideInterval(struct TopLevelTableWideInterval* topTable); // NOTE(review): a second free routine, MultiLevelCacheTableWideIntervalFree(), is declared below — how the two differ is not visible here
+
+uint16_t MLCTWI_GetExpoIndex(double value); // return type matches SubLevelTableWideInterval.expoIndex
+uint16_t MLCTWI_GetRequiredBits(double precision); // return type matches TopLevelTableWideInterval.bits
+uint64_t MLCTWI_GetMantiIndex(double value, int bits); // return type matches SubLevelTableWideInterval.baseIndex/topIndex
+
+double MLTCWI_RebuildDouble(uint16_t expo, uint64_t manti, int bits); // NOTE(review): prefix is "MLTCWI", not "MLCTWI" like its siblings — likely a typo; kept because renaming would break callers
+void MultiLevelCacheTableWideIntervalBuild(struct TopLevelTableWideInterval* topTable, double* precisionTable, int count, double precision, int plus_bits); // takes a caller-provided topTable; presumably count is the length of precisionTable — confirm
+uint32_t MultiLevelCacheTableWideIntervalGetIndex(double value, struct TopLevelTableWideInterval* topLevelTable); // returns uint32_t although the sub-table entries are uint16_t
+void MultiLevelCacheTableWideIntervalFree(struct TopLevelTableWideInterval* table); // whether the top-level object itself is freed is not visible here — confirm before use
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_MULTILEVELCACHETABLEWIDEINTERVAL_H
diff --git a/deps/SZ/sz/include/TightDataPointStorageD.h b/deps/SZ/sz/include/TightDataPointStorageD.h
new file mode 100644
index 0000000000000000000000000000000000000000..188dfe1556cfb2903fa5dc2ddb8a8c8ed768960c
--- /dev/null
+++ b/deps/SZ/sz/include/TightDataPointStorageD.h
@@ -0,0 +1,99 @@
+/**
+ *  @file TightDataPointStorageD.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for the tight data point storage (TDPS).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _TightDataPointStorageD_H
+#define _TightDataPointStorageD_H
+#include <stddef.h> /* size_t is used by the struct and prototypes below; without this the header only compiles if the includer already pulled in a header that defines it */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct TightDataPointStorageD // tight data point storage (TDPS) record used by the *_double conversion routines below
+{
+	size_t dataSeriesLength;
+	int allSameData; // presumably a boolean flag — TODO confirm which values are used
+	double realPrecision;
+	double medianValue;
+	char reqLength;	
+	char radExpo; //used to compute reqLength based on segmented precisions in "pw_rel_compression"; NOTE(review): the constructors below take radExpo as unsigned char
+
+	double minLogValue;
+
+	int stateNum;
+	int allNodes;
+
+	size_t exactDataNum;
+	double reservedValue;
+	
+	unsigned char* rtypeArray;
+	size_t rtypeArray_size;
+	
+	unsigned char* typeArray; //its size is dataSeriesLength/4 (or xxx/4+1) 
+	size_t typeArray_size;
+	
+	unsigned char* leadNumArray; //its size is exactDataNum/4 (or exactDataNum/4+1)
+	size_t leadNumArray_size;
+	
+	unsigned char* exactMidBytes;
+	size_t exactMidBytes_size;
+	
+	unsigned char* residualMidBits;
+	size_t residualMidBits_size;
+	
+	unsigned int intervals;
+	
+	unsigned char isLossless; //a mark to denote whether it's lossless compression (1 is yes, 0 is no)
+	
+	size_t segment_size;
+	
+	unsigned char* pwrErrBoundBytes;
+	int pwrErrBoundBytes_size; // NOTE(review): declared int, while every other *_size field and the constructors' pwrErrBoundBytes_size parameter are size_t — confirm values fit
+		
+	unsigned char* raBytes;
+	size_t raBytes_size;
+	
+	unsigned char plus_bits;
+	unsigned char max_bits;
+	
+} TightDataPointStorageD;
+
+void new_TightDataPointStorageD_Empty(TightDataPointStorageD **self); // takes a double pointer; presumably stores a newly created object in *self — confirm in the .c file
+int new_TightDataPointStorageD_fromFlatBytes(TightDataPointStorageD **self, unsigned char* flatBytes, size_t flatBytesLength); // meaning of the int return value is not visible here — TODO document
+
+void new_TightDataPointStorageD(TightDataPointStorageD **self, 
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char resiBitLength, // a single value here; new_TightDataPointStorageD2 takes a pointer plus a size instead
+		double realPrecision, double medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo); // NOTE(review): the struct declares pwrErrBoundBytes_size as int and radExpo as char
+
+void new_TightDataPointStorageD2(TightDataPointStorageD **self, 
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char* resiBitLength, size_t resiBitLengthSize, // the only parameter difference from new_TightDataPointStorageD
+		double realPrecision, double medianValue, char reqLength, unsigned int intervals,
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo);
+
+void convertTDPStoBytes_double(TightDataPointStorageD* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte); // writes into caller-provided bytes; required capacity is not visible here — confirm
+void convertTDPStoBytes_double_reserve(TightDataPointStorageD* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte);
+void convertTDPStoFlatBytes_double(TightDataPointStorageD *tdps, unsigned char** bytes, size_t *size); // takes unsigned char** — presumably allocates the output buffer, confirm who frees it
+void convertTDPStoFlatBytes_double_args(TightDataPointStorageD *tdps, unsigned char* bytes, size_t *size); // takes unsigned char* — presumably fills a caller-provided buffer, confirm
+
+void free_TightDataPointStorageD(TightDataPointStorageD *tdps);
+void free_TightDataPointStorageD2(TightDataPointStorageD *tdps); // how this differs from free_TightDataPointStorageD is not visible here — TODO document
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _TightDataPointStorageD_H  ----- */
diff --git a/deps/SZ/sz/include/TightDataPointStorageF.h b/deps/SZ/sz/include/TightDataPointStorageF.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b61c02e22941984a06c0a069c78b081812dd842
--- /dev/null
+++ b/deps/SZ/sz/include/TightDataPointStorageF.h
@@ -0,0 +1,105 @@
+/**
+ *  @file TightDataPointStorageF.h
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief Header file for the tight data point storage (TDPS).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _TightDataPointStorageF_H
+#define _TightDataPointStorageF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h> 
+
+typedef struct TightDataPointStorageF // tight data point storage (TDPS) record used by the *_float conversion routines below
+{
+	size_t dataSeriesLength;
+	int allSameData; // presumably a boolean flag — TODO confirm which values are used
+	double realPrecision; //it's used as the pwrErrBoundRatio when errBoundMode==PW_REL
+	float medianValue;
+	char reqLength;
+	char radExpo; //used to compute reqLength based on segmented precisions in "pw_rel_compression"; NOTE(review): the constructors below take radExpo as unsigned char
+	
+	int stateNum;
+	int allNodes;
+	
+	size_t exactDataNum;
+	float reservedValue;
+	
+	unsigned char* rtypeArray;
+	size_t rtypeArray_size;
+	
+	float minLogValue;
+
+	unsigned char* typeArray; //its size is dataSeriesLength/4 (or xxx/4+1) 
+	size_t typeArray_size;
+	
+	unsigned char* leadNumArray; //its size is exactDataNum/4 (or exactDataNum/4+1)
+	size_t leadNumArray_size;
+	
+	unsigned char* exactMidBytes;
+	size_t exactMidBytes_size;
+	
+	unsigned char* residualMidBits;
+	size_t residualMidBits_size;
+	
+	unsigned int intervals; //quantization_intervals
+	
+	unsigned char isLossless; //a mark to denote whether it's lossless compression (1 is yes, 0 is no)
+	
+	size_t segment_size;
+	
+	unsigned char* pwrErrBoundBytes;
+	int pwrErrBoundBytes_size; // NOTE(review): declared int, while every other *_size field and the constructors' pwrErrBoundBytes_size parameter are size_t — confirm values fit
+	
+	unsigned char* raBytes;
+	size_t raBytes_size;
+
+	unsigned char plus_bits;
+	unsigned char max_bits;
+	
+} TightDataPointStorageF;
+
+void new_TightDataPointStorageF_Empty(TightDataPointStorageF **self); // takes a double pointer; presumably stores a newly created object in *self — confirm in the .c file
+int new_TightDataPointStorageF_fromFlatBytes(TightDataPointStorageF **self, unsigned char* flatBytes, size_t flatBytesLength); // meaning of the int return value is not visible here — TODO document
+
+void new_TightDataPointStorageF(TightDataPointStorageF **self,
+		size_t dataSeriesLength, size_t exactDataNum,
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char resiBitLength, // a single value here; new_TightDataPointStorageF2 takes a pointer plus a size instead
+		double realPrecision, float medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo); // NOTE(review): the struct declares pwrErrBoundBytes_size as int and radExpo as char
+
+/**
+ * This function is designed for the first version of the point-wise relative error bound (developed by Sheng Di for the TPDS18 paper)
+ * 
+ * */
+void new_TightDataPointStorageF2(TightDataPointStorageF **self,
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char* resiBitLength, size_t resiBitLengthSize, // the only parameter difference from new_TightDataPointStorageF
+		double realPrecision, float medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo);
+
+void convertTDPStoBytes_float(TightDataPointStorageF* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte); // writes into caller-provided bytes; required capacity is not visible here — confirm
+void convertTDPStoBytes_float_reserve(TightDataPointStorageF* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte);
+void convertTDPStoFlatBytes_float(TightDataPointStorageF *tdps, unsigned char** bytes, size_t *size); // takes unsigned char** — presumably allocates the output buffer, confirm who frees it
+void convertTDPStoFlatBytes_float_args(TightDataPointStorageF *tdps, unsigned char* bytes, size_t *size); // takes unsigned char* — presumably fills a caller-provided buffer, confirm
+
+void free_TightDataPointStorageF(TightDataPointStorageF *tdps);
+void free_TightDataPointStorageF2(TightDataPointStorageF *tdps); // how this differs from free_TightDataPointStorageF is not visible here — TODO document
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _TightDataPointStorageF_H  ----- */
diff --git a/deps/SZ/sz/include/TightDataPointStorageI.h b/deps/SZ/sz/include/TightDataPointStorageI.h
new file mode 100644
index 0000000000000000000000000000000000000000..466a753e0e7a751d9a8d544adcaf8ac63fab9131
--- /dev/null
+++ b/deps/SZ/sz/include/TightDataPointStorageI.h
@@ -0,0 +1,65 @@
+/**
+ *  @file TightDataPointStorageI.h
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2017
+ *  @brief Header file for the tight data point storage (TDPS).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _TightDataPointStorageI_H
+#define _TightDataPointStorageI_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h> 
+
+typedef struct TightDataPointStorageI // tight data point storage (TDPS) record used by the *_int conversion routines below
+{
+	size_t dataSeriesLength;
+	int allSameData; // presumably a boolean flag — TODO confirm which values are used
+	double realPrecision; //it's used as the pwrErrBoundRatio when errBoundMode==PW_REL
+	size_t exactDataNum;
+	long minValue; // NOTE(review): the width of long is platform-dependent — confirm it is wide enough for every integer type stored
+	int exactByteSize;
+	int dataTypeSize; //the size of data type, e.g., it's 4 when data type is int32_t
+	
+	int stateNum;
+	int allNodes;
+	
+	unsigned char* typeArray; //its size is dataSeriesLength/4 (or xxx/4+1) 
+	size_t typeArray_size;
+	
+	unsigned char* exactDataBytes;
+	size_t exactDataBytes_size;
+	
+	unsigned int intervals; //quantization_intervals; NOTE(review): new_TightDataPointStorageI() takes intervals as plain int
+	
+	unsigned char isLossless; //a mark to denote whether it's lossless compression (1 is yes, 0 is no)
+
+} TightDataPointStorageI;
+
+int computeRightShiftBits(int exactByteSize, int dataType);
+int convertDataTypeSizeCode(int dataTypeSizeCode); // presumably the inverse of convertDataTypeSize — confirm in the .c file
+int convertDataTypeSize(int dataTypeSize);
+
+void new_TightDataPointStorageI_Empty(TightDataPointStorageI **self); // takes a double pointer; presumably stores a newly created object in *self — confirm
+int new_TightDataPointStorageI_fromFlatBytes(TightDataPointStorageI **self, unsigned char* flatBytes, size_t flatBytesLength); // meaning of the int return value is not visible here — TODO document
+void new_TightDataPointStorageI(TightDataPointStorageI **self,
+		size_t dataSeriesLength, size_t exactDataNum, int byteSize, 
+		int* type, unsigned char* exactDataBytes, size_t exactDataBytes_size,
+		double realPrecision, long minValue, int intervals, int dataType); // NOTE(review): intervals is int here but unsigned int in the struct
+
+void convertTDPStoBytes_int(TightDataPointStorageI* tdps, unsigned char* bytes, unsigned char sameByte); // writes into caller-provided bytes; required capacity is not visible here — confirm
+void convertTDPStoFlatBytes_int(TightDataPointStorageI *tdps, unsigned char** bytes, size_t *size); // takes unsigned char** — presumably allocates the output buffer, confirm who frees it
+void convertTDPStoFlatBytes_int_args(TightDataPointStorageI *tdps, unsigned char* bytes, size_t *size); // takes unsigned char* — presumably fills a caller-provided buffer, confirm
+void free_TightDataPointStorageI(TightDataPointStorageI *tdps);
+void free_TightDataPointStorageI2(TightDataPointStorageI *tdps); // how this differs from free_TightDataPointStorageI is not visible here — TODO document
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _TightDataPointStorageI_H  ----- */
diff --git a/deps/SZ/sz/include/TypeManager.h b/deps/SZ/sz/include/TypeManager.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c543d3f82aab34cad5ad8eb44e779872f02cf86
--- /dev/null
+++ b/deps/SZ/sz/include/TypeManager.h
@@ -0,0 +1,40 @@
+/**
+ *  @file TypeManager.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the TypeManager.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _TypeManager_H
+#define _TypeManager_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+
+//TypeManager.c — NOTE(review): the _1b/_2b/_3b suffixes presumably give the number of bits packed per input value; confirm in TypeManager.c
+size_t convertIntArray2ByteArray_fast_1b(unsigned char* intArray, size_t intArrayLength, unsigned char **result); // unsigned char** result: presumably allocated by the callee — confirm who frees it
+size_t convertIntArray2ByteArray_fast_1b_to_result(unsigned char* intArray, size_t intArrayLength, unsigned char *result); // unsigned char* result: presumably a caller-provided buffer — confirm required capacity
+void convertByteArray2IntArray_fast_1b(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray);
+size_t convertIntArray2ByteArray_fast_2b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result);
+size_t convertIntArray2ByteArray_fast_2b_inplace(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char *result); // single-pointer result, unlike the variant above
+void convertByteArray2IntArray_fast_2b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray);
+size_t convertIntArray2ByteArray_fast_3b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result);
+void convertByteArray2IntArray_fast_3b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray);
+int getLeftMovingSteps(size_t k, unsigned char resiBitLength);
+size_t convertIntArray2ByteArray_fast_dynamic(unsigned char* timeStepType, unsigned char resiBitLength, size_t nbEle, unsigned char **bytes); // one resiBitLength for all nbEle elements
+size_t convertIntArray2ByteArray_fast_dynamic2(unsigned char* timeStepType, unsigned char* resiBitLength, size_t resiBitLengthLength, unsigned char **bytes); // resiBitLength is a pointer with its own length here
+int computeBitNumRequired(size_t dataLength);
+void decompressBitArraybySimpleLZ77(int** result, unsigned char* bytes, size_t bytesLength, size_t totalLength, int validLength);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _TypeManager_H  ----- */
+
diff --git a/deps/SZ/sz/include/VarSet.h b/deps/SZ/sz/include/VarSet.h
new file mode 100644
index 0000000000000000000000000000000000000000..c991d72bfc1cbe1c0d184f386b6b9b8aa8bddb0b
--- /dev/null
+++ b/deps/SZ/sz/include/VarSet.h
@@ -0,0 +1,84 @@
+/**
+ *  @file VarSet.h
+ *  @author Sheng Di
+ *  @date July, 2016
+ *  @brief Header file for the Variable.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _VarSet_H
+#define _VarSet_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+typedef struct sz_multisteps // per-variable time-step state; SZ_Variable.multisteps points at one of these
+{
+	char compressionType;
+	int predictionMode;
+	int lastSnapshotStep; //the previous snapshot step
+	unsigned int currentStep; //current time step of the execution/simulation
+	
+	//void* ori_data; //original data pointer, which serves as the key for retrieving hist_data
+	void* hist_data; //historical data in past time steps; element type and ownership are not visible here — confirm
+} sz_multisteps;
+
+typedef struct SZ_Variable // one registered variable; instances are chained through the next pointer
+{
+	unsigned char var_id; // NOTE(review): SZ_batchAddVar() and SZ_getVariable() take var_id as int — confirm ids stay within unsigned char range
+	char* varName;
+	char compressType; //102 means HZ; 101 means SZ 
+	int dataType; //SZ_FLOAT or SZ_DOUBLE
+	size_t r5; // r5..r1: presumably the sizes of up to five data dimensions — TODO confirm ordering
+	size_t r4;
+	size_t r3;
+	size_t r2;
+	size_t r1;
+	int errBoundMode;
+	double absErrBound;
+	double relBoundRatio;
+	double pwRelBoundRatio;
+	void* data; // untyped; presumably interpreted according to dataType — confirm
+	sz_multisteps *multisteps;
+	unsigned char* compressedBytes;
+	size_t compressedSize; // presumably the byte length of compressedBytes — confirm
+	struct SZ_Variable* next; // next variable in the chain
+} SZ_Variable;
+
+typedef struct SZ_VarSet // container for a chain of SZ_Variable objects
+{
+	unsigned short count; // presumably the number of variables in the chain — confirm
+	struct SZ_Variable *header; // presumably the first node of the chain (possibly a sentinel) — confirm in VarSet.c
+	struct SZ_Variable *lastVar; // presumably the last node, kept for appending — confirm
+} SZ_VarSet;
+
+void free_Variable_keepOriginalData(SZ_Variable* v); // three free variants follow; exactly which members each releases is defined in the .c file
+void free_Variable_keepCompressedBytes(SZ_Variable* v);
+void free_Variable_all(SZ_Variable* v);
+void SZ_batchAddVar(int var_id, char* varName, int dataType, void* data, 
+			int errBoundMode, double absErrBound, double relBoundRatio, double pwRelBoundRatio,
+			size_t r5, size_t r4, size_t r3, size_t r2, size_t r1); // takes no SZ_VarSet argument — presumably operates on a global set, confirm
+int SZ_batchDelVar_vset(SZ_VarSet* vset, char* varName); // meaning of the int return value is not visible here — TODO document
+int SZ_batchDelVar(char* varName);
+int SZ_batchDelVar_ID_vset(SZ_VarSet* vset, int var_id);
+int SZ_batchDelVar_ID(int var_id);
+
+SZ_Variable* SZ_searchVar(char* varName); // result when no variable matches is not visible here — presumably NULL, confirm
+void* SZ_getVarData(char* varName, size_t *r5, size_t *r4, size_t *r3, size_t *r2, size_t *r1); // r5..r1 are pointers here — presumably outputs, confirm
+
+void free_VarSet_vset(SZ_VarSet *vset, int mode); // NOTE(review): mode presumably takes SZ_MAINTAIN_VAR_DATA or SZ_DESTROY_WHOLE_VARSET from defines.h — confirm
+void SZ_freeVarSet(int mode);
+
+void free_multisteps(sz_multisteps* multisteps);
+int checkVarID(unsigned char cur_var_id, unsigned char* var_ids, int var_count);
+SZ_Variable* SZ_getVariable(int var_id);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _VarSet_H  ----- */
diff --git a/deps/SZ/sz/include/callZlib.h b/deps/SZ/sz/include/callZlib.h
new file mode 100644
index 0000000000000000000000000000000000000000..1aede548c6e5d7aa30475799d1af994a3ccddad4
--- /dev/null
+++ b/deps/SZ/sz/include/callZlib.h
@@ -0,0 +1,44 @@
+/**
+ *  @file callZlib.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the callZlib.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _CallZlib_H
+#define _CallZlib_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define SZ_ZLIB_BUFFER_SIZE 1048576	(alternative value, 1 MiB, left disabled)
+#define SZ_ZLIB_BUFFER_SIZE 65536 // 64 KiB; the same figure appears in the name zlib_uncompress65536bytes() below
+
+#include <stdio.h>
+
+int isZlibFormat(unsigned char magic1, unsigned char magic2); // takes two bytes — presumably the first two bytes of a stream, confirm
+
+//callZlib.c — NOTE(review): how the numbered variants differ is not visible in this header; document each in callZlib.c
+unsigned long zlib_compress(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level);
+unsigned long zlib_compress2(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level);
+unsigned long zlib_compress3(unsigned char* data, unsigned long dataLength, unsigned char* compressBytes, int level); // only compress variant taking unsigned char* (not **) — presumably a caller-provided buffer, confirm
+unsigned long zlib_compress4(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level);
+unsigned long zlib_compress5(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level);
+
+unsigned long zlib_uncompress4(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long zlib_uncompress5(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long zlib_uncompress(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long zlib_uncompress2(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long zlib_uncompress3(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+
+unsigned long zlib_uncompress65536bytes(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData); // no targetOriSize parameter, unlike the other uncompress variants
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _CallZlib_H  ----- */
+
diff --git a/deps/SZ/sz/include/conf.h b/deps/SZ/sz/include/conf.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f65403194f72a3f2c9a6114db85b6d3d3ecb87f
--- /dev/null
+++ b/deps/SZ/sz/include/conf.h
@@ -0,0 +1,37 @@
+/**
+ *  @file conf.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the conf.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _Conf_H
+#define _Conf_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+//conf.c
+void updateQuantizationInfo(int quant_intervals);
+int SZ_ReadConf(const char* sz_cfgFile); // meaning of the int return value is not visible here — presumably one of the SZ_* status codes, confirm
+int SZ_LoadConf(const char* sz_cfgFile);
+int checkVersion(char* version);
+int computeVersion(int major, int minor, int revision);
+int checkVersion2(char* version);
+
+void initSZ_TSC(void); // explicit (void): in C an empty parameter list declares no prototype, so argument mismatches would go unchecked
+unsigned int roundUpToPowerOf2(unsigned int base);
+double computeABSErrBoundFromPSNR(double psnr, double threshold, double value_range);
+double computeABSErrBoundFromNORM_ERR(double normErr, size_t nbEle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _Conf_H  ----- */
+
diff --git a/deps/SZ/sz/include/dataCompression.h b/deps/SZ/sz/include/dataCompression.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e4993958984970c22db72d11c455544e41e6a1a
--- /dev/null
+++ b/deps/SZ/sz/include/dataCompression.h
@@ -0,0 +1,104 @@
+/**
+ *  @file dataCompression.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the dataCompression.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DataCompression_H
+#define _DataCompression_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "sz.h"
+#include <stdio.h>
+#include <stdbool.h>
+
+#define computeMinMax(data) /* scans data[1..size-1], updating min/max; the caller must declare i, size, data_, min and max, with min/max already initialised */ \
+        for(i=1;i<size;i++)\
+        {\
+                data_ = data[i];\
+                if(min>data_)\
+                        min = data_;\
+                else if(max<data_)\
+                        max = data_;\
+        }
+
+
+//dataCompression.c — NOTE(review): the fixed-width integer types used below need <stdint.h>, which this header does not include directly; presumably it arrives through "sz.h" — confirm
+int computeByteSizePerIntValue(long valueRangeSize);
+long computeRangeSize_int(void* oriData, int dataType, size_t size, int64_t* valueRangeSize); // returns long while writing the range through an int64_t pointer
+double computeRangeSize_double(double* oriData, size_t size, double* valueRangeSize, double* medianValue); // valueRangeSize and medianValue are pointers — presumably outputs, confirm
+float computeRangeSize_float(float* oriData, size_t size, float* valueRangeSize, float* medianValue);
+float computeRangeSize_float_MSST19(float* oriData, size_t size, float* valueRangeSize, float* medianValue, unsigned char * signs, bool* positive, float* nearZero);
+double computeRangeSize_double_MSST19(double* oriData, size_t size, double* valueRangeSize, double* medianValue, unsigned char * signs, bool* positive, double* nearZero);
+
+double computeRangeSize_double_subblock(double* oriData, double* valueRangeSize, double* medianValue,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1); // r*/s*/e*: presumably dimension sizes and per-dimension start/end indices — TODO confirm
+float computeRangeSize_float_subblock(float* oriData, float* valueRangeSize, float* medianValue,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1);
+double min_d(double a, double b);
+double max_d(double a, double b);
+float min_f(float a, float b);
+float max_f(float a, float b);
+double getRealPrecision_double(double valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status); // status is a pointer — presumably receives one of the SZ_* status codes, confirm
+double getRealPrecision_float(float valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status); // returns double even for float input
+double getRealPrecision_int(long valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status);
+void symTransform_8bytes(unsigned char data[8]);
+void symTransform_2bytes(unsigned char data[2]);
+void symTransform_4bytes(unsigned char data[4]);
+
+void compressInt8Value(int8_t tgtValue, int8_t minValue, int byteSize, unsigned char* bytes);
+void compressInt16Value(int16_t tgtValue, int16_t minValue, int byteSize, unsigned char* bytes);
+void compressInt32Value(int32_t tgtValue, int32_t minValue, int byteSize, unsigned char* bytes);
+void compressInt64Value(int64_t tgtValue, int64_t minValue, int byteSize, unsigned char* bytes);
+
+void compressUInt8Value(uint8_t tgtValue, uint8_t minValue, int byteSize, unsigned char* bytes);
+void compressUInt16Value(uint16_t tgtValue, uint16_t minValue, int byteSize, unsigned char* bytes);
+void compressUInt32Value(uint32_t tgtValue, uint32_t minValue, int byteSize, unsigned char* bytes);
+void compressUInt64Value(uint64_t tgtValue, uint64_t minValue, int byteSize, unsigned char* bytes);
+
+void compressSingleFloatValue(FloatValueCompressElement *vce, float tgtValue, float precision, float medianValue, 
+		int reqLength, int reqBytesLength, int resiBitsLength);
+void compressSingleFloatValue_MSST19(FloatValueCompressElement *vce, float tgtValue, float precision, int reqLength, int reqBytesLength, int resiBitsLength); // no medianValue parameter, unlike compressSingleFloatValue
+void compressSingleDoubleValue(DoubleValueCompressElement *vce, double tgtValue, double precision, double medianValue, 
+		int reqLength, int reqBytesLength, int resiBitsLength);
+void compressSingleDoubleValue_MSST19(DoubleValueCompressElement *vce, double tgtValue, double precision, int reqLength, int reqBytesLength, int resiBitsLength); // no medianValue parameter, unlike compressSingleDoubleValue
+                              
+int compIdenticalLeadingBytesCount_double(unsigned char* preBytes, unsigned char* curBytes);
+int compIdenticalLeadingBytesCount_float(unsigned char* preBytes, unsigned char* curBytes);
+void addExactData(DynamicByteArray *exactMidByteArray, DynamicIntArray *exactLeadNumArray, 
+		DynamicIntArray *resiBitArray, LossyCompressionElement *lce);
+
+int getPredictionCoefficients(int layers, int dimension, int **coeff_array, int *status);
+
+int computeBlockEdgeSize_3D(int segmentSize);
+int computeBlockEdgeSize_2D(int segmentSize);
+int initRandomAccessBytes(unsigned char* raBytes);
+
+int generateLossyCoefficients_float(float* oriData, double precision, size_t nbEle, int* reqBytesLength, int* resiBitsLength, float* medianValue, float* decData);
+int compressExactDataArray_float(float* oriData, double precision, size_t nbEle, unsigned char** leadArray, unsigned char** midArray, unsigned char** resiArray, 
+int reqLength, int reqBytesLength, int resiBitsLength, float medianValue); // the three unsigned char** arguments are presumably callee-allocated outputs — confirm who frees them
+
+void decompressExactDataArray_float(unsigned char* leadNum, unsigned char* exactMidBytes, unsigned char* residualMidBits, size_t nbEle, int reqLength, float medianValue, float** decData);
+
+int generateLossyCoefficients_double(double* oriData, double precision, size_t nbEle, int* reqBytesLength, int* resiBitsLength, double* medianValue, double* decData);
+int compressExactDataArray_double(double* oriData, double precision, size_t nbEle, unsigned char** leadArray, unsigned char** midArray, unsigned char** resiArray, 
+int reqLength, int reqBytesLength, int resiBitsLength, double medianValue); // the three unsigned char** arguments are presumably callee-allocated outputs — confirm who frees them
+
+void decompressExactDataArray_double(unsigned char* leadNum, unsigned char* exactMidBytes, unsigned char* residualMidBits, size_t nbEle, int reqLength, double medianValue, double** decData);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DataCompression_H  ----- */
+
diff --git a/deps/SZ/sz/include/defines.h b/deps/SZ/sz/include/defines.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6fbbafafb845ca9b00fa6a6e6dccb9f5294fe37
--- /dev/null
+++ b/deps/SZ/sz/include/defines.h
@@ -0,0 +1,106 @@
+/**
+ *  @file defines.h
+ *  @author Sheng Di
+ *  @date July, 2019
+ *  @brief Shared constant definitions for SZ (version numbers, error-bound modes, data types, status codes).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_DEFINES_H
+#define _SZ_DEFINES_H
+
+#define SZ_VERNUM 0x0200
+#define SZ_VER_MAJOR 2
+#define SZ_VER_MINOR 1
+#define SZ_VER_BUILD 12
+#define SZ_VER_REVISION 0
+
+#define PASTRI 103
+#define HZ 102 //deprecated; NOTE(review): short unprefixed macro name — may clash with an HZ macro from other headers, confirm
+#define SZ 101
+#define SZ_Transpose 104
+
+//prediction mode of temporal dimension based compression
+#define SZ_PREVIOUS_VALUE_ESTIMATE 0
+
+#define MIN_NUM_OF_ELEMENTS 20 //if the # elements <= 20, skip the compression
+
+#define ABS 0
+#define REL 1
+#define VR_REL 1  //alternative name to REL
+#define ABS_AND_REL 2
+#define ABS_OR_REL 3
+#define PSNR 4
+#define NORM 5
+
+#define PW_REL 10
+#define ABS_AND_PW_REL 11
+#define ABS_OR_PW_REL 12
+#define REL_AND_PW_REL 13
+#define REL_OR_PW_REL 14
+
+#define SZ_FLOAT 0
+#define SZ_DOUBLE 1
+#define SZ_UINT8 2
+#define SZ_INT8 3
+#define SZ_UINT16 4
+#define SZ_INT16 5
+#define SZ_UINT32 6
+#define SZ_INT32 7
+#define SZ_UINT64 8
+#define SZ_INT64 9
+
+#define LITTLE_ENDIAN_DATA 0 //refers to the endian type of the data read from the disk
+#define BIG_ENDIAN_DATA 1 //big_endian (ppc, max, etc.) ; little_endian (x86, x64, etc.)
+
+#define LITTLE_ENDIAN_SYSTEM 0 //refers to the endian type of the system
+#define BIG_ENDIAN_SYSTEM 1
+
+#define DynArrayInitLen 1024
+
+#define MIN_ZLIB_DEC_ALLOMEM_BYTES 1000000
+
+//#define maxRangeRadius 32768
+//#define maxRangeRadius 1048576//131072
+
+#define SZ_BEST_SPEED 0
+#define SZ_BEST_COMPRESSION 1
+#define SZ_DEFAULT_COMPRESSION 2
+#define SZ_TEMPORAL_COMPRESSION 3
+
+#define SZ_NO_REGRESSION 0
+#define SZ_WITH_LINEAR_REGRESSION 1
+
+#define SZ_PWR_MIN_TYPE 0
+#define SZ_PWR_AVG_TYPE 1
+#define SZ_PWR_MAX_TYPE 2
+
+#define SZ_FORCE_SNAPSHOT_COMPRESSION 0
+#define SZ_FORCE_TEMPORAL_COMPRESSION 1
+#define SZ_PERIO_TEMPORAL_COMPRESSION 2
+
+//SUCCESS returning status
+#define SZ_SCES 0  //successful
+#define SZ_NSCS -1 //Not successful
+#define SZ_FERR -2 //Failed to open input file
+#define SZ_TERR -3 //wrong data type (should be only float or double)
+#define SZ_DERR -4 //dimension error
+#define SZ_MERR -5 //sz_mode error
+#define SZ_BERR -6 //bound-mode error (should be only ABS, REL, ABS_AND_REL, ABS_OR_REL, or PW_REL)
+
+#define SZ_MAINTAIN_VAR_DATA 0
+#define SZ_DESTROY_WHOLE_VARSET 1
+
+#define GROUP_COUNT 16 //2^{16}=65536
+	
+#define MetaDataByteLength 28
+#define MetaDataByteLength_double 36 //meta data length for double type
+	
+#define numOfBufferedSteps 1 //the number of time steps in the buffer	
+
+
+#define GZIP_COMPRESSOR 0 //i.e., the zlib compressor
+#define ZSTD_COMPRESSOR 1
+
+#endif /* _SZ_DEFINES_H */
diff --git a/deps/SZ/sz/include/dictionary.h b/deps/SZ/sz/include/dictionary.h
new file mode 100644
index 0000000000000000000000000000000000000000..0cf326ade2751b2e759bdaac7d8ac72b5eefcabf
--- /dev/null
+++ b/deps/SZ/sz/include/dictionary.h
@@ -0,0 +1,172 @@
+
+/*-------------------------------------------------------------------------*/
+/**
+   @file    dictionary.h
+   @author  N. Devillard
+   @brief   Implements a dictionary for string variables.
+
+   This module implements a simple dictionary object, i.e. a list
+   of string/string associations. This object is useful to store e.g.
+   information retrieved from a configuration file (ini files).
+*/
+/*--------------------------------------------------------------------------*/
+
+#ifndef _DICTIONARY_H_
+#define _DICTIONARY_H_
+
+/*---------------------------------------------------------------------------
+                                Includes
+ ---------------------------------------------------------------------------*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/*---------------------------------------------------------------------------
+                                New types
+ ---------------------------------------------------------------------------*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dictionary object
+
+  This object contains a list of string/string associations. Each
+  association is identified by a unique string key. Looking up values
+  in the dictionary is speeded up by the use of a (hopefully collision-free)
+  hash function.
+ */
+/*-------------------------------------------------------------------------*/
+typedef struct _dictionary_ {
+    int             n ;     /** Number of entries in dictionary */
+    int             size ;  /** Storage size */
+    char        **  val ;   /** List of string values */
+    char        **  key ;   /** List of string keys */
+    unsigned     *  hash ;  /** List of hash values for keys */
+} dictionary ;
+
+
+/*---------------------------------------------------------------------------
+                            Function prototypes
+ ---------------------------------------------------------------------------*/
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Compute the hash key for a string.
+  @param    key     Character string to use for key.
+  @return   Hash value as an unsigned int (at least 32 bits).
+
+  This hash function has been taken from an article in Dr Dobbs Journal.
+  It is normally a collision-free function, distributing keys evenly.
+  The key is stored in the struct anyway, so that collisions can be
+  resolved by comparing the key itself as a last resort.
+ */
+/*--------------------------------------------------------------------------*/
+unsigned dictionary_hash(const char * key);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Create a new dictionary object.
+  @param    size    Optional initial size of the dictionary.
+  @return   Newly allocated dictionary object.
+
+  This function allocates a new dictionary object of given size and returns
+  it. If you do not know in advance (roughly) the number of entries in the
+  dictionary, give size=0.
+ */
+/*--------------------------------------------------------------------------*/
+dictionary * dictionary_new(int size);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete a dictionary object
+  @param    vd  dictionary object to deallocate.
+  @return   void
+
+  Deallocate a dictionary object and all memory associated to it.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_del(dictionary * vd);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get a value from a dictionary.
+  @param    d       dictionary object to search.
+  @param    key     Key to look for in the dictionary.
+  @param    def     Default value to return if key not found.
+  @return   Pointer to internally allocated character string.
+
+  This function locates a key in a dictionary and returns a pointer to its
+  value, or the passed 'def' pointer if no such key can be found in
+  dictionary. The returned character pointer points to data internal to the
+  dictionary object, you should not try to free it or modify it.
+ */
+/*--------------------------------------------------------------------------*/
+char * dictionary_get(dictionary * d, const char * key, char * def);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Set a value in a dictionary.
+  @param    vd      dictionary object to modify.
+  @param    key     Key to modify or add.
+  @param    val     Value to add.
+  @return   int     0 if Ok, anything else otherwise
+
+  If the given key is found in the dictionary, the associated value is
+  replaced by the provided one. If the key cannot be found in the
+  dictionary, it is added to it.
+
+  It is Ok to provide a NULL value for val, but NULL values for the dictionary
+  or the key are considered as errors: the function will return immediately
+  in such a case.
+
+  Notice that if you dictionary_set a variable to NULL, a call to
+  dictionary_get will return a NULL value: the variable will be found, and
+  its value (NULL) is returned. In other words, setting the variable
+  content to NULL is equivalent to deleting the variable from the
+  dictionary. It is not possible (in this implementation) to have a key in
+  the dictionary without value.
+
+  This function returns non-zero in case of failure.
+ */
+/*--------------------------------------------------------------------------*/
+int dictionary_set(dictionary * vd, const char * key, const char * val);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete a key in a dictionary
+  @param    d       dictionary object to modify.
+  @param    key     Key to remove.
+  @return   void
+
+  This function deletes a key in a dictionary. Nothing is done if the
+  key cannot be found.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_unset(dictionary * d, const char * key);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dump a dictionary to an opened file pointer.
+  @param    d   Dictionary to dump
+  @param    out Opened file pointer.
+  @return   void
+
+  Dumps a dictionary onto an opened file pointer. Key pairs are printed out
+  as @c [Key]=[Value], one per line. It is Ok to provide stdout or stderr as
+  output file pointers.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_dump(dictionary * d, FILE * out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/deps/SZ/sz/include/exafelSZ.h b/deps/SZ/sz/include/exafelSZ.h
new file mode 100644
index 0000000000000000000000000000000000000000..06e9921a2842ccd7f2caf4a7f9ef6abe274b9b99
--- /dev/null
+++ b/deps/SZ/sz/include/exafelSZ.h
@@ -0,0 +1,57 @@
+#ifndef EXAFELSZ_H
+#define EXAFELSZ_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+typedef struct exafelSZ_params{
+  //uint8_t *peaks;
+  uint16_t *peaksSegs;
+  uint16_t *peaksRows;
+  uint16_t *peaksCols;
+  uint64_t numPeaks;
+
+  uint8_t *calibPanel;
+
+  uint8_t binSize; //Binning: (pr->binSize x pr->binSize) to (1 x 1)
+  double tolerance; //SZ pr->tolerance
+  uint8_t szDim; //1D/2D/3D compression/decompression
+  //uint8_t szBlockSize; //Currently unused
+  uint8_t peakSize; //MUST BE ODD! Each peak will have size of: (peakSize x peakSize)
+ 
+  // uint64_t nEvents;
+  // uint64_t panels;
+  // uint64_t rows;
+  // uint64_t cols;
+  
+  //CALCULATED VARIABLES:
+  uint64_t binnedRows;
+  uint64_t binnedCols;
+  uint8_t peakRadius; //Will be calculated using peakSize
+
+} exafelSZ_params;
+
+
+void exafelSZ_params_process(exafelSZ_params*pr, size_t panels, size_t rows, size_t cols);
+void exafelSZ_params_checkDecomp(exafelSZ_params*pr, size_t panels, size_t rows, size_t cols);
+void exafelSZ_params_checkComp(exafelSZ_params*pr, size_t panels, size_t rows, size_t cols);
+
+unsigned char * exafelSZ_Compress(void* _pr,
+                         void* _origData,
+                        size_t events, size_t panels, size_t rows, size_t cols,
+                        size_t *compressedSize);
+					   
+void* exafelSZ_Decompress(void *_pr,
+                         unsigned char*_compressedBuffer,
+                         size_t events, size_t panels, size_t rows, size_t cols,
+                         size_t compressedSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef EXAFELSZ_H  ----- */
+
diff --git a/deps/SZ/sz/include/iniparser.h b/deps/SZ/sz/include/iniparser.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ddb907c2e4af917e82b6503db0c9a00032ad38f
--- /dev/null
+++ b/deps/SZ/sz/include/iniparser.h
@@ -0,0 +1,321 @@
+
+/*-------------------------------------------------------------------------*/
+/**
+   @file    iniparser.h
+   @author  N. Devillard
+   @brief   Parser for ini files.
+*/
+/*--------------------------------------------------------------------------*/
+
+#ifndef _INIPARSER_H_
+#define _INIPARSER_H_
+
+/*---------------------------------------------------------------------------
+                                Includes
+ ---------------------------------------------------------------------------*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * The following #include is necessary on many Unixes but not Linux.
+ * It is not needed for Windows platforms.
+ * Uncomment it if needed.
+ */
+/* #include <unistd.h> */
+
+#include "dictionary.h"
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get number of sections in a dictionary
+  @param    d   Dictionary to examine
+  @return   int Number of sections found in dictionary
+
+  This function returns the number of sections found in a dictionary.
+  The test to recognize sections is done on the string stored in the
+  dictionary: a section name is given as "section" whereas a key is
+  stored as "section:key", thus the test looks for entries that do not
+  contain a colon.
+
+  This clearly fails in the case a section name contains a colon, but
+  this should simply be avoided.
+
+  This function returns -1 in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+
+int iniparser_getnsec(dictionary * d);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get name for section n in a dictionary.
+  @param    d   Dictionary to examine
+  @param    n   Section number (from 0 to nsec-1).
+  @return   Pointer to char string
+
+  This function locates the n-th section in a dictionary and returns
+  its name as a pointer to a string statically allocated inside the
+  dictionary. Do not free or modify the returned string!
+
+  This function returns NULL in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+
+char * iniparser_getsecname(dictionary * d, int n);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Save a dictionary to a loadable ini file
+  @param    d   Dictionary to dump
+  @param    f   Opened file pointer to dump to
+  @return   void
+
+  This function dumps a given dictionary into a loadable ini file.
+  It is Ok to specify @c stderr or @c stdout as output files.
+ */
+/*--------------------------------------------------------------------------*/
+
+void iniparser_dump_ini(dictionary * d, FILE * f);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Save a dictionary section to a loadable ini file
+  @param    d   Dictionary to dump
+  @param    s   Section name of dictionary to dump
+  @param    f   Opened file pointer to dump to
+  @return   void
+
+  This function dumps a given section of a given dictionary into a loadable ini
+  file.  It is Ok to specify @c stderr or @c stdout as output files.
+ */
+/*--------------------------------------------------------------------------*/
+
+void iniparser_dumpsection_ini(dictionary * d, char * s, FILE * f);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dump a dictionary to an opened file pointer.
+  @param    d   Dictionary to dump.
+  @param    f   Opened file pointer to dump to.
+  @return   void
+
+  This function prints out the contents of a dictionary, one element by
+  line, onto the provided file pointer. It is OK to specify @c stderr
+  or @c stdout as output files. This function is meant for debugging
+  purposes mostly.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_dump(dictionary * d, FILE * f);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the number of keys in a section of a dictionary.
+  @param    d   Dictionary to examine
+  @param    s   Section name of dictionary to examine
+  @return   Number of keys in section
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getsecnkeys(dictionary * d, char * s);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the keys in a section of a dictionary.
+  @param    d   Dictionary to examine
+  @param    s   Section name of dictionary to examine
+  @return   pointer to statically allocated character strings
+
+  This function queries a dictionary and finds all keys in a given section.
+  Each pointer in the returned char pointer-to-pointer is pointing to
+  a string allocated in the dictionary; do not free or modify them.
+
+  This function returns NULL in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+char ** iniparser_getseckeys(dictionary * d, char * s);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key
+  @param    d       Dictionary to search
+  @param    key     Key string to look for
+  @param    def     Default value to return if key not found.
+  @return   pointer to statically allocated character string
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the pointer passed as 'def' is returned.
+  The returned char pointer is pointing to a string allocated in
+  the dictionary, do not free or modify it.
+ */
+/*--------------------------------------------------------------------------*/
+char * iniparser_getstring(dictionary * d, const char * key, char * def);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to an int
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   integer
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+
+  Supported values for integers include the usual C notation
+  so decimal, octal (starting with 0) and hexadecimal (starting with 0x)
+  are supported. Examples:
+
+  - "42"      ->  42
+  - "042"     ->  34 (octal -> decimal)
+  - "0x42"    ->  66 (hexa  -> decimal)
+
+  Warning: the conversion may overflow in various ways. Conversion is
+  totally outsourced to strtol(), see the associated man page for overflow
+  handling.
+
+  Credits: Thanks to A. Becker for suggesting strtol()
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getint(dictionary * d, const char * key, int notfound);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a long
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error (declared int; result is long)
+  @return   long
+
+  Credits: This function is based entirely on iniparser_getint and was
+  slightly modified to return long instead of int.
+ */
+/*--------------------------------------------------------------------------*/
+long iniparser_getlint(dictionary * d, const char * key, int notfound);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a double
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   double
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+ */
+/*--------------------------------------------------------------------------*/
+double iniparser_getdouble(dictionary * d, const char * key, double notfound);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a boolean
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   integer
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+
+  A true boolean is found if one of the following is matched:
+
+  - A string starting with 'y'
+  - A string starting with 'Y'
+  - A string starting with 't'
+  - A string starting with 'T'
+  - A string starting with '1'
+
+  A false boolean is found if one of the following is matched:
+
+  - A string starting with 'n'
+  - A string starting with 'N'
+  - A string starting with 'f'
+  - A string starting with 'F'
+  - A string starting with '0'
+
+  The notfound value returned if no boolean is identified, does not
+  necessarily have to be 0 or 1.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getboolean(dictionary * d, const char * key, int notfound);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Set an entry in a dictionary.
+  @param    ini     Dictionary to modify.
+  @param    entry   Entry to modify (entry name)
+  @param    val     New value to associate to the entry.
+  @return   int 0 if Ok, -1 otherwise.
+
+  If the given entry can be found in the dictionary, it is modified to
+  contain the provided value. If it cannot be found, -1 is returned.
+  It is Ok to set val to NULL.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_set(dictionary * ini, const char * entry, const char * val);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete an entry in a dictionary
+  @param    ini     Dictionary to modify
+  @param    entry   Entry to delete (entry name)
+  @return   void
+
+  If the given entry can be found, it is deleted from the dictionary.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_unset(dictionary * ini, const char * entry);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Finds out if a given entry exists in a dictionary
+  @param    ini     Dictionary to search
+  @param    entry   Name of the entry to look for
+  @return   integer 1 if entry exists, 0 otherwise
+
+  Finds out if a given entry exists in the dictionary. Since sections
+  are stored as keys with NULL associated values, this is the only way
+  of querying for the presence of sections in a dictionary.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_find_entry(dictionary * ini, const char * entry) ;
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Parse an ini file and return an allocated dictionary object
+  @param    ininame Name of the ini file to read.
+  @return   Pointer to newly allocated dictionary
+
+  This is the parser for ini files. This function is called, providing
+  the name of the file to be read. It returns a dictionary object that
+  should not be accessed directly, but through accessor functions
+  instead.
+
+  The returned dictionary must be freed using iniparser_freedict().
+ */
+/*--------------------------------------------------------------------------*/
+dictionary * iniparser_load(const char * ininame);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Free all memory associated to an ini dictionary
+  @param    d Dictionary to free
+  @return   void
+
+  Free all memory associated to an ini dictionary.
+  It is mandatory to call this function before the dictionary object
+  gets out of the current context.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_freedict(dictionary * d);
+
+#endif
diff --git a/deps/SZ/sz/include/pastri.h b/deps/SZ/sz/include/pastri.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5f2e90ddef1084ff262f88de1556967235913b2
--- /dev/null
+++ b/deps/SZ/sz/include/pastri.h
@@ -0,0 +1,140 @@
+//CHECK:
+//What happens when ECQBits==1, or ECQBits==0 or ECQBits<0?
+//Rounding? Scale originalEb by 0.99?
+
+//Possible improvement: Change GAMESS format: {i i i i d} -> {i}{i}{i}{i}{d}
+//Possible improvement: Optimize bookkeeping bits
+//Possible improvement: Guess the type (C/UC, Sparse/Not)
+//Possible improvement: Get rid of writing/reading some of the indexes to in/out buffers
+//Possible improvement: Get rid of all debug stuff, including Makefile debug flags
+//Possible improvement: Get rid of "compressedBytes"
+//Possible improvement: SparseCompressed, ECQBits=2: 1's and -1's can be represented by just 0 and 1, instead 10 and 11. 
+//Possible improvement: SparseCompressed, ECQBits>2: Again: 1: 10, -1:11, Others: 0XX...XX 
+//Possible improvement: WriteBitsFast: maybe remove some masks?
+//Possible improvement: WriteBitsFast: Get rid of multiple calls!
+//Possible improvement: UCSparse: Indexes use 64 bits. It can be lowered to _1DIdxBits
+//Possible improvement: Parameters: Smaller data sizes may be possible!
+
+
+
+#ifndef PASTRI_H
+#define PASTRI_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h> //Just for debugging purposes!
+
+//#define DATASIZE 8 //Bytes per input data point.
+//We have only 1 double per data point, so it is 8 bytes.
+
+#define MAX_PS_SIZE 100
+#define MAX_BLOCK_SIZE 10000
+#define MAX_BUFSIZE 160000  //Should be a multiple of 8
+#define D_W 0 //Debug switch: Write (input block)
+#define D_R 0 //Debug switch: Read (compressed block)
+#define D_G 0 //Debug switch: General
+#define D_G2 0 //Debug switch: General 2 (a little more detail)
+#define D_C 0 //Debug switch: C
+//#define DEBUG 1 //Debug switch
+
+//#define BOOKKEEPINGBITS 0 //Currently unused
+//#define BOOKKEEPINGBITS 120 //Includes: mode, indexOffsets, compressedBytes, Pb_, ECQBits_ (8+64+32+8+8) 
+//BOOKKEEPINGBITS is defined here, because if P & S is going to be used, they appear just after the bookkeeping part.
+//This allows us to write P and S directly onto using outBuf.
+  
+
+// IMPORTANT NOTE:
+//Read/Write up to 56 bits.
+//More than that is not supported!
+
+
+/********************************************************************/
+//Datatype Declarations:
+/********************************************************************/
+typedef struct pastri_params{
+  double originalEb; //Error Bound entered by the user
+  double usedEb; //Error Bound used during compression/decompression
+  
+  int numBlocks; //Number of blocks to be compressed
+  int dataSize; //8(=Double) or 4(=Float)
+  
+  int bf[4]; //Orbital types (basis function types). Typically in range [0,3]
+  int idxRange[4];  //Ranges of indexes. idxRange[i]=(bf[i]+1)*(bf[i]+2)/2;
+  
+  int sbSize; //=idxRange[2]*idxRange[3];
+  int sbNum;  //=idxRange[0]*idxRange[1];
+  int bSize; //=sbSize*sbNum;
+  
+  //uint16_t idxOffset[4]; //Index offset values. No longer used.
+  
+}pastri_params;
+
+//Block-specific stuff:
+typedef struct pastri_blockParams{
+  uint16_t nonZeros;
+  //int ECQ0s; //= p->bSize - numOutliers //OR: p->bSize=ECQ0s+ECQ1s+ECQOthers
+  int ECQ1s;
+  int ECQOthers;
+  int numOutliers; //=ECQ1s+ECQOthers
+  int patternBits;
+  int scaleBits;
+  double binSize;
+  double scalesBinSize;
+  uint64_t ECQExt;
+  int ECQBits;
+  int _1DIdxBits;
+}pastri_blockParams;
+
+typedef union u_UI64I64D{
+  uint64_t ui64;
+  int64_t i64;
+  double d;
+} u_UI64I64D;
+
+/********************************************************************/
+//Function Prototypes:
+/********************************************************************/
+void SZ_pastriReadParameters(char paramsFilename[512],pastri_params *paramsPtr);
+//Read the basic PaSTRI parameters from a file, specified by paramsFilename.
+
+void SZ_pastriPreprocessParameters(pastri_params *p);
+//Using basic PaSTRI parameters, generate the others.
+//For example, block and sub-block sizes are generated by using basis function types.
+
+void SZ_pastriCompressBatch(pastri_params *p,unsigned char *originalBuf, unsigned char** compressedBufP,size_t *compressedBytes);
+//INPUTS: p, originalBuf
+//OUTPUTS: compressedBufP, compressedBytes
+//Using the inputs, compressedBufP is allocated and populated by the compressed data. Compressed size is written into compressedBytes.
+//Parameters are also stored at the beginning part of the compressedBuf
+
+void SZ_pastriDecompressBatch(unsigned char*compressedBuf, pastri_params *p, unsigned char** decompressedBufP ,size_t *decompressedBytes);
+//INPUTS: compressedBuf
+//OUTPUTS: p, decompressedBufP, decompressedBytes
+//First, parameters are read from compressedBuf and written into p.
+//Then, decompressedBufP is allocated and populated by the decompressed data. Decompressed size is written into decompressedBytes.
+
+void SZ_pastriCheckBatch(pastri_params *p,unsigned char*originalBuf,unsigned char*decompressedBuf); 
+//INPUTS: p, originalBuf, decompressedBuf
+//OUTPUTS: None (Just some on-screen messages)
+//Compares originalBuf with decompressedBuf. Checks whether the absolute error condition is satisfied or not.
+
+/********************************************************************/
+//Other Includes:
+/********************************************************************/
+
+
+
+#include "pastriGeneral.h"  //General tools
+#include "pastriD.h"  //Compression/Decompression for Double data
+#include "pastriF.h"  //Compression/Decompression for Float data
+
+
+#endif
+
+
+
+
+
diff --git a/deps/SZ/sz/include/pastriD.h b/deps/SZ/sz/include/pastriD.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ee2813997b308a3c85c13eda317269017e6657a
--- /dev/null
+++ b/deps/SZ/sz/include/pastriD.h
@@ -0,0 +1,911 @@
+#ifndef PASTRID_H
+#define PASTRID_H
+
+static inline int64_t pastri_double_quantize(double x, double binSize){
+  //Quantize x to the index of its bin of width binSize.
+  //The rounding offset is 0.5 carrying the sign bit of the scaled value, so the
+  //truncating cast to int64_t rounds to the nearest integer instead of toward zero.
+  const uint64_t SIGN_MASK=(uint64_t)0x8000000000000000;
+  const double scaled=x/binSize;
+
+  u_UI64I64D scaledBits,offset;
+  scaledBits.d=scaled;
+  offset.d=0.5;
+
+  //Copy the sign bit of the scaled value onto the 0.5 offset (gives +0.5 or -0.5).
+  offset.ui64 |= (scaledBits.ui64 & SIGN_MASK);
+
+  return (int64_t)(scaled + offset.d);
+}
+
+static inline void pastri_double_PatternMatch(double*data,pastri_params* p,pastri_blockParams* bp,int64_t* patternQ,int64_t *scalesQ, int64_t* ECQ){
+  //Pattern-match one block: quantize the sub-block that holds the block extremum (patternQ), one scale
+  //per sub-block (scalesQ) and a per-element error correction (ECQ); the bit widths and counters go into bp.
+  double absExt=0; //Absolute value of Extremum
+  int extIdx=0; //Index of Extremum. Starts at 0 (was -1): when no element exceeds absExt, e.g. an all-zero block, data[extIdx] must stay in bounds.
+  bp->nonZeros=0;
+  int i,sb;
+  for(i=0;i<p->bSize;i++){
+    //Count the values above the error bound and track the largest magnitude in the same pass.
+    if(abs_FastD(data[i])>p->usedEb){
+      bp->nonZeros++;
+      ////if(DEBUG)printf("data[%d]:%.6e\n",i,data[i]); //DEBUG
+    }
+    if(abs_FastD(data[i])>absExt){
+      absExt=abs_FastD(data[i]);
+      extIdx=i;
+    }
+  }
+  int patternIdx; //Starting Index of Pattern
+  patternIdx=(extIdx/p->sbSize)*p->sbSize;
+  
+  double patternExt=data[extIdx];
+  bp->binSize=2*p->usedEb;
+  
+  ////if(DEBUG){printf("Extremum  : data[%d] = %.6e\n",extIdx,patternExt);} //DEBUG
+  ////if(DEBUG){printf("patternIdx: %d\n",patternIdx);} //DEBUG
+  
+  ////if(DEBUG){for(i=0;i<p->sbSize;i++){printf("pattern[%d]=data[%d]=%.6e Quantized:%d\n",i,patternIdx+i,data[patternIdx+i],pastri_double_quantize(data[patternIdx+i]/binSize)  );}   }//DEBUG
+  
+  //int64_t *patternQ=(int64_t*)(outBuf+15);  //Possible Improvement!
+
+  //The sub-block that contains the extremum becomes the pattern; quantize it with the data bin size.
+  for(i=0;i<p->sbSize;i++){
+    patternQ[i]=pastri_double_quantize(data[patternIdx+i],bp->binSize);
+    //if(D_W){printf("patternQ[%d]=%ld\n",i,patternQ[i]);}
+  }
+  
+  bp->patternBits=bitsNeeded_double((abs_FastD(patternExt)/bp->binSize)+1)+1;
+  bp->scaleBits=bp->patternBits;
+  bp->scalesBinSize=1/(double)(((uint64_t)1<<(bp->scaleBits-1))-1);
+  //Scales use the same bit width as pattern points; scalesBinSize is 1/(2^(scaleBits-1)-1).
+  ////if(DEBUG){printf("scaleBits=patternBits: %d\n",scaleBits);} //DEBUG
+  //if(D_W){printf("scalesBinSize: %.6e\n",bp->scalesBinSize);} //DEBUG
+  
+  //Calculate Scales.
+  //Each scale is the ratio data[sb*sbSize+localExtIdx]/patternExt, or 0 when patternExt is 0 (avoids dividing by zero).
+  int localExtIdx=extIdx%p->sbSize; //Local extremum index. This is not the actual extremum of the current sb, but rather the index that correspond to the global (block) extremum.
+  //int64_t *scalesQ=(int64_t*)(outBuf+15+p->sbSize*8);  //Possible Improvement!
+  int patternExtZero=(patternExt==0);
+  ////if(DEBUG){printf("patternExtZero: %d\n",patternExtZero);} //DEBUG
+  for(sb=0;sb<p->sbNum;sb++){
+    //scales[sb]=data[sb*p->sbSize+localExtIdx]/patternExt;
+    //scales[sb]=patternExtZero ? 0 : data[sb*p->sbSize+localExtIdx]/patternExt;
+    //assert(scales[sb]<=1);
+    scalesQ[sb]=pastri_double_quantize((patternExtZero ? 0 : data[sb*p->sbSize+localExtIdx]/patternExt),bp->scalesBinSize);
+    //if(D_W){printf("scalesQ[%d]=%ld\n",sb,scalesQ[sb]);}
+  }
+  ////if(DEBUG){for(i=0;i<p->sbSize;i++){printf("scalesQ[%d]=%ld \n",i,scalesQ[i]);}} //DEBUG
+
+  //int64_t *ECQ=(int64_t*)(outBuf+p->bSize*8); //ECQ is written into outBuf, just be careful when handling it.
+
+  //Error correction: quantized (scale*pattern prediction - data) per element, plus counts of the +-1 and the larger corrections.
+  bp->ECQExt=0;
+  int _1DIdx;
+  bp->ECQ1s=0;
+  bp->ECQOthers=0;
+  double PS_binSize=bp->scalesBinSize*bp->binSize;
+  for(sb=0;sb<p->sbNum;sb++){
+    for(i=0;i<p->sbSize;i++){
+      _1DIdx=sb*p->sbSize+i;
+      ECQ[_1DIdx]=pastri_double_quantize( (scalesQ[sb]*patternQ[i]*PS_binSize-data[_1DIdx]),bp->binSize );
+      double absECQ=abs_FastD(ECQ[_1DIdx]);
+      if(absECQ > bp->ECQExt)
+        bp->ECQExt=absECQ;
+      ////if(DEBUG){printf("EC[%d]: %.6e Quantized:%ld \n",_1DIdx,(scalesQ[sb]*patternQ[i]*scalesBinSize*binSize-data[_1DIdx]),ECQ[_1DIdx]);} //DEBUG
+      switch (ECQ[_1DIdx]){
+        case 0:
+          //ECQ0s++; //Currently not needed
+          break;
+        case 1:
+          bp->ECQ1s++;
+          break;
+        case -1:
+          bp->ECQ1s++;
+          break;
+        default:
+          bp->ECQOthers++;
+          break;
+      }
+    }
+  }
+  
+  /*
+  //DEBUG: Self-check. Remove this later.
+  for(sb=0;sb<p->sbNum;sb++){
+    for(i=0;i<p->sbSize;i++){
+      _1DIdx=sb*p->sbSize+i;
+      double decompressed=scalesQ[sb]*patternQ[i]*scalesBinSize*binSize-ECQ[_1DIdx]*binSize;
+      if(abs_FastD(decompressed-data[_1DIdx])>(p->usedEb)){
+        //printf("p->usedEb=%.6e\n",p->usedEb);
+        //printf("data[%d]=%.6e decompressed[%d]=%.6e diff=%.6e\n",_1DIdx,data[_1DIdx],_1DIdx,decompressed,abs_FastD(data[_1DIdx]-decompressed));
+        assert(0);
+      }
+    }
+  }
+  */
+}
+
+static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ,pastri_params *p,pastri_blockParams* bp,unsigned char* outBuf,int *numOutBytes){
+  bp->ECQBits=bitsNeeded_UI64(bp->ECQExt)+1;
+  bp->_1DIdxBits=bitsNeeded_UI64(p->bSize);
+  //Encode one block into outBuf. The size of each of the four layouts is estimated first and the layout is
+  //picked by strict "<" comparisons, so any tie falls through to the last branch (CNonSparse).
+  int i;
+  //The estimated byte size of the chosen layout is stored in *numOutBytes; outBuf[0] holds the mode byte.
+  //Encode: 4 options (value of the mode byte):
+  //mode 0: Uncompressed, Sparse Data
+  //mode 1: Uncompressed, Non-sparse Data
+  //mode 2: Compressed, Sparse ECQ
+  //mode 3: Compressed, Non-Sparse ECQ
+  
+  unsigned int UCSparseBits;  //Uncompressed, Sparse bits. Just like the original GAMESS data. Includes: mode, nonZeros, {indexes, data}
+  unsigned int UCNonSparseBits;  //Uncompressed, NonSparse bits. Includes: mode, data
+  unsigned int CSparseBits;  //Includes: mode, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
+  unsigned int CNonSparseBits;  //Includes: mode, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
+  //int BOOKKEEPINGBITS=120; //Includes: mode, compressedBytes, patternBits, ECQBits (8+64+32+8+8) //Moved to much earlier!
+    
+  //Consider: ECQ0s, ECQ1s, ECQOthers. Number of following values in ECQ: {0}, {1,-1}, { val<=-2, val>=2}
+  //ECQ0s is actually not needed, but others are needed.
+  //NOTE(review): p->dataSize is used below as the bytes-to-bits factor, which presumably relies on dataSize==8 for double - confirm.
+  UCSparseBits = p->dataSize*(1 + 2 + bp->nonZeros*16);  //64 bits for 4 indexes, 64 bit for data.
+  UCNonSparseBits = p->dataSize*(1 + p->bSize*8);
+  bp->numOutliers=bp->ECQ1s+bp->ECQOthers;
+  if(bp->ECQBits==2){
+    CSparseBits = p->dataSize*(1+4+1+1+2) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + bp->ECQ1s*(1+bp->_1DIdxBits);
+    CNonSparseBits = p->dataSize*(1+4+1+1) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + p->bSize + bp->ECQ1s ;  //Or: ECQ0s+ECQ1s*2;
+  }else{ //ECQBits>2
+    CSparseBits = p->dataSize*(1+4+1+1+2) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + bp->ECQ1s*(2+bp->_1DIdxBits) + bp->ECQOthers*(1+bp->_1DIdxBits+bp->ECQBits);
+    //CNonSparseBits = 8+32+8+8+ patternBits*p->sbSize + scaleBits*p->sbNum + p->bSize + ECQ0s + ECQ1s*3 + ECQOthers*(2+ECQBits);
+    CNonSparseBits = p->dataSize*(1+4+1+1)+ bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + p->bSize + bp->ECQ1s*2 + bp->ECQOthers*(1+bp->ECQBits);
+  }
+  
+  int UCSparseBytes=(UCSparseBits+7)/8; 
+  int UCNonSparseBytes=(UCNonSparseBits+7)/8; 
+  int CSparseBytes=(CSparseBits+7)/8; 
+  int CNonSparseBytes=(CNonSparseBits+7)/8; 
+  uint64_t bitPos=0;
+  uint64_t bytePos=0;
+  int i0,i1,i2,i3;
+  int _1DIdx;
+  
+  //*(uint16_t*)(&outBuf[1])=p->idxOffset[0];
+  //*(uint16_t*)(&outBuf[3])=p->idxOffset[1];
+  //*(uint16_t*)(&outBuf[5])=p->idxOffset[2];
+  //*(uint16_t*)(&outBuf[7])=p->idxOffset[3];
+    
+  //if(D_W){printf("ECQ0s:%d ECQ1s:%d ECQOthers:%d Total:%d\n",p->bSize-bp->ECQ1s-bp->ECQOthers,bp->ECQ1s,bp->ECQOthers,p->bSize);} //DEBUG
+  //if(D_W){printf("numOutliers:%d\n",bp->numOutliers);} //DEBUG
+  
+  //****************************************************************************************
+  //if(0){ //DEBUG
+  //W:UCSparse
+  if((UCSparseBytes<UCNonSparseBytes) && (UCSparseBytes<CSparseBytes) && (UCSparseBytes<CNonSparseBytes) ){ 
+    //Uncompressed, Sparse bits. Just like the original GAMESS data. Includes: mode, nonZeros, then {4 indexes, data} per value above usedEb
+    *numOutBytes=UCSparseBytes;
+    //if(D_G){printf("UCSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    outBuf[0]=0; //mode
+    
+    //*(uint16_t*)(&outBuf[9])=nonZeros;
+    //bytePos=11;//0:mode, 1-8:indexOffsets 9-10:NonZeros. So start from 11.
+    *(uint16_t*)(&outBuf[1])=bp->nonZeros;
+    bytePos=3;//0:mode, 1-2:NonZeros. So start from 3.
+    
+    for(i0=0;i0<p->idxRange[0];i0++)
+      for(i1=0;i1<p->idxRange[1];i1++)
+        for(i2=0;i2<p->idxRange[2];i2++)
+          for(i3=0;i3<p->idxRange[3];i3++){
+            _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+            if(abs_FastD(data[_1DIdx])>p->usedEb){
+              //*(uint16_t*)(&outBuf[bytePos])=i0+1+p->idxOffset[0];
+              *(uint16_t*)(&outBuf[bytePos])=i0;
+              bytePos+=2;
+              //*(uint16_t*)(&outBuf[bytePos])=i1+1+p->idxOffset[1];
+              *(uint16_t*)(&outBuf[bytePos])=i1;
+              bytePos+=2;
+              //*(uint16_t*)(&outBuf[bytePos])=i2+1+p->idxOffset[2];
+              *(uint16_t*)(&outBuf[bytePos])=i2;
+              bytePos+=2;
+              //*(uint16_t*)(&outBuf[bytePos])=i3+1+p->idxOffset[3];
+              *(uint16_t*)(&outBuf[bytePos])=i3;
+              bytePos+=2;
+              
+              *(double*)(&outBuf[bytePos])=data[_1DIdx];
+              bytePos+=p->dataSize;
+            }
+          }
+    
+    //if(D_G)printf("UCSparseBytes:%d \n",UCSparseBytes); //DEBUG
+    
+  //****************************************************************************************
+  //}else if(0){ //DEBUG
+  //W:UCNonSparse
+  }else if((UCNonSparseBytes<UCSparseBytes) && (UCNonSparseBytes<CSparseBytes) && (UCNonSparseBytes<CNonSparseBytes) ){ 
+    //Uncompressed, NonSparse bits. Includes: mode, then the raw data of the whole block
+    *numOutBytes=UCNonSparseBytes;
+    //if(D_G){printf("UCNonSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    outBuf[0]=1; //mode
+    
+    //memcpy(&outBuf[9], &inBuf[p->bSize*8], UCNonSparseBytes-9);
+    memcpy(&outBuf[1], data, p->bSize*p->dataSize);
+    
+    //if(D_G)printf("UCNonSparseBytes:%d \n",UCNonSparseBytes); //DEBUG
+    /*
+    for(i=0;i<UCNonSparseBytes-17;i++){
+      //printf("%d ",inBuf[p->bSize*8+i]);
+    }
+    //printf("\n");
+    for(i=0;i<UCNonSparseBytes-17;i++){
+      //printf("%d ",outBuf[17+i]);
+    }
+    //printf("\n");
+    */
+  //****************************************************************************************
+  //}else if(1){ //DEBUG
+  //W:CSparse
+  }else if((CSparseBytes<UCNonSparseBytes) && (CSparseBytes<UCSparseBytes) && (CSparseBytes<CNonSparseBytes) ){ 
+    //Includes: mode, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
+    *numOutBytes=CSparseBytes;
+    //if(D_G){printf("CSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    ////if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
+    outBuf[0]=2; //mode
+    
+    ////outBuf bytes [1:8] are indexOffsets, which are already written. outBuf bytes [9:12] are reserved for compressedBytes.
+    //outBuf[13]=patternBits;
+    //outBuf[14]=ECQBits;
+    ////Currently, we are at the end of 15th byte.
+    //*(uint16_t*)(&outBuf[15])=numOutliers;
+    //bitPos=17*8; //Currently, we are at the end of 17th byte.
+    
+    //outBuf bytes [1:4] are reserved for compressedBytes.
+    outBuf[5]=bp->patternBits;
+    outBuf[6]=bp->ECQBits;
+    //Currently, we are at the end of 7th byte.
+    
+    *(uint16_t*)(&outBuf[7])=bp->numOutliers; 
+    //Now, we are at the end of 9th byte.
+    bitPos=9*8; 
+    
+    ////if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
+
+    for(i=0;i<p->sbSize;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
+    }
+    ////if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
+    for(i=0;i<p->sbNum;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
+    }
+    ////if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
+    //Sparse ECQ: only non-zero corrections are written, each prefixed by its 1D index.
+    switch(bp->ECQBits){
+      case 2:
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              break;
+            case 1:
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x0\n",i,ECQ[i]); //DEBUG
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              //writeBits_Fast(outBuf,&bitPos,2,0x10);
+              //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
+              //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
+              writeBits_Fast(outBuf,&bitPos,1,0);//single bit 0 encodes +1
+              break;
+            case -1:
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              //writeBits_Fast(outBuf,&bitPos,2,0x11);
+              //writeBits_Fast(outBuf,&bitPos,2,1);//0x01
+              //writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              assert(0);
+              break;
+          }
+        }
+        break;
+      default: //ECQBits>2
+      for(i=0;i<p->bSize;i++){
+        switch(ECQ[i]){
+          case 0:
+            break;
+          case 1:
+            ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x00\n",i,ECQ[i]); //DEBUG
+            writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+            //writeBits_Fast(outBuf,&bitPos,3,0);//0x000
+            //writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,0);
+            break;
+          case -1:
+            ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01\n",i,ECQ[i]); //DEBUG
+            writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+            //writeBits_Fast(outBuf,&bitPos,3,1);//0x001
+            //writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,1);
+            break;
+          default:
+            ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1 0x%lx\n",i,ECQ[i],ECQ[i]); //DEBUG
+            writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+            //writeBits_Fast(outBuf,&bitPos,2+ECQBits,((uint64_t)0x11<<ECQBits)|ECQ[i]);
+            //writeBits_Fast(outBuf,&bitPos,2+ECQBits,(ECQ[i]&((uint64_t)0x00<<ECQBits))|((uint64_t)0x01<<ECQBits));
+            //writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,1);
+            writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
+            break;
+        }
+      }
+      break;
+    }
+    
+    ////if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
+    //if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+          
+
+    uint32_t bytePos=(bitPos+7)/8; //NOTE: shadows the function-scope uint64_t bytePos
+    //*(uint32_t*)(&outBuf[9])=bytePos;
+    *(uint32_t*)(&outBuf[1])=bytePos;
+    
+    //if(D_G)printf("bitPos:%ld CSparseBits:%d bytePos:%d CSparseBytes:%d\n",bitPos,CSparseBits,bytePos,CSparseBytes); //DEBUG
+    if(D_G){assert(bitPos==CSparseBits);}
+    
+  //****************************************************************************************
+  //W:CNonSparse
+  }else { 
+    //Includes: mode, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
+    *numOutBytes=CNonSparseBytes;
+    //if(D_G){printf("CNonSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    ////if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
+    outBuf[0]=3; //mode
+    
+    ////outBuf bytes [1:8] are indexOffsets, which are already written. outBuf bytes [9:12] are reserved for compressedBytes.
+    //outBuf[13]=patternBits;
+    //outBuf[14]=ECQBits;
+    //bitPos=15*8; //Currently, we are at the end of 15th byte.
+    
+    //outBuf bytes [1:4] are reserved for compressedBytes.
+    outBuf[5]=bp->patternBits;
+    outBuf[6]=bp->ECQBits;
+    bitPos=7*8; //Currently, we are at the end of 7th byte.
+    
+    ////if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
+
+    for(i=0;i<p->sbSize;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
+    }
+    ////if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
+    for(i=0;i<p->sbNum;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
+    }
+    ////if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
+    //Non-sparse ECQ: every element gets a code, written in index order (no index is stored).
+    switch(bp->ECQBits){
+      case 2:
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              ////if(DEBUG)printf("Index:%d ECQ:%d Written:0x1\n",i,ECQ[i]); //DEBUG
+              writeBits_Fast(outBuf,&bitPos,1,1);//0x1
+              break;
+            case 1:
+              ////if(DEBUG)printf("Index:%d ECQ:%d Written:0x00\n",i,ECQ[i]); //DEBUG
+              //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              break;
+            case -1:
+              ////if(DEBUG)printf("Index:%d ECQ:%d Written:0x01\n",i,ECQ[i]); //DEBUG
+              //writeBits_Fast(outBuf,&bitPos,2,2); //0x01
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              assert(0);
+              break;
+          }
+        }
+        break;
+      default: //ECQBits>2
+        ////if(DEBUG) printf("AMG_W1:bitPos:%ld\n",bitPos); //DEBUG
+        for(i=0;i<p->bSize;i++){
+          ////if(DEBUG){printf("AMG_W3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+          ////if(DEBUG) printf("AMG_W2:bitPos:%ld\n",bitPos); //DEBUG
+          ////if(DEBUG) printf("ECQ[%d]:%ld\n",i,ECQ[i]); //DEBUG
+          switch(ECQ[i]){
+            case 0:
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              writeBits_Fast(outBuf,&bitPos,1,1);  //0x1
+              //wVal=1; writeBits_Fast(outBuf,&bitPos,1,wVal); //0x1
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+            case 1:
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x000\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              //writeBits_Fast(outBuf,&bitPos,3,0); //0x000
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              //wVal=0; writeBits_Fast(outBuf,&bitPos,3,wVal); //0x000
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+            case -1:
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x001\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              //writeBits_Fast(outBuf,&bitPos,3,8); //0x001
+              writeBits_Fast(outBuf,&bitPos,1,0); 
+              writeBits_Fast(outBuf,&bitPos,1,0); 
+              writeBits_Fast(outBuf,&bitPos,1,1); 
+              //wVal=8; writeBits_Fast(outBuf,&bitPos,3,wVal); //0x001
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+            default:
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01 0x%lx\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              //writeBits_Fast(outBuf,&bitPos,2,2); //0x01
+              writeBits_Fast(outBuf,&bitPos,1,0); 
+              writeBits_Fast(outBuf,&bitPos,1,1); 
+              //wVal=2; writeBits_Fast(outBuf,&bitPos,2,wVal); //0x01
+              writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+          }
+        }
+        break;
+    }
+    
+    ////if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
+    //if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+    
+          
+
+    uint32_t bytePos=(bitPos+7)/8; //NOTE: shadows the function-scope uint64_t bytePos
+    //*(uint32_t*)(&outBuf[9])=bytePos;
+    *(uint32_t*)(&outBuf[1])=bytePos;
+    
+    //if(D_G)printf("bitPos:%ld CNonSparseBits:%d bytePos:%d CNonSparseBytes:%d\n",bitPos,CNonSparseBits,bytePos,CNonSparseBytes); //DEBUG
+    if(D_G){assert(bitPos==CNonSparseBits);}
+    
+  }
+  ////for(i=213;i<233;i++)if(DEBUG)printf("AMG_WE:bitPos:%d buffer[%d]=0x%lx\n",i*8,i,*(uint64_t*)(&outBuf[i])); //DEBUG
+  
+}
+static inline int pastri_double_Compress(unsigned char*inBuf,pastri_params *p,unsigned char*outBuf,int *numOutBytes){
+  //Compress one block: runs the pattern-match stage, then the encoder.
+  //  inBuf       : the block, read as an array of double
+  //  p           : PaSTRI parameters, passed to both stages
+  //  outBuf      : receives the encoded block
+  //  numOutBytes : passed to the encoder, which stores the encoded size there
+  //Always returns 0.
+
+  //Scratch storage shared by the two stages.
+  pastri_blockParams blockParams;
+  int64_t quantPattern[MAX_PS_SIZE];
+  int64_t quantScales[MAX_PS_SIZE];
+  int64_t quantEC[MAX_BLOCK_SIZE];
+
+  double *values=(double*)inBuf;
+
+  //STEP 0: PREPROCESSING:
+  //This step can include flattening the block, determining the period, etc.
+  //Currently not needed.
+
+  //STEP 1: PATTERN MATCH
+  pastri_double_PatternMatch(values,p,&blockParams,quantPattern,quantScales,quantEC);
+
+  //STEP 2: ENCODING(Include QUANTIZE)
+  pastri_double_Encode(values,quantPattern,quantScales,quantEC,p,&blockParams,outBuf,numOutBytes);
+
+  return 0;
+}
+
+static inline double pastri_double_InverseQuantization(int64_t q, double binSize){
+  return binSize*(double)q; //Dequantize: bin index q times the bin width
+}
+
+static inline void pastri_double_PredictData(pastri_params *p,pastri_blockParams *bp,double *data,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ){
+  //Rebuild all p->bSize values: dequantized (scale*pattern) minus the dequantized error correction.
+  const double psBinSize=bp->scalesBinSize*bp->binSize;
+  for(int idx=0;idx<p->bSize;idx++){
+    const int64_t psQ=scalesQ[idx/p->sbSize]*patternQ[idx%p->sbSize]; //Scale of the sub-block times the pattern point
+    data[idx]=pastri_double_InverseQuantization(psQ,psBinSize) - pastri_double_InverseQuantization(ECQ[idx],bp->binSize);
+  }
+}
+
+static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pastri_blockParams *bp,unsigned char*outBuf,int *numReadBytes,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ){
+  int j;
+  bp->_1DIdxBits=bitsNeeded_UI64(p->bSize);
+  //double *data=(double*)(outBuf+p->bSize*8);
+  double *data=(double*)(outBuf);
+  int i0,i1,i2,i3;
+  //uint16_t *idx0,*idx1,*idx2,*idx3;
+  int _1DIdx;
+
+  int64_t ECQTemp;
+  uint64_t bytePos=0;
+  uint64_t bitPos=0;
+  uint64_t temp,temp2;
+  //int sb,localIdx;
+
+  
+  //idx0=(uint16_t*)(outBuf           );
+  //idx1=(uint16_t*)(outBuf+p->bSize*2);
+  //idx2=(uint16_t*)(outBuf+p->bSize*4);
+  //idx3=(uint16_t*)(outBuf+p->bSize*6);
+  //p->idxOffset[0]=*(uint32_t*)(&inBuf[1]);
+  //p->idxOffset[1]=*(uint32_t*)(&inBuf[3]);
+  //p->idxOffset[2]=*(uint32_t*)(&inBuf[5]);
+  //p->idxOffset[3]=*(uint32_t*)(&inBuf[7]);
+  /*
+  for(i0=0;i0<p->idxRange[0];i0++)
+    for(i1=0;i1<p->idxRange[1];i1++)
+      for(i2=0;i2<p->idxRange[2];i2++)
+        for(i3=0;i3<p->idxRange[3];i3++){
+            //_1DIdx=i0*p->idxRange[1]*p->idxRange[2]*p->idxRange[3]+i1*p->idxRange[2]*p->idxRange[3]+i2*p->idxRange[3]+i3;
+            _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+            idx0[_1DIdx]=i0+1+p->idxOffset[0];
+            idx1[_1DIdx]=i1+1+p->idxOffset[1];
+            idx2[_1DIdx]=i2+1+p->idxOffset[2];
+            idx3[_1DIdx]=i3+1+p->idxOffset[3];
+        }
+  */
+  
+  //*numOutBytes=p->bSize*16;  
+  
+  //inBuf[0] is "mode"
+  switch(inBuf[0]){
+    //R:UCSparse
+    case 0:
+      //if(D_G){printf("\nDC:UCSparse\n");} //DEBUG
+      //bp->nonZeros=*(uint16_t*)(&inBuf[9]);
+      //bytePos=11;
+      bp->nonZeros=*(uint16_t*)(&inBuf[1]);
+      bytePos=3;
+      for(j=0;j<p->bSize;j++){
+          data[j]=0;
+      }
+      for(j=0;j<bp->nonZeros;j++){
+        //i0=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[0]; //i0
+        i0=*(uint16_t*)(&inBuf[bytePos]); //i0
+        bytePos+=2;
+        //i1=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[1]; //i1
+        i1=*(uint16_t*)(&inBuf[bytePos]); //i1
+        bytePos+=2;
+        //i2=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[2]; //i2
+        i2=*(uint16_t*)(&inBuf[bytePos]); //i2
+        bytePos+=2;
+        //i3=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[3]; //i3
+        i3=*(uint16_t*)(&inBuf[bytePos]); //i3
+        bytePos+=2;
+        _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+        data[_1DIdx]=*(double*)(&inBuf[bytePos]);
+        bytePos+=8; 
+      }
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      break;
+    //R:UCNonSparse
+    case 1:
+      //if(D_G){printf("\nDC:UCNonSparse\n");} //DEBUG
+      //memcpy(&outBuf[p->bSize*8], &inBuf[9], p->bSize*8);
+      memcpy(data, &inBuf[1], p->bSize*8);
+      bytePos=p->bSize*8;
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      break;
+    //R:CSparse
+    case 2:
+      //if(D_G){printf("\nDC:CSparse\n");} //DEBUG
+      //for(j=0;j<p->bSize;j++){
+      //  data[j]=0;
+      //}
+      
+      //bp->patternBits=inBuf[13];
+      //bp->ECQBits=inBuf[14];      
+      
+      bp->patternBits=inBuf[5];
+      bp->ECQBits=inBuf[6];
+      
+      //if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      
+      //bp->numOutliers=*(uint16_t*)(&inBuf[15]);
+      //bitPos=17*8;
+      bp->numOutliers=*(uint16_t*)(&inBuf[7]);
+      bitPos=9*8;
+      //if(D_R){printf("bp->numOutliers:%d\n",bp->numOutliers);} //DEBUG
+
+      bp->scalesBinSize=1/(double)(((uint64_t)1<<(bp->patternBits-1))-1);
+  
+      bp->binSize=p->usedEb*2;
+      
+      //if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+
+      for(j=0;j<p->sbSize;j++){
+        patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
+        //if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+      }
+      for(j=0;j<p->sbNum;j++){
+        scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
+        //if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+      }
+      
+      /* //Splitting
+      for(j=0;j<p->bSize;j++){
+        data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
+      }
+      */
+      for(j=0;j<p->bSize;j++){
+        ECQ[j]=0;
+      }
+      switch(bp->ECQBits){
+        case 2:
+          for(j=0;j<bp->numOutliers;j++){
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            
+            _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            ECQTemp=readBits_I64(inBuf,&bitPos,1);
+            ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+            ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            ////data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
+            ECQ[_1DIdx]=ECQTemp;
+            
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+        default: //bp->ECQBits>2
+          //if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: bp->ECQBits:%d bp->numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+    
+          for(j=0;j<bp->numOutliers;j++){
+            _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            ////if(DEBUG){printf("temp:%ld\n",temp);} //DEBUG
+            switch(temp){
+              case 0:  //+-1
+                ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                break;
+              case 1: //Others
+                ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
+                ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                break;
+              //default:
+              ////  printf("ERROR: Bad 2-bit value: 0x%lx",temp);
+              // assert(0); //AMG
+              //  break;
+            }
+            
+            //data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
+            ECQ[_1DIdx]=ECQTemp;
+            
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+      }
+      //static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,uint64_t numBits){ // numBits must be in range [0:56]
+      //patternQ=(int64_t*)(inBuf+15); 
+      //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
+      
+      bytePos=(bitPos+7)/8;
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      
+      //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
+      pastri_double_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
+
+      break;
+    //R:CNonSparse
+    case 3:
+      //if(D_G){printf("\nDC:CNonSparse\n");} //DEBUG
+      
+      //for(j=0;j<p->bSize;j++){
+      //  data[j]=0;
+      //}
+      
+      //bp->patternBits=inBuf[13];
+      //bp->ECQBits=inBuf[14];
+      
+      bp->patternBits=inBuf[5];
+      bp->ECQBits=inBuf[6];
+      
+      //if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      
+      //bitPos=15*8;
+      bitPos=7*8;
+
+      bp->scalesBinSize=1/(double)(((uint64_t)1<<(bp->patternBits-1))-1);
+      bp->binSize=p->usedEb*2;
+      
+      //if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+
+      for(j=0;j<p->sbSize;j++){
+        patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
+        //if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+      }
+      for(j=0;j<p->sbNum;j++){
+        scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
+        //if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+      }
+      /* //Splitting
+      for(j=0;j<p->bSize;j++){
+        data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
+        ////if(DEBUG){printf("DC:PS[%d]=%.6e\n",j,data[j]);}
+      }
+      */
+      switch(bp->ECQBits){
+        case 2:
+          for(j=0;j<p->bSize;j++){
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            switch(temp){
+              case 0:
+                ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                break;
+              case 1:
+                ECQTemp=0;
+                break;
+              default:
+                assert(0);
+                break;
+            }
+            
+            ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            //data[j]-=ECQTemp*bp->binSize; //Splitting
+            ECQ[j]=ECQTemp;
+            
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+        default: //bp->ECQBits>2
+          ////if(DEBUG)printf("AMG_R1:bitPos: %ld\n",bitPos);
+          
+          for(j=0;j<p->bSize;j++){
+            ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            ////if(DEBUG)printf("AMG_R2:bitPos: %ld\n",bitPos);
+
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            switch(temp){
+              case 0:
+                ////if(DEBUG)printf("Read:0");
+                temp2=readBits_UI64(inBuf,&bitPos,1);
+                switch(temp2){
+                  case 0:
+                    ////if(DEBUG)printf("0");
+                    ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                    ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    ////if(DEBUG)printf("R:ECQTemp:%ld\n",ECQTemp);
+                    ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                    ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    break;
+                  case 1:
+                    ////if(DEBUG)printf("1\n");
+                    ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
+                    ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    break;
+                  default:
+                    assert(0);
+                    break;
+                }
+                break;
+              case 1:
+                ////if(DEBUG)printf("Read:1\n");
+                ECQTemp=0;
+                ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                break;
+              default:
+                assert(0);
+                break;
+            }
+            
+            ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            //data[j]-=ECQTemp*bp->binSize; //Splitting
+            ECQ[j]=ECQTemp;
+            
+            ////if(DEBUG){printf("DC:data[%d]:%.6e\n",j,data[j]);} //DEBUG
+          }
+          break;
+      }
+      //static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,uint64_t numBits){ // numBits must be in range [0:56]
+      //patternQ=(int64_t*)(inBuf+15); 
+      //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
+      bytePos=(bitPos+7)/8;
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      
+      //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
+      pastri_double_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
+      break;
+      
+    default:
+      assert(0);
+      break;
+  } 
+  (*numReadBytes)=bytePos;
+}
+
+static inline void pastri_double_Decompress(unsigned char*inBuf,int dataSize,pastri_params *p,unsigned char*outBuf,int *numReadBytes){
+  //Decompress one block of doubles from inBuf into outBuf.
+  //*numReadBytes is filled in by pastri_double_Decode; dataSize is not read here.
+  pastri_blockParams blockParams;
+
+  //Scratch arrays handed to the decoder (error corrections, scales, pattern points).
+  int64_t ECQ[MAX_BLOCK_SIZE];
+  int64_t scalesQ[MAX_PS_SIZE];
+  int64_t patternQ[MAX_PS_SIZE];
+
+  //Decoding also performs the data prediction, which includes inverse quantization.
+  pastri_double_Decode(inBuf,p,&blockParams,outBuf,numReadBytes,patternQ,scalesQ,ECQ);
+}
+
+//inBuf vs Decompressed
+static inline int pastri_double_Check(unsigned char*inBuf,int dataSize,unsigned char*DC,pastri_params *p){
+  //Verifies that a decompressed block stays within the error bound of the
+  //original block.
+  //
+  //  inBuf    : original block, reinterpreted as an array of double
+  //  dataSize : not read by this function
+  //  DC       : decompressed block, reinterpreted as an array of double
+  //  p        : supplies the element count (bSize) and the error bound (usedEb)
+  //
+  //Behaviour:
+  //  - elements 0 .. p->bSize-1 are compared in increasing order;
+  //  - assert(0) fires on the first element whose absolute difference is
+  //    strictly greater than p->usedEb;
+  //  - the function returns 0 whenever it returns.
+  //
+  //NOTE(review): assuming assert is the standard <assert.h> macro, a build with
+  //NDEBUG defined turns the check into a no-op that still returns 0 - confirm
+  //that callers do not rely on the return value.
+  //
+  //Index arrays are not compared: the former idx0..idx3 comparison is
+  //disabled, only the floating point payload is checked.
+
+  double *original=(double*)(inBuf);
+  double *restored=(double*)(DC);
+  int k=0;
+
+  //Comparing Data:
+  while(k<p->bSize){
+    if(abs_FastD(original[k]-restored[k])>p->usedEb){
+      //Error bound violated at element k.
+      assert(0);
+    }
+    k++;
+  }
+
+  return 0;
+}
+
+
+#endif
diff --git a/deps/SZ/sz/include/pastriF.h b/deps/SZ/sz/include/pastriF.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c1d5879649e34636e20b383a4ac7bb818eea0e8
--- /dev/null
+++ b/deps/SZ/sz/include/pastriF.h
@@ -0,0 +1,911 @@
+#ifndef PASTRIF_H
+#define PASTRIF_H
+
+static inline int64_t pastri_float_quantize(float x, float binSize){
+  //Quantize x to an integer number of bins of width binSize.
+  //Rounding is done by adding 0.5 carrying the sign of the scaled value and
+  //then converting to int64_t, which truncates toward zero.
+  const uint64_t signMask=(uint64_t)0x8000000000000000;
+  u_UI64I64D scaled,rounder;
+
+  x=x/binSize; //scaled value, still held in the float parameter
+  scaled.d=x;
+  rounder.d=0.5;
+
+  //Copy the top (sign) bit of the scaled value onto 0.5: add or sub 0.5, depending on the sign.
+  rounder.ui64 |= (scaled.ui64 & signMask);
+
+  return (int64_t)(x + rounder.d);
+}
+
+static inline void pastri_float_PatternMatch(float*data,pastri_params* p,pastri_blockParams* bp,int64_t* patternQ,int64_t *scalesQ, int64_t* ECQ){
+  //Pattern-matching step of the float compressor.
+  //
+  //Inputs:
+  //  data     : p->bSize values, treated as p->sbNum sub-blocks of p->sbSize values
+  //  p        : bSize, sbSize, sbNum and usedEb are read
+  //Outputs:
+  //  patternQ : p->sbSize quantized values of the sub-block that holds the extremum
+  //  scalesQ  : p->sbNum quantized scale factors, one per sub-block
+  //  ECQ      : p->bSize quantized error-correction values
+  //  bp       : nonZeros, binSize, patternBits, scaleBits, scalesBinSize,
+  //             ECQExt, ECQ1s and ECQOthers are written
+  //
+  //A block in which no value has a positive magnitude (e.g. all zeros) is
+  //handled: the extremum index stays at 0, so the pattern is sub-block 0.
+
+  //First, find the extremum point:
+  float absExt=0; //Absolute value of Extremum
+  //Index of Extremum. It has to start at a valid index: the loop below only
+  //updates it on a strictly positive magnitude, and with the former start
+  //value of -1 an all-zero block went on to read data[-1].
+  int extIdx=0;
+  bp->nonZeros=0;
+  int i,sb;
+  for(i=0;i<p->bSize;i++){
+    //Values whose magnitude exceeds the error bound are counted as non-zero.
+    if(abs_FastD(data[i])>p->usedEb){
+      bp->nonZeros++;
+    }
+    if(abs_FastD(data[i])>absExt){
+      absExt=abs_FastD(data[i]);
+      extIdx=i;
+    }
+  }
+  int patternIdx; //Starting Index of Pattern
+  patternIdx=(extIdx/p->sbSize)*p->sbSize;
+  
+  float patternExt=data[extIdx];
+  bp->binSize=2*p->usedEb;
+  
+  //Quantize the pattern, i.e. the sub-block that contains the extremum.
+  for(i=0;i<p->sbSize;i++){
+    patternQ[i]=pastri_float_quantize(data[patternIdx+i],bp->binSize);
+  }
+  
+  //Pattern points and scales use the same bit width.
+  bp->patternBits=bitsNeeded_float((abs_FastD(patternExt)/bp->binSize)+1)+1;
+  bp->scaleBits=bp->patternBits;
+  bp->scalesBinSize=1/(float)(((uint64_t)1<<(bp->scaleBits-1))-1);
+  
+  //Calculate Scales.
+  //Local extremum index. This is not the actual extremum of the current sb, but rather the index that corresponds to the global (block) extremum.
+  int localExtIdx=extIdx%p->sbSize;
+  int patternExtZero=(patternExt==0);
+  for(sb=0;sb<p->sbNum;sb++){
+    //Scale of a sub-block: its value at the extremum position divided by the
+    //pattern extremum, or 0 when the pattern extremum itself is 0.
+    scalesQ[sb]=pastri_float_quantize((patternExtZero ? 0 : data[sb*p->sbSize+localExtIdx]/patternExt),bp->scalesBinSize);
+  }
+
+  //Error correction: quantized difference between the scale*pattern
+  //prediction and the actual data, in units of bp->binSize.
+  bp->ECQExt=0;
+  int _1DIdx;
+  bp->ECQ1s=0;
+  bp->ECQOthers=0;
+  float PS_binSize=bp->scalesBinSize*bp->binSize;
+  for(sb=0;sb<p->sbNum;sb++){
+    for(i=0;i<p->sbSize;i++){
+      _1DIdx=sb*p->sbSize+i;
+      ECQ[_1DIdx]=pastri_float_quantize( (scalesQ[sb]*patternQ[i]*PS_binSize-data[_1DIdx]),bp->binSize );
+      float absECQ=abs_FastD(ECQ[_1DIdx]);
+      if(absECQ > bp->ECQExt)
+        bp->ECQExt=absECQ; //largest magnitude seen so far
+      //Tally the value by class: 0, +-1, anything else.
+      switch (ECQ[_1DIdx]){
+        case 0:
+          //ECQ0s++; //Currently not needed
+          break;
+        case 1:
+          bp->ECQ1s++;
+          break;
+        case -1:
+          bp->ECQ1s++;
+          break;
+        default:
+          bp->ECQOthers++;
+          break;
+      }
+    }
+  }
+  
+  /*
+  //DEBUG: Self-check. Remove this later.
+  for(sb=0;sb<p->sbNum;sb++){
+    for(i=0;i<p->sbSize;i++){
+      _1DIdx=sb*p->sbSize+i;
+      float decompressed=scalesQ[sb]*patternQ[i]*scalesBinSize*binSize-ECQ[_1DIdx]*binSize;
+      if(abs_FastD(decompressed-data[_1DIdx])>(p->usedEb)){
+        //printf("p->usedEb=%.6e\n",p->usedEb);
+        //printf("data[%d]=%.6e decompressed[%d]=%.6e diff=%.6e\n",_1DIdx,data[_1DIdx],_1DIdx,decompressed,abs_FastD(data[_1DIdx]-decompressed));
+        assert(0);
+      }
+    }
+  }
+  */
+}
+
+static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ,pastri_params *p,pastri_blockParams* bp,unsigned char* outBuf,int *numOutBytes){
+  //Picks the smallest of four layouts for the block and writes it to outBuf.
+  //  data              : the p->bSize original values (stored verbatim by the uncompressed modes)
+  //  patternQ, scalesQ : quantized pattern points (p->sbSize) and scales (p->sbNum)
+  //  ECQ               : p->bSize quantized error-correction values
+  //  p                 : reads dataSize, bSize, sbSize, sbNum, idxRange[0..3] and usedEb
+  //  bp                : reads ECQExt, nonZeros, ECQ1s, ECQOthers, patternBits, scaleBits;
+  //                      writes ECQBits, _1DIdxBits and numOutliers
+  //  outBuf            : outBuf[0] receives the mode (0..3), the payload follows
+  //  numOutBytes       : receives the computed byte size of the chosen layout
+  bp->ECQBits=bitsNeeded_UI64(bp->ECQExt)+1;
+  bp->_1DIdxBits=bitsNeeded_UI64(p->bSize);
+  
+  int i;
+  
+  //Encode: 4 options:
+  //Compressed, Sparse ECQ
+  //Compressed, Non-Sparse ECQ
+  //Uncompressed, Sparse Data
+  //Uncompressed, Non-sparse Data
+  
+  unsigned int UCSparseBits;  //Uncompressed, Sparse bits. Just like the original GAMESS data. Includes: mode, nonZeros, {indexes, data}
+  unsigned int UCNonSparseBits;  //Uncompressed, NonSparse bits. Includes: mode, data
+  unsigned int CSparseBits;  //Includes: mode, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
+  unsigned int CNonSparseBits;  //Includes: mode, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
+    
+  //Consider: ECQ0s, ECQ1s, ECQOthers. Number of following values in ECQ: {0}, {1,-1}, { val<=-2, val>=2}
+  //ECQ0s is actually not needed, but others are needed.
+
+  //Size estimates, in bits. Header fields are whole bytes, so they are scaled by 8.
+  //p->dataSize is the byte width of one value and must only scale the raw values kept
+  //by the uncompressed modes: scaling the headers by it, as was done before, is right
+  //only for dataSize==8 and gives wrong sparse and compressed sizes otherwise.
+  UCSparseBits = 8*(1 + 2) + bp->nonZeros*(4*16 + p->dataSize*8);  //mode, nonZeros, then per value: 64 bits for 4 indexes + dataSize*8 bits for data.
+  UCNonSparseBits = 8*(1 + p->bSize*p->dataSize);
+  bp->numOutliers=bp->ECQ1s+bp->ECQOthers;
+  if(bp->ECQBits==2){
+    CSparseBits = 8*(1+4+1+1+2) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + bp->ECQ1s*(1+bp->_1DIdxBits);
+    CNonSparseBits = 8*(1+4+1+1) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + p->bSize + bp->ECQ1s ;  //Or: ECQ0s+ECQ1s*2;
+  }else{ //ECQBits>2
+    CSparseBits = 8*(1+4+1+1+2) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + bp->ECQ1s*(2+bp->_1DIdxBits) + bp->ECQOthers*(1+bp->_1DIdxBits+bp->ECQBits);
+    //Non-sparse, per value: 1 bit for a zero, 3 bits for +-1, 2+ECQBits bits otherwise.
+    CNonSparseBits = 8*(1+4+1+1)+ bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + p->bSize + bp->ECQ1s*2 + bp->ECQOthers*(1+bp->ECQBits);
+  }
+  
+  int UCSparseBytes=(UCSparseBits+7)/8; 
+  int UCNonSparseBytes=(UCNonSparseBits+7)/8; 
+  int CSparseBytes=(CSparseBits+7)/8; 
+  int CNonSparseBytes=(CNonSparseBits+7)/8; 
+  uint64_t bitPos=0;
+  uint64_t bytePos=0;
+  int i0,i1,i2,i3;
+  int _1DIdx;
+  
+    
+  
+  //****************************************************************************************
+  //W:UCSparse
+  if((UCSparseBytes<UCNonSparseBytes) && (UCSparseBytes<CSparseBytes) && (UCSparseBytes<CNonSparseBytes) ){ 
+    //Uncompressed, Sparse bits. Just like the original GAMESS data. Includes: mode, nonZeros, indexes, data
+    *numOutBytes=UCSparseBytes;
+    outBuf[0]=0; //mode
+    
+    //NOTE: the multi-byte stores below go through casted pointers at byte offsets that need not be aligned.
+    *(uint16_t*)(&outBuf[1])=bp->nonZeros;
+    bytePos=3;//0:mode, 1-2:NonZeros. So start from 3.
+    
+    for(i0=0;i0<p->idxRange[0];i0++)
+      for(i1=0;i1<p->idxRange[1];i1++)
+        for(i2=0;i2<p->idxRange[2];i2++)
+          for(i3=0;i3<p->idxRange[3];i3++){
+            _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+            if(abs_FastD(data[_1DIdx])>p->usedEb){
+              //Indexes are stored zero-based, 16 bits each: i0
+              *(uint16_t*)(&outBuf[bytePos])=i0;
+              bytePos+=2;
+              //i1
+              *(uint16_t*)(&outBuf[bytePos])=i1;
+              bytePos+=2;
+              //i2
+              *(uint16_t*)(&outBuf[bytePos])=i2;
+              bytePos+=2;
+              //i3
+              *(uint16_t*)(&outBuf[bytePos])=i3;
+              bytePos+=2;
+              
+              *(float*)(&outBuf[bytePos])=data[_1DIdx];
+              bytePos+=p->dataSize;
+            }
+          }
+    
+    
+  //****************************************************************************************
+  //(The first three modes are taken only when strictly smaller than all others; ties fall through to CNonSparse.)
+  //W:UCNonSparse
+  }else if((UCNonSparseBytes<UCSparseBytes) && (UCNonSparseBytes<CSparseBytes) && (UCNonSparseBytes<CNonSparseBytes) ){ 
+    //Uncompressed, NonSparse bits. Includes: mode, data
+    *numOutBytes=UCNonSparseBytes;
+    //if(D_G){printf("UCNonSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    outBuf[0]=1; //mode
+    
+    //The raw values follow the mode byte directly.
+    memcpy(&outBuf[1], data, p->bSize*p->dataSize);
+    
+    //if(D_G)printf("UCNonSparseBytes:%d \n",UCNonSparseBytes); //DEBUG
+    /*
+    for(i=0;i<UCNonSparseBytes-17;i++){
+      //printf("%d ",inBuf[p->bSize*8+i]);
+    }
+    //printf("\n");
+    for(i=0;i<UCNonSparseBytes-17;i++){
+      //printf("%d ",outBuf[17+i]);
+    }
+    //printf("\n");
+    */
+  //****************************************************************************************
+  //}else if(1){ //DEBUG
+  //W:CSparse
+  }else if((CSparseBytes<UCNonSparseBytes) && (CSparseBytes<UCSparseBytes) && (CSparseBytes<CNonSparseBytes) ){ 
+    //Includes: mode, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
+    *numOutBytes=CSparseBytes;
+    //if(D_G){printf("CSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    ////if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
+    outBuf[0]=2; //mode
+    
+    //Layout: byte 0 = mode, bytes 1-4 = total bytes, byte 5 = patternBits, byte 6 = ECQBits,
+    //bytes 7-8 = numOutliers, then the bit stream: patterns, scales and the outliers.
+    //Each outlier is its index (_1DIdxBits bits) followed by a code:
+    //  ECQBits==2 : "0" -> +1, "1" -> -1
+    //  ECQBits>2  : "00" -> +1, "01" -> -1, "1" followed by ECQBits bits -> any other value
+    //Values equal to 0 are not stored.
+    
+    //outBuf bytes [1:4] are reserved for compressedBytes.
+    outBuf[5]=bp->patternBits;
+    outBuf[6]=bp->ECQBits;
+    //Currently, we are at the end of 7th byte.
+    
+    *(uint16_t*)(&outBuf[7])=bp->numOutliers; 
+    //Now, we are at the end of 9th byte.
+    bitPos=9*8; 
+    
+    ////if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
+
+    for(i=0;i<p->sbSize;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
+    }
+    ////if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
+    for(i=0;i<p->sbNum;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
+    }
+    ////if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG)printf("ECQBits:%d\n",ECQBits);
+    switch(bp->ECQBits){
+      case 2:
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              break;
+            case 1:
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x0\n",i,ECQ[i]); //DEBUG
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              //writeBits_Fast(outBuf,&bitPos,2,0x10);
+              //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
+              //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
+              writeBits_Fast(outBuf,&bitPos,1,0);//0x00
+              break;
+            case -1:
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              //writeBits_Fast(outBuf,&bitPos,2,0x11);
+              //writeBits_Fast(outBuf,&bitPos,2,1);//0x01
+              //writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              assert(0);
+              break;
+          }
+        }
+        break;
+      default: //ECQBits>2
+      for(i=0;i<p->bSize;i++){
+        switch(ECQ[i]){
+          case 0:
+            break;
+          case 1:
+            ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x00\n",i,ECQ[i]); //DEBUG
+            writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+            //writeBits_Fast(outBuf,&bitPos,3,0);//0x000
+            //writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,0);
+            break;
+          case -1:
+            ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01\n",i,ECQ[i]); //DEBUG
+            writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+            //writeBits_Fast(outBuf,&bitPos,3,1);//0x001
+            //writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,1);
+            break;
+          default:
+            ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1 0x%lx\n",i,ECQ[i],ECQ[i]); //DEBUG
+            writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+            //writeBits_Fast(outBuf,&bitPos,2+ECQBits,((uint64_t)0x11<<ECQBits)|ECQ[i]);
+            //writeBits_Fast(outBuf,&bitPos,2+ECQBits,(ECQ[i]&((uint64_t)0x00<<ECQBits))|((uint64_t)0x01<<ECQBits));
+            //writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,1);
+            writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
+            break;
+        }
+      }
+      break;
+    }
+    
+    ////if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
+    //if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+          
+
+    bytePos=(bitPos+7)/8; //total bytes written; reuses the function-level bytePos instead of shadowing it
+    //The 32-bit compressedBytes field lives in outBuf bytes [1:4].
+    *(uint32_t*)(&outBuf[1])=bytePos;
+    
+    //if(D_G)printf("bitPos:%ld CSparseBits:%d bytePos:%d CSparseBytes:%d\n",bitPos,CSparseBits,bytePos,CSparseBytes); //DEBUG
+    if(D_G){assert(bitPos==CSparseBits);}
+    
+  //****************************************************************************************
+  //W:CNonSparse
+  }else { 
+    //Includes: mode, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
+    *numOutBytes=CNonSparseBytes;
+    //if(D_G){printf("CNonSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    ////if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
+    outBuf[0]=3; //mode
+    
+    //Layout: byte 0 = mode, bytes 1-4 = total bytes, byte 5 = patternBits, byte 6 = ECQBits,
+    //then the bit stream: patterns, scales and one code per value:
+    //  ECQBits==2 : "1" -> 0, "00" -> +1, "01" -> -1
+    //  ECQBits>2  : "1" -> 0, "000" -> +1, "001" -> -1, "01" followed by ECQBits bits -> any other value
+    
+    //outBuf bytes [1:4] are reserved for compressedBytes.
+    outBuf[5]=bp->patternBits;
+    outBuf[6]=bp->ECQBits;
+    bitPos=7*8; //Currently, we are at the end of 7th byte.
+    
+    ////if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
+
+    for(i=0;i<p->sbSize;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
+    }
+    ////if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
+    for(i=0;i<p->sbNum;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
+    }
+    ////if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG)printf("ECQBits:%d\n",ECQBits);
+    switch(bp->ECQBits){
+      case 2:
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              ////if(DEBUG)printf("Index:%d ECQ:%d Written:0x1\n",i,ECQ[i]); //DEBUG
+              writeBits_Fast(outBuf,&bitPos,1,1);//0x1
+              break;
+            case 1:
+              ////if(DEBUG)printf("Index:%d ECQ:%d Written:0x00\n",i,ECQ[i]); //DEBUG
+              //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              break;
+            case -1:
+              ////if(DEBUG)printf("Index:%d ECQ:%d Written:0x01\n",i,ECQ[i]); //DEBUG
+              //writeBits_Fast(outBuf,&bitPos,2,2); //0x01
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              assert(0);
+              break;
+          }
+        }
+        break;
+      default: //ECQBits>2
+        ////if(DEBUG) printf("AMG_W1:bitPos:%ld\n",bitPos); //DEBUG
+        for(i=0;i<p->bSize;i++){
+          ////if(DEBUG){printf("AMG_W3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+          ////if(DEBUG) printf("AMG_W2:bitPos:%ld\n",bitPos); //DEBUG
+          ////if(DEBUG) printf("ECQ[%d]:%ld\n",i,ECQ[i]); //DEBUG
+          switch(ECQ[i]){
+            case 0:
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              writeBits_Fast(outBuf,&bitPos,1,1);  //0x1
+              //wVal=1; writeBits_Fast(outBuf,&bitPos,1,wVal); //0x1
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+            case 1:
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x000\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              //writeBits_Fast(outBuf,&bitPos,3,0); //0x000
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              //wVal=0; writeBits_Fast(outBuf,&bitPos,3,wVal); //0x000
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+            case -1:
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x001\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              //writeBits_Fast(outBuf,&bitPos,3,8); //0x001
+              writeBits_Fast(outBuf,&bitPos,1,0); 
+              writeBits_Fast(outBuf,&bitPos,1,0); 
+              writeBits_Fast(outBuf,&bitPos,1,1); 
+              //wVal=8; writeBits_Fast(outBuf,&bitPos,3,wVal); //0x001
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+            default:
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01 0x%lx\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              //writeBits_Fast(outBuf,&bitPos,2,2); //0x01
+              writeBits_Fast(outBuf,&bitPos,1,0); 
+              writeBits_Fast(outBuf,&bitPos,1,1); 
+              //wVal=2; writeBits_Fast(outBuf,&bitPos,2,wVal); //0x01
+              writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+          }
+        }
+        break;
+    }
+    
+    ////if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
+    //if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+    
+          
+
+    bytePos=(bitPos+7)/8; //total bytes written; reuses the function-level bytePos instead of shadowing it
+    //The 32-bit compressedBytes field lives in outBuf bytes [1:4].
+    *(uint32_t*)(&outBuf[1])=bytePos;
+    
+    //if(D_G)printf("bitPos:%ld CNonSparseBits:%d bytePos:%d CNonSparseBytes:%d\n",bitPos,CNonSparseBits,bytePos,CNonSparseBytes); //DEBUG
+    if(D_G){assert(bitPos==CNonSparseBits);}
+    
+  }
+  ////for(i=213;i<233;i++)if(DEBUG)printf("AMG_WE:bitPos:%d buffer[%d]=0x%lx\n",i*8,i,*(uint64_t*)(&outBuf[i])); //DEBUG
+  
+}
+static inline int pastri_float_Compress(unsigned char*inBuf,pastri_params *p,unsigned char*outBuf,int *numOutBytes){
+  //Compresses one block of floats.
+  //
+  //  inBuf       : the block, reinterpreted as an array of float
+  //  p           : compression parameters, forwarded unchanged to both stages
+  //  outBuf      : destination buffer, filled by pastri_float_Encode
+  //  numOutBytes : output size in bytes, set by pastri_float_Encode
+  //
+  //Always returns 0.
+
+  //Per-block state shared by the two stages.
+  pastri_blockParams blockParams;
+
+  //Scratch space: quantized error corrections, scales and pattern points.
+  int64_t ECQ[MAX_BLOCK_SIZE];
+  int64_t scalesQ[MAX_PS_SIZE];
+  int64_t patternQ[MAX_PS_SIZE];
+
+  float *values=(float*)inBuf;
+
+  //Stage 1: pattern matching fills patternQ, scalesQ, ECQ and blockParams.
+  pastri_float_PatternMatch(values,p,&blockParams,patternQ,scalesQ,ECQ);
+
+  //Stage 2: encoding (mode selection and bit packing) into outBuf.
+  pastri_float_Encode(values,patternQ,scalesQ,ECQ,p,&blockParams,outBuf,numOutBytes);
+
+  return 0;
+}
+
+static inline float pastri_float_InverseQuantization(int64_t q, float binSize){ //quantized bin count -> value
+  return binSize*q;
+}
+
+static inline void pastri_float_PredictData(pastri_params *p,pastri_blockParams *bp,float *data,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ){
+  //Rebuilds data[0..p->bSize-1] as the dequantized scale*pattern product minus the dequantized error correction.
+  const float patternScaleBin=bp->scalesBinSize*bp->binSize;
+  int idx;
+  for(idx=0;idx<p->bSize;++idx){
+    data[idx]=pastri_float_InverseQuantization(scalesQ[idx/p->sbSize]*patternQ[idx%p->sbSize],patternScaleBin) - pastri_float_InverseQuantization(ECQ[idx],bp->binSize);
+  }
+}
+
+//Decodes one block from inBuf and writes the reconstructed float values to outBuf.
+//inBuf[0] is the block mode:
+//  0 (UCSparse)    : uint16 count of nonzeros at byte 1, then per nonzero four uint16 indexes followed by the value
+//  1 (UCNonSparse) : the block is copied from inBuf+1 as-is
+//  2 (CSparse)     : patternBits at byte 5, ECQBits at byte 6, uint16 numOutliers at byte 7, bit stream from byte 9:
+//                    sbSize pattern values, sbNum scale values, then numOutliers (index, ECQ) entries
+//  3 (CNonSparse)  : patternBits at byte 5, ECQBits at byte 6, bit stream from byte 7:
+//                    sbSize pattern values, sbNum scale values, then one prefix-coded ECQ per element
+//For modes 2 and 3 the quantized values are stored in patternQ/scalesQ/ECQ (caller-provided scratch) and the
+//block is rebuilt by pastri_float_PredictData. Bytes 1..4 of the header are not read in this function.
+//*numReadBytes receives the final value of bytePos.
+//NOTE(review): in mode 0 bytePos advances by 8 after a float is read, and in mode 1 p->bSize*8 bytes are copied
+//  into a float buffer and bytePos is set to p->bSize*8 (the mode byte is not counted). Possibly carried over from
+//  a double-precision variant - confirm against the encoder, which is not visible here.
+//NOTE(review): indexes read from the stream (_1DIdx, i0..i3) are used without a range check.
+static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,pastri_blockParams *bp,unsigned char*outBuf,int *numReadBytes,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ){
+  int j;
+  //Width in bits of a flattened element index inside the outlier list (mode 2)
+  bp->_1DIdxBits=bitsNeeded_UI64(p->bSize);
+  //float *data=(float*)(outBuf+p->bSize*8);
+  float *data=(float*)(outBuf);
+  int i0,i1,i2,i3;
+  //uint16_t *idx0,*idx1,*idx2,*idx3;
+  int _1DIdx;
+
+  int64_t ECQTemp;
+  uint64_t bytePos=0;
+  uint64_t bitPos=0;
+  uint64_t temp,temp2;
+  //int sb,localIdx;
+
+  
+  //idx0=(uint16_t*)(outBuf           );
+  //idx1=(uint16_t*)(outBuf+p->bSize*2);
+  //idx2=(uint16_t*)(outBuf+p->bSize*4);
+  //idx3=(uint16_t*)(outBuf+p->bSize*6);
+  //p->idxOffset[0]=*(uint32_t*)(&inBuf[1]);
+  //p->idxOffset[1]=*(uint32_t*)(&inBuf[3]);
+  //p->idxOffset[2]=*(uint32_t*)(&inBuf[5]);
+  //p->idxOffset[3]=*(uint32_t*)(&inBuf[7]);
+  /*
+  for(i0=0;i0<p->idxRange[0];i0++)
+    for(i1=0;i1<p->idxRange[1];i1++)
+      for(i2=0;i2<p->idxRange[2];i2++)
+        for(i3=0;i3<p->idxRange[3];i3++){
+            //_1DIdx=i0*p->idxRange[1]*p->idxRange[2]*p->idxRange[3]+i1*p->idxRange[2]*p->idxRange[3]+i2*p->idxRange[3]+i3;
+            _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+            idx0[_1DIdx]=i0+1+p->idxOffset[0];
+            idx1[_1DIdx]=i1+1+p->idxOffset[1];
+            idx2[_1DIdx]=i2+1+p->idxOffset[2];
+            idx3[_1DIdx]=i3+1+p->idxOffset[3];
+        }
+  */
+  
+  //*numOutBytes=p->bSize*16;  
+  
+  //inBuf[0] is "mode"
+  switch(inBuf[0]){
+    //R:UCSparse
+    case 0:
+      //if(D_G){printf("\nDC:UCSparse\n");} //DEBUG
+      //bp->nonZeros=*(uint16_t*)(&inBuf[9]);
+      //bytePos=11;
+      bp->nonZeros=*(uint16_t*)(&inBuf[1]);
+      bytePos=3;
+      //Start from an all-zero block; only the listed elements are overwritten below
+      for(j=0;j<p->bSize;j++){
+          data[j]=0;
+      }
+      for(j=0;j<bp->nonZeros;j++){
+        //i0=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[0]; //i0
+        i0=*(uint16_t*)(&inBuf[bytePos]); //i0
+        bytePos+=2;
+        //i1=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[1]; //i1
+        i1=*(uint16_t*)(&inBuf[bytePos]); //i1
+        bytePos+=2;
+        //i2=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[2]; //i2
+        i2=*(uint16_t*)(&inBuf[bytePos]); //i2
+        bytePos+=2;
+        //i3=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[3]; //i3
+        i3=*(uint16_t*)(&inBuf[bytePos]); //i3
+        bytePos+=2;
+        //Flatten the four indexes, i3 varying fastest
+        _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+        data[_1DIdx]=*(float*)(&inBuf[bytePos]);
+        //NOTE(review): a float was read but the position advances by 8 bytes - confirm against the encoder
+        bytePos+=8; 
+      }
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      break;
+    //R:UCNonSparse
+    case 1:
+      //if(D_G){printf("\nDC:UCNonSparse\n");} //DEBUG
+      //memcpy(&outBuf[p->bSize*8], &inBuf[9], p->bSize*8);
+      //NOTE(review): copies 8 bytes per element into a float buffer, and bytePos below excludes the mode byte - confirm
+      memcpy(data, &inBuf[1], p->bSize*8);
+      bytePos=p->bSize*8;
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      break;
+    //R:CSparse
+    case 2:
+      //if(D_G){printf("\nDC:CSparse\n");} //DEBUG
+      //for(j=0;j<p->bSize;j++){
+      //  data[j]=0;
+      //}
+      
+      //bp->patternBits=inBuf[13];
+      //bp->ECQBits=inBuf[14];      
+      
+      bp->patternBits=inBuf[5];
+      bp->ECQBits=inBuf[6];
+      
+      //if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      
+      //bp->numOutliers=*(uint16_t*)(&inBuf[15]);
+      //bitPos=17*8;
+      bp->numOutliers=*(uint16_t*)(&inBuf[7]);
+      bitPos=9*8;
+      //if(D_R){printf("bp->numOutliers:%d\n",bp->numOutliers);} //DEBUG
+
+      //Reciprocal of the largest positive value of a signed patternBits-bit integer
+      //NOTE(review): patternBits comes straight from the stream; 0 or 1 would make this shift/division invalid
+      bp->scalesBinSize=1/(float)(((uint64_t)1<<(bp->patternBits-1))-1);
+  
+      //Bin width is twice the error bound in use
+      bp->binSize=p->usedEb*2;
+      
+      //if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+
+      //Pattern and scale values are signed patternBits-bit fields, stored back to back
+      for(j=0;j<p->sbSize;j++){
+        patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
+        //if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+      }
+      for(j=0;j<p->sbNum;j++){
+        scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
+        //if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+      }
+      
+      /* //Splitting
+      for(j=0;j<p->bSize;j++){
+        data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
+      }
+      */
+      //Elements that are not listed as outliers keep ECQ==0
+      for(j=0;j<p->bSize;j++){
+        ECQ[j]=0;
+      }
+      switch(bp->ECQBits){
+        case 2:
+          //Each outlier entry: element index (_1DIdxBits bits) + 1 sign bit; the ECQ is +1 or -1
+          for(j=0;j<bp->numOutliers;j++){
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            
+            _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            ECQTemp=readBits_I64(inBuf,&bitPos,1);
+            //The 1-bit signed read yields 0 or -1; this turns it into +1 or -1
+            ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+            ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            ////data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
+            ECQ[_1DIdx]=ECQTemp;
+            
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+        default: //bp->ECQBits>2
+          //if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: bp->ECQBits:%d bp->numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+    
+          //Each outlier entry: element index, then a flag bit:
+          //  flag 0 -> 1 sign bit, ECQ is +1 or -1
+          //  flag 1 -> signed ECQBits-bit value
+          for(j=0;j<bp->numOutliers;j++){
+            _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            ////if(DEBUG){printf("temp:%ld\n",temp);} //DEBUG
+            switch(temp){
+              case 0:  //+-1
+                ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                break;
+              case 1: //Others
+                ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
+                ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                break;
+              //default:
+              ////  printf("ERROR: Bad 2-bit value: 0x%lx",temp);
+              // assert(0); //AMG
+              //  break;
+            }
+            
+            //data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
+            ECQ[_1DIdx]=ECQTemp;
+            
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+      }
+      //static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,uint64_t numBits){ // numBits must be in range [0:56]
+      //patternQ=(int64_t*)(inBuf+15); 
+      //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
+      
+      //Round the bit position up to a whole number of bytes
+      bytePos=(bitPos+7)/8;
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      
+      //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
+      pastri_float_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
+
+      break;
+    //R:CNonSparse
+    case 3:
+      //if(D_G){printf("\nDC:CNonSparse\n");} //DEBUG
+      
+      //for(j=0;j<p->bSize;j++){
+      //  data[j]=0;
+      //}
+      
+      //bp->patternBits=inBuf[13];
+      //bp->ECQBits=inBuf[14];
+      
+      bp->patternBits=inBuf[5];
+      bp->ECQBits=inBuf[6];
+      
+      //if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      
+      //bitPos=15*8;
+      bitPos=7*8;
+
+      //Same bin sizes as in mode 2
+      //NOTE(review): patternBits comes straight from the stream; 0 or 1 would make this shift/division invalid
+      bp->scalesBinSize=1/(float)(((uint64_t)1<<(bp->patternBits-1))-1);
+      bp->binSize=p->usedEb*2;
+      
+      //if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+
+      for(j=0;j<p->sbSize;j++){
+        patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
+        //if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+      }
+      for(j=0;j<p->sbNum;j++){
+        scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
+        //if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+      }
+      /* //Splitting
+      for(j=0;j<p->bSize;j++){
+        data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
+        ////if(DEBUG){printf("DC:PS[%d]=%.6e\n",j,data[j]);}
+      }
+      */
+      switch(bp->ECQBits){
+        case 2:
+          //One code per element, in element order:
+          //  bit 1            -> ECQ is 0
+          //  bit 0 + sign bit -> ECQ is +1 or -1
+          for(j=0;j<p->bSize;j++){
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            switch(temp){
+              case 0:
+                ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                break;
+              case 1:
+                ECQTemp=0;
+                break;
+              default:
+                assert(0);
+                break;
+            }
+            
+            ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            //data[j]-=ECQTemp*bp->binSize; //Splitting
+            ECQ[j]=ECQTemp;
+            
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+        default: //bp->ECQBits>2
+          ////if(DEBUG)printf("AMG_R1:bitPos: %ld\n",bitPos);
+          
+          //One code per element, in element order:
+          //  bit 1                      -> ECQ is 0
+          //  bits 0,0 + sign bit        -> ECQ is +1 or -1
+          //  bits 0,1 + ECQBits-bit int -> ECQ is that signed value
+          for(j=0;j<p->bSize;j++){
+            ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            ////if(DEBUG)printf("AMG_R2:bitPos: %ld\n",bitPos);
+
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            switch(temp){
+              case 0:
+                ////if(DEBUG)printf("Read:0");
+                temp2=readBits_UI64(inBuf,&bitPos,1);
+                switch(temp2){
+                  case 0:
+                    ////if(DEBUG)printf("0");
+                    ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                    ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    ////if(DEBUG)printf("R:ECQTemp:%ld\n",ECQTemp);
+                    ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                    ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    break;
+                  case 1:
+                    ////if(DEBUG)printf("1\n");
+                    ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
+                    ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    break;
+                  default:
+                    assert(0);
+                    break;
+                }
+                break;
+              case 1:
+                ////if(DEBUG)printf("Read:1\n");
+                ECQTemp=0;
+                ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                break;
+              default:
+                assert(0);
+                break;
+            }
+            
+            ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            //data[j]-=ECQTemp*bp->binSize; //Splitting
+            ECQ[j]=ECQTemp;
+            
+            ////if(DEBUG){printf("DC:data[%d]:%.6e\n",j,data[j]);} //DEBUG
+          }
+          break;
+      }
+      //static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,uint64_t numBits){ // numBits must be in range [0:56]
+      //patternQ=(int64_t*)(inBuf+15); 
+      //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
+      //Round the bit position up to a whole number of bytes
+      bytePos=(bitPos+7)/8;
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      
+      //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
+      pastri_float_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
+      break;
+      
+    default:
+      //Unknown mode byte; with NDEBUG this falls through and reports bytePos==0
+      assert(0);
+      break;
+  } 
+  (*numReadBytes)=bytePos;
+}
+
+//Decompresses one block from inBuf into outBuf.
+//Provides the stack scratch arrays and the block-parameter struct, then hands everything to
+//pastri_float_Decode (decoding, inverse quantization and prediction all happen in there).
+//dataSize is not used by this function.
+static inline void pastri_float_Decompress(unsigned char*inBuf,int dataSize,pastri_params *p,unsigned char*outBuf,int *numReadBytes){
+  pastri_blockParams blockParams;
+  int64_t quantECQ[MAX_BLOCK_SIZE];
+  int64_t quantScales[MAX_PS_SIZE];
+  int64_t quantPattern[MAX_PS_SIZE];
+
+  pastri_float_Decode(inBuf,p,&blockParams,outBuf,numReadBytes,quantPattern,quantScales,quantECQ);
+}
+
+//Compares the original block (inBuf) with the decompressed block (DC), both read as float arrays.
+//Asserts when any of the p->bSize elements differs by more than p->usedEb.
+//Always returns 0; when assert is compiled out (NDEBUG) a violation is therefore not reported.
+//dataSize is not used by this function.
+static inline int pastri_float_Check(unsigned char*inBuf,int dataSize,unsigned char*DC,pastri_params *p){
+  float *original=(float*)(inBuf);
+  float *restored=(float*)(DC);
+  int k=0;
+
+  while(k<p->bSize){
+    if(abs_FastD(original[k]-restored[k])>p->usedEb){
+      assert(0);
+    }
+    k++;
+  }
+  return 0;
+}
+
+
+#endif
diff --git a/deps/SZ/sz/include/pastriGeneral.h b/deps/SZ/sz/include/pastriGeneral.h
new file mode 100644
index 0000000000000000000000000000000000000000..81149256d65d05f8626418dc360644202550e44e
--- /dev/null
+++ b/deps/SZ/sz/include/pastriGeneral.h
@@ -0,0 +1,205 @@
+#ifndef PASTRIGENERAL_H
+#define PASTRIGENERAL_H
+
+
+//Absolute value of a double, obtained by clearing the top bit of its 64-bit representation.
+static inline double abs_FastD(double x){
+  u_UI64I64D bits;
+  bits.d=x;
+  bits.ui64=bits.ui64&(uint64_t)0x7FFFFFFFFFFFFFFF;  //keep everything except bit 63
+  return bits.d;
+}
+
+//Branch-free absolute value of a 64-bit signed integer.
+//The negation is carried out in unsigned arithmetic, so it needs neither an arithmetic
+//right shift of a negative value nor a signed addition that can overflow.
+//For INT64_MIN the magnitude does not fit in int64_t; the result is the conversion of 2^63 back to int64_t.
+static inline int64_t abs_FastI64(int64_t x){
+  const uint64_t ux=(uint64_t)x;
+  const uint64_t signMask=(uint64_t)0-(ux>>63);  //all ones when x is negative, otherwise 0
+  return (int64_t)((ux^signMask)-signMask);      //(~ux)+1 when negative, ux otherwise
+}
+/*
+int abs(int x) {
+   int mask = (x >> (sizeof(int) * CHAR_BIT - 1));
+   return (x + mask) ^ mask;
+}
+*/
+
+
+
+
+//Returns the minimum number of bits needed to represent the magnitude of x.
+//The value is taken from the exponent field of the double: (biased exponent - 1022),
+//which equals floor(log2(|x|))+1 when x is a normal number. For integer-valued x this matches ceil(log2(|x|+1)).
+//Returns 0 when x is zero (either sign).
+//REMEMBER: To represent the whole range [-x:x], the number of bits required is bitsNeeded(x)+1
+static inline int bitsNeeded_double(double x){
+  u_UI64I64D u1;
+  //Explicit test instead of building an all-ones mask with (x!=0)<<31, which shifts into the sign bit of int
+  if(x==0)
+    return 0;
+  u1.d=x;
+  //<<1 drops the sign bit, >>53 leaves the 11-bit exponent field
+  return (int)((u1.ui64<<1)>>53)-1022;
+}
+
+//Returns the minimum number of bits needed to represent the magnitude of x.
+//x is widened to double and the result is taken from the double's exponent field: (biased exponent - 1022),
+//which equals floor(log2(|x|))+1 when the widened value is a normal number.
+//Returns 0 when x is zero (either sign).
+//NEEDS OPTIMIZATION!
+static inline int bitsNeeded_float(float x){
+  u_UI64I64D u1;
+  //Explicit test instead of building an all-ones mask with (x!=0)<<31, which shifts into the sign bit of int
+  if(x==0)
+    return 0;
+  u1.d=x; //Casting to Double!
+  //<<1 drops the sign bit, >>53 leaves the 11-bit exponent field
+  return (int)((u1.ui64<<1)>>53)-1022;
+}
+
+//Returns the number of bits needed to represent x: the position of its highest set bit plus one.
+//Returns 0 for x==0 and 64 when bit 63 is set.
+static inline int bitsNeeded_UI64(uint64_t x){
+  int bits=0;
+  int width;
+
+  //Binary search for the highest set bit: try to drop 32, 16, 8, 4, 2, 1 low bits in turn
+  for(width=32;width>0;width>>=1){
+    if((x>>width)!=0){
+      x>>=width;
+      bits+=width;
+    }
+  }
+
+  //x is now 0 or 1; a remaining 1 is the highest bit itself
+  return bits+(int)x;
+}
+
+//Number of bits needed for the magnitude of a signed value: bitsNeeded_UI64 applied to abs_FastI64(x).
+static inline int bitsNeeded_I64(int64_t x){
+  return bitsNeeded_UI64((uint64_t)abs_FastI64(x));
+}
+
+//Implementations(They are inline, so they should be in this header file)
+
+//Reports the byte order of the machine: 0 for little endian, 1 for big endian.
+//Should work for most cases. May not work at mixed endian systems.
+static inline int myEndianType(){
+  const uint64_t probe=1;
+  //On a little-endian machine the least significant byte is stored at the lowest address
+  const unsigned char lowestAddressByte=*(const unsigned char*)&probe;
+  return (lowestAddressByte==1)?0:1;
+}
+
+//Reverses the order of the 8 bytes of *dataPtr in place.
+//The byte being moved is held in an unsigned char, so values 0x80..0xFF are never
+//converted through a (possibly signed) plain char.
+static inline void flipBytes_UI64(uint64_t *dataPtr){
+  unsigned char *bytes=(unsigned char*)dataPtr;
+  unsigned char held;
+  int lo,hi;
+  //Swap the outermost pair, then move inwards: (0,7) (1,6) (2,5) (3,4)
+  for(lo=0,hi=7;lo<hi;lo++,hi--){
+    held=bytes[hi];
+    bytes[hi]=bytes[lo];
+    bytes[lo]=held;
+  }
+}
+
+//Reads numBits bits starting at bit position *bitPosPtr and returns them as an unsigned value,
+//then advances *bitPosPtr by numBits.
+//Bit order: bit k of the stream is bit (k%8) of buffer[k/8], i.e. the stream is little-endian.
+//The bytes are assembled one at a time, so the result does not depend on the byte order or
+//alignment requirements of the machine, and only the bytes that actually contain the requested
+//bits are read (at most 8).
+static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,char numBits){ // numBits must be in range [0:56]
+    const unsigned char *src = buffer + ((*bitPosPtr) >> 3);
+    //NOTE: bitPos>>3 is the same as bitPos/8, bitPos&7 the same as bitPos%8
+    const unsigned int bitOffset = (unsigned int)((*bitPosPtr) & (uint64_t)0x0000000000000007);
+    const uint64_t mask = ((uint64_t)0x0000000000000001<<numBits)-1;
+    //Number of bytes covering [bitOffset, bitOffset+numBits); at most 8 for numBits<=56
+    unsigned int nBytes = (bitOffset + (unsigned int)numBits + 7u) >> 3;
+    uint64_t temp64b = 0;
+    unsigned int i;
+
+    if(nBytes > 8u)
+      nBytes = 8u;
+    for(i=0;i<nBytes;i++)
+      temp64b |= (uint64_t)src[i] << (8u*i);
+
+    (*bitPosPtr) += numBits;
+    return ((temp64b >> bitOffset) & mask);
+}
+
+//Reads numBits bits with readBits_UI64 and interprets them as a two's-complement signed value
+//of that width (the top bit read is the sign). *bitPosPtr is advanced by numBits.
+//Returns 0 when numBits is 0.
+//The sign extension is done with a comparison and a subtraction instead of shifting a signed
+//value left and right by (64-numBits), which is not defined when bits reach the sign bit or
+//when the shift count is 64.
+static inline int64_t readBits_I64(unsigned char* buffer,uint64_t *bitPosPtr,char numBits){ // numBits must be in range [0:56]
+  const uint64_t raw=readBits_UI64(buffer,bitPosPtr,numBits);//Read value
+  uint64_t signBit;
+  int64_t val;
+
+  if(numBits<=0)
+    return 0;
+
+  signBit=(uint64_t)1<<(numBits-1);
+  val=(int64_t)raw;              //raw < 2^numBits, so it always fits
+  if(raw&signBit)
+    val-=(int64_t)(signBit<<1);  //Sign correction: subtract 2^numBits
+  return val;
+}
+
+//Reads numBits bits starting at bit position *bitPosPtr and returns them as an unsigned value,
+//then advances *bitPosPtr by numBits.
+//Bit order: bit k of the stream is bit (k%8) of buffer[k/8].
+//The value is assembled byte by byte, which gives the same result on little- and big-endian
+//machines without a byte flip, does not require the buffer to be aligned, and reads only the
+//bytes that contain the requested bits (at most 8).
+static inline uint64_t readBits_EndianSafe(unsigned char* buffer,uint64_t *bitPosPtr,char numBits){ // numBits must be in range [0:56]
+    const unsigned char *src = buffer + ((*bitPosPtr)>>3);
+    //NOTE: (*bitPosPtr)>>3 is the same as (*bitPosPtr)/8
+    const unsigned int bitOffset = (unsigned int)((*bitPosPtr) & (uint64_t)0x0000000000000007);
+    const uint64_t mask = ((uint64_t)0x0000000000000001<<numBits)-1;
+    //Number of bytes covering [bitOffset, bitOffset+numBits); at most 8 for numBits<=56
+    unsigned int nBytes = (bitOffset + (unsigned int)numBits + 7u) >> 3;
+    uint64_t temp64b = 0;
+    unsigned int i;
+
+    if(nBytes > 8u)
+      nBytes = 8u;
+    for(i=0;i<nBytes;i++)
+      temp64b |= (uint64_t)src[i] << (8u*i);
+
+    (*bitPosPtr) += numBits;
+    return (temp64b >> bitOffset) & mask;
+}
+
+//Writes the low numBits bits of data at bit position *bitPosPtr, then advances *bitPosPtr by numBits.
+//Bit order: bit k of the stream is bit (k%8) of buffer[k/8].
+//The bits are OR-ed into the buffer, so the buffer should be initialized as 0's for this to work!
+//data is masked to numBits bits; higher bits are silently dropped.
+//The bytes are updated one at a time, so the result does not depend on the byte order or
+//alignment requirements of the machine, and only the bytes that receive bits are touched (at most 8).
+static inline void writeBits_Fast(unsigned char* buffer,uint64_t *bitPosPtr,char numBits,int64_t data){
+    unsigned char *dst = buffer + ((*bitPosPtr)>>3);
+    const unsigned int bitOffset = (unsigned int)((*bitPosPtr) & (uint64_t)0x0000000000000007);
+    const uint64_t mask = ((uint64_t)0x0000000000000001<<numBits)-1;
+    //Value aligned to its position inside the first byte; fits in 64 bits for numBits<=56
+    const uint64_t shifted = (((uint64_t)data)&mask) << bitOffset;
+    //Number of bytes covering [bitOffset, bitOffset+numBits)
+    unsigned int nBytes = (bitOffset + (unsigned int)numBits + 7u) >> 3;
+    unsigned int i;
+
+    if(nBytes > 8u)
+      nBytes = 8u;
+    for(i=0;i<nBytes;i++)
+      dst[i] |= (unsigned char)(shifted >> (8u*i));
+
+    (*bitPosPtr) += numBits;
+}
+
+//Writes the low numBits bits of data at bit position *bitPosPtr, then advances *bitPosPtr by numBits.
+//Bit order: bit k of the stream is bit (k%8) of buffer[k/8].
+//The bits are OR-ed into the existing buffer contents, so the target bits must be 0 beforehand.
+//The bytes are updated one at a time, which gives the same buffer contents on little- and
+//big-endian machines without a byte flip, does not require the buffer to be aligned, and
+//touches only the bytes that receive bits (at most 8).
+static inline void writeBits_EndianSafe(unsigned char* buffer,uint64_t *bitPosPtr,char numBits,uint64_t data){
+    unsigned char *dst = buffer + ((*bitPosPtr)>>3);
+    const unsigned int bitOffset = (unsigned int)((*bitPosPtr) & (uint64_t)0x0000000000000007);
+    const uint64_t mask = ((uint64_t)0x0000000000000001<<numBits)-1;
+    //Value aligned to its position inside the first byte; fits in 64 bits for numBits<=56
+    const uint64_t shifted = (data&mask) << bitOffset;
+    //Number of bytes covering [bitOffset, bitOffset+numBits)
+    unsigned int nBytes = (bitOffset + (unsigned int)numBits + 7u) >> 3;
+    unsigned int i;
+
+    if(nBytes > 8u)
+      nBytes = 8u;
+    for(i=0;i<nBytes;i++)
+      dst[i] |= (unsigned char)(shifted >> (8u*i));
+
+    (*bitPosPtr) += numBits;
+}
+
+
+#endif
diff --git a/deps/SZ/sz/include/rw.h b/deps/SZ/sz/include/rw.h
new file mode 100644
index 0000000000000000000000000000000000000000..846243de5e0fe58a266e8f62f487649b60cb2ebb
--- /dev/null
+++ b/deps/SZ/sz/include/rw.h
@@ -0,0 +1,89 @@
+/**
+ *  @file rw.h
+ *  @author Sheng Di
+ *  @date April, 2015
+ *  @brief Header file for the whole io interface.
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _IO_H
+#define _IO_H
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef _WIN32
+#define PATH_SEPARATOR ';'
+#else
+#define PATH_SEPARATOR ':'
+#endif
+
+/* Declarations only; the definitions are not part of this header. C linkage is requested when compiled as C++. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* NOTE(review): the meaning of the int result is defined by the implementation - confirm before relying on it. */
+int checkFileExistance(char* filePath);
+
+/* Create/free pairs for 2-D (m x n) and 3-D (p x m x n) arrays of float and double.
+ * Presumably each array must be released with the matching free* function and the same dimensions - TODO confirm. */
+float** create2DArray_float(size_t m, size_t n);
+void free2DArray_float(float** data, size_t m);
+float*** create3DArray_float(size_t p, size_t m, size_t n);
+void free3DArray_float(float*** data, size_t p, size_t m);
+double** create2DArray_double(size_t m, size_t n);
+void free2DArray_double(double** data, size_t m);
+double*** create3DArray_double(size_t p, size_t m, size_t n);
+void free3DArray_double(double*** data, size_t p, size_t m);
+size_t checkFileSize(char *srcFilePath, int *status);
+
+/* Readers: take a source path, return a pointer to the data, and report a length through a
+ * size_t out-parameter and a status through *status.
+ * NOTE(review): ownership of the returned buffer and the status values are not visible here - confirm in the implementation. */
+unsigned char *readByteData(char *srcFilePath, size_t *byteLength, int *status);
+double *readDoubleData(char *srcFilePath, size_t *nbEle, int *status);
+int8_t *readInt8Data(char *srcFilePath, size_t *nbEle, int *status);
+int16_t *readInt16Data(char *srcFilePath, size_t *nbEle, int *status);
+uint16_t *readUInt16Data(char *srcFilePath, size_t *nbEle, int *status);
+int32_t *readInt32Data(char *srcFilePath, size_t *nbEle, int *status);
+uint32_t *readUInt32Data(char *srcFilePath, size_t *nbEle, int *status);
+int64_t *readInt64Data(char *srcFilePath, size_t *nbEle, int *status);
+uint64_t *readUInt64Data(char *srcFilePath, size_t *nbEle, int *status);
+float *readFloatData(char *srcFilePath, size_t *nbEle, int *status);
+unsigned short* readShortData(char *srcFilePath, size_t *dataLength, int *status);
+
+/* Same signatures as the readers above with a _systemEndian suffix.
+ * Presumably these read the file in the machine's native byte order - TODO confirm. */
+double *readDoubleData_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+int8_t *readInt8Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+int16_t *readInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+uint16_t *readUInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+int32_t *readInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+uint32_t *readUInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+int64_t *readInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+uint64_t *readUInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+float *readFloatData_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+
+/* Writers: take the data, its length, a target path and a status out-parameter.
+ * NOTE(review): the on-disk format of each variant (text vs. bytes) is defined by the implementation - confirm there. */
+void writeByteData(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status);
+void writeDoubleData(double *data, size_t nbEle, char *tgtFilePath, int *status);
+void writeFloatData(float *data, size_t nbEle, char *tgtFilePath, int *status);
+void writeData(void *data, int dataType, size_t nbEle, char *tgtFilePath, int *status);
+void writeFloatData_inBytes(float *data, size_t nbEle, char* tgtFilePath, int *status);
+void writeDoubleData_inBytes(double *data, size_t nbEle, char* tgtFilePath, int *status);
+void writeShortData_inBytes(short *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeUShortData_inBytes(unsigned short *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeIntData_inBytes(int *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeUIntData_inBytes(unsigned int *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeLongData_inBytes(int64_t *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeULongData_inBytes(uint64_t *states, size_t stateLength, char *tgtFilePath, int *status);
+
+void writeStrings(int nbStr, char *str[], char *tgtFilePath, int *status);
+
+//void convertToPFM_float(float *data, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, int endianType, char *tgtFilePath, int *status);
+
+/* Lower-case entry points with a trailing underscore; every argument is passed by pointer
+ * and the path comes with a separate length (len).
+ * Presumably these are bindings for Fortran callers - TODO confirm. */
+void checkfilesizec_(char *srcFilePath, int *len, size_t *filesize);
+void readbytefile_(char *srcFilePath, int *len, unsigned char *bytes, size_t *byteLength);
+void readdoublefile_(char *srcFilePath, int *len, double *data, size_t *nbEle);
+void readfloatfile_(char *srcFilePath, int *len, float *data, size_t *nbEle);
+void writebytefile_(unsigned char *bytes, size_t *byteLength, char *tgtFilePath, int *len);
+void writedoublefile_(double *data, size_t *nbEle, char *tgtFilePath, int *len);
+void writefloatfile_(float *data, size_t *nbEle, char *tgtFilePath, int *len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _IO_H  ----- */
diff --git a/deps/SZ/sz/include/sz.h b/deps/SZ/sz/include/sz.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bf7fa357f5597864646fc284e218525e2bfd8e4
--- /dev/null
+++ b/deps/SZ/sz/include/sz.h
@@ -0,0 +1,337 @@
+/**
+ *  @file sz.h
+ *  @author Sheng Di
+ *  @date April, 2015
+ *  @brief Header file for the whole compressor.
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_H
+#define _SZ_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <sys/time.h>      /* For gettimeofday(), in microseconds */
+#include <time.h>          /* For time(), in seconds */
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "VarSet.h"
+#include "Huffman.h"
+#include "TightDataPointStorageD.h"
+#include "TightDataPointStorageF.h"
+#include "TightDataPointStorageI.h"
+#include "conf.h"
+#include "dataCompression.h"
+#include "ByteToolkit.h"
+#include "TypeManager.h"
+#include "sz_int8.h"
+#include "sz_int16.h"
+#include "sz_int32.h"
+#include "sz_int64.h"
+#include "sz_uint8.h"
+#include "sz_uint16.h"
+#include "sz_uint32.h"
+#include "sz_uint64.h"
+#include "sz_float.h"
+#include "sz_double.h"
+#include "szd_int8.h"
+#include "szd_int16.h"
+#include "szd_int32.h"
+#include "szd_int64.h"
+#include "szd_uint8.h"
+#include "szd_uint16.h"
+#include "szd_uint32.h"
+#include "szd_uint64.h"
+#include "szd_float.h"
+#include "szd_double.h"
+#include "sz_float_pwr.h"
+#include "sz_double_pwr.h"
+#include "sz_opencl.h"
+#include "callZlib.h"
+#include "rw.h"
+#include "pastri.h"
+#include "sz_float_ts.h"
+#include "szd_float_ts.h"
+#include "utility.h"
+#include "CacheTable.h"
+#include "MultiLevelCacheTable.h"
+#include "MultiLevelCacheTableWideInterval.h"
+#include "exafelSZ.h"
+#include "sz_stats.h"
+
+#ifndef _WIN32	/* PATH_SEPARATOR is ':' unless _WIN32 is defined, in which case it is ';' */
+#define PATH_SEPARATOR ':'	/* every build without _WIN32 */
+#else
+#define PATH_SEPARATOR ';'	/* Windows builds (_WIN32 defined) */
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//typedef char int8_t;
+//typedef unsigned char uint8_t;
+//typedef short int16_t;
+//typedef unsigned short uint16_t;
+//typedef int int32_t;
+//typedef unsigned int uint32_t;
+//typedef long int64_t;
+//typedef unsigned long uint64_t;
+
+#include "defines.h"
+	
+//Note: the following setting should be consistent with stateNum in Huffman.h
+//#define intvCapacity 65536
+//#define intvRadius 32768
+//#define intvCapacity 131072
+//#define intvRadius 65536
+
+#define SZ_COMPUTE_1D_NUMBER_OF_BLOCKS( COUNT, NUM_BLOCKS, BLOCK_SIZE ) \
+    if (COUNT <= BLOCK_SIZE){                  \
+        NUM_BLOCKS = 1;             \
+    }                                   \
+    else{                               \
+        NUM_BLOCKS = COUNT / BLOCK_SIZE;       \
+    }                                   \
+
+#define SZ_COMPUTE_2D_NUMBER_OF_BLOCKS( COUNT, NUM_BLOCKS, BLOCK_SIZE ) \
+    if (COUNT <= BLOCK_SIZE){                   \
+        NUM_BLOCKS = 1;             \
+    }                                   \
+    else{                               \
+        NUM_BLOCKS = COUNT / BLOCK_SIZE;        \
+    }                                   \
+
+#define SZ_COMPUTE_3D_NUMBER_OF_BLOCKS( COUNT, NUM_BLOCKS, BLOCK_SIZE ) \
+    if (COUNT <= BLOCK_SIZE){                   \
+        NUM_BLOCKS = 1;             \
+    }                                   \
+    else{                               \
+        NUM_BLOCKS = COUNT / BLOCK_SIZE;        \
+    }                                   \
+
+/* Sets EARLY_BLOCK_COUNT and LATE_BLOCK_COUNT to COUNT / NUM_BLOCKS and SPLIT_INDEX to COUNT % NUM_BLOCKS, then adds 1 to EARLY_BLOCK_COUNT when the remainder is non-zero. COUNT and NUM_BLOCKS are parenthesised so expression arguments expand correctly. Expands to several statements: use it only as a full statement inside a braced block. */
+#define SZ_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX,       \
+                                       EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) \
+    EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = (COUNT) / (NUM_BLOCKS);           \
+    SPLIT_INDEX = (COUNT) % (NUM_BLOCKS);                                    \
+    if (0 != (SPLIT_INDEX)) {                                                \
+        EARLY_BLOCK_COUNT = EARLY_BLOCK_COUNT + 1;                           \
+    }
+//typedef unsigned long unsigned long;
+//typedef unsigned int uint;
+
+typedef union lint16	/* overlays a short (signed or unsigned view) with a 2-byte array for byte-wise access */
+{
+	unsigned short usvalue;	/* unsigned view */
+	short svalue;		/* signed view */
+	unsigned char byte[2];	/* raw bytes; their order follows the host byte order */
+} lint16;
+
+typedef union lint32	/* overlays an int (signed or unsigned view) with a 4-byte array for byte-wise access */
+{
+	int ivalue;		/* signed view */
+	unsigned int uivalue;	/* unsigned view */
+	unsigned char byte[4];	/* raw bytes; their order follows the host byte order */
+} lint32;
+
+typedef union lint64	/* overlays a 64-bit integer (signed or unsigned view) with an 8-byte array for byte-wise access */
+{
+	int64_t lvalue;		/* signed view; fixed-width because plain long is only 4 bytes on ILP32/LLP64 targets and would not cover byte[8] */
+	uint64_t ulvalue;	/* unsigned view; fixed-width for the same reason */
+	unsigned char byte[8];	/* raw bytes; their order follows the host byte order */
+} lint64;
+
+typedef union ldouble	/* overlays a double with a 64-bit integer and an 8-byte array for bit-level and byte-wise access */
+{
+    double value;		/* floating-point view */
+    uint64_t lvalue;		/* bit-pattern view; fixed-width because unsigned long is only 4 bytes on ILP32/LLP64 targets and would expose just half of the double */
+    unsigned char byte[8];	/* raw bytes; their order follows the host byte order */
+} ldouble;
+
+typedef union lfloat	/* overlays a float with an unsigned int and a 4-byte array for bit-level and byte-wise access */
+{
+    float value;		/* floating-point view */
+    unsigned int ivalue;	/* bit-pattern view */
+    unsigned char byte[4];	/* raw bytes; their order follows the host byte order */
+} lfloat;
+
+/* Array metadata and compression parameters. Passed to SZ_Init_Params(); the globals confparams_cpr and confparams_dec declared below point to instances of it. */
+typedef struct sz_params
+{
+	int dataType; //element type of the data (NOTE(review): presumably one of the type codes from defines.h -- confirm)
+	unsigned int max_quant_intervals; //max number of quantization intervals for quantization
+	unsigned int quantization_intervals; //TODO(review): relation to max_quant_intervals is not documented here -- confirm in conf.c
+	unsigned int maxRangeRadius; //TODO(review): undocumented in this header
+	int sol_ID;// it's SZ or SZ_Transpose, unless the setting is PASTRI compression mode (./configure --enable-pastri)
+	int losslessCompressor; //NOTE(review): presumably selects the lossless back end mentioned under szMode (Zstd/Gzip) -- confirm
+	int sampleDistance; //2 bytes (NOTE(review): the field is an int; "2 bytes" presumably refers to its serialized size -- confirm)
+	float predThreshold;  // 2 bytes (NOTE(review): the field is a float; presumably the serialized size -- confirm)
+	int szMode; //* 0 (best speed) or 1 (better compression with Zstd/Gzip) or 3 temporal-dimension based compression
+	int gzipMode; //* four options: Z_NO_COMPRESSION, or Z_BEST_SPEED, Z_BEST_COMPRESSION, Z_DEFAULT_COMPRESSION
+	int  errorBoundMode; //4 bits (0.5 byte): ABS, REL, ABS_AND_REL, ABS_OR_REL, PSNR, or PW_REL
+	double absErrBound; //absolute error bound
+	double relBoundRatio; //value range based relative error bound ratio
+	double psnr; //PSNR
+	double normErr; //NOTE(review): presumably a norm-based error bound -- confirm
+	double pw_relBoundRatio; //point-wise relative error bound
+	int segment_size; //only used for 2D/3D data compression with pw_relBoundRatio (deprecated)
+	int pwr_type; //only used for 2D/3D data compression with pw_relBoundRatio
+	
+	int protectValueRange; //0 or 1
+	float fmin, fmax; //NOTE(review): presumably the value range for float data when protectValueRange is 1 -- confirm
+	double dmin, dmax; //NOTE(review): presumably the value range for double data when protectValueRange is 1 -- confirm
+	
+	int snapshotCmprStep; //perform single-snapshot-based compression if time_step == snapshotCmprStep
+	int predictionMode; //TODO(review): valid values are not visible in this header
+
+	int accelerate_pw_rel_compression; //NOTE(review): looks like an on/off flag -- confirm
+	int plus_bits; //TODO(review): undocumented in this header
+	
+	int randomAccess; //NOTE(review): looks like an on/off flag -- confirm
+	int withRegression; //NOTE(review): looks like an on/off flag; SZ_compress_args_double() in sz_double.h takes a parameter of the same name -- confirm
+	
+} sz_params;
+
+typedef struct sz_metadata //metadata record returned by SZ_getMetadata() and accepted by SZ_printMetadata()
+{
+	int versionNumber[3]; //only used for checking the version by calling SZ_getMetadata() (3 entries, whereas the global versionNumber declared below has 4)
+	int isConstant; //only used for checking if the data are constant values by calling SZ_getMetadata()
+	int isLossless; //only used for checking if the data compression was lossless, used only by calling SZ_getMetadata()
+	int sizeType; //only used for checking whether the size type is "int" or "long" in the compression, used only by calling SZ_getMetadata()
+	size_t dataSeriesLength; //number of data points in the dataset
+	int defactoNBBins; //real number of quantization bins
+	struct sz_params* conf_params; //configuration parameters
+} sz_metadata;
+
+typedef struct sz_exedata //execution-time settings; the global exe_params declared below points to an instance
+{
+	char optQuantMode;	//quantization mode (0: fixed; 1: optimized)
+	int intvCapacity; // the number of intervals for the linear-scaling quantization
+	int intvRadius;  // the number of intervals for the radius of the quantization range (intvRadius=intvCapacity/2)
+	unsigned int SZ_SIZE_TYPE; //sizeof(size_t) on the running system, in bytes: 4 or 8
+} sz_exedata;
+
+/* Meta info for time-step based compression. NOTE(review): the original comment said "we use a linked list", but this struct has no link member -- confirm. The struct tag is sz_tsc_metainfo while the typedef name is sz_tsc_metadata. */
+typedef struct sz_tsc_metainfo
+{
+	int totalNumOfSteps; //NOTE(review): presumably the total number of time steps -- confirm
+	int currentStep; //NOTE(review): presumably the index of the current time step -- confirm
+	char metadata_filename[256]; //fixed 256-byte buffer for the metadata file name
+	FILE *metadata_file; //stdio handle (NOTE(review): presumably for the file named by metadata_filename -- confirm)
+	unsigned char* bit_array; //sihuan added
+	size_t intersect_size; //sihuan added
+	int64_t* hist_index; //sihuan added: prestep index 
+
+} sz_tsc_metadata;
+
+extern int versionNumber[4]; //4 entries, whereas sz_metadata.versionNumber above holds 3 (NOTE(review): confirm what each entry stores)
+
+//-------------------key global variables--------------
+extern int dataEndianType; //*endian type of the data read from disk
+extern int sysEndianType; //*sysEndianType is actually set automatically.
+
+extern sz_params *confparams_cpr; //NOTE(review): presumably the parameters used for compression (from the _cpr suffix) -- confirm
+extern sz_params *confparams_dec; //NOTE(review): presumably the parameters used for decompression (from the _dec suffix) -- confirm
+extern sz_exedata *exe_params; //note: SZ_Create_ParamsExe() below has a parameter with this same name
+
+//------------------------------------------------
+extern SZ_VarSet* sz_varset; //NOTE(review): presumably the variables registered through SZ_registerVar() -- confirm
+extern sz_multisteps *multisteps; //compression based on multiple time steps (time-dimension based compression)
+extern sz_tsc_metadata *sz_tsc; //time-step compression meta info (struct sz_tsc_metainfo above)
+
+//for pastri; declared only when PASTRI is defined
+#ifdef PASTRI
+extern pastri_params pastri_par; 
+#endif
+
+//sz.h
+HuffmanTree* SZ_Reset(void); /* returns a HuffmanTree* (what is reset is not visible from this header); (void) makes this a real C prototype, so calls that pass arguments are diagnosed */
+
+int SZ_Init(const char *configFilePath); /* initialisation from a configuration file path (NOTE(review): meaning of the int result is not visible here -- confirm in sz.c) */
+/* Initialisation from an in-memory sz_params block instead of a file path. */
+int SZ_Init_Params(sz_params *params);
+/* r5..r1 are the five dimension sizes. NOTE(review): presumably returns the total element count over the dimensions in use -- confirm in sz.c. */
+size_t computeDataLength(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+/* NOTE(review): presumably returns how many of r5..r1 are in use -- confirm in sz.c. */
+int computeDimension(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+int SZ_compress_args_float_subblock(unsigned char* compressedBytes, float *oriData,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1,
+size_t *outSize, int errBoundMode, double absErr_Bound, double relBoundRatio); /* NOTE(review): r* look like the array dimensions and s* / e* like per-dimension start/end indices of the sub-block -- confirm */
+/* Same parameter list as the float version above, with double *oriData. */
+int SZ_compress_args_double_subblock(unsigned char* compressedBytes, double *oriData,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1,
+size_t *outSize, int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+unsigned char *SZ_compress(int dataType, void *data, size_t *outSize, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1); /* takes no error-bound arguments (NOTE(review): presumably uses the configured sz_params -- confirm) */
+/* Variant with explicit error-bound arguments; like SZ_compress it returns unsigned char* and takes a size_t *outSize. */
+unsigned char* SZ_compress_args(int dataType, void *data, size_t *outSize, int errBoundMode, double absErrBound, 
+double relBoundRatio, double pwrBoundRatio, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+/* Variant that takes a compressed_bytes buffer argument and returns int (NOTE(review): presumably a status code -- confirm). */
+int SZ_compress_args2(int dataType, void *data, unsigned char* compressed_bytes, size_t *outSize, 
+int errBoundMode, double absErrBound, double relBoundRatio, double pwrBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+/* Variant with additional s5..s1 / e5..e1 arguments and no pwrBoundRatio (NOTE(review): s* / e* look like sub-block start/end indices -- confirm). */
+int SZ_compress_args3(int dataType, void *data, unsigned char* compressed_bytes, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1);
+/* "rev" variants add a void *reservedValue argument (NOTE(review): its semantics are not visible in this header -- see sz.c). */
+unsigned char *SZ_compress_rev_args(int dataType, void *data, void *reservedValue, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+/* int-returning rev variant that takes a compressed_bytes buffer argument. */
+int SZ_compress_rev_args2(int dataType, void *data, void *reservedValue, unsigned char* compressed_bytes, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+unsigned char *SZ_compress_rev(int dataType, void *data, void *reservedValue, size_t *outSize, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+void SZ_Create_ParamsExe(sz_params** conf_params, sz_exedata** exe_params); /* note: the parameter exe_params has the same name as the global exe_params declared above */
+/* SZ_decompress returns a void*; SZ_decompress_args takes a decompressed_array argument and returns size_t (NOTE(review): ownership of the returned buffer and the meaning of the size_t are not visible here -- confirm in sz.c). */
+void *SZ_decompress(int dataType, unsigned char *bytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+size_t SZ_decompress_args(int dataType, unsigned char *bytes, size_t byteLength, void* decompressed_array, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+/* Metadata access: SZ_getMetadata takes a byte buffer and returns a sz_metadata*; SZ_printMetadata takes such a pointer. */
+sz_metadata* SZ_getMetadata(unsigned char* bytes);
+void SZ_printMetadata(sz_metadata* metadata);
+
+/* NOTE(review): presumably stores r5..r1 into dim[]; the required length of dim is not visible here -- confirm in sz.c. */
+void filloutDimArray(size_t* dim, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+size_t compute_total_batch_size(void); /* takes no arguments and returns a size_t; (void) makes this a real C prototype, so calls that pass arguments are diagnosed */
+
+void SZ_registerVar(int var_id, char* varName, int dataType, void* data, 
+			int errBoundMode, double absErrBound, double relBoundRatio, double pwRelBoundRatio, 
+			size_t r5, size_t r4, size_t r3, size_t r2, size_t r1); /* registers a variable under both a numeric id and a name, with its data pointer, error-bound settings and dimensions (NOTE(review): presumably stored in sz_varset -- confirm) */
+
+int SZ_deregisterVar_ID(int var_id);	/* deregistration by numeric id (cf. the var_id parameter of SZ_registerVar) */
+int SZ_deregisterVar(char* varName);	/* deregistration by name (cf. the varName parameter of SZ_registerVar) */
+int SZ_deregisterAllVars(void);	/* takes no arguments; (void) makes this a real C prototype, so calls that pass arguments are diagnosed */
+
+int SZ_compress_ts_select_var(int cmprType, unsigned char* var_ids, unsigned char var_count, unsigned char** newByteData, size_t *outSize); /* "select_var" form: takes var_ids plus var_count (an unsigned char, so at most 255) */
+int SZ_compress_ts(int cmprType, unsigned char** newByteData, size_t *outSize); /* same without the var_ids / var_count arguments */
+void SZ_decompress_ts_select_var(unsigned char* var_ids, unsigned char var_count, unsigned char *bytes, size_t bytesLength); /* returns void (NOTE(review): where the decompressed data goes is not visible here -- confirm) */
+void SZ_decompress_ts(unsigned char *bytes, size_t byteLength); /* note: the length parameter is spelled byteLength here and bytesLength in the select_var form */
+
+void SZ_Finalize(void); /* takes no arguments and returns nothing; (void) makes this a real C prototype, so calls that pass arguments are diagnosed */
+
+void convertSZParamsToBytes(sz_params* params, unsigned char* result); /* sz_params -> byte buffer (NOTE(review): the required size of result is not visible here -- confirm) */
+void convertBytesToSZParams(unsigned char* bytes, sz_params* params); /* byte buffer -> sz_params */
+/* "customize" entry points: take a name string and an opaque userPara pointer; *status is a pointer argument (NOTE(review): presumably receives the outcome -- confirm). */
+unsigned char* SZ_compress_customize(const char* appName, void* userPara, int dataType, void* data, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, int *status);
+/* Same parameter list; note the first parameter is named cmprName here and appName above. */
+unsigned char* SZ_compress_customize_threadsafe(const char* cmprName, void* userPara, int dataType, void* data, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, int *status);
+/* Decompression counterpart: takes the compressed bytes and byteLength and returns a void*. */
+void* SZ_decompress_customize(const char* appName, void* userPara, int dataType, unsigned char* bytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, int* status);
+/* Same parameter list; first parameter named cmprName. */
+void* SZ_decompress_customize_threadsafe(const char* cmprName, void* userPara, int dataType, unsigned char* bytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, int *status);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_H  ----- */
diff --git a/deps/SZ/sz/include/sz_double.h b/deps/SZ/sz/include/sz_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..1004f118502208fe736d019d5774f223571f30a0
--- /dev/null
+++ b/deps/SZ/sz/include/sz_double.h
@@ -0,0 +1,100 @@
+/**
+ *  @file sz_double.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_double.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Double_H
+#define _SZ_Double_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+unsigned char* SZ_skip_compress_double(double* data, size_t dataLength, size_t* outSize);
+/* computeReqLength_double returns void and takes reqLength / medianValue pointers (NOTE(review): presumably outputs -- confirm); the MSST19 variant returns a short instead. */
+void computeReqLength_double(double realPrecision, short radExpo, int* reqLength, double* medianValue);
+short computeReqLength_double_MSST19(double realPrecision);
+/* optimize_intervals_double_{1D..4D}: take the data pointer, the dimension sizes and realPrecision; return unsigned int (NOTE(review): presumably the chosen number of quantization intervals -- confirm in sz_double.c). */
+unsigned int optimize_intervals_double_1D(double *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_double_2D(double *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_double_3D(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_double_4D(double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+/* "_opt" variants with the same parameter lists as the 1D-3D functions above (no 4D _opt variant is declared here). */
+unsigned int optimize_intervals_double_3D_opt(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_double_2D_opt(double *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_double_1D_opt(double *oriData, size_t dataLength, double realPrecision);
+
+size_t SZ_compress_double_3D_MDQ_RA_block(double * block_ori_data, double * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, double * P0, double * P1, int * type, double * unpredictable_data);
+/* MSST19 flavours of the _opt interval optimizers (same parameter lists). */
+unsigned int optimize_intervals_double_1D_opt_MSST19(double *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_double_2D_opt_MSST19(double *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_double_3D_opt_MSST19(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+TightDataPointStorageD* SZ_compress_double_1D_MDQ(double *oriData, 
+size_t dataLength, double realPrecision, double valueRangeSize, double medianValue_d);
+void SZ_compress_args_double_StoreOriData(double* oriData, size_t dataLength, unsigned char** newByteData, size_t *outSize);
+
+char SZ_compress_args_double_NoCkRngeNoGzip_1D(int cmprType, unsigned char** newByteData, double *oriData, size_t dataLength, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d);
+/* 2D pair: the _MDQ function returns a TightDataPointStorageD*; the NoCkRngeNoGzip function returns char and takes newByteData / outSize pointers. */
+TightDataPointStorageD* SZ_compress_double_2D_MDQ(double *oriData, size_t r1, size_t r2, double realPrecision, double valueRangeSize, double medianValue_d);
+char SZ_compress_args_double_NoCkRngeNoGzip_2D(int cmprType, unsigned char** newByteData, double *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d);
+/* 3D pair: same pattern with an extra r3. */
+TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, double valueRangeSize, double medianValue_d);
+char SZ_compress_args_double_NoCkRngeNoGzip_3D(int cmprType, unsigned char** newByteData, double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d);
+/* 4D pair. Note: the 4D NoCkRngeNoGzip function has no leading cmprType parameter, unlike the 1D-3D ones. */
+TightDataPointStorageD* SZ_compress_double_4D_MDQ(double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, double valueRangeSize, double medianValue_d);
+char SZ_compress_args_double_NoCkRngeNoGzip_4D(unsigned char** newByteData, double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d);
+/* MSST19 variants of the 1D-3D _MDQ functions. Note: the last parameter is spelled medianValue_f here although it is a double (medianValue_d in the functions above). */
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_MSST19(double *oriData, size_t dataLength, double realPrecision, double valueRangeSize, double medianValue_f);
+TightDataPointStorageD* SZ_compress_double_2D_MDQ_MSST19(double *oriData, size_t r1, size_t r2, double realPrecision, double valueRangeSize, double medianValue_f);
+TightDataPointStorageD* SZ_compress_double_3D_MDQ_MSST19(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, double valueRangeSize, double medianValue_f);
+
+void SZ_compress_args_double_withinRange(unsigned char** newByteData, double *oriData, size_t dataLength, size_t *outSize);
+/* The block below is a disabled declaration (SZ_compress_args_double_wRngeNoGzip); it is inside a comment and not part of the API. */
+/*int SZ_compress_args_double_wRngeNoGzip(unsigned char** newByteData, double *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwrErrRatio);*/
+/* Takes all five dimension sizes plus the error-bound settings; returns int (NOTE(review): the meaning of the result and of cmprType / withRegression is not visible here -- confirm in sz_double.c). */
+int SZ_compress_args_double(int cmprType, int withRegression, unsigned char** newByteData, double *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRatio);
+
+void SZ_compress_args_double_NoCkRnge_1D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r1, size_t s1, size_t e1);
+void SZ_compress_args_double_NoCkRnge_2D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r2, size_t r1, size_t s2, size_t s1, size_t e2, size_t e1);
+void SZ_compress_args_double_NoCkRnge_3D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r3, size_t r2, size_t r1, size_t s3, size_t s2, size_t s1, size_t e3, size_t e2, size_t e1);
+void SZ_compress_args_double_NoCkRnge_4D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r4, size_t r3, size_t r2, size_t r1, size_t s4, size_t s3, size_t s2, size_t s1, size_t e4, size_t e3, size_t e2, size_t e1);
+/* Caution, argument order: the functions below list dimensions lowest-first (r1, r2, ..., s1, s2, ..., e1, e2, ...), whereas the NoCkRnge_*_subblock functions above list them highest-first (r2, r1, s2, s1, e2, e1). */
+unsigned int optimize_intervals_double_1D_subblock(double *oriData, double realPrecision, size_t r1, size_t s1, size_t e1);
+unsigned int optimize_intervals_double_2D_subblock(double *oriData, double realPrecision, size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2);
+unsigned int optimize_intervals_double_3D_subblock(double *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3);
+unsigned int optimize_intervals_double_4D_subblock(double *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
+/* _MDQ subblock functions: return a TightDataPointStorageD* and use the same lowest-first argument order as optimize_intervals_double_*_subblock. */
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t s1, size_t e1);
+TightDataPointStorageD* SZ_compress_double_2D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2);
+TightDataPointStorageD* SZ_compress_double_3D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3);
+TightDataPointStorageD* SZ_compress_double_4D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
+/* Interval optimizers that additionally take dense_pos / max_freq / mean_freq pointers, followed by compressors that return a byte buffer and take a comp_size pointer. */
+unsigned int optimize_intervals_double_2D_with_freq_and_dense_pos(double *oriData, size_t r1, size_t r2, double realPrecision, double * dense_pos, double * max_freq, double * mean_freq);
+unsigned int optimize_intervals_double_3D_with_freq_and_dense_pos(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, double * dense_pos, double * max_freq, double * mean_freq);
+unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(double *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Double_H  ----- */
+
diff --git a/deps/SZ/sz/include/sz_double_pwr.h b/deps/SZ/sz/include/sz_double_pwr.h
new file mode 100644
index 0000000000000000000000000000000000000000..421895adbb467b171f4315338e6addd8540a7c24
--- /dev/null
+++ b/deps/SZ/sz/include/sz_double_pwr.h
@@ -0,0 +1,57 @@
+/**
+ *  @file sz_double_pwr.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for sz_double_pwr.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Double_PWR_H
+#define _SZ_Double_PWR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdbool.h>
+
+void compute_segment_precisions_double_1D(double *oriData, size_t dataLength, double* pwrErrBound, unsigned char* pwrErrBoundBytes, double globalPrecision);
+unsigned int optimize_intervals_double_1D_pwr(double *oriData, size_t dataLength, double* pwrErrBound); 
+void compute_segment_precisions_double_2D(double *oriData, double* pwrErrBound, 
+size_t r1, size_t r2, size_t R2, size_t edgeSize, unsigned char* pwrErrBoundBytes, double Min, double Max, double globalPrecision);
+unsigned int optimize_intervals_double_2D_pwr(double *oriData, size_t r1, size_t r2, size_t R2, size_t edgeSize, double* pwrErrBound);
+void compute_segment_precisions_double_3D(double *oriData, double* pwrErrBound, 
+size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, unsigned char* pwrErrBoundBytes, double Min, double Max, double globalPrecision);
+unsigned int optimize_intervals_double_3D_pwr(double *oriData, size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, double* pwrErrBound);
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, size_t dataLength, size_t *outSize, double min, double max);
+void SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, size_t r1, size_t r2,
+size_t *outSize, double min, double max);
+void SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, 
+size_t r1, size_t r2, size_t r3, size_t *outSize, double min, double max);
+/* pwrGroup helpers. Note: the parameter spelled medianValue_f below is a double despite the _f suffix. */
+void createRangeGroups_double(double** posGroups, double** negGroups, int** posFlags, int** negFlags);
+void compressGroupIDArray_double(char* groupID, TightDataPointStorageD* tdps);
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_pwrGroup(double* oriData, size_t dataLength, int errBoundMode, 
+double absErrBound, double relBoundRatio, double pwrErrRatio, double valueRangeSize, double medianValue_f);
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(unsigned char** newByteData, double *oriData,
+size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio, double valueRangeSize, double medianValue_f, size_t *outSize);
+/* pre_log variants: same parameter lists as the corresponding *_pwr functions above. */
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_pre_log(unsigned char** newByteData, double *oriData, double globalPrecision, size_t dataLength, size_t *outSize, double min, double max);
+void SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr_pre_log(unsigned char** newByteData, double *oriData, double globalPrecision, size_t r1, size_t r2, size_t *outSize, double min, double max);
+void SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log(unsigned char** newByteData, double *oriData, double globalPrecision, size_t r1, size_t r2, size_t r3, size_t *outSize, double min, double max);
+/* MSST19 pre_log variants. Note: only the 1D function takes medianValue_f; the 2D and 3D ones go straight from valueRangeSize to signs. */
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_pre_log_MSST19(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, double valueRangeSize, double medianValue_f,
+																unsigned char* signs, bool* positive, double min, double max, double nearZero);
+void SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr_pre_log_MSST19(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t *outSize, double valueRangeSize,
+																unsigned char* signs, bool* positive, double min, double max, double nearZero);
+void SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log_MSST19(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t r3, size_t *outSize, double valueRangeSize, 
+																unsigned char* signs, bool* positive, double min, double max, double nearZero);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Double_PWR_H  ----- */
+
diff --git a/deps/SZ/sz/include/sz_double_ts.h b/deps/SZ/sz/include/sz_double_ts.h
new file mode 100644
index 0000000000000000000000000000000000000000..581d20ddf58ba77f61b70bd1042a352f482919ef
--- /dev/null
+++ b/deps/SZ/sz/include/sz_double_ts.h
@@ -0,0 +1,27 @@
+/**
+ *  @file sz_double_ts.h
+ *  @author Sheng Di
+ *  @date May, 2018
+ *  @brief Header file for the sz_double_ts.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "TightDataPointStorageD.h"
+
+#ifndef _SZ_Double_TS_H
+#define _SZ_Double_TS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+unsigned int optimize_intervals_double_1D_ts(double *oriData, size_t dataLength, double* preData, double realPrecision); /* "ts" interval optimizer: takes a preData pointer alongside oriData (NOTE(review): presumably the previous time step's data -- confirm) */
+/* "ts" 1D compressor: takes a sz_multisteps* in addition to the realPrecision / valueRangeSize / medianValue_d arguments. NOTE(review): sz_multisteps is not declared in this header; it must come from an earlier include -- confirm. */
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_ts(double *oriData, size_t dataLength, sz_multisteps* multisteps,
+double realPrecision, double valueRangeSize, double medianValue_d);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Double_TS_H  ----- */
+
diff --git a/deps/SZ/sz/include/sz_float.h b/deps/SZ/sz/include/sz_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..9557ee0593ff4b84a9272075828954d8ff9890a4
--- /dev/null
+++ b/deps/SZ/sz/include/sz_float.h
@@ -0,0 +1,153 @@
+/**
+ *  @file sz_float.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_float.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "DynamicFloatArray.h"
+
+#ifndef _SZ_Float_H
+#define _SZ_Float_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+unsigned char* SZ_skip_compress_float(float* data, size_t dataLength, size_t* outSize);
+
+void computeReqLength_float(double realPrecision, short radExpo, int* reqLength, float* medianValue);
+short computeReqLength_float_MSST19(double realPrecision);
+
+unsigned int optimize_intervals_float_1D(float *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_float_2D(float *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_float_3D(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_float_4D(float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+
+unsigned int optimize_intervals_and_compute_dense_position_float_1D(float *oriData, size_t dataLength, double realPrecision, float * dense_pos);
+unsigned int optimize_intervals_and_compute_dense_position_float_3D(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float * dense_pos);
+unsigned int optimize_intervals_float_3D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq);
+unsigned int optimize_intervals_float_3D_opt(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_float_2D_opt(float *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_float_1D_opt(float *oriData, size_t dataLength, double realPrecision);
+
+unsigned int optimize_intervals_float_1D_opt_MSST19(float *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_float_2D_opt_MSST19(float *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_float_3D_opt_MSST19(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+
+TightDataPointStorageF* SZ_compress_float_1D_MDQ(float *oriData, 
+size_t dataLength, float realPrecision, float valueRangeSize, float medianValue_f);
+
+void SZ_compress_args_float_StoreOriData(float* oriData, size_t dataLength, unsigned char** newByteData, size_t *outSize);
+
+char SZ_compress_args_float_NoCkRngeNoGzip_1D(int cmprType, unsigned char** newByteData, float *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f);
+
+TightDataPointStorageF* SZ_compress_float_2D_MDQ(float *oriData, size_t r1, size_t r2, float realPrecision, float valueRangeSize, float medianValue_f);
+
+char SZ_compress_args_float_NoCkRngeNoGzip_2D(int cmprType, unsigned char** newByteData, float *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f);
+
+TightDataPointStorageF* SZ_compress_float_3D_MDQ(float *oriData, size_t r1, size_t r2, size_t r3, float realPrecision, float valueRangeSize, float medianValue_f);
+
+char SZ_compress_args_float_NoCkRngeNoGzip_3D(int cmprType, unsigned char** newByteData, float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f);
+
+size_t SZ_compress_float_1D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t block_dim_0, double realPrecision, int * type, float * unpredictable_data);
+size_t SZ_compress_float_2D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t block_dim_0, size_t block_dim_1, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+
+size_t SZ_compress_float_1D_MDQ_RA_block_1D_pred(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t block_dim_0, double realPrecision, int * type, DynamicFloatArray * unpredictable_data);
+size_t SZ_compress_float_2D_MDQ_RA_block_2D_pred(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t dim_1, size_t block_dim_0, size_t block_dim_1, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, float realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block_3D_pred(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block_adaptive(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+//unsigned short SZ_compress_float_3D_MDQ_RA_block_1D_pred(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t dim_1, size_t dim_2, int block_dim_0, int block_dim_1, int block_dim_2, double realPrecision, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block_3D_pred_flush_after_compare(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block_2_layers(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, float * P_, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_pred_by_regression(float * block_ori_data, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * reg_params, int * type, float * unpredictable_data);
+void SZ_blocked_regression(float * block_ori_data, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, float *params);
+unsigned char * SZ_compress_float_3D_MDQ_RA_all_by_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+float SZ_compress_float_3D_MDQ_RA_block_no_mean(float * block_ori_data, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, unsigned short * unpred_count, float * unpredictable_data);
+float SZ_compress_float_3D_MDQ_pred_by_regression_with_err(float * block_ori_data, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * reg_params, int * type, unsigned short * unpred_count, float * unpredictable_data);
+unsigned char * SZ_compress_float_3D_MDQ_RA_blocked_with_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+void decompressDataSeries_float_3D_RA_blocked_with_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+
+unsigned char * SZ_compress_float_1D_MDQ_RA(float *oriData, size_t r1, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_2D_MDQ_RA(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_2D_MDQ_nonblocked(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_RA(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_ori(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_multi_means(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_RA_multi_means(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_adaptive(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+
+unsigned char * SZ_compress_float_2D_MDQ_decompression_random_access_with_blocked_regression(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_1D_MDQ_decompression_random_access_with_blocked_regression(float *oriData, size_t r1, double realPrecision, size_t * comp_size);
+
+TightDataPointStorageF* SZ_compress_float_4D_MDQ(float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, float valueRangeSize, float medianValue_f);
+
+char SZ_compress_args_float_NoCkRngeNoGzip_4D(unsigned char** newByteData, float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f);
+
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_MSST19(float *oriData, 
+size_t dataLength, double realPrecision, float valueRangeSize, float medianValue_f);
+TightDataPointStorageF* SZ_compress_float_2D_MDQ_MSST19(float *oriData, size_t r1, size_t r2, double realPrecision, float valueRangeSize, float medianValue_f);
+TightDataPointStorageF* SZ_compress_float_3D_MDQ_MSST19(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float valueRangeSize, float medianValue_f);
+
+void SZ_compress_args_float_withinRange(unsigned char** newByteData, float *oriData, size_t dataLength, size_t *outSize);
+
+/*int SZ_compress_args_float_wRngeNoGzip(unsigned char** newByteData, float *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwrErrRatio);*/
+
+int SZ_compress_args_float(int cmprType, int withRegression, unsigned char** newByteData, float *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRatio);
+
+int SZ_compress_args_float_subblock(unsigned char* compressedBytes, float *oriData,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1,
+size_t *outSize, int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+void SZ_compress_args_float_NoCkRnge_1D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r1, size_t s1, size_t e1); 
+
+void SZ_compress_args_float_NoCkRnge_2D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r2, size_t r1, size_t s2, size_t s1, size_t e2, size_t e1); 
+
+void SZ_compress_args_float_NoCkRnge_3D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r3, size_t r2, size_t r1, size_t s3, size_t s2, size_t s1, size_t e3, size_t e2, size_t e1); 
+
+void SZ_compress_args_float_NoCkRnge_4D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r4, size_t r3, size_t r2, size_t r1, size_t s4, size_t s3, size_t s2, size_t s1, size_t e4, size_t e3, size_t e2, size_t e1);
+
+unsigned int optimize_intervals_float_1D_subblock(float *oriData, double realPrecision, size_t r1, size_t s1, size_t e1); 
+unsigned int optimize_intervals_float_2D_subblock(float *oriData, double realPrecision, size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2); 
+unsigned int optimize_intervals_float_3D_subblock(float *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3); 
+unsigned int optimize_intervals_float_4D_subblock(float *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
+
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t s1, size_t e1); 
+
+TightDataPointStorageF* SZ_compress_float_2D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2); 
+
+TightDataPointStorageF* SZ_compress_float_3D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3); 
+
+TightDataPointStorageF* SZ_compress_float_4D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
+
+
+unsigned int optimize_intervals_float_2D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq);
+/* The 3D variant, optimize_intervals_float_3D_with_freq_and_dense_pos(), is already declared earlier in this header; the duplicate prototype is omitted here. */
+
+unsigned char * SZ_compress_float_2D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, float realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, float realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_random_access_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_decompression_random_access_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Float_H  ----- */
+
diff --git a/deps/SZ/sz/include/sz_float_pwr.h b/deps/SZ/sz/include/sz_float_pwr.h
new file mode 100644
index 0000000000000000000000000000000000000000..7bbda8dd479504c6603e427e410c0a07d844a53c
--- /dev/null
+++ b/deps/SZ/sz/include/sz_float_pwr.h
@@ -0,0 +1,66 @@
+/**
+ *  @file sz_float_pwr.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_float_pwr.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Float_PWR_H
+#define _SZ_Float_PWR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdbool.h>
+
+void compute_segment_precisions_float_1D(float *oriData, size_t dataLength, float* pwrErrBound, unsigned char* pwrErrBoundBytes, double globalPrecision);
+unsigned int optimize_intervals_float_1D_pwr(float *oriData, size_t dataLength, float* pwrErrBound);
+
+void compute_segment_precisions_float_2D(float *oriData, float* pwrErrBound, 
+size_t r1, size_t r2, size_t R2, size_t edgeSize, unsigned char* pwrErrBoundBytes, float Min, float Max, double globalPrecision);
+
+unsigned int optimize_intervals_float_2D_pwr(float *oriData, size_t r1, size_t r2, size_t R2, size_t edgeSize, float* pwrErrBound); 
+
+void compute_segment_precisions_float_3D(float *oriData, float* pwrErrBound, 
+size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, unsigned char* pwrErrBoundBytes, float Min, float Max, double globalPrecision);
+
+unsigned int optimize_intervals_float_3D_pwr(float *oriData, size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, float* pwrErrBound);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, size_t dataLength, size_t *outSize, float min, float max);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, size_t r1, size_t r2, 
+size_t *outSize, float min, float max);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, size_t r1, size_t r2, 
+size_t r3, size_t *outSize, float min, float max);
+
+void createRangeGroups_float(float** posGroups, float** negGroups, int** posFlags, int** negFlags);
+void compressGroupIDArray_float(char* groupID, TightDataPointStorageF* tdps);
+int* generateGroupLowerBounds(void); /* explicit (void): in C an empty () leaves the parameters unspecified */
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_pwrGroup(float* oriData, size_t dataLength, int errBoundMode, 
+double absErrBound, double relBoundRatio, double pwrErrRatio, float valueRangeSize, float medianValue_f);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(unsigned char** newByteData, float *oriData,
+size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio, float valueRangeSize, float medianValue_f, size_t *outSize);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, float min, float max);
+void SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t *outSize, float min, float max);
+void SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t r3, size_t *outSize, float min, float max);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log_MSST19(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, float valueRangeSize, float medianValue_f,
+																unsigned char* signs, bool* positive, float min, float max, float nearZero);
+void SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log_MSST19(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t *outSize, float valueRangeSize,
+																unsigned char* signs, bool* positive, float min, float max, float nearZero);																
+void SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log_MSST19(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t r3, size_t *outSize, float valueRangeSize, 
+																unsigned char* signs, bool* positive, float min, float max, float nearZero);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Float_PWR_H  ----- */
+
diff --git a/deps/SZ/sz/include/sz_float_ts.h b/deps/SZ/sz/include/sz_float_ts.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f2301da99ebc426c695d7196df44f320f2dd87e
--- /dev/null
+++ b/deps/SZ/sz/include/sz_float_ts.h
@@ -0,0 +1,27 @@
+/**
+ *  @file sz_float_ts.h
+ *  @author Sheng Di
+ *  @date May, 2018
+ *  @brief Header file for sz_float_ts.c: declares optimize_intervals_float_1D_ts() and SZ_compress_float_1D_MDQ_ts().
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "TightDataPointStorageF.h" /* presumably declares TightDataPointStorageF -- confirm */
+
+#ifndef _SZ_Float_TS_H
+#define _SZ_Float_TS_H
+
+#ifdef __cplusplus
+extern "C" { /* C linkage for the declarations below when compiled as C++ */
+#endif
+unsigned int optimize_intervals_float_1D_ts(float *oriData, size_t dataLength, float* preData, double realPrecision);
+/* NOTE(review): size_t and sz_multisteps are not declared directly in this header; presumably they come from TightDataPointStorageF.h or the includer -- confirm. */
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_ts(float *oriData, size_t dataLength, sz_multisteps* multisteps,
+double realPrecision, float valueRangeSize, float medianValue_f);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* ----- #ifndef _SZ_Float_TS_H  ----- */
+
diff --git a/deps/SZ/sz/include/sz_int16.h b/deps/SZ/sz/include/sz_int16.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ad62c5b9038b621d940e6cf3926a206a648d5e3
--- /dev/null
+++ b/deps/SZ/sz/include/sz_int16.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_int16.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_int16.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Int16_H
+#define _SZ_Int16_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdint.h> /* int16_t and int64_t appear in the prototypes below; <stdio.h> does not guarantee them */
+unsigned int optimize_intervals_int16_1D(int16_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_int16_2D(int16_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_int16_3D(int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_int16_4D(int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_int16_1D_MDQ(int16_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int16_StoreOriData(int16_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_int16_NoCkRngeNoGzip_1D(unsigned char** newByteData, int16_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int16_t minValue);
+TightDataPointStorageI* SZ_compress_int16_2D_MDQ(int16_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int16_3D_MDQ(int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int16_NoCkRngeNoGzip_3D(unsigned char** newByteData, int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int16_4D_MDQ(int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int16_NoCkRngeNoGzip_4D(unsigned char** newByteData, int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int16_withinRange(unsigned char** newByteData, int16_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_int16_wRngeNoGzip(unsigned char** newByteData, int16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_int16(unsigned char** newByteData, int16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Int16_H  ----- */
+
diff --git a/deps/SZ/sz/include/sz_int32.h b/deps/SZ/sz/include/sz_int32.h
new file mode 100644
index 0000000000000000000000000000000000000000..a87825d0fcfcaa9757581d8ff1f05ea0161ddc7e
--- /dev/null
+++ b/deps/SZ/sz/include/sz_int32.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_int32.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_int32.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Int32_H
+#define _SZ_Int32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdint.h> /* int32_t and int64_t appear in the prototypes below; <stdio.h> does not guarantee them */
+unsigned int optimize_intervals_int32_1D(int32_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_int32_2D(int32_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_int32_3D(int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_int32_4D(int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_int32_1D_MDQ(int32_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int32_StoreOriData(int32_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_int32_NoCkRngeNoGzip_1D(unsigned char** newByteData, int32_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int32_t minValue);
+TightDataPointStorageI* SZ_compress_int32_2D_MDQ(int32_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int32_3D_MDQ(int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int32_NoCkRngeNoGzip_3D(unsigned char** newByteData, int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int32_4D_MDQ(int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int32_NoCkRngeNoGzip_4D(unsigned char** newByteData, int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int32_withinRange(unsigned char** newByteData, int32_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_int32_wRngeNoGzip(unsigned char** newByteData, int32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_int32(unsigned char** newByteData, int32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Int32_H  ----- */
+
diff --git a/deps/SZ/sz/include/sz_int64.h b/deps/SZ/sz/include/sz_int64.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7213b2315b551385c5e0c22d3dbd16a07291746
--- /dev/null
+++ b/deps/SZ/sz/include/sz_int64.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_int64.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_int64.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Int64_H
+#define _SZ_Int64_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdint.h> /* int64_t appears in the prototypes below; <stdio.h> does not guarantee it */
+unsigned int optimize_intervals_int64_1D(int64_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_int64_2D(int64_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_int64_3D(int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_int64_4D(int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_int64_1D_MDQ(int64_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int64_StoreOriData(int64_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_int64_NoCkRngeNoGzip_1D(unsigned char** newByteData, int64_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int64_2D_MDQ(int64_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int64_3D_MDQ(int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int64_NoCkRngeNoGzip_3D(unsigned char** newByteData, int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int64_4D_MDQ(int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int64_NoCkRngeNoGzip_4D(unsigned char** newByteData, int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int64_withinRange(unsigned char** newByteData, int64_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_int64_wRngeNoGzip(unsigned char** newByteData, int64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_int64(unsigned char** newByteData, int64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Int64_H  ----- */
+
diff --git a/deps/SZ/sz/include/sz_int8.h b/deps/SZ/sz/include/sz_int8.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ce758a79524ffc25e3803f4e03d31119a4f47d
--- /dev/null
+++ b/deps/SZ/sz/include/sz_int8.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_int8.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_int8.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Int8_H
+#define _SZ_Int8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdint.h> /* int8_t and int64_t appear in the prototypes below; <stdio.h> does not guarantee them */
+unsigned int optimize_intervals_int8_1D(int8_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_int8_2D(int8_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_int8_3D(int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_int8_4D(int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_int8_1D_MDQ(int8_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int8_StoreOriData(int8_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_int8_NoCkRngeNoGzip_1D(unsigned char** newByteData, int8_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int8_t minValue);
+TightDataPointStorageI* SZ_compress_int8_2D_MDQ(int8_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int8_3D_MDQ(int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int8_NoCkRngeNoGzip_3D(unsigned char** newByteData, int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int8_4D_MDQ(int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int8_NoCkRngeNoGzip_4D(unsigned char** newByteData, int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int8_withinRange(unsigned char** newByteData, int8_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_int8_wRngeNoGzip(unsigned char** newByteData, int8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_int8(unsigned char** newByteData, int8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Int8_H  ----- */
+
diff --git a/deps/SZ/sz/include/sz_omp.h b/deps/SZ/sz/include/sz_omp.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb83acbe9a897967054382fdd8698e5613289913
--- /dev/null
+++ b/deps/SZ/sz/include/sz_omp.h
@@ -0,0 +1,47 @@
+/**
+ *  @file sz_omp.h
+ *  @author Xin Liang
+ *  @date July, 2017
+ *  @brief Header file for the sz_omp.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef _OPENMP
+#include "omp.h"
+#endif
+#include "sz.h"
+
+#ifndef _SZ_OMP_H
+#define _SZ_OMP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned char * SZ_compress_float_1D_MDQ_openmp(float *oriData, size_t r1, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_2D_MDQ_openmp(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_openmp(float *oriData, size_t r1, size_t r2, size_t r3, float realPrecision, size_t * comp_size); /* NOTE(review): realPrecision is float here but double in the 1D/2D variants - confirm against sz_omp.c */
+
+void decompressDataSeries_float_1D_openmp(float** data, size_t r1, unsigned char* comp_data);
+void decompressDataSeries_float_3D_openmp(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+void decompressDataSeries_float_2D_openmp(float** data, size_t r1, size_t r2, unsigned char* comp_data);
+
+unsigned char * SZ_compress_double_1D_MDQ_openmp(double *oriData, size_t r1, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_double_2D_MDQ_openmp(double *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_double_3D_MDQ_openmp(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+
+void decompressDataSeries_double_1D_openmp(double** data, size_t r1, unsigned char* comp_data);
+void decompressDataSeries_double_2D_openmp(double** data, size_t r1, size_t r2, unsigned char* comp_data);
+void decompressDataSeries_double_3D_openmp(double** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+
+//void Huffman_init_openmp(HuffmanTree* huffmanTree, int *s, size_t length, int thread_num);
+void Huffman_init_openmp(HuffmanTree* huffmanTree, int *s, size_t length, int thread_num, size_t * freq);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_OMP_H  ----- */
diff --git a/deps/SZ/sz/include/sz_opencl.h b/deps/SZ/sz/include/sz_opencl.h
new file mode 100644
index 0000000000000000000000000000000000000000..693256161c0600c97341033d7c28979eb5c90b9d
--- /dev/null
+++ b/deps/SZ/sz/include/sz_opencl.h
@@ -0,0 +1,68 @@
+//make header C++/C inter-operable
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef SZ_OPENCL_H
+#define SZ_OPENCL_H
+
+#include<stddef.h>
+
+	//opaque pointer for opencl state
+  struct sz_opencl_state;
+
+  /**
+   * creates an opencl state for multiple uses of the compressor or
+   * returns an error code.
+   *
+   * \post if return code is SZ_NCES, the state object may only be passed to
+   * sz_opencl_release or sz_opencl_error_* otherwise it may be used in any
+   * sz_opencl_* function.
+   *
+   * \param[out] state the sz opencl state
+   * \return SZ_SCES for success or SZ_NCES on error
+   */
+  int sz_opencl_init(struct sz_opencl_state** state);
+
+	/**
+	 * deinitializes an opencl state
+	 *
+	 * \param[in] state the sz opencl state
+	 * \return SZ_SCES
+	 */
+  int sz_opencl_release(struct sz_opencl_state** state);
+
+	/**
+	 * returns a human readable error message for the last error received by state
+	 *
+	 * \param[in] state the sz opencl state
+	 * \return a pointer to a string that describes the error
+	 */
+	const char* sz_opencl_error_msg(struct sz_opencl_state* state);
+
+
+	/**
+	 * returns a numeric code for the last error received by state
+	 *
+	 * \param[in] state the sz opencl state
+	 * \return the numeric error code
+	 */
+  int sz_opencl_error_code(struct sz_opencl_state* state);
+
+	/**
+	 * confirms that the sz opencl state is ready to use by performing a vector addition
+	 *
+	 * \param[in] state the sz opencl state
+	 * \return SZ_SCES if the opencl implementation is functioning
+	 */
+	int sz_opencl_check(struct sz_opencl_state* state);
+
+  unsigned char* sz_compress_float3d_opencl(float* data, size_t r1, size_t r2, size_t r3, double, size_t* out_size);
+
+
+#endif /* SZ_OPENCL_H */
+
+//make header C++/C inter-operable
+#ifdef __cplusplus
+}
+#endif
diff --git a/deps/SZ/sz/include/sz_stats.h b/deps/SZ/sz/include/sz_stats.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba0f701ae918a9ac4d0ceb4b7b34f8054c0fea7f
--- /dev/null
+++ b/deps/SZ/sz/include/sz_stats.h
@@ -0,0 +1,58 @@
+/**
+ *  @file sz_stats.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file declaring the sz_stats structure and its write*/printSZStats functions.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _STATS_H
+#define _STATS_H
+
+#include <stdint.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct sz_stats // statistics record; a single global instance, sz_stat, is declared right after it
+{
+	int use_mean; // NOTE(review): looks like a boolean flag - confirm in the code that sets it
+	
+	size_t blockSize;
+	// Lorenzo / regression block counts and percentages
+	float lorenzoPercent;
+	float regressionPercent;
+	size_t lorenzoBlocks;
+	size_t regressionBlocks;
+	size_t totalBlocks;
+	// Huffman figures
+	//size_t huffmanTreeHeight;
+	size_t huffmanTreeSize; //before the final zstd
+	size_t huffmanCodingSize; //before the final zstd
+	float huffmanCompressionRatio;
+	int huffmanNodeCount;
+	// unpredictable-data figures
+	size_t unpredictCount;
+	float unpredictPercent;
+	
+	float zstdCompressionRatio; //not available yet
+	
+} sz_stats;
+
+extern sz_stats sz_stat;
+
+/* Parameter names mirror the sz_stats fields of the same name; the definitions are not part of this header. */
+void writeBlockInfo(int use_mean, size_t blockSize, size_t regressionBlocks, size_t totalBlocks);
+void writeHuffmanInfo(size_t huffmanTreeSize, size_t huffmanCodingSize, size_t totalDataSize, int huffmanNodeCount);
+void writeZstdCompressionRatio(float zstdCompressionRatio);
+void writeUnpredictDataCounts(size_t unpredictCount, size_t totalNumElements);
+void printSZStats(void); /* (void) makes this a real prototype; in C an empty () leaves the parameters unspecified */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _STATS_H  ----- */
diff --git a/deps/SZ/sz/include/sz_uint16.h b/deps/SZ/sz/include/sz_uint16.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb2319772faa95eb2bef6d9db0130a3c52472229
--- /dev/null
+++ b/deps/SZ/sz/include/sz_uint16.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_uint16.h
+ *  @author Sheng Di
+ *  @date Nov, 2017
+ *  @brief Header file for the sz_uint16.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_UInt16_H
+#define _SZ_UInt16_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_uint16_1D(uint16_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_uint16_2D(uint16_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_uint16_3D(uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_uint16_4D(uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_uint16_1D_MDQ(uint16_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint16_StoreOriData(uint16_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_uint16_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint16_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint16_t minValue);
+TightDataPointStorageI* SZ_compress_uint16_2D_MDQ(uint16_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint16_3D_MDQ(uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint16_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint16_4D_MDQ(uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint16_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint16_withinRange(unsigned char** newByteData, uint16_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_uint16_wRngeNoGzip(unsigned char** newByteData, uint16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_uint16(unsigned char** newByteData, uint16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_UInt16_H  ----- */
+
diff --git a/deps/SZ/sz/include/sz_uint32.h b/deps/SZ/sz/include/sz_uint32.h
new file mode 100644
index 0000000000000000000000000000000000000000..8adb31d3fc19446fa8b71dcfb6cdc2b2ea8c9556
--- /dev/null
+++ b/deps/SZ/sz/include/sz_uint32.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_uint32.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_uint32.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_UInt32_H
+#define _SZ_UInt32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_uint32_1D(uint32_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_uint32_2D(uint32_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_uint32_3D(uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_uint32_4D(uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_uint32_1D_MDQ(uint32_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint32_StoreOriData(uint32_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_uint32_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint32_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint32_t minValue);
+TightDataPointStorageI* SZ_compress_uint32_2D_MDQ(uint32_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint32_3D_MDQ(uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint32_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint32_4D_MDQ(uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint32_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint32_withinRange(unsigned char** newByteData, uint32_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_uint32_wRngeNoGzip(unsigned char** newByteData, uint32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_uint32(unsigned char** newByteData, uint32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_UInt32_H  ----- */
+
diff --git a/deps/SZ/sz/include/sz_uint64.h b/deps/SZ/sz/include/sz_uint64.h
new file mode 100644
index 0000000000000000000000000000000000000000..7717aa2d5fa82d9f2415fb50af62d936b9d10bfb
--- /dev/null
+++ b/deps/SZ/sz/include/sz_uint64.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_uint64.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_uint64.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_UInt64_H
+#define _SZ_UInt64_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_uint64_1D(uint64_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_uint64_2D(uint64_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_uint64_3D(uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_uint64_4D(uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_uint64_1D_MDQ(uint64_t *oriData, size_t dataLength, double realPrecision, uint64_t valueRangeSize, uint64_t minValue);
+void SZ_compress_args_uint64_StoreOriData(uint64_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_uint64_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint64_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, uint64_t valueRangeSize, uint64_t minValue);
+TightDataPointStorageI* SZ_compress_uint64_2D_MDQ(uint64_t *oriData, size_t r1, size_t r2, double realPrecision, uint64_t valueRangeSize, uint64_t minValue);
+TightDataPointStorageI* SZ_compress_uint64_3D_MDQ(uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, uint64_t valueRangeSize, uint64_t minValue);
+void SZ_compress_args_uint64_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, uint64_t valueRangeSize, uint64_t minValue);
+TightDataPointStorageI* SZ_compress_uint64_4D_MDQ(uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, uint64_t valueRangeSize, uint64_t minValue);
+void SZ_compress_args_uint64_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, uint64_t valueRangeSize, uint64_t minValue);
+void SZ_compress_args_uint64_withinRange(unsigned char** newByteData, uint64_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_uint64_wRngeNoGzip(unsigned char** newByteData, uint64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_uint64(unsigned char** newByteData, uint64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_UInt64_H  ----- */
+
diff --git a/deps/SZ/sz/include/sz_uint8.h b/deps/SZ/sz/include/sz_uint8.h
new file mode 100644
index 0000000000000000000000000000000000000000..9de3a117b557715fed450978e4b54b36f094e239
--- /dev/null
+++ b/deps/SZ/sz/include/sz_uint8.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_uint8.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_uint8.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_UInt8_H
+#define _SZ_UInt8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_uint8_1D(uint8_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_uint8_2D(uint8_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_uint8_3D(uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_uint8_4D(uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_uint8_1D_MDQ(uint8_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint8_StoreOriData(uint8_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_uint8_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint8_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint8_t minValue);
+TightDataPointStorageI* SZ_compress_uint8_2D_MDQ(uint8_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint8_3D_MDQ(uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint8_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint8_4D_MDQ(uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint8_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint8_withinRange(unsigned char** newByteData, uint8_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_uint8_wRngeNoGzip(unsigned char** newByteData, uint8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_uint8(unsigned char** newByteData, uint8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_UInt8_H  ----- */
+
diff --git a/deps/SZ/sz/include/szd_double.h b/deps/SZ/sz/include/szd_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fcf48bc6f957b8b40a98774c35f12276379d2d6
--- /dev/null
+++ b/deps/SZ/sz/include/szd_double.h
@@ -0,0 +1,43 @@
+/**
+ *  @file szd_double.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_double.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Double_H
+#define _SZD_Double_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageD.h"
+
+void decompressDataSeries_double_1D(double** data, size_t dataSeriesLength, double* hist_data, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_2D(double** data, size_t r1, size_t r2, double* hist_data, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_3D(double** data, size_t r1, size_t r2, size_t r3, double* hist_data, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_4D(double** data, size_t r1, size_t r2, size_t r3, size_t r4, double* hist_data, TightDataPointStorageD* tdps);
+
+void decompressDataSeries_double_1D_MSST19(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_2D_MSST19(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_3D_MSST19(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps);
+
+void getSnapshotData_double_1D(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps, int errBoundMode, int compressionType, double* hist_data);
+void getSnapshotData_double_2D(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps, int errBoundMode, int compressionType, double* hist_data);
+void getSnapshotData_double_3D(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps, int errBoundMode, int compressionType, double* hist_data);
+void getSnapshotData_double_4D(double** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageD* tdps, int errBoundMode, int compressionType, double* hist_data);
+void decompressDataSeries_double_2D_nonblocked_with_blocked_regression(double** data, size_t r1, size_t r2, unsigned char* comp_data, double* hist_data);
+void decompressDataSeries_double_3D_nonblocked_with_blocked_regression(double** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data, double* hist_data);
+
+size_t decompressDataSeries_double_3D_RA_block(double * data, double mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, int * type, double * unpredictable_data);
+
+int SZ_decompress_args_double(double** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize, int compressionType, double* hist_data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Double_H  ----- */
diff --git a/deps/SZ/sz/include/szd_double_pwr.h b/deps/SZ/sz/include/szd_double_pwr.h
new file mode 100644
index 0000000000000000000000000000000000000000..e3bffb5a2d0cf00013518839459c2bc9f544a746
--- /dev/null
+++ b/deps/SZ/sz/include/szd_double_pwr.h
@@ -0,0 +1,36 @@
+/**
+ *  @file szd_double_pwr.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_double_pwr.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Double_PWR_H
+#define _SZD_Double_PWR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void decompressDataSeries_double_1D_pwr(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+double* extractRealPrecision_2D_double(size_t R1, size_t R2, int blockSize, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_2D_pwr(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps);
+double* extractRealPrecision_3D_double(size_t R1, size_t R2, size_t R3, int blockSize, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_3D_pwr(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps);
+
+void decompressDataSeries_double_1D_pwrgroup(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_1D_pwr_pre_log(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_2D_pwr_pre_log(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_3D_pwr_pre_log(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps);
+
+void decompressDataSeries_double_1D_pwr_pre_log_MSST19(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_2D_pwr_pre_log_MSST19(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_3D_pwr_pre_log_MSST19(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Double_PWR_H  ----- */
diff --git a/deps/SZ/sz/include/szd_double_ts.h b/deps/SZ/sz/include/szd_double_ts.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f7a768aef85ede04de1cd3b3ae2a0061b3654b9
--- /dev/null
+++ b/deps/SZ/sz/include/szd_double_ts.h
@@ -0,0 +1,25 @@
+/**
+ *  @file szd_double_ts.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_double_ts.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Double_TS_H
+#define _SZD_Double_TS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageD.h"
+
+void decompressDataSeries_double_1D_ts(double** data, size_t dataSeriesLength, double* hist_data, TightDataPointStorageD* tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Double_TS_H  ----- */
diff --git a/deps/SZ/sz/include/szd_float.h b/deps/SZ/sz/include/szd_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..d11bba4478c6264b39008845d4a7abd994f40e66
--- /dev/null
+++ b/deps/SZ/sz/include/szd_float.h
@@ -0,0 +1,58 @@
+/**
+ *  @file szd_float.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_float.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Float_H
+#define _SZD_Float_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageF.h"
+
+void decompressDataSeries_float_1D(float** data, size_t dataSeriesLength, float* hist_data, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_2D(float** data, size_t r1, size_t r2, float* hist_data, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_3D(float** data, size_t r1, size_t r2, size_t r3, float* hist_data, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_4D(float** data, size_t r1, size_t r2, size_t r3, size_t r4, float* hist_data, TightDataPointStorageF* tdps);
+
+void decompressDataSeries_float_1D_MSST19(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_2D_MSST19(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_3D_MSST19(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps);
+
+void getSnapshotData_float_1D(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps, int errBoundMode, int compressionType, float* hist_data);
+void getSnapshotData_float_2D(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps, int errBoundMode, int compressionType, float* hist_data);
+void getSnapshotData_float_3D(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps, int errBoundMode, int compressionType, float* hist_data);
+void getSnapshotData_float_4D(float** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageF* tdps, int errBoundMode, int compressionType, float* hist_data);
+
+size_t decompressDataSeries_float_1D_RA_block(float * data, float mean, size_t dim_0, size_t block_dim_0, double realPrecision, int * type, float * unpredictable_data);
+size_t decompressDataSeries_float_2D_RA_block(float * data, float mean, size_t dim_0, size_t dim_1, size_t block_dim_0, size_t block_dim_1, double realPrecision, int * type, float * unpredictable_data);
+
+int SZ_decompress_args_float(float** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize, int compressionType, float* hist_data);
+
+size_t decompressDataSeries_float_3D_RA_block(float * data, float mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, int * type, float * unpredictable_data);
+
+void decompressDataSeries_float_1D_decompression_given_areas_with_blocked_regression(float** data, size_t r1, size_t s1, size_t e1, unsigned char* comp_data);
+
+void decompressDataSeries_float_2D_nonblocked_with_blocked_regression(float** data, size_t r1, size_t r2, unsigned char* comp_data, float* hist_data);
+void decompressDataSeries_float_2D_decompression_given_areas_with_blocked_regression(float** data, size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2, unsigned char* comp_data);
+void decompressDataSeries_float_3D_nonblocked_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data, float* hist_data);
+void decompressDataSeries_float_3D_random_access_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+void decompressDataSeries_float_3D_decompression_random_access_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+void decompressDataSeries_float_3D_decompression_given_areas_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3, unsigned char* comp_data);
+int SZ_decompress_args_randomaccess_float(float** newData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, 
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1, // start point
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1, // end point
+unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Float_H  ----- */
diff --git a/deps/SZ/sz/include/szd_float_pwr.h b/deps/SZ/sz/include/szd_float_pwr.h
new file mode 100644
index 0000000000000000000000000000000000000000..35249027f0116b98f3f5771473872db1a87cd9dc
--- /dev/null
+++ b/deps/SZ/sz/include/szd_float_pwr.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_float_pwr.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_float_pwr.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Float_PWR_H
+#define _SZD_Float_PWR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void decompressDataSeries_float_1D_pwr(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+float* extractRealPrecision_2D_float(size_t R1, size_t R2, int blockSize, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_2D_pwr(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps);
+float* extractRealPrecision_3D_float(size_t R1, size_t R2, size_t R3, int blockSize, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_3D_pwr(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps);
+
+char* decompressGroupIDArray(unsigned char* bytes, size_t dataLength);
+void decompressDataSeries_float_1D_pwrgroup(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_1D_pwr_pre_log(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_2D_pwr_pre_log(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_3D_pwr_pre_log(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps);
+
+void decompressDataSeries_float_1D_pwr_pre_log_MSST19(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_2D_pwr_pre_log_MSST19(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_3D_pwr_pre_log_MSST19(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Float_PWR_H  ----- */
+
diff --git a/deps/SZ/sz/include/szd_float_ts.h b/deps/SZ/sz/include/szd_float_ts.h
new file mode 100644
index 0000000000000000000000000000000000000000..88ea07f4a8c09a8e23322315ab02438c2abbeabb
--- /dev/null
+++ b/deps/SZ/sz/include/szd_float_ts.h
@@ -0,0 +1,25 @@
+/**
+ *  @file szd_float_ts.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_float_ts.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Float_TS_H
+#define _SZD_Float_TS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageF.h"
+
+void decompressDataSeries_float_1D_ts(float** data, size_t dataSeriesLength, float* hist_data, TightDataPointStorageF* tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Float_TS_H  ----- */
diff --git a/deps/SZ/sz/include/szd_int16.h b/deps/SZ/sz/include/szd_int16.h
new file mode 100644
index 0000000000000000000000000000000000000000..a55a3d0487bfb45e6efa553e2dacbcf0a89d9ae2
--- /dev/null
+++ b/deps/SZ/sz/include/szd_int16.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_int16.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_int16.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Int16_H
+#define _SZD_Int16_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+
+#define SZ_INT16_MIN (-32768) /* parenthesized so the sign cannot bind to neighbouring tokens */
+#define SZ_INT16_MAX 32767
+
+void decompressDataSeries_int16_1D(int16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_int16_2D(int16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_int16_3D(int16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_int16_4D(int16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_int16_1D(int16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int16_2D(int16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int16_3D(int16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int16_4D(int16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_int16(int16_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Int16_H  ----- */
diff --git a/deps/SZ/sz/include/szd_int32.h b/deps/SZ/sz/include/szd_int32.h
new file mode 100644
index 0000000000000000000000000000000000000000..233901f54e88b1d13586d2533fc16775f9d9f17a
--- /dev/null
+++ b/deps/SZ/sz/include/szd_int32.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_int32.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_int32.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Int32_H
+#define _SZD_Int32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+
+#define SZ_INT32_MIN (-2147483647 - 1) /* not -2147483648: that literal does not fit in int, so the negated constant is long or even unsigned on some compilers */
+#define SZ_INT32_MAX 2147483647
+
+void decompressDataSeries_int32_1D(int32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_int32_2D(int32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_int32_3D(int32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_int32_4D(int32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_int32_1D(int32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int32_2D(int32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int32_3D(int32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int32_4D(int32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_int32(int32_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Int32_H  ----- */
diff --git a/deps/SZ/sz/include/szd_int64.h b/deps/SZ/sz/include/szd_int64.h
new file mode 100644
index 0000000000000000000000000000000000000000..5dcb97ac9be5bf6f544f29455189cd50ca878c25
--- /dev/null
+++ b/deps/SZ/sz/include/szd_int64.h
@@ -0,0 +1,35 @@
+/**
+ *  @file szd_int64.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_int64.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Int64_H
+#define _SZD_Int64_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+
+void decompressDataSeries_int64_1D(int64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_int64_2D(int64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_int64_3D(int64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_int64_4D(int64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_int64_1D(int64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int64_2D(int64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int64_3D(int64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int64_4D(int64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_int64(int64_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Int64_H  ----- */
diff --git a/deps/SZ/sz/include/szd_int8.h b/deps/SZ/sz/include/szd_int8.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6186f866d008fa27e2978c89b4d207cd7426a2a
--- /dev/null
+++ b/deps/SZ/sz/include/szd_int8.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_int8.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_int8.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Int8_H
+#define _SZD_Int8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+
+#define SZ_INT8_MIN (-128) /* parenthesized so the sign cannot bind to neighbouring tokens */
+#define SZ_INT8_MAX 127
+
+void decompressDataSeries_int8_1D(int8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_int8_2D(int8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_int8_3D(int8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_int8_4D(int8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_int8_1D(int8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int8_2D(int8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int8_3D(int8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int8_4D(int8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_int8(int8_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Int8_H  ----- */
diff --git a/deps/SZ/sz/include/szd_uint16.h b/deps/SZ/sz/include/szd_uint16.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcd3ed830703818dba7fa5b8b71c84ac448b205a
--- /dev/null
+++ b/deps/SZ/sz/include/szd_uint16.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_uint16.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_uint16.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_UInt16_H
+#define _SZD_UInt16_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+
+#define SZ_UINT16_MIN 0
+#define SZ_UINT16_MAX 65535
+
+void decompressDataSeries_uint16_1D(uint16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint16_2D(uint16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint16_3D(uint16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint16_4D(uint16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_uint16_1D(uint16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint16_2D(uint16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint16_3D(uint16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint16_4D(uint16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_uint16(uint16_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_UInt16_H  ----- */
diff --git a/deps/SZ/sz/include/szd_uint32.h b/deps/SZ/sz/include/szd_uint32.h
new file mode 100644
index 0000000000000000000000000000000000000000..88ff5708a80aea4f28bfd4819b40a39c3f42e36e
--- /dev/null
+++ b/deps/SZ/sz/include/szd_uint32.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_uint32.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_uint32.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_UInt32_H
+#define _SZD_UInt32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+
+#define SZ_UINT32_MIN 0
+#define SZ_UINT32_MAX 4294967295
+
+void decompressDataSeries_uint32_1D(uint32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint32_2D(uint32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint32_3D(uint32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint32_4D(uint32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_uint32_1D(uint32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint32_2D(uint32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint32_3D(uint32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint32_4D(uint32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_uint32(uint32_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_UInt32_H  ----- */
diff --git a/deps/SZ/sz/include/szd_uint64.h b/deps/SZ/sz/include/szd_uint64.h
new file mode 100644
index 0000000000000000000000000000000000000000..6992c68ebceaaeae5be46e6a2228457e1ee85cd2
--- /dev/null
+++ b/deps/SZ/sz/include/szd_uint64.h
@@ -0,0 +1,35 @@
+/**
+ *  @file szd_uint64.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_uint64.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_UInt64_H
+#define _SZD_UInt64_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+
+void decompressDataSeries_uint64_1D(uint64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint64_2D(uint64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint64_3D(uint64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint64_4D(uint64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_uint64_1D(uint64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint64_2D(uint64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint64_3D(uint64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint64_4D(uint64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_uint64(uint64_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_UInt64_H  ----- */
diff --git a/deps/SZ/sz/include/szd_uint8.h b/deps/SZ/sz/include/szd_uint8.h
new file mode 100644
index 0000000000000000000000000000000000000000..2366c7e08e0f048c4634f7689e65d3b48fcc7bf2
--- /dev/null
+++ b/deps/SZ/sz/include/szd_uint8.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_uint8.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_uint8.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_UInt8_H
+#define _SZD_UInt8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+
+#define SZ_UINT8_MIN 0
+#define SZ_UINT8_MAX 255
+
+void decompressDataSeries_uint8_1D(uint8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint8_2D(uint8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint8_3D(uint8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint8_4D(uint8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_uint8_1D(uint8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint8_2D(uint8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint8_3D(uint8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint8_4D(uint8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_uint8(uint8_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_UInt8_H  ----- */
diff --git a/deps/SZ/sz/include/szf.h b/deps/SZ/sz/include/szf.h
new file mode 100644
index 0000000000000000000000000000000000000000..7cf2e838da3e157df1ce9ff8df6424c76b2c2ea8
--- /dev/null
+++ b/deps/SZ/sz/include/szf.h
@@ -0,0 +1,102 @@
+/**
+ *  @file szf.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szf.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZF_H
+#define _SZF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+//szf.c
+void sz_init_c_(char *configFile,int *len,int *ierr);
+void sz_finalize_c_(void); /* explicit (void): an empty parameter list in a C declaration leaves the arguments unchecked */
+void SZ_writeData_inBinary_d1_Float_(float* data, char *fileName, int *len);
+void sz_compress_d1_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1);
+void sz_compress_d1_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1);
+void sz_compress_d2_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2);
+void sz_compress_d2_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2);
+void sz_compress_d3_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d3_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d4_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_compress_d5_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+
+void sz_compress_d1_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1);
+void sz_compress_d1_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1);
+void sz_compress_d2_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2);
+void sz_compress_d2_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2);
+void sz_compress_d3_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d3_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d4_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_compress_d5_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+
+void sz_compress_d1_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1);
+void sz_compress_d2_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2);
+void sz_compress_d3_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_compress_d1_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1);
+void sz_compress_d2_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2);
+void sz_compress_d3_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+
+void sz_compress_d1_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1);
+void sz_compress_d2_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2);
+void sz_compress_d3_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_compress_d1_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1); /* NOTE(review): reservedValue is float* in the d1-d3 double variants but double* in d4/d5 -- confirm against the definitions in szf.c */
+void sz_compress_d2_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2);
+void sz_compress_d3_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_double_rev_args_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_double_rev_args_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+
+void sz_decompress_d1_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1);
+void sz_decompress_d2_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2);
+void sz_decompress_d3_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3);
+void sz_decompress_d4_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_decompress_d5_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_decompress_d1_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1);
+void sz_decompress_d2_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2);
+void sz_decompress_d3_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3);
+void sz_decompress_d4_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_decompress_d5_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+
+void sz_batchaddVar_d1_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1); /* NOTE(review): capital 'V' differs from the sz_batchaddvar_* siblings -- confirm the symbol name matches its definition and callers */
+void sz_batchaddvar_d2_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2);
+void sz_batchaddvar_d3_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_batchaddvar_d4_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_batchaddvar_d5_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_batchaddvar_d1_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1);
+void sz_batchaddvar_d2_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2);
+void sz_batchaddvar_d3_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_batchaddvar_d4_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_batchaddvar_d5_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_batchdelvar_c_(char* varName, int *len, int *errState);
+void sz_batch_compress_c_(unsigned char* bytes, size_t *outSize);
+void sz_batch_decompress_c_(unsigned char* bytes, size_t *byteLength, int *ierr);
+void sz_getvardim_c_(char* varName, int *len, int *dim, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void compute_total_batch_size_c_(size_t *totalSize);
+void sz_getvardata_float_(char* varName, int *len, float* data);
+void sz_getvardata_double_(char* varName, int *len, double* data);
+void sz_freevarset_c_(int *mode);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZF_H  ----- */
+
diff --git a/deps/SZ/sz/include/utility.h b/deps/SZ/sz/include/utility.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4ae6415890bc71139bc6300ca4c8a981c757833
--- /dev/null
+++ b/deps/SZ/sz/include/utility.h
@@ -0,0 +1,45 @@
+/**
+ *  @file utility.h
+ *  @author Sheng Di, Sihuan Li
+ *  @date July, 2018
+ *  @brief Header file for the utility.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _UTILITY_H
+#define _UTILITY_H
+
+#include "sz.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//sihuan added: an assistant struct that makes sorting and swapping easy to implement;
+//its performance should be optimized later.
+typedef struct sort_ast_particle{
+	int64_t id;
+	float var[6];
+} sort_ast_particle;
+
+int compare_struct(const void* obj1, const void* obj2);//sihuan added: the compare function in the qsort parameter for 2 structures
+void reorder_vars(SZ_VarSet* vset);//sihuan added: reorder the variables increasingly by their index
+size_t intersectAndsort(int64_t* preIndex, size_t preLen, SZ_VarSet* curVar, size_t dataLen, unsigned char* bitarray);
+//sihuan added: find intersection and keep new var sorted by id
+void write_reordered_tofile(SZ_VarSet* curVar, size_t dataLen);
+//sihuan added: write the reordered input to files for further decompression validation
+float calculate_delta_t(size_t size);//sihuan added
+
+int is_lossless_compressed_data(unsigned char* compressedBytes, size_t cmpSize);
+unsigned long sz_lossless_compress(int losslessCompressor, int level, unsigned char* data, unsigned long dataLength, unsigned char** compressBytes);
+unsigned long sz_lossless_decompress(int losslessCompressor, unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long sz_lossless_decompress65536bytes(int losslessCompressor, unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData);
+void* detransposeData(void* data, int dataType, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+void* transposeData(void* data, int dataType, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _UTILITY_H  ----- */
diff --git a/deps/SZ/sz/src/ArithmeticCoding.c b/deps/SZ/sz/src/ArithmeticCoding.c
new file mode 100644
index 0000000000000000000000000000000000000000..278a2e917542dffaf2ecce457b17df618c17eca8
--- /dev/null
+++ b/deps/SZ/sz/src/ArithmeticCoding.c
@@ -0,0 +1,692 @@
+/**
+ *  @file ArithmeticCoding.c
+ *  @author Sheng Di, Mark Thomas Nelson
+ *  @date April, 2016
+ *  @brief Arithmetic coding toolkit
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *  (C) The MIT License (MIT), this code was modified from Mark's arithmetic coding code: http://www.drdobbs.com/cpp/data-compression-with-arithmetic-encodin/240169251?pgno=1
+ */
+#include <sz.h>
+#include <ArithmeticCoding.h>
+
+inline void output_bit_1(unsigned int* buf)
+{
+	//shift the accumulator up by one position and set the freed lowest bit
+	*buf = (*buf << 1) | 1u;
+}
+
+inline void output_bit_0(unsigned int* buf)
+{
+	//a left shift already leaves the new lowest bit at 0, so no OR is needed
+	*buf <<= 1;
+}
+
+/**
+ * Return a single 1 bit followed by pending_bits 0 bits, left-aligned in 32 bits.
+ * That word is 0x80000000 for every count that fits (0..31); returning it directly
+ * avoids a shift by 32-(pending_bits+1), which is negative once pending_bits >= 32.
+ * */
+inline unsigned int output_bit_1_plus_pending(int pending_bits)
+{
+	(void)pending_bits; //the trailing zeros are implicit in the left-aligned word
+	return 0x80000000u;
+}
+
+/**
+ * Return a 0 bit followed by pending_bits 1 bits, left-aligned in 32 bits.
+ * At most 31 ones fit; the count is clamped to 0..31 so no shift is out of range.
+ * */
+inline unsigned int output_bit_0_plus_pending(int pending_bits)
+{
+	unsigned int ones = pending_bits < 0 ? 0u : (pending_bits > 31 ? 31u : (unsigned int)pending_bits);
+	return 0x7FFFFFFFu & ~(0x7FFFFFFFu >> ones); //keep the top `ones` bits below bit 31
+}
+
+/**
+ * Allocate a zero-initialized AriCoder and let ari_init() build its frequency table.
+ * @param int numOfStates (input): stored as numOfRealStates, the size of the table
+ * @param int *s (input): the integer code array (i.e., type_array generated by prediction+quantization)
+ * @param size_t length: the number of integer codes in the type_array
+ * @return the new coder (release it with freeAriCoder), or NULL if the allocation fails
+ * */
+AriCoder *createAriCoder(int numOfStates, int *s, size_t length)
+{
+	AriCoder *coder = (AriCoder*)calloc(1, sizeof(AriCoder)); //calloc zero-fills the struct
+	if(coder == NULL)
+		return NULL;
+	coder->numOfRealStates = numOfStates;
+	ari_init(coder, s, length);
+	return coder;
+}
+
+void freeAriCoder(AriCoder *ariCoder)
+{
+	if(ariCoder != NULL) free(ariCoder->cumulative_frequency); //NULL is accepted, like free()
+	free(ariCoder);
+}
+
+/**
+ * Build the cumulative frequency table of an AriCoder from a sequence of integer codes.
+ * Each code that occurs receives a [low, high) slice in cumulative_frequency[code];
+ * codes that never occur keep the all-zero entry left by memset.
+ * When length exceeds MAX_INTERVALS every count is divided by a common step (and kept
+ * at 1 or more), which scales total_frequency down.
+ * @param AriCoder *ariCoder (input/output): numOfRealStates must be set by the caller;
+ *        cumulative_frequency, numOfValidStates and total_frequency are written here
+ * @param int *s (input): the integer codes, each used directly as an index into the table
+ * @param size_t length (input): the number of integer codes in s
+ * */
+void ari_init(AriCoder *ariCoder, int *s, size_t length)
+{
+	const int nStates = ariCoder->numOfRealStates;
+	size_t pos;
+	int code;
+
+	//histogram of how often each code occurs
+	size_t *hist = (size_t *)malloc(nStates*sizeof(size_t));
+	memset(hist, 0, nStates*sizeof(size_t));
+	for(pos = 0; pos < length; pos++)
+		hist[s[pos]]++;
+
+	ariCoder->cumulative_frequency = (Prob *)malloc(nStates*sizeof(Prob));
+	memset(ariCoder->cumulative_frequency, 0, nStates*sizeof(Prob));
+
+	//step stays 1 for short inputs (counts used as they are); otherwise it is
+	//length/MAX_INTERVALS rounded up
+	int step = 1;
+	if(length > MAX_INTERVALS)
+	{
+		size_t quotient = length/MAX_INTERVALS;
+		if(length%MAX_INTERVALS != 0)
+			quotient++;
+		step = quotient;
+	}
+
+	size_t lower = 0, upper = 0, share = 0;
+	int valid = 0;
+	for(code = 0; code < nStates; code++)
+	{
+		if(hist[code] == 0)
+			continue; //this code never occurs: leave its entry zeroed
+		share = hist[code]/step;
+		if(share == 0)
+			share = 1; //an occurring code must keep a non-empty slice
+		upper = lower + share;
+		Prob *slot = &ariCoder->cumulative_frequency[code];
+		slot->low = lower;
+		slot->high = upper;
+		slot->state = code;
+		lower = upper;
+		valid++;
+	}
+	ariCoder->numOfValidStates = valid;
+	ariCoder->total_frequency = upper;
+
+	free(hist);
+}
+
+/**
+ * Serialize an AriCoder: two ints and the 64-bit total, then (low, high, state) per valid state.
+ * @param AriCoder* ariCoder (input)
+ * @param unsigned char** out (output): allocated here with malloc and not freed by this function
+ * Field widths follow total_frequency and numOfRealStates; unpad_ariCoder() applies the same limits.
+ * @return outSize: the number of bytes written
+ * */
+unsigned int pad_ariCoder(AriCoder* ariCoder, unsigned char** out)
+{
+	int numOfRealStates = ariCoder->numOfRealStates;
+	int numOfValidStates = ariCoder->numOfValidStates;
+	uint64_t total_frequency = ariCoder->total_frequency;
+	Prob* cumulative_frequency = ariCoder->cumulative_frequency;
+
+	unsigned int outSize = 0;
+	*out = (unsigned char*)malloc(2*sizeof(int)+sizeof(uint64_t)+sizeof(Prob)*numOfRealStates);
+
+	unsigned char* p = *out;
+	intToBytes_bigEndian(p, numOfRealStates);
+	p+=sizeof(int);
+	intToBytes_bigEndian(p, numOfValidStates);
+	p+=sizeof(int);
+	int64ToBytes_bigEndian(p, total_frequency);
+	p+=sizeof(uint64_t);
+	size_t i = 0;
+	if(total_frequency < 65536) //every low/high must fit in 16 bits; 65536 itself would truncate to 0
+	{
+		uint16_t low, high;
+		if(numOfRealStates<=256)
+		{
+			for(i=0;i<numOfRealStates;i++)
+			{
+				high = (uint16_t)(cumulative_frequency[i].high);
+				if(high!=0) //if this state cell is not null
+				{
+					low = (uint16_t)(cumulative_frequency[i].low);
+					int16ToBytes_bigEndian(p,low);
+					p+=sizeof(uint16_t);
+					int16ToBytes_bigEndian(p,high);
+					p+=sizeof(uint16_t);
+					*(p++)=(unsigned char)cumulative_frequency[i].state;
+					//a high of 0 marks a state that never occurs, so the high of a valid
+					//state must not truncate to 0: hence the strict limit on total_frequency
+				}
+			}
+			outSize = 2*sizeof(int)+sizeof(uint64_t)+ariCoder->numOfValidStates*5; //2*sizeof(uint16_t)+1
+		}
+		else if(numOfRealStates<=65536)
+		{
+			for(i=0;i<numOfRealStates;i++)
+			{
+				high = (uint16_t)(cumulative_frequency[i].high);
+				if(high!=0)
+				{
+					low = (uint16_t)(cumulative_frequency[i].low);
+					int16ToBytes_bigEndian(p,low);
+					p+=sizeof(uint16_t);
+					int16ToBytes_bigEndian(p,high);
+					p+=sizeof(uint16_t);
+					uint16_t state = (uint16_t)cumulative_frequency[i].state;
+					int16ToBytes_bigEndian(p, state);
+					p+=sizeof(uint16_t);
+				}
+			}
+			outSize = 2*sizeof(int)+sizeof(uint64_t)+ariCoder->numOfValidStates*6;
+		}
+		else
+		{
+			for(i=0;i<numOfRealStates;i++)
+			{
+				high = (uint16_t)(cumulative_frequency[i].high);
+				if(high!=0)
+				{
+					low = (uint16_t)(cumulative_frequency[i].low);
+					int16ToBytes_bigEndian(p,low);
+					p+=sizeof(uint16_t);
+					int16ToBytes_bigEndian(p,high);
+					p+=sizeof(uint16_t);
+					int32ToBytes_bigEndian(p, cumulative_frequency[i].state);
+					p+=sizeof(uint32_t);
+				}
+			}
+			outSize = 2*sizeof(int)+sizeof(uint64_t)+ariCoder->numOfValidStates*8;
+		}
+	}
+	else if(total_frequency < 4294967296) //every low/high must fit in 32 bits
+	{
+		uint32_t low, high;
+		if(numOfRealStates<=256)
+		{
+			for(i=0;i<numOfRealStates;i++)
+			{
+				high = (uint32_t)(cumulative_frequency[i].high);
+				if(high!=0)
+				{
+					low = (uint32_t)(cumulative_frequency[i].low);
+					int32ToBytes_bigEndian(p,low);
+					p+=sizeof(uint32_t);
+					int32ToBytes_bigEndian(p,high);
+					p+=sizeof(uint32_t);
+					*(p++)=(unsigned char)cumulative_frequency[i].state;
+				}
+			}
+			outSize = 2*sizeof(int)+sizeof(uint64_t)+ariCoder->numOfValidStates*9;
+		}
+		else if(numOfRealStates<=65536)
+		{
+			for(i=0;i<numOfRealStates;i++)
+			{
+				high = (uint32_t)(cumulative_frequency[i].high);
+				if(high!=0)
+				{
+					low = (uint32_t)(cumulative_frequency[i].low);
+					int32ToBytes_bigEndian(p,low);
+					p+=sizeof(uint32_t);
+					int32ToBytes_bigEndian(p,high);
+					p+=sizeof(uint32_t);
+					uint16_t state = (uint16_t)cumulative_frequency[i].state;
+					int16ToBytes_bigEndian(p, state);
+					p+=sizeof(uint16_t);
+
+				}
+			}
+			outSize = 2*sizeof(int)+sizeof(uint64_t)+ariCoder->numOfValidStates*10;
+		}
+		else
+		{
+			for(i=0;i<numOfRealStates;i++)
+			{
+				high = (uint32_t)(cumulative_frequency[i].high);
+				if(high!=0)
+				{
+					low = (uint32_t)(cumulative_frequency[i].low);
+					int32ToBytes_bigEndian(p,low);
+					p+=sizeof(uint32_t);
+					int32ToBytes_bigEndian(p,high);
+					p+=sizeof(uint32_t);
+					int32ToBytes_bigEndian(p, cumulative_frequency[i].state);
+					p+=sizeof(uint32_t);
+				}
+			}
+			outSize = 2*sizeof(int)+sizeof(uint64_t)+ariCoder->numOfValidStates*12;
+		}
+	}
+	else
+	{
+		uint64_t low, high;
+		if(numOfRealStates<=256)
+		{
+			for(i=0;i<numOfRealStates;i++)
+			{
+				high = (uint64_t)(cumulative_frequency[i].high);
+				if(high!=0)
+				{
+					low = (uint64_t)(cumulative_frequency[i].low);
+					int64ToBytes_bigEndian(p,low);
+					p+=sizeof(uint64_t);
+					int64ToBytes_bigEndian(p,high);
+					p+=sizeof(uint64_t);
+					*(p++)=(unsigned char)cumulative_frequency[i].state;
+				}
+			}
+			outSize = 2*sizeof(int)+sizeof(uint64_t)+ariCoder->numOfValidStates*17;
+		}
+		else if(numOfRealStates<=65536)
+		{
+			for(i=0;i<numOfRealStates;i++)
+			{
+				high = (uint64_t)(cumulative_frequency[i].high);
+				if(high!=0)
+				{
+					low = (uint64_t)(cumulative_frequency[i].low);
+					int64ToBytes_bigEndian(p,low);
+					p+=sizeof(uint64_t);
+					int64ToBytes_bigEndian(p,high);
+					p+=sizeof(uint64_t);
+					uint16_t state = (uint16_t)cumulative_frequency[i].state;
+					int16ToBytes_bigEndian(p, state);
+					p+=sizeof(uint16_t);
+				}
+			}
+			outSize = 2*sizeof(int)+sizeof(uint64_t)+ariCoder->numOfValidStates*18;
+		}
+		else
+		{
+			for(i=0;i<numOfRealStates;i++)
+			{
+				high = (uint64_t)(cumulative_frequency[i].high);
+				if(high!=0)
+				{
+					low = (uint64_t)(cumulative_frequency[i].low);
+					int64ToBytes_bigEndian(p,low);
+					p+=sizeof(uint64_t);
+					int64ToBytes_bigEndian(p,high);
+					p+=sizeof(uint64_t);
+					int32ToBytes_bigEndian(p, cumulative_frequency[i].state);
+					p+=sizeof(uint32_t);
+				}
+			}
+			outSize = 2*sizeof(int)+sizeof(uint64_t)+ariCoder->numOfValidStates*20;
+		}
+	}
+	return outSize;
+}
+
+/**
+ * Reconstruct AriCoder based on the bytes loaded from compressed data
+ * @param AriCoder** ariCoder (output): allocated here together with its cumulative_frequency table
+ * @param unsigned char* bytes (input): the layout written by pad_ariCoder()
+ * NOTE(review): each state read from bytes indexes the table without a range check.
+ * @return offset: the number of bytes consumed
+ * */
+int unpad_ariCoder(AriCoder** ariCoder, unsigned char* bytes)
+{
+	int offset = 0;
+
+	*ariCoder = (AriCoder*)malloc(sizeof(AriCoder));
+	memset(*ariCoder, 0, sizeof(AriCoder));
+
+	unsigned char *p = bytes;
+	int numOfRealStates = (*ariCoder)->numOfRealStates = bytesToInt_bigEndian(p);
+	p += sizeof(int);
+	int numOfValidStates = (*ariCoder)->numOfValidStates = bytesToInt_bigEndian(p);
+	p += sizeof(int);
+	size_t total_frequency = (*ariCoder)->total_frequency = bytesToInt64_bigEndian(p);
+	p += sizeof(uint64_t);
+
+	(*ariCoder)->cumulative_frequency = (Prob*)malloc((*ariCoder)->numOfRealStates*sizeof(Prob));
+	memset((*ariCoder)->cumulative_frequency, 0, (*ariCoder)->numOfRealStates*sizeof(Prob));
+	size_t i = 0;
+	unsigned char *low_p = NULL, *high_p = NULL, *state_p = NULL;
+	int state = 0;
+	if(total_frequency < 65536) //same limit as pad_ariCoder: 16-bit low/high
+	{
+		if(numOfRealStates<=256)
+		{
+			for(i=0;i<numOfValidStates;i++)
+			{
+				low_p = p;
+				high_p = low_p+sizeof(uint16_t);
+				state_p = high_p+sizeof(uint16_t);
+				state = *state_p;
+				(*ariCoder)->cumulative_frequency[state].low = bytesToUInt16_bigEndian(low_p);
+				(*ariCoder)->cumulative_frequency[state].high = bytesToUInt16_bigEndian(high_p);
+				(*ariCoder)->cumulative_frequency[state].state = state;
+
+				p = state_p + 1;
+			}
+			offset = 2*sizeof(int)+sizeof(uint64_t)+(*ariCoder)->numOfValidStates*5; //2*sizeof(uint16_t)+1
+		}
+		else if(numOfRealStates<=65536)
+		{
+			for(i=0;i<numOfValidStates;i++)
+			{
+				low_p = p;
+				high_p = low_p+sizeof(uint16_t);
+				state_p = high_p+sizeof(uint16_t);
+				state = bytesToUInt16_bigEndian(state_p);
+
+				(*ariCoder)->cumulative_frequency[state].low = bytesToUInt16_bigEndian(low_p);
+				(*ariCoder)->cumulative_frequency[state].high = bytesToUInt16_bigEndian(high_p);
+				(*ariCoder)->cumulative_frequency[state].state = state;
+
+				p = state_p + sizeof(uint16_t);
+			}
+			offset = 2*sizeof(int)+sizeof(uint64_t)+(*ariCoder)->numOfValidStates*6;
+		}
+		else
+		{
+			for(i=0;i<numOfValidStates;i++)
+			{
+				low_p = p;
+				high_p = low_p+sizeof(uint16_t);
+				state_p = high_p+sizeof(uint16_t);
+				state = bytesToUInt32_bigEndian(state_p);
+
+				(*ariCoder)->cumulative_frequency[state].low = bytesToUInt16_bigEndian(low_p);
+				(*ariCoder)->cumulative_frequency[state].high = bytesToUInt16_bigEndian(high_p);
+				(*ariCoder)->cumulative_frequency[state].state = state;
+
+				p = state_p + sizeof(uint32_t);
+			}
+			offset = 2*sizeof(int)+sizeof(uint64_t)+(*ariCoder)->numOfValidStates*8;
+		}
+	}
+	else if(total_frequency < 4294967296) //same limit as pad_ariCoder: 32-bit low/high
+	{
+		if(numOfRealStates<=256)
+		{
+			for(i=0;i<numOfValidStates;i++)
+			{
+				low_p = p;
+				high_p = low_p+sizeof(uint32_t);
+				state_p = high_p+sizeof(uint32_t);
+				state = *state_p;
+
+				(*ariCoder)->cumulative_frequency[state].low = bytesToUInt32_bigEndian(low_p);
+				(*ariCoder)->cumulative_frequency[state].high = bytesToUInt32_bigEndian(high_p);
+				(*ariCoder)->cumulative_frequency[state].state = state;
+
+				p = state_p + 1;
+			}
+			offset = 2*sizeof(int)+sizeof(uint64_t)+(*ariCoder)->numOfValidStates*9;
+		}
+		else if(numOfRealStates<=65536)
+		{
+			for(i=0;i<numOfValidStates;i++)
+			{
+				low_p = p;
+				high_p = low_p+sizeof(uint32_t);
+				state_p = high_p+sizeof(uint32_t);
+				state = bytesToUInt16_bigEndian(state_p);
+
+				(*ariCoder)->cumulative_frequency[state].low = bytesToUInt32_bigEndian(low_p);
+				(*ariCoder)->cumulative_frequency[state].high = bytesToUInt32_bigEndian(high_p);
+				(*ariCoder)->cumulative_frequency[state].state = state;
+
+				p = state_p + sizeof(uint16_t);
+			}
+			offset = 2*sizeof(int)+sizeof(uint64_t)+(*ariCoder)->numOfValidStates*10;
+		}
+		else
+		{
+			for(i=0;i<numOfValidStates;i++)
+			{
+				low_p = p;
+				high_p = low_p+sizeof(uint32_t);
+				state_p = high_p+sizeof(uint32_t);
+				state = bytesToUInt32_bigEndian(state_p);
+
+				(*ariCoder)->cumulative_frequency[state].low = bytesToUInt32_bigEndian(low_p);
+				(*ariCoder)->cumulative_frequency[state].high = bytesToUInt32_bigEndian(high_p);
+				(*ariCoder)->cumulative_frequency[state].state = state;
+
+				p = state_p + sizeof(uint32_t);
+			}
+			offset = 2*sizeof(int)+sizeof(uint64_t)+(*ariCoder)->numOfValidStates*12;
+		}
+	}
+	else
+	{
+		if(numOfRealStates<=256)
+		{
+			for(i=0;i<numOfValidStates;i++)
+			{
+				low_p = p;
+				high_p = low_p+sizeof(uint64_t);
+				state_p = high_p+sizeof(uint64_t);
+				state = *state_p;
+
+				(*ariCoder)->cumulative_frequency[state].low = bytesToUInt64_bigEndian(low_p);
+				(*ariCoder)->cumulative_frequency[state].high = bytesToUInt64_bigEndian(high_p);
+				(*ariCoder)->cumulative_frequency[state].state = state;
+
+				p = state_p + 1;
+			}
+			offset = 2*sizeof(int)+sizeof(uint64_t)+(*ariCoder)->numOfValidStates*17;
+		}
+		else if(numOfRealStates<=65536)
+		{
+			for(i=0;i<numOfValidStates;i++)
+			{
+				low_p = p;
+				high_p = low_p+sizeof(uint64_t);
+				state_p = high_p+sizeof(uint64_t);
+				state = bytesToUInt16_bigEndian(state_p);
+
+				(*ariCoder)->cumulative_frequency[state].low = bytesToUInt64_bigEndian(low_p);
+				(*ariCoder)->cumulative_frequency[state].high = bytesToUInt64_bigEndian(high_p);
+				(*ariCoder)->cumulative_frequency[state].state = state;
+
+				p = state_p + sizeof(uint16_t);
+			}
+			offset = 2*sizeof(int)+sizeof(uint64_t)+(*ariCoder)->numOfValidStates*18;
+		}
+		else
+		{
+			for(i=0;i<numOfValidStates;i++)
+			{
+				low_p = p;
+				high_p = low_p+sizeof(uint64_t);
+				state_p = high_p+sizeof(uint64_t);
+				state = bytesToUInt32_bigEndian(state_p);
+
+				(*ariCoder)->cumulative_frequency[state].low = bytesToUInt64_bigEndian(low_p);
+				(*ariCoder)->cumulative_frequency[state].high = bytesToUInt64_bigEndian(high_p);
+				(*ariCoder)->cumulative_frequency[state].state = state;
+
+				p = state_p + sizeof(uint32_t);
+			}
+			offset = 2*sizeof(int)+sizeof(uint64_t)+(*ariCoder)->numOfValidStates*20;
+		}
+	}
+	return offset;
+}
+
+/**
+ * Arithmetic Encoding: narrow [low, high] once per symbol and emit the bits that are settled
+ * @param AriCoder *ariCoder (input): supplies cumulative_frequency and total_frequency
+ * @param int *s (input): the symbols, each used as an index into cumulative_frequency
+ * @param size_t length (input): the number of symbols in s
+ * @param unsigned char *out (output): start of the buffer handed to put_codes_to_output
+ * @param size_t *outSize (output): reset to 0 here, then handed to put_codes_to_output
+ * NOTE(review): no capacity for out is passed in, so the caller must size it -- confirm the bound.
+ * */
+void ari_encode(AriCoder *ariCoder, int *s, size_t length, unsigned char *out, size_t *outSize)
+{
+	int pending_bits = 0; //decisions deferred by the "middle" case below
+	size_t low = 0;
+	size_t high = MAX_CODE;
+	size_t i = 0, range = 0;
+	size_t count = ariCoder->total_frequency;
+	int c = 0, lackBits = 0;
+	*outSize = 0;
+
+	unsigned char *outp = out;
+
+	Prob *cumulative_frequency = ariCoder->cumulative_frequency;
+	unsigned int buf = 0;
+
+	for (i=0;i<length;i++)
+	{
+		c = s[i];
+		Prob p = cumulative_frequency[c];
+		range = high - low + 1;
+		high = low + (range * p.high / count) - 1; //new upper bound: end of the symbol's slice
+		low = low + (range * p.low / count); //new lower bound: start of the symbol's slice
+		for ( ; ; )
+		{
+			if ( high < ONE_HALF ) //both bounds below ONE_HALF: a 0, then the pending bits as 1s
+			{
+				buf = output_bit_0_plus_pending(pending_bits);
+				put_codes_to_output(buf, pending_bits+1, &outp, &lackBits, outSize);
+				pending_bits = 0;
+			}
+			else if ( low >= ONE_HALF ) //both bounds at or above ONE_HALF: a 1, then the pending bits as 0s
+			{
+				buf = output_bit_1_plus_pending(pending_bits);
+				put_codes_to_output(buf, pending_bits+1, &outp, &lackBits, outSize);
+				pending_bits = 0;
+			}
+			else if ( low >= ONE_FOURTH && high < THREE_FOURTHS ) //interval straddles ONE_HALF: defer the decision
+			{
+				pending_bits++;
+				low -= ONE_FOURTH;
+				high -= ONE_FOURTH;
+			} else
+				break;
+			high <<= 1; //rescale: both bounds move up one bit, high gets a 1 shifted in
+			high++;
+			low <<= 1;
+			high &= MAX_CODE; //keep both bounds inside the code width
+			low &= MAX_CODE;
+		}
+	}
+	pending_bits++; //final flush: one more bit pattern chosen from the position of low
+	if(low < ONE_FOURTH)
+	{
+		buf = output_bit_0_plus_pending(pending_bits);
+		put_codes_to_output(buf, pending_bits+1, &outp, &lackBits, outSize);
+	}
+	else
+	{
+		buf = output_bit_1_plus_pending(pending_bits);
+		put_codes_to_output(buf, pending_bits+1, &outp, &lackBits, outSize);
+	}
+}
+
+/**
+ * Find the table entry whose cumulative slice contains scaled_value.
+ * The table is scanned from index 0 and the first entry with
+ * scaled_value < high is returned.
+ * @param AriCoder *ariCoder (input): supplies cumulative_frequency and numOfRealStates
+ * @param size_t scaled_value (input): the position inside the cumulative range
+ *
+ * @return Prob* (output): the matching entry; if no entry matches, the address one
+ *         past the last entry, which must not be dereferenced
+ * */
+Prob* getCode(AriCoder *ariCoder, size_t scaled_value)
+{
+	Prob *table = ariCoder->cumulative_frequency;
+	const int count = ariCoder->numOfRealStates;
+	int idx = 0;
+	//advance while the value lies at or beyond the current entry's upper bound
+	while(idx < count && scaled_value >= table[idx].high)
+		idx++;
+	return table + idx;
+}
+
+/**
+ * Read a single bit of the byte that p points to.
+ * @param unsigned char* p (input): the byte to inspect
+ * @param int offset (input): bit position counted from the most significant bit (0..7)
+ *
+ * @return unsigned char (output) : 1 if that bit is set, 0 otherwise
+ * */
+inline unsigned char get_bit(unsigned char* p, int offset)
+{
+	return (*p & (0x80 >> offset)) != 0; //0x80 >> offset selects the bit counted from the top
+}
+
+/**
+ * Arithmetic Decoding algorithm 
+ * @param AriCoder *ariCoder (input): the encoder with the constructed frequency information
+ * @param unsigned char *s (input): the compressed stream of bytes
+ * @param size_t s_len (input): the number of bytes in the 'unsigned char *s'
+ * @param size_t targetLength (input): the target number of elements in the type array
+ * @param int *out (output) : the result (type array decompressed from the stream 's')
+ * NOTE(review): the first 8 bytes of s are read unconditionally, whatever s_len is.
+ * */
+void ari_decode(AriCoder *ariCoder, unsigned char *s, size_t s_len, size_t targetLength, int *out)
+{
+	size_t high = MAX_CODE;
+	size_t low = 0, i = 0;
+	size_t range = 0, scaled_value = 0;
+	size_t total_frequency = ariCoder->total_frequency;
+	unsigned char *sp = s+5; //next byte to take bits from
+	unsigned int offset = 4; //next bit inside *sp, counted from the most significant bit
+	size_t value = (bytesToUInt64_bigEndian(s) >> 20); //alignment with the MAX_CODE
+	size_t s_counter = sizeof(int); //NOTE(review): sp starts at byte 5 but this counter at sizeof(int) -- confirm the guard below cannot read s[s_len]
+
+	for(i=0;i<targetLength;i++)
+	{
+		range = high -  low + 1;
+		scaled_value = ((value - low + 1) * ariCoder->total_frequency  - 1 ) / range;
+		Prob *p = getCode(ariCoder, scaled_value); //table entry whose slice holds scaled_value
+		out[i] = p->state;  //output the state to the 'out' array
+		high = low + (range*p->high)/total_frequency -1;
+		low = low + (range*p->low)/total_frequency;
+
+		for( ; ; )
+		{
+			if (high < ONE_HALF) {
+			  //do nothing, bit is a zero
+			} else if ( low >= ONE_HALF ) 
+			{
+			  value -= ONE_HALF;  //subtract one half from all three code values
+			  low -= ONE_HALF;
+			  high -= ONE_HALF;
+			} else if ( low >= ONE_FOURTH && high < THREE_FOURTHS ) 
+			{
+			  value -= ONE_FOURTH; //same re-centering for the interval that straddles ONE_HALF
+			  low -= ONE_FOURTH;
+			  high -= ONE_FOURTH;
+			} else
+			  break;
+			low <<= 1; //rescale: all three values move up one bit, high gets a 1 shifted in
+			high <<= 1;
+			high++;
+			value <<= 1;
+			//load one bit from the input byte stream	
+			if(s_counter < s_len)
+			{	
+				value += get_bit(sp, offset++);
+				if(offset==8) //byte exhausted: move on to the next one
+				{
+					sp++;
+					s_counter++;
+					offset = 0;
+				}
+			}
+		}
+	}
+}
diff --git a/deps/SZ/sz/src/ByteToolkit.c b/deps/SZ/sz/src/ByteToolkit.c
new file mode 100644
index 0000000000000000000000000000000000000000..c3d7ad1eb5ccfcabc6861a0a7e04d48d0098108e
--- /dev/null
+++ b/deps/SZ/sz/src/ByteToolkit.c
@@ -0,0 +1,1063 @@
+/**
+ *  @file ByteToolkit.c
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Byte Toolkit
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+ 
+#include <stdlib.h>
+#include "sz.h" 	
+#include "zlib.h"
+
+/**
+ * Read an unsigned 16-bit value stored most-significant byte first.
+ * The bytes are read one at a time, so bytes needs no particular alignment.
+ * @param bytes (input): buffer with at least 2 readable bytes
+ * @return bytes[0] in the high 8 bits and bytes[1] in the low 8 bits
+ * */
+inline unsigned short bytesToUInt16_bigEndian(unsigned char* bytes)
+{
+	unsigned short hi = bytes[0];
+	unsigned short lo = bytes[1];
+
+	//hi moves up by one byte, lo fills the vacated low byte
+	return (unsigned short)((hi << 8) | lo);
+}
+	
+/**
+ * Read an unsigned 32-bit value stored most-significant byte first.
+ *
+ * The accumulator is shifted left by 8 bits before each new byte is OR-ed in,
+ * so bytes[0] ends in the highest position and bytes[3] in the lowest.
+ *
+ * @param bytes (input): buffer with at least 4 readable bytes
+ * @return the assembled value
+ * */
+inline unsigned int bytesToUInt32_bigEndian(unsigned char* bytes)
+{
+	unsigned int acc = 0;
+	int k;
+
+	//make room, then append the next byte
+	for(k = 0; k < 4; k++)
+	{
+		acc <<= 8;
+		acc |= (unsigned int)(bytes[k] & 0xff);
+	}
+
+	return acc;
+}
+
+/**
+ * Read eight bytes stored most-significant byte first into an unsigned long.
+ * Each step shifts the running value left by 8 bits and appends the next byte,
+ * so b[0] is processed first and b[7] ends in the lowest 8 bits.
+ * NOTE(review): the result can hold all eight bytes only where unsigned long is
+ * 64 bits wide -- confirm for every supported target.
+ * @param b (input): buffer with at least 8 readable bytes
+ * @return the assembled value
+ * */
+inline unsigned long bytesToUInt64_bigEndian(unsigned char* b) {
+	unsigned long acc = 0;
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[0] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[1] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[2] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[3] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[4] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[5] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[6] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[7] & 0xff);
+
+	return acc;
+}
+	
+/**
+ * Read a signed 16-bit value stored most-significant byte first.
+ * The two bytes are combined as an unsigned pattern and then converted to short.
+ * @param bytes (input): buffer with at least 2 readable bytes
+ * @return bytes[0] as the high byte and bytes[1] as the low byte, as a short
+ * */
+inline short bytesToInt16_bigEndian(unsigned char* bytes)
+{
+	const int hi = bytes[0] & 0xff;
+	const int lo = bytes[1] & 0xff;
+
+	//combine in int first; the conversion to short happens on return
+	return (short)((hi << 8) | lo);
+}
+	
+/**
+ * Read a signed 32-bit value stored most-significant byte first.
+ *
+ * The bytes are accumulated in an unsigned int: left-shifting a signed int until
+ * a 1 reaches the sign bit is undefined behavior in C, and that is what happens
+ * whenever bytes[0] is 0x80 or larger. The pattern is converted to int at the end.
+ *
+ * @param bytes (input): buffer with at least 4 readable bytes
+ * @return the assembled value
+ * */
+inline int bytesToInt32_bigEndian(unsigned char* bytes)
+{
+	unsigned int acc = 0;
+	int k;
+
+	for(k = 0; k < 4; k++)
+	{
+		acc <<= 8; //make room for the next byte
+		acc |= (unsigned int)(bytes[k] & 0xff);
+	}
+
+	return (int)acc;
+}
+
+/**
+ * Read eight bytes stored most-significant byte first into a long.
+ * The bytes are accumulated in an unsigned long, because left-shifting a signed
+ * long until a 1 reaches the sign bit is undefined behavior in C; the pattern is
+ * converted to long once, on return.
+ * NOTE(review): all eight bytes survive only where long is 64 bits wide.
+ * @param b (input): buffer with at least 8 readable bytes
+ * @return the assembled value
+ * */
+inline long bytesToInt64_bigEndian(unsigned char* b) {
+	unsigned long acc = 0;
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[0] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[1] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[2] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[3] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[4] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[5] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[6] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[7] & 0xff);
+
+	return (long)acc;
+}
+
+/**
+ * Read an int from 4 bytes stored most-significant byte first.
+ *
+ * The bytes are accumulated in an unsigned int: left-shifting a signed int until
+ * a 1 reaches the sign bit is undefined behavior in C, and that is what happens
+ * whenever bytes[0] is 0x80 or larger. The pattern is converted to int at the end.
+ *
+ * @param bytes (input): buffer with at least 4 readable bytes
+ * @return the assembled value
+ * */
+inline int bytesToInt_bigEndian(unsigned char* bytes)
+{
+	unsigned int acc = 0;
+	int k;
+
+	for(k = 0; k < 4; k++)
+	{
+		acc <<= 8; //make room for the next byte
+		acc |= (unsigned int)(bytes[k] & 0xff);
+	}
+
+	return (int)acc;
+}
+
+/**
+ * Store a 32-bit value into 4 bytes, most-significant byte first.
+ * The shifts operate on the numeric value, so the result does not depend on the
+ * byte order of the machine running the code.
+ * @unsigned char *b the variable to store the converted bytes (length=4)
+ * @unsigned int num
+ * */
+inline void intToBytes_bigEndian(unsigned char *b, unsigned int num)
+{
+	int k;
+
+	//the k-th output byte takes the bits that lie 8*(3-k) positions above bit 0
+	for(k = 0; k < 4; k++)
+		b[k] = (unsigned char)(num >> (8*(3-k)));
+}
+
+//Store a 64-bit value into 8 bytes, most-significant byte first.
+inline void int64ToBytes_bigEndian(unsigned char *b, uint64_t num)
+{
+	int k;
+	//walk from the last output byte to the first, peeling off the low 8 bits each time
+	for(k = 7; k >= 0; k--)
+	{
+		b[k] = (unsigned char)num;
+		num >>= 8;
+	}
+}
+
+inline void int32ToBytes_bigEndian(unsigned char *b, uint32_t num)
+{
+	//most-significant byte first: peel off the low 8 bits from the back
+	int k;
+	for(k = 3; k >= 0; k--, num >>= 8)
+		b[k] = (unsigned char)num;
+}
+
+inline void int16ToBytes_bigEndian(unsigned char *b, uint16_t num)
+{
+	b[1] = (unsigned char)(num & 0xff); //low byte goes last
+	b[0] = (unsigned char)((num >> 8) & 0xff); //high byte goes first
+}
+
+/**
+ * Read eight bytes stored most-significant byte first into a long.
+ *
+ * The bytes are accumulated in an unsigned long, because left-shifting a signed
+ * long until a 1 reaches the sign bit is undefined behavior in C; the pattern is
+ * converted to long once, on return.
+ * NOTE(review): all eight bytes survive only where long is 64 bits wide.
+ * NOTE(review): bytesToInt64_bigEndian performs the same conversion; consider merging.
+ *
+ * @param b (input): buffer with at least 8 readable bytes
+ * @return the assembled value
+ * */
+inline long bytesToLong_bigEndian(unsigned char* b) {
+	unsigned long acc = 0;
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[0] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[1] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[2] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[3] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[4] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[5] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[6] & 0xff);
+
+	acc <<= 8;
+	acc |= (unsigned long)(b[7] & 0xff);
+
+	return (long)acc;
+}
+
+/**
+ * Store num into 8 bytes, most-significant byte first.
+ * The value is widened to uint64_t first so that every shift stays below the
+ * operand width even where unsigned long has only 32 bits.
+ * */
+inline void longToBytes_bigEndian(unsigned char *b, unsigned long num) 
+{
+	uint64_t wide = (uint64_t)num;
+	int k;
+	//fill from the last byte backwards, peeling off the low 8 bits each time
+	for(k = 7; k >= 0; k--, wide >>= 8)
+		b[k] = (unsigned char)wide;
+}
+
+
+//Return the bit pattern of value as seen through the lvalue member of ldouble.
+inline long doubleToOSEndianLong(double value)
+{
+	ldouble pun = { .value = value };
+	return pun.lvalue;
+}
+
+//Return the bit pattern of value as seen through the ivalue member of lfloat.
+inline int floatToOSEndianInt(float value)
+{
+	lfloat pun = { .value = value };
+	return pun.ivalue;
+}
+
+//TODO: debug: lfBuf.lvalue could be actually little_endian....
+/**
+ * Return the exponent field of a float with the bias of 127 removed.
+ * The bits are read through the ivalue member of the lfloat union.
+ * */
+inline short getExponent_float(float value)
+{
+	lfloat pun = { .value = value };
+	const int bits = pun.ivalue;
+	//bits 23..30 hold the exponent field
+	const int biased = (bits & 0x7F800000) >> 23;
+	return (short)(biased - 127);
+}
+
+/**
+ * Return the exponent field of precision with the bias of 127 removed.
+ * The mantissa is not inspected; an adjustment by the first mantissa bit that
+ * once followed this computation is disabled.
+ * */
+inline short getPrecisionReqLength_float(float precision)
+{
+	lfloat pun = { .value = precision };
+	const int bits = pun.ivalue;
+	const int field = (bits & 0x7F800000) >> 23;
+
+	return (short)(field - 127);
+}
+
+/**
+ * Return the exponent field of a double with the bias of 1023 removed.
+ * The bits are read through the lvalue member of the ldouble union.
+ * */
+inline short getExponent_double(double value)
+{
+	ldouble pun = { .value = value };
+	const long bits = pun.lvalue;
+	//bits 52..62 hold the exponent field
+	const int biased = (int)((bits & 0x7FF0000000000000) >> 52);
+	return (short)(biased - 1023);
+}
+
+/**
+ * Return the exponent field of precision with the bias of 1023 removed.
+ * The mantissa is not inspected; an adjustment by the first mantissa bit that
+ * once followed this computation is disabled.
+ * */
+inline short getPrecisionReqLength_double(double precision)
+{
+	ldouble pun = { .value = precision };
+	const long bits = pun.lvalue;
+	const int field = (int)((bits & 0x7FF0000000000000) >> 52);
+
+	return (short)(field - 1023);
+}
+
+//Leading zero count of i (32 for 0); works on an unsigned copy because left-shifting a signed int into the sign bit is undefined.
+unsigned char numberOfLeadingZeros_Int(int i) {
+	unsigned int u = (unsigned int)i;
+	unsigned char n = 1;
+	if (u == 0) return 32;
+	if (u >> 16 == 0) { n += 16; u <<= 16; }
+	if (u >> 24 == 0) { n +=  8; u <<=  8; }
+	if (u >> 28 == 0) { n +=  4; u <<=  4; }
+	if (u >> 30 == 0) { n +=  2; u <<=  2; }
+	return (unsigned char)(n - (u >> 31)); //n was seeded with 1; take it back if the top bit is set
+}
+
+//Leading zero count of i viewed as a 64-bit pattern (64 for 0); fixed-width unsigned temporaries keep every shift defined.
+unsigned char numberOfLeadingZeros_Long(long i) {
+	uint64_t wide = (uint64_t)(unsigned long)i;
+	uint32_t x = (uint32_t)(wide >> 32); //upper half first
+	unsigned char n = 1;
+	if (wide == 0) return 64;
+	if (x == 0) { n += 32; x = (uint32_t)wide; }
+	if (x >> 16 == 0) { n += 16; x <<= 16; }
+	if (x >> 24 == 0) { n +=  8; x <<=  8; }
+	if (x >> 28 == 0) { n +=  4; x <<=  4; }
+	if (x >> 30 == 0) { n +=  2; x <<=  2; }
+	return (unsigned char)(n - (x >> 31)); //n was seeded with 1; take it back if the top bit is set
+}
+
+unsigned char getLeadingNumbers_Int(int v1, int v2)
+{
+	//XOR clears every leading bit on which v1 and v2 agree
+	return numberOfLeadingZeros_Int(v1 ^ v2);
+}
+
+unsigned char getLeadingNumbers_Long(long v1, long v2)
+{
+	//XOR clears every leading bit on which v1 and v2 agree
+	return numberOfLeadingZeros_Long(v1 ^ v2);
+}
+
+/**
+ * Reinterpret two bytes as a short in the machine's own byte order (no swap).
+ * */
+short bytesToShort(unsigned char* bytes)
+{
+	lint16 pun;
+	pun.byte[0] = bytes[0];
+	pun.byte[1] = bytes[1];
+	return pun.svalue;
+}
+
+void shortToBytes(unsigned char* b, short value)
+{
+	lint16 pun = { .svalue = value };
+	b[0] = pun.byte[0]; //bytes leave in memory order, no swap
+	b[1] = pun.byte[1];
+}
+
+int bytesToInt(unsigned char* bytes)
+{
+	lfloat pun; //the 4 bytes are copied unchanged, i.e. in machine byte order
+	pun.byte[0] = bytes[0]; pun.byte[1] = bytes[1]; pun.byte[2] = bytes[2]; pun.byte[3] = bytes[3];
+	return pun.ivalue;
+}
+
+long bytesToLong(unsigned char* bytes)
+{
+	ldouble pun; int k;
+	for(k = 0; k < 8; k++) pun.byte[k] = bytes[k]; //unchanged order: machine byte order
+	return pun.lvalue;
+}
+
+//Decode a float from 4 bytes given in big-endian order; little-endian machines pass the copy through symTransform_4bytes.
+inline float bytesToFloat(unsigned char* bytes)
+{
+	lfloat pun;
+	pun.byte[0] = bytes[0]; pun.byte[1] = bytes[1]; pun.byte[2] = bytes[2]; pun.byte[3] = bytes[3];
+	if(LITTLE_ENDIAN_SYSTEM == sysEndianType)
+		symTransform_4bytes(pun.byte);
+	return pun.value;
+}
+
+inline void floatToBytes(unsigned char *b, float num)
+{
+	lfloat pun = { .value = num };
+	b[0] = pun.byte[0]; b[1] = pun.byte[1]; b[2] = pun.byte[2]; b[3] = pun.byte[3];
+	//little-endian machines pass the output through symTransform_4bytes
+	if(LITTLE_ENDIAN_SYSTEM == sysEndianType)
+		symTransform_4bytes(b);
+}
+
+//Decode a double from 8 bytes given in big-endian order; little-endian machines pass the copy through symTransform_8bytes.
+inline double bytesToDouble(unsigned char* bytes)
+{
+	ldouble pun; int k;
+	for(k = 0; k < 8; k++) pun.byte[k] = bytes[k];
+	if(LITTLE_ENDIAN_SYSTEM == sysEndianType)
+		symTransform_8bytes(pun.byte);
+	return pun.value;
+}
+
+inline void doubleToBytes(unsigned char *b, double num)
+{
+	ldouble pun = { .value = num };
+	int k;
+	for(k = 0; k < 8; k++) b[k] = pun.byte[k];
+	if(LITTLE_ENDIAN_SYSTEM == sysEndianType) //little-endian machines transform the output in place
+		symTransform_8bytes(b);
+}
+
+/**
+ * Extract validLength bits that start at bit position k of byteArray and return
+ * them right-aligned. Bit 0 is the most significant bit of byteArray[0].
+ *
+ * The touched bytes are gathered at the end of a zeroed 4-byte scratch buffer. The
+ * slot is computed from the buffer size, not from exe_params->SZ_SIZE_TYPE, so the
+ * writes stay inside the buffer, and the span is validated before any byte is
+ * copied. All shifts are done on an unsigned value.
+ *
+ * @param byteArray (input): the bit stream
+ * @param k (input): index of the first bit to extract
+ * @param validLength (input): number of bits; the span may touch at most 4 bytes
+ * @return the extracted bits as a non-negative number (for fewer than 32 bits)
+ * */
+int extractBytes(unsigned char* byteArray, size_t k, int validLength)
+{
+	const size_t outIndex = k/8;
+	const int innerIndex = k%8;
+	const int length = innerIndex + validLength;
+	const int byteNum = (length + 7)/8; //bytes touched by the span
+	unsigned char intBytes[4] = {0, 0, 0, 0};
+	unsigned int result;
+	int i;
+
+	if(byteNum < 1 || byteNum > 4)
+	{
+		printf("Error: other cases are impossible...\n");
+		exit(0);
+	}
+	for(i = 0;i<byteNum;i++)
+		intBytes[4-byteNum+i] = byteArray[outIndex+i];
+	result = ((unsigned int)intBytes[0] << 24) | ((unsigned int)intBytes[1] << 16) | ((unsigned int)intBytes[2] << 8) | (unsigned int)intBytes[3];
+
+	//drop the bits in front of the span, then the bits behind it
+	result <<= innerIndex;
+	if(byteNum < 4)
+		result &= (1u << (8*byteNum)) - 1u;
+	return (int)(result >> (innerIndex + (8 - length%8)%8));
+}
+
+/**
+ * Return a mask with the m lowest bits set, for m between 1 and 8.
+ * Any other m yields 0.
+ * */
+inline int getMaskRightCode(int m) {
+	//index = number of low bits to keep; slot 0 is never selected by the range test
+	const int lowMasks[9] = {
+		0x00,
+		0x01,
+		0x03,
+		0x07,
+		0x0F,
+		0x1F,
+		0x3F,
+		0x7F,
+		0xFF
+	};
+
+	if(m < 1 || m > 8)
+		return 0;
+	return lowMasks[m];
+}
+
+inline int getLeftMovingCode(int kMod8)
+{
+	return getMaskRightCode(8 - kMod8); //mask of the (8 - kMod8) lowest bits
+}
+
+inline int getRightMovingSteps(int kMod8, int resiBitLength) {
+	return 8 - kMod8 - resiBitLength; //presumably the bits left in the byte behind the field; negative when it does not fit -- TODO confirm
+}
+
+/**
+ * Return the byte mask that belongs to a field of resiBitLength bits which starts
+ * kMod8 bits into a byte.
+ *  - If the field ends inside the byte, the mask is the difference between the
+ *    low-bit masks of (8 - kMod8) and (8 - kMod8 - resiBitLength) bits.
+ *  - If the field runs past the byte by 1..7 bits, the mask has that many top
+ *    bits set; a larger overrun yields 0.
+ * */
+inline int getRightMovingCode(int kMod8, int resiBitLength)
+{
+	const int bitsAfterStart = 8 - kMod8;
+	const int spare = bitsAfterStart - resiBitLength;
+
+	if(spare >= 0)
+	{
+		//the field fits: keep everything below its start except what lies below its end
+		return getMaskRightCode(bitsAfterStart) - getMaskRightCode(spare);
+	}
+
+	//the field overruns the byte: index = number of overrunning bits
+	const int highMasks[8] = {
+		0x00, //unused: an overrun is at least 1
+		0x80,
+		0xC0,
+		0xE0,
+		0xF0,
+		0xF8,
+		0xFC,
+		0xFE
+	};
+	const int overrun = -spare;
+	return overrun <= 7 ? highMasks[overrun] : 0;
+}
+
+/**
+ * Decode a byte stream into a newly allocated array of short values.
+ * Every pair of input bytes becomes one short; the two bytes are swapped when
+ * sysEndianType differs from dataEndianType. A trailing odd byte is ignored.
+ *
+ * @param bytes input buffer; 2*(byteLength/2) bytes are read
+ * @param byteLength number of input bytes
+ * @return malloc'ed array of byteLength/2 shorts (not freed here)
+ * */
+short* convertByteDataToShortArray(unsigned char* bytes, size_t byteLength)
+{
+	lint16 pair;
+	size_t count = byteLength/2;
+	size_t n;
+	short* out = (short*)malloc(count*sizeof(short));
+	/* offset of the byte copied into pair.byte[0]: 0 keeps the order, 1 swaps it */
+	size_t lead = (sysEndianType==dataEndianType) ? 0 : 1;
+	for(n=0;n<count;n++)
+	{
+		pair.byte[0] = bytes[n*2+lead];
+		pair.byte[1] = bytes[n*2+1-lead];
+		out[n] = pair.svalue;
+	}
+	return out;
+}
+
+/**
+ * Decode a byte stream into a newly allocated array of unsigned short values.
+ * Every pair of input bytes becomes one value; the two bytes are swapped when
+ * sysEndianType differs from dataEndianType. A trailing odd byte is ignored.
+ *
+ * @param bytes input buffer; 2*(byteLength/2) bytes are read
+ * @param byteLength number of input bytes
+ * @return malloc'ed array of byteLength/2 unsigned shorts (not freed here)
+ * */
+unsigned short* convertByteDataToUShortArray(unsigned char* bytes, size_t byteLength)
+{
+	lint16 pair;
+	size_t count = byteLength/2;
+	size_t n;
+	unsigned short* out = (unsigned short*)malloc(count*sizeof(unsigned short));
+	/* offset of the byte copied into pair.byte[0]: 0 keeps the order, 1 swaps it */
+	size_t lead = (sysEndianType==dataEndianType) ? 0 : 1;
+	for(n=0;n<count;n++)
+	{
+		pair.byte[0] = bytes[n*2+lead];
+		pair.byte[1] = bytes[n*2+1-lead];
+		out[n] = pair.usvalue;
+	}
+	return out;
+}
+
+/**
+ * Serialize an array of short values into bytes, two bytes per element.
+ *
+ * The in-memory byte order of each value is kept when sysEndianType equals
+ * dataEndianType and reversed otherwise.
+ *
+ * @param states input values
+ * @param stateLength number of elements in states
+ * @param bytes output buffer; 2*stateLength bytes are written
+ * */
+void convertShortArrayToBytes(short* states, size_t stateLength, unsigned char* bytes)
+{
+	lint16 pair;
+	size_t n;
+	/* index of the union byte emitted first: 0 = memory order, 1 = reversed */
+	int lead = (sysEndianType==dataEndianType) ? 0 : 1;
+	for(n=0;n<stateLength;n++)
+	{
+		pair.svalue = states[n];
+		bytes[n*2] = pair.byte[lead];
+		bytes[n*2+1] = pair.byte[1-lead];
+	}
+}
+
+/**
+ * Serialize an array of unsigned short values into bytes, two bytes per element.
+ *
+ * The in-memory byte order of each value is kept when sysEndianType equals
+ * dataEndianType and reversed otherwise.
+ *
+ * @param states input values
+ * @param stateLength number of elements in states
+ * @param bytes output buffer; 2*stateLength bytes are written
+ * */
+void convertUShortArrayToBytes(unsigned short* states, size_t stateLength, unsigned char* bytes)
+{
+	lint16 pair;
+	size_t n;
+	/* index of the union byte emitted first: 0 = memory order, 1 = reversed */
+	int lead = (sysEndianType==dataEndianType) ? 0 : 1;
+	for(n=0;n<stateLength;n++)
+	{
+		pair.usvalue = states[n];
+		bytes[n*2] = pair.byte[lead];
+		bytes[n*2+1] = pair.byte[1-lead];
+	}
+}
+
+/**
+ * Serialize an array of int values into bytes, four bytes per element.
+ *
+ * Each value is viewed through the lint32 union; its bytes are copied in
+ * memory order when sysEndianType equals dataEndianType and in reverse
+ * order otherwise.
+ *
+ * @param states input values
+ * @param stateLength number of elements in states
+ * @param bytes output buffer; 4*stateLength bytes are written
+ * */
+void convertIntArrayToBytes(int* states, size_t stateLength, unsigned char* bytes)
+{
+	lint32 word;
+	size_t n, base;
+	int b;
+	/* decide the copy direction once, outside the loop */
+	int sameOrder = (sysEndianType==dataEndianType);
+
+	for(n=0;n<stateLength;n++)
+	{
+		base = n << 2; /* n*4: first output byte of element n */
+		word.ivalue = states[n];
+		for(b=0;b<4;b++)
+		{
+			/* pick byte b directly, or mirrored (3-b) when swapping */
+			bytes[base+b] = word.byte[sameOrder ? b : 3-b];
+		}
+	}
+}
+
+/**
+ * Serialize an array of unsigned int values into bytes, four bytes per element.
+ *
+ * Each value is viewed through the lint32 union; its bytes are copied in
+ * memory order when sysEndianType equals dataEndianType and in reverse
+ * order otherwise.
+ *
+ * @param states input values
+ * @param stateLength number of elements in states
+ * @param bytes output buffer; 4*stateLength bytes are written
+ * */
+void convertUIntArrayToBytes(unsigned int* states, size_t stateLength, unsigned char* bytes)
+{
+	lint32 word;
+	size_t n, base;
+	int b;
+	/* decide the copy direction once, outside the loop */
+	int sameOrder = (sysEndianType==dataEndianType);
+
+	for(n=0;n<stateLength;n++)
+	{
+		base = n << 2; /* n*4: first output byte of element n */
+		word.uivalue = states[n];
+		for(b=0;b<4;b++)
+		{
+			/* pick byte b directly, or mirrored (3-b) when swapping */
+			bytes[base+b] = word.byte[sameOrder ? b : 3-b];
+		}
+	}
+}
+
+/**
+ * Serialize an array of int64_t values into bytes, eight bytes per element.
+ *
+ * Each value is viewed through the lint64 union; its bytes are copied in memory
+ * order when sysEndianType equals dataEndianType and in reverse order otherwise.
+ *
+ * @param states input values
+ * @param stateLength number of elements in states
+ * @param bytes output buffer; 8*stateLength bytes are written
+ * */
+void convertLongArrayToBytes(int64_t* states, size_t stateLength, unsigned char* bytes)
+{
+	lint64 word;
+	size_t n, base;
+	int b;
+	if(sysEndianType==dataEndianType)
+	{
+		/* same byte order: copy the eight bytes as they sit in memory */
+		for(n=0;n<stateLength;n++)
+		{
+			base = n << 3; //==n*8
+			word.lvalue = states[n];
+			for(b=0;b<8;b++)
+				bytes[base+b] = word.byte[b];
+		}
+	}
+	else
+	{
+		/* different byte order: emit the eight bytes back to front */
+		for(n=0;n<stateLength;n++)
+		{
+			base = n << 3; //==n*8
+			word.lvalue = states[n];
+			for(b=0;b<8;b++)
+				bytes[base+b] = word.byte[7-b];
+		}
+	}
+}
+
+/**
+ * Serialize an array of uint64_t values into bytes, eight bytes per element.
+ *
+ * Each value is viewed through the lint64 union; its bytes are copied in memory
+ * order when sysEndianType equals dataEndianType and in reverse order otherwise.
+ *
+ * @param states input values
+ * @param stateLength number of elements in states
+ * @param bytes output buffer; 8*stateLength bytes are written
+ * */
+void convertULongArrayToBytes(uint64_t* states, size_t stateLength, unsigned char* bytes)
+{
+	lint64 word;
+	size_t n, base;
+	int b;
+	if(sysEndianType==dataEndianType)
+	{
+		/* same byte order: copy the eight bytes as they sit in memory */
+		for(n=0;n<stateLength;n++)
+		{
+			base = n << 3; //==n*8
+			word.ulvalue = states[n];
+			for(b=0;b<8;b++)
+				bytes[base+b] = word.byte[b];
+		}
+	}
+	else
+	{
+		/* different byte order: emit the eight bytes back to front */
+		for(n=0;n<stateLength;n++)
+		{
+			base = n << 3; //==n*8
+			word.ulvalue = states[n];
+			for(b=0;b<8;b++)
+				bytes[base+b] = word.byte[7-b];
+		}
+	}
+}
+
+
+/* Decode a big-endian size field: 4 bytes when exe_params->SZ_SIZE_TYPE is 4, otherwise 8 bytes. */
+inline size_t bytesToSize(unsigned char* bytes)
+{
+	size_t wide;
+	if(exe_params->SZ_SIZE_TYPE==4)
+		return bytesToInt_bigEndian(bytes); /* 4-byte field */
+	wide = bytesToLong_bigEndian(bytes); /* 8-byte field */
+	return wide;
+}
+
+inline void sizeToBytes(unsigned char* outBytes, size_t size) /* write size big-endian into outBytes */
+{
+	if(exe_params->SZ_SIZE_TYPE!=4)
+		longToBytes_bigEndian(outBytes, size); /* 8-byte field */
+	else
+		intToBytes_bigEndian(outBytes, size); /* 4-byte field */
+}
+
+/**
+ * Append the bitSize most significant bits of buf to the output stream at *p. On entry *lackBits is the number of still-free low-order bits in the byte **p (0 means *p points at a fresh byte); *p, *lackBits and *outSize are updated for the next call.
+ * */
+void put_codes_to_output(unsigned int buf, int bitSize, unsigned char** p, int* lackBits, size_t *outSize)
+{
+	int byteSize, byteSizep;
+	if(*lackBits == 0) //no partially filled byte pending: the code starts at *p
+	{
+		byteSize = bitSize%8==0 ? bitSize/8 : bitSize/8+1; //it's equal to the number of bytes involved (for *outSize)
+		byteSizep = bitSize >> 3; //it's used to move the pointer p for next data
+		intToBytes_bigEndian(*p, buf); //presumably stores all 4 bytes of buf, most significant first -- confirm against the helper
+		(*p) += byteSizep;
+		*outSize += byteSize;
+		(*lackBits) = bitSize%8==0 ? 0 : 8 - bitSize%8; //free bits left in the last, partially used byte
+	}
+	else
+	{
+		**p = (**p) | (unsigned char)(buf >> (32 - *lackBits)); //fill the free bits of the current byte with the top *lackBits bits of buf
+		if((*lackBits) < bitSize) //the code is longer than the free space: spill into the following bytes
+		{
+			(*p)++;
+			int newCode = buf << (*lackBits); //drop the bits that were just stored
+			intToBytes_bigEndian(*p, newCode);
+			bitSize -= *lackBits; //bits that still had to be written after the current byte
+			byteSizep = bitSize >> 3; // =bitSize/8
+			byteSize = bitSize%8==0 ? byteSizep : byteSizep+1;
+			*p += byteSizep;
+			(*outSize)+=byteSize;
+			(*lackBits) = bitSize%8==0 ? 0 : 8 - bitSize%8;
+		}
+		else //the whole code fitted into the current byte
+		{
+			(*lackBits) -= bitSize;
+			if(*lackBits==0) //byte is now full: move on to the next one
+				(*p)++;
+		}
+	}
+}
+
+/**
+ * Serialize the compression parameters into result (up to result[35] is written): flag byte,
+ * sample distance, prediction threshold, error-bound mode/values, sol_ID, interval count and
+ * min/max. result[15] is not written by this function. */
+void convertSZParamsToBytes(sz_params* params, unsigned char* result)
+{
+	unsigned char buf = 0;
+	//flag1 = result[0], from bit 6 down: optQuantMode(1bit), dataEndianType(1bit), sysEndianType(1bit), szMode(2bits), gzipMode code(2bits)
+	buf = exe_params->optQuantMode;
+	buf = (buf << 1) | dataEndianType;
+	buf = (buf << 1) | sysEndianType;
+	buf = (buf << 2) | params->szMode;
+	
+	int tmp = 0;
+	switch(params->gzipMode)
+	{
+	case Z_BEST_SPEED:
+		tmp = 0;
+		break;
+	case Z_DEFAULT_STRATEGY:
+		tmp = 1;
+		break;
+	case Z_BEST_COMPRESSION:
+		tmp = 2;
+		break;
+	}
+	buf = (buf << 2) | tmp;
+	result[0] = buf;
+	
+    //sampleDistance; //2 bytes
+    int16ToBytes_bigEndian(&result[1], params->sampleDistance);
+    
+    //conf_params->predThreshold;  // 2 bytes
+    short tmp2 = params->predThreshold * 10000;
+    int16ToBytes_bigEndian(&result[3], tmp2);
+     
+    //errorBoundMode; //4bits(0.5 byte), ends up in the high nibble of result[5]
+    result[5] = params->errorBoundMode;
+    
+    //data type (float, double, int8, int16, ....) //10 choices, so 4 bits: mask 0x0F (0x17 dropped bit 3 and overlapped the errorBoundMode nibble)
+    result[5] = (result[5] << 4) | (params->dataType & 0x0F);
+     
+    //result[6..9]: abs_err_bound or psnr //4 bytes
+    //result[10..13]: rel_bound_ratio or pwr_err_bound//4 bytes 
+    switch(params->errorBoundMode)
+    {
+	case ABS:
+		floatToBytes(&result[6], (float)(params->absErrBound)); //big_endian
+		memset(&result[10], 0, 4);
+		break;
+	case REL:
+		memset(&result[6], 0, 4);
+		floatToBytes(&result[10], (float)(params->relBoundRatio)); //big_endian
+		break;
+	case ABS_AND_REL:
+	case ABS_OR_REL:
+		floatToBytes(&result[6], (float)(params->absErrBound));
+		floatToBytes(&result[10], (float)(params->relBoundRatio)); //big_endian
+		break;
+	case PSNR:
+		floatToBytes(&result[6], (float)(params->psnr));
+		memset(&result[10], 0, 4); //clear the unused second slot; offset 9 would wipe the last psnr byte and leave result[13] unset
+		break;
+	case ABS_AND_PW_REL:
+	case ABS_OR_PW_REL:
+		floatToBytes(&result[6], (float)(params->absErrBound));
+		floatToBytes(&result[10], (float)(params->pw_relBoundRatio)); //big_endian	
+		break;
+	case REL_AND_PW_REL:
+	case REL_OR_PW_REL:
+		floatToBytes(&result[6], (float)(params->relBoundRatio));
+		floatToBytes(&result[10], (float)(params->pw_relBoundRatio)); //big_endian	
+		break;
+	case PW_REL:
+		memset(&result[6], 0, 4);
+		floatToBytes(&result[10], (float)(params->pw_relBoundRatio)); //big_endian
+		break;		
+	}
+   
+    //compressor
+    result[14] = (unsigned char)params->sol_ID;
+    
+    
+    if(exe_params->optQuantMode==1)
+		int32ToBytes_bigEndian(&result[16], params->max_quant_intervals);
+	else
+		int32ToBytes_bigEndian(&result[16], params->quantization_intervals);
+	
+	if(params->dataType==SZ_FLOAT)
+	{
+		floatToBytes(&result[20], params->fmin);
+		floatToBytes(&result[24], params->fmax);		
+	}
+	else
+	{
+		doubleToBytes(&result[20], params->dmin);
+		doubleToBytes(&result[28], params->dmax);		
+	}
+}
+
+/**
+ * Inverse of convertSZParamsToBytes: restore exe_params->optQuantMode, dataEndianType and the
+ * fields of params from the header in bytes (up to bytes[35] is read for double data). */
+void convertBytesToSZParams(unsigned char* bytes, sz_params* params)
+{
+	unsigned char flag1 = bytes[0];
+	exe_params->optQuantMode = (flag1 & 0x40) >> 6;
+	dataEndianType = (flag1 & 0x20) >> 5;
+	//bit 4 (the writer's sysEndianType) is not restored: sysEndianType = (flag1 & 0x10) >> 4;
+	
+	params->szMode = (flag1 & 0x0c) >> 2;
+	
+	int tmp = (flag1 & 0x03); //2-bit gzip code; a value of 3 leaves gzipMode untouched
+	switch(tmp)
+	{
+	case 0:
+		params->gzipMode = Z_BEST_SPEED;
+		break;
+	case 1:
+		params->gzipMode = Z_DEFAULT_STRATEGY;
+		break;
+	case 2:
+		params->gzipMode = Z_BEST_COMPRESSION;
+		break;
+	}
+	
+
+	params->sampleDistance = bytesToInt16_bigEndian(&bytes[1]);
+	
+	params->predThreshold = 1.0*bytesToInt16_bigEndian(&bytes[3])/10000.0;
+    
+    params->dataType = bytes[5] & 0x0F; //low nibble; 0x07 could not represent data types 8 and 9
+
+	params->errorBoundMode = (bytes[5] & 0xf0) >> 4;
+
+    switch(params->errorBoundMode)
+    {
+	case ABS:
+		params->absErrBound = bytesToFloat(&bytes[6]);
+		break;
+	case REL:
+		params->relBoundRatio = bytesToFloat(&bytes[10]);
+		break;
+	case ABS_AND_REL:
+	case ABS_OR_REL:
+		params->absErrBound = bytesToFloat(&bytes[6]);
+		params->relBoundRatio = bytesToFloat(&bytes[10]);
+		break;
+	case PSNR:
+		params->psnr = bytesToFloat(&bytes[6]);
+		break;
+	case ABS_AND_PW_REL:
+	case ABS_OR_PW_REL:
+		params->absErrBound = bytesToFloat(&bytes[6]);
+		params->pw_relBoundRatio = bytesToFloat(&bytes[10]);	
+		break;
+	case REL_AND_PW_REL:
+	case REL_OR_PW_REL:
+		params->relBoundRatio = bytesToFloat(&bytes[6]);
+		params->pw_relBoundRatio = bytesToFloat(&bytes[10]);	
+		break;
+	case PW_REL:
+		params->pw_relBoundRatio = bytesToFloat(&bytes[10]);		
+		break;
+	}
+	
+    params->sol_ID = (int)(bytes[14]);
+    
+    if(exe_params->optQuantMode==1)
+    {
+		params->max_quant_intervals = bytesToInt32_bigEndian(&bytes[16]);
+		params->quantization_intervals = 0;
+	}
+	else
+	{
+		params->max_quant_intervals = 0;
+		params->quantization_intervals = bytesToInt32_bigEndian(&bytes[16]);  
+	}
+	
+	if(params->dataType==SZ_FLOAT)
+	{
+		params->fmin = bytesToFloat(&bytes[20]);
+		params->fmax = bytesToFloat(&bytes[24]);		
+	}
+	else if(params->dataType==SZ_DOUBLE)
+	{
+		params->dmin = bytesToDouble(&bytes[20]);
+		params->dmax = bytesToDouble(&bytes[28]);				
+	}
+}
diff --git a/deps/SZ/sz/src/CacheTable.c b/deps/SZ/sz/src/CacheTable.c
new file mode 100644
index 0000000000000000000000000000000000000000..296be0f8face9767fd1848051a96da688669d6fa
--- /dev/null
+++ b/deps/SZ/sz/src/CacheTable.c
@@ -0,0 +1,100 @@
+/**
+ *  @file CacheTable.c
+ *  @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao
+ *  @date Jan, 2019
+ *  @brief Cache Table
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h>
+#include "CacheTable.h"
+
+double* g_CacheTable;
+uint32_t * g_InverseTable;
+uint32_t baseIndex;
+uint32_t topIndex;
+int bits;
+
+inline int doubleGetExpo(double d){ /* unbiased binary exponent of d, sign bit ignored */
+    union { double value; uint64_t raw; } view; /* long may be only 32 bits wide, so use uint64_t */
+    view.value = d;
+    return (int)((view.raw >> 52) & 0x7FF) - 1023; /* 11 exponent bits minus the bias */
+}
+
+int CacheTableGetRequiredBits(double precision, int quantization_intervals){ /* negated exponent of the smallest interval width */
+    int halfIntervals = quantization_intervals>>1;
+    return -(doubleGetExpo(pow((1+precision), -halfIntervals) * precision));
+}
+
+/* Index of value: its raw 32-bit pattern shifted right by (23 - bits); 0 when that shift is not positive. */
+inline uint32_t CacheTableGetIndex(float value, int bits){
+    int shift = 32 - 9 - bits;
+    uint32_t* raw = (uint32_t*)&value;
+    if(shift<=0){
+        return 0;
+    }
+    return (*raw) >> shift;
+}
+
+/* Index of value: its raw 64-bit pattern shifted right by (52 - bits); 0 when that shift is not positive. */
+inline uint64_t CacheTableGetIndexDouble(double value, int bits){
+    int shift = 64 - 12 - bits;
+    uint64_t* raw = (uint64_t*)&value;
+    if(shift<=0){
+        return 0;
+    }
+    return (*raw) >> shift;
+}
+
+/* Range test against the file-level bounds: 1 when baseIndex < index <= topIndex, else 0. */
+inline int CacheTableIsInBoundary(uint32_t index){
+    /* note the asymmetry: baseIndex itself is rejected, topIndex is accepted */
+    int aboveBase = index > baseIndex;
+    int withinTop = index <= topIndex;
+    return (aboveBase && withinTop) ? 1 : 0;
+}
+
+/**
+ * Build the global inverse lookup table g_InverseTable.
+ *
+ * Sets the file-level bits/baseIndex/topIndex, allocates one slot per index in
+ * [baseIndex, topIndex] and, for i from count-1 down to 1, stores i in every
+ * in-range slot between the indices of table[i]/(1+precision) and
+ * table[i]*(1+precision); where ranges overlap the smaller i is written last.
+ * The table is zero-filled first so slots no entry covers read as 0 instead of
+ * uninitialized memory.
+ *
+ * @param table values to index; entry 0 is never stored
+ * @param count number of entries in table
+ * @param smallest value that determines baseIndex
+ * @param largest value that determines topIndex
+ * @param precision relative width applied around each table entry
+ * @param quantization_intervals forwarded to CacheTableGetRequiredBits
+ */
+void CacheTableBuild(double * table, int count, double smallest, double largest, double precision, int quantization_intervals){
+    bits = CacheTableGetRequiredBits(precision, quantization_intervals);
+    baseIndex = CacheTableGetIndex((float)smallest, bits)+1;
+    topIndex = CacheTableGetIndex((float)largest, bits);
+    uint32_t range = topIndex - baseIndex + 1;
+    g_InverseTable = (uint32_t *)calloc(range, sizeof(uint32_t));
+
+    for(int i=count-1; i>0; i--){
+        uint32_t upperIndex = CacheTableGetIndex((float)table[i]*(1+precision), bits);
+        uint32_t lowerIndex = CacheTableGetIndex((float)table[i]/(1+precision), bits);
+        for(uint32_t j = lowerIndex; j<=upperIndex; j++){
+            if(j<baseIndex || j >topIndex){
+                continue;
+            }
+            g_InverseTable[j-baseIndex] = i;
+        }
+    }
+}
+
+inline uint32_t CacheTableFind(uint32_t index){ /* look up the table entry stored for index */
+    return g_InverseTable[index-baseIndex]; /* no range check here; CacheTableIsInBoundary performs one */
+}
+
+void CacheTableFree(){ /* release the table allocated by CacheTableBuild */
+    free(g_InverseTable); /* NOTE(review): the pointer is not reset, so a second call would free it again */
+}
diff --git a/deps/SZ/sz/src/CompressElement.c b/deps/SZ/sz/src/CompressElement.c
new file mode 100644
index 0000000000000000000000000000000000000000..0937300c1d54a8215aea3b5585ea102fbf6f3bfc
--- /dev/null
+++ b/deps/SZ/sz/src/CompressElement.c
@@ -0,0 +1,255 @@
+/**
+ *  @file CompressElement.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Functions of CompressElement
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wchar-subscripts"
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <math.h>
+#include <sz.h>
+#include <CompressElement.h>
+
+/* Huffman-decode dataLength entries from bytes and undo the delta coding of the group IDs. */
+/* Returns a malloc'ed char array of dataLength IDs; nothing is written to it when dataLength is 0. */
+char* decompressGroupIDArray(unsigned char* bytes, size_t dataLength)
+{
+	HuffmanTree* huffmanTree = SZ_Reset(); //create a default huffman tree	
+	int* standGroupID = (int*)malloc(dataLength*sizeof(int));
+	decode_withTree(huffmanTree, bytes, dataLength, standGroupID);
+	SZ_ReleaseHuffman(huffmanTree);
+	char* groupID = (char*)malloc(dataLength*sizeof(char));
+	size_t i = 0;
+	int lastGroupIDValue = 0, curGroupIDValue = 0;
+	int offset = 2*(GROUP_COUNT + 2);
+	
+	if(dataLength > 0) //element 0 is stored as ID+GROUP_COUNT; skip it for an empty array instead of touching index 0
+		lastGroupIDValue = groupID[0] = standGroupID[0] - GROUP_COUNT;
+	for(i=1;i<dataLength;i++)
+	{
+		//later elements are stored as (difference to the previous ID) + offset
+		curGroupIDValue = standGroupID[i] + lastGroupIDValue - offset;
+		lastGroupIDValue = curGroupIDValue;
+		groupID[i] = curGroupIDValue;
+	}
+	free(standGroupID);
+	return groupID;
+}
+
+/* Group number of value: the result of getExponent_float, with every negative result folded to -1. */
+inline short computeGroupNum_float(float value)
+{
+	short group = getExponent_float(value);
+	/* all negative exponents share the single group -1 */
+	return (group < 0) ? -1 : group;
+}
+
+/* Group number of value: the result of getExponent_double, with every negative result folded to -1. */
+inline short computeGroupNum_double(double value)
+{
+	short group = getExponent_double(value);
+	/* all negative exponents share the single group -1 */
+	return (group < 0) ? -1 : group;
+}
+
+/**
+ * Push value onto the front of a three-entry history buffer.
+ * Entries move one slot towards index 2; the former last3CmprsData[2] is dropped.
+ * */
+inline void listAdd_double(double last3CmprsData[3], double value)
+{
+	int slot;
+	for(slot=2;slot>0;slot--)
+		last3CmprsData[slot] = last3CmprsData[slot-1];
+	last3CmprsData[0] = value;
+}
+
+inline void listAdd_float(float last3CmprsData[3], float value) /* push value to the front of the 3-entry history */
+{
+	last3CmprsData[2] = last3CmprsData[1]; /* the previous entry at index 2 is overwritten */
+	last3CmprsData[1] = last3CmprsData[0];
+	last3CmprsData[0] = value; /* newest entry lives at index 0 */
+}
+
+inline void listAdd_int(int64_t last3CmprsData[3], int64_t value) /* push value to the front of the 3-entry history */
+{
+	last3CmprsData[2] = last3CmprsData[1]; /* the previous entry at index 2 is overwritten */
+	last3CmprsData[1] = last3CmprsData[0];
+	last3CmprsData[0] = value; /* newest entry lives at index 0 */
+}
+
+inline void listAdd_int32(int32_t last3CmprsData[3], int32_t value) /* push value to the front of the 3-entry history */
+{
+	last3CmprsData[2] = last3CmprsData[1]; /* the previous entry at index 2 is overwritten */
+	last3CmprsData[1] = last3CmprsData[0];
+	last3CmprsData[0] = value; /* newest entry lives at index 0 */
+}
+
+/* Record decValue as the latest value of group groupNum (negative groups share slot 0), mark the
+ * slot as used, and write the signed group ID for oriValue into *curGroupID. */
+inline void listAdd_float_group(float *groups, int *flags, char groupNum, float oriValue, float decValue, char* curGroupID)
+{
+	int slot = groupNum;
+	int shiftedID = groupNum+2; //[-1,0,1,2,3,....,16] is mapped to [1,2,....,18]
+
+	if(slot<0)
+	{
+		slot = 0;
+		flags[0] = 1;
+	}
+	else if(flags[slot]==0)
+		flags[slot] = 1;
+	groups[slot] = decValue;
+
+	//non-negative originals get +ID, everything else gets -ID
+	*curGroupID = (oriValue>=0) ? shiftedID : -shiftedID;
+}
+
+/* Record decValue as the latest value of group groupNum (negative groups share slot 0), mark the
+ * slot as used, and write the signed group ID for oriValue into *curGroupID. */
+inline void listAdd_double_group(double *groups, int *flags, char groupNum, double oriValue, double decValue, char* curGroupID)
+{
+	int slot = groupNum;
+	int shiftedID = groupNum+2; //[-1,0,1,2,3,....,16] is mapped to [1,2,....,18]
+
+	if(slot<0)
+	{
+		slot = 0;
+		flags[0] = 1;
+	}
+	else if(flags[slot]==0)
+		flags[slot] = 1;
+	groups[slot] = decValue;
+
+	//non-negative originals get +ID, everything else gets -ID
+	*curGroupID = (oriValue>=0) ? shiftedID : -shiftedID;
+}
+
+/**
+ * Tell whether a prediction error is within the allowed precision.
+ *
+ * @param minErr prediction error to test
+ * @param precision largest acceptable error
+ * @return 1 when minErr <= precision, 0 otherwise
+ * */
+inline int validPrediction_double(double minErr, double precision)
+{
+	return (minErr<=precision) ? 1 : 0;
+}
+
+/* Single-precision check: 1 when minErr <= precision, 0 otherwise. */
+inline int validPrediction_float(float minErr, float precision)
+{
+	int withinBound = (minErr<=precision);
+
+	return withinBound ? 1 : 0;
+}
+
+/**
+ * Allocate and fill the per-group error bounds.
+ * For group i the point-wise bound is 2^i * pwrErrBound. It is combined with
+ * realPrecision by taking the smaller one for the *_AND_PW_REL modes, the larger
+ * one for the *_OR_PW_REL modes, and used alone for PW_REL. For any other mode
+ * the entries are left uninitialized.
+ *
+ * @return malloc'ed array of GROUP_COUNT doubles
+ * */
+double* generateGroupErrBounds(int errorBoundMode, double realPrecision, double pwrErrBound)
+{
+	double* bounds = (double*)malloc(GROUP_COUNT*sizeof(double));
+	int g;
+	for(g=0;g<GROUP_COUNT;g++)
+	{
+		double pwrError = ((double)pow(2, g))*pwrErrBound;
+		int pwrIsSmaller = pwrError<realPrecision;
+		if(errorBoundMode==ABS_AND_PW_REL || errorBoundMode==REL_AND_PW_REL)
+			bounds[g] = pwrIsSmaller?pwrError:realPrecision;
+		else if(errorBoundMode==ABS_OR_PW_REL || errorBoundMode==REL_OR_PW_REL)
+			bounds[g] = pwrIsSmaller?realPrecision:pwrError;
+		else if(errorBoundMode==PW_REL)
+			bounds[g] = pwrError;
+	}
+	return bounds;
+}
+
+/* Largest interval count over all groups, where group i contributes (int)(2^i/groupErrBounds[i] + 0.5). */
+int generateGroupMaxIntervalCount(double* groupErrBounds)
+{
+	int best = 0;
+	int g;
+	for(g=0;g<GROUP_COUNT;g++)
+	{
+		int candidate = (int)(pow(2, g)/groupErrBounds[g] + 0.5);
+		if(candidate>best)
+			best = candidate;
+	}
+	return best;
+}
+
+void new_LossyCompressionElement(LossyCompressionElement *lce, int leadingNum, unsigned char* intMidBytes, 
+int intMidBytes_Length, int resiMidBitsLength, int resiBits) //fills the caller-provided lce; nothing is allocated here
+{
+	lce->leadingZeroBytes = leadingNum; //0,1,2,or 3
+	memcpy(lce->integerMidBytes,intMidBytes,intMidBytes_Length); //NOTE(review): length is not checked against the size of integerMidBytes
+	lce->integerMidBytes_Length = intMidBytes_Length; //they are mid_bits actually
+	lce->resMidBitsLength = resiMidBitsLength; //number of valid bits in residualMidBits
+	lce->residualMidBits = resiBits;
+}
+
+void updateLossyCompElement_Double(unsigned char* curBytes, unsigned char* preBytes, 
+		int reqBytesLength, int resiBitsLength,  LossyCompressionElement *lce) //refreshes lce from the bytes of the current value
+{
+	int resiIndex, intMidBytes_Length = 0;
+	int leadingNum = compIdenticalLeadingBytesCount_double(preBytes, curBytes); //presumably the count of leading bytes shared with the previous value -- confirm against the helper
+	int fromByteIndex = leadingNum;
+	int toByteIndex = reqBytesLength; //exclusive end: traverse with "< toByteIndex"
+	if(fromByteIndex < toByteIndex)
+	{
+		intMidBytes_Length = reqBytesLength - leadingNum; //bytes after the shared prefix, up to the required length
+		memcpy(lce->integerMidBytes, &(curBytes[fromByteIndex]), intMidBytes_Length);
+	}
+	int resiBits = 0;
+	if(resiBitsLength!=0)
+	{
+		resiIndex = reqBytesLength; //the residual bits come from the byte right after the required bytes
+		if(resiIndex < 8)
+			resiBits = (curBytes[resiIndex] & 0xFF) >> (8-resiBitsLength); //keep the top resiBitsLength bits of that byte
+	}
+	lce->leadingZeroBytes = leadingNum;
+	lce->integerMidBytes_Length = intMidBytes_Length;
+	lce->resMidBitsLength = resiBitsLength;
+	lce->residualMidBits = resiBits;
+}
+
+inline void updateLossyCompElement_Float(unsigned char* curBytes, unsigned char* preBytes, 
+		int reqBytesLength, int resiBitsLength,  LossyCompressionElement *lce) //refreshes lce from the bytes of the current value
+{
+	int resiIndex, intMidBytes_Length = 0;
+	int leadingNum = compIdenticalLeadingBytesCount_float(preBytes, curBytes); //presumably the count of leading bytes shared with the previous value -- confirm against the helper
+	int fromByteIndex = leadingNum;
+	int toByteIndex = reqBytesLength; //exclusive end: traverse with "< toByteIndex"
+	if(fromByteIndex < toByteIndex)
+	{
+		intMidBytes_Length = reqBytesLength - leadingNum; //bytes after the shared prefix, up to the required length
+		memcpy(lce->integerMidBytes, &(curBytes[fromByteIndex]), intMidBytes_Length);
+	}
+	int resiBits = 0;
+	if(resiBitsLength!=0)
+	{
+		resiIndex = reqBytesLength; //the residual bits come from the byte right after the required bytes
+		if(resiIndex < 8) //NOTE(review): same limit of 8 as the double variant; confirm curBytes really holds that many bytes for float data
+			resiBits = (curBytes[resiIndex] & 0xFF) >> (8-resiBitsLength); //keep the top resiBitsLength bits of that byte
+	}
+	lce->leadingZeroBytes = leadingNum;
+	lce->integerMidBytes_Length = intMidBytes_Length;
+	lce->resMidBitsLength = resiBitsLength;
+	lce->residualMidBits = resiBits;
+}
+
+#pragma GCC diagnostic pop
diff --git a/deps/SZ/sz/src/DynamicByteArray.c b/deps/SZ/sz/src/DynamicByteArray.c
new file mode 100644
index 0000000000000000000000000000000000000000..64b7d5c7b4a59bd7682b912ce0eca0bdc5c50241
--- /dev/null
+++ b/deps/SZ/sz/src/DynamicByteArray.c
@@ -0,0 +1,68 @@
+/**
+ *  @file DynamicByteArray.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Dynamic Byte Array
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "DynamicByteArray.h"
+
+void new_DBA(DynamicByteArray **dba, size_t cap) { /* create an empty array with room for cap bytes */
+	DynamicByteArray *fresh = *dba = (DynamicByteArray *)malloc(sizeof(DynamicByteArray));
+	fresh->array = (unsigned char*)malloc(sizeof(unsigned char)*cap);
+	fresh->capacity = cap;
+	fresh->size = 0;
+}
+
+/* Copy the used part of dba into a newly malloc'ed buffer stored in *bytes; *bytes is NULL when dba is empty. */
+void convertDBAtoBytes(DynamicByteArray *dba, unsigned char** bytes)
+{
+	size_t size = dba->size;
+	*bytes = (size>0) ? (unsigned char*)malloc(size * sizeof(unsigned char)) : NULL;
+	/* memcpy must not be handed a NULL destination, even for 0 bytes */
+	if(*bytes != NULL)
+		memcpy(*bytes, dba->array, size*sizeof(unsigned char));
+}
+
+void free_DBA(DynamicByteArray *dba) /* release the byte buffer and then the container itself */
+{
+	free(dba->array);
+	free(dba); /* dba must not be used after this call */
+}
+
+/* Bounds-checked read of the byte at pos. An out-of-range pos prints an error
+ * message and terminates the process through exit(0). */
+inline unsigned char getDBA_Data(DynamicByteArray *dba, size_t pos)
+{
+	if(pos<dba->size)
+		return dba->array[pos];
+	printf("Error: wrong position of DBA (impossible case unless bugs elsewhere in the code?).\n");
+	exit(0);
+}
+
+/* Append one byte; a full array doubles its capacity (a capacity of 0 grows to 1 instead of staying 0). */
+inline void addDBA_Data(DynamicByteArray *dba, unsigned char value)
+{
+	if(dba->size==dba->capacity)
+	{
+		dba->capacity = dba->capacity ? dba->capacity << 1 : 1;
+		dba->array = (unsigned char *)realloc(dba->array, dba->capacity*sizeof(unsigned char));
+	}
+	dba->array[dba->size++] = value;
+}
+
+/* Append length bytes from data, growing the array to exactly size+length when it is too small. */
+inline void memcpyDBA_Data(DynamicByteArray *dba, unsigned char* data, size_t length) {
+	size_t needed = dba->size + length;
+	if(needed > dba->capacity) {
+		dba->capacity = needed;
+		dba->array = (unsigned char *)realloc(dba->array, needed*sizeof(unsigned char));
+	}
+	memcpy(dba->array + dba->size, data, length);
+	dba->size = needed;
+}
diff --git a/deps/SZ/sz/src/DynamicDoubleArray.c b/deps/SZ/sz/src/DynamicDoubleArray.c
new file mode 100644
index 0000000000000000000000000000000000000000..54bbb109aaa500e6412357f5504e1616e76ed03f
--- /dev/null
+++ b/deps/SZ/sz/src/DynamicDoubleArray.c
@@ -0,0 +1,57 @@
+/**
+ *  @file DynamicDoubleArray.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Dynamic Double Array
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "DynamicDoubleArray.h"
+
+void new_DDA(DynamicDoubleArray **dda, size_t cap) { /* create an empty array with room for cap doubles */
+	DynamicDoubleArray *fresh = *dda = (DynamicDoubleArray *)malloc(sizeof(DynamicDoubleArray));
+	fresh->array = (double*)malloc(sizeof(double)*cap);
+	fresh->capacity = cap;
+	fresh->size = 0;
+}
+
+/* Copy the used part of dba into a newly malloc'ed buffer stored in *data; *data is NULL when dba is empty. */
+void convertDDAtoDoubles(DynamicDoubleArray *dba, double **data)
+{
+	size_t size = dba->size;
+	*data = (size>0) ? (double*)malloc(size * sizeof(double)) : NULL;
+	/* memcpy must not be handed a NULL destination, even for 0 bytes */
+	if(*data != NULL)
+		memcpy(*data, dba->array, size*sizeof(double));
+}
+
+void free_DDA(DynamicDoubleArray *dda) /* release the value buffer and then the container itself */
+{
+	free(dda->array);
+	free(dda); /* dda must not be used after this call */
+}
+
+/* Bounds-checked read of the double at pos. An out-of-range pos prints an error
+ * message (naming DDA, not the copy-pasted DIA) and terminates through exit(0). */
+double getDDA_Data(DynamicDoubleArray *dda, size_t pos)
+{
+	if(pos<dda->size)
+		return dda->array[pos];
+	printf("Error: wrong position of DDA.\n");
+	exit(0);
+}
+
+/* Append one double; a full array doubles its capacity (a capacity of 0 grows to 1 instead of staying 0). */
+void addDDA_Data(DynamicDoubleArray *dda, double value)
+{
+	if(dda->size==dda->capacity)
+	{
+		dda->capacity = dda->capacity ? dda->capacity * 2 : 1;
+		dda->array = (double *)realloc(dda->array, dda->capacity*sizeof(double));
+	}
+	dda->array[dda->size++] = value;
+}
diff --git a/deps/SZ/sz/src/DynamicFloatArray.c b/deps/SZ/sz/src/DynamicFloatArray.c
new file mode 100644
index 0000000000000000000000000000000000000000..1a80a4888f79998b706d318fadd79485a3f19ca4
--- /dev/null
+++ b/deps/SZ/sz/src/DynamicFloatArray.c
@@ -0,0 +1,57 @@
+/**
+ *  @file DynamicFloatArray.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Dynamic Float Array
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "DynamicFloatArray.h"
+
+void new_DFA(DynamicFloatArray **dfa, size_t cap) { /* create an empty array with room for cap floats */
+	DynamicFloatArray *fresh = *dfa = (DynamicFloatArray *)malloc(sizeof(DynamicFloatArray));
+	fresh->array = (float*)malloc(sizeof(float)*cap);
+	fresh->capacity = cap;
+	fresh->size = 0;
+}
+
+/* Copy the used part of dfa into a newly malloc'ed buffer stored in *data; *data is NULL when dfa is empty. */
+void convertDFAtoFloats(DynamicFloatArray *dfa, float **data)
+{
+	size_t size = dfa->size;
+	*data = (size>0) ? (float*)malloc(size * sizeof(float)) : NULL;
+	/* memcpy must not be handed a NULL destination, even for 0 bytes */
+	if(*data != NULL)
+		memcpy(*data, dfa->array, size*sizeof(float));
+}
+
+void free_DFA(DynamicFloatArray *dfa) /* release the value buffer and then the container itself */
+{
+	free(dfa->array);
+	free(dfa); /* dfa must not be used after this call */
+}
+
+/* Bounds-checked read of the float at pos. An out-of-range pos prints an error
+ * message (naming DFA, not the copy-pasted DIA) and terminates through exit(0). */
+float getDFA_Data(DynamicFloatArray *dfa, size_t pos)
+{
+	if(pos<dfa->size)
+		return dfa->array[pos];
+	printf("Error: wrong position of DFA.\n");
+	exit(0);
+}
+
+/* Append one float; a full array doubles its capacity (a capacity of 0 grows to 1 instead of staying 0). */
+void addDFA_Data(DynamicFloatArray *dfa, float value)
+{
+	if(dfa->size==dfa->capacity)
+	{
+		dfa->capacity = dfa->capacity ? dfa->capacity * 2 : 1;
+		dfa->array = (float *)realloc(dfa->array, dfa->capacity*sizeof(float));
+	}
+	dfa->array[dfa->size++] = value;
+}
diff --git a/deps/SZ/sz/src/DynamicIntArray.c b/deps/SZ/sz/src/DynamicIntArray.c
new file mode 100644
index 0000000000000000000000000000000000000000..347e3a18080b53b0ce10890728f61262ddeee1b2
--- /dev/null
+++ b/deps/SZ/sz/src/DynamicIntArray.c
@@ -0,0 +1,57 @@
+/**
+ *  @file DynamicIntArray.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Dynamic Int Array
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "DynamicIntArray.h"
+
+void new_DIA(DynamicIntArray **dia, size_t cap) { /* create an empty array with room for cap one-byte entries */
+	DynamicIntArray *fresh = *dia = (DynamicIntArray *)malloc(sizeof(DynamicIntArray));
+	fresh->array = (unsigned char*)malloc(sizeof(unsigned char)*cap);
+	fresh->capacity = cap;
+	fresh->size = 0;
+}
+
+/* Copy the used part of dia into a newly malloc'ed buffer stored in *data; *data is NULL when dia is empty. */
+void convertDIAtoInts(DynamicIntArray *dia, unsigned char **data)
+{
+	size_t size = dia->size;
+	*data = (size>0) ? (unsigned char*)malloc(size * sizeof(char)) : NULL;
+	/* memcpy must not be handed a NULL destination, even for 0 bytes */
+	if(*data != NULL)
+		memcpy(*data, dia->array, size*sizeof(unsigned char));
+}
+
+void free_DIA(DynamicIntArray *dia) /* release the value buffer and then the container itself */
+{
+	free(dia->array);
+	free(dia); /* dia must not be used after this call */
+}
+
+/* Bounds-checked read of the entry at pos, widened to int. An out-of-range pos
+ * prints an error message and terminates the process through exit(0). */
+int getDIA_Data(DynamicIntArray *dia, size_t pos)
+{
+	if(pos<dia->size)
+		return dia->array[pos];
+	printf("Error: wrong position of DIA.\n");
+	exit(0);
+}
+
+/* Append value truncated to one byte; a full array doubles its capacity (0 grows to 1 instead of staying 0). */
+inline void addDIA_Data(DynamicIntArray *dia, int value)
+{
+	if(dia->size==dia->capacity)
+	{
+		dia->capacity = dia->capacity ? dia->capacity << 1 : 1;
+		dia->array = (unsigned char *)realloc(dia->array, dia->capacity*sizeof(unsigned char));
+	}
+	dia->array[dia->size++] = (unsigned char)value;
+}
diff --git a/deps/SZ/sz/src/Huffman.c b/deps/SZ/sz/src/Huffman.c
new file mode 100644
index 0000000000000000000000000000000000000000..f0f95134a1b0c2428a60860016e4767327562035
--- /dev/null
+++ b/deps/SZ/sz/src/Huffman.c
@@ -0,0 +1,932 @@
+/**
+ *  @file Huffman.c
+ *  @author Sheng Di
+ *  @date Aug., 2016
+ *  @brief Customized Huffman Encoding, Compression and Decompression functions
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "Huffman.h"
+#include "sz.h"
+
+
+/**
+ * Allocate a Huffman tree with zero-initialised working storage.
+ * @param stateNum number of entries reserved in the code and cout tables
+ * @return the new tree; release it with SZ_ReleaseHuffman
+ * */
+HuffmanTree* createHuffmanTree(int stateNum)
+{
+	HuffmanTree *tree = (HuffmanTree*)calloc(1, sizeof(HuffmanTree));
+	tree->stateNum = stateNum;
+	tree->allNodes = 2*stateNum;
+
+	//node pool and queue each hold 2*allNodes entries; code and cout hold stateNum
+	tree->pool = (struct node_t*)calloc(tree->allNodes*2, sizeof(struct node_t));
+	tree->qqq = (node*)calloc(tree->allNodes*2, sizeof(node));
+	tree->code = (unsigned long**)calloc(tree->stateNum, sizeof(unsigned long*));
+	tree->cout = (unsigned char *)calloc(tree->stateNum, sizeof(unsigned char));
+
+	//qq is qqq shifted back by one slot, so qq[1] is the first queue entry
+	tree->qq = tree->qqq - 1;
+	tree->n_nodes = 0;
+	tree->n_inode = 0;
+	tree->qend = 1;
+
+	return tree;
+}
+
+/**
+ * Create a Huffman tree sized for the default range radius (32768),
+ * i.e. with 65536 states.
+ * */
+HuffmanTree* createDefaultHuffmanTree()
+{
+	const int defaultRangeRadius = 32768;
+
+	return createHuffmanTree(defaultRangeRadius * 2);
+}
+ 
+/**
+ * Take the next free node from the pool and fill it in.
+ * A non-zero freq creates a leaf (t = 1) carrying symbol c; a zero freq
+ * creates an internal node (t = 0) with children a and b whose frequency
+ * is the sum of the children's frequencies.
+ * */
+node new_node(HuffmanTree* huffmanTree, size_t freq, unsigned int c, node a, node b)
+{
+	node created = &huffmanTree->pool[huffmanTree->n_nodes];
+	huffmanTree->n_nodes++;
+
+	if (freq == 0)
+	{
+		created->left = a;
+		created->right = b;
+		created->freq = a->freq + b->freq;
+		created->t = 0;
+		return created;
+	}
+
+	created->c = c;
+	created->freq = freq;
+	created->t = 1;
+	return created;
+}
+ 
+/**
+ * Take the next free node from the pool and set only its symbol c and
+ * its leaf flag t; the other fields are left as they are in the pool.
+ * */
+node new_node2(HuffmanTree *huffmanTree, unsigned int c, unsigned char t)
+{
+	node created = huffmanTree->pool + huffmanTree->n_nodes++;
+	created->c = c;
+	created->t = t;
+	return created;
+}
+ 
+/* priority queue */
+/**
+ * Insert node n into the 1-based min-heap qq, ordered by freq.
+ * Parents with a larger freq are moved down until n's slot is found.
+ * */
+void qinsert(HuffmanTree *huffmanTree, node n)
+{
+	int slot = huffmanTree->qend++;
+	int parent;
+
+	for (parent = slot / 2; parent != 0; parent = slot / 2)
+	{
+		if (huffmanTree->qq[parent]->freq <= n->freq)
+			break;
+		huffmanTree->qq[slot] = huffmanTree->qq[parent];
+		slot = parent;
+	}
+	huffmanTree->qq[slot] = n;
+}
+ 
+/**
+ * Remove and return the node with the smallest freq from the heap qq.
+ * @return the removed node, or 0 when the queue is empty
+ * */
+node qremove(HuffmanTree* huffmanTree)
+{
+	int parent = 1, child;
+	node top, swapped;
+
+	if (huffmanTree->qend < 2)
+		return 0;
+
+	top = huffmanTree->qq[1];
+	huffmanTree->qend --;
+	//move the last entry to the root, then sift it down
+	huffmanTree->qq[1] = huffmanTree->qq[huffmanTree->qend];
+
+	for (child = 2; child < huffmanTree->qend; child = parent * 2)
+	{
+		//choose the smaller of the two children
+		if (child + 1 < huffmanTree->qend && huffmanTree->qq[child + 1]->freq < huffmanTree->qq[child]->freq)
+			child++;
+		if (huffmanTree->qq[parent]->freq <= huffmanTree->qq[child]->freq)
+			break;
+
+		swapped = huffmanTree->qq[parent];
+		huffmanTree->qq[parent] = huffmanTree->qq[child];
+		huffmanTree->qq[child] = swapped;
+		parent = child;
+	}
+
+	return top;
+}
+ 
+/* walk the tree and put 0s and 1s */
+/**
+ * Recursively assign a bit code to every leaf below n.
+ * For each leaf, code[c] gets a two-word buffer holding the code
+ * left-aligned (first bit in the top bit of word 0) and cout[c] gets the
+ * code length in bits.
+ * @param n the subtree to walk; pass the root to start
+ * @param len number of bits accumulated so far; pass 0 to start
+ * @param out1 the first 64 code bits accumulated so far; pass 0 to start
+ * @param out2 the code bits beyond the 64th; pass 0 to start
+ * */
+void build_code(HuffmanTree *huffmanTree, node n, int len, unsigned long out1, unsigned long out2)
+{
+	if (n->t) {
+		huffmanTree->code[n->c] = (unsigned long*)malloc(2*sizeof(unsigned long));
+		if(len==0)
+		{
+			//the root itself is a leaf (single symbol): the code is empty,
+			//and shifting by 64 - len == 64 would be undefined
+			(huffmanTree->code[n->c])[0] = 0;
+			(huffmanTree->code[n->c])[1] = out2;
+		}
+		else if(len<=64)
+		{
+			(huffmanTree->code[n->c])[0] = out1 << (64 - len);
+			(huffmanTree->code[n->c])[1] = out2;
+		}
+		else
+		{
+			(huffmanTree->code[n->c])[0] = out1;
+			(huffmanTree->code[n->c])[1] = out2 << (128 - len);
+		}
+		huffmanTree->cout[n->c] = (unsigned char)len;
+		return;
+	}
+	int index = len >> 6; //=len/64
+	if(index == 0)
+	{
+		//still within the first word: append the next bit to out1
+		out1 = out1 << 1;
+		out1 = out1 | 0;
+		build_code(huffmanTree, n->left, len + 1, out1, 0);
+		out1 = out1 | 1;
+		build_code(huffmanTree, n->right, len + 1, out1, 0);		
+	}
+	else
+	{
+		//first word is full: append to out2 (no shift for its very first bit)
+		if(len%64!=0)
+			out2 = out2 << 1;
+		out2 = out2 | 0;
+		build_code(huffmanTree, n->left, len + 1, out1, out2);
+		out2 = out2 | 1;
+		build_code(huffmanTree, n->right, len + 1, out1, out2);	
+	}
+}
+
+/**
+ * Compute the frequency of the data and build the Huffman tree
+ * @param HuffmanTree* huffmanTree (output)
+ * @param int *s (input) symbols; each value is used as an index into a
+ *        table of huffmanTree->allNodes counters
+ * @param size_t length (input) number of symbols in s; with 0 symbols
+ *        nothing is queued and no code is built
+ * */
+void init(HuffmanTree* huffmanTree, int *s, size_t length)
+{
+	size_t i, index;
+	size_t *freq = (size_t *)malloc(huffmanTree->allNodes*sizeof(size_t));
+	memset(freq, 0, huffmanTree->allNodes*sizeof(size_t));
+	for(i = 0;i < length;i++)
+	{
+		index = s[i];
+		freq[index]++;
+	}
+
+	//one leaf per symbol that actually occurs
+	for (i = 0; i < huffmanTree->allNodes; i++)
+		if (freq[i])
+			qinsert(huffmanTree, new_node(huffmanTree, freq[i], i, 0, 0));
+	free(freq);
+
+	//empty queue (e.g. length == 0): there is no root, so qq[1] must not be walked
+	if (huffmanTree->qend < 2)
+		return;
+
+	//merge the two least frequent nodes until a single root remains.
+	//NOTE: both qremove() calls are arguments of one call, so C does not
+	//specify which of them runs first.
+	while (huffmanTree->qend > 2)
+		qinsert(huffmanTree, new_node(huffmanTree, 0, 0, qremove(huffmanTree), qremove(huffmanTree)));
+
+	build_code(huffmanTree, huffmanTree->qq[1], 0, 0, 0);
+}
+
+/**
+ * Rebuild the codes from whatever nodes are already in the queue.
+ * The frequency table is allocated and zeroed but never filled, so this
+ * function queues no new leaves; s and length are not read.
+ * NOTE(review): presumably the frequencies were meant to come from a
+ * fixed table -- confirm the intended behaviour.
+ * @param HuffmanTree* huffmanTree (input/output)
+ * @param int *s (unused)
+ * @param size_t length (unused)
+ * */
+void init_static(HuffmanTree* huffmanTree, int *s, size_t length)
+{
+	size_t i;
+	size_t *freq = (size_t *)malloc(huffmanTree->allNodes*sizeof(size_t));
+	memset(freq, 0, huffmanTree->allNodes*sizeof(size_t));
+
+	(void)s;
+	(void)length;
+
+	for (i = 0; i < huffmanTree->allNodes; i++)
+		if (freq[i])
+			qinsert(huffmanTree, new_node(huffmanTree, freq[i], i, 0, 0));
+	free(freq);
+
+	//empty queue: there is no root, so qq[1] must not be walked
+	if (huffmanTree->qend < 2)
+		return;
+
+	while (huffmanTree->qend > 2)
+		qinsert(huffmanTree, new_node(huffmanTree, 0, 0, qremove(huffmanTree), qremove(huffmanTree)));
+
+	build_code(huffmanTree, huffmanTree->qq[1], 0, 0, 0);
+}
+ 
+/**
+ * Pack the Huffman codes of the symbols in s back to back into out.
+ * Uses the tables filled by build_code: cout[state] is the code length in
+ * bits and code[state] holds the code left-aligned in two words.
+ * @param huffmanTree tree whose code/cout tables are already built
+ * @param s (input) symbols to encode
+ * @param length (input) number of symbols in s
+ * @param out (output) destination buffer; longToBytes_bigEndian is called at
+ *        the current write position for every symbol -- presumably an 8-byte
+ *        store, so out needs slack beyond the packed size; confirm
+ * @param outSize (input/output) the number of bytes used is ADDED to
+ *        *outSize, so the caller must set it to 0 beforehand
+ * */
+void encode(HuffmanTree *huffmanTree, int *s, size_t length, unsigned char *out, size_t *outSize)
+{
+	size_t i = 0;
+	unsigned char bitSize = 0, byteSize, byteSizep;
+	int state;
+	unsigned char *p = out; //current write position
+	int lackBits = 0; //number of still-unused low bits in the byte at p
+	//long totalBitSize = 0, maxBitSize = 0, bitSize21 = 0, bitSize32 = 0;
+	for (i = 0;i<length;i++) 
+	{
+		state = s[i];
+		bitSize = huffmanTree->cout[state];	
+		
+		//printf("%d %d : %d %u\n",i, state, bitSize, (code[state])[0] >> (64-cout[state])); 
+		//debug: compute the average bitSize and the count that is over 32... 	
+		/*if(bitSize>=21)
+			bitSize21++;
+		if(bitSize>=32)
+			bitSize32++;
+		if(maxBitSize<bitSize)
+			maxBitSize = bitSize;
+		totalBitSize+=bitSize;*/
+
+		if(lackBits==0)
+		{
+			//the write position is byte-aligned: store the code words directly
+			byteSize = bitSize%8==0 ? bitSize/8 : bitSize/8+1; //it's equal to the number of bytes involved (for *outSize)
+			byteSizep = bitSize/8; //it's used to move the pointer p for next data
+			if(byteSize<=8)
+			{
+				longToBytes_bigEndian(p, (huffmanTree->code[state])[0]);
+				p += byteSizep;
+			}
+			else //byteSize>8
+			{
+				longToBytes_bigEndian(p, (huffmanTree->code[state])[0]);
+				p += 8;
+				longToBytes_bigEndian(p, (huffmanTree->code[state])[1]);
+				p += (byteSizep - 8);
+			}
+			*outSize += byteSize;
+			lackBits = bitSize%8==0 ? 0 : 8 - bitSize%8;
+		}
+		else
+		{
+			//fill the free low bits of the current byte with the top bits of the code
+			*p = (*p) | (unsigned char)((huffmanTree->code[state])[0] >> (64 - lackBits));
+			if(lackBits < bitSize)
+			{
+				//the code does not fit in the current byte: continue in the next one
+				p++;
+				//(*outSize)++;
+				long newCode = (huffmanTree->code[state])[0] << lackBits;
+				longToBytes_bigEndian(p, newCode);
+
+				if(bitSize<=64)
+				{
+					bitSize -= lackBits;
+					byteSize = bitSize%8==0 ? bitSize/8 : bitSize/8+1;
+					byteSizep = bitSize/8;
+					p += byteSizep;
+					(*outSize)+=byteSize;
+					lackBits = bitSize%8==0 ? 0 : 8 - bitSize%8;
+				}
+				else //bitSize > 64
+				{
+					byteSizep = 7; //must be 7 bytes, because lackBits!=0
+					p+=byteSizep;
+					//NOTE(review): byteSize is not recomputed on this path, so the value
+					//added here is left over from an earlier symbol -- confirm intent
+					(*outSize)+=byteSize;
+
+					bitSize -= 64;
+					if(lackBits < bitSize)
+					{
+						*p = (*p) | (unsigned char)((huffmanTree->code[state])[0] >> (64 - lackBits));
+						p++;
+						//(*outSize)++;
+						newCode = (huffmanTree->code[state])[1] << lackBits;
+						longToBytes_bigEndian(p, newCode);
+						bitSize -= lackBits;
+						byteSize = bitSize%8==0 ? bitSize/8 : bitSize/8+1;
+						byteSizep = bitSize/8;
+						p += byteSizep;
+						(*outSize)+=byteSize;
+						lackBits = bitSize%8==0 ? 0 : 8 - bitSize%8;
+					}
+					else //lackBits >= bitSize
+					{
+						*p = (*p) | (unsigned char)((huffmanTree->code[state])[0] >> (64 - bitSize));
+						lackBits -= bitSize;
+					}
+				}
+			}
+			else //lackBits >= bitSize
+			{
+				//the whole code fitted into the current byte
+				lackBits -= bitSize;
+				if(lackBits==0)
+					p++;
+			}
+		}
+	}
+//	for(i=0;i<stateNum;i++)
+//		if(code[i]!=NULL) free(code[i]);
+	/*printf("max bitsize = %d\n", maxBitSize);
+	printf("bitSize21 ratio = %f\n", ((float)bitSize21)/length);
+	printf("bitSize32 ratio = %f\n", ((float)bitSize32)/length);
+	printf("avg bit size = %f\n", ((float)totalBitSize)/length);*/
+}
+ 
+/**
+ * Decode targetLength symbols by walking the tree one input bit at a time.
+ * Bits are taken from each byte of s starting with the most significant
+ * one; a 0 bit follows the left child and a 1 bit the right child.
+ * @param s (input) packed code bits
+ * @param targetLength number of symbols to produce
+ * @param t root of the Huffman tree
+ * @param out (output) receives targetLength decoded symbols
+ * */
+void decode(unsigned char *s, size_t targetLength, node t, int *out)
+{
+	size_t bitPos = 0, produced = 0;
+	node cur = t;
+
+	if(t->t) //root->t==1 means that all state values are the same (constant)
+	{
+		while(produced<targetLength)
+			out[produced++] = t->c;
+		return;
+	}
+
+	while(produced<targetLength)
+	{
+		unsigned char byte = s[bitPos>>3]; //bitPos/8
+		int bit = (byte >> (7-(int)(bitPos%8))) & 0x01;
+		bitPos++;
+
+		cur = bit==0 ? cur->left : cur->right;
+
+		if(cur->t)
+		{
+			//reached a leaf: emit its symbol and restart from the root
+			out[produced++] = cur->c;
+			cur = t;
+		}
+	}
+
+	if(cur != t) printf("garbage input\n");
+}
+
+/**
+ * Table-driven variant of decode().
+ * A lookup table indexed by the next maxBits input bits gives, for each
+ * prefix, either the decoded symbol and its code length or (for codes
+ * longer than maxBits) the tree node from which a bit-by-bit walk continues.
+ * @param s (input) packed code bits; the refill loop fetches whole bytes
+ *        until maxBits bits are buffered, so it may read bytes past the last
+ *        code -- presumably covered by the caller's buffer; confirm
+ * @param targetLength number of symbols to produce
+ * @param t root of the Huffman tree
+ * @param out (output) receives targetLength decoded symbols
+ * @param maxBits number of bits used to index the table; clamped to 16
+ * */
+void decode_MSST19(unsigned char *s, size_t targetLength, node t, int *out, int maxBits)
+{
+	size_t count = 0;
+	node n = t;
+
+	if(n->t) //root->t==1 means that all state values are the same (constant)
+	{
+		for(count=0;count<targetLength;count++)
+			out[count] = n->c;
+		return;
+	}
+
+	if(maxBits > 16){
+		maxBits = 16;
+	}
+
+	int tableSize = 1 << maxBits;
+	int* valueTable = (int*)malloc(tableSize * sizeof(int));
+	//one byte per entry: the element type is uint8_t, not int
+	uint8_t* lengthTable = (uint8_t*)malloc(tableSize * sizeof(uint8_t));
+	node* nodeTable = (node*)malloc(tableSize * sizeof(node));
+	uint32_t maskTable[maxBits+8];
+	int j;
+	for(uint32_t i=0; i<(uint32_t)tableSize; i++){
+		//walk the tree along the maxBits-bit pattern i, most significant bit first
+		n = t;
+		j = 0;
+		while(!n->t && j < maxBits){
+			uint32_t res = i >> (maxBits - j - 1);
+			if((res & 0x00000001) == 0){
+				n = n->left;
+			}else{
+				n = n->right;
+			}
+			j++;
+		}
+		if(!n->t){
+			//code longer than maxBits: remember where to continue walking
+			nodeTable[i] = n;
+			valueTable[i] = -1;
+			lengthTable[i] = maxBits;
+		}else{
+			valueTable[i] = n->c;
+			lengthTable[i] = j;
+		}
+	}
+	//maskTable[k] keeps the low (maxBits+8-k-1) bits of a value
+	for(int i=0; i<maxBits+8; i++){
+		maskTable[i] = (1 << (maxBits+8-i-1)) - 1;
+	}
+
+	int leftBits = 0; //number of not yet consumed bits held in currentValue
+	uint32_t currentValue = 0;
+	size_t i = 0;
+
+	while(count<targetLength)
+	{
+		//refill until at least maxBits bits are buffered
+		while(leftBits < maxBits){
+			currentValue = currentValue << 8;
+			currentValue += s[i];
+			leftBits += 8;
+			i++;
+		}
+
+		uint32_t index = currentValue >> (leftBits - maxBits);
+		int value = valueTable[index];
+		if(value != -1){
+			out[count] = value;
+			int bitLength = lengthTable[index];
+			leftBits -= bitLength;
+			//drop the bits that were just consumed
+			uint32_t avoidHeadMask = maskTable[maxBits + 8 - leftBits - 1];
+			currentValue = (currentValue & avoidHeadMask);
+			count++;
+		}else{
+			int bitLength = lengthTable[index];
+			leftBits -= bitLength;
+			n = nodeTable[index];
+			//finish the long code one bit at a time
+			while(!n->t){
+				if(!leftBits){
+					currentValue = currentValue << 8;
+					currentValue += s[i];
+					leftBits += 8;
+					i++;
+				}
+				if(((currentValue >> (leftBits - 1)) & 0x01) == 0)
+					n = n->left;
+				else
+					n = n->right;
+				leftBits--;
+			}
+			currentValue &= maskTable[maxBits + 8 - leftBits - 1];
+			out[count] = n->c;
+			count++;
+		}
+
+	}
+	free(valueTable);
+	free(lengthTable);
+	free(nodeTable);
+	return;
+}
+/**
+ * Flatten the subtree below root into parallel arrays, using one-byte links.
+ * Slot i receives root's symbol (C) and leaf flag (t); each existing child
+ * gets the next value of huffmanTree->n_inode as its slot, which is stored
+ * in L[i] or R[i] before the child is flattened recursively.
+ * */
+void pad_tree_uchar(HuffmanTree* huffmanTree, unsigned char* L, unsigned char* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	unsigned int childSlot;
+
+	C[i] = root->c;
+	t[i] = root->t;
+
+	if(root->left!=0)
+	{
+		childSlot = ++huffmanTree->n_inode;
+		L[i] = childSlot;
+		pad_tree_uchar(huffmanTree, L, R, C, t, childSlot, root->left);
+	}
+	if(root->right!=0)
+	{
+		childSlot = ++huffmanTree->n_inode;
+		R[i] = childSlot;
+		pad_tree_uchar(huffmanTree, L, R, C, t, childSlot, root->right);
+	}
+}
+
+/**
+ * Flatten the subtree below root into parallel arrays, using two-byte links.
+ * Slot i receives root's symbol (C) and leaf flag (t); each existing child
+ * gets the next value of huffmanTree->n_inode as its slot, which is stored
+ * in L[i] or R[i] before the child is flattened recursively.
+ * */
+void pad_tree_ushort(HuffmanTree* huffmanTree, unsigned short* L, unsigned short* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	unsigned int childSlot;
+
+	C[i] = root->c;
+	t[i] = root->t;
+
+	if(root->left!=0)
+	{
+		childSlot = ++huffmanTree->n_inode;
+		L[i] = childSlot;
+		pad_tree_ushort(huffmanTree, L, R, C, t, childSlot, root->left);
+	}
+	if(root->right!=0)
+	{
+		childSlot = ++huffmanTree->n_inode;
+		R[i] = childSlot;
+		pad_tree_ushort(huffmanTree, L, R, C, t, childSlot, root->right);
+	}
+}
+
+/**
+ * Flatten the subtree below root into parallel arrays, using four-byte links.
+ * Slot i receives root's symbol (C) and leaf flag (t); each existing child
+ * gets the next value of huffmanTree->n_inode as its slot, which is stored
+ * in L[i] or R[i] before the child is flattened recursively.
+ * */
+void pad_tree_uint(HuffmanTree* huffmanTree, unsigned int* L, unsigned int* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	unsigned int childSlot;
+
+	C[i] = root->c;
+	t[i] = root->t;
+
+	if(root->left!=0)
+	{
+		childSlot = ++huffmanTree->n_inode;
+		L[i] = childSlot;
+		pad_tree_uint(huffmanTree, L, R, C, t, childSlot, root->left);
+	}
+	if(root->right!=0)
+	{
+		childSlot = ++huffmanTree->n_inode;
+		R[i] = childSlot;
+		pad_tree_uint(huffmanTree, L, R, C, t, childSlot, root->right);
+	}
+}
+ 
+/**
+ * Serialise the Huffman tree rooted at qq[1] into a newly allocated buffer.
+ * Layout: one byte holding sysEndianType, then the left-link array, the
+ * right-link array, the symbol array (unsigned int) and the leaf-flag array
+ * (unsigned char), each with nodeCount entries. Links are stored as
+ * unsigned char when nodeCount<=256, unsigned short when nodeCount<=65536
+ * and unsigned int otherwise.
+ * @param nodeCount number of nodes in the tree
+ * @param out (output) receives the malloc'ed buffer; the caller frees it
+ * @return the size of the buffer in bytes
+ * */
+unsigned int convert_HuffTree_to_bytes_anyStates(HuffmanTree* huffmanTree, int nodeCount, unsigned char** out) 
+{
+	size_t linkWidth;
+	if(nodeCount<=256)
+		linkWidth = sizeof(unsigned char);
+	else if(nodeCount<=65536)
+		linkWidth = sizeof(unsigned short);
+	else //nodeCount>65536
+		linkWidth = sizeof(unsigned int);
+
+	size_t linkBytes = nodeCount*linkWidth;
+	size_t symBytes = nodeCount*sizeof(unsigned int);
+	size_t flagBytes = nodeCount*sizeof(unsigned char);
+
+	//zero-filled working arrays; a link value of 0 means "no child"
+	void* L = calloc(nodeCount, linkWidth);
+	void* R = calloc(nodeCount, linkWidth);
+	unsigned int* C = (unsigned int*)calloc(nodeCount, sizeof(unsigned int));
+	unsigned char* t = (unsigned char*)calloc(nodeCount, sizeof(unsigned char));
+
+	if(nodeCount<=256)
+		pad_tree_uchar(huffmanTree,(unsigned char*)L,(unsigned char*)R,C,t,0,huffmanTree->qq[1]);
+	else if(nodeCount<=65536)
+		pad_tree_ushort(huffmanTree,(unsigned short*)L,(unsigned short*)R,C,t,0,huffmanTree->qq[1]);
+	else
+		pad_tree_uint(huffmanTree,(unsigned int*)L,(unsigned int*)R,C,t,0,huffmanTree->qq[1]);
+
+	unsigned int totalSize = 1+2*linkBytes+symBytes+flagBytes;
+	unsigned char* cursor = (unsigned char*)malloc(totalSize);
+	*out = cursor;
+
+	*cursor = (unsigned char)sysEndianType;
+	cursor += 1;
+	memcpy(cursor, L, linkBytes);
+	cursor += linkBytes;
+	memcpy(cursor, R, linkBytes);
+	cursor += linkBytes;
+	memcpy(cursor, C, symBytes);
+	cursor += symBytes;
+	memcpy(cursor, t, flagBytes);
+
+	free(L);
+	free(R);
+	free(C);
+	free(t);
+	return totalSize;
+}
+
+/**
+ * Rebuild the subtree below root from the flattened arrays (one-byte links).
+ * For an internal node (t == 0), every non-zero link in L[i] / R[i] names
+ * the slot of a child, which is created with new_node2 and rebuilt
+ * recursively. Leaves are left untouched.
+ * */
+void unpad_tree_uchar(HuffmanTree* huffmanTree, unsigned char* L, unsigned char* R, unsigned int* C, unsigned char *t, unsigned int i, node root)
+{
+	if(root->t!=0)
+		return;
+
+	unsigned char leftSlot = L[i];
+	if(leftSlot!=0)
+	{
+		root->left = new_node2(huffmanTree,C[leftSlot],t[leftSlot]);
+		unpad_tree_uchar(huffmanTree,L,R,C,t,leftSlot,root->left);
+	}
+
+	unsigned char rightSlot = R[i];
+	if(rightSlot!=0)
+	{
+		root->right = new_node2(huffmanTree,C[rightSlot],t[rightSlot]);
+		unpad_tree_uchar(huffmanTree,L,R,C,t,rightSlot,root->right);
+	}
+}
+
+/**
+ * Rebuild the subtree below root from the flattened arrays (two-byte links).
+ * For an internal node (t == 0), every non-zero link in L[i] / R[i] names
+ * the slot of a child, which is created with new_node2 and rebuilt
+ * recursively. Leaves are left untouched.
+ * */
+void unpad_tree_ushort(HuffmanTree* huffmanTree, unsigned short* L, unsigned short* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	if(root->t!=0)
+		return;
+
+	unsigned short leftSlot = L[i];
+	if(leftSlot!=0)
+	{
+		root->left = new_node2(huffmanTree,C[leftSlot],t[leftSlot]);
+		unpad_tree_ushort(huffmanTree,L,R,C,t,leftSlot,root->left);
+	}
+
+	unsigned short rightSlot = R[i];
+	if(rightSlot!=0)
+	{
+		root->right = new_node2(huffmanTree,C[rightSlot],t[rightSlot]);
+		unpad_tree_ushort(huffmanTree,L,R,C,t,rightSlot,root->right);
+	}
+}
+
+/**
+ * Rebuild the subtree below root from the flattened arrays (four-byte links).
+ * For an internal node (t == 0), every non-zero link in L[i] / R[i] names
+ * the slot of a child, which is created with new_node2 and rebuilt
+ * recursively. Leaves are left untouched.
+ * */
+void unpad_tree_uint(HuffmanTree* huffmanTree, unsigned int* L, unsigned int* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	if(root->t!=0)
+		return;
+
+	unsigned int leftSlot = L[i];
+	if(leftSlot!=0)
+	{
+		root->left = new_node2(huffmanTree,C[leftSlot],t[leftSlot]);
+		unpad_tree_uint(huffmanTree,L,R,C,t,leftSlot,root->left);
+	}
+
+	unsigned int rightSlot = R[i];
+	if(rightSlot!=0)
+	{
+		root->right = new_node2(huffmanTree,C[rightSlot],t[rightSlot]);
+		unpad_tree_uint(huffmanTree,L,R,C,t,rightSlot,root->right);
+	}
+}
+
+/**
+ * Rebuild a Huffman tree from the byte layout written by
+ * convert_HuffTree_to_bytes_anyStates: one endian-type byte, then the
+ * left-link, right-link, symbol (unsigned int) and leaf-flag (unsigned char)
+ * arrays with nodeCount entries each. Links are unsigned char when
+ * nodeCount<=256, unsigned short when nodeCount<=65536, unsigned int otherwise.
+ * When the stored endian type differs from sysEndianType, the multi-byte
+ * entries are byte-swapped in private copies; bytes itself is not modified.
+ * @param huffmanTree tree whose node pool supplies the new nodes
+ * @param bytes (input) the serialised tree
+ * @param nodeCount number of nodes stored in bytes
+ * @return the root of the rebuilt tree
+ * */
+node reconstruct_HuffTree_from_bytes_anyStates(HuffmanTree *huffmanTree, unsigned char* bytes, int nodeCount)
+{
+	size_t linkWidth;
+	if(nodeCount<=256)
+		linkWidth = sizeof(unsigned char);
+	else if(nodeCount<=65536)
+		linkWidth = sizeof(unsigned short);
+	else //nodeCount>65536
+		linkWidth = sizeof(unsigned int);
+
+	size_t linkBytes = nodeCount*linkWidth;
+	size_t symBytes = nodeCount*sizeof(unsigned int);
+	size_t flagBytes = nodeCount*sizeof(unsigned char);
+	size_t k;
+
+	void* L = malloc(linkBytes);
+	void* R = malloc(linkBytes);
+	unsigned int* C = (unsigned int*)malloc(symBytes);
+	unsigned char* t = (unsigned char*)malloc(flagBytes);
+
+	memcpy(L, bytes+1, linkBytes);
+	memcpy(R, bytes+1+linkBytes, linkBytes);
+	memcpy(C, bytes+1+2*linkBytes, symBytes);
+	memcpy(t, bytes+1+2*linkBytes+symBytes, flagBytes);
+
+	if(bytes[0]!=(unsigned char)sysEndianType)
+	{
+		//each array is swapped over its own full range, one element at a time
+		if(nodeCount>65536)
+		{
+			for(k=0;k<linkBytes;k+=sizeof(unsigned int))
+			{
+				symTransform_4bytes((unsigned char*)L+k);
+				symTransform_4bytes((unsigned char*)R+k);
+			}
+		}
+		else if(nodeCount>256)
+		{
+			for(k=0;k<linkBytes;k+=sizeof(unsigned short))
+			{
+				symTransform_2bytes((unsigned char*)L+k);
+				symTransform_2bytes((unsigned char*)R+k);
+			}
+		}
+		for(k=0;k<symBytes;k+=sizeof(unsigned int))
+			symTransform_4bytes((unsigned char*)C+k);
+	}
+
+	node root;
+	if(nodeCount<=256)
+	{
+		//only this layout can hold a single-node tree, so the root takes its stored symbol and flag
+		root = new_node2(huffmanTree, C[0],t[0]);
+		unpad_tree_uchar(huffmanTree,(unsigned char*)L,(unsigned char*)R,C,t,0,root);
+	}
+	else if(nodeCount<=65536)
+	{
+		root = new_node2(huffmanTree,0,0);
+		unpad_tree_ushort(huffmanTree,(unsigned short*)L,(unsigned short*)R,C,t,0,root);
+	}
+	else
+	{
+		root = new_node2(huffmanTree,0,0);
+		unpad_tree_uint(huffmanTree,(unsigned int*)L,(unsigned int*)R,C,t,0,root);
+	}
+
+	free(L);
+	free(R);
+	free(C);
+	free(t);
+	return root;
+}
+
+/**
+ * Build the Huffman tree for s, then write header, tree and encoded data
+ * into a newly allocated buffer.
+ * Layout of *out: 4 bytes nodeCount, 4 bytes stateNum/2, the serialised
+ * tree, then the packed codes.
+ * @param huffmanTree tree to initialise and use
+ * @param s (input) symbols to encode
+ * @param length (input) number of symbols in s
+ * @param out (output) receives the malloc'ed buffer; the caller frees it
+ * @param outSize (output) number of bytes used in *out
+ * */
+void encode_withTree(HuffmanTree* huffmanTree, int *s, size_t length, unsigned char **out, size_t *outSize)
+{
+	size_t i; 
+	int nodeCount = 0;
+	unsigned char *treeBytes, buffer[4];
+	
+	init(huffmanTree, s, length);
+	//a tree with k leaves has 2k-1 nodes
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree,nodeCount, &treeBytes);
+	//printf("treeByteSize = %d\n", treeByteSize);
+
+	//8 header bytes + tree + sizeof(int) per symbol for the codes, plus 16
+	//bytes of slack because encode() stores whole code words at the write
+	//position (presumably 8 bytes per longToBytes_bigEndian call)
+	*out = (unsigned char*)malloc(8+treeByteSize+length*sizeof(int)+16);
+	intToBytes_bigEndian(buffer, nodeCount);
+	memcpy(*out, buffer, 4);
+	intToBytes_bigEndian(buffer, huffmanTree->stateNum/2); //real number of intervals
+	memcpy(*out+4, buffer, 4);
+	memcpy(*out+8, treeBytes, treeByteSize);
+	free(treeBytes);
+	size_t enCodeSize = 0;
+	encode(huffmanTree, s, length, *out+8+treeByteSize, &enCodeSize);
+	*outSize = 8+treeByteSize+enCodeSize;
+}
+
+/**
+ * Same as encode_withTree, but also reports the longest code length.
+ * Layout of *out: 4 bytes nodeCount, 4 bytes stateNum/2, the serialised
+ * tree, then the packed codes.
+ * @param huffmanTree tree to initialise and use
+ * @param s (input) symbols to encode
+ * @param length (input) number of symbols in s
+ * @param out (output) receives the malloc'ed buffer; the caller frees it
+ * @param outSize (output) number of bytes used in *out
+ * @return the largest cout[] value among the symbols that received a code
+ * */
+int encode_withTree_MSST19(HuffmanTree* huffmanTree, int *s, size_t length, unsigned char **out, size_t *outSize)
+{
+	size_t i;
+	int nodeCount = 0;
+	unsigned char *treeBytes, buffer[4];
+
+	init(huffmanTree, s, length);
+
+	//count the leaves and track the longest code
+	int maxBits = 0;
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]){
+			nodeCount++;
+			if(huffmanTree->cout[i] > maxBits) maxBits = huffmanTree->cout[i];
+		}
+	//a tree with k leaves has 2k-1 nodes
+	nodeCount = nodeCount*2-1;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree,nodeCount, &treeBytes);
+	//printf("treeByteSize = %d\n", treeByteSize);
+
+	//8 header bytes + tree + sizeof(int) per symbol for the codes, plus 16
+	//bytes of slack because encode() stores whole code words at the write
+	//position (presumably 8 bytes per longToBytes_bigEndian call)
+	*out = (unsigned char*)malloc(8+treeByteSize+length*sizeof(int)+16);
+	intToBytes_bigEndian(buffer, nodeCount);
+	memcpy(*out, buffer, 4);
+	intToBytes_bigEndian(buffer, huffmanTree->stateNum/2); //real number of intervals
+	memcpy(*out+4, buffer, 4);
+	memcpy(*out+8, treeBytes, treeByteSize);
+	free(treeBytes);
+	size_t enCodeSize = 0;
+	encode(huffmanTree, s, length, *out+8+treeByteSize, &enCodeSize);
+	*outSize = 8+treeByteSize+enCodeSize;
+	return maxBits;
+}
+
+/**
+ * Decode a buffer laid out as: 4 bytes nodeCount, 4 further header bytes,
+ * the serialised tree, then the packed codes.
+ * @param huffmanTree tree whose node pool is used to rebuild the stored tree
+ * @param s (input) the encoded buffer
+ * @param targetLength number of symbols to decode
+ * @param out (output) remember to allocate room for targetLength int values
+ *        beforehand
+ * */
+void decode_withTree(HuffmanTree* huffmanTree, unsigned char *s, size_t targetLength, int *out)
+{
+	size_t nodeCount = bytesToInt_bigEndian(s);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,s+8, nodeCount);
+
+	//width of one left/right link entry in the serialised tree
+	size_t linkWidth;
+	if(nodeCount<=256)
+		linkWidth = sizeof(unsigned char);
+	else if(nodeCount<=65536)
+		linkWidth = sizeof(unsigned short);
+	else
+		linkWidth = sizeof(unsigned int);
+
+	//endian byte + per node: two links, one symbol, one leaf flag
+	size_t encodeStartIndex = 1+nodeCount*(2*linkWidth+sizeof(unsigned int)+sizeof(unsigned char));
+
+	decode(s+8+encodeStartIndex, targetLength, root, out);
+}
+
+/**
+ * Table-driven counterpart of decode_withTree: rebuilds the stored tree and
+ * hands the packed codes to decode_MSST19.
+ * @param huffmanTree tree whose node pool is used to rebuild the stored tree
+ * @param s (input) the encoded buffer: 4 bytes nodeCount, 4 further header
+ *        bytes, the serialised tree, then the packed codes
+ * @param targetLength number of symbols to decode
+ * @param out (output) must have room for targetLength int values
+ * @param maxBits passed through to decode_MSST19
+ * */
+void decode_withTree_MSST19(HuffmanTree* huffmanTree, unsigned char *s, size_t targetLength, int *out, int maxBits)
+{
+	size_t nodeCount = bytesToInt_bigEndian(s);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,s+8, nodeCount);
+
+	//width of one left/right link entry in the serialised tree
+	size_t linkWidth;
+	if(nodeCount<=256)
+		linkWidth = sizeof(unsigned char);
+	else if(nodeCount<=65536)
+		linkWidth = sizeof(unsigned short);
+	else
+		linkWidth = sizeof(unsigned int);
+
+	//endian byte + per node: two links, one symbol, one leaf flag
+	size_t encodeStartIndex = 1+nodeCount*(2*linkWidth+sizeof(unsigned int)+sizeof(unsigned char));
+
+	decode_MSST19(s+8+encodeStartIndex, targetLength, root, out, maxBits);
+}
+
+/**
+ * Free a Huffman tree together with everything it owns: the node pool, the
+ * queue storage, every per-symbol code buffer, the code and cout tables and
+ * the tree structure itself.
+ * @param huffmanTree the tree to release; NULL is accepted and ignored.
+ *        The caller's pointer is dangling afterwards.
+ * */
+void SZ_ReleaseHuffman(HuffmanTree* huffmanTree)
+{
+	size_t i;
+	if(huffmanTree==NULL)
+		return;
+
+	free(huffmanTree->pool);
+	free(huffmanTree->qqq);
+	if(huffmanTree->code!=NULL)
+	{
+		//free(NULL) is a no-op, so unused slots need no test
+		for(i=0;i<huffmanTree->stateNum;i++)
+			free(huffmanTree->code[i]);
+		free(huffmanTree->code);
+	}
+	free(huffmanTree->cout);
+	free(huffmanTree);
+}
diff --git a/deps/SZ/sz/src/MultiLevelCacheTable.c b/deps/SZ/sz/src/MultiLevelCacheTable.c
new file mode 100644
index 0000000000000000000000000000000000000000..ce16b7c661cdeb4ff68afa50a27ae3f0d94857a7
--- /dev/null
+++ b/deps/SZ/sz/src/MultiLevelCacheTable.c
@@ -0,0 +1,193 @@
+/**
+ *  @file MultiLevelCacheTable.c
+ *  @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao
+ *  @date Jan, 2019
+ *  @brief Multi-level cache table (float version): build, lookup and free routines.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdint.h>
+#include <memory.h>
+#include <stdlib.h>
+#include "stdio.h"
+#include "MultiLevelCacheTable.h"
+
+/**
+ * Return bits 23..30 of the 32-bit pattern of `value`
+ * (the biased exponent when float is IEEE-754 binary32; the sign bit is truncated away).
+ *
+ * The bit pattern is copied with memcpy rather than read through a cast
+ * pointer, which would violate the strict-aliasing rule.
+ */
+uint8_t MLCT_GetExpoIndex(float value){
+    uint32_t raw;
+    memcpy(&raw, &value, sizeof(raw));
+    return (uint8_t)(raw >> 23);
+}
+
+/**
+ * Return 127 minus the biased exponent of `precision`, truncated to 8 bits
+ * (e.g. 0.5 -> 1, 0.25 -> 2, 1.0 -> 0).
+ *
+ * The bit pattern is copied with memcpy to avoid a strict-aliasing violation.
+ */
+uint8_t MLCT_GetRequiredBits(float precision){
+    int32_t raw;
+    memcpy(&raw, &precision, sizeof(raw));
+    return (uint8_t)(-((raw >> 23) - 127));
+}
+
+
+/**
+ * Return the top `bits` bits of the 23-bit mantissa field of `value`.
+ * When `bits` is 23 or more, the whole 23-bit mantissa field is returned unshifted.
+ *
+ * The bit pattern is copied with memcpy to avoid a strict-aliasing violation.
+ */
+uint32_t MLCT_GetMantiIndex(float value, int bits){
+    uint32_t raw;
+    memcpy(&raw, &value, sizeof(raw));
+    raw &= 0x007FFFFFu;            /* drop sign and exponent (same as << 9 >> 9) */
+    int shift = 32 - 9 - bits;
+    if(shift > 0){
+        return raw >> shift;
+    }
+    return raw;
+}
+
+/**
+ * Assemble a float whose exponent field is `expo` and whose mantissa field
+ * holds `manti` in its top `bits` bits (inverse of MLCT_GetExpoIndex /
+ * MLCT_GetMantiIndex).
+ *
+ * When `bits` is 23 or more, `manti` is used unshifted, mirroring
+ * MLCT_GetMantiIndex; the previous code shifted by a negative count there,
+ * which is undefined. The result is produced with memcpy to avoid a
+ * strict-aliasing violation.
+ */
+float MLTC_RebuildFloat(uint8_t expo, uint32_t manti, int bits){
+    uint32_t raw = (uint32_t)expo << 23;
+    int shift = 23 - bits;
+    if(shift > 0)
+        raw |= manti << shift;
+    else
+        raw |= manti;
+    float result;
+    memcpy(&result, &raw, sizeof(result));
+    return result;
+}
+
+/**
+ * Build a two-level lookup table: the top level is indexed by float exponent,
+ * each sub-table by the leading `bits` mantissa bits, and each entry stores an
+ * index into precisionTable.
+ *
+ * Steps performed here:
+ *   1. bits is derived from `precision`; the covered value range is
+ *      precisionTable[1]/(1+precision) .. precisionTable[count-1]/(1-precision).
+ *   2. One zeroed SubLevelTable is allocated per exponent in that range, and
+ *      each gets a zeroed entry array covering its mantissa-index range.
+ *   3. Samples are visited in ascending (exponent, mantissa) order; each entry
+ *      receives the current precisionTable index, which advances whenever a
+ *      sample falls outside the current entry's interval.
+ *
+ * @param topTable       table to fill; its subTables are allocated here
+ * @param precisionTable values the table maps to; index 1 and count-1 bound the range
+ * @param count          number of entries in precisionTable
+ * @param precision      relative precision used for the interval bounds
+ *
+ * NOTE(review): malloc results are not checked before memset.
+ */
+void MultiLevelCacheTableBuild(struct TopLevelTable* topTable, float* precisionTable, int count, float precision){
+    uint8_t bits = MLCT_GetRequiredBits(precision);
+    topTable->bits = bits;
+    topTable->bottomBoundary = precisionTable[1]/(1+precision);
+    topTable->topBoundary = precisionTable[count-1]/(1-precision);
+    topTable->baseIndex = MLCT_GetExpoIndex(topTable->bottomBoundary);
+    topTable->topIndex = MLCT_GetExpoIndex(topTable->topBoundary);
+    int subTableCount = topTable->topIndex - topTable->baseIndex + 1;
+    topTable->subTables = (struct SubLevelTable*)malloc(sizeof(struct SubLevelTable) * subTableCount);
+    memset(topTable->subTables, 0, sizeof(struct SubLevelTable) * subTableCount);
+
+    // NOTE(review): this loop only counts exponent changes into lastIndex,
+    // which is never read afterwards; it appears to be left over from the
+    // disabled expoBoundary logic at the end of this function.
+    //uint32_t expoBoundary[subTableCount];
+    uint8_t lastExpo = 0xff;
+    uint8_t lastIndex = 0;
+    for(int i=0; i<count; i++){
+        uint8_t expo = MLCT_GetExpoIndex(precisionTable[i]);
+        if(expo != lastExpo){
+            //expoBoundary[lastIndex] = i;
+            lastExpo = expo;
+            lastIndex++;
+        }
+    }
+
+    // Size and allocate every sub-table, from the highest exponent down.
+    for(int i=topTable->topIndex-topTable->baseIndex; i>=0; i--){
+        struct SubLevelTable* processingSubTable = &topTable->subTables[i];
+        // Topmost sub-table: stop just below the top boundary's mantissa index
+        // when that boundary shares its exponent with the last table value.
+        if(i == topTable->topIndex - topTable->baseIndex &&
+            MLCT_GetExpoIndex(topTable->topBoundary) == MLCT_GetExpoIndex(precisionTable[count-1])){
+            processingSubTable->topIndex = MLCT_GetMantiIndex(topTable->topBoundary, bits) - 1;
+        }else{
+            // Otherwise cover the full mantissa range: maxIndex = 2^bits - 1.
+            uint32_t maxIndex = 0;
+            for(int j=0; j<bits; j++){
+                maxIndex += 1 << j;
+            }
+            processingSubTable->topIndex = maxIndex;
+        }
+        // Lowest sub-table: start just above the bottom boundary's mantissa index
+        // when that boundary shares its exponent with precisionTable[0].
+        if(i == 0 && MLCT_GetExpoIndex(topTable->bottomBoundary) == MLCT_GetExpoIndex(precisionTable[0])){
+            processingSubTable->baseIndex = MLCT_GetMantiIndex(topTable->bottomBoundary, bits)+1;
+        }else{
+            processingSubTable->baseIndex = 0;
+        }
+
+        int subTableLength = processingSubTable->topIndex - processingSubTable-> baseIndex+ 1;
+        processingSubTable->table = (uint32_t*)malloc(sizeof(uint32_t) * subTableLength);
+        memset(processingSubTable->table, 0, sizeof(uint32_t) * subTableLength);
+        processingSubTable->expoIndex = topTable->baseIndex + i;
+    }
+
+    // Fill pass: `index` is the precisionTable entry currently being matched;
+    // it starts at 1 and only ever increases.
+    // NOTE(review): `index` is never checked against `count` before
+    // precisionTable[index] is read — confirm it cannot run past the table.
+    uint32_t index = 1;
+    for(uint8_t i = 0; i<=topTable->topIndex-topTable->baseIndex; i++){
+        struct SubLevelTable* processingSubTable = &topTable->subTables[i];
+        uint8_t expoIndex = i+topTable->baseIndex;
+        for(uint32_t j = 0; j<=processingSubTable->topIndex - processingSubTable->baseIndex; j++){
+            uint32_t mantiIndex = j+processingSubTable->baseIndex;
+            float sample = MLTC_RebuildFloat(expoIndex, mantiIndex, topTable->bits);
+            float bottomBoundary = precisionTable[index] / (1+precision);
+            float topBoundary = precisionTable[index] / (1-precision);
+            if(sample < topBoundary && sample > bottomBoundary){
+                processingSubTable->table[j] = index;
+            }else{
+                // Sample left the current interval: advance to the next entry and
+                // also overwrite the previous slot with the new index.
+                //float newPrecision = precisionTable[index];
+                index++;
+                processingSubTable->table[j] = index;
+                if(j)
+                    processingSubTable->table[j-1] = index;
+                else{
+                    // Previous slot is the last entry of the preceding sub-table.
+                    // NOTE(review): when i == 0 this addresses subTables[-1] — confirm
+                    // the first sample can never fall outside the first interval.
+                    struct SubLevelTable* pastSubTable = &topTable->subTables[i-1];
+                    pastSubTable->table[pastSubTable->topIndex - pastSubTable->baseIndex] = index;
+                }
+            }
+        }
+        // After the last sub-table, test the sample one step past its end and,
+        // if it is outside the current interval, bump the final slot as well.
+        if(i == topTable->topIndex - topTable->baseIndex){
+            uint32_t j = processingSubTable->topIndex - processingSubTable->baseIndex + 1;
+            uint32_t mantiIndex = j + processingSubTable->baseIndex;
+            float sample = MLTC_RebuildFloat(expoIndex, mantiIndex, topTable->bits);
+            float bottomBoundary = precisionTable[index] / (1+precision);
+            float topBoundary = precisionTable[index] / (1-precision);
+            if(sample > topBoundary || sample < bottomBoundary){
+                index++;
+                processingSubTable->table[j-1] = index;
+            }
+        }
+    }
+
+    // Disabled alternative fill strategy (relies on the commented-out expoBoundary array).
+    /*
+    long lastIndexInExpoRange = count-1;
+    bool trigger = false;
+    float preRange = 0.0;
+    uint32_t preIndex = 0;
+    for(int i=topTable->topIndex-topTable->baseIndex; i>=0; i--){
+        struct SubLevelTable* processingSubTable = &topTable->subTables[i];
+        if(trigger){
+            uint32_t bound = MLCT_GetMantiIndex(preRange, bits);
+            for(int j = processingSubTable->topIndex; j>=processingSubTable->baseIndex; j--){
+                if(j >= bound){
+                    processingSubTable->table[j-processingSubTable->baseIndex] = preIndex;
+                }else{
+                    break;
+                }
+            }
+            trigger = false;
+        }
+        long firstIndexInExpoRange = expoBoundary[i];
+        uint8_t expoInRange = MLCT_GetExpoIndex(precisionTable[firstIndexInExpoRange]);
+        for(int j=lastIndexInExpoRange; j>=firstIndexInExpoRange; j--){
+            float test = precisionTable[j];
+            uint32_t rangeTop = MLCT_GetMantiIndex(precisionTable[j]*(1+precision), bits) - 1;
+            uint32_t rangeBottom;
+            if(j == firstIndexInExpoRange){
+                preRange = precisionTable[j]/(1+precision);
+                if(expoInRange != MLCT_GetExpoIndex(preRange)){
+                    trigger = true;
+                    preIndex = firstIndexInExpoRange;
+                    rangeBottom = 0;
+                }else{
+                    rangeBottom= MLCT_GetMantiIndex(precisionTable[j]/(1+precision), bits) + 1;
+                }
+            }else{
+                rangeBottom= MLCT_GetMantiIndex(precisionTable[j]/(1+precision), bits) + 1;
+            }
+            for(int k = rangeBottom; k<=rangeTop; k++){
+                if( k <= processingSubTable->topIndex && k >= processingSubTable->baseIndex)
+                    processingSubTable->table[k - processingSubTable->baseIndex] = j;
+            }
+        }
+        lastIndexInExpoRange = firstIndexInExpoRange-1;
+    }
+     */
+}
+
+/**
+ * Look up the table entry for `value`.
+ *
+ * The exponent of `value` selects a sub-table and its leading mantissa bits
+ * select the slot. Returns 0 when either the exponent or the mantissa index
+ * lies outside the range covered by the table.
+ *
+ * @param value         value to look up
+ * @param topLevelTable table built by MultiLevelCacheTableBuild
+ * @return the stored index, or 0 if `value` is not covered
+ */
+uint32_t MultiLevelCacheTableGetIndex(float value, struct TopLevelTable* topLevelTable){
+    uint8_t expoIndex = MLCT_GetExpoIndex(value);
+    if(expoIndex < topLevelTable->baseIndex || expoIndex > topLevelTable->topIndex)
+        return 0;
+
+    struct SubLevelTable* subLevelTable = &topLevelTable->subTables[expoIndex-topLevelTable->baseIndex];
+    uint32_t mantiIndex = MLCT_GetMantiIndex(value, topLevelTable->bits);
+    /* The former MLTC_RebuildFloat() call here discarded its result and was removed. */
+    if(mantiIndex >= subLevelTable->baseIndex && mantiIndex <= subLevelTable->topIndex)
+        return subLevelTable->table[mantiIndex - subLevelTable->baseIndex];
+    return 0;
+}
+
+/**
+ * Free every sub-table entry array and the sub-table array itself.
+ *
+ * The TopLevelTable struct is not freed. A NULL table, or a table whose
+ * subTables pointer is NULL, is ignored; subTables is reset to NULL
+ * afterwards so calling this twice on the same table is harmless.
+ */
+void MultiLevelCacheTableFree(struct TopLevelTable* table){
+    if(table == NULL || table->subTables == NULL)
+        return;
+
+    for(int i=0; i<table->topIndex - table->baseIndex + 1; i++){
+        free(table->subTables[i].table);
+    }
+    free(table->subTables);
+    table->subTables = NULL;
+}
diff --git a/deps/SZ/sz/src/MultiLevelCacheTableWideInterval.c b/deps/SZ/sz/src/MultiLevelCacheTableWideInterval.c
new file mode 100644
index 0000000000000000000000000000000000000000..d137115f9097d8803fbae46e41af43f31cfb6484
--- /dev/null
+++ b/deps/SZ/sz/src/MultiLevelCacheTableWideInterval.c
@@ -0,0 +1,125 @@
+/**
+ *  @file MultiLevelCacheTableWideInterval.c
+ *  @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao
+ *  @date Jan, 2019
+ *  @brief Multi-level cache table (double, wide-interval version): build, lookup and free routines.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdbool.h>
+#include <string.h>
+#include "MultiLevelCacheTableWideInterval.h"
+
+/**
+ * Free every sub-table entry array and the sub-table array of `topTable`.
+ *
+ * The struct itself is not freed. A NULL table, or one whose subTables is
+ * NULL, is ignored. subTables is reset to NULL afterwards, so calling this
+ * and MultiLevelCacheTableWideIntervalFree() on the same table does not
+ * double-free.
+ */
+void freeTopLevelTableWideInterval(struct TopLevelTableWideInterval* topTable)
+{
+	if(topTable == NULL || topTable->subTables == NULL)
+		return;
+
+	for(int i=topTable->topIndex-topTable->baseIndex; i>=0; i--)
+	{
+		struct SubLevelTableWideInterval* processingSubTable = &topTable->subTables[i];
+		free(processingSubTable->table);
+		processingSubTable->table = NULL;
+	}
+	free(topTable->subTables);
+	topTable->subTables = NULL;
+}
+
+/**
+ * Return the top 12 bits of the 64-bit pattern of `value`
+ * (sign bit followed by the 11-bit biased exponent for IEEE-754 binary64).
+ *
+ * The bit pattern is copied with memcpy rather than read through a cast
+ * pointer, which would violate the strict-aliasing rule.
+ */
+uint16_t MLCTWI_GetExpoIndex(double value){
+    uint64_t raw;
+    memcpy(&raw, &value, sizeof(raw));
+    return (uint16_t)(raw >> 52);
+}
+
+/**
+ * Return 1023 minus the top 12 bits of the pattern of `precision`, reduced
+ * modulo 2^16 (e.g. 0.5 -> 1, 0.25 -> 2, 1.0 -> 0).
+ *
+ * The bit pattern is copied with memcpy to avoid a strict-aliasing violation.
+ */
+uint16_t MLCTWI_GetRequiredBits(double precision){
+    uint64_t raw;
+    memcpy(&raw, &precision, sizeof(raw));
+    return (uint16_t)(-((raw >> 52) - 1023));
+}
+
+/**
+ * Return the top `bits` bits of the 52-bit mantissa field of `value`.
+ * When `bits` is 52 or more, the whole 52-bit mantissa field is returned unshifted.
+ *
+ * The bit pattern is copied with memcpy to avoid a strict-aliasing violation.
+ */
+uint64_t MLCTWI_GetMantiIndex(double value, int bits){
+    uint64_t raw;
+    memcpy(&raw, &value, sizeof(raw));
+    raw = raw << 12 >> 12;         /* drop sign and exponent */
+    int shift = 64 - 12 - bits;
+    if(shift > 0){
+        return raw >> shift;
+    }
+    return raw;
+}
+
+/**
+ * Assemble a double from an exponent field value and a mantissa index that
+ * occupies the top `bits` bits of the mantissa.
+ *
+ * The mantissa is ADDED (not OR-ed), so a `manti` of 2^bits carries into the
+ * exponent and yields the first value of the next exponent range.
+ * When `bits` is 52 or more, `manti` is added unshifted, mirroring
+ * MLCTWI_GetMantiIndex; the previous code shifted by a negative count there,
+ * which is undefined. The result is produced with memcpy to avoid a
+ * strict-aliasing violation.
+ */
+double MLTCWI_RebuildDouble(uint16_t expo, uint64_t manti, int bits){
+    uint64_t raw = (uint64_t)expo << 52;
+    int shift = 52 - bits;
+    if(shift > 0)
+        raw += manti << shift;
+    else
+        raw += manti;
+    double result;
+    memcpy(&result, &raw, sizeof(result));
+    return result;
+}
+
+/**
+ * Build the double-precision two-level lookup table.
+ *
+ * The top level has one sub-table per exponent between the exponents of
+ * precisionTable[1]/(1+precision) and precisionTable[count-1]/(1-precision).
+ * Every sub-table spans mantissa indices 0 .. 2^bits-1, where bits is
+ * MLCTWI_GetRequiredBits(precision) + plus_bits. Slots are then filled in
+ * ascending order with the index of the precisionTable entry whose interval
+ * fully contains the slot's value range; slots seen before the first match
+ * (or after the last table entry) stay 0.
+ *
+ * @param topTable       table to fill; its subTables are allocated here
+ * @param precisionTable values the table maps to
+ * @param count          number of entries in precisionTable
+ * @param precision      relative precision used for the interval bounds
+ * @param plus_bits      extra mantissa bits added to the required bit count
+ */
+void MultiLevelCacheTableWideIntervalBuild(struct TopLevelTableWideInterval* topTable, double* precisionTable, int count, double precision, int plus_bits){
+    uint16_t bits = MLCTWI_GetRequiredBits(precision) + plus_bits;
+    topTable->bits = bits;
+
+    /* Value range covered by the table and the exponents that bound it. */
+    topTable->bottomBoundary = precisionTable[1]/(1+precision);
+    topTable->topBoundary = precisionTable[count-1]/(1-precision);
+    topTable->baseIndex = MLCTWI_GetExpoIndex(topTable->bottomBoundary);
+    topTable->topIndex = MLCTWI_GetExpoIndex(topTable->topBoundary);
+
+    int subTableCount = topTable->topIndex - topTable->baseIndex + 1;
+    size_t topBytes = sizeof(struct SubLevelTableWideInterval) * subTableCount;
+    topTable->subTables = (struct SubLevelTableWideInterval*)malloc(topBytes);
+    memset(topTable->subTables, 0, topBytes);
+
+    /* Allocate each sub-table, highest exponent first. */
+    int s = topTable->topIndex-topTable->baseIndex;
+    while(s >= 0){
+        struct SubLevelTableWideInterval* sub = &topTable->subTables[s];
+
+        /* Largest mantissa index: sum of 2^b for b < bits. */
+        uint32_t maxIndex = 0;
+        for(int b=0; b<bits; b++){
+            maxIndex += 1 << b;
+        }
+        sub->topIndex = maxIndex;
+        sub->baseIndex = 0;
+
+        uint64_t subTableLength = sub->topIndex - sub-> baseIndex+ 1;
+        sub->table = (uint16_t*)malloc(sizeof(uint16_t) * subTableLength);
+        memset(sub->table, 0, sizeof(uint16_t) * subTableLength);
+        sub->expoIndex = topTable->baseIndex + s;
+        s--;
+    }
+
+    /* Fill pass: `index` is the precisionTable entry being matched,
+     * `matched` records whether any slot has matched so far. */
+    uint32_t index = 0;
+    bool matched = false;
+    for(uint16_t i = 0; i<=topTable->topIndex-topTable->baseIndex; i++){
+        struct SubLevelTableWideInterval* sub = &topTable->subTables[i];
+        uint16_t expoIndex = i+topTable->baseIndex;
+        for(uint32_t j = 0; j<=sub->topIndex - sub->baseIndex; j++){
+            uint64_t mantiIndex = j + sub->baseIndex;
+            /* Value range represented by this slot. */
+            double sampleBottom = MLTCWI_RebuildDouble(expoIndex, mantiIndex, topTable->bits);
+            double sampleTop = MLTCWI_RebuildDouble(expoIndex, mantiIndex+1, topTable->bits);
+            /* Interval of the current precisionTable entry. */
+            double bottomBoundary = precisionTable[index] / (1+precision);
+            double topBoundary = precisionTable[index] / (1-precision);
+
+            if(sampleTop < topBoundary && sampleBottom > bottomBoundary){
+                sub->table[j] = index;
+                matched = true;
+            }else if(matched && index < count-1){
+                /* Left the current interval: move on to the next entry. */
+                index++;
+                sub->table[j] = index;
+            }else{
+                sub->table[j] = 0;
+            }
+        }
+    }
+
+}
+
+/**
+ * Look up the table entry for `value`.
+ *
+ * The exponent of `value` selects a sub-table and its leading mantissa bits
+ * select the slot. Returns 0 when the exponent is outside the table, or when
+ * the mantissa index is outside the sub-table's [baseIndex, topIndex] range.
+ * The mantissa check matches MultiLevelCacheTableGetIndex and prevents an
+ * out-of-bounds read.
+ *
+ * @param value         value to look up
+ * @param topLevelTable table built by MultiLevelCacheTableWideIntervalBuild
+ * @return the stored index, or 0 if `value` is not covered
+ */
+uint32_t MultiLevelCacheTableWideIntervalGetIndex(double value, struct TopLevelTableWideInterval* topLevelTable){
+    uint16_t expoIndex = MLCTWI_GetExpoIndex(value);
+    if(expoIndex <= topLevelTable->topIndex && expoIndex >= topLevelTable->baseIndex){
+        struct SubLevelTableWideInterval* subLevelTable = &topLevelTable->subTables[expoIndex-topLevelTable->baseIndex];
+        uint64_t mantiIndex = MLCTWI_GetMantiIndex(value, topLevelTable->bits);
+        if(mantiIndex >= subLevelTable->baseIndex && mantiIndex <= subLevelTable->topIndex)
+            return subLevelTable->table[mantiIndex - subLevelTable->baseIndex];
+    }
+    return 0;
+}
+
+/**
+ * Free every sub-table entry array and the sub-table array of `table`.
+ *
+ * The struct itself is not freed. A NULL table, or one whose subTables is
+ * NULL, is ignored. subTables is reset to NULL afterwards, so a second call
+ * (or a call to freeTopLevelTableWideInterval()) does not double-free.
+ */
+void MultiLevelCacheTableWideIntervalFree(struct TopLevelTableWideInterval* table){
+    if(table == NULL || table->subTables == NULL)
+        return;
+
+    for(int i=0; i<table->topIndex - table->baseIndex + 1; i++){
+        free(table->subTables[i].table);
+        table->subTables[i].table = NULL;
+    }
+    free(table->subTables);
+    table->subTables = NULL;
+}
+
diff --git a/deps/SZ/sz/src/TightDataPointStorageD.c b/deps/SZ/sz/src/TightDataPointStorageD.c
new file mode 100644
index 0000000000000000000000000000000000000000..f30d8cdd62f20a0bc1c43cdefdac1ce74b5b9c74
--- /dev/null
+++ b/deps/SZ/sz/src/TightDataPointStorageD.c
@@ -0,0 +1,751 @@
+/**
+ *  @file TightDataPointStorageD.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief The functions used to construct the tightPointDataStorage element for storing compressed bytes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "TightDataPointStorageD.h"
+#include "sz.h"
+#include "Huffman.h"
+//#include "rw.h"
+
+/**
+ * Allocate a TightDataPointStorageD with every field cleared.
+ *
+ * The whole struct is zeroed first so that fields not listed below
+ * (e.g. medianValue, realPrecision, plus_bits, max_bits) never hold
+ * indeterminate values; pointer and size fields are then set explicitly.
+ *
+ * @param this receives the new object, or NULL if allocation fails
+ */
+void new_TightDataPointStorageD_Empty(TightDataPointStorageD **this)
+{
+	TightDataPointStorageD *tdps = (TightDataPointStorageD*)malloc(sizeof(TightDataPointStorageD));
+	*this = tdps;
+	if(tdps == NULL)
+		return;
+	memset(tdps, 0, sizeof(TightDataPointStorageD));
+
+	tdps->dataSeriesLength = 0;
+	tdps->allSameData = 0;
+	tdps->exactDataNum = 0;
+	tdps->reservedValue = 0;
+	tdps->reqLength = 0;
+	tdps->radExpo = 0;
+
+	tdps->rtypeArray = NULL;
+	tdps->rtypeArray_size = 0;
+
+	tdps->typeArray = NULL; //its size is dataSeriesLength/4 (or xxx/4+1) 
+	tdps->typeArray_size = 0;
+
+	tdps->leadNumArray = NULL; //its size is exactDataNum/4 (or exactDataNum/4+1)
+	tdps->leadNumArray_size = 0;
+
+	tdps->exactMidBytes = NULL;
+	tdps->exactMidBytes_size = 0;
+
+	tdps->residualMidBits = NULL;
+	tdps->residualMidBits_size = 0;
+	
+	tdps->intervals = 0;
+	tdps->isLossless = 0;
+	
+	tdps->segment_size = 0;
+	tdps->pwrErrBoundBytes = NULL;
+	tdps->pwrErrBoundBytes_size = 0;
+	
+	tdps->raBytes = NULL;
+	tdps->raBytes_size = 0;
+
+}
+
+/**
+ * Parse a serialized double-precision stream into a new TightDataPointStorageD.
+ *
+ * Layout consumed, in order: 3 version bytes, 1 flag byte, the SZ parameter
+ * metadata (MetaDataByteLength_double bytes), the data-series length, then the
+ * fields read below. The array fields of the result (typeArray, leadNumArray,
+ * exactMidBytes, residualMidBits, pwrErrBoundBytes, raBytes) point INTO
+ * `flatBytes`; they are not copied, so `flatBytes` must outlive the result.
+ *
+ * Side effects: updates exe_params->SZ_SIZE_TYPE and several confparams_dec
+ * fields; allocates confparams_dec if it is NULL. Calls exit(0) when the
+ * version check fails.
+ *
+ * @param this            receives the new object
+ * @param flatBytes       serialized stream
+ * @param flatBytesLength total length of flatBytes
+ * @return the error-bound mode of the stream (ABS or PW_REL)
+ */
+int new_TightDataPointStorageD_fromFlatBytes(TightDataPointStorageD **this, unsigned char* flatBytes, size_t flatBytesLength)
+{
+	new_TightDataPointStorageD_Empty(this);
+	size_t i, index = 0;
+	size_t pwrErrBoundBytes_size = 0, segmentL = 0, radExpoL = 0, pwrErrBoundBytesL = 0;
+	char version[3];
+	for (i = 0; i < 3; i++)
+		version[i] = flatBytes[index++]; //3
+	unsigned char sameRByte = flatBytes[index++]; //1
+	if(checkVersion2(version)!=1)
+	{
+		//wrong version
+		printf("Wrong version: \nCompressed-data version (%d.%d.%d)\n",version[0], version[1], version[2]);
+		printf("Current sz version: (%d.%d.%d)\n", versionNumber[0], versionNumber[1], versionNumber[2]);
+		printf("Please double-check if the compressed data (or file) is correct.\n");
+		exit(0);
+	}
+
+	// The decoder parameters must exist BEFORE the flag byte is decoded into
+	// them below; this check used to come after the first dereference.
+	if(confparams_dec==NULL)
+	{
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+		memset(confparams_dec, 0, sizeof(sz_params));
+	}
+
+	// Decode the flag byte.
+	int same = sameRByte & 0x01;
+	(*this)->isLossless = (sameRByte & 0x10)>>4;
+	int isPW_REL = (sameRByte & 0x20)>>5;
+	exe_params->SZ_SIZE_TYPE = ((sameRByte & 0x40)>>6)==1?8:4;
+	confparams_dec->protectValueRange = (sameRByte & 0x04)>>2;
+	confparams_dec->accelerate_pw_rel_compression = (sameRByte & 0x08) >> 3;
+	int errorBoundMode = ABS;
+	if(isPW_REL)
+	{
+		errorBoundMode = PW_REL;
+		segmentL = exe_params->SZ_SIZE_TYPE;
+		pwrErrBoundBytesL = 4;
+	}
+
+	convertBytesToSZParams(&(flatBytes[index]), confparams_dec);
+
+	index += MetaDataByteLength_double;
+
+	int isRegression = (sameRByte >> 7) & 0x01;
+
+	unsigned char dsLengthBytes[8];
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		dsLengthBytes[i] = flatBytes[index++];
+	(*this)->dataSeriesLength = bytesToSize(dsLengthBytes);
+
+	// Early exits: lossless, all-same and regression streams carry no
+	// further header fields that are parsed here.
+	if((*this)->isLossless==1)
+	{
+		return errorBoundMode;
+	}
+	else if(same==1)
+	{
+		(*this)->allSameData = 1;
+		(*this)->exactMidBytes = &(flatBytes[index]);
+		return errorBoundMode;
+	}
+	else
+		(*this)->allSameData = 0;
+		
+	if(isRegression == 1)
+	{
+		(*this)->raBytes_size = flatBytesLength - 3 - 1 - MetaDataByteLength_double - exe_params->SZ_SIZE_TYPE;
+		(*this)->raBytes = &(flatBytes[index]);
+		return errorBoundMode;
+	}					
+		
+	// rtype_ is hard-wired to 0, so every "rtype_ != 0" branch below is inactive.
+	int rtype_ = 0;//sameRByte & 0x08; //1000		
+
+	unsigned char byteBuf[8];
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	int max_quant_intervals = bytesToInt_bigEndian(byteBuf);// 4	
+
+	confparams_dec->maxRangeRadius = max_quant_intervals/2;
+
+	if(errorBoundMode>=PW_REL)
+	{
+		(*this)->radExpo = flatBytes[index++];//1
+		radExpoL = 1;
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			byteBuf[i] = flatBytes[index++];
+		confparams_dec->segment_size = (*this)->segment_size = bytesToSize(byteBuf);// exe_params->SZ_SIZE_TYPE	
+
+		for (i = 0; i < 4; i++)
+			byteBuf[i] = flatBytes[index++];
+		pwrErrBoundBytes_size = (*this)->pwrErrBoundBytes_size = bytesToInt_bigEndian(byteBuf);// 4		
+	}
+	else
+	{
+		pwrErrBoundBytes_size = 0;
+		(*this)->pwrErrBoundBytes = NULL;
+	}
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->intervals = bytesToInt_bigEndian(byteBuf);// 4	
+
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->medianValue = bytesToDouble(byteBuf);//8
+
+	(*this)->reqLength = flatBytes[index++]; //1
+	
+	if(isPW_REL && confparams_dec->accelerate_pw_rel_compression)
+	{
+		(*this)->plus_bits = flatBytes[index++];
+		(*this)->max_bits = flatBytes[index++];
+	}
+	
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->realPrecision = bytesToDouble(byteBuf);//8
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->typeArray_size = bytesToSize(byteBuf);// exe_params->SZ_SIZE_TYPE	
+
+	if(rtype_!=0)
+	{
+		for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++) 
+			byteBuf[i] = flatBytes[index++];
+		(*this)->rtypeArray_size = bytesToSize(byteBuf);//ST		
+	}
+	else
+		(*this)->rtypeArray_size = 0;
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactDataNum = bytesToSize(byteBuf);// ST
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactMidBytes_size = bytesToSize(byteBuf);// ST
+
+	if (rtype_ != 0) {
+		if((*this)->rtypeArray_size>0)
+			(*this)->rtypeArray = (unsigned char*)malloc(sizeof(unsigned char)*(*this)->rtypeArray_size);
+		else
+			(*this)->rtypeArray = NULL;
+
+		for (i = 0; i < 8; i++)
+			byteBuf[i] = flatBytes[index++];
+		(*this)->reservedValue = bytesToDouble(byteBuf);//8
+	}
+
+	// Two bits per exact value, rounded up to whole bytes.
+	size_t logicLeadNumBitsNum = (*this)->exactDataNum * 2;
+	if (logicLeadNumBitsNum % 8 == 0)
+	{
+		(*this)->leadNumArray_size = logicLeadNumBitsNum >> 3;
+	}
+	else
+	{
+		(*this)->leadNumArray_size = (logicLeadNumBitsNum >> 3) + 1;
+	}
+	
+	int minLogValueSize = 0;
+	if(errorBoundMode>=PW_REL)
+		minLogValueSize = 8;
+
+	// residualMidBits takes whatever remains after all other sections.
+	// NOTE(review): the trailing "- 1 - 1" is subtracted even when plus_bits
+	// and max_bits were not read above — confirm against the writer.
+	if ((*this)->rtypeArray != NULL) 
+	{
+		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength_double - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 8 - 1 - 8 
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - minLogValueSize - exe_params->SZ_SIZE_TYPE - 8 - (*this)->rtypeArray_size 
+				- minLogValueSize - (*this)->typeArray_size - (*this)->leadNumArray_size
+				- (*this)->exactMidBytes_size - pwrErrBoundBytes_size - 1 - 1;
+		for (i = 0; i < (*this)->rtypeArray_size; i++)
+			(*this)->rtypeArray[i] = flatBytes[index++];
+	}
+	else
+	{
+		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength_double - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 8 - 1 - 8
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - minLogValueSize - (*this)->typeArray_size
+				- (*this)->leadNumArray_size - (*this)->exactMidBytes_size - pwrErrBoundBytes_size - 1 - 1;
+	}	
+
+	if(errorBoundMode >= PW_REL){
+		(*this)->minLogValue = bytesToDouble(&flatBytes[index]);
+		index+=8;
+	}
+
+	// The remaining sections are referenced in place, one after another.
+	(*this)->typeArray = &flatBytes[index];
+	//retrieve the number of states (i.e., stateNum)
+	(*this)->allNodes = bytesToInt_bigEndian((*this)->typeArray); //the first 4 bytes store the stateNum
+	(*this)->stateNum = ((*this)->allNodes+1)/2;	
+
+	index+=(*this)->typeArray_size;
+	
+	(*this)->pwrErrBoundBytes = &flatBytes[index];
+	
+	index+=pwrErrBoundBytes_size;
+	
+	(*this)->leadNumArray = &flatBytes[index];
+	
+	index+=(*this)->leadNumArray_size;
+	
+	(*this)->exactMidBytes = &flatBytes[index];
+	
+	index+=(*this)->exactMidBytes_size;
+	
+	(*this)->residualMidBits = &flatBytes[index];
+	
+	return errorBoundMode;
+}
+
+/**
+ * Build a TightDataPointStorageD from compression results (single residual bit length).
+ *
+ * `type` (dataSeriesLength entries) is Huffman-encoded into typeArray;
+ * `leadNumIntArray` (exactDataNum entries) is packed into leadNumArray;
+ * `resiMidBits` is packed into residualMidBits using `resiBitLength`.
+ * `exactMidBytes` and `pwrErrBoundBytes` are stored by pointer, not copied.
+ * `resiMidBits_size` is accepted but not used.
+ *
+ * The struct is zeroed right after allocation so fields not assigned here
+ * (e.g. plus_bits, minLogValue, segment_size, raBytes) are never indeterminate.
+ *
+ * @param this receives the new object, or NULL if allocation fails
+ */
+void new_TightDataPointStorageD(TightDataPointStorageD **this, 
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char resiBitLength, 
+		double realPrecision, double medianValue, char reqLength, unsigned int intervals,
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo) {
+	*this = (TightDataPointStorageD *)malloc(sizeof(TightDataPointStorageD));
+	if(*this == NULL)
+		return;
+	memset(*this, 0, sizeof(TightDataPointStorageD));
+
+	(*this)->allSameData = 0;
+	(*this)->realPrecision = realPrecision;
+	(*this)->medianValue = medianValue;
+	(*this)->reqLength = reqLength;
+
+	(*this)->dataSeriesLength = dataSeriesLength;
+	(*this)->exactDataNum = exactDataNum;
+
+	(*this)->rtypeArray = NULL;
+	(*this)->rtypeArray_size = 0;
+
+	// Huffman-encode the type array; the accelerated PW_REL path also reports max_bits.
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+		(*this)->max_bits = encode_withTree_MSST19(huffmanTree, type, dataSeriesLength, &(*this)->typeArray, &(*this)->typeArray_size);
+	else
+		encode_withTree(huffmanTree, type, dataSeriesLength, &(*this)->typeArray, &(*this)->typeArray_size);
+	SZ_ReleaseHuffman(huffmanTree);
+		
+	(*this)->exactMidBytes = exactMidBytes;
+	(*this)->exactMidBytes_size = exactMidBytes_size;
+
+	(*this)->leadNumArray_size = convertIntArray2ByteArray_fast_2b(leadNumIntArray, exactDataNum, &((*this)->leadNumArray));
+
+	(*this)->residualMidBits_size = convertIntArray2ByteArray_fast_dynamic(resiMidBits, resiBitLength, exactDataNum, &((*this)->residualMidBits));
+	
+	(*this)->intervals = intervals;
+	
+	(*this)->isLossless = 0;
+	
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		(*this)->pwrErrBoundBytes = pwrErrBoundBytes;
+	else
+		(*this)->pwrErrBoundBytes = NULL;
+		
+	(*this)->radExpo = radExpo;
+	
+	(*this)->pwrErrBoundBytes_size = pwrErrBoundBytes_size;
+}
+
+/**
+ * Build a TightDataPointStorageD from compression results (per-value residual bit lengths).
+ *
+ * Same as new_TightDataPointStorageD except that `resiBitLength` is an array
+ * of `resiBitLengthSize` entries and the type array is always encoded with
+ * encode_withTree(). `exactMidBytes` and `pwrErrBoundBytes` are stored by
+ * pointer, not copied. `resiMidBits_size` is accepted but not used.
+ *
+ * The struct is zeroed right after allocation so fields not assigned here
+ * are never indeterminate.
+ *
+ * @param this receives the new object, or NULL if allocation fails
+ */
+void new_TightDataPointStorageD2(TightDataPointStorageD **this, 
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char* resiBitLength, size_t resiBitLengthSize,
+		double realPrecision, double medianValue, char reqLength, unsigned int intervals,
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo) {
+	*this = (TightDataPointStorageD *)malloc(sizeof(TightDataPointStorageD));
+	if(*this == NULL)
+		return;
+	memset(*this, 0, sizeof(TightDataPointStorageD));
+
+	(*this)->allSameData = 0;
+	(*this)->realPrecision = realPrecision;
+	(*this)->medianValue = medianValue;
+	(*this)->reqLength = reqLength;
+
+	(*this)->dataSeriesLength = dataSeriesLength;
+	(*this)->exactDataNum = exactDataNum;
+
+	(*this)->rtypeArray = NULL;
+	(*this)->rtypeArray_size = 0;
+
+	// Huffman-encode the type array.
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+	encode_withTree(huffmanTree, type, dataSeriesLength, &(*this)->typeArray, &(*this)->typeArray_size);
+	SZ_ReleaseHuffman(huffmanTree);
+	
+	(*this)->exactMidBytes = exactMidBytes;
+	(*this)->exactMidBytes_size = exactMidBytes_size;
+
+	(*this)->leadNumArray_size = convertIntArray2ByteArray_fast_2b(leadNumIntArray, exactDataNum, &((*this)->leadNumArray));
+
+	(*this)->residualMidBits_size = convertIntArray2ByteArray_fast_dynamic2(resiMidBits, resiBitLength, resiBitLengthSize, &((*this)->residualMidBits));
+	
+	(*this)->intervals = intervals;
+	
+	(*this)->isLossless = 0;
+	
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		(*this)->pwrErrBoundBytes = pwrErrBoundBytes;
+	else
+		(*this)->pwrErrBoundBytes = NULL;
+		
+	(*this)->radExpo = radExpo;
+	
+	(*this)->pwrErrBoundBytes_size = pwrErrBoundBytes_size;
+}
+
+/**
+ * Serialize `tdps` into `bytes` (non-reserved-value layout).
+ *
+ * Written in order: 3 version bytes, `sameByte`, the SZ parameter metadata,
+ * the data-series length, max_quant_intervals, [PW_REL: radExpo, segment
+ * size, pwrErrBoundBytes size], intervals, medianValue, reqLength,
+ * [accelerated PW_REL: plus_bits, max_bits], realPrecision, typeArray size,
+ * exactDataNum, exactMidBytes size, [PW_REL: minLogValue], then the typeArray,
+ * [PW_REL: pwrErrBoundBytes], leadNumArray, exactMidBytes and, when present,
+ * residualMidBits payloads.
+ *
+ * @param tdps          object to serialize
+ * @param bytes         destination buffer; the caller must size it
+ * @param dsLengthBytes pre-encoded data-series length (SZ_SIZE_TYPE bytes)
+ * @param sameByte      flag byte stored after the version
+ */
+void convertTDPStoBytes_double(TightDataPointStorageD* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte)
+{
+	size_t k = 0;
+	unsigned char scratch[8];	//holds one encoded scalar at a time
+	
+	//header: version (3 bytes) + flag byte (1 byte)
+	bytes[k++] = versionNumber[0];
+	bytes[k++] = versionNumber[1];
+	bytes[k++] = versionNumber[2];
+	bytes[k++] = sameByte;
+	
+	convertSZParamsToBytes(confparams_cpr, bytes + k);
+	k += MetaDataByteLength_double;
+	
+	//data-series length: 4 or 8 bytes
+	memcpy(bytes + k, dsLengthBytes, exe_params->SZ_SIZE_TYPE);
+	k += exe_params->SZ_SIZE_TYPE;
+	
+	intToBytes_bigEndian(scratch, confparams_cpr->max_quant_intervals);
+	memcpy(bytes + k, scratch, 4);
+	k += 4;
+	
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		bytes[k++] = tdps->radExpo;
+		
+		sizeToBytes(scratch, confparams_cpr->segment_size);
+		memcpy(bytes + k, scratch, exe_params->SZ_SIZE_TYPE);
+		k += exe_params->SZ_SIZE_TYPE;
+		
+		intToBytes_bigEndian(scratch, tdps->pwrErrBoundBytes_size);
+		memcpy(bytes + k, scratch, 4);
+		k += 4;
+	}
+	
+	intToBytes_bigEndian(scratch, tdps->intervals);
+	memcpy(bytes + k, scratch, 4);
+	k += 4;
+	
+	doubleToBytes(scratch, tdps->medianValue);
+	memcpy(bytes + k, scratch, 8);
+	k += 8;
+
+	bytes[k++] = tdps->reqLength;
+
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression==1)
+	{
+		bytes[k++] = tdps->plus_bits;
+		bytes[k++] = tdps->max_bits;
+	}
+
+	doubleToBytes(scratch, tdps->realPrecision);
+	memcpy(bytes + k, scratch, 8);
+	k += 8;
+	
+	//three size fields, SZ_SIZE_TYPE bytes each
+	sizeToBytes(scratch, tdps->typeArray_size);
+	memcpy(bytes + k, scratch, exe_params->SZ_SIZE_TYPE);
+	k += exe_params->SZ_SIZE_TYPE;
+	
+	sizeToBytes(scratch, tdps->exactDataNum);
+	memcpy(bytes + k, scratch, exe_params->SZ_SIZE_TYPE);
+	k += exe_params->SZ_SIZE_TYPE;
+
+	sizeToBytes(scratch, tdps->exactMidBytes_size);
+	memcpy(bytes + k, scratch, exe_params->SZ_SIZE_TYPE);
+	k += exe_params->SZ_SIZE_TYPE;
+
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		doubleToBytes(scratch, tdps->minLogValue);
+		memcpy(bytes + k, scratch, 8);
+		k += 8;
+	}
+
+	//payload sections
+	memcpy(bytes + k, tdps->typeArray, tdps->typeArray_size);
+	k += tdps->typeArray_size;
+	
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		memcpy(bytes + k, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size);
+		k += tdps->pwrErrBoundBytes_size;
+	}
+
+	memcpy(bytes + k, tdps->leadNumArray, tdps->leadNumArray_size);
+	k += tdps->leadNumArray_size;
+	
+	memcpy(bytes + k, tdps->exactMidBytes, tdps->exactMidBytes_size);
+	k += tdps->exactMidBytes_size;
+
+	if(tdps->residualMidBits!=NULL)
+	{
+		memcpy(bytes + k, tdps->residualMidBits, tdps->residualMidBits_size);
+		k += tdps->residualMidBits_size;
+	}
+}
+
+/**
+ * Serialize `tdps` into `bytes` (layout with a reserved value and rtypeArray).
+ *
+ * Written in order: 3 version bytes, `sameByte`, the SZ parameter metadata,
+ * the data-series length, max_quant_intervals, [PW_REL: radExpo, segment
+ * size, pwrErrBoundBytes size], intervals, medianValue, reqLength,
+ * realPrecision, typeArray size, rtypeArray size, exactDataNum,
+ * exactMidBytes size, reservedValue, the rtypeArray payload,
+ * [PW_REL: minLogValue], then the typeArray, [PW_REL: pwrErrBoundBytes],
+ * leadNumArray, exactMidBytes and, when present, residualMidBits payloads.
+ *
+ * @param tdps          object to serialize
+ * @param bytes         destination buffer; the caller must size it
+ * @param dsLengthBytes pre-encoded data-series length (SZ_SIZE_TYPE bytes)
+ * @param sameByte      flag byte stored after the version
+ */
+void convertTDPStoBytes_double_reserve(TightDataPointStorageD* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte)
+{
+	size_t k = 0;
+	unsigned char scratch[8];	//holds one encoded scalar at a time
+	
+	//header: version (3 bytes) + flag byte (1 byte)
+	bytes[k++] = versionNumber[0];
+	bytes[k++] = versionNumber[1];
+	bytes[k++] = versionNumber[2];
+	bytes[k++] = sameByte;
+
+	convertSZParamsToBytes(confparams_cpr, bytes + k);
+	k += MetaDataByteLength_double;
+	
+	//data-series length: 4 or 8 bytes
+	memcpy(bytes + k, dsLengthBytes, exe_params->SZ_SIZE_TYPE);
+	k += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(scratch, confparams_cpr->max_quant_intervals);
+	memcpy(bytes + k, scratch, 4);
+	k += 4;
+
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		bytes[k++] = tdps->radExpo;
+		
+		sizeToBytes(scratch, confparams_cpr->segment_size);
+		memcpy(bytes + k, scratch, exe_params->SZ_SIZE_TYPE);
+		k += exe_params->SZ_SIZE_TYPE;
+		
+		intToBytes_bigEndian(scratch, tdps->pwrErrBoundBytes_size);
+		memcpy(bytes + k, scratch, 4);
+		k += 4;
+	}
+	
+	intToBytes_bigEndian(scratch, tdps->intervals);
+	memcpy(bytes + k, scratch, 4);
+	k += 4;
+
+	doubleToBytes(scratch, tdps->medianValue);
+	memcpy(bytes + k, scratch, 8);
+	k += 8;
+
+	bytes[k++] = tdps->reqLength;
+
+	doubleToBytes(scratch, tdps->realPrecision);
+	memcpy(bytes + k, scratch, 8);
+	k += 8;
+	
+	//four size fields, SZ_SIZE_TYPE bytes each
+	sizeToBytes(scratch, tdps->typeArray_size);
+	memcpy(bytes + k, scratch, exe_params->SZ_SIZE_TYPE);
+	k += exe_params->SZ_SIZE_TYPE;
+	
+	sizeToBytes(scratch, tdps->rtypeArray_size);
+	memcpy(bytes + k, scratch, exe_params->SZ_SIZE_TYPE);
+	k += exe_params->SZ_SIZE_TYPE;
+	
+	sizeToBytes(scratch, tdps->exactDataNum);
+	memcpy(bytes + k, scratch, exe_params->SZ_SIZE_TYPE);
+	k += exe_params->SZ_SIZE_TYPE;
+
+	sizeToBytes(scratch, tdps->exactMidBytes_size);
+	memcpy(bytes + k, scratch, exe_params->SZ_SIZE_TYPE);
+	k += exe_params->SZ_SIZE_TYPE;
+
+	doubleToBytes(scratch, tdps->reservedValue);
+	memcpy(bytes + k, scratch, 8);
+	k += 8;
+	
+	//payload sections
+	memcpy(bytes + k, tdps->rtypeArray, tdps->rtypeArray_size);
+	k += tdps->rtypeArray_size;
+	
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		doubleToBytes(scratch, tdps->minLogValue);
+		memcpy(bytes + k, scratch, 8);
+		k += 8;
+	}
+	
+	memcpy(bytes + k, tdps->typeArray, tdps->typeArray_size);
+	k += tdps->typeArray_size;
+	
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		memcpy(bytes + k, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size);
+		k += tdps->pwrErrBoundBytes_size;
+	}
+	
+	memcpy(bytes + k, tdps->leadNumArray, tdps->leadNumArray_size);
+	k += tdps->leadNumArray_size;
+	
+	memcpy(bytes + k, tdps->exactMidBytes, tdps->exactMidBytes_size);
+	k += tdps->exactMidBytes_size;
+	
+	if(tdps->residualMidBits!=NULL)
+	{
+		memcpy(bytes + k, tdps->residualMidBits, tdps->residualMidBits_size);
+		k += tdps->residualMidBits_size;
+	}
+}
+
+/**
+ * Serialise a TightDataPointStorageD into a flat byte buffer allocated here with malloc.
+ *
+ * @param tdps   storage object to serialise
+ * @param bytes  out: the malloc'ed buffer (the caller frees it); NULL for the unsupported layout
+ * @param size   out: number of bytes stored in *bytes; 0 for the unsupported layout
+ */
+void convertTDPStoFlatBytes_double(TightDataPointStorageD *tdps, unsigned char** bytes, size_t *size) 
+{
+	size_t i, k = 0; 
+	unsigned char dsLengthBytes[8];
+	//dataSeriesLength is encoded on SZ_SIZE_TYPE (4 or 8) bytes
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+	
+	//flag byte: bit 0 = all-same data, further bits are OR-ed in below
+	unsigned char sameByte = tdps->allSameData==1?(unsigned char)1:(unsigned char)0;
+	//sameByte = sameByte | (confparams_cpr->szMode << 1);
+	if(tdps->isLossless)
+		sameByte = (unsigned char) (sameByte | 0x10);	
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		sameByte = (unsigned char) (sameByte | 0x20); // 00100000, the 5th bit
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte = (unsigned char) (sameByte | 0x40); // 01000000, the 6th bit
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+		sameByte = (unsigned char) (sameByte | 0x08); 	
+	
+	if(tdps->allSameData==1)
+	{
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + tdps->exactMidBytes_size;
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+		for (i = 0; i < 3; i++)//3
+			(*bytes)[k++] = versionNumber[i];
+		(*bytes)[k++] = sameByte;
+		convertSZParamsToBytes(confparams_cpr, &((*bytes)[k]));
+		k = k + MetaDataByteLength_double;
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			(*bytes)[k++] = dsLengthBytes[i];
+		for (i = 0; i < tdps->exactMidBytes_size; i++)
+			(*bytes)[k++] = tdps->exactMidBytes[i];
+		*size = totalByteLength;
+	}
+	else if (tdps->rtypeArray == NULL) 
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		int minLogValueSize = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{			
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+			minLogValueSize = 8;
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 8 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE 
+				+ minLogValueSize /*max absolute log value*/
+				+ tdps->typeArray_size + tdps->leadNumArray_size
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+		if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+			totalByteLength += (1+1); // for MSST19
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+		convertTDPStoBytes_double(tdps, *bytes, dsLengthBytes, sameByte);
+		*size = totalByteLength;
+	}
+	else //the case with reserved value
+	{
+		//TODO: the reserved-value layout is not implemented; report "no output" instead of leaving the out-parameters untouched
+		*bytes = NULL;
+		*size = 0;
+	}
+}
+
+/**
+ * Counterpart of convertTDPStoFlatBytes_double() that writes into the caller-provided
+ * buffer bytes instead of allocating one; no capacity check is made on bytes.
+ * *size receives the stream length in bytes (0 for the unimplemented reserved-value layout).
+ */
+void convertTDPStoFlatBytes_double_args(TightDataPointStorageD *tdps, unsigned char* bytes, size_t *size) 
+{
+	size_t i, k = 0; 
+	unsigned char dsLengthBytes[8];
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+	unsigned char sameByte = tdps->allSameData==1?(unsigned char)1:(unsigned char)0; //0000,0001
+	sameByte = sameByte | (confparams_cpr->szMode << 1); //0000,0110 -- NOTE(review): for szMode >= 2 this also sets 0x04, the bit used for protectValueRange below; confirm intended
+	if(tdps->isLossless)
+		sameByte = (unsigned char) (sameByte | 0x10); // 0001,0000
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		sameByte = (unsigned char) (sameByte | 0x20); // 0010,0000, the 5th bit
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte = (unsigned char) (sameByte | 0x40); //0100,0000, the 6th bit
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+		sameByte = (unsigned char) (sameByte | 0x08); //0000,1000
+	if(confparams_cpr->protectValueRange)
+		sameByte = (unsigned char) (sameByte | 0x04); //0000,0100
+
+	if(tdps->allSameData==1)
+	{
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + tdps->exactMidBytes_size;
+		for (i = 0; i < 3; i++)//3
+			bytes[k++] = versionNumber[i];
+		bytes[k++] = sameByte;
+		convertSZParamsToBytes(confparams_cpr, &(bytes[k]));
+		k = k + MetaDataByteLength_double;
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			bytes[k++] = dsLengthBytes[i];		
+		for (i = 0; i < tdps->exactMidBytes_size; i++)
+			bytes[k++] = tdps->exactMidBytes[i];
+		*size = totalByteLength;
+	}
+	else if (tdps->rtypeArray == NULL) 
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0, minLogValueSize = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{			
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+			minLogValueSize = 8; //counted by convertTDPStoFlatBytes_double for the same serializer
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE+ 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 8 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + minLogValueSize
+				+ tdps->typeArray_size + tdps->leadNumArray_size
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+		if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+			totalByteLength += (1+1); // for MSST19
+		convertTDPStoBytes_double(tdps, bytes, dsLengthBytes, sameByte);
+		*size = totalByteLength;
+	}
+	else //the case with reserved value
+	{
+		*size = 0; //TODO: the reserved-value layout is not implemented, so nothing is written
+	}
+}
+
+
+/**
+ * Release a compression-side TightDataPointStorageD: every buffer the
+ * struct points at is freed, then the struct itself.
+ * free(NULL) is a no-op, so the member pointers need no NULL guards.
+ * tdps itself must be a valid pointer.
+ */
+void free_TightDataPointStorageD(TightDataPointStorageD *tdps)
+{
+	free(tdps->rtypeArray);
+	free(tdps->typeArray);
+	free(tdps->leadNumArray);
+	free(tdps->exactMidBytes);
+	free(tdps->residualMidBits);
+	free(tdps->pwrErrBoundBytes);
+	free(tdps);
+}
+
+/**
+ * Decompression-side release: frees only the TightDataPointStorageD struct itself, none of the buffers its members point at.
+ * */
+void free_TightDataPointStorageD2(TightDataPointStorageD *tdps)
+{			
+	free(tdps);
+}
diff --git a/deps/SZ/sz/src/TightDataPointStorageF.c b/deps/SZ/sz/src/TightDataPointStorageF.c
new file mode 100644
index 0000000000000000000000000000000000000000..aa0e0c1ce3b4d9f796e915f5f3cdc62cd6780e82
--- /dev/null
+++ b/deps/SZ/sz/src/TightDataPointStorageF.c
@@ -0,0 +1,754 @@
+/**
+ *  @file TightDataPointStorageF.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief The functions used to construct the tightPointDataStorage element for storing compressed bytes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "TightDataPointStorageF.h"
+#include "sz.h"
+#include "Huffman.h"
+//#include "rw.h"
+
+/**
+ * Allocate a TightDataPointStorageF, publish it through *this, and put the
+ * fields listed below into their "empty" state (0 / NULL).
+ */
+void new_TightDataPointStorageF_Empty(TightDataPointStorageF **this)
+{
+	TightDataPointStorageF *tdps = (TightDataPointStorageF*)malloc(sizeof(TightDataPointStorageF));
+	*this = tdps;
+	/* scalar fields */
+	tdps->dataSeriesLength = 0;
+	tdps->allSameData = 0;
+	tdps->exactDataNum = 0;
+	tdps->reservedValue = 0;
+	tdps->reqLength = 0;
+	tdps->radExpo = 0;
+	tdps->intervals = 0;
+	tdps->isLossless = 0;
+	tdps->segment_size = 0;
+	/* buffer pointers */
+	tdps->rtypeArray = NULL;
+	tdps->typeArray = NULL;
+	tdps->leadNumArray = NULL;
+	tdps->exactMidBytes = NULL;
+	tdps->residualMidBits = NULL;
+	tdps->pwrErrBoundBytes = NULL;
+	tdps->raBytes = NULL;
+	/* byte counts of the buffers above */
+	tdps->rtypeArray_size = 0;
+	tdps->typeArray_size = 0;
+	tdps->leadNumArray_size = 0;
+	tdps->exactMidBytes_size = 0;
+	tdps->residualMidBits_size = 0;
+	tdps->pwrErrBoundBytes_size = 0;
+	tdps->raBytes_size = 0;
+}
+
+/**
+ * Parse a flat compressed float stream into a newly allocated TightDataPointStorageF.
+ * The section pointers of the result (typeArray, leadNumArray, exactMidBytes, ...)
+ * alias flatBytes rather than copying it, so flatBytes must stay valid while they are used.
+ * Side effects: sets exe_params->SZ_SIZE_TYPE, fills confparams_dec (allocating it when
+ * it is still NULL), and calls exit(0) after printing a message on a version mismatch.
+ * @param this             out: receives the new storage object
+ * @param flatBytes        the serialized stream
+ * @param flatBytesLength  length of flatBytes in bytes
+ * @return ABS or PW_REL, as selected by the PW_REL bit of the flag byte
+ */
+int new_TightDataPointStorageF_fromFlatBytes(TightDataPointStorageF **this, unsigned char* flatBytes, size_t flatBytesLength)
+{
+	new_TightDataPointStorageF_Empty(this);
+	size_t i, index = 0;
+	size_t pwrErrBoundBytes_size = 0, segmentL = 0, radExpoL = 0, pwrErrBoundBytesL = 0;
+	char version[3];
+	for (i = 0; i < 3; i++)
+		version[i] = flatBytes[index++]; //3
+	unsigned char sameRByte = flatBytes[index++]; //1
+	if(checkVersion2(version)!=1)
+	{
+		//wrong version
+		printf("Wrong version: \nCompressed-data version (%d.%d.%d)\n",version[0], version[1], version[2]);
+		printf("Current sz version: (%d.%d.%d)\n", versionNumber[0], versionNumber[1], versionNumber[2]);
+		printf("Please double-check if the compressed data (or file) is correct.\n");
+		exit(0);
+	}
+	//confparams_dec is dereferenced right below, so it has to exist before the flag bits are decoded
+	if(confparams_dec==NULL)
+	{
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+		memset(confparams_dec, 0, sizeof(sz_params));
+	}	
+															      //note that 1000,0000 is reserved for regression tag.
+	int same = sameRByte & 0x01; 											//0000,0001
+	(*this)->isLossless = (sameRByte & 0x10)>>4; 							//0001,0000
+	int isPW_REL = (sameRByte & 0x20)>>5; 									//0010,0000
+	exe_params->SZ_SIZE_TYPE = ((sameRByte & 0x40)>>6)==1?8:4; 				//0100,0000
+	//confparams_dec->randomAccess = (sameRByte & 0x02) >> 1;
+	//confparams_dec->szMode = (sameRByte & 0x06) >> 1;			//0000,0110 (in fact, this szMode could be removed because convertSZParamsToBytes will overwrite it)
+	confparams_dec->protectValueRange = (sameRByte & 0x04)>>2;				//0000,0100
+	confparams_dec->accelerate_pw_rel_compression = (sameRByte & 0x08) >> 3;//0000,1000
+
+	int errorBoundMode = ABS;
+	if(isPW_REL)
+	{
+		errorBoundMode = PW_REL;
+		segmentL = exe_params->SZ_SIZE_TYPE;
+		pwrErrBoundBytesL = 4;
+	}
+	
+	convertBytesToSZParams(&(flatBytes[index]), confparams_dec);
+	index += MetaDataByteLength;
+
+	int isRegression = (sameRByte >> 7) & 0x01;
+	
+	unsigned char dsLengthBytes[8];
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		dsLengthBytes[i] = flatBytes[index++];
+	(*this)->dataSeriesLength = bytesToSize(dsLengthBytes);// 4 or 8	
+	
+	if((*this)->isLossless==1)
+	{
+		//(*this)->exactMidBytes = flatBytes+8;
+		return errorBoundMode;
+	}
+	else if(same==1)
+	{
+		(*this)->allSameData = 1;
+		//size_t exactMidBytesLength = sizeof(double);//flatBytesLength - 3 - 1 - MetaDataByteLength -exe_params->SZ_SIZE_TYPE;
+		(*this)->exactMidBytes = &(flatBytes[index]);
+		return errorBoundMode;
+	}
+	else
+		(*this)->allSameData = 0;
+	if(isRegression == 1)
+	{
+		(*this)->raBytes_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE;
+		(*this)->raBytes = &(flatBytes[index]);
+		return errorBoundMode;
+	}			
+
+	int rtype_ = 0;//sameRByte & 0x08;		//=00001000
+	unsigned char byteBuf[8];
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	int max_quant_intervals = bytesToInt_bigEndian(byteBuf);// 4	
+	confparams_dec->maxRangeRadius = max_quant_intervals/2;
+
+	if(errorBoundMode>=PW_REL)
+	{
+		(*this)->radExpo = flatBytes[index++];//1
+		radExpoL = 1;
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			byteBuf[i] = flatBytes[index++];
+		confparams_dec->segment_size = (*this)->segment_size = bytesToSize(byteBuf);// exe_params->SZ_SIZE_TYPE	
+
+		for (i = 0; i < 4; i++)
+			byteBuf[i] = flatBytes[index++];
+		pwrErrBoundBytes_size = (*this)->pwrErrBoundBytes_size = bytesToInt_bigEndian(byteBuf);// 4		
+	}
+	else
+	{
+		pwrErrBoundBytes_size = 0;
+		(*this)->pwrErrBoundBytes = NULL;
+	}
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->intervals = bytesToInt_bigEndian(byteBuf);// 4	
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->medianValue = bytesToFloat(byteBuf); //4
+	
+	(*this)->reqLength = flatBytes[index++]; //1
+	if(isPW_REL && confparams_dec->accelerate_pw_rel_compression)
+	{
+		(*this)->plus_bits = flatBytes[index++];
+		(*this)->max_bits = flatBytes[index++];
+	}
+	
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->realPrecision = bytesToDouble(byteBuf);//8
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->typeArray_size = bytesToSize(byteBuf);// 4		
+	if(rtype_!=0)
+	{
+		for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++) 
+			byteBuf[i] = flatBytes[index++];
+		(*this)->rtypeArray_size = bytesToSize(byteBuf);//(ST)
+	}
+	else
+		(*this)->rtypeArray_size = 0;
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactDataNum = bytesToSize(byteBuf);// ST
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactMidBytes_size = bytesToSize(byteBuf);// ST
+
+	if (rtype_ != 0) {
+		if((*this)->rtypeArray_size>0)
+			(*this)->rtypeArray = (unsigned char*)malloc(sizeof(unsigned char)*(*this)->rtypeArray_size);
+		else
+			(*this)->rtypeArray = NULL;
+
+		for (i = 0; i < 4; i++)
+			byteBuf[i] = flatBytes[index++];
+		(*this)->reservedValue = bytesToFloat(byteBuf);//4
+	}
+
+	size_t logicLeadNumBitsNum = (*this)->exactDataNum * 2;
+	if (logicLeadNumBitsNum % 8 == 0)
+	{
+		(*this)->leadNumArray_size = logicLeadNumBitsNum >> 3;
+	}
+	else
+	{
+		(*this)->leadNumArray_size = (logicLeadNumBitsNum >> 3) + 1;
+	}
+
+	int minLogValueSize = 0;
+	if(errorBoundMode>=PW_REL)
+		minLogValueSize = 4;
+
+	//NOTE(review): the trailing "- 1 - 1" (plus_bits, max_bits) is subtracted unconditionally, although those two bytes are only read above when isPW_REL && accelerate_pw_rel_compression
+	if ((*this)->rtypeArray != NULL) 
+	{
+		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 4 - 1 - 8 
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - minLogValueSize - exe_params->SZ_SIZE_TYPE - 4 - (*this)->rtypeArray_size
+				- minLogValueSize - (*this)->typeArray_size - (*this)->leadNumArray_size
+				- (*this)->exactMidBytes_size - pwrErrBoundBytes_size - 1 - 1;
+		for (i = 0; i < (*this)->rtypeArray_size; i++)
+			(*this)->rtypeArray[i] = flatBytes[index++];
+	}
+	else
+	{
+		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 4 - 1 - 8 
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - minLogValueSize - (*this)->typeArray_size
+				- (*this)->leadNumArray_size - (*this)->exactMidBytes_size - pwrErrBoundBytes_size - 1 - 1;
+	}
+
+	if(errorBoundMode>=PW_REL)
+	{
+		(*this)->minLogValue = bytesToFloat(&flatBytes[index]);
+		index+=4;
+	}
+
+	//the remaining sections are not copied: the pointers below alias flatBytes
+	(*this)->typeArray = &flatBytes[index]; 
+	//retrieve the number of states (i.e., stateNum)
+	(*this)->allNodes = bytesToInt_bigEndian((*this)->typeArray); //the first 4 bytes store the stateNum
+	(*this)->stateNum = ((*this)->allNodes+1)/2;	
+
+	index+=(*this)->typeArray_size;
+	(*this)->pwrErrBoundBytes = &flatBytes[index];
+	index+=pwrErrBoundBytes_size;
+	(*this)->leadNumArray = &flatBytes[index];
+	index+=(*this)->leadNumArray_size;
+	(*this)->exactMidBytes = &flatBytes[index];
+	index+=(*this)->exactMidBytes_size;
+	(*this)->residualMidBits = &flatBytes[index];
+	//index+=(*this)->residualMidBits_size;
+	
+	return errorBoundMode;
+}
+
+/**
+ * Build the compression-side storage object for float data and store it in *this.
+ *
+ * type             dataSeriesLength entries; Huffman-encoded into typeArray
+ * exactMidBytes    exactMidBytes_size bytes; kept by pointer, not copied
+ * leadNumIntArray  exactDataNum entries; packed by convertIntArray2ByteArray_fast_2b
+ * resiMidBits      packed together with the single resiBitLength and exactDataNum by
+ *                  convertIntArray2ByteArray_fast_dynamic
+ * pwrErrBoundBytes kept by pointer, and only when errorBoundMode >= PW_REL
+ * resiMidBits_size is accepted but not used.
+ * Fields not assigned below (for example plus_bits) are left as malloc returned them.
+ */
+void new_TightDataPointStorageF(TightDataPointStorageF **this,
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char resiBitLength, 
+		double realPrecision, float medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo) {
+	TightDataPointStorageF *tdps = (TightDataPointStorageF *)malloc(sizeof(TightDataPointStorageF));
+	*this = tdps;
+
+	/* scalar fields: constants and straight copies of the arguments */
+	tdps->allSameData = 0;
+	tdps->isLossless = 0;
+	tdps->realPrecision = realPrecision;
+	tdps->medianValue = medianValue;
+	tdps->reqLength = reqLength;
+	tdps->dataSeriesLength = dataSeriesLength;
+	tdps->exactDataNum = exactDataNum;
+	tdps->intervals = intervals;
+	tdps->radExpo = radExpo;
+
+	/* rtypeArray is never produced by this constructor */
+	tdps->rtypeArray = NULL;
+	tdps->rtypeArray_size = 0;
+
+	/* encode the type codes with a Huffman tree over 2*intervals states */
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+	if(!(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression))
+		encode_withTree(huffmanTree, type, dataSeriesLength, &tdps->typeArray, &tdps->typeArray_size);
+	else /* accelerated PW_REL: the MSST19 encoder's return value is kept in max_bits */
+		tdps->max_bits = encode_withTree_MSST19(huffmanTree, type, dataSeriesLength, &tdps->typeArray, &tdps->typeArray_size);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	/* exact values: mid bytes by pointer, lead numbers and residual bits packed */
+	tdps->exactMidBytes = exactMidBytes;
+	tdps->exactMidBytes_size = exactMidBytes_size;
+	tdps->leadNumArray_size = convertIntArray2ByteArray_fast_2b(leadNumIntArray, exactDataNum, &(tdps->leadNumArray));
+	tdps->residualMidBits_size = convertIntArray2ByteArray_fast_dynamic(resiMidBits, resiBitLength, exactDataNum, &(tdps->residualMidBits));
+
+	/* point-wise error-bound bytes are attached only in PW_REL (or higher) modes */
+	tdps->pwrErrBoundBytes = (confparams_cpr->errorBoundMode>=PW_REL) ? pwrErrBoundBytes : NULL;
+	tdps->pwrErrBoundBytes_size = pwrErrBoundBytes_size;
+}
+
+/**
+ * Same construction as new_TightDataPointStorageF(), except that resiBitLength is an
+ * array of resiBitLengthSize entries (packed with convertIntArray2ByteArray_fast_dynamic2)
+ * and the type codes always go through plain encode_withTree. resiMidBits_size is unused.
+ */
+void new_TightDataPointStorageF2(TightDataPointStorageF **this,
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char* resiBitLength, size_t resiBitLengthSize, 
+		double realPrecision, float medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo) {
+	TightDataPointStorageF *tdps = (TightDataPointStorageF *)malloc(sizeof(TightDataPointStorageF));
+	*this = tdps;
+
+	/* scalar fields: constants and straight copies of the arguments */
+	tdps->allSameData = 0;
+	tdps->isLossless = 0;
+	tdps->realPrecision = realPrecision;
+	tdps->medianValue = medianValue;
+	tdps->reqLength = reqLength;
+	tdps->dataSeriesLength = dataSeriesLength;
+	tdps->exactDataNum = exactDataNum;
+	tdps->intervals = intervals;
+	tdps->radExpo = radExpo;
+
+	tdps->rtypeArray = NULL;
+	tdps->rtypeArray_size = 0;
+
+	/* Huffman-encode the type codes over 2*intervals states */
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+	encode_withTree(huffmanTree, type, dataSeriesLength, &tdps->typeArray, &tdps->typeArray_size);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	/* exact values: mid bytes by pointer, lead numbers and residual bits packed */
+	tdps->exactMidBytes = exactMidBytes;
+	tdps->exactMidBytes_size = exactMidBytes_size;
+	tdps->leadNumArray_size = convertIntArray2ByteArray_fast_2b(leadNumIntArray, exactDataNum, &(tdps->leadNumArray));
+	tdps->residualMidBits_size = convertIntArray2ByteArray_fast_dynamic2(resiMidBits, resiBitLength, resiBitLengthSize, &(tdps->residualMidBits));
+
+	/* point-wise error-bound bytes are dropped below PW_REL */
+	if(confparams_cpr->errorBoundMode<PW_REL)
+		tdps->pwrErrBoundBytes = NULL;
+	else
+		tdps->pwrErrBoundBytes = pwrErrBoundBytes;
+	tdps->pwrErrBoundBytes_size = pwrErrBoundBytes_size;
+}
+
+/**
+ * Write the flat byte stream for a float TightDataPointStorageF (the layout without
+ * rtypeArray) into the caller-provided buffer bytes. No capacity check is made here;
+ * the buffer has to be sized by the caller.
+ *
+ * @param tdps           storage object to serialise
+ * @param bytes          destination buffer
+ * @param dsLengthBytes  dataSeriesLength, already encoded on SZ_SIZE_TYPE bytes
+ * @param sameByte       flag byte, emitted right after the version bytes
+ */
+void convertTDPStoBytes_float(TightDataPointStorageF* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte)
+{
+	unsigned char *out = bytes;	/* write cursor */
+	unsigned char scratch[8];	/* staging area for one encoded scalar */
+
+	/* version (3 bytes) + flag byte (1) */
+	*out++ = versionNumber[0];
+	*out++ = versionNumber[1];
+	*out++ = versionNumber[2];
+	*out++ = sameByte;
+
+	/* compression parameters, in a slot of MetaDataByteLength bytes */
+	convertSZParamsToBytes(confparams_cpr, out);
+	out += MetaDataByteLength;
+
+	/* data series length (ST = SZ_SIZE_TYPE bytes) */
+	memcpy(out, dsLengthBytes, exe_params->SZ_SIZE_TYPE);
+	out += exe_params->SZ_SIZE_TYPE;
+
+	/* max_quant_intervals (4) */
+	intToBytes_bigEndian(scratch, confparams_cpr->max_quant_intervals);
+	memcpy(out, scratch, 4);
+	out += 4;
+
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		/* PW_REL-only header: radExpo (1), segment_size (ST), pwrErrBoundBytes_size (4) */
+		*out++ = tdps->radExpo;
+		sizeToBytes(scratch, confparams_cpr->segment_size);
+		memcpy(out, scratch, exe_params->SZ_SIZE_TYPE);
+		out += exe_params->SZ_SIZE_TYPE;
+		intToBytes_bigEndian(scratch, tdps->pwrErrBoundBytes_size);
+		memcpy(out, scratch, 4);
+		out += 4;
+	}
+
+	/* intervals (4), medianValue (4), reqLength (1) */
+	intToBytes_bigEndian(scratch, tdps->intervals);
+	memcpy(out, scratch, 4);
+	out += 4;
+	floatToBytes(scratch, tdps->medianValue);
+	memcpy(out, scratch, 4);
+	out += 4;
+	*out++ = tdps->reqLength;
+
+	/* accelerated PW_REL (MSST19) carries two extra bytes */
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+	{
+		*out++ = tdps->plus_bits;
+		*out++ = tdps->max_bits;
+	}
+
+	/* realPrecision (8) */
+	doubleToBytes(scratch, tdps->realPrecision);
+	memcpy(out, scratch, 8);
+	out += 8;
+
+	/* section sizes, ST bytes each: typeArray_size, exactDataNum, exactMidBytes_size */
+	sizeToBytes(scratch, tdps->typeArray_size);
+	memcpy(out, scratch, exe_params->SZ_SIZE_TYPE);
+	out += exe_params->SZ_SIZE_TYPE;
+	sizeToBytes(scratch, tdps->exactDataNum);
+	memcpy(out, scratch, exe_params->SZ_SIZE_TYPE);
+	out += exe_params->SZ_SIZE_TYPE;
+	sizeToBytes(scratch, tdps->exactMidBytes_size);
+	memcpy(out, scratch, exe_params->SZ_SIZE_TYPE);
+	out += exe_params->SZ_SIZE_TYPE;
+
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		/* minLogValue (4), PW_REL and higher modes only */
+		floatToBytes(scratch, tdps->minLogValue);
+		memcpy(out, scratch, 4);
+		out += 4;
+	}
+
+	/* payload sections, in stream order */
+	memcpy(out, tdps->typeArray, tdps->typeArray_size);
+	out += tdps->typeArray_size;
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		memcpy(out, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size);
+		out += tdps->pwrErrBoundBytes_size;
+	}
+	memcpy(out, tdps->leadNumArray, tdps->leadNumArray_size);
+	out += tdps->leadNumArray_size;
+	memcpy(out, tdps->exactMidBytes, tdps->exactMidBytes_size);
+	out += tdps->exactMidBytes_size;
+	if(tdps->residualMidBits!=NULL)
+		memcpy(out, tdps->residualMidBits, tdps->residualMidBits_size);
+}
+
+/**
+ * Deprecated. Serialise a float TightDataPointStorageF in the layout that carries
+ * rtypeArray and reservedValue, writing into the caller-supplied buffer bytes
+ * (no capacity check). Statement order below is the byte order of the stream.
+ */
+void convertTDPStoBytes_float_reserve(TightDataPointStorageF* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte)
+{
+	size_t i, k = 0;
+	unsigned char intervalsBytes[4];
+	unsigned char typeArrayLengthBytes[8];
+	unsigned char rTypeLengthBytes[8];
+	unsigned char exactLengthBytes[8];
+	unsigned char exactMidBytesLength[8];
+	unsigned char realPrecisionBytes[8];
+	unsigned char reservedValueBytes[4];
+	unsigned char medianValueBytes[4];
+	unsigned char segment_sizeBytes[8];
+	unsigned char pwrErrBoundBytes_sizeBytes[4];
+	unsigned char max_quant_intervals_Bytes[4];	
+	
+	for(i = 0;i<3;i++)//3
+		bytes[k++] = versionNumber[i];		
+	bytes[k++] = sameByte;			//1
+
+	convertSZParamsToBytes(confparams_cpr, &(bytes[k]));
+	k = k + MetaDataByteLength;
+	
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = dsLengthBytes[i];		
+
+	intToBytes_bigEndian(max_quant_intervals_Bytes, confparams_cpr->max_quant_intervals);
+	for(i = 0;i<4;i++)//4
+		bytes[k++] = max_quant_intervals_Bytes[i];
+
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		bytes[k++] = tdps->radExpo; //1 byte			
+		sizeToBytes(segment_sizeBytes, confparams_cpr->segment_size);
+		for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+			bytes[k++] = segment_sizeBytes[i];				
+			
+		intToBytes_bigEndian(pwrErrBoundBytes_sizeBytes, tdps->pwrErrBoundBytes_size);
+		for(i = 0;i<4;i++)//4
+			bytes[k++] = pwrErrBoundBytes_sizeBytes[i];					
+	}
+	
+	intToBytes_bigEndian(intervalsBytes, tdps->intervals);
+	for(i = 0;i<4;i++)//4
+		bytes[k++] = intervalsBytes[i];	
+
+	floatToBytes(medianValueBytes, tdps->medianValue);
+	for (i = 0; i < 4; i++)// 4
+		bytes[k++] = medianValueBytes[i];		
+
+	bytes[k++] = tdps->reqLength; //1 byte
+	//realPrecision occupies 8 bytes in the stream, so all 8 must come from doubleToBytes (as in convertTDPStoBytes_float)
+	doubleToBytes(realPrecisionBytes, tdps->realPrecision);
+	for (i = 0; i < 8; i++)// 8
+		bytes[k++] = realPrecisionBytes[i];
+
+	sizeToBytes(typeArrayLengthBytes, tdps->typeArray_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = typeArrayLengthBytes[i];
+
+	sizeToBytes(rTypeLengthBytes, tdps->rtypeArray_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = rTypeLengthBytes[i];
+
+	sizeToBytes(exactLengthBytes, tdps->exactDataNum);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = exactLengthBytes[i];
+
+	sizeToBytes(exactMidBytesLength, tdps->exactMidBytes_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = exactMidBytesLength[i];
+
+	floatToBytes(reservedValueBytes, tdps->reservedValue);
+	for (i = 0; i < 4; i++)// 4
+		bytes[k++] = reservedValueBytes[i];
+
+	memcpy(&(bytes[k]), tdps->rtypeArray, tdps->rtypeArray_size);
+	k += tdps->rtypeArray_size;
+	
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		floatToBytes(exactMidBytesLength, tdps->minLogValue); //exactMidBytesLength is only a scratch buffer here
+		for(i=0;i<4;i++)
+			bytes[k++] = exactMidBytesLength[i];
+	}	
+	
+	memcpy(&(bytes[k]), tdps->typeArray, tdps->typeArray_size);
+	k += tdps->typeArray_size;
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		memcpy(&(bytes[k]), tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size);
+		k += tdps->pwrErrBoundBytes_size;
+	}
+	memcpy(&(bytes[k]), tdps->leadNumArray, tdps->leadNumArray_size);
+	k += tdps->leadNumArray_size;
+	memcpy(&(bytes[k]), tdps->exactMidBytes, tdps->exactMidBytes_size);
+	k += tdps->exactMidBytes_size;
+	if(tdps->residualMidBits!=NULL)
+	{
+		memcpy(&(bytes[k]), tdps->residualMidBits, tdps->residualMidBits_size);
+		k += tdps->residualMidBits_size;
+	}	
+}
+
+/**
+ * Serialise a TightDataPointStorageF into a flat byte buffer allocated here with malloc.
+ *
+ * @param tdps   storage object to serialise
+ * @param bytes  out: the malloc'ed buffer (the caller frees it); NULL for the unsupported layout
+ * @param size   out: number of bytes stored in *bytes; 0 for the unsupported layout
+ */
+void convertTDPStoFlatBytes_float(TightDataPointStorageF *tdps, unsigned char** bytes, size_t *size)
+{
+	size_t i, k = 0; 
+	unsigned char dsLengthBytes[8];
+	//dataSeriesLength is encoded on SZ_SIZE_TYPE (4 or 8) bytes
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+	//flag byte: bit 0 = all-same data, further bits are OR-ed in below
+	unsigned char sameByte = tdps->allSameData==1?(unsigned char)1:(unsigned char)0; //0000,0001
+	//sameByte = sameByte | (confparams_cpr->szMode << 1);  //0000,0110 (no need because of convertSZParamsToBytes
+	if(tdps->isLossless)
+		sameByte = (unsigned char) (sameByte | 0x10);  // 0001,0000
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		sameByte = (unsigned char) (sameByte | 0x20); // 0010,0000, the 5th bit
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte = (unsigned char) (sameByte | 0x40); // 0100,0000, the 6th bit
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+		sameByte = (unsigned char) (sameByte | 0x08); //0000,1000
+	if(confparams_cpr->protectValueRange)
+		sameByte = (unsigned char) (sameByte | 0x04); //0000,0100
+	
+	if(tdps->allSameData==1)
+	{
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactMidBytes_size;
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+		for (i = 0; i < 3; i++)//3
+			(*bytes)[k++] = versionNumber[i];
+		(*bytes)[k++] = sameByte;
+		convertSZParamsToBytes(confparams_cpr, &((*bytes)[k]));
+		k = k + MetaDataByteLength;
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			(*bytes)[k++] = dsLengthBytes[i];
+		for (i = 0; i < tdps->exactMidBytes_size; i++)
+			(*bytes)[k++] = tdps->exactMidBytes[i];
+		*size = totalByteLength;
+	}
+	else if (tdps->rtypeArray == NULL)
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		int minLogValueSize = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{			
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+			minLogValueSize = 4;
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 4 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + minLogValueSize
+				+ tdps->typeArray_size + tdps->leadNumArray_size 
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+		if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+			totalByteLength += (1+1); // for MSST19
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+		convertTDPStoBytes_float(tdps, *bytes, dsLengthBytes, sameByte);
+		*size = totalByteLength;
+	}
+	else //the case with reserved value
+	{
+		//TODO: the reserved-value layout is not implemented; report "no output" instead of leaving the out-parameters untouched
+		*bytes = NULL;
+		*size = 0;
+	}
+}
+
+/**
+ * Counterpart of convertTDPStoFlatBytes_float() that writes into the caller-provided
+ * buffer bytes instead of allocating one; no capacity check is made on bytes.
+ * *size receives the stream length in bytes (0 for the unimplemented reserved-value layout).
+ */
+void convertTDPStoFlatBytes_float_args(TightDataPointStorageF *tdps, unsigned char* bytes, size_t *size)
+{
+	size_t i, k = 0; 
+	unsigned char dsLengthBytes[8];
+
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+	unsigned char sameByte = tdps->allSameData==1?(unsigned char)1:(unsigned char)0;
+	sameByte = sameByte | (confparams_cpr->szMode << 1); // NOTE(review): for szMode >= 2 this also sets 0x04, which new_TightDataPointStorageF_fromFlatBytes decodes as protectValueRange; confirm intended
+	if(tdps->isLossless)
+		sameByte = (unsigned char) (sameByte | 0x10);
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		sameByte = (unsigned char) (sameByte | 0x20); // 00100000, the 5th bit
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte = (unsigned char) (sameByte | 0x40); // 01000000, the 6th bit
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+		sameByte = (unsigned char) (sameByte | 0x08); 	
+
+	if(tdps->allSameData==1)
+	{
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactMidBytes_size;
+		for (i = 0; i < 3; i++)//3
+			bytes[k++] = versionNumber[i];
+		bytes[k++] = sameByte;
+		convertSZParamsToBytes(confparams_cpr, &(bytes[k]));
+		k = k + MetaDataByteLength;
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			bytes[k++] = dsLengthBytes[i];		
+		for (i = 0; i < tdps->exactMidBytes_size; i++)
+			bytes[k++] = tdps->exactMidBytes[i];
+		*size = totalByteLength;
+	}
+	else if (tdps->rtypeArray == NULL)
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0, minLogValueSize = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{			
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+			minLogValueSize = 4; //convertTDPStoBytes_float writes a 4-byte minLogValue in these modes
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 4 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + minLogValueSize
+				+ tdps->typeArray_size + tdps->leadNumArray_size 
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+		if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+			totalByteLength += (1+1); // for MSST19
+		convertTDPStoBytes_float(tdps, bytes, dsLengthBytes, sameByte);
+		*size = totalByteLength;
+	}
+	else //the case with reserved value
+	{
+		//TODO: the reserved-value layout is not implemented, so nothing is written
+		*size = 0;
+	}
+}
+
+/**
+ * Release a TightDataPointStorageF built on the compression side: the buffers
+ * its members point at are freed first, then the struct itself.
+ *
+ * Members that are NULL are simply skipped, because free(NULL) does nothing.
+ * tdps itself must be a valid pointer.
+ */
+void free_TightDataPointStorageF(TightDataPointStorageF *tdps)
+{
+	/* buffers referenced by the struct */
+	free(tdps->rtypeArray);
+	free(tdps->typeArray);
+	free(tdps->leadNumArray);
+	free(tdps->exactMidBytes);
+	free(tdps->residualMidBits);
+	free(tdps->pwrErrBoundBytes);
+	/* finally the struct itself */
+	free(tdps);
+}
+
+/**
+ * Decompression-side release: frees only the TightDataPointStorageF struct itself, none of the buffers its members point at.
+ * */
+void free_TightDataPointStorageF2(TightDataPointStorageF *tdps)
+{			
+	free(tdps);
+}
diff --git a/deps/SZ/sz/src/TightDataPointStorageI.c b/deps/SZ/sz/src/TightDataPointStorageI.c
new file mode 100644
index 0000000000000000000000000000000000000000..569f57987e226dcc551ff891be1508984adec032
--- /dev/null
+++ b/deps/SZ/sz/src/TightDataPointStorageI.c
@@ -0,0 +1,463 @@
+/**
+ *  @file TightDataPointStorageI.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief The functions used to construct the tightPointDataStorage element for storing compressed bytes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "Huffman.h"
+//#include "rw.h"
+
+/**
+ * Compute how many bits separate the full width of an integer type from the
+ * exactByteSize bytes that are kept for it.
+ *
+ * @param exactByteSize number of bytes kept per value
+ * @param dataType      SZ integer type code (SZ_INT8 ... SZ_UINT64)
+ * @return (type width in bits) - exactByteSize*8, or 0 for any other dataType
+ */
+int computeRightShiftBits(int exactByteSize, int dataType)
+{
+	int typeBits;
+
+	if(dataType == SZ_INT8 || dataType == SZ_UINT8)
+		typeBits = 8;
+	else if(dataType == SZ_INT16 || dataType == SZ_UINT16)
+		typeBits = 16;
+	else if(dataType == SZ_INT32 || dataType == SZ_UINT32)
+		typeBits = 32;
+	else if(dataType == SZ_INT64 || dataType == SZ_UINT64)
+		typeBits = 64;
+	else
+		return 0; /* not an integer type code: no shift */
+
+	return typeBits - exactByteSize*8;
+}
+
+/**
+ * Translate a 2-bit size code into a size in bytes.
+ *
+ * @param dataTypeSizeCode code 0, 1, 2 or 3
+ * @return 1, 2, 4 or 8 respectively; 0 for any other code
+ */
+int convertDataTypeSizeCode(int dataTypeSizeCode)
+{
+	if(dataTypeSizeCode < 0 || dataTypeSizeCode > 3)
+		return 0;
+	/* the byte size is 2^code */
+	return 1 << dataTypeSizeCode;
+}
+
+/**
+ * Translate a size in bytes into the flag-byte bits that encode it.
+ *
+ * The result is a 2-bit code already shifted into bits 2-3:
+ * 1 -> 0 (0000), 2 -> 4 (0100), 4 -> 8 (1000), 8 -> 12 (1100).
+ *
+ * @param dataTypeSize size in bytes
+ * @return the shifted code, or 0 for any other size
+ */
+int convertDataTypeSize(int dataTypeSize)
+{
+	int code = 0;
+
+	if(dataTypeSize == 2)
+		code = 1;
+	else if(dataTypeSize == 4)
+		code = 2;
+	else if(dataTypeSize == 8)
+		code = 3;
+
+	return code << 2;
+}
+
+/**
+ * Allocate a TightDataPointStorageI and reset its scalar fields to 0 and its
+ * two buffer pointers to NULL.
+ *
+ * Fields that are not assigned below (for example dataTypeSize, allNodes and
+ * stateNum, which other functions in this file use) are left unset.
+ * The malloc result is not checked.
+ *
+ * @param this receives the newly allocated object
+ */
+void new_TightDataPointStorageI_Empty(TightDataPointStorageI **this)
+{
+	TightDataPointStorageI *tdps = (TightDataPointStorageI*)malloc(sizeof(TightDataPointStorageI));
+	*this = tdps;
+
+	/* scalar header fields */
+	tdps->dataSeriesLength = 0;
+	tdps->allSameData = 0;
+	tdps->exactDataNum = 0;
+	tdps->realPrecision = 0;
+	tdps->minValue = 0;
+	tdps->exactByteSize = 0;
+	tdps->intervals = 0;
+	tdps->isLossless = 0;
+
+	/* buffers start out empty */
+	tdps->typeArray = NULL;
+	tdps->typeArray_size = 0;
+	tdps->exactDataBytes = NULL;
+	tdps->exactDataBytes_size = 0;
+}
+
+/**
+ * Parse a serialized integer stream into a freshly allocated
+ * TightDataPointStorageI.
+ *
+ * Fields are read from flatBytes in this order:
+ *   3 version bytes, 1 flag byte, MetaDataByteLength bytes of parameters,
+ *   1 byte exactByteSize (only when the "same" flag is clear),
+ *   dataSeriesLength (SZ_SIZE_TYPE bytes);
+ * and, unless the stream is lossless or all-same:
+ *   max_quant_intervals (4), intervals (4), minValue (8), realPrecision (8),
+ *   typeArray_size, exactDataNum, exactDataBytes_size (SZ_SIZE_TYPE bytes each),
+ *   the typeArray bytes and finally the exactDataBytes bytes.
+ *
+ * typeArray and exactDataBytes are set to point INTO flatBytes; nothing is
+ * copied. flatBytesLength is never read, so no bounds checking is done.
+ *
+ * Side effects: overwrites exe_params->SZ_SIZE_TYPE, allocates confparams_dec
+ * when it is NULL, fills it through convertBytesToSZParams and sets its
+ * maxRangeRadius. On a version mismatch the process is terminated with exit(0).
+ *
+ * @param this            receives the new object
+ * @param flatBytes       serialized stream
+ * @param flatBytesLength length of flatBytes (currently unused)
+ * @return errorBoundMode, which is initialised to ABS and never reassigned here
+ */
+int new_TightDataPointStorageI_fromFlatBytes(TightDataPointStorageI **this, unsigned char* flatBytes, size_t flatBytesLength)
+{
+	new_TightDataPointStorageI_Empty(this);
+	size_t i, index = 0;
+	char version[3];
+	for (i = 0; i < 3; i++)
+		version[i] = flatBytes[index++]; //3
+	unsigned char sameRByte = flatBytes[index++]; //1
+	if(checkVersion2(version)!=1)
+	{
+		//wrong version
+		printf("Wrong version: \nCompressed-data version (%d.%d.%d)\n",version[0], version[1], version[2]);
+		printf("Current sz version: (%d.%d.%d)\n", versionNumber[0], versionNumber[1], versionNumber[2]);
+		printf("Please double-check if the compressed data (or file) is correct.\n");
+		exit(0);
+	}
+	// flag byte: bit 0 = all-same data, bits 2-3 = data type size code,
+	// bit 4 = lossless, bit 6 = 8-byte size fields
+	int same = sameRByte & 0x01;
+	//conf_params->szMode = (sameRByte & 0x06)>>1;
+	int dataByteSizeCode = (sameRByte & 0x0C)>>2;
+	// NOTE(review): the decoded byte size is discarded; dataTypeSize is not set here
+	convertDataTypeSizeCode(dataByteSizeCode); //in bytes
+	(*this)->isLossless = (sameRByte & 0x10)>>4;
+
+	exe_params->SZ_SIZE_TYPE = ((sameRByte & 0x40)>>6)==1?8:4;
+	int errorBoundMode = ABS;
+	
+	if(confparams_dec==NULL)
+	{
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+		memset(confparams_dec, 0, sizeof(sz_params));
+	}	
+	convertBytesToSZParams(&(flatBytes[index]), confparams_dec);
+	/*sz_params* params = convertBytesToSZParams(&(flatBytes[index]));
+	int mode = confparams_dec->szMode;
+	int losslessCompressor = confparams_dec->losslessCompressor;
+	if(confparams_dec!=NULL)
+		free(confparams_dec);
+	confparams_dec = params;
+	confparams_dec->szMode = mode;
+	confparams_dec->losslessCompressor = losslessCompressor;*/
+	
+	index += MetaDataByteLength; //20	
+	
+	if(same==0)
+		(*this)->exactByteSize = flatBytes[index++]; //1
+	
+	unsigned char dsLengthBytes[8];
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		dsLengthBytes[i] = flatBytes[index++];
+	(*this)->dataSeriesLength = bytesToSize(dsLengthBytes);// ST
+	if((*this)->isLossless==1)
+	{
+		//(*this)->exactMidBytes = flatBytes+8;
+		return errorBoundMode;
+	}
+	else if(same==1)
+	{
+		// all values identical: the remaining bytes are the exact data, used in place
+		(*this)->allSameData = 1;
+		(*this)->exactDataBytes = &(flatBytes[index]);
+		return errorBoundMode;
+	}
+	else
+		(*this)->allSameData = 0;
+
+	unsigned char byteBuf[8];
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	int max_quant_intervals = bytesToInt_bigEndian(byteBuf);// 4	
+
+	confparams_dec->maxRangeRadius = max_quant_intervals/2;
+
+	// errorBoundMode is still ABS at this point, so this guard only fires if ABS >= PW_REL
+	if(errorBoundMode>=PW_REL)
+	{
+		printf("Error: errorBoundMode>=PW_REL in new_TightDataPointStorageI_fromFlatBytes!! Wrong...\n");
+		exit(0);
+	}
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->intervals = bytesToInt_bigEndian(byteBuf);// 4	
+
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->minValue = bytesToLong_bigEndian(byteBuf); //8
+		
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->realPrecision = bytesToDouble(byteBuf);//8
+	
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->typeArray_size = bytesToSize(byteBuf);// ST		
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactDataNum = bytesToSize(byteBuf);// ST
+	
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactDataBytes_size = bytesToSize(byteBuf);// ST		
+
+
+	// the type array is used in place, directly inside flatBytes
+	(*this)->typeArray = &flatBytes[index];
+	//retrieve the number of states (i.e., stateNum)
+	(*this)->allNodes = bytesToInt_bigEndian((*this)->typeArray); //the first 4 bytes store the stateNum
+	(*this)->stateNum = ((*this)->allNodes+1)/2;		
+
+	index+=(*this)->typeArray_size;
+	
+	if((*this)->exactDataBytes_size > 0)
+	{	
+		(*this)->exactDataBytes = &flatBytes[index];
+		index+=(*this)->exactDataBytes_size*sizeof(char);	
+	}
+	else
+		(*this)->exactDataBytes = NULL;	
+	return errorBoundMode;
+}
+
+/**
+ * Build a TightDataPointStorageI from the results of integer compression.
+ *
+ * The quantization codes in type (dataSeriesLength entries) are Huffman-encoded
+ * with a tree of 2*intervals states; the encoded bytes and their length are
+ * stored in typeArray / typeArray_size. exactDataBytes is stored as given (the
+ * pointer is kept, not copied).
+ *
+ * dataTypeSize is set to 1, 2, 4 or 8 for the SZ_INT8 ... SZ_UINT64 type codes
+ * and is left unset for any other dataType. The malloc result is not checked.
+ *
+ * @param this                receives the newly allocated object
+ * @param dataSeriesLength    number of entries in type
+ * @param exactDataNum        stored as-is in exactDataNum
+ * @param byteSize            stored as-is in exactByteSize
+ * @param type                quantization codes to encode
+ * @param exactDataBytes      buffer whose length is exactDataBytes_size
+ * @param exactDataBytes_size length of exactDataBytes in bytes
+ * @param realPrecision       stored as-is
+ * @param minValue            stored as-is
+ * @param intervals           stored as-is; also sizes the Huffman tree
+ * @param dataType            SZ integer type code
+ */
+void new_TightDataPointStorageI(TightDataPointStorageI **this,
+		size_t dataSeriesLength, size_t exactDataNum, int byteSize, 
+		int* type, unsigned char* exactDataBytes, size_t exactDataBytes_size,
+		double realPrecision, long minValue, int intervals, int dataType) 
+{
+	TightDataPointStorageI *tdps = (TightDataPointStorageI *)malloc(sizeof(TightDataPointStorageI));
+	*this = tdps;
+
+	tdps->allSameData = 0;
+	tdps->isLossless = 0;
+	tdps->realPrecision = realPrecision;
+	tdps->minValue = minValue;
+	tdps->intervals = intervals;
+
+	if(dataType == SZ_INT8 || dataType == SZ_UINT8)
+		tdps->dataTypeSize = 1;
+	else if(dataType == SZ_INT16 || dataType == SZ_UINT16)
+		tdps->dataTypeSize = 2;
+	else if(dataType == SZ_INT32 || dataType == SZ_UINT32)
+		tdps->dataTypeSize = 4;
+	else if(dataType == SZ_INT64 || dataType == SZ_UINT64)
+		tdps->dataTypeSize = 8;
+	/* any other dataType leaves dataTypeSize unset, as before */
+
+	tdps->dataSeriesLength = dataSeriesLength;
+	tdps->exactDataNum = exactDataNum;
+	tdps->exactByteSize = byteSize;
+	tdps->exactDataBytes = exactDataBytes;
+	tdps->exactDataBytes_size = exactDataBytes_size;
+
+	/* Huffman-encode the quantization codes into typeArray */
+	HuffmanTree* tree = createHuffmanTree(2*intervals);
+	encode_withTree(tree, type, dataSeriesLength, &tdps->typeArray, &tdps->typeArray_size);
+	SZ_ReleaseHuffman(tree);
+}
+
+/**
+ * Serialize a (non all-same) TightDataPointStorageI into a caller-provided
+ * buffer.
+ *
+ * Bytes are written in this order: 3 version bytes, the flag byte,
+ * MetaDataByteLength bytes of compression parameters, exactByteSize (1),
+ * dataSeriesLength (SZ_SIZE_TYPE), max_quant_intervals (4), intervals (4),
+ * minValue (8), realPrecision (8), typeArray_size, exactDataNum and
+ * exactDataBytes_size (SZ_SIZE_TYPE each), then the typeArray bytes and the
+ * exactDataBytes bytes.
+ *
+ * @param tdps     storage object to serialize
+ * @param bytes    destination; the caller must have sized it for all of the above
+ * @param sameByte flag byte to store after the version
+ */
+void convertTDPStoBytes_int(TightDataPointStorageI* tdps, unsigned char* bytes, unsigned char sameByte)
+{
+	size_t i, k = 0;
+	
+	unsigned char byteBuffer[8] = {0,0,0,0,0,0,0,0};
+	
+	for(i = 0;i<3;i++)//3 bytes
+		bytes[k++] = versionNumber[i];
+	bytes[k++] = sameByte;	//1	byte
+	
+	convertSZParamsToBytes(confparams_cpr, &(bytes[k]));
+	k = k + MetaDataByteLength;	
+		
+	bytes[k++] = tdps->exactByteSize; //1 byte
+
+	sizeToBytes(byteBuffer, tdps->dataSeriesLength);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST: 4 or 8 bytes
+		bytes[k++] = byteBuffer[i];	
+	
+	intToBytes_bigEndian(byteBuffer, confparams_cpr->max_quant_intervals);
+	for(i = 0;i<4;i++)//4
+		bytes[k++] = byteBuffer[i];
+	
+	intToBytes_bigEndian(byteBuffer, tdps->intervals);
+	for(i = 0;i<4;i++)//4
+		bytes[k++] = byteBuffer[i];			
+	
+	longToBytes_bigEndian(byteBuffer, tdps->minValue);
+	for (i = 0; i < 8; i++)// 8
+		bytes[k++] = byteBuffer[i];
+
+	doubleToBytes(byteBuffer, tdps->realPrecision);
+	for (i = 0; i < 8; i++)// 8
+		bytes[k++] = byteBuffer[i];			
+
+	sizeToBytes(byteBuffer, tdps->typeArray_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = byteBuffer[i];
+
+	sizeToBytes(byteBuffer, tdps->exactDataNum);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = byteBuffer[i];
+
+	sizeToBytes(byteBuffer, tdps->exactDataBytes_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = byteBuffer[i];
+
+	/*
+	 * memcpy requires valid pointers even for a zero-length copy, and an
+	 * empty payload may be represented by a NULL buffer, so empty payloads
+	 * are skipped instead of being passed to memcpy.
+	 */
+	if(tdps->typeArray_size > 0)
+	{
+		memcpy(&(bytes[k]), tdps->typeArray, tdps->typeArray_size);
+		k += tdps->typeArray_size;
+	}
+
+	if(tdps->exactDataBytes_size > 0)
+		memcpy(&(bytes[k]), tdps->exactDataBytes, tdps->exactDataBytes_size);
+}
+
+/**
+ * Serialize a TightDataPointStorageI into a newly allocated byte buffer.
+ *
+ * Flag byte: bit 0 = all-same data, szMode shifted left by one, 0x10 =
+ * lossless, the data-type size code from convertDataTypeSize, 0x40 = 8-byte
+ * size fields.
+ *
+ * All-same data is written as version (3), flag byte (1), parameters
+ * (MetaDataByteLength), dataSeriesLength (SZ_SIZE_TYPE) and the exact data
+ * bytes. Anything else is delegated to convertTDPStoBytes_int. If
+ * errorBoundMode >= PW_REL in that case, an error is printed and the process
+ * exits with status 0.
+ *
+ * @param tdps  storage object to serialize
+ * @param bytes receives the malloc'ed buffer (result not checked); caller frees
+ * @param size  receives the number of bytes in *bytes
+ */
+void convertTDPStoFlatBytes_int(TightDataPointStorageI *tdps, unsigned char** bytes, size_t *size)
+{
+	unsigned char dsLengthBytes[8];
+	unsigned char flags;
+	unsigned char *out;
+	size_t n, pos = 0, total;
+
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+
+	/* assemble the flag byte */
+	flags = (tdps->allSameData==1) ? (unsigned char)1 : (unsigned char)0;
+	flags = flags | (confparams_cpr->szMode << 1);
+	if(tdps->isLossless)
+		flags |= 0x10;
+	flags |= convertDataTypeSize(tdps->dataTypeSize);
+	if(exe_params->SZ_SIZE_TYPE==8)
+		flags |= 0x40; // 01000000, the 6th bit
+
+	if(tdps->allSameData!=1)
+	{
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{
+			printf("Error: errorBoundMode >= PW_REL!! can't be...\n");
+			exit(0);
+		}
+
+		total = 3 + 1 + MetaDataByteLength + 1 + exe_params->SZ_SIZE_TYPE + 4 + 4 + 8 + 8
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE
+				+ tdps->typeArray_size + tdps->exactDataBytes_size;
+
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*total);
+		convertTDPStoBytes_int(tdps, *bytes, flags);
+		*size = total;
+		return;
+	}
+
+	/* all values identical: short header followed by the exact data */
+	total = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactDataBytes_size;
+	out = (unsigned char *)malloc(sizeof(unsigned char)*total);
+	*bytes = out;
+
+	for (n = 0; n < 3; n++)//3
+		out[pos++] = versionNumber[n];
+	out[pos++] = flags;//1
+
+	convertSZParamsToBytes(confparams_cpr, &(out[pos]));
+	pos += MetaDataByteLength;
+
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		out[pos++] = dsLengthBytes[n];
+
+	for (n = 0; n < tdps->exactDataBytes_size; n++)
+		out[pos++] = tdps->exactDataBytes[n];
+
+	*size = total;
+}
+
+/**
+ * Serialize a TightDataPointStorageI into a caller-provided buffer.
+ *
+ * Same stream layout as convertTDPStoFlatBytes_int, but nothing is allocated:
+ * bytes must already be large enough for the number of bytes reported in *size.
+ *
+ * NOTE(review): unlike convertTDPStoFlatBytes_int, the flag byte built here
+ * does not include the convertDataTypeSize(tdps->dataTypeSize) bits. Confirm
+ * whether that difference is intended.
+ *
+ * If the data is not all-same and errorBoundMode >= PW_REL, an error is
+ * printed and the process exits with status 0.
+ *
+ * @param tdps  storage object to serialize
+ * @param bytes destination buffer owned by the caller
+ * @param size  receives the number of bytes written
+ */
+void convertTDPStoFlatBytes_int_args(TightDataPointStorageI *tdps, unsigned char* bytes, size_t *size)
+{
+	unsigned char dsLengthBytes[8];
+	unsigned char flags;
+	size_t n, pos = 0, total;
+
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+
+	/* assemble the flag byte */
+	flags = (tdps->allSameData==1) ? (unsigned char)1 : (unsigned char)0;
+	flags = flags | (confparams_cpr->szMode << 1);
+	if(tdps->isLossless)
+		flags |= 0x10;
+	if(exe_params->SZ_SIZE_TYPE==8)
+		flags |= 0x40; // 01000000, the 6th bit
+
+	if(tdps->allSameData!=1)
+	{
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{
+			printf("Error: errorBoundMode>=PW_REL!! can't be....\n");
+			exit(0);
+		}
+
+		total = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + 4 + 4 + 8 + 8
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE
+				+ tdps->typeArray_size + tdps->exactDataBytes_size;
+
+		convertTDPStoBytes_int(tdps, bytes, flags);
+		*size = total;
+		return;
+	}
+
+	/* all values identical: short header followed by the exact data */
+	total = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactDataBytes_size;
+
+	for (n = 0; n < 3; n++)//3
+		bytes[pos++] = versionNumber[n];
+	bytes[pos++] = flags;//1
+
+	convertSZParamsToBytes(confparams_cpr, &(bytes[pos]));
+	pos += MetaDataByteLength;
+
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)//ST
+		bytes[pos++] = dsLengthBytes[n];
+	for (n = 0; n < tdps->exactDataBytes_size; n++)
+		bytes[pos++] = tdps->exactDataBytes[n];
+
+	*size = total;
+}
+
+/**
+ * Destroy a TightDataPointStorageI together with its typeArray and
+ * exactDataBytes buffers.
+ *
+ * @param tdps storage object to destroy; it is dereferenced, so it must not be NULL
+ */
+void free_TightDataPointStorageI(TightDataPointStorageI *tdps)
+{
+	/* free(NULL) is a no-op, so unset buffers need no guard */
+	free(tdps->typeArray);
+	free(tdps->exactDataBytes);
+	free(tdps);
+}
+
+/**
+ * Release only the TightDataPointStorageI structure itself.
+ *
+ * typeArray and exactDataBytes are left alone. An object built by
+ * new_TightDataPointStorageI_fromFlatBytes has both pointing into the input
+ * stream, so they must not be freed through it.
+ *
+ * @param tdps storage object to release
+ */
+void free_TightDataPointStorageI2(TightDataPointStorageI *tdps)
+{
+	free(tdps);
+}
+
+
diff --git a/deps/SZ/sz/src/TypeManager.c b/deps/SZ/sz/src/TypeManager.c
new file mode 100644
index 0000000000000000000000000000000000000000..cf99a170c354906512972c890e1d07435cdbc0d3
--- /dev/null
+++ b/deps/SZ/sz/src/TypeManager.c
@@ -0,0 +1,503 @@
+/**
+ *  @file TypeManager.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief TypeManager is used to manage the type array: parsing of the bytes and other types in between.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "DynamicByteArray.h"
+#include "sz.h"
+
+//int convertIntArray2ByteArray_fast_8b()
+
+/**
+ * Pack an array of 0/1 flags into bytes, eight flags per byte, first flag in
+ * the most significant bit. Only the value 1 sets a bit; every other value
+ * is stored as 0. Unused bits of the last byte are 0.
+ *
+ * @param intArray       flags, one per element
+ * @param intArrayLength number of flags
+ * @param result         receives a malloc'ed buffer (NULL when there are no flags)
+ * @return number of bytes in *result
+ */
+size_t convertIntArray2ByteArray_fast_1b(unsigned char* intArray, size_t intArrayLength, unsigned char **result)
+{
+	size_t byteLength = intArrayLength/8;
+	size_t outIdx, next = 0;
+
+	if(intArrayLength%8!=0)
+		byteLength++;
+
+	*result = (byteLength>0) ? (unsigned char*)malloc(byteLength*sizeof(unsigned char)) : NULL;
+
+	for(outIdx = 0; outIdx < byteLength; outIdx++)
+	{
+		unsigned char packed = 0;
+		unsigned char mask;
+		/* walk the mask from the top bit down to the bottom bit */
+		for(mask = 0x80; mask != 0 && next < intArrayLength; mask >>= 1)
+		{
+			if(intArray[next] == 1)
+				packed |= mask;
+			next++;
+		}
+		(*result)[outIdx] = packed;
+	}
+	return byteLength;
+}
+
+/**
+ * Pack an array of 0/1 flags into a caller-provided buffer, eight flags per
+ * byte, first flag in the most significant bit. Only the value 1 sets a bit.
+ *
+ * @param intArray       flags, one per element
+ * @param intArrayLength number of flags
+ * @param result         destination with room for ceil(intArrayLength/8) bytes
+ * @return number of bytes written to result
+ */
+size_t convertIntArray2ByteArray_fast_1b_to_result(unsigned char* intArray, size_t intArrayLength, unsigned char *result)
+{
+	size_t byteLength = intArrayLength/8 + ((intArrayLength%8==0) ? 0 : 1);
+	size_t outIdx = 0, next = 0;
+
+	while(outIdx < byteLength)
+	{
+		int packed = 0;
+		int bit = 7;
+		while(bit >= 0 && next < intArrayLength)
+		{
+			if(intArray[next++] == 1)
+				packed |= 1 << bit;
+			bit--;
+		}
+		result[outIdx++] = (unsigned char)packed;
+	}
+	return byteLength;
+}
+
+/**
+ * Unpack bit flags into one byte per flag (inverse of
+ * convertIntArray2ByteArray_fast_1b). Flag n is bit (7 - n%8) of byte n/8.
+ *
+ * Exactly intArrayLength flags are produced and only the bytes that hold
+ * them are read. An empty request (intArrayLength == 0) yields NULL without
+ * touching byteArray.
+ *
+ * If intArrayLength exceeds the number of bits available, an error is printed
+ * and the process exits with status 0.
+ *
+ * @param intArrayLength  number of flags to extract
+ * @param byteArray       packed flags
+ * @param byteArrayLength number of bytes in byteArray
+ * @param intArray        receives a malloc'ed array of intArrayLength flags, or NULL
+ */
+void convertByteArray2IntArray_fast_1b(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray)	
+{
+	size_t n;
+
+	if(intArrayLength > byteArrayLength*8)
+	{
+		printf("Error: intArrayLength > byteArrayLength*8\n");
+		printf("intArrayLength=%zu, byteArrayLength = %zu", intArrayLength, byteArrayLength);
+		exit(0);
+	}
+	if(intArrayLength == 0)
+	{
+		*intArray = NULL;
+		return;
+	}
+	*intArray = (unsigned char*)malloc(intArrayLength*sizeof(unsigned char));
+
+	/*
+	 * Index by flag, not by byte: the loop can then never run past the
+	 * requested flag count, and a zero byteArrayLength cannot wrap around.
+	 */
+	for(n = 0; n < intArrayLength; n++)
+		(*intArray)[n] = (byteArray[n >> 3] >> (7 - (n & 7))) & 0x01;
+}
+
+/**
+ * Pack 2-bit codes into bytes, four codes per byte, first code in the two
+ * most significant bits: [01|10|11|00|....] --> [01101100][....].
+ * Unused bits of the last byte are 0.
+ *
+ * A code greater than 3 is reported on stdout and terminates the process
+ * with exit(0).
+ *
+ * @param timeStepType       codes, one per element, each expected to be 0..3
+ * @param timeStepTypeLength number of codes
+ * @param result             receives a malloc'ed buffer (NULL when there are no codes)
+ * @return number of bytes in *result
+ */
+size_t convertIntArray2ByteArray_fast_2b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result)
+{
+	size_t outIdx, next = 0;
+	size_t byteLength = timeStepTypeLength*2/8;
+
+	if(timeStepTypeLength%4!=0)
+		byteLength++;
+
+	*result = (byteLength>0) ? (unsigned char*)malloc(byteLength*sizeof(unsigned char)) : NULL;
+
+	for(outIdx = 0; outIdx < byteLength; outIdx++)
+	{
+		int packed = 0;
+		int shift;
+		/* slots sit at bit offsets 6, 4, 2, 0 */
+		for(shift = 6; shift >= 0 && next < timeStepTypeLength; shift -= 2)
+		{
+			int code = timeStepType[next];
+			if(code > 3)
+			{
+				printf("Error: wrong timestep type...: type[%zu]=%d\n", next, code);
+				exit(0);
+			}
+			packed |= code << shift;
+			next++;
+		}
+		(*result)[outIdx] = (unsigned char)packed;
+	}
+	return byteLength;
+}
+
+/**
+ * Pack 2-bit codes into a caller-provided buffer, four codes per byte, first
+ * code in the two most significant bits.
+ *
+ * The codes are not validated: each one is shifted into its slot as-is, so a
+ * value above 3 also sets bits outside its own slot.
+ *
+ * @param timeStepType       codes, one per element
+ * @param timeStepTypeLength number of codes
+ * @param result             destination with room for ceil(timeStepTypeLength/4) bytes
+ * @return number of bytes written to result
+ */
+size_t convertIntArray2ByteArray_fast_2b_inplace(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char *result)
+{
+	size_t outIdx, next = 0;
+	size_t byteLength = timeStepTypeLength*2/8;
+
+	if(timeStepTypeLength%4!=0)
+		byteLength++;
+
+	for(outIdx = 0; outIdx < byteLength; outIdx++)
+	{
+		int packed = 0;
+		int shift;
+		/* slots sit at bit offsets 6, 4, 2, 0 */
+		for(shift = 6; shift >= 0 && next < timeStepTypeLength; shift -= 2)
+		{
+			unsigned char code = timeStepType[next++];
+			packed = packed | (code << shift);
+		}
+		result[outIdx] = (unsigned char)packed;
+	}
+	return byteLength;
+}
+
+/**
+ * Unpack 2-bit codes into one byte per code (inverse of
+ * convertIntArray2ByteArray_fast_2b). Code n occupies bits
+ * (7 - 2*(n%4)) .. (6 - 2*(n%4)) of byte n/4.
+ *
+ * Exactly stepLength codes are produced. An empty request (stepLength == 0)
+ * yields NULL without reading byteArray, whatever byteArrayLength is.
+ *
+ * If stepLength exceeds the number of codes available, an error is printed
+ * and the process exits with status 0.
+ *
+ * @param stepLength      number of codes to extract
+ * @param byteArray       packed codes
+ * @param byteArrayLength number of bytes in byteArray
+ * @param intArray        receives a malloc'ed array of stepLength codes, or NULL
+ */
+void convertByteArray2IntArray_fast_2b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray)
+{
+	size_t n;
+
+	if(stepLength > byteArrayLength*4)
+	{
+		printf("Error: stepLength > byteArray.length*4\n");
+		printf("stepLength=%zu, byteArray.length=%zu\n", stepLength, byteArrayLength);
+		exit(0);
+	}
+	if(stepLength == 0)
+	{
+		/* nothing to decode; never write through the NULL result */
+		*intArray = NULL;
+		return;
+	}
+	*intArray = (unsigned char*)malloc(stepLength*sizeof(unsigned char));
+
+	for(n = 0; n < stepLength; n++)
+		(*intArray)[n] = (byteArray[n >> 2] >> (6 - ((n & 3) << 1))) & 0x03;
+}
+
+/**
+ * Pack 3-bit codes into bytes as one continuous bit stream, first code in the
+ * most significant bits of the first byte (eight codes fill three bytes).
+ * Unused bits of the last byte are 0.
+ *
+ * Only the low three bits of each code are used. An empty input returns 0
+ * with *result set to NULL.
+ *
+ * @param timeStepType       codes, one per element, each expected to be 0..7
+ * @param timeStepTypeLength number of codes
+ * @param result             receives a malloc'ed buffer (NULL when there are no codes)
+ * @return number of bytes in *result
+ */
+size_t convertIntArray2ByteArray_fast_3b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result)
+{	
+	size_t n, outIdx = 0, byteLength;
+	unsigned int pending = 0;	/* bits collected but not yet written */
+	int pendingBits = 0;		/* how many low bits of 'pending' are valid */
+
+	if(timeStepTypeLength == 0)
+	{
+		/* nothing to pack; never write through the NULL result */
+		*result = NULL;
+		return 0;
+	}
+
+	byteLength = timeStepTypeLength*3/8;
+	if(timeStepTypeLength%8!=0)
+		byteLength++;
+	*result = (unsigned char*)malloc(byteLength*sizeof(unsigned char));
+
+	for(n = 0; n < timeStepTypeLength; n++)
+	{
+		pending = (pending << 3) | (timeStepType[n] & 0x07);
+		pendingBits += 3;
+		if(pendingBits >= 8)
+		{
+			/* emit the oldest eight bits, keep the remainder */
+			pendingBits -= 8;
+			(*result)[outIdx++] = (unsigned char)(pending >> pendingBits);
+			pending &= (1u << pendingBits) - 1;
+		}
+	}
+	if(pendingBits > 0) /* left-align what is left in the final byte */
+		(*result)[outIdx] = (unsigned char)(pending << (8 - pendingBits));
+	
+	return byteLength;
+}
+
+/**
+ * Unpack 3-bit codes into one byte per code (inverse of
+ * convertIntArray2ByteArray_fast_3b). Code n starts at bit 3*n of the
+ * stream, counting from the most significant bit of byteArray[0].
+ *
+ * Only bytes that actually hold bits of a requested code are read, so the
+ * function never touches byteArray for an empty request and never reads past
+ * the byte containing the last code.
+ *
+ * If stepLength exceeds the number of codes available, an error is printed
+ * and the process exits with status 0.
+ *
+ * @param stepLength      number of codes to extract
+ * @param byteArray       packed codes
+ * @param byteArrayLength number of bytes in byteArray
+ * @param intArray        receives a malloc'ed array of stepLength codes, or NULL
+ */
+void convertByteArray2IntArray_fast_3b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray)
+{	
+	size_t n;
+
+	if(stepLength > byteArrayLength*8/3)
+	{
+		printf("Error: stepLength > byteArray.length*8/3, impossible case unless bugs elsewhere.\n");
+		printf("stepLength=%zu, byteArray.length=%zu\n", stepLength, byteArrayLength);
+		exit(0);		
+	}
+	if(stepLength == 0)
+	{
+		*intArray = NULL;
+		return;
+	}
+	*intArray = (unsigned char*)malloc(stepLength*sizeof(unsigned char));
+
+	for(n = 0; n < stepLength; n++)
+	{
+		size_t bitPos = n*3;
+		size_t byteIdx = bitPos >> 3;
+		unsigned int offset = (unsigned int)(bitPos & 7);
+		/* 16-bit window: current byte in the high half, next byte in the low half */
+		unsigned int window = (unsigned int)byteArray[byteIdx] << 8;
+
+		/* a code starting at offset 6 or 7 continues into the next byte */
+		if(offset > 5)
+			window |= byteArray[byteIdx + 1];
+
+		(*intArray)[n] = (unsigned char)((window >> (13 - offset)) & 0x07);
+	}
+}
+
+/**
+ * Compute how far a field of resiBitLength bits must be shifted left to sit
+ * directly after the bits already used in the current output byte.
+ *
+ * @param k             total number of bits written so far
+ * @param resiBitLength width in bits of the field about to be written
+ * @return 8 - k%8 - resiBitLength; positive when the field fits with room to
+ *         spare, 0 when it exactly completes the byte, negative when it
+ *         spills into the next byte
+ */
+inline int getLeftMovingSteps(size_t k, unsigned char resiBitLength)
+{
+	size_t usedBits = k%8;	/* bits already occupied in the current byte */
+	return 8 - usedBits - resiBitLength;
+}
+
+/**
+ * Pack nbEle values of a fixed bit width into a continuous bit stream.
+ *
+ * Each value is placed directly after the previous one, most significant
+ * bits first. A partially filled last byte is flushed. When resiBitLength
+ * is 0 nothing is written.
+ *
+ * @param timeStepType  the resiMidBits values, one per element
+ * @param resiBitLength width in bits of every value
+ * @param nbEle         number of values
+ * @param bytes         receives the packed bytes via convertDBAtoBytes
+ * @return number of packed bytes
+ */
+size_t convertIntArray2ByteArray_fast_dynamic(unsigned char* timeStepType, unsigned char resiBitLength, size_t nbEle, unsigned char **bytes)
+{
+	size_t idx, bitPos = 0, total;
+	int pending = 0, shift = 0;
+	DynamicByteArray* out;
+
+	new_DBA(&out, 1024);
+	if(resiBitLength!=0)
+	{
+		for(idx = 0; idx < nbEle; idx++)
+		{
+			int value = timeStepType[idx];
+			shift = getLeftMovingSteps(bitPos, resiBitLength);
+			if(shift > 0)
+			{
+				/* fits with room to spare: keep accumulating */
+				pending = pending | (value << shift);
+			}
+			else if(shift == 0)
+			{
+				/* exactly completes the current byte */
+				pending = pending | value;
+				addDBA_Data(out, (unsigned char)pending);
+				pending = 0;
+			}
+			else
+			{
+				/* spills over: finish this byte, carry the low bits on */
+				pending = pending | (value >> (-shift));
+				addDBA_Data(out, (unsigned char)pending);
+				pending = 0 | (value << (8+shift));
+			}
+			bitPos += resiBitLength;
+		}
+	}
+	if(shift != 0) /* flush the partially filled last byte */
+		addDBA_Data(out, (unsigned char)pending);
+	convertDBAtoBytes(out, bytes);
+	total = out->size;
+	free_DBA(out);
+	return total;
+}
+
+/**
+ * Pack values of varying bit width into a continuous bit stream.
+ *
+ * resiBitLength[j] gives the width of the j-th field. Entries of width 0 are
+ * skipped and consume no value from timeStepType. A partially filled last
+ * byte is flushed.
+ *
+ * @param timeStepType        the resiMidBits values, one per non-zero width
+ * @param resiBitLength       width in bits of each field
+ * @param resiBitLengthLength number of entries in resiBitLength
+ * @param bytes               receives the packed bytes via convertDBAtoBytes
+ * @return number of packed bytes
+ */
+size_t convertIntArray2ByteArray_fast_dynamic2(unsigned char* timeStepType, unsigned char* resiBitLength, size_t resiBitLengthLength, unsigned char **bytes)
+{
+	size_t field, src = 0, bitPos = 0, total;
+	int pending = 0, shift = 0;
+	DynamicByteArray* out;
+
+	new_DBA(&out, 1024);
+	for(field = 0; field < resiBitLengthLength; field++)
+	{
+		unsigned char width = resiBitLength[field];
+		int value;
+
+		if(width==0)
+			continue;
+		value = timeStepType[src++];
+		shift = getLeftMovingSteps(bitPos, width);
+		if(shift > 0)
+		{
+			/* fits with room to spare: keep accumulating */
+			pending = pending | (value << shift);
+		}
+		else if(shift == 0)
+		{
+			/* exactly completes the current byte */
+			pending = pending | value;
+			addDBA_Data(out, (unsigned char)pending);
+			pending = 0;
+		}
+		else
+		{
+			/* spills over: finish this byte, carry the low bits on */
+			pending = pending | (value >> (-shift));
+			addDBA_Data(out, (unsigned char)pending);
+			pending = 0 | (value << (8+shift));
+		}
+		bitPos += width;
+	}
+	if(shift != 0) /* flush the partially filled last byte */
+		addDBA_Data(out, (unsigned char)pending);
+	convertDBAtoBytes(out, bytes);
+	total = out->size;
+	free_DBA(out);
+	return total;
+}
+
+/**
+ * Number of bits left in dataLength once its leading zero bits are removed.
+ *
+ * The width used is 32 bits when exe_params->SZ_SIZE_TYPE is 4 and 64 bits
+ * otherwise.
+ *
+ * @param dataLength value to measure
+ * @return width minus the leading-zero count reported by the helper
+ */
+int computeBitNumRequired(size_t dataLength)
+{
+	int bits;
+
+	if(exe_params->SZ_SIZE_TYPE!=4)
+		bits = 64 - numberOfLeadingZeros_Long(dataLength);
+	else
+		bits = 32 - numberOfLeadingZeros_Int(dataLength);
+	return bits;
+}
+
+/**
+ * Expand a run-length encoded bit array.
+ *
+ * The input is a sequence of (state, count) pairs: one state bit followed by
+ * validLength bits that extractBytes turns into the run length. Each pair is
+ * expanded into 'count' copies of 'state' in the output.
+ *
+ * Pairs are expanded as they are decoded, so no temporary array sized by the
+ * input is needed. Output is capped at totalLength entries: a run that would
+ * go past the end of the buffer is truncated, and a non-positive run length
+ * contributes nothing.
+ *
+ * @param result      receives a malloc'ed array of totalLength ints (not checked)
+ * @param bytes       encoded input
+ * @param bytesLength number of bytes in bytes
+ * @param totalLength capacity of the output, in entries
+ * @param validLength number of bits used for each run length
+ */
+void decompressBitArraybySimpleLZ77(int** result, unsigned char* bytes, size_t bytesLength, size_t totalLength, int validLength)
+{
+	size_t pairLength = (bytesLength*8)/(validLength+1);
+	size_t pair, bitPos = 0, written = 0;
+
+	*result = (int*)malloc(sizeof(int)*totalLength);
+
+	for(pair = 0; pair < pairLength; pair++)
+	{
+		/* one state bit, most significant bit first within each byte */
+		int state = (bytes[bitPos/8] >> (8-1-(int)(bitPos%8))) & 0x01;
+		bitPos++;
+
+		int numResult = extractBytes(bytes, bitPos, validLength);
+		bitPos = bitPos + validLength;
+
+		/* clamp the run to the space left so corrupt input cannot overflow */
+		size_t run = numResult > 0 ? (size_t)numResult : 0;
+		size_t room = totalLength - written;
+		if(run > room)
+			run = room;
+
+		while(run-- > 0)
+			(*result)[written++] = state;
+	}
+}
diff --git a/deps/SZ/sz/src/VarSet.c b/deps/SZ/sz/src/VarSet.c
new file mode 100644
index 0000000000000000000000000000000000000000..719c6774fddc8e89ac3d20a3a15430af97a66ca4
--- /dev/null
+++ b/deps/SZ/sz/src/VarSet.c
@@ -0,0 +1,254 @@
+/**
+ *  @file VarSet.c
+ *  @author Sheng Di
+ *  @date July, 2016
+ *  @brief VarSet maintains the linked list of variables registered for batch compression: adding, searching, deleting and freeing them.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "VarSet.h"
+#include "sz.h"
+
+/**
+ * Destroy a variable record but leave the data buffer v->data untouched.
+ *
+ * Releases the name, the compressed bytes, the multisteps history and the
+ * record itself.
+ *
+ * @param v variable to destroy; it is dereferenced, so it must not be NULL
+ */
+void free_Variable_keepOriginalData(SZ_Variable* v)
+{
+	/* free(NULL) is a no-op, so only free_multisteps needs a guard */
+	free(v->varName);
+	free(v->compressedBytes);
+	if(v->multisteps!=NULL)
+		free_multisteps(v->multisteps);
+	free(v);
+}
+
+/**
+ * Destroy a variable record but leave v->compressedBytes untouched.
+ *
+ * Releases the name, the data buffer, the multisteps history and the record
+ * itself.
+ *
+ * @param v variable to destroy; it is dereferenced, so it must not be NULL
+ * @deprecated
+ * */
+void free_Variable_keepCompressedBytes(SZ_Variable* v)
+{
+	/* free(NULL) is a no-op, so only free_multisteps needs a guard */
+	free(v->varName);
+	free(v->data);
+	if(v->multisteps!=NULL)
+		free_multisteps(v->multisteps);
+	free(v);
+}
+
+/**
+ * Destroy a variable record and everything it points to: name, data buffer,
+ * compressed bytes, multisteps history and the record itself.
+ *
+ * @param v variable to destroy; it is dereferenced, so it must not be NULL
+ */
+void free_Variable_all(SZ_Variable* v)
+{
+	/* free(NULL) is a no-op, so only free_multisteps needs a guard */
+	free(v->varName);
+	free(v->data);
+	free(v->compressedBytes);
+	if(v->multisteps!=NULL)
+		free_multisteps(v->multisteps);
+	free(v);
+}
+
+/**
+ * Register a variable in the global batch set sz_varset.
+ *
+ * The set, with an empty header node, is created on first use. The new record
+ * stores a private copy of varName, the caller's data pointer (not copied),
+ * the dimensions and the error-bound settings, and is appended to the end of
+ * the list. For SZ_FLOAT and SZ_DOUBLE a zero-filled history buffer of
+ * computeDataLength(r5..r1) elements is allocated; for other types hist_data
+ * stays NULL. Allocation results are not checked.
+ *
+ * @param var_id          identifier stored in the record
+ * @param varName         NUL-terminated name; copied
+ * @param dataType        SZ data type code
+ * @param data            caller-owned data buffer; the pointer is stored
+ * @param errBoundMode    stored as-is
+ * @param absErrBound     stored as-is
+ * @param relBoundRatio   stored as-is
+ * @param pwRelBoundRatio stored as-is
+ * @param r5,r4,r3,r2,r1  dimension sizes, stored as-is
+ */
+void SZ_batchAddVar(int var_id, char* varName, int dataType, void* data, 
+			int errBoundMode, double absErrBound, double relBoundRatio, double pwRelBoundRatio, 
+			size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{	
+	SZ_Variable* node;
+	size_t nameBytes, dataLen, elemBytes = 0;
+
+	/* lazily create the set with its header node */
+	if(sz_varset==NULL)
+	{
+		sz_varset = (SZ_VarSet*)malloc(sizeof(SZ_VarSet));
+		sz_varset->header = (SZ_Variable*)malloc(sizeof(SZ_Variable));
+		sz_varset->header->next = NULL;
+		sz_varset->lastVar = sz_varset->header;
+		sz_varset->count = 0;
+	}
+
+	node = (SZ_Variable*)malloc(sizeof(SZ_Variable));
+	memset(node, 0, sizeof(SZ_Variable));
+
+	nameBytes = strlen(varName)+1;
+	node->varName = (char*)malloc(nameBytes);
+	memcpy(node->varName, varName, nameBytes);
+
+	node->var_id = var_id;
+	node->dataType = dataType;
+	node->data = data;
+	node->r5 = r5;
+	node->r4 = r4;
+	node->r3 = r3;
+	node->r2 = r2;
+	node->r1 = r1;
+	node->errBoundMode = errBoundMode;
+	node->absErrBound = absErrBound;
+	node->relBoundRatio = relBoundRatio;
+	node->pwRelBoundRatio = pwRelBoundRatio;
+
+	node->multisteps = (sz_multisteps*)malloc(sizeof(sz_multisteps));
+	memset(node->multisteps, 0, sizeof(sz_multisteps));
+
+	dataLen = computeDataLength(r5, r4, r3, r2, r1);
+	if(dataType==SZ_FLOAT)
+		elemBytes = sizeof(float);
+	else if(dataType==SZ_DOUBLE)
+		elemBytes = sizeof(double);
+	if(elemBytes!=0)
+	{
+		/* zero-filled history buffer, one element per data point */
+		node->multisteps->hist_data = malloc(elemBytes*dataLen);
+		memset(node->multisteps->hist_data, 0, elemBytes*dataLen);
+	}
+	node->compressedBytes = NULL;
+	node->next = NULL;
+
+	/* append to the tail of the list */
+	sz_varset->lastVar->next = node;
+	sz_varset->lastVar = node;
+	sz_varset->count ++;
+}
+
+/**
+ * Remove the variable with the given id from the global set sz_varset.
+ *
+ * @param var_id identifier to look for
+ * @return the result of SZ_batchDelVar_ID_vset
+ */
+int SZ_batchDelVar_ID(int var_id)
+{
+	return SZ_batchDelVar_ID_vset(sz_varset, var_id);
+}
+
+/**
+ * Remove the variable with the given name from the global set sz_varset.
+ *
+ * @param varName name to look for
+ * @return the result of SZ_batchDelVar_vset
+ */
+int SZ_batchDelVar(char* varName)
+{
+	return SZ_batchDelVar_vset(sz_varset, varName);
+}
+
+/**
+ * Remove the first variable whose var_id matches from a variable set.
+ *
+ * The record is unlinked and released with free_Variable_keepOriginalData,
+ * so the caller's data buffer is left alone. When the removed record was the
+ * tail, lastVar is moved back to its predecessor; this is decided before the
+ * record is freed, so freed memory is never read.
+ *
+ * @param vset   set to modify; NULL is treated as "nothing to delete"
+ * @param var_id identifier to look for
+ * @return SZ_SCES when a variable was removed, SZ_NSCS otherwise
+ */
+int SZ_batchDelVar_ID_vset(SZ_VarSet* vset, int var_id)
+{
+	SZ_Variable* prev;
+	SZ_Variable* cur;
+
+	if(vset == NULL)
+		return SZ_NSCS;
+
+	prev = vset->header;
+	for(cur = prev->next; cur != NULL; prev = cur, cur = cur->next)
+	{
+		if(cur->var_id != var_id)
+			continue;
+
+		prev->next = cur->next;
+		if(cur->next==NULL) //cur was the last variable
+			vset->lastVar = prev;
+		free_Variable_keepOriginalData(cur);
+		vset->count --;
+		return SZ_SCES;
+	}
+	
+	return SZ_NSCS;	
+}
+
+/**
+ * Remove the first variable whose name matches from a variable set.
+ *
+ * The record is unlinked and released with free_Variable_keepOriginalData,
+ * so the caller's data buffer is left alone. When the removed record was the
+ * tail, lastVar is moved back to its predecessor so that a later append does
+ * not go through a freed record.
+ *
+ * @param vset    set to modify; NULL is treated as "nothing to delete"
+ * @param varName NUL-terminated name to look for
+ * @return SZ_SCES when a variable was removed, SZ_NSCS otherwise
+ */
+int SZ_batchDelVar_vset(SZ_VarSet* vset, char* varName)
+{
+	SZ_Variable* prev;
+	SZ_Variable* cur;
+
+	if(vset == NULL)
+		return SZ_NSCS;
+
+	prev = vset->header;
+	for(cur = prev->next; cur != NULL; prev = cur, cur = cur->next)
+	{
+		if(strcmp(cur->varName, varName)!=0)
+			continue;
+
+		prev->next = cur->next;
+		if(cur->next==NULL) //cur was the last variable
+			vset->lastVar = prev;
+		free_Variable_keepOriginalData(cur);
+		vset->count --;
+		return SZ_SCES;
+	}
+	
+	return SZ_NSCS;
+}
+
+/**
+ * Find a variable by name in the global set sz_varset.
+ *
+ * @param varName NUL-terminated name to look for
+ * @return the first record whose name matches, or NULL when there is no match
+ *         or no variable set has been created yet
+ */
+SZ_Variable* SZ_searchVar(char* varName)
+{
+	SZ_Variable* p;
+
+	/* the set only exists once SZ_batchAddVar has been called */
+	if(sz_varset == NULL)
+		return NULL;
+
+	for(p = sz_varset->header->next; p != NULL; p = p->next)
+	{
+		if(strcmp(p->varName, varName)==0)
+			return p;
+	}
+	return NULL;
+}
+
+/**
+ * Look up a variable by name and report its data pointer and dimensions.
+ *
+ * @param varName NUL-terminated name to look for
+ * @param r5,r4,r3,r2,r1 receive the stored dimension sizes; left untouched
+ *        when the variable is not found
+ * @return the variable's data pointer, or NULL when no variable has that name
+ */
+void* SZ_getVarData(char* varName, size_t *r5, size_t *r4, size_t *r3, size_t *r2, size_t *r1)
+{
+	SZ_Variable* v = SZ_searchVar(varName);
+
+	/* an unknown name must not be dereferenced */
+	if(v == NULL)
+		return NULL;
+
+	*r5 = v->r5;
+	*r4 = v->r4;
+	*r3 = v->r3;
+	*r2 = v->r2;
+	*r1 = v->r1;
+	return (void*)v->data;
+}
+
+/**
+ * Destroy the global variable set sz_varset.
+ *
+ * The global pointer is reset to NULL afterwards, so a later
+ * SZ_batchAddVar starts a fresh set and does not reuse freed memory.
+ *
+ * @param mode SZ_MAINTAIN_VAR_DATA or SZ_DESTROY_WHOLE_VARSET
+ * */
+void SZ_freeVarSet(int mode)
+{
+	free_VarSet_vset(sz_varset, mode);
+	sz_varset = NULL;
+}
+
+/**
+ * Completely destroy a variable set, so only call it once the set is no
+ * longer needed.
+ *
+ * Every record is unlinked and released according to mode:
+ * SZ_MAINTAIN_VAR_DATA keeps each variable's data buffer
+ * (free_Variable_keepOriginalData), SZ_DESTROY_WHOLE_VARSET frees it as well
+ * (free_Variable_all). With any other mode the records are unlinked but not
+ * released. Finally the header node of this set and the set itself are freed.
+ * If the set being destroyed is the global sz_varset, that pointer is reset
+ * to NULL.
+ *
+ * @param vset set to destroy; NULL is ignored
+ * @param mode SZ_MAINTAIN_VAR_DATA or SZ_DESTROY_WHOLE_VARSET
+ * */
+void free_VarSet_vset(SZ_VarSet *vset, int mode)
+{
+	SZ_Variable *head;
+	SZ_Variable *node;
+
+	if(vset==NULL)
+		return;
+
+	head = vset->header;
+	while((node = head->next) != NULL)
+	{
+		head->next = node->next;
+		if(mode==SZ_MAINTAIN_VAR_DATA)
+			free_Variable_keepOriginalData(node);
+		else if(mode==SZ_DESTROY_WHOLE_VARSET)
+			free_Variable_all(node);
+	}
+
+	/* do not leave the global pointing at memory that is about to be freed */
+	if(vset == sz_varset)
+		sz_varset = NULL;
+
+	/* release this set's own header, not the global set's */
+	free(head);
+	free(vset);
+}
+
+/**
+ * Release a multisteps record together with its history buffer.
+ *
+ * @param multisteps record to destroy; it is dereferenced, so it must not be NULL
+ */
+void free_multisteps(sz_multisteps* multisteps)
+{
+	/* free(NULL) is a no-op, so an absent history buffer needs no guard */
+	free(multisteps->hist_data);
+	free(multisteps);
+}
+
+/**
+ * Test whether an id occurs in a list of ids.
+ *
+ * @param cur_var_id id to look for
+ * @param var_ids    list of ids
+ * @param var_count  number of entries in var_ids
+ * @return 1 when cur_var_id is present, 0 otherwise
+ */
+inline int checkVarID(unsigned char cur_var_id, unsigned char* var_ids, int var_count)
+{
+	int idx = 0;
+	while(idx < var_count)
+	{
+		if(var_ids[idx++]==cur_var_id)
+			return 1;
+	}
+	return 0;
+}
+
+/**
+ * Find a variable by id in the global set sz_varset.
+ *
+ * @param var_id identifier to look for
+ * @return the first record whose var_id matches, or NULL when there is no
+ *         match or no variable set has been created yet
+ */
+SZ_Variable* SZ_getVariable(int var_id)
+{
+	SZ_Variable* p;
+
+	/* the set only exists once SZ_batchAddVar has been called */
+	if(sz_varset == NULL)
+		return NULL;
+
+	for(p = sz_varset->header->next; p != NULL; p = p->next)
+	{
+		if(var_id == p->var_id)
+			return p;
+	}
+	return NULL;
+} 
diff --git a/deps/SZ/sz/src/callZlib.c b/deps/SZ/sz/src/callZlib.c
new file mode 100644
index 0000000000000000000000000000000000000000..4e4bb6f2729e401d7779bd62c5ee5601775992fc
--- /dev/null
+++ b/deps/SZ/sz/src/callZlib.c
@@ -0,0 +1,527 @@
+/**
+ *  @file callZlib.c
+ *  @author Sheng Di
+ *  @date June, 2016
+ *  @brief gzip compressor code: the interface to call zlib
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <zlib.h>
+#include <sz.h>
+
+/* Memory level handed to deflateInit2() in this file: 8, or MAX_MEM_LEVEL when that is smaller. */
+#if MAX_MEM_LEVEL >= 8
+#define DEF_MEM_LEVEL 8
+#else
+#define DEF_MEM_LEVEL MAX_MEM_LEVEL
+#endif
+
+
+/*
+ * Guard for zlib return codes: when err is neither Z_OK nor Z_STREAM_END, print
+ * "<msg> error: <err>" to stderr and return SZ_NSCS from the ENCLOSING function.
+ * NOTE(review): that return skips whatever cleanup the caller still owes at that
+ * point (deflateEnd/inflateEnd, free), and the callers in this file are declared
+ * to return unsigned long.
+ */
+#define CHECK_ERR(err, msg) { \
+    if (err != Z_OK && err != Z_STREAM_END) { \
+        fprintf(stderr, "%s error: %d\n", msg, err); \
+        return SZ_NSCS; \
+    } \
+}
+
+/**
+ * Check a pair of bytes against the fixed table of two-byte values accepted here.
+ *
+ * Accepted pairs (annotations carried over from the original code):
+ *   104: 5 (DC+BS), 129 (DC+DC), 222 (DC+BC)
+ *   120: 1 (BC+BS), 94 (BC+?), 156 (BC+DC), 218 (BC+BS)
+ *
+ * @param magic1 first byte
+ * @param magic2 second byte
+ * @return 1 when (magic1, magic2) is one of the pairs above, 0 otherwise
+ */
+int isZlibFormat(unsigned char magic1, unsigned char magic2)
+{
+	switch(magic1)
+	{
+	case 104:
+		return magic2==5 || magic2==129 || magic2==222;
+	case 120:
+		return magic2==1 || magic2==94 || magic2==156 || magic2==218;
+	default:
+		return 0;
+	}
+}
+
+/**
+ * One-shot compression through zlib's compress2().
+ * Only valid for medium-sized inputs: on any compress2() failure the function
+ * prints a message to stdout and terminates the process with exit(0).
+ *
+ * @param data          input bytes
+ * @param dataLength    number of input bytes
+ * @param compressBytes receives a malloc'ed buffer of deflateBound() bytes holding the result
+ * @param level         compression level forwarded to compress2()
+ * @return number of compressed bytes written to *compressBytes
+ */
+unsigned long zlib_compress(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level)
+{
+	z_stream probe = {0};	/* only used to query deflateBound() */
+	uLong bound;
+	unsigned long written;
+	int status;
+
+	probe.next_in = data;
+	probe.avail_in = dataLength;
+#ifdef MAXSEG_64K
+	/* Check for source > 64K on 16-bit machine: */
+	if ((uLong)probe.avail_in != dataLength)
+		return Z_BUF_ERROR;
+#endif
+
+	bound = deflateBound(&probe, dataLength);
+	written = bound;	/* in: capacity, out: bytes produced */
+
+	*compressBytes = (unsigned char*)malloc(sizeof(unsigned char)*bound);
+	status = compress2(*compressBytes, &written, data, dataLength, level);
+	if(status != Z_OK)
+	{
+		printf("Error: err_code=%d; the reason may be your data size is too large (>=2^32), which cannot be compressed by standalone zlib_compress. Sol: inflace_init, ....\n", status);
+		exit(0);
+	}
+	return written;
+}
+
+/**
+ * Compress a whole buffer with a single deflate(Z_FINISH) call into a freshly
+ * malloc'ed buffer of deflateBound() bytes.
+ *
+ * @param data          input bytes
+ * @param dataLength    number of input bytes
+ * @param compressBytes receives the malloc'ed output buffer
+ * @param level         compression level forwarded to deflateInit2()
+ * @return stream.total_out on success; on failure a zlib error code converted
+ *         to unsigned long (Z_BUF_ERROR when deflate() returned Z_OK instead of
+ *         Z_STREAM_END)
+ */
+unsigned long zlib_compress2(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level)
+{
+	z_stream zs = {0};
+	uLong capacity;
+	int windowBits;
+	int rc;
+
+	zs.next_in = data;
+	zs.avail_in = dataLength;
+#ifdef MAXSEG_64K
+	/* Check for source > 64K on 16-bit machine: */
+	if ((uLong)zs.avail_in != dataLength)
+		return Z_BUF_ERROR;
+#endif
+
+	capacity = deflateBound(&zs, dataLength);
+	*compressBytes = (unsigned char*)malloc(sizeof(unsigned char)*capacity);
+
+	zs.next_out = *compressBytes;
+	zs.avail_out = capacity;
+	zs.zalloc = (alloc_func)0;
+	zs.zfree = (free_func)0;
+	zs.opaque = (voidpf)0;
+
+	/* 15 window bits only in best-compression mode, 14 otherwise (valid range: 8-15) */
+	windowBits = (confparams_cpr->szMode==SZ_BEST_COMPRESSION) ? 15 : 14;
+
+	rc = deflateInit2(&zs, level, Z_DEFLATED, windowBits, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+	if (rc != Z_OK)
+		return rc;
+
+	rc = deflate(&zs, Z_FINISH);
+	if (rc != Z_STREAM_END)
+	{
+		deflateEnd(&zs);
+		if (rc == Z_OK)
+			return Z_BUF_ERROR;	/* output buffer was too small to finish */
+		return rc;
+	}
+
+	deflateEnd(&zs);
+	return zs.total_out;
+}
+
+/**
+ * Compress a whole buffer with a single deflate(Z_FINISH) call into a buffer
+ * supplied by the caller.
+ *
+ * @param data          input bytes
+ * @param dataLength    number of input bytes; also used as the output capacity
+ *                      (avail_out), so compressBytes must hold at least that many bytes
+ * @param compressBytes caller-owned output buffer
+ * @param level         compression level forwarded to deflateInit2()
+ * @return stream.total_out on success; on failure a zlib error code converted
+ *         to unsigned long (Z_BUF_ERROR when deflate() returned Z_OK instead of
+ *         Z_STREAM_END)
+ */
+unsigned long zlib_compress3(unsigned char* data, unsigned long dataLength, unsigned char* compressBytes, int level)
+{
+	z_stream zs = {0};
+	int windowBits;
+	int rc;
+
+	zs.next_in = data;
+	zs.avail_in = dataLength;
+#ifdef MAXSEG_64K
+	/* Check for source > 64K on 16-bit machine: */
+	if ((uLong)zs.avail_in != dataLength)
+		return Z_BUF_ERROR;
+#endif
+
+	zs.next_out = compressBytes;
+	zs.avail_out = dataLength;
+	zs.zalloc = (alloc_func)0;
+	zs.zfree = (free_func)0;
+	zs.opaque = (voidpf)0;
+
+	/* 15 window bits only in best-compression mode, 14 otherwise (valid range: 8-15) */
+	windowBits = (confparams_cpr->szMode==SZ_BEST_COMPRESSION) ? 15 : 14;
+
+	rc = deflateInit2(&zs, level, Z_DEFLATED, windowBits, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+	if (rc != Z_OK)
+		return rc;
+
+	rc = deflate(&zs, Z_FINISH);
+	if (rc != Z_STREAM_END)
+	{
+		deflateEnd(&zs);
+		if (rc == Z_OK)
+			return Z_BUF_ERROR;	/* output buffer was too small to finish */
+		return rc;
+	}
+
+	deflateEnd(&zs);
+	return zs.total_out;
+}
+
+/**
+ * Compress a buffer by feeding deflate() in pieces of at most
+ * SZ_ZLIB_BUFFER_SIZE bytes, into a malloc'ed buffer of deflateBound() bytes.
+ *
+ * The input and output windows are always clamped to what is really left, so
+ * deflate() never reads past data + dataLength nor writes past the allocation.
+ *
+ * @param data          input bytes
+ * @param dataLength    number of input bytes
+ * @param compressBytes receives the malloc'ed output buffer (owned by the caller,
+ *                      also when an error is returned after the allocation)
+ * @param level         compression level forwarded to deflateInit2()
+ * @return total number of compressed bytes, or SZ_NSCS (converted to unsigned
+ *         long) after a message on stderr when a zlib call or the allocation fails
+ */
+unsigned long zlib_compress4(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level)
+{
+	z_stream c_stream = {0}; /* compression stream */
+	int err = 0;
+	unsigned long totalOut;
+
+	c_stream.zalloc = (alloc_func)0;
+	c_stream.zfree = (free_func)0;
+	c_stream.opaque = (voidpf)0;
+
+	int windowBits = 14; //8-15
+	if(confparams_cpr->szMode==SZ_BEST_COMPRESSION)
+		windowBits = 15;
+
+	err = deflateInit2(&c_stream, level, Z_DEFLATED, windowBits, DEF_MEM_LEVEL,
+	                   Z_DEFAULT_STRATEGY);
+	CHECK_ERR(err, "deflateInit");
+
+	uLong estCmpLen = deflateBound(&c_stream, dataLength);
+	*compressBytes = (unsigned char*)malloc(sizeof(unsigned char)*estCmpLen);
+	if(*compressBytes == NULL)
+	{
+		fprintf(stderr, "%s error: %d\n", "malloc", Z_MEM_ERROR);
+		(void)deflateEnd(&c_stream);
+		return SZ_NSCS;
+	}
+
+	c_stream.next_in  = data;
+	c_stream.next_out = *compressBytes;
+
+	while (c_stream.total_in < dataLength && c_stream.total_out < estCmpLen) {
+		/* offer only what is really left on both sides, in bounded pieces */
+		unsigned long inLeft = dataLength - c_stream.total_in;
+		unsigned long outLeft = estCmpLen - c_stream.total_out;
+		c_stream.avail_in = (uInt)(inLeft < SZ_ZLIB_BUFFER_SIZE ? inLeft : SZ_ZLIB_BUFFER_SIZE);
+		c_stream.avail_out = (uInt)(outLeft < SZ_ZLIB_BUFFER_SIZE ? outLeft : SZ_ZLIB_BUFFER_SIZE);
+		err = deflate(&c_stream, Z_NO_FLUSH);
+		if (err != Z_OK && err != Z_STREAM_END) {
+			fprintf(stderr, "%s error: %d\n", "deflate", err);
+			(void)deflateEnd(&c_stream);	/* do not leak the deflate state */
+			return SZ_NSCS;
+		}
+	}
+
+	/* Finish the stream, still never offering more room than the buffer has */
+	for (;;) {
+		unsigned long outLeft = estCmpLen - c_stream.total_out;
+		c_stream.avail_out = (uInt)(outLeft < SZ_ZLIB_BUFFER_SIZE ? outLeft : SZ_ZLIB_BUFFER_SIZE);
+		err = deflate(&c_stream, Z_FINISH);
+		if (err == Z_STREAM_END) break;
+		if (err != Z_OK || outLeft == 0) {
+			/* a real error, or the buffer is full and the stream still is not done */
+			fprintf(stderr, "%s error: %d\n", "deflate", err);
+			(void)deflateEnd(&c_stream);
+			return SZ_NSCS;
+		}
+	}
+
+	totalOut = c_stream.total_out;
+	err = deflateEnd(&c_stream);
+	CHECK_ERR(err, "deflateEnd");
+
+	return totalOut;
+}
+
+/**
+ * Compress a buffer chunk by chunk: the input is handed to deflate() in pieces
+ * of SZ_ZLIB_BUFFER_SIZE bytes (the last piece is the remainder and is sent
+ * with Z_FINISH), and the output is appended to a malloc'ed buffer of
+ * deflateBound() bytes.
+ *
+ * @param data          input bytes
+ * @param dataLength    number of input bytes
+ * @param compressBytes receives the malloc'ed output buffer
+ * @param level         compression level forwarded to deflateInit()
+ * @return strm.total_out, or the deflateInit() error code converted to
+ *         unsigned long when initialisation fails
+ */
+unsigned long zlib_compress5(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level)
+{
+	int ret, flush;
+	unsigned have;
+	z_stream strm;
+	unsigned char* in = data;
+
+	/* allocate deflate state */
+	strm.zalloc = Z_NULL;
+	strm.zfree = Z_NULL;
+	strm.opaque = Z_NULL;
+	ret = deflateInit(&strm, level);
+	//int windowBits = 15;
+    //ret = deflateInit2(&strm, level, Z_DEFLATED, windowBits, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);//Z_FIXED); //Z_DEFAULT_STRATEGY
+
+	if (ret != Z_OK)
+		return ret;
+
+	/* p_size: end offset of the current input chunk; av_in: size of that chunk */
+	size_t p_size = 0, av_in = 0;
+    uLong estCmpLen = deflateBound(&strm, dataLength);
+   	*compressBytes = (unsigned char*)malloc(sizeof(unsigned char)*estCmpLen);	
+	unsigned char* out = *compressBytes; 
+
+	/* compress until end of file */
+	do {		
+		p_size += SZ_ZLIB_BUFFER_SIZE;
+		if(p_size>=dataLength)
+		{
+			/* last chunk: whatever remains after the previous full chunks */
+			av_in = dataLength - (p_size - SZ_ZLIB_BUFFER_SIZE);
+			flush = Z_FINISH;
+		}
+		else
+		{
+			av_in = SZ_ZLIB_BUFFER_SIZE;
+			flush = Z_NO_FLUSH;
+		}
+		strm.avail_in = av_in;
+		strm.next_in = in;
+
+		/* run deflate() on input until output buffer not full, finish
+		   compression if all of source has been read in */
+		do {
+			/* NOTE(review): a full SZ_ZLIB_BUFFER_SIZE window is offered each time,
+			   not the room actually left in *compressBytes — confirm the bound holds */
+			strm.avail_out = SZ_ZLIB_BUFFER_SIZE;
+			strm.next_out = out;
+			ret = deflate(&strm, flush);    /* no bad return value */
+
+			have = SZ_ZLIB_BUFFER_SIZE - strm.avail_out;
+			out += have;
+		} while (strm.avail_out == 0);
+
+		in+=av_in;
+
+		/* done when last data in file processed */
+	} while (flush != Z_FINISH);
+
+	/* clean up and return */
+	(void)deflateEnd(&strm);	
+	
+	return strm.total_out;	
+}
+
+/**
+ * One-shot decompression through zlib's uncompress().
+ * On any uncompress() failure a message is printed to stdout and the process
+ * is terminated with exit(0).
+ *
+ * @param compressBytes compressed input
+ * @param cmpSize       number of compressed bytes
+ * @param oriData       receives a malloc'ed buffer of targetOriSize bytes with the result
+ * @param targetOriSize capacity handed to uncompress()
+ * @return number of bytes written to *oriData
+ */
+unsigned long zlib_uncompress(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+	unsigned long produced = targetOriSize;	/* in: capacity, out: bytes produced */
+	int rc;
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);
+	rc = uncompress(*oriData, &produced, compressBytes, cmpSize);
+	if(rc != Z_OK)
+	{
+		printf("Error: Zlib decompression error; status=%d\n", rc);
+		exit(0);
+	}
+	return produced;
+}
+
+/**
+ * Decompress a whole zlib stream with a single inflate(Z_FINISH) call into a
+ * malloc'ed buffer of targetOriSize bytes.
+ *
+ * @param compressBytes compressed input
+ * @param cmpSize       number of compressed bytes
+ * @param oriData       receives the malloc'ed output buffer
+ * @param targetOriSize capacity of the output buffer
+ * @return stream.total_out on success. On failure, converted to unsigned long:
+ *         SZ_NSCS when cmpSize does not fit in avail_in or inflateInit() fails,
+ *         Z_DATA_ERROR for Z_NEED_DICT or a truncated input, otherwise the
+ *         inflate() error code.
+ */
+unsigned long zlib_uncompress2 (unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+	z_stream zs = {0};
+	unsigned long produced;
+	int rc;
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);
+
+	zs.zalloc = Z_NULL;
+	zs.zfree = Z_NULL;
+	zs.opaque = Z_NULL;
+
+	zs.next_in = compressBytes;
+	zs.avail_in = cmpSize;
+	/* Check for source > 64K on 16-bit machine: */
+	if ((unsigned long)zs.avail_in != cmpSize)
+	{
+		printf("Error: zlib_uncompress2: stream.avail_in != cmpSize");
+		return SZ_NSCS; //-1
+	}
+
+	zs.next_out = *oriData;
+	zs.avail_out = targetOriSize;
+
+	rc = inflateInit(&zs);
+	if (rc != Z_OK)
+	{
+		printf("Error: zlib_uncompress2: err != Z_OK\n");
+		return SZ_NSCS;
+	}
+
+	rc = inflate(&zs, Z_FINISH);
+	if (rc == Z_STREAM_END)
+	{
+		produced = zs.total_out;
+		inflateEnd(&zs);
+		return produced;
+	}
+
+	/* the stream did not end cleanly: release the state and map the error */
+	inflateEnd(&zs);
+	if (rc == Z_NEED_DICT)
+		return Z_DATA_ERROR;
+	if (rc == Z_BUF_ERROR && zs.avail_in == 0)
+		return Z_DATA_ERROR;
+	return rc;
+}
+
+/**
+ * Decompress a zlib stream in pieces of at most SZ_ZLIB_BUFFER_SIZE bytes,
+ * growing the output buffer when targetOriSize turns out to be too small.
+ *
+ * The input window is clamped to cmpSize and the output window to the real
+ * capacity of the buffer, so inflate() never runs outside either of them.
+ * On an inflate() or allocation failure a message is printed to stdout and the
+ * process is terminated with exit(0).
+ *
+ * @param compressBytes compressed input
+ * @param cmpSize       number of compressed bytes
+ * @param oriData       receives the malloc'ed (possibly realloc'ed) output buffer
+ * @param targetOriSize initial capacity of the output buffer
+ * @return total number of decompressed bytes, or SZ_NSCS (converted to unsigned
+ *         long) when inflateInit()/inflateEnd() report an error
+ */
+unsigned long zlib_uncompress3(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+	int status;
+	z_stream z_strm; /* decompression stream */
+
+	/* current capacity of *oriData; never 0 so that doubling makes progress */
+	size_t nalloc = targetOriSize > 0 ? targetOriSize : SZ_ZLIB_BUFFER_SIZE;
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*nalloc);
+	if(NULL == *oriData) {
+		printf("Error: memory allocation failed for deflate uncompression\n");
+		exit(0);
+	}
+	memset(&z_strm, 0, sizeof(z_strm));
+
+	z_strm.next_in  = compressBytes;
+	z_strm.avail_in = 0;
+	z_strm.next_out = *oriData;
+	z_strm.avail_out = 0;
+
+	status = inflateInit(&z_strm);
+	CHECK_ERR(status, "inflateInit");
+
+	do{
+		size_t inLeft = cmpSize - z_strm.total_in;
+		size_t outLeft = nalloc - z_strm.total_out;
+
+		/* If we ran out of buffer space, get more */
+		if(0 == outLeft) {
+			void *new_outbuf;         /* Pointer to new output buffer */
+
+			/* Allocate a buffer twice as big */
+			nalloc *= 2;
+			if(NULL == (new_outbuf = realloc(*oriData, nalloc))) {
+				(void)inflateEnd(&z_strm);
+				printf("Error: memory allocation failed for deflate uncompression\n");
+				exit(0);
+			} /* end if */
+			*oriData = new_outbuf;
+
+			/* realloc may have moved the data: re-anchor the output pointer */
+			z_strm.next_out = (*oriData) + z_strm.total_out;
+			outLeft = nalloc - z_strm.total_out;
+		} /* end if */
+
+		/* small windows, but never larger than what is really available */
+		z_strm.avail_in = (uInt)(inLeft < SZ_ZLIB_BUFFER_SIZE ? inLeft : SZ_ZLIB_BUFFER_SIZE);
+		z_strm.avail_out = (uInt)(outLeft < SZ_ZLIB_BUFFER_SIZE ? outLeft : SZ_ZLIB_BUFFER_SIZE);
+
+		/* Uncompress some data */
+		status = inflate(&z_strm, Z_SYNC_FLUSH);
+
+		/* Check if we are done uncompressing data */
+		if (Z_STREAM_END==status)
+			break;  /*done*/
+
+		/* includes Z_BUF_ERROR when the input is exhausted before the stream ends */
+		if (Z_OK!=status) {
+			(void)inflateEnd(&z_strm);
+			printf("Error: inflate() failed\n");
+			exit(0);
+		}
+	}while(status==Z_OK);
+
+	status = inflateEnd(&z_strm);
+	CHECK_ERR(status, "inflateEnd");
+
+	return z_strm.total_out;
+}
+
+/**
+ * Decompress a zlib stream chunk by chunk: the input is handed to inflate() in
+ * pieces of SZ_ZLIB_BUFFER_SIZE bytes (the last piece is the remainder) and the
+ * output is appended to a malloc'ed buffer of targetOriSize bytes.
+ *
+ * @param compressBytes compressed input
+ * @param cmpSize       number of compressed bytes
+ * @param oriData       receives the malloc'ed output buffer
+ * @param targetOriSize size of the output buffer
+ * @return strm.total_out; or, converted to unsigned long, the inflateInit()
+ *         error code, or Z_DATA_ERROR / Z_MEM_ERROR reported by inflate()
+ *         (Z_NEED_DICT is mapped to Z_DATA_ERROR)
+ */
+unsigned long zlib_uncompress4(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+    int ret;
+    unsigned int have;
+    z_stream strm;
+    unsigned char *in = compressBytes;
+    unsigned char *out;
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);		
+	out = *oriData;
+
+    /* allocate inflate state */
+    strm.zalloc = Z_NULL;
+    strm.zfree = Z_NULL;
+    strm.opaque = Z_NULL;
+    strm.avail_in = 0;
+    strm.next_in = Z_NULL;
+    ret = inflateInit(&strm);
+    if (ret != Z_OK)
+	{
+        return ret;
+	}
+
+	/* p_size: end offset of the current input chunk; av_in: size of that chunk */
+	size_t p_size = 0, av_in = 0;
+    /* decompress until deflate stream ends or end of file */
+    do {
+		p_size += SZ_ZLIB_BUFFER_SIZE;
+		if(p_size>cmpSize)
+			av_in = cmpSize - (p_size - SZ_ZLIB_BUFFER_SIZE);
+		else
+			av_in = SZ_ZLIB_BUFFER_SIZE;
+		strm.avail_in = av_in;
+        
+        /* input exhausted before Z_STREAM_END: stop with what has been produced */
+        if (strm.avail_in == 0)
+            break;
+        strm.next_in = in;
+
+        /* run inflate() on input until output buffer not full */
+        do {
+            /* NOTE(review): a full SZ_ZLIB_BUFFER_SIZE window is offered each time,
+               independent of the room left in the targetOriSize buffer — confirm
+               callers always pass a targetOriSize that covers the whole stream */
+            strm.avail_out = SZ_ZLIB_BUFFER_SIZE;
+            strm.next_out = out;
+            ret = inflate(&strm, Z_NO_FLUSH);
+            //assert(ret != Z_STREAM_ERROR);  /* state not clobbered */
+            switch (ret) {
+            case Z_NEED_DICT:
+                ret = Z_DATA_ERROR;     /* and fall through */
+            case Z_DATA_ERROR:
+            case Z_MEM_ERROR:
+                (void)inflateEnd(&strm);
+                return ret;
+            }
+            have = SZ_ZLIB_BUFFER_SIZE - strm.avail_out;
+            
+            out += have;
+
+        } while (strm.avail_out == 0);
+		
+		in+=av_in;
+        /* done when inflate() says it's done */
+    } while (ret != Z_STREAM_END);
+
+    /* clean up and return */
+    (void)inflateEnd(&strm);
+    
+    return strm.total_out;	
+}
+
+/**
+ * Decompress at most the first 65536 bytes of a zlib stream into a malloc'ed
+ * buffer of 65536 bytes.
+ *
+ * Decoding stops when the buffer is full, the input is used up, the stream
+ * ends, or inflate() stops reporting Z_OK. The input and output windows are
+ * clamped to what is really left, and the inflate state is always released.
+ *
+ * @param compressBytes compressed input
+ * @param cmpSize       number of compressed bytes available
+ * @param oriData       receives the malloc'ed 65536-byte output buffer
+ * @return number of bytes decoded (also when inflate() reported an error), or
+ *         SZ_NSCS (converted to unsigned long) when inflateInit()/inflateEnd() fail
+ */
+unsigned long zlib_uncompress65536bytes(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData)
+{
+	int err;
+	unsigned long targetOriSize = 65536;
+	unsigned long produced;
+	z_stream d_stream = {0}; /* decompression stream */
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);
+
+	d_stream.zalloc = (alloc_func)0;
+	d_stream.zfree = (free_func)0;
+	d_stream.opaque = (voidpf)0;
+
+	d_stream.next_in  = compressBytes;
+	d_stream.avail_in = 0;
+	d_stream.next_out = *oriData;
+
+	err = inflateInit(&d_stream);
+	CHECK_ERR(err, "inflateInit");
+
+	while (d_stream.total_out < targetOriSize && d_stream.total_in < cmpSize) {
+		/* small windows, but never beyond the input or the 65536-byte buffer */
+		unsigned long inLeft = cmpSize - d_stream.total_in;
+		unsigned long outLeft = targetOriSize - d_stream.total_out;
+		d_stream.avail_in = (uInt)(inLeft < SZ_ZLIB_BUFFER_SIZE ? inLeft : SZ_ZLIB_BUFFER_SIZE);
+		d_stream.avail_out = (uInt)(outLeft < SZ_ZLIB_BUFFER_SIZE ? outLeft : SZ_ZLIB_BUFFER_SIZE);
+		err = inflate(&d_stream, Z_SYNC_FLUSH);
+		/* Z_STREAM_END, any error, or Z_NEED_DICT (which would otherwise spin forever) */
+		if (err != Z_OK)
+			break;
+	}
+
+	produced = d_stream.total_out;
+	if(err<0)
+	{
+		/* keep the partial-result contract, but do not leak the inflate state */
+		(void)inflateEnd(&d_stream);
+		return produced;
+	}
+	err = inflateEnd(&d_stream);
+	CHECK_ERR(err, "inflateEnd");
+
+	return produced;
+}
+
+/**
+ * Decompress a zlib stream in pieces of at most SZ_ZLIB_BUFFER_SIZE bytes into
+ * a malloc'ed buffer of targetOriSize bytes.
+ *
+ * Decoding stops when the buffer is full, the input is used up or the stream
+ * ends. The input and output windows are clamped to what is really left, so
+ * inflate() never reads past cmpSize nor writes past targetOriSize.
+ *
+ * @param compressBytes compressed input
+ * @param cmpSize       number of compressed bytes
+ * @param oriData       receives the malloc'ed output buffer
+ * @param targetOriSize capacity of the output buffer
+ * @return number of decompressed bytes, or SZ_NSCS (converted to unsigned long)
+ *         after a message on stderr when a zlib call fails
+ */
+unsigned long zlib_uncompress5(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+	int err;
+	unsigned long produced;
+	z_stream d_stream = {0}; /* decompression stream */
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);
+
+	d_stream.zalloc = (alloc_func)0;
+	d_stream.zfree = (free_func)0;
+	d_stream.opaque = (voidpf)0;
+
+	d_stream.next_in  = compressBytes;
+	d_stream.avail_in = 0;
+	d_stream.next_out = *oriData;
+
+	err = inflateInit(&d_stream);
+	CHECK_ERR(err, "inflateInit");
+
+	while (d_stream.total_out < targetOriSize && d_stream.total_in < cmpSize) {
+		/* small windows, but never beyond the input or the output buffer */
+		unsigned long inLeft = cmpSize - d_stream.total_in;
+		unsigned long outLeft = targetOriSize - d_stream.total_out;
+		d_stream.avail_in = (uInt)(inLeft < SZ_ZLIB_BUFFER_SIZE ? inLeft : SZ_ZLIB_BUFFER_SIZE);
+		d_stream.avail_out = (uInt)(outLeft < SZ_ZLIB_BUFFER_SIZE ? outLeft : SZ_ZLIB_BUFFER_SIZE);
+		err = inflate(&d_stream, Z_SYNC_FLUSH);
+		if (err == Z_STREAM_END) break;
+		if (err != Z_OK) {
+			/* same report as CHECK_ERR, plus the cleanup that macro cannot do */
+			fprintf(stderr, "%s error: %d\n", "inflate", err);
+			(void)inflateEnd(&d_stream);
+			return SZ_NSCS;
+		}
+	}
+
+	produced = d_stream.total_out;
+	err = inflateEnd(&d_stream);
+	CHECK_ERR(err, "inflateEnd");
+
+	return produced;
+}
diff --git a/deps/SZ/sz/src/conf.c b/deps/SZ/sz/src/conf.c
new file mode 100644
index 0000000000000000000000000000000000000000..02198dbd6ce573fbb8759b7f200772cf7ca2a050
--- /dev/null
+++ b/deps/SZ/sz/src/conf.c
@@ -0,0 +1,459 @@
+/**
+ *  @file   conf.c
+ *  @author Sheng Di (sdi1@anl.gov or disheng222@gmail.com)
+ *  @date   2015.
+ *  @brief  Configuration loading functions for the SZ library.
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <math.h>
+#include "string.h"
+#include "sz.h"
+#include "iniparser.h"
+#include "Huffman.h"
+#include "pastri.h"
+
+/*-------------------------------------------------------------------------*/
+/**
+    @brief      It reads the configuration given in the configuration file.
+    @return     integer         1 if successful.
+
+    This function reads the configuration given in the SZ configuration
+    file and sets other required parameters.
+
+ **/
+ 
+/*struct node_t *pool;
+node *qqq;
+node *qq;
+int n_nodes = 0, qend;
+unsigned long **code;
+unsigned char *cout;
+int n_inode;*/ 
+ 
+/**
+ * Round a value up to the next power of two.
+ *
+ * Works by decrementing, copying the highest set bit into every lower
+ * position (shifts of 1, 2, 4, 8 and 16), then incrementing again.
+ *
+ * @param base value to round; a power of two is returned unchanged
+ * @return the rounded value; 0 for base == 0 (unsigned wrap-around)
+ */
+unsigned int roundUpToPowerOf2(unsigned int base)
+{
+  unsigned int smeared = base - 1;
+  unsigned int shift;
+
+  for (shift = 1; shift <= 16; shift <<= 1)
+    smeared |= smeared >> shift;
+
+  return smeared + 1;
+}
+ 
+/**
+ * Store a quantization interval count in the global exe_params.
+ *
+ * @param quant_intervals value written to intvCapacity; half of it
+ *        (integer division) is written to intvRadius
+ */
+void updateQuantizationInfo(int quant_intervals)
+{
+	const int halfIntervals = quant_intervals/2;
+	exe_params->intvRadius = halfIntervals;
+	exe_params->intvCapacity = quant_intervals;
+} 
+ 
+/**
+ * Derive an absolute error bound from a PSNR target.
+ *
+ * Computes value_range * 10^(-(psnr + 10*log10(1 - 2/3*threshold)) / 20).
+ *
+ * @param psnr        PSNR target
+ * @param threshold   value entering the log10 correction term
+ * @param value_range scale factor applied to the result
+ * @return the absolute error bound given by the formula above
+ */
+double computeABSErrBoundFromPSNR(double psnr, double threshold, double value_range)
+{
+	double correctedPsnr = psnr + 10 * log10(1-2.0/3.0*threshold);
+	return value_range * pow(10, correctedPsnr/(-20));
+} 
+
+/**
+ * Derive an absolute error bound from a norm error: sqrt(3/nbEle) * normErr.
+ *
+ * @param normErr norm error target
+ * @param nbEle   number of elements (used as the divisor)
+ * @return the absolute error bound given by the formula above
+ */
+double computeABSErrBoundFromNORM_ERR(double normErr, size_t nbEle)
+{
+	double perElement = 3.0/nbEle;
+	return sqrt(perElement)*normErr;
+} 
+
+ 
+/*-------------------------------------------------------------------------*/
+/**
+ * @brief Allocate the global SZ parameter blocks and fill them in.
+ *
+ * Allocates confparams_cpr and exe_params, detects the host byte order into
+ * sysEndianType, then either installs the built-in defaults (sz_cfgFile == NULL)
+ * or loads the settings from the given file through iniparser.
+ *
+ * @param sz_cfgFile path of the configuration file, or NULL for the built-in defaults
+ * @return SZ_SCES on success; SZ_NSCS when an allocation fails, the file is not
+ *         accessible or cannot be parsed, or a setting has an unsupported value.
+ *         The iniparser dictionary is released on every return path.
+ * */
+int SZ_ReadConf(const char* sz_cfgFile) {
+	// Check access to SZ configuration file and load dictionary
+	//record the setting in confparams_cpr
+	confparams_cpr = (sz_params*)malloc(sizeof(sz_params));
+	exe_params = (sz_exedata*)malloc(sizeof(sz_exedata));
+	if(confparams_cpr == NULL || exe_params == NULL)
+	{
+		printf("[SZ] Error: memory allocation failed while loading the configuration.\n");
+		return SZ_NSCS;
+	}
+
+	int x = 1;
+	char sol_name[256];
+	char *modeBuf;
+	char *errBoundMode;
+	char *endianTypeString;
+	dictionary *ini;
+	char *par;
+
+	/* byte-order probe: the first byte of the int 1 is 1 only on a little-endian host */
+	char *y = (char*)&x;
+
+	if(*y==1)
+		sysEndianType = LITTLE_ENDIAN_SYSTEM;
+	else //=0
+		sysEndianType = BIG_ENDIAN_SYSTEM;
+
+	confparams_cpr->plus_bits = 3;
+
+	/* no file given: install the built-in defaults and stop here */
+	if(sz_cfgFile == NULL)
+	{
+		dataEndianType = LITTLE_ENDIAN_DATA;
+		confparams_cpr->sol_ID = SZ;
+		confparams_cpr->max_quant_intervals = 65536;
+		confparams_cpr->maxRangeRadius = confparams_cpr->max_quant_intervals/2;
+
+		exe_params->intvCapacity = confparams_cpr->maxRangeRadius*2;
+		exe_params->intvRadius = confparams_cpr->maxRangeRadius;
+
+		confparams_cpr->quantization_intervals = 0;
+		exe_params->optQuantMode = 1;
+		confparams_cpr->predThreshold = 0.99;
+		confparams_cpr->sampleDistance = 100;
+
+		confparams_cpr->szMode = SZ_BEST_COMPRESSION;
+		confparams_cpr->losslessCompressor = ZSTD_COMPRESSOR; //other option: GZIP_COMPRESSOR;
+		if(confparams_cpr->losslessCompressor==ZSTD_COMPRESSOR)
+			confparams_cpr->gzipMode = 3; //fast mode
+		else
+			confparams_cpr->gzipMode = 1; //high speed mode
+
+		confparams_cpr->errorBoundMode = PSNR;
+		confparams_cpr->psnr = 90;
+		confparams_cpr->absErrBound = 1E-4;
+		confparams_cpr->relBoundRatio = 1E-4;
+		confparams_cpr->accelerate_pw_rel_compression = 1;
+
+		confparams_cpr->pw_relBoundRatio = 1E-3;
+		confparams_cpr->segment_size = 36;
+
+		confparams_cpr->pwr_type = SZ_PWR_MIN_TYPE;
+
+		confparams_cpr->snapshotCmprStep = 5;
+
+		confparams_cpr->withRegression = SZ_WITH_LINEAR_REGRESSION;
+
+		confparams_cpr->randomAccess = 0; //0: no random access , 1: support random access
+
+		confparams_cpr->protectValueRange = 0;
+
+		return SZ_SCES;
+	}
+
+	if (access(sz_cfgFile, F_OK) != 0)
+	{
+		printf("[SZ] Configuration file NOT accessible.\n");
+		return SZ_NSCS;
+	}
+
+	//printf("[SZ] Reading SZ configuration file (%s) ...\n", sz_cfgFile);
+	ini = iniparser_load(sz_cfgFile);
+	if (ini == NULL)
+	{
+		printf("[SZ] Iniparser failed to parse the conf. file.\n");
+		return SZ_NSCS;
+	}
+
+	endianTypeString = iniparser_getstring(ini, "ENV:dataEndianType", "LITTLE_ENDIAN_DATA");
+	if(strcmp(endianTypeString, "LITTLE_ENDIAN_DATA")==0)
+		dataEndianType = LITTLE_ENDIAN_DATA;
+	else if(strcmp(endianTypeString, "BIG_ENDIAN_DATA")==0)
+		dataEndianType = BIG_ENDIAN_DATA;
+	else
+	{
+		printf("Error: Wrong dataEndianType: please set it correctly in sz.config.\n");
+		iniparser_freedict(ini);
+		return SZ_NSCS;
+	}
+
+	// Reading/setting detection parameters
+
+	/* the default is NULL here: never hand a NULL pointer to "%s" */
+	par = iniparser_getstring(ini, "ENV:sol_name", NULL);
+	snprintf(sol_name, sizeof(sol_name), "%s", par != NULL ? par : "");
+
+	if(strcmp(sol_name, "SZ")==0)
+		confparams_cpr->sol_ID = SZ;
+	else if(strcmp(sol_name, "PASTRI")==0)
+		confparams_cpr->sol_ID = PASTRI;
+	else if(strcmp(sol_name, "SZ_Transpose")==0)
+		confparams_cpr->sol_ID = SZ_Transpose;
+	else{
+		printf("[SZ] Error: wrong solution name (please check sz.config file), sol=%s\n", sol_name);
+		iniparser_freedict(ini);
+		return SZ_NSCS;
+	}
+
+	if(confparams_cpr->sol_ID==SZ || confparams_cpr->sol_ID==SZ_Transpose)
+	{
+		int max_quant_intervals = iniparser_getint(ini, "PARAMETER:max_quant_intervals", 65536);
+		confparams_cpr->max_quant_intervals = max_quant_intervals;
+
+		int quantization_intervals = (int)iniparser_getint(ini, "PARAMETER:quantization_intervals", 0);
+		confparams_cpr->quantization_intervals = quantization_intervals;
+		if(quantization_intervals>0)
+		{
+			/* a fixed interval count overrides max_quant_intervals and disables optQuantMode */
+			updateQuantizationInfo(quantization_intervals);
+			confparams_cpr->max_quant_intervals = max_quant_intervals = quantization_intervals;
+			exe_params->optQuantMode = 0;
+		}
+		else //==0
+		{
+			confparams_cpr->maxRangeRadius = max_quant_intervals/2;
+
+			exe_params->intvCapacity = confparams_cpr->maxRangeRadius*2;
+			exe_params->intvRadius = confparams_cpr->maxRangeRadius;
+
+			exe_params->optQuantMode = 1;
+		}
+
+		if(quantization_intervals%2!=0)
+		{
+			printf("Error: quantization_intervals must be an even number!\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+
+		confparams_cpr->predThreshold = (float)iniparser_getdouble(ini, "PARAMETER:predThreshold", 0);
+		confparams_cpr->sampleDistance = (int)iniparser_getint(ini, "PARAMETER:sampleDistance", 0);
+
+		modeBuf = iniparser_getstring(ini, "PARAMETER:szMode", NULL);
+		if(modeBuf==NULL)
+		{
+			printf("[SZ] Error: Null szMode setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+		else if(strcmp(modeBuf, "SZ_BEST_SPEED")==0)
+			confparams_cpr->szMode = SZ_BEST_SPEED;
+		else if(strcmp(modeBuf, "SZ_DEFAULT_COMPRESSION")==0)
+			confparams_cpr->szMode = SZ_DEFAULT_COMPRESSION;
+		else if(strcmp(modeBuf, "SZ_BEST_COMPRESSION")==0)
+			confparams_cpr->szMode = SZ_BEST_COMPRESSION;
+		else
+		{
+			printf("[SZ] Error: Wrong szMode setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+
+		modeBuf = iniparser_getstring(ini, "PARAMETER:losslessCompressor", "ZSTD_COMPRESSOR");
+		if(strcmp(modeBuf, "GZIP_COMPRESSOR")==0)
+			confparams_cpr->losslessCompressor = GZIP_COMPRESSOR;
+		else if(strcmp(modeBuf, "ZSTD_COMPRESSOR")==0)
+			confparams_cpr->losslessCompressor = ZSTD_COMPRESSOR;
+		else
+		{
+			printf("[SZ] Error: Wrong losslessCompressor setting (please check sz.config file)\n");
+			printf("No Such a lossless compressor: %s\n", modeBuf);
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+
+		modeBuf = iniparser_getstring(ini, "PARAMETER:withLinearRegression", "YES");
+		if(strcmp(modeBuf, "YES")==0 || strcmp(modeBuf, "yes")==0)
+			confparams_cpr->withRegression = SZ_WITH_LINEAR_REGRESSION;
+		else
+			confparams_cpr->withRegression = SZ_NO_REGRESSION;
+
+		modeBuf = iniparser_getstring(ini, "PARAMETER:gzipMode", "Gzip_BEST_SPEED");
+		if(modeBuf==NULL)
+		{
+			printf("[SZ] Error: Null Gzip mode setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+		else if(strcmp(modeBuf, "Gzip_NO_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 0;
+		else if(strcmp(modeBuf, "Gzip_BEST_SPEED")==0)
+			confparams_cpr->gzipMode = 1;
+		else if(strcmp(modeBuf, "Gzip_BEST_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 9;
+		else if(strcmp(modeBuf, "Gzip_DEFAULT_COMPRESSION")==0)
+			confparams_cpr->gzipMode = -1;
+		else
+		{
+			printf("[SZ] Error: Wrong gzip Mode (please check sz.config file)\n");
+			iniparser_freedict(ini);	/* this return used to leak the dictionary */
+			return SZ_NSCS;
+		}
+
+		/* NOTE(review): the zstd level is stored in the same gzipMode field and is
+		   applied whatever losslessCompressor is, so it always overwrites the gzip
+		   level parsed just above — confirm this is intended for GZIP_COMPRESSOR */
+		modeBuf = iniparser_getstring(ini, "PARAMETER:zstdMode", "Zstd_HIGH_SPEED");
+		if(modeBuf==NULL)
+		{
+			printf("[SZ] Error: Null Zstd mode setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+		else if(strcmp(modeBuf, "Zstd_BEST_SPEED")==0)
+			confparams_cpr->gzipMode = 1;
+		else if(strcmp(modeBuf, "Zstd_HIGH_SPEED")==0)
+			confparams_cpr->gzipMode = 3;
+		else if(strcmp(modeBuf, "Zstd_HIGH_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 19;
+		else if(strcmp(modeBuf, "Zstd_BEST_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 22;
+		else if(strcmp(modeBuf, "Zstd_DEFAULT_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 3;
+		else
+		{
+			printf("[SZ] Error: Wrong zstd Mode (please check sz.config file)\n");
+			iniparser_freedict(ini);	/* this return used to leak the dictionary */
+			return SZ_NSCS;
+		}
+
+		modeBuf = iniparser_getstring(ini, "PARAMETER:protectValueRange", "YES");
+		if(strcmp(modeBuf, "YES")==0)
+			confparams_cpr->protectValueRange = 1;
+		else
+			confparams_cpr->protectValueRange = 0;
+
+		confparams_cpr->randomAccess = (int)iniparser_getint(ini, "PARAMETER:randomAccess", 0);
+
+		//TODO
+		confparams_cpr->snapshotCmprStep = (int)iniparser_getint(ini, "PARAMETER:snapshotCmprStep", 5);
+
+		errBoundMode = iniparser_getstring(ini, "PARAMETER:errorBoundMode", NULL);
+		if(errBoundMode==NULL)
+		{
+			printf("[SZ] Error: Null error bound setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+		else if(strcmp(errBoundMode,"ABS")==0||strcmp(errBoundMode,"abs")==0)
+			confparams_cpr->errorBoundMode=ABS;
+		else if(strcmp(errBoundMode, "REL")==0||strcmp(errBoundMode,"rel")==0)
+			confparams_cpr->errorBoundMode=REL;
+		else if(strcmp(errBoundMode, "VR_REL")==0||strcmp(errBoundMode, "vr_rel")==0)
+			confparams_cpr->errorBoundMode=REL;	/* VR_REL is accepted as another spelling of REL */
+		else if(strcmp(errBoundMode, "ABS_AND_REL")==0||strcmp(errBoundMode, "abs_and_rel")==0)
+			confparams_cpr->errorBoundMode=ABS_AND_REL;
+		else if(strcmp(errBoundMode, "ABS_OR_REL")==0||strcmp(errBoundMode, "abs_or_rel")==0)
+			confparams_cpr->errorBoundMode=ABS_OR_REL;
+		else if(strcmp(errBoundMode, "PW_REL")==0||strcmp(errBoundMode, "pw_rel")==0)
+			confparams_cpr->errorBoundMode=PW_REL;
+		else if(strcmp(errBoundMode, "PSNR")==0||strcmp(errBoundMode, "psnr")==0)
+			confparams_cpr->errorBoundMode=PSNR;
+		else if(strcmp(errBoundMode, "ABS_AND_PW_REL")==0||strcmp(errBoundMode, "abs_and_pw_rel")==0)
+			confparams_cpr->errorBoundMode=ABS_AND_PW_REL;
+		else if(strcmp(errBoundMode, "ABS_OR_PW_REL")==0||strcmp(errBoundMode, "abs_or_pw_rel")==0)
+			confparams_cpr->errorBoundMode=ABS_OR_PW_REL;
+		else if(strcmp(errBoundMode, "REL_AND_PW_REL")==0||strcmp(errBoundMode, "rel_and_pw_rel")==0)
+			confparams_cpr->errorBoundMode=REL_AND_PW_REL;
+		else if(strcmp(errBoundMode, "REL_OR_PW_REL")==0||strcmp(errBoundMode, "rel_or_pw_rel")==0)
+			confparams_cpr->errorBoundMode=REL_OR_PW_REL;
+		else if(strcmp(errBoundMode, "NORM")==0||strcmp(errBoundMode, "norm")==0)
+			confparams_cpr->errorBoundMode=NORM;
+		else
+		{
+			printf("[SZ] Error: Wrong error bound mode (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+
+		confparams_cpr->absErrBound = (double)iniparser_getdouble(ini, "PARAMETER:absErrBound", 0);
+		confparams_cpr->relBoundRatio = (double)iniparser_getdouble(ini, "PARAMETER:relBoundRatio", 0);
+		confparams_cpr->psnr = (double)iniparser_getdouble(ini, "PARAMETER:psnr", 0);
+		confparams_cpr->normErr = (double)iniparser_getdouble(ini, "PARAMETER:normErr", 0);
+		confparams_cpr->pw_relBoundRatio = (double)iniparser_getdouble(ini, "PARAMETER:pw_relBoundRatio", 0);
+		confparams_cpr->segment_size = (int)iniparser_getint(ini, "PARAMETER:segment_size", 0);
+		confparams_cpr->accelerate_pw_rel_compression = (int)iniparser_getint(ini, "PARAMETER:accelerate_pw_rel_compression", 1);
+
+		/* a non-NULL default is supplied, so modeBuf is a valid string below */
+		modeBuf = iniparser_getstring(ini, "PARAMETER:pwr_type", "MIN");
+
+		if(strcmp(modeBuf, "MIN")==0)
+			confparams_cpr->pwr_type = SZ_PWR_MIN_TYPE;
+		else if(strcmp(modeBuf, "AVG")==0)
+			confparams_cpr->pwr_type = SZ_PWR_AVG_TYPE;
+		else if(strcmp(modeBuf, "MAX")==0)
+			confparams_cpr->pwr_type = SZ_PWR_MAX_TYPE;
+		else
+		{
+			printf("[SZ] Error: Wrong pwr_type setting (please check sz.config file).\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+
+		//initialization for Huffman encoding
+		//SZ_Reset();
+	}
+	else if(confparams_cpr->sol_ID == PASTRI)
+	{//load parameters for PASTRI
+		pastri_par.bf[0] = (int)iniparser_getint(ini, "PARAMETER:basisFunction_0", 0);
+		pastri_par.bf[1] = (int)iniparser_getint(ini, "PARAMETER:basisFunction_1", 0);
+		pastri_par.bf[2] = (int)iniparser_getint(ini, "PARAMETER:basisFunction_2", 0);
+		pastri_par.bf[3] = (int)iniparser_getint(ini, "PARAMETER:basisFunction_3", 0);
+		pastri_par.numBlocks = (int)iniparser_getint(ini, "PARAMETER:numBlocks", 0);
+		confparams_cpr->absErrBound = pastri_par.originalEb = (double)iniparser_getdouble(ini, "PARAMETER:absErrBound", 1E-3);
+	}
+
+	iniparser_freedict(ini);
+	return SZ_SCES;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+    @brief      Load the SZ configuration and report a failure.
+    @param      sz_cfgFile      forwarded unchanged to SZ_ReadConf().
+    @return     integer         SZ_SCES when SZ_ReadConf() succeeded, SZ_NSCS otherwise.
+
+    Thin wrapper around SZ_ReadConf(): when reading fails, an error line
+    is printed to stdout before SZ_NSCS is returned.
+
+ **/
+/*-------------------------------------------------------------------------*/
+int SZ_LoadConf(const char* sz_cfgFile) {
+    if (SZ_ReadConf(sz_cfgFile) == SZ_SCES)
+        return SZ_SCES;
+
+    printf("[SZ] ERROR: Impossible to read configuration.\n");
+    return SZ_NSCS;
+}
+
+/**
+ * Compare the first three bytes of a version array with versionNumber.
+ *
+ * @param version array of at least three bytes
+ * @return 1 when version[0..2] equal versionNumber[0..2], 0 otherwise
+ */
+int checkVersion(char* version)
+{
+	return version[0]==versionNumber[0]
+		&& version[1]==versionNumber[1]
+		&& version[2]==versionNumber[2];
+}
+
+/**
+ * Pack a three-part version into one integer: major*10000 + minor*100 + revision.
+ *
+ * @param major    major version
+ * @param minor    minor version
+ * @param revision revision number
+ * @return the packed version, e.g. 20108 for 2.1.8
+ */
+inline int computeVersion(int major, int minor, int revision)
+{
+	const int majorPart = major*10000;
+	const int minorPart = minor*100;
+	return majorPart+minorPart+revision;
+}
+
+/**
+ * Version check that is strict only for old data.
+ *
+ * Versions older than 2.1.8 (packed value below 20108) must match
+ * versionNumber exactly, as decided by checkVersion(); 2.1.8 and newer are
+ * always accepted.
+ *
+ * @param version array holding major, minor and revision in its first three bytes
+ * @return 1 when the version is accepted, 0 otherwise
+ */
+int checkVersion2(char* version)
+{
+	const int firstLenientVersion = 20108;
+
+	if(computeVersion(version[0], version[1], version[2]) >= firstLenientVersion)
+		return 1;
+	return checkVersion(version);
+}
+
+/**
+ * Allocate the global sz_tsc metadata block with every byte set to zero.
+ *
+ * calloc() replaces the former malloc()+memset() pair, so a failed allocation
+ * leaves sz_tsc == NULL instead of writing through a NULL pointer.
+ */
+void initSZ_TSC()
+{
+	sz_tsc = (sz_tsc_metadata*)calloc(1, sizeof(sz_tsc_metadata));
+	/*sprintf(sz_tsc->metadata_filename, "sz_tsc_metainfo.txt");
+	sz_tsc->metadata_file = fopen(sz_tsc->metadata_filename, "wb");
+	if (sz_tsc->metadata_file == NULL)
+	{
+		printf("Failed to open sz_tsc_metainfo.txt file for writing metainfo.\n");
+		exit(1);
+	}
+	fputs("#metadata of the time-step based compression\n", sz_tsc->metadata_file);	*/
+}
+
+/*double fabs(double value)
+{
+	if(value<0)
+		return -value;
+	else
+		return value;
+}*/
diff --git a/deps/SZ/sz/src/dataCompression.c b/deps/SZ/sz/src/dataCompression.c
new file mode 100644
index 0000000000000000000000000000000000000000..0051c542158010f1cfd9896b664a39a0181eced1
--- /dev/null
+++ b/deps/SZ/sz/src/dataCompression.c
@@ -0,0 +1,980 @@
+/**
+ *  @file dataCompression.c
+ *  @author Sheng Di, Dingwen Tao, Xin Liang, Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang
+ *  @date April, 2016
+ *  @brief Value-range, error-bound and byte-level helper routines for integer, float and double data
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "sz.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageD.h"
+#include "CompressElement.h"
+#include "dataCompression.h"
+
+/*
+ * Picks the storage width in bytes (1, 2, 4 or 8) for a value range size.
+ * Thresholds are inclusive: <=256 -> 1, <=65536 -> 2, <=2^32 -> 4, otherwise 8.
+ * NOTE(review): a range of exactly 256 (or 65536, 2^32) maps to the smaller width — confirm callers never need to store that offset itself.
+ */
+int computeByteSizePerIntValue(long valueRangeSize)
+{
+	if(valueRangeSize > 4294967296) return 8; /* above 2^32 */
+	if(valueRangeSize > 65536) return 4;
+	return (valueRangeSize > 256) ? 2 : 1;
+}
+
+long computeRangeSize_int(void* oriData, int dataType, size_t size, int64_t* valueRangeSize)
+{
+	size_t i = 0; // not referenced directly below; presumably the loop index used inside computeMinMax — TODO confirm
+	long max = 0, min = 0; // both stay 0 when dataType matches none of the branches
+	// Each branch views oriData as the element type selected by dataType, seeds min/max from element 0 and runs computeMinMax.
+	if(dataType==SZ_UINT8)
+	{
+		unsigned char* data = (unsigned char*)oriData;
+		unsigned char data_; // not referenced directly below; presumably scratch for computeMinMax — TODO confirm
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_INT8)
+	{
+		char* data = (char*)oriData; // NOTE(review): plain char may be unsigned on some targets — confirm signed 8-bit data is read correctly there
+		char data_;
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_UINT16)
+	{
+		unsigned short* data = (unsigned short*)oriData;
+		unsigned short data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_INT16)
+	{ 
+		short* data = (short*)oriData;
+		short data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_UINT32)
+	{
+		unsigned int* data = (unsigned int*)oriData;
+		unsigned int data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_INT32)
+	{
+		int* data = (int*)oriData;
+		int data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_UINT64)
+	{
+		unsigned long* data = (unsigned long*)oriData; // NOTE(review): unsigned long has a platform-dependent width — confirm it is 64 bits on every target
+		unsigned long data_; 
+		min = data[0], max = min; // unsigned long stored into long: values above LONG_MAX do not fit
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_INT64)
+	{
+		long* data = (long *)oriData; // NOTE(review): long has a platform-dependent width — confirm it is 64 bits on every target
+		long data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+
+	*valueRangeSize = max - min; // the difference is computed in long before being stored as int64_t
+	return min;	
+}
+
+/*
+ * Scans oriData[0..size-1] and returns its minimum. *valueRangeSize receives max-min and
+ * *medianValue the range midpoint (min + range/2). oriData[0] is read unconditionally.
+ */
+float computeRangeSize_float(float* oriData, size_t size, float* valueRangeSize, float* medianValue)
+{
+	float lo = oriData[0], hi = oriData[0];
+	size_t idx;
+	for(idx = 1; idx < size; idx++)
+	{
+		const float v = oriData[idx];
+		if(v < lo) lo = v;
+		else if(v > hi) hi = v;
+	}
+	*valueRangeSize = hi - lo;
+	*medianValue = lo + *valueRangeSize/2;
+	return lo;
+}
+
+/*
+ * MSST19 range scan over oriData[0..size-1]: returns the minimum, stores max-min in
+ * *valueRangeSize and min + range/2 in *medianValue. For each negative element i it sets
+ * signs[i] = 1 and *positive = false (neither is written otherwise, so callers pre-initialise
+ * them). *nearZero starts as oriData[0] and is replaced by any nonzero element of strictly
+ * smaller magnitude. Fix: element 0 is now sign-checked too; the scan used to start at index 1.
+ */
+float computeRangeSize_float_MSST19(float* oriData, size_t size, float* valueRangeSize, float* medianValue, unsigned char * signs, bool* positive, float* nearZero)
+{
+    size_t i = 0;
+    float min = oriData[0];
+    float max = min;
+    *nearZero = min; /* NOTE(review): remains 0 for the whole scan when oriData[0] == 0 */
+
+    for(i=0;i<size;i++) /* starts at 0 so that oriData[0] gets its sign recorded */
+    {
+        float data = oriData[i];
+        if(data < 0){ signs[i] = 1; *positive = false; }
+        if(data != 0 && fabsf(data) < fabsf(*nearZero)) *nearZero = data;
+        if(min>data) min = data;
+        else if(max<data) max = data;
+    }
+
+    *valueRangeSize = max - min;
+    *medianValue = min + *valueRangeSize/2;
+    return min;
+}
+
+/*
+ * Returns the minimum of oriData[0..size-1]; *valueRangeSize receives max-min and
+ * *medianValue the midpoint min + range/2. oriData[0] is read before size is looked at.
+ */
+double computeRangeSize_double(double* oriData, size_t size, double* valueRangeSize, double* medianValue)
+{
+	double smallest = oriData[0], largest = oriData[0];
+	size_t pos = 1;
+	while(pos < size)
+	{
+		const double sample = oriData[pos++];
+		if(sample < smallest) smallest = sample;
+		else if(sample > largest) largest = sample;
+	}
+	*valueRangeSize = largest - smallest;
+	*medianValue = smallest + *valueRangeSize/2;
+	return smallest;
+}
+
+/*
+ * MSST19 range scan over oriData[0..size-1]: returns the minimum, stores max-min in
+ * *valueRangeSize and min + range/2 in *medianValue. For each negative element i it sets
+ * signs[i] = 1 and *positive = false (neither is written otherwise, so callers pre-initialise
+ * them). *nearZero starts as oriData[0] and is replaced by any nonzero element of strictly
+ * smaller magnitude. Fix: element 0 is now sign-checked too; the scan used to start at index 1.
+ */
+double computeRangeSize_double_MSST19(double* oriData, size_t size, double* valueRangeSize, double* medianValue, unsigned char * signs, bool* positive, double* nearZero)
+{
+    size_t i = 0;
+    double min = oriData[0];
+    double max = min;
+    *nearZero = min; /* NOTE(review): remains 0 for the whole scan when oriData[0] == 0 */
+
+    for(i=0;i<size;i++) /* starts at 0 so that oriData[0] gets its sign recorded */
+    {
+        double data = oriData[i];
+        if(data < 0){ signs[i] = 1; *positive = false; }
+        if(data != 0 && fabs(data) < fabs(*nearZero)) *nearZero = data;
+        if(min>data) min = data;
+        else if(max<data) max = data;
+    }
+
+    *valueRangeSize = max - min;
+    *medianValue = min + *valueRangeSize/2;
+    return min;
+}
+
+/*
+ * Min (returned), range and midpoint over the inclusive index box [s5..e5] x ... x [s1..e1] of an array linearised with i1 varying fastest and inner extents r4..r1 (r5 is not used).
+ */
+float computeRangeSize_float_subblock(float* oriData, float* valueRangeSize, float* medianValue,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1)
+{
+	const size_t stride2 = r1, stride3 = r2*r1, stride4 = r3*r2*r1, stride5 = r4*r3*r2*r1;
+	size_t k1, k2, k3, k4, k5;
+	float lo = oriData[s5*stride5 + s4*stride4 + s3*stride3 + s2*stride2 + s1];
+	float hi = lo;
+
+	for (k5 = s5; k5 <= e5; k5++)
+	for (k4 = s4; k4 <= e4; k4++)
+	for (k3 = s3; k3 <= e3; k3++)
+	for (k2 = s2; k2 <= e2; k2++)
+	for (k1 = s1; k1 <= e1; k1++)
+	{
+		const float v = oriData[k5*stride5 + k4*stride4 + k3*stride3 + k2*stride2 + k1];
+		if (v < lo) lo = v;
+		else if (v > hi) hi = v;
+	}
+
+	*valueRangeSize = hi - lo;
+	*medianValue = lo + *valueRangeSize/2;
+	return lo;
+}
+
+
+/*
+ * Min (returned), range and midpoint over the inclusive index box [s5..e5] x ... x [s1..e1] of an array linearised with i1 varying fastest and inner extents r4..r1 (r5 is not used).
+ */
+double computeRangeSize_double_subblock(double* oriData, double* valueRangeSize, double* medianValue,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1)
+{
+	const size_t stride2 = r1, stride3 = r2*r1, stride4 = r3*r2*r1, stride5 = r4*r3*r2*r1;
+	size_t k1, k2, k3, k4, k5;
+	double lo = oriData[s5*stride5 + s4*stride4 + s3*stride3 + s2*stride2 + s1];
+	double hi = lo;
+
+	for (k5 = s5; k5 <= e5; k5++)
+	for (k4 = s4; k4 <= e4; k4++)
+	for (k3 = s3; k3 <= e3; k3++)
+	for (k2 = s2; k2 <= e2; k2++)
+	for (k1 = s1; k1 <= e1; k1++)
+	{
+		const double v = oriData[k5*stride5 + k4*stride4 + k3*stride3 + k2*stride2 + k1];
+		if (v < lo) lo = v;
+		else if (v > hi) hi = v;
+	}
+
+	*valueRangeSize = hi - lo;
+	*medianValue = lo + *valueRangeSize/2;
+	return lo;
+}
+
+
+/* Smaller of two doubles.
+ * b is returned whenever a<b does not hold (equal values, or a NaN operand).
+ */
+double min_d(double a, double b)
+{
+	return (a < b) ? a : b;
+}
+
+/* Larger of two doubles.
+ * b is returned whenever a>b does not hold (equal values, or a NaN operand).
+ */
+double max_d(double a, double b)
+{
+	return (a > b) ? a : b;
+}
+
+/* Smaller of two floats; b wins when a<b is false (equal values or a NaN operand). */
+float min_f(float a, float b)
+{
+	float picked = b;
+	if(a < b) picked = a;
+	return picked;
+}
+
+/* Larger of two floats; b wins when a>b is false (equal values or a NaN operand). */
+float max_f(float a, float b)
+{
+	float picked = b;
+	if(a > b) picked = a;
+	return picked;
+}
+
+/*
+ * Effective error bound for double data: absErrBound, relBoundRatio*valueRangeSize, their min (ABS_AND_REL) or max (ABS_OR_REL), or 0 for PW_REL.
+ * *status receives SZ_SCES, or SZ_BERR (and 0 is returned) when errBoundMode is none of the handled modes.
+ */
+double getRealPrecision_double(double valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status)
+{
+	const double relBound = relBoundRatio*valueRangeSize;
+	*status = SZ_SCES;
+	if(errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL)
+		return absErrBound;
+	if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL)
+		return relBound;
+	if(errBoundMode==ABS_AND_REL)
+		return min_d(absErrBound, relBound);
+	if(errBoundMode==ABS_OR_REL)
+		return max_d(absErrBound, relBound);
+	if(errBoundMode==PW_REL)
+		return 0;
+	printf("Error: error-bound-mode is incorrect!\n");
+	*status = SZ_BERR;
+	return 0;
+}
+
+/*
+ * Effective error bound for float data: absErrBound, relBoundRatio*valueRangeSize, or for ABS_AND_REL / ABS_OR_REL their min / max taken through min_f / max_f (i.e. after conversion to float); 0 for PW_REL.
+ * *status receives SZ_SCES, or SZ_BERR (and 0 is returned) when errBoundMode is none of the handled modes.
+ */
+double getRealPrecision_float(float valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status)
+{
+	const double relBound = relBoundRatio*valueRangeSize;
+	*status = SZ_SCES;
+	if(errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL)
+		return absErrBound;
+	if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL)
+		return relBound;
+	if(errBoundMode==ABS_AND_REL)
+		return min_f(absErrBound, relBound);
+	if(errBoundMode==ABS_OR_REL)
+		return max_f(absErrBound, relBound);
+	if(errBoundMode==PW_REL)
+		return 0;
+	printf("Error: error-bound-mode is incorrect!\n");
+	*status = SZ_BERR;
+	return 0;
+}
+
+/*
+ * Effective error bound for integer data: absErrBound, relBoundRatio*valueRangeSize, or for ABS_AND_REL / ABS_OR_REL their min / max taken through min_f / max_f (i.e. after conversion to float); -1 for PW_REL.
+ * *status receives SZ_SCES, or SZ_BERR (and 0 is returned) when errBoundMode is none of the handled modes.
+ */
+double getRealPrecision_int(long valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status)
+{
+	const double relBound = relBoundRatio*valueRangeSize;
+	*status = SZ_SCES;
+	if(errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL)
+		return absErrBound;
+	if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL)
+		return relBound;
+	if(errBoundMode==ABS_AND_REL)
+		return min_f(absErrBound, relBound);
+	if(errBoundMode==ABS_OR_REL)
+		return max_f(absErrBound, relBound);
+	if(errBoundMode==PW_REL)
+		return -1;
+	printf("Error: error-bound-mode is incorrect!\n");
+	*status = SZ_BERR;
+	return 0;
+}
+
+/*
+ * Reverses the eight bytes of data in place.
+ *
+ * Swaps data[0]<->data[7], data[1]<->data[6], data[2]<->data[5] and
+ * data[3]<->data[4]; calling it twice restores the original order.
+ */
+void symTransform_8bytes(unsigned char data[8])
+{
+	int front, back;
+
+	/* Walk inwards from both ends, exchanging one pair per step. */
+	for(front = 0, back = 7; front < back; front++, back--)
+	{
+		const unsigned char held = data[front];
+		data[front] = data[back];
+		data[back] = held;
+	}
+}
+
+inline void symTransform_2bytes(unsigned char data[2])
+{	/* swaps the two bytes in place */
+	const unsigned char second = data[1];
+	data[1] = data[0];
+	data[0] = second;
+}
+
+inline void symTransform_4bytes(unsigned char data[4])
+{
+	/* In-place reversal: outer pair first, then inner pair. */
+	const unsigned char b0 = data[0], b1 = data[1];
+
+	data[0] = data[3];
+	data[3] = b0;
+	data[1] = data[2];
+	data[2] = b1;
+}
+
+inline void compressInt8Value(int8_t tgtValue, int8_t minValue, int byteSize, unsigned char* bytes)
+{
+	const uint8_t offset = (uint8_t)(tgtValue - minValue); /* distance from minValue, modulo 256 */
+	memcpy(bytes, &offset, byteSize); /* byteSize is expected to be 1: offset is a single byte */
+}
+
+inline void compressInt16Value(int16_t tgtValue, int16_t minValue, int byteSize, unsigned char* bytes)
+{
+	/* The offset from minValue (modulo 2^16) goes through int16ToBytes_bigEndian; the last byteSize bytes of that 2-byte buffer are then copied to bytes. */
+	unsigned char encoded[2];
+	int16ToBytes_bigEndian(encoded, (uint16_t)(tgtValue - minValue));
+	memcpy(bytes, &encoded[2 - byteSize], byteSize);
+}
+
+inline void compressInt32Value(int32_t tgtValue, int32_t minValue, int byteSize, unsigned char* bytes)
+{
+	/* The offset from minValue (as uint32_t) goes through int32ToBytes_bigEndian; the last byteSize bytes of that 4-byte buffer are then copied to bytes. */
+	unsigned char encoded[4];
+	int32ToBytes_bigEndian(encoded, (uint32_t)(tgtValue - minValue));
+	memcpy(bytes, &encoded[4 - byteSize], byteSize);
+}
+
+inline void compressInt64Value(int64_t tgtValue, int64_t minValue, int byteSize, unsigned char* bytes)
+{
+	/* The offset from minValue (as uint64_t) goes through int64ToBytes_bigEndian; the last byteSize bytes of that 8-byte buffer are then copied to bytes. */
+	unsigned char encoded[8];
+	int64ToBytes_bigEndian(encoded, (uint64_t)(tgtValue - minValue));
+	memcpy(bytes, &encoded[8 - byteSize], byteSize);
+}
+
+inline void compressUInt8Value(uint8_t tgtValue, uint8_t minValue, int byteSize, unsigned char* bytes)
+{
+	const uint8_t offset = (uint8_t)(tgtValue - minValue); /* distance from minValue, modulo 256 */
+	memcpy(bytes, &offset, byteSize); /* byteSize is expected to be 1: offset is a single byte */
+}
+
+inline void compressUInt16Value(uint16_t tgtValue, uint16_t minValue, int byteSize, unsigned char* bytes)
+{
+	/* The offset from minValue (modulo 2^16) goes through int16ToBytes_bigEndian; the last byteSize bytes of that 2-byte buffer are then copied to bytes. */
+	unsigned char encoded[2];
+	int16ToBytes_bigEndian(encoded, (uint16_t)(tgtValue - minValue));
+	memcpy(bytes, &encoded[2 - byteSize], byteSize);
+}
+
+inline void compressUInt32Value(uint32_t tgtValue, uint32_t minValue, int byteSize, unsigned char* bytes)
+{
+	/* The offset from minValue (modulo 2^32) goes through int32ToBytes_bigEndian; the last byteSize bytes of that 4-byte buffer are then copied to bytes. */
+	unsigned char encoded[4];
+	int32ToBytes_bigEndian(encoded, (uint32_t)(tgtValue - minValue));
+	memcpy(bytes, &encoded[4 - byteSize], byteSize);
+}
+
+inline void compressUInt64Value(uint64_t tgtValue, uint64_t minValue, int byteSize, unsigned char* bytes)
+{
+	/* The offset from minValue (modulo 2^64) goes through int64ToBytes_bigEndian; the last byteSize bytes of that 8-byte buffer are then copied to bytes. */
+	unsigned char encoded[8];
+	int64ToBytes_bigEndian(encoded, (uint64_t)(tgtValue - minValue));
+	memcpy(bytes, &encoded[8 - byteSize], byteSize);
+}
+
+/*
+ * Encodes one float relative to medianValue and records the result in vce:
+ *   vce->curBytes / vce->curValue - the untruncated bit pattern of (tgtValue - medianValue), curBytes filled by intToBytes_bigEndian,
+ *   vce->data                     - that value after the low (32 - reqLength) pattern bits are cleared, with medianValue added back,
+ *   vce->reqBytesLength / vce->resiBitsLength - copied through unchanged.
+ * precision is accepted but not used.
+ */
+inline void compressSingleFloatValue(FloatValueCompressElement *vce, float tgtValue, float precision, float medianValue, 
+		int reqLength, int reqBytesLength, int resiBitsLength)
+{
+	lfloat bits;
+	int droppedBits = (reqLength > 32) ? 0 : 32 - reqLength; /* low-order bits to clear */
+	int rawPattern;
+
+	bits.value = tgtValue - medianValue;
+	rawPattern = bits.ivalue;
+	intToBytes_bigEndian(vce->curBytes, rawPattern);
+
+	bits.ivalue = (bits.ivalue >> droppedBits) << droppedBits;
+	vce->data = bits.value+medianValue;
+	vce->curValue = rawPattern;
+	vce->reqBytesLength = reqBytesLength;
+	vce->resiBitsLength = resiBitsLength;
+}
+
+/*
+ * MSST19 variant of the single-value encoder: tgtValue is used as is (no median is subtracted).
+ *   vce->curBytes / vce->curValue - the untruncated bit pattern (curBytes filled by intToBytes_bigEndian),
+ *   vce->data                     - the value after the low (32 - reqLength) pattern bits are cleared,
+ *   vce->reqBytesLength / vce->resiBitsLength - copied through unchanged.
+ * precision is accepted but not used.
+ */
+void compressSingleFloatValue_MSST19(FloatValueCompressElement *vce, float tgtValue, float precision, int reqLength, int reqBytesLength, int resiBitsLength)
+{
+    lfloat bits;
+    int droppedBits = (reqLength > 32) ? 0 : 32 - reqLength; /* low-order bits to clear */
+    int rawPattern;
+
+    bits.value = tgtValue;
+    rawPattern = bits.ivalue;
+    intToBytes_bigEndian(vce->curBytes, rawPattern);
+
+    bits.ivalue = (bits.ivalue >> droppedBits) << droppedBits;
+    vce->data = bits.value;
+    vce->curValue = rawPattern;
+    vce->reqBytesLength = reqBytesLength;
+    vce->resiBitsLength = resiBitsLength;
+}
+
+/*
+ * Double-precision MSST19 single-value encoder (no median is subtracted): vce->curBytes / vce->curValue
+ * keep the untruncated bit pattern, vce->data the value after the low (64 - reqLength) pattern bits are
+ * cleared; reqBytesLength / resiBitsLength are copied through. precision is accepted but not used.
+ */
+void compressSingleDoubleValue_MSST19(DoubleValueCompressElement *vce, double tgtValue, double precision, int reqLength, int reqBytesLength, int resiBitsLength)
+{
+    ldouble bits;
+    int droppedBits = (reqLength > 64) ? 0 : 64 - reqLength; /* low-order bits to clear */
+    long rawPattern;
+
+    bits.value = tgtValue;
+    rawPattern = bits.lvalue;
+    longToBytes_bigEndian(vce->curBytes, rawPattern);
+
+    bits.lvalue = (bits.lvalue >> droppedBits) << droppedBits;
+    vce->data = bits.value;
+    vce->curValue = rawPattern;
+    vce->reqBytesLength = reqBytesLength;
+    vce->resiBitsLength = resiBitsLength;
+}
+
+/*
+ * Encodes one double relative to medianValue and records the result in vce:
+ *   vce->curBytes / vce->curValue - the untruncated bit pattern of (tgtValue - medianValue), curBytes filled by longToBytes_bigEndian,
+ *   vce->data                     - that value after the low (64 - reqLength) pattern bits are cleared, with medianValue added back,
+ *   vce->reqBytesLength / vce->resiBitsLength - copied through unchanged.
+ * precision is accepted but not used.
+ */
+void compressSingleDoubleValue(DoubleValueCompressElement *vce, double tgtValue, double precision, double medianValue, 
+		int reqLength, int reqBytesLength, int resiBitsLength)
+{
+	ldouble bits;
+	int droppedBits = (reqLength > 64) ? 0 : 64 - reqLength; /* low-order bits to clear */
+	long rawPattern;
+
+	bits.value = tgtValue - medianValue;
+	rawPattern = bits.lvalue;
+	longToBytes_bigEndian(vce->curBytes, rawPattern);
+
+	bits.lvalue = (bits.lvalue >> droppedBits) << droppedBits;
+	vce->data = bits.value+medianValue;
+	vce->curValue = rawPattern;
+	vce->reqBytesLength = reqBytesLength;
+	vce->resiBitsLength = resiBitsLength;
+}
+
+/*
+ * Counts how many leading bytes of two 8-byte patterns are identical.
+ * The scan stops at the first differing byte (or after all 8), and the
+ * reported count is capped at 3.
+ */
+int compIdenticalLeadingBytesCount_double(unsigned char* preBytes, unsigned char* curBytes)
+{
+	int matched = 0;
+	while(matched < 8 && preBytes[matched] == curBytes[matched])
+		matched++;
+	return (matched > 3) ? 3 : matched;
+}
+
+/*
+ * Counts how many leading bytes of two 4-byte patterns are identical.
+ * The scan stops at the first differing byte (or after all 4), and the
+ * reported count is capped at 3.
+ */
+inline int compIdenticalLeadingBytesCount_float(unsigned char* preBytes, unsigned char* curBytes)
+{
+	int matched = 0;
+	while(matched < 4 && preBytes[matched] == curBytes[matched])
+		matched++;
+	return (matched > 3) ? 3 : matched;
+}
+
+//TODO double-check the correctness...
+inline void addExactData(DynamicByteArray *exactMidByteArray, DynamicIntArray *exactLeadNumArray, 
+		DynamicIntArray *resiBitArray, LossyCompressionElement *lce)
+{
+	/* Appends one element's pieces to the three output arrays, in this order:
+	 * leading-byte count, then its mid bytes (if any), then its residual bits (if any). */
+	unsigned char* midBytes = lce->integerMidBytes;
+	int midCount = lce->integerMidBytes_Length;
+	int idx;
+
+	addDIA_Data(exactLeadNumArray, lce->leadingZeroBytes);
+
+	if(midBytes != NULL)
+		for(idx = 0; idx < midCount; idx++)
+			addDBA_Data(exactMidByteArray, midBytes[idx]);
+
+	if(lce->resMidBitsLength != 0)
+		addDIA_Data(resiBitArray, lce->residualMidBits);
+}
+
+/**
+ * @deprecated
+ * Produces the 1-D prediction coefficients for 1, 2 or 3 layers ({1}, {2,-1}, {3,-3,1})
+ * in a malloc'ed array stored in *coeff_array (the malloc result is not checked). For
+ * dimension 2 or 3, or any other layer count, *coeff_array is left untouched.
+ * @param status set to SZ_SCES, or to SZ_DERR when dimension is not 1, 2 or 3.
+ * @return: the length of the coefficient array (0 when no array was produced).
+ * Fixed: the 3-layer case used to return 0, and SZ_DERR was always overwritten by SZ_SCES.
+ * */
+int getPredictionCoefficients(int layers, int dimension, int **coeff_array, int *status)
+{
+	size_t size = 0;
+	*status = SZ_SCES; /* replaced below only for an unsupported dimension */
+	switch(dimension)
+	{
+		case 1:
+			switch(layers)
+			{
+				case 1:
+					*coeff_array = (int*)malloc(sizeof(int));
+					(*coeff_array)[0] = 1;
+					size = 1;
+					break;
+				case 2:
+					*coeff_array = (int*)malloc(2*sizeof(int));
+					(*coeff_array)[0] = 2;
+					(*coeff_array)[1] = -1;
+					size = 2;
+					break;
+				case 3:
+					*coeff_array = (int*)malloc(3*sizeof(int));
+					(*coeff_array)[0] = 3;
+					(*coeff_array)[1] = -3;
+					(*coeff_array)[2] = 1;
+					size = 3;
+					break;
+			}	
+			break;
+		case 2:
+			switch(layers)
+			{
+				case 1:
+					break;
+				case 2:
+					break;
+				case 3:
+					break;
+			}				
+			break;
+		case 3:
+			switch(layers)
+			{
+				case 1:
+					break;
+				case 2:
+					break;
+				case 3:
+					break;
+			}			
+			break;
+		default:
+			printf("Error: dimension must be no greater than 3 in the current version.\n");
+			*status = SZ_DERR;
+	}
+	return size;
+}
+
+/*
+ * Returns the smallest i >= 1 with i*i > segmentSize, except that segmentSize == 1
+ * yields 1 (the search only runs while i < segmentSize).
+ */
+int computeBlockEdgeSize_2D(int segmentSize)
+{
+	int edge = 1;
+	while(edge < segmentSize && edge*edge <= segmentSize)
+		edge++;
+	return edge;
+}
+
+/*
+ * Returns the smallest i >= 1 with i*i*i > segmentSize, except that segmentSize == 1
+ * yields 1 (the search only runs while i < segmentSize).
+ */
+int computeBlockEdgeSize_3D(int segmentSize)
+{
+	int edge = 1;
+	while(edge < segmentSize && edge*edge*edge <= segmentSize)
+		edge++;
+	return edge;
+}
+
+//convert random-access version based bytes to output bytes
+int initRandomAccessBytes(unsigned char* raBytes)
+{
+	/* Writes versionNumber[0..2], one flag byte, then the output of convertSZParamsToBytes.
+	 * The returned offset only advances past the parameter bytes for SZ_FLOAT / SZ_DOUBLE. */
+	int flags = 0x80; //indicating this is regression-based compression mode
+	int pos;
+
+	for (pos = 0; pos < 3; pos++)
+		raBytes[pos] = versionNumber[pos];
+
+	if(exe_params->SZ_SIZE_TYPE==8)
+		flags |= 0x40; // 01000000, the 6th bit
+	if(confparams_cpr->randomAccess)
+		flags |= 0x02; // 00000010, random access
+	if(confparams_cpr->protectValueRange)
+		flags |= 0x04; //00000100, protect value range
+	raBytes[pos++] = flags;
+	convertSZParamsToBytes(confparams_cpr, &(raBytes[pos]));
+	if(confparams_cpr->dataType==SZ_FLOAT)
+		return pos + MetaDataByteLength;
+	if(confparams_cpr->dataType==SZ_DOUBLE)
+		return pos + MetaDataByteLength_double;
+	return pos;
+}
+
+//The following functions are float-precision version of dealing with the unpredictable data points 
+int generateLossyCoefficients_float(float* oriData, double precision, size_t nbEle, int* reqBytesLength, int* resiBitsLength, float* medianValue, float* decData)
+{
+	/*
+	 * 1. computeRangeSize_float yields the value range and writes *medianValue.
+	 * 2. computeReqLength_float derives reqLength from the precision and the exponent of
+	 *    half the range (it also receives medianValue and may update it).
+	 * 3. Each element minus *medianValue has its low (32 - reqLength) pattern bits cleared.
+	 * Returns reqLength; *reqBytesLength / *resiBitsLength are reqLength/8 and reqLength%8.
+	 */
+	float rangeSize;
+	short halfRangeExpo;
+	int reqLength, clearedBits;
+	size_t idx;
+
+	computeRangeSize_float(oriData, nbEle, &rangeSize, medianValue);
+	halfRangeExpo = getExponent_float(rangeSize/2);
+	computeReqLength_float(precision, halfRangeExpo, &reqLength, medianValue);
+
+	*reqBytesLength = reqLength/8;
+	*resiBitsLength = reqLength%8;
+
+	clearedBits = (reqLength > 32) ? 0 : 32 - reqLength; /* same for every element */
+	for(idx = 0; idx < nbEle; idx++)
+	{
+		lfloat bits;
+		/* truncate relative to the median, then add the median back */
+		bits.value = oriData[idx] - *medianValue;
+		bits.ivalue = (bits.ivalue >> clearedBits) << clearedBits;
+		decData[idx] = bits.value + *medianValue;
+	}
+	return reqLength;
+}
+		
+/**
+ * @param float* oriData: inplace argument (input / output)
+ * 
+ * */		
+int compressExactDataArray_float(float* oriData, double precision, size_t nbEle, unsigned char** leadArray, unsigned char** midArray, unsigned char** resiArray, 
+int reqLength, int reqBytesLength, int resiBitsLength, float medianValue)
+{
+	/*
+	 * Encodes each element with compressSingleFloatValue and appends its leading-byte
+	 * count, mid bytes and residual bits to three dynamic arrays, which are then
+	 * converted into *leadArray, *midArray and *resiArray. oriData[i] is overwritten
+	 * with the value recorded in vce.data. Returns the number of mid bytes collected.
+	 */
+	DynamicIntArray *exactLeadNumArray;
+	DynamicByteArray *exactMidByteArray;
+	DynamicIntArray *resiBitArray;
+	/* Scratch elements live on the stack: the former unchecked malloc() results would
+	 * have been dereferenced as NULL on allocation failure. */
+	FloatValueCompressElement vce;
+	LossyCompressionElement lce;
+	unsigned char preDataBytes[4] = {0,0,0,0}; /* bytes of the previous element */
+	size_t i, midArraySize;
+
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	for(i = 0;i < nbEle;i++)
+	{
+		compressSingleFloatValue(&vce, oriData[i], precision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce.curBytes, preDataBytes, reqBytesLength, resiBitsLength, &lce);
+		memcpy(preDataBytes,vce.curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, &lce);
+		oriData[i] = vce.data;
+	}
+	convertDIAtoInts(exactLeadNumArray, leadArray);
+	convertDBAtoBytes(exactMidByteArray,midArray);
+	convertDIAtoInts(resiBitArray, resiArray);
+	midArraySize = exactMidByteArray->size; /* must be read before the array is freed */
+	free_DIA(exactLeadNumArray);
+	free_DBA(exactMidByteArray);
+	free_DIA(resiBitArray);
+	return midArraySize;
+}
+
+void decompressExactDataArray_float(unsigned char* leadNum, unsigned char* exactMidBytes, unsigned char* residualMidBits, size_t nbEle, int reqLength, float medianValue, float** decData)
+{ // Rebuilds nbEle floats from the lead-count, mid-byte and residual-bit streams; medianValue is added to each result.
+	*decData = (float*)malloc(nbEle*sizeof(float)); // allocated here and not freed here; the malloc result is not checked
+	size_t i = 0, j = 0, k = 0, l = 0, p = 0, curByteIndex = 0; // k: residual bits consumed so far, p: index into residualMidBits, l: index into leadNum, curByteIndex: index into exactMidBytes
+	float exactData = 0;
+	unsigned char preBytes[4] = {0,0,0,0}; // bytes of the previously decoded element (all zero before the first one)
+	unsigned char curBytes[4];
+	int resiBits; 
+	unsigned char leadingNum;		
+	// reqLength is split into whole bytes plus leftover (residual) bits per element
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	
+	for(i = 0; i<nbEle;i++)
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8; // only the bit offset within the current byte is passed to the helpers
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) { // bits are taken from byte p only; p is not advanced
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) { // bits are assembled from byte p and byte p+1
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{ // bits are taken from byte p without shifting, then p advances
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data	
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum); // reuse the first leadingNum bytes of the previous element
+		for (j = leadingNum; j < reqBytesLength; j++) // the remaining whole bytes come from exactMidBytes
+			curBytes[j] = exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte; // NOTE(review): index is past curBytes[3] when reqLength > 32 — confirm callers keep reqLength <= 32
+		}
+
+		exactData = bytesToFloat(curBytes);
+		(*decData)[i] = exactData + medianValue;
+		memcpy(preBytes,curBytes,4);
+	}	
+}
+
+//double-precision version of dealing with unpredictable data points in sz 2.0
+int generateLossyCoefficients_double(double* oriData, double precision, size_t nbEle, int* reqBytesLength, int* resiBitsLength, double* medianValue, double* decData)
+{
+	/*
+	 * computeRangeSize_double yields the value range and writes *medianValue; computeReqLength_double
+	 * derives reqLength from the precision and the exponent of half the range (it also receives
+	 * medianValue and may update it). Each element minus *medianValue then has its low
+	 * (64 - reqLength) pattern bits cleared and is written to decData with the median added back.
+	 * Returns reqLength; *reqBytesLength / *resiBitsLength are reqLength/8 and reqLength%8.
+	 */
+	double rangeSize;
+	short halfRangeExpo;
+	int reqLength, clearedBits;
+	size_t idx;
+
+	computeRangeSize_double(oriData, nbEle, &rangeSize, medianValue);
+	halfRangeExpo = getExponent_double(rangeSize/2);
+	computeReqLength_double(precision, halfRangeExpo, &reqLength, medianValue);
+
+	*reqBytesLength = reqLength/8;
+	*resiBitsLength = reqLength%8;
+
+	clearedBits = (reqLength > 64) ? 0 : 64 - reqLength; /* same for every element */
+	for(idx = 0; idx < nbEle; idx++)
+	{
+		ldouble bits;
+		bits.value = oriData[idx] - *medianValue;
+		bits.lvalue = (bits.lvalue >> clearedBits) << clearedBits;
+		decData[idx] = bits.value + *medianValue;
+	}
+	return reqLength;
+}
+		
+/**
+ * @param double* oriData: inplace argument (input / output)
+ * 
+ * */		
+int compressExactDataArray_double(double* oriData, double precision, size_t nbEle, unsigned char** leadArray, unsigned char** midArray, unsigned char** resiArray, 
+int reqLength, int reqBytesLength, int resiBitsLength, double medianValue)
+{
+	/*
+	 * Encodes each element with compressSingleDoubleValue and appends its leading-byte
+	 * count, mid bytes and residual bits to three dynamic arrays, which are then
+	 * converted into *leadArray, *midArray and *resiArray. oriData[i] is overwritten
+	 * with the value recorded in vce.data. Returns the number of mid bytes collected.
+	 */
+	DynamicIntArray *exactLeadNumArray;
+	DynamicByteArray *exactMidByteArray;
+	DynamicIntArray *resiBitArray;
+	/* Scratch elements live on the stack: the former unchecked malloc() results would
+	 * have been dereferenced as NULL on allocation failure. */
+	DoubleValueCompressElement vce;
+	LossyCompressionElement lce;
+	unsigned char preDataBytes[8] = {0,0,0,0,0,0,0,0}; /* bytes of the previous element */
+	size_t i, midArraySize;
+
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	for(i = 0;i < nbEle;i++)
+	{
+		compressSingleDoubleValue(&vce, oriData[i], precision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce.curBytes, preDataBytes, reqBytesLength, resiBitsLength, &lce);
+		memcpy(preDataBytes,vce.curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, &lce);
+		oriData[i] = vce.data;
+	}
+	convertDIAtoInts(exactLeadNumArray, leadArray);
+	convertDBAtoBytes(exactMidByteArray,midArray);
+	convertDIAtoInts(resiBitArray, resiArray);
+	midArraySize = exactMidByteArray->size; /* must be read before the array is freed */
+	free_DIA(exactLeadNumArray);
+	free_DBA(exactMidByteArray);
+	free_DIA(resiBitArray);
+	return midArraySize;
+}
+
+void decompressExactDataArray_double(unsigned char* leadNum, unsigned char* exactMidBytes, unsigned char* residualMidBits, size_t nbEle, int reqLength, double medianValue, double** decData)
+{ // Rebuilds nbEle doubles from the lead-count, mid-byte and residual-bit streams; medianValue is added to each result.
+	*decData = (double*)malloc(nbEle*sizeof(double)); // allocated here and not freed here; the malloc result is not checked
+	size_t i = 0, j = 0, k = 0, l = 0, p = 0, curByteIndex = 0; // k: residual bits consumed so far, p: index into residualMidBits, l: index into leadNum, curByteIndex: index into exactMidBytes
+	double exactData = 0;
+	unsigned char preBytes[8] = {0,0,0,0,0,0,0,0}; // bytes of the previously decoded element (all zero before the first one)
+	unsigned char curBytes[8];
+	int resiBits; 
+	unsigned char leadingNum;		
+	// reqLength is split into whole bytes plus leftover (residual) bits per element
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	
+	for(i = 0; i<nbEle;i++)
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8; // only the bit offset within the current byte is passed to the helpers
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) { // bits are taken from byte p only; p is not advanced
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) { // bits are assembled from byte p and byte p+1
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{ // bits are taken from byte p without shifting, then p advances
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data	
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum); // reuse the first leadingNum bytes of the previous element
+		for (j = leadingNum; j < reqBytesLength; j++) // the remaining whole bytes come from exactMidBytes
+			curBytes[j] = exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte; // NOTE(review): index is past curBytes[7] when reqLength > 64 — confirm callers keep reqLength <= 64
+		}
+
+		exactData = bytesToDouble(curBytes);
+		(*decData)[i] = exactData + medianValue;
+		memcpy(preBytes,curBytes,8);
+	}
+}
diff --git a/deps/SZ/sz/src/dictionary.c b/deps/SZ/sz/src/dictionary.c
new file mode 100644
index 0000000000000000000000000000000000000000..3f0f5cfa63a862fa515e9e2d21674ad61b7c2f6f
--- /dev/null
+++ b/deps/SZ/sz/src/dictionary.c
@@ -0,0 +1,398 @@
+/*-------------------------------------------------------------------------*/
+/**
+   @file    dictionary.c
+   @author  N. Devillard
+   @brief   Implements a dictionary for string variables.
+
+   This module implements a simple dictionary object, i.e. a list
+   of string/string associations. This object is useful to store e.g.
+   information retrieved from a configuration file (ini files).
+*/
+/*--------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------
+                                Includes
+ ---------------------------------------------------------------------------*/
+#include "dictionary.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/** Maximum value size for integers and doubles. */
+#define MAXVALSZ    1024
+
+/** Minimal allocated number of entries in a dictionary */
+#define DICTMINSZ   128
+
+/** Invalid key token */
+#define DICT_INVALID_KEY    ((char*)-1)
+
+/*---------------------------------------------------------------------------
+                            Private functions
+ ---------------------------------------------------------------------------*/
+
+/*
+ * Doubles the allocated size associated to a pointer.
+ *
+ * 'size' is the current allocated size in bytes. On success the first
+ * 'size' bytes are copied into a zero-filled block of 2*size bytes, the old
+ * block is freed and the new block is returned. On failure NULL is returned
+ * and 'ptr' is left untouched, so the caller still owns it.
+ */
+static void * mem_double(void * ptr, int size)
+{
+    void * newptr ;
+
+    /* A negative size cannot describe an existing allocation */
+    if (size<0) {
+        return NULL ;
+    }
+    /* Let calloc form the product in size_t; 2*size as an int can overflow */
+    newptr = calloc(2, (size_t)size);
+    if (newptr==NULL) {
+        return NULL ;
+    }
+    if (ptr!=NULL) {
+        memcpy(newptr, ptr, (size_t)size);
+    }
+    free(ptr);
+    return newptr ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Copy a string into freshly allocated memory
+  @param    s String to copy, may be NULL
+  @return   Newly allocated copy, to be released with free(); NULL when
+            s is NULL or when the allocation fails
+
+  Local stand-in for strdup(), provided for systems that do not have it.
+ */
+/*--------------------------------------------------------------------------*/
+static char * xstrdup(const char * s)
+{
+    size_t  nbytes ;
+    char *  copy ;
+
+    if (s==NULL) {
+        return NULL ;
+    }
+    /* Byte count including the terminating NUL */
+    nbytes = strlen(s)+1 ;
+    copy = (char*)malloc(nbytes) ;
+    if (copy==NULL) {
+        return NULL ;
+    }
+    memcpy(copy, s, nbytes);
+    return copy ;
+}
+
+/*---------------------------------------------------------------------------
+                            Function codes
+ ---------------------------------------------------------------------------*/
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Compute the hash key for a string.
+  @param    key     Character string to use for key. Must not be NULL.
+  @return   Hash of the string as an unsigned int.
+
+  The value depends on every character of the key, in order. Two different
+  keys can still produce the same hash, which is why the lookup functions
+  in this file compare the stored key string as well as the hash.
+ */
+/*--------------------------------------------------------------------------*/
+unsigned dictionary_hash(const char * key)
+{
+    const char *    p ;
+    unsigned        hash = 0 ;
+
+    /* Walk to the terminator instead of storing strlen() in an int, which
+       would truncate the length of very long keys */
+    for (p=key ; *p!='\0' ; p++) {
+        hash += (unsigned)*p ;
+        hash += (hash<<10);
+        hash ^= (hash>>6) ;
+    }
+    /* Final mixing of the accumulated value */
+    hash += (hash <<3);
+    hash ^= (hash >>11);
+    hash += (hash <<15);
+    return hash ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Create a new dictionary object.
+  @param    size    Optional initial size of the dictionary.
+  @return   Newly allocated dictionary object, or NULL if any allocation
+            fails.
+
+  This function allocates a new dictionary object of given size and returns
+  it. If you do not know in advance (roughly) the number of entries in the
+  dictionary, give size=0. Sizes below DICTMINSZ are raised to DICTMINSZ.
+ */
+/*--------------------------------------------------------------------------*/
+dictionary * dictionary_new(int size)
+{
+    dictionary  *   d ;
+
+    /* If no size was specified, allocate space for DICTMINSZ */
+    if (size<DICTMINSZ) size=DICTMINSZ ;
+
+    d = (dictionary *)calloc(1, sizeof(dictionary));
+    if (d==NULL) {
+        return NULL;
+    }
+    d->size = size ;
+    d->val  = (char **)calloc(size, sizeof(char*));
+    d->key  = (char **)calloc(size, sizeof(char*));
+    d->hash = (unsigned int *)calloc(size, sizeof(unsigned));
+    /* Never hand out a half-built dictionary: every other function in this
+       file indexes these arrays without checking them */
+    if (d->val==NULL || d->key==NULL || d->hash==NULL) {
+        free(d->val);
+        free(d->key);
+        free(d->hash);
+        free(d);
+        return NULL ;
+    }
+    return d ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete a dictionary object
+  @param    d   dictionary object to deallocate, may be NULL.
+  @return   void
+
+  Releases every stored key and value string, then the three internal
+  arrays, then the dictionary object itself. A NULL dictionary is ignored.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_del(dictionary * d)
+{
+    int     slot ;
+
+    if (!d) {
+        return ;
+    }
+    /* free() accepts NULL, so empty slots need no special casing */
+    slot = 0 ;
+    while (slot<d->size) {
+        free(d->key[slot]);
+        free(d->val[slot]);
+        slot++ ;
+    }
+    free(d->hash);
+    free(d->key);
+    free(d->val);
+    free(d);
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get a value from a dictionary.
+  @param    d       dictionary object to search.
+  @param    key     Key to look for in the dictionary.
+  @param    def     Default value to return if key not found.
+  @return   Pointer to the internally stored value string, or 'def'.
+
+  This function locates a key in a dictionary and returns a pointer to its
+  value, or the passed 'def' pointer if no such key can be found in
+  dictionary. 'def' is also returned when 'd' or 'key' is NULL. The returned
+  character pointer points to data internal to the dictionary object, you
+  should not try to free it or modify it. A key that was stored with a NULL
+  value yields NULL, not 'def'.
+ */
+/*--------------------------------------------------------------------------*/
+char * dictionary_get(dictionary * d, const char * key, char * def)
+{
+    unsigned    hash ;
+    int         i ;
+
+    /* Same argument check as dictionary_set(): nothing can be found */
+    if (d==NULL || key==NULL) {
+        return def ;
+    }
+
+    hash = dictionary_hash(key);
+    for (i=0 ; i<d->size ; i++) {
+        if (d->key[i]==NULL)
+            continue ;
+        /* Compare hash first, then the string to rule out hash collisions */
+        if (hash==d->hash[i] && !strcmp(key, d->key[i])) {
+            return d->val[i] ;
+        }
+    }
+    return def ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Set a value in a dictionary.
+  @param    d       dictionary object to modify.
+  @param    key     Key to modify or add.
+  @param    val     Value to add, may be NULL.
+  @return   int     0 if Ok, -1 otherwise
+
+  If the given key is found in the dictionary, the associated value is
+  replaced by a copy of the provided one. If the key cannot be found in the
+  dictionary, a copy of it is added together with a copy of the value.
+
+  It is Ok to provide a NULL value for val, but NULL values for the dictionary
+  or the key are considered as errors: the function will return immediately
+  in such a case.
+
+  Notice that if you dictionary_set a variable to NULL, the key stays in the
+  dictionary and a call to dictionary_get will return NULL for it rather
+  than the caller's default.
+
+  -1 is also returned when memory cannot be allocated. In that case the
+  entries already stored in the dictionary are left unchanged.
+ */
+/*--------------------------------------------------------------------------*/
+int dictionary_set(dictionary * d, const char * key, const char * val)
+{
+    int         i ;
+    unsigned    hash ;
+    char    *   keycopy ;
+    char    *   valcopy = NULL ;
+    void    *   grown ;
+
+    if (d==NULL || key==NULL) return -1 ;
+
+    /* Duplicate the value first so that an allocation failure is detected
+       before anything in the dictionary has been modified */
+    if (val!=NULL) {
+        valcopy = xstrdup(val);
+        if (valcopy==NULL) return -1 ;
+    }
+
+    /* Compute hash for this key */
+    hash = dictionary_hash(key) ;
+    /* Find if value is already in dictionary */
+    if (d->n>0) {
+        for (i=0 ; i<d->size ; i++) {
+            if (d->key[i]==NULL)
+                continue ;
+            /* Same hash value and same key */
+            if (hash==d->hash[i] && !strcmp(key, d->key[i])) {
+                /* Found a value: swap in the new copy and return */
+                free(d->val[i]);
+                d->val[i] = valcopy ;
+                return 0 ;
+            }
+        }
+    }
+
+    /* Add a new value */
+    keycopy = xstrdup(key);
+    if (keycopy==NULL) {
+        free(valcopy);
+        return -1 ;
+    }
+
+    /* See if dictionary needs to grow */
+    if (d->n==d->size) {
+        /* Reached maximum size: reallocate dictionary. Each array is
+           replaced only once its larger copy exists, because mem_double()
+           leaves the old block alive when it fails. d->size is updated only
+           after all three arrays have grown. */
+        grown = mem_double(d->val, d->size * sizeof(char*)) ;
+        if (grown==NULL) {
+            free(keycopy);
+            free(valcopy);
+            return -1 ;
+        }
+        d->val = (char **)grown ;
+
+        grown = mem_double(d->key, d->size * sizeof(char*)) ;
+        if (grown==NULL) {
+            free(keycopy);
+            free(valcopy);
+            return -1 ;
+        }
+        d->key = (char **)grown ;
+
+        grown = mem_double(d->hash, d->size * sizeof(unsigned)) ;
+        if (grown==NULL) {
+            free(keycopy);
+            free(valcopy);
+            return -1 ;
+        }
+        d->hash = (unsigned int *)grown ;
+
+        /* Double size */
+        d->size *= 2 ;
+    }
+
+    /* Insert key in the first empty slot. Start at d->n and wrap at
+       d->size. Because d->n < d->size this will necessarily
+       terminate. */
+    for (i=d->n ; d->key[i] ; ) {
+        if(++i == d->size) i = 0;
+    }
+    /* Store the copies made above */
+    d->key[i]  = keycopy ;
+    d->val[i]  = valcopy ;
+    d->hash[i] = hash;
+    d->n ++ ;
+    return 0 ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete a key in a dictionary
+  @param    d       dictionary object to modify.
+  @param    key     Key to remove.
+  @return   void
+
+  This function deletes a key in a dictionary, releasing the stored key and
+  value strings. Nothing is done if the key cannot be found, or if 'd' or
+  'key' is NULL.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_unset(dictionary * d, const char * key)
+{
+    unsigned    hash ;
+    int         i ;
+
+    /* Same argument check as dictionary_set() */
+    if (d==NULL || key==NULL) {
+        return;
+    }
+
+    hash = dictionary_hash(key);
+    for (i=0 ; i<d->size ; i++) {
+        if (d->key[i]==NULL)
+            continue ;
+        /* Compare hash first, then the string to rule out hash collisions */
+        if (hash==d->hash[i] && !strcmp(key, d->key[i])) {
+            /* Found key */
+            break ;
+        }
+    }
+    if (i>=d->size)
+        /* Key not found */
+        return ;
+
+    /* Clear the slot so that dictionary_set() can reuse it */
+    free(d->key[i]);
+    d->key[i] = NULL ;
+    free(d->val[i]);
+    d->val[i] = NULL ;
+    d->hash[i] = 0 ;
+    d->n -- ;
+    return ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dump a dictionary to an opened file pointer.
+  @param    d   Dictionary to dump
+  @param    out Opened file pointer.
+  @return   void
+
+  Writes one line per stored key to 'out': the key right-aligned in a
+  20-character column, a tab, then the value in square brackets. Keys whose
+  value is NULL are shown with the text UNDEF. A dictionary without entries
+  produces the single line "empty dictionary". Nothing is written when
+  either argument is NULL.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_dump(dictionary * d, FILE * out)
+{
+    int             slot ;
+    const char  *   shown ;
+
+    if (!d || !out) {
+        return ;
+    }
+    if (d->n<1) {
+        fprintf(out, "empty dictionary\n");
+        return ;
+    }
+    for (slot=0 ; slot<d->size ; slot++) {
+        /* Skip unused slots */
+        if (d->key[slot]==NULL) {
+            continue ;
+        }
+        shown = d->val[slot] ;
+        if (shown==NULL) {
+            shown = "UNDEF" ;
+        }
+        fprintf(out, "%20s\t[%s]\n", d->key[slot], shown);
+    }
+}
+
+
+/* Self-test, compiled only when TESTDIC is defined: stores NVALS keys,
+   reads each one back, removes them all and checks the dictionary is
+   empty again. */
+#ifdef TESTDIC
+#define NVALS 20000
+int main(int argc, char *argv[])
+{
+    dictionary  *   dict ;
+    char    *   found ;
+    char        keybuf[90] ;
+    int         n ;
+
+    /* Allocate dictionary */
+    printf("allocating...\n");
+    dict = dictionary_new(0);
+
+    /* Store every key with the same value */
+    printf("setting %d values...\n", NVALS);
+    n = 0 ;
+    while (n<NVALS) {
+        sprintf(keybuf, "%04d", n);
+        dictionary_set(dict, keybuf, "salut");
+        n++ ;
+    }
+
+    /* Each key must now be retrievable */
+    printf("getting %d values...\n", NVALS);
+    n = 0 ;
+    while (n<NVALS) {
+        sprintf(keybuf, "%04d", n);
+        found = dictionary_get(dict, keybuf, DICT_INVALID_KEY);
+        if (found==DICT_INVALID_KEY) {
+            printf("cannot get value for key [%s]\n", keybuf);
+        }
+        n++ ;
+    }
+
+    /* Remove every key again */
+    printf("unsetting %d values...\n", NVALS);
+    n = 0 ;
+    while (n<NVALS) {
+        sprintf(keybuf, "%04d", n);
+        dictionary_unset(dict, keybuf);
+        n++ ;
+    }
+    if (dict->n != 0) {
+        printf("error deleting values\n");
+    }
+
+    printf("deallocating...\n");
+    dictionary_del(dict);
+    return 0 ;
+}
+#endif
+/* vim: set ts=4 et sw=4 tw=75 */
diff --git a/deps/SZ/sz/src/exafelSZ.c b/deps/SZ/sz/src/exafelSZ.c
new file mode 100644
index 0000000000000000000000000000000000000000..cc4e52bf508fe1eddf06ea827f9fa59ae38e27e0
--- /dev/null
+++ b/deps/SZ/sz/src/exafelSZ.c
@@ -0,0 +1,597 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "sz.h"
+
+/*
+ * Fills in the derived fields of pr from its user-set fields and the panel
+ * dimensions: peakRadius, binnedRows and binnedCols. 'panels' is not used.
+ */
+void exafelSZ_params_process(exafelSZ_params*pr, size_t panels, size_t rows, size_t cols){
+  /* Half-width of the square around a peak, centre pixel excluded */
+  pr->peakRadius=(pr->peakSize-1)/2;
+
+  /* Rounded-up division: a partly filled last bin still counts as a bin */
+  pr->binnedCols=(cols+pr->binSize-1)/pr->binSize;
+  pr->binnedRows=(rows+pr->binSize-1)/pr->binSize;
+}
+
+/*
+ * Validates the parameters needed for decompression. Each failed check
+ * prints a diagnostic and then trips assert(0); when asserts are compiled
+ * out the function simply continues with the next check.
+ */
+void exafelSZ_params_checkDecomp(exafelSZ_params*pr, size_t panels, size_t rows, size_t cols){
+  /* The calibration panel must be supplied by the caller */
+  if(!pr->calibPanel){
+    printf("ERROR: calibPanel is NULL : calibPanel=%ld\n",(long)pr->calibPanel);
+    assert(0);
+  }
+
+  /* Scalar settings: bin size, tolerance, and SZ dimensionality 1..3 */
+  int badBinSize=(pr->binSize<1);
+  int badTolerance=(pr->tolerance<0);
+  int badSzDim=(pr->szDim<1 || pr->szDim>3);
+  if(badBinSize || badTolerance || badSzDim){
+    printf("ERROR: Something wrong with the following:\n");
+    printf("binSize=%d\n",(int)pr->binSize);
+    printf("tolerance=%d\n",(int)pr->tolerance);
+    printf("szDim=%d\n",(int)pr->szDim);
+    assert(0);
+  }
+
+  /* peakRadius is derived as (peakSize-1)/2, so peakSize has to be odd */
+  if(pr->peakSize%2==0){
+    printf("ERROR: peakSize = %d cannot be even. It must be odd!\n",(int)pr->peakSize);
+    assert(0);
+  }  
+
+  /* Every dimension must be non-zero */
+  if(!panels || !rows || !cols){
+    printf("ERROR: Something wrong with the following:\n");
+    printf("panels=%d\n",(int)panels);
+    printf("rows=%d\n",(int)rows);
+    printf("cols=%d\n",(int)cols);
+    assert(0);
+  }
+}
+
+/*
+ * Validates the parameters needed for compression: the three peak
+ * coordinate arrays must be present, followed by every check done by
+ * exafelSZ_params_checkDecomp().
+ */
+void exafelSZ_params_checkComp(exafelSZ_params*pr, size_t panels, size_t rows, size_t cols){
+  int havePeakArrays=(pr->peaksSegs!=NULL && pr->peaksRows!=NULL && pr->peaksCols!=NULL);
+
+  if(!havePeakArrays){
+    printf("ERROR: One or more of the following are NULL : peaksSegs , peaksRows , peaksCols\n");
+    assert(0);
+  }
+  exafelSZ_params_checkDecomp(pr, panels, rows, cols);
+}
+
+/*
+ * Prints the configuration held in pr to stdout: first the user-set
+ * fields, then the values derived by exafelSZ_params_process().
+ */
+void exafelSZ_params_print(exafelSZ_params*pr){
+  /* User-set fields */
+  printf("Configuration (exafelSZ_params) :\n");
+  printf("binSize: %d\n",pr->binSize);
+  printf("tolerance:%e\n",pr->tolerance);
+  printf("szDim:%d\n",pr->szDim);
+  printf("peakSize:%d\n",pr->peakSize);
+  printf("\n");
+
+  /* Derived fields */
+  printf("CALCULATED VARIABLES\n");
+  printf("binnedRows:%ld\n",pr->binnedRows);
+  printf("binnedCols:%ld\n",pr->binnedCols);
+  printf("peakRadius:%d\n",pr->peakRadius);
+  printf("\n");
+}
+
+//*********************************************************************************
+//*********************************************************************************
+//*********************************************************************************
+
+//Index Calculator
+/*
+ * Flat offset of element (i3,i2,i1,i0) in a row-major 4-D array whose three
+ * innermost extents are size2, size1 and size0. The arithmetic is carried
+ * out in size_t so that it cannot overflow int on large arrays.
+ */
+static inline size_t calcIdx_4D(int i3, int i2, int i1, int i0, int size2, int size1, int size0){ 
+  return (size_t)i0+(size_t)size0*((size_t)i1+(size_t)size1*((size_t)i2+(size_t)size2*(size_t)i3));
+}
+/*
+ * Flat offset of element (i2,i1,i0) in a row-major 3-D array whose two
+ * innermost extents are size1 and size0. The arithmetic is carried out in
+ * size_t so that it cannot overflow int on large arrays.
+ */
+static inline size_t calcIdx_3D(int i2, int i1, int i0, int size1, int size0){ 
+  return (size_t)i0+(size_t)size0*((size_t)i1+(size_t)size1*(size_t)i2);
+}
+/*
+ * Flat offset of element (i1,i0) in a row-major 2-D array with size0
+ * columns. The arithmetic is carried out in size_t so that it cannot
+ * overflow int on large arrays.
+ */
+static inline size_t calcIdx_2D(int i1, int i0, int size0){ 
+  return (size_t)i0+(size_t)size0*(size_t)i1;
+}
+
+/*
+ * Compresses a batch of float images and returns a newly malloc'ed buffer.
+ *
+ * _pr            - exafelSZ_params*; its derived fields are filled in here
+ *                  by exafelSZ_params_process()
+ * _origData      - float array indexed as [event][panel][row][col]
+ * r4,r3,r2,r1    - number of events (0 is treated as 1), columns, rows and
+ *                  panels, in that order
+ * compressedSize - out: size in bytes of the returned buffer; set to 0 when
+ *                  NULL is returned
+ *
+ * Returns the compressed buffer, or NULL if memory could not be allocated.
+ *
+ * Buffer layout, in write order:
+ *   nPeaksTotal                       uint64_t
+ *   per event: numPeaks               uint64_t
+ *              per peak: panel,row,col   3 x uint16_t
+ *   roiSavedCount                     uint64_t
+ *   ROI pixels                        roiSavedCount x float
+ *   szCompressedSize                  uint64_t
+ *   SZ output for the binned data     szCompressedSize bytes
+ * The calibration panel is not stored in the buffer.
+ */
+unsigned char * exafelSZ_Compress(void* _pr,
+                       void* _origData,
+                       size_t r4, size_t r3, size_t r2, size_t r1,
+                       size_t *compressedSize)
+{
+  size_t nEvents,panels,rows,cols;
+  if(r4==0)
+    nEvents=1;
+  else
+    nEvents=r4;
+  panels=r1;
+  rows=r2;
+  cols=r3;
+
+  float *origData=(float*)_origData;
+  exafelSZ_params *pr=(exafelSZ_params*)_pr;  
+
+  exafelSZ_params_process(pr, panels, rows, cols);
+  exafelSZ_params_checkComp(pr, panels, rows, cols); 
+
+  uint8_t *roiM=(uint8_t*)malloc(nEvents*panels*rows*cols) ;
+  float *roiData=(float*)malloc(nEvents*panels*rows*cols*sizeof(float)) ;
+  float *binnedData=(float*)malloc(nEvents*panels*pr->binnedRows*pr->binnedCols*sizeof(float)) ;
+  if(roiM==NULL || roiData==NULL || binnedData==NULL){
+    //All three buffers are written unconditionally below, so stop here.
+    printf("ERROR: exafelSZ_Compress: out of memory\n");
+    free(roiM);
+    free(roiData);
+    free(binnedData);
+    (*compressedSize)=0;
+    return NULL;
+  }
+  
+  size_t e,p,r,c,pk,ri,ci,br,bc,roii,bi;
+
+  //Generate the ROI mask: NOTE: 0 means affirmative in ROI mask! This comes from the python scripts!
+  //First, initialize with calibration panel:
+  for(e=0;e<nEvents;e++){ //Event
+    for(p=0;p<panels;p++){ //Panel
+      for(r=0;r<rows;r++){ //Row
+        for(c=0;c<cols;c++){ //Column
+          roiM[calcIdx_4D(e,p,r,c,panels,rows,cols)]=pr->calibPanel[calcIdx_3D(p,r,c,rows,cols)];  //calibPanel is as big as the event(image) itself
+        }
+      }
+    }
+  }
+
+  //Now process the peaks and generate the mask.
+  //The same peak list (pr->numPeaks entries) is applied to every event.
+  uint64_t nPeaksTotal=0;  //Total number of peaks
+  for(e=0;e<nEvents;e++){ //Event
+    nPeaksTotal+=pr->numPeaks;
+    for(pk=0;pk<pr->numPeaks;pk++){
+      uint16_t p_=pr->peaksSegs[pk];
+      uint16_t r_=pr->peaksRows[pk];
+      uint16_t c_=pr->peaksCols[pk];
+
+      if(p_>=panels){
+        printf("ERROR: Peak coordinate out of bounds: Panel=%d, Valid range: 0,%d\n",(int)p_,(int)panels-1);
+        assert(0);
+        printf("Skipping this peak...\n");
+        continue;
+      }
+      if(r_>=rows){
+        printf("ERROR: Peak coordinate out of bounds: Row=%d, Valid range: 0,%d\n",(int)r_,(int)rows-1);
+        assert(0);
+        printf("Skipping this peak...\n");
+        continue;
+      }
+      if(c_>=cols){
+        printf("ERROR: Peak coordinate out of bounds: Col=%d, Valid range: 0,%d\n",(int)c_,(int)cols-1);
+        assert(0);
+        printf("Skipping this peak...\n");
+        continue;
+      }
+      
+      //NOTE(review): ri and ci are unsigned, so their start value cannot be
+      //negative when r_ or c_ is smaller than peakRadius - confirm how that
+      //case behaves. Left as is on purpose: exafelSZ_Decompress has to
+      //rebuild exactly the same mask.
+      for(ri=r_-pr->peakRadius;ri<=r_+pr->peakRadius;ri++){  //ri: row index. Just a temporary variable.
+        for(ci=c_-pr->peakRadius;ci<=c_+pr->peakRadius;ci++){  //ci: column index. Just a temporary variable.
+          if(ri<rows && ci<cols){  //Check whether inside the bounds or not
+            roiM[calcIdx_4D(e,p_,ri,ci,panels,rows,cols)]=0;
+          }
+        }
+      }
+    }
+  }
+  
+  //Save ROI: copy every pixel whose mask entry is 0, in index order.
+  uint64_t roiSavedCount=0;
+  for(e=0;e<nEvents;e++){ //Event
+    for(p=0;p<panels;p++){ //Panel
+      for(r=0;r<rows;r++){ //Row
+        for(c=0;c<cols;c++){ //Column
+          if(!roiM[calcIdx_4D(e,p,r,c,panels,rows,cols)]){
+            roiData[roiSavedCount]=origData[calcIdx_4D(e,p,r,c,panels,rows,cols)];
+            roiSavedCount++;
+          }
+        }
+      }
+    }
+  }
+  
+  //Binning: each output pixel is the mean of the binSize x binSize input
+  //pixels it covers; pixels past the panel edge are left out of the mean.
+  for(e=0;e<nEvents;e++){ //Event
+    for(p=0;p<panels;p++){  //Panel
+      for(r=0;r<pr->binnedRows;r++){ //Row of the binnedData
+        for(c=0;c<pr->binnedCols;c++){ //Column of the binnedData
+          float sum=0;
+          int nPts=0;
+          for(br=0;br<pr->binSize;br++) //Bin Row (from origData)
+            for(bc=0;bc<pr->binSize;bc++) //Bin Column (from origData)
+              if(r*pr->binSize+br<rows && c*pr->binSize+bc<cols){
+                sum+=origData[calcIdx_4D(e,p,r*pr->binSize+br,c*pr->binSize+bc,panels,rows,cols)];
+                nPts++;
+              }
+          binnedData[calcIdx_4D(e,p,r,c,panels,pr->binnedRows,pr->binnedCols)]=sum/nPts;
+        }
+      }
+    }
+  }
+
+  //Additional compression using SZ:    
+  size_t szCompressedSize=0;
+  //Start from NULL so that the default branch below never leaves an
+  //indeterminate pointer for the copy loop and free() when asserts are off.
+  unsigned char* szComp=NULL;
+   
+  switch(pr->szDim){
+    case 1: //1D
+      szComp=SZ_compress_args(SZ_FLOAT, binnedData, &szCompressedSize, ABS, pr->tolerance, 0, 0, 0, 0,0,0, nEvents * panels * pr->binnedRows * pr->binnedCols);
+      break;
+    case 2: //2D
+      szComp=SZ_compress_args(SZ_FLOAT, binnedData, &szCompressedSize, ABS, pr->tolerance, 0, 0, 0, 0,0, nEvents * panels * pr->binnedRows, pr->binnedCols);
+      break;
+    case 3: //3D
+      szComp=SZ_compress_args(SZ_FLOAT, binnedData, &szCompressedSize, ABS, pr->tolerance, 0, 0, 0, 0, nEvents * panels, pr->binnedRows, pr->binnedCols);
+      break;
+    default:
+      printf("ERROR: Wrong szDim : %d It must be 1,2 or 3.\n",(int)pr->szDim);
+      assert(0);
+  }
+  
+  //Total size: the sum of the fields listed in the layout at the top.
+  (*compressedSize)=8+nEvents*8+nPeaksTotal*(2+2+2)+8+roiSavedCount*4+8+szCompressedSize;
+  uint8_t * compressedBuffer=(uint8_t*)malloc(*compressedSize);
+  if(compressedBuffer==NULL){
+    printf("ERROR: exafelSZ_Compress: out of memory\n");
+    free(szComp);
+    free(roiM);
+    free(roiData);
+    free(binnedData);
+    (*compressedSize)=0;
+    return NULL;
+  }
+  uint64_t bytePos;
+  
+  //NOTE(review): the stores below go through casted pointers at byte
+  //offsets that need not be aligned for the stored type - confirm that
+  //every target platform accepts this.
+  bytePos=0;
+  *(uint64_t*)(&compressedBuffer[bytePos])=nPeaksTotal;
+  bytePos+=8;
+  
+  for(e=0;e<nEvents;e++){
+    *(uint64_t*)(&compressedBuffer[bytePos])=pr->numPeaks;
+    bytePos+=8;
+    for(pk=0;pk<pr->numPeaks;pk++){
+      *(uint16_t*)(&compressedBuffer[bytePos])=pr->peaksSegs[pk]; //Panel for the current peak
+      bytePos+=2;
+      *(uint16_t*)(&compressedBuffer[bytePos])=pr->peaksRows[pk]; //Row for the current peak
+      bytePos+=2;
+      *(uint16_t*)(&compressedBuffer[bytePos])=pr->peaksCols[pk]; //Column for the current peak
+      bytePos+=2;
+    }
+  }
+
+  *(uint64_t*)(&compressedBuffer[bytePos])=roiSavedCount;
+  bytePos+=8;
+  for(roii=0;roii<roiSavedCount;roii++){
+    *(float*)(&compressedBuffer[bytePos])=roiData[roii];
+    bytePos+=4;
+  }
+
+  *(uint64_t*)(&compressedBuffer[bytePos])=szCompressedSize;
+  bytePos+=8;
+  for(bi=0;bi<szCompressedSize;bi++){  //bi for "byte index"
+    *(unsigned char*)(&compressedBuffer[bytePos])=szComp[bi];
+    bytePos+=1;
+  }
+  
+  //Sanity check: the number of bytes written must equal the computed size.
+  if(bytePos!=(*compressedSize)){
+    //Print the size itself; the pointer compressedSize was printed before.
+    printf("ERROR: bytePos = %ld != %ld = compressedSize\n",(long)bytePos,(long)(*compressedSize));
+    assert(0);
+  }
+  
+  free(szComp);
+  free(roiM);
+  free(roiData);
+  free(binnedData);
+  
+  return compressedBuffer;
+}
+
+void* exafelSZ_Decompress(void *_pr,
+                         unsigned char*_compressedBuffer,
+                         size_t r4, size_t r3, size_t r2, size_t r1,
+                         size_t compressedSize)
+{ 
+  size_t nEvents,panels,rows,cols;
+  if(r4==0)
+    nEvents=1;
+  else
+    nEvents=r4;
+  panels=r1;
+  rows=r2;
+  cols=r3;
+  //printf("AMG : exafelSZ_Decompress : nEvents,panels,rows,cols = %d , %d , %d , %d\n",nEvents,panels,rows,cols);
+
+  //printf("DECOMPRESS\n");return NULL;
+  uint8_t *compressedBuffer=(uint8_t *)_compressedBuffer;
+  exafelSZ_params *pr=(exafelSZ_params *)_pr;
+  exafelSZ_params_process(pr, panels, rows, cols); 
+  exafelSZ_params_checkDecomp(pr, panels, rows, cols); 
+  
+  float *decompressedBuffer=(float*)malloc(nEvents*panels*rows*cols*sizeof(float));
+  
+  uint8_t *roiM=(uint8_t*)malloc(nEvents*panels*rows*cols);
+  size_t e,p,r,c,pk,ri,ci,br,bc;
+  
+  /*
+  Compressed Data Layout:
+  nPeaksTotal : 8 bytes : (1 x uint64_t)
+  peaks : (8 x nEvents + nPeaksTotal x 3 x 2) bytes : (nEvents x (nPeaks + nPeaks x 3 x uint16_t))
+  roiSavedCount : 8 Bytes : (1 x uint64_t)
+  ROI_data : roiSavedCount x 4 : roiSavedCount x float 
+  szCompressedSize : 8 : uint64_t
+  szComp : szComp x 1 : szComp x (unsigned char)
+  */
+  uint64_t bytePos=0;
+  uint64_t nPeaksTotal=*(uint64_t*)(&compressedBuffer[bytePos]);
+  bytePos += 8; 
+  // cout<<endl;
+  // cout<<"DECOMPRESS:"<<endl;
+  // cout<<"nPeaksTotal="<<nPeaksTotal<<endl;
+  // cout<<"bytePos="<<bytePos<<endl;
+  //printf("\nDECOMPRESS:\n");
+  //printf("nPeaksTotal=%d\n",nPeaksTotal);
+  //printf("bytePos=%d\n",bytePos);
+  
+  uint8_t *peaks=(uint8_t*)(&compressedBuffer[bytePos]);
+  bytePos += (8 * nEvents + nPeaksTotal * 3 * 2);
+  // cout<<"peaks"<<endl;
+  // cout<<"bytePos="<<bytePos<<endl;
+  //printf("peaks\n");
+  //printf("bytePos=%d\n",bytePos);
+  
+  uint64_t roiSavedCount=*(uint64_t*)(&compressedBuffer[bytePos]);
+  bytePos+=8;
+  // cout<<"roiSavedCount="<<roiSavedCount<<endl;
+  // cout<<"bytePos="<<bytePos<<endl;
+  //printf("roiSavedCount=%d\n",roiSavedCount);
+  //printf("bytePos=%d\n",bytePos);
+  
+  // cout<<"roiData"<<endl;
+  float *roiData=(float*)(&compressedBuffer[bytePos]);
+  bytePos+=(roiSavedCount*4);
+  // for(uint64_t roii=0;roii<roiSavedCount;roii++){
+    // cout<<roiData[roii]<<",";
+  // }
+  // cout<<"bytePos="<<bytePos<<endl;
+  //printf("bytePos=%d\n",bytePos);
+  
+  uint64_t szCompressedSize=*(uint64_t*)(&compressedBuffer[bytePos]);
+  bytePos+=8;
+  // cout<<"szCompressedSize="<<szCompressedSize<<endl;
+  // cout<<"bytePos="<<bytePos<<endl;
+  //printf("szCompressedSize=%d\n",szCompressedSize);
+  //printf("bytePos=%d\n",bytePos);
+  
+  unsigned char *szComp=(unsigned char*)(&compressedBuffer[bytePos]);
+  bytePos+=szCompressedSize;
+  // cout<<"szComp"<<endl;
+  // cout<<"bytePos="<<bytePos<<endl;
+  // cout<<endl;
+  //printf("szComp\n");
+  //printf("bytePos=%d\n\n",bytePos);
+  
+  //We should have inputs ready by now. Now process them:
+  
+  //Generate the ROI mask: NOTE: 0 means affirmative in ROI mask! This comes from the python scripts!
+  //First, initialize with calibration panel:
+  for(e=0;e<nEvents;e++){ //Event
+    for(p=0;p<panels;p++){ //Panel
+      for(r=0;r<rows;r++){ //Row
+        for(c=0;c<cols;c++){ //Column
+          if(calcIdx_2D(r,c,cols)<0 ||calcIdx_2D(r,c,cols)>=rows*cols){
+            printf("ERROR: calcIdx_2D(r,c,cols) = calcIdx_2D(%d,%d,%d) = %d",(int)r,(int)c,(int)cols,(int)calcIdx_2D(r,c,cols));
+            printf("       is NOT in the correct range: [0,%ld]",(int)rows*cols-1);
+            assert(0);
+          }
+          if(calcIdx_4D(e,p,r,c,panels,rows,cols)<0 ||calcIdx_4D(e,p,r,c,panels,rows,cols)>=nEvents*panels*rows*cols){
+            printf("ERROR: calcIdx_4D(e,p,r,c,panels,rows,cols) = calcIdx_4D(%d,%d,%d,%d,%d,%d,%d) = %d",(int)e,(int)p,(int)r,(int)c,(int)panels,(int)rows,(int)cols,(int)calcIdx_4D(e,p,r,c,panels,rows,cols));
+            assert(0);
+          }
+          //roiM[calcIdx_4D(e,p,r,c,panels,rows,cols)]=pr->calibPanel[calcIdx_2D(r,c,cols)]; //calibPanel is a single segment copied over all the event(image)
+          roiM[calcIdx_4D(e,p,r,c,panels,rows,cols)]=pr->calibPanel[calcIdx_3D(p,r,c,rows,cols)];  //calibPanel is as big as the event(image) itself
+        }
+      }
+    }
+  }
+  uint64_t peaksBytePos=0; //Position in the peaks buffer
+  //Now process the peaks and generate the mask:
+  for(e=0;e<nEvents;e++){ //Event
+    uint64_t nPeaks=*(uint64_t*)(&peaks[peaksBytePos]);
+    peaksBytePos+=8;
+    
+    for(pk=0;pk<nPeaks;pk++){
+      uint16_t p_=*(uint16_t*)(&peaks[peaksBytePos]); //Panel for the current peak
+      peaksBytePos+=2;
+      uint16_t r_=*(uint16_t*)(&peaks[peaksBytePos]); //Row for the current peak
+      peaksBytePos+=2;
+      uint16_t c_=*(uint16_t*)(&peaks[peaksBytePos]); //Col for the current peak
+      peaksBytePos+=2;
+      
+      if(p_>=panels){
+        printf("ERROR: Peak coordinate out of bounds: Panel=%d, Valid range: 0,%d\n",(int)p_,(int)panels-1);
+        assert(0);
+        printf("Skipping this peak...\n");
+        continue;
+      }
+      if(r_>=rows){
+        printf("ERROR: Peak coordinate out of bounds: Row=%d, Valid range: 0,%d\n",(int)r_,(int)rows-1);
+        assert(0);
+        printf("Skipping this peak...\n");
+        continue;
+      }
+      if(c_>=cols){
+        printf("ERROR: Peak coordinate out of bounds: Col=%d, Valid range: 0,%d\n",(int)c_,(int)cols-1);
+        assert(0);
+        printf("Skipping this peak...\n");
+        continue;
+      }
+      
+      for(ri=r_-pr->peakRadius;ri<=r_+pr->peakRadius;ri++){  //ri: row index. Just a temporary variable.
+        for(ci=c_-pr->peakRadius;ci<=c_+pr->peakRadius;ci++){  //ci: column index. Just a temporary variable.
+          if(ri>=0 && ri<rows && ci>=0 && ci<cols){  //Check whether inside bounds or not
+            roiM[calcIdx_4D(e,p_,ri,ci,panels,rows,cols)]=0;
+          }
+        }
+      }
+    }
+  }
+  
+  //De-compress using SZ:
+  float* szDecomp;
+  size_t _szCompressedSize=szCompressedSize;
+  switch(pr->szDim){
+    case 1:
+      szDecomp=SZ_decompress(SZ_FLOAT,szComp,_szCompressedSize,0,0,0,0, nEvents * panels * pr->binnedRows * pr->binnedCols);
+      break;
+    case 2:
+      szDecomp=SZ_decompress(SZ_FLOAT,szComp,_szCompressedSize,0,0,0, nEvents * panels * pr->binnedRows, pr->binnedCols);
+      break;
+    case 3:
+      szDecomp=SZ_decompress(SZ_FLOAT,szComp,_szCompressedSize,0,0,nEvents * panels, pr->binnedRows, pr->binnedCols);
+      break;
+    default:
+      printf("ERROR: Wrong szDim : %d It must be 1,2 or 3.\n",(int)pr->szDim);
+      assert(0);
+  }
+  //szDecomp=(void*)malloc(nEvents*panels*rows*cols*sizeof(float));
+  
+  // double max_err = 0;
+  // for(int i=0; i<nEvents * panels * pr->binnedRows * pr->binnedCols; i++){
+    // double err = fabs(szDecomp[i]-binnedData[i]);
+    // if(err > max_err) max_err = err;
+  // }
+  // cout << "Max err = \t\t\t" << max_err << endl;
+  
+
+  //De-binning:
+  for(e=0;e<nEvents;e++)//Event
+    for(p=0;p<panels;p++)  //Panel
+      for(r=0;r<pr->binnedRows;r++) //Row of the binnedData
+        for(c=0;c<pr->binnedCols;c++) //Column of the binnedData
+            for(br=0;br<pr->binSize;br++) //Bin Row (from origData)
+              for(bc=0;bc<pr->binSize;bc++) //Bin Column (from origData)
+                if(r*pr->binSize+br<rows && c*pr->binSize+bc<cols){
+                  decompressedBuffer[calcIdx_4D(e,p,r*pr->binSize+br,c*pr->binSize+bc,panels,rows,cols)] = szDecomp[calcIdx_4D(e,p,r,c,panels,pr->binnedRows,pr->binnedCols)];
+                }
+  //Restore ROI:
+  uint64_t current=0;
+  for(e=0;e<nEvents;e++)//Event
+    for(p=0;p<panels;p++)  //Panel
+      for(r=0;r<rows;r++) //Row of the binnedData
+        for(c=0;c<cols;c++) //Column of the binnedData
+          if(!roiM[calcIdx_4D(e,p,r,c,panels,rows,cols)]){
+            decompressedBuffer[calcIdx_4D(e,p,r,c,panels,rows,cols)]=roiData[current];
+            current++;
+          }
+  // delete [] roiM;
+  free(roiM);
+  free(szDecomp);
+  
+  return ((void*)decompressedBuffer);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/deps/SZ/sz/src/iniparser.c b/deps/SZ/sz/src/iniparser.c
new file mode 100644
index 0000000000000000000000000000000000000000..b076ed1d0f885b419c170eff5924614cd9239350
--- /dev/null
+++ b/deps/SZ/sz/src/iniparser.c
@@ -0,0 +1,774 @@
+
+/*-------------------------------------------------------------------------*/
+/**
+   @file    iniparser.c
+   @author  N. Devillard
+   @brief   Parser for ini files.
+*/
+/*--------------------------------------------------------------------------*/
+/*---------------------------- Includes ------------------------------------*/
+#include <ctype.h>
+#include "iniparser.h"
+
+/*---------------------------- Defines -------------------------------------*/
+#define ASCIILINESZ         (1024)
+#define INI_INVALID_KEY     ((char*)-1)
+
+/*---------------------------------------------------------------------------
+                        Private to this module
+ ---------------------------------------------------------------------------*/
+/**
+ * This enum stores the status for each parsed line (internal use only).
+ */
+typedef enum _line_status_ {
+    /* Initial state, before the line has been classified. */
+    LINE_UNPROCESSED,
+    /* The line matched none of the recognized syntaxes. */
+    LINE_ERROR,
+    /* Nothing is left once leading/trailing blanks are stripped. */
+    LINE_EMPTY,
+    /* First character is '#' or ';'. */
+    LINE_COMMENT,
+    /* "[section]" header. */
+    LINE_SECTION,
+    /* "key = value" entry (the value may be empty). */
+    LINE_VALUE
+} line_status ;
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Convert a string to lowercase.
+  @param    s   String to convert.
+  @return   ptr to statically allocated string.
+
+  This function returns a pointer to a statically allocated string
+  containing a lowercased version of the input string. Do not free
+  or modify the returned string! Since the returned string is statically
+  allocated, it will be modified at each function call (not re-entrant).
+ */
+/*--------------------------------------------------------------------------*/
+static char * strlwc(const char * s)
+{
+    /* Shared result buffer: overwritten by every call (not re-entrant). */
+    static char l[ASCIILINESZ+1];
+    int i ;
+
+    if (s==NULL) return NULL ;
+    memset(l, 0, ASCIILINESZ+1);
+    i=0 ;
+    /* Input longer than ASCIILINESZ characters is silently truncated. */
+    while (s[i] && i<ASCIILINESZ) {
+        /*
+         * tolower() is only defined for EOF and values representable as
+         * unsigned char; a plain char can be negative for bytes >= 0x80.
+         */
+        l[i] = (char)tolower((unsigned char)s[i]);
+        i++ ;
+    }
+    l[ASCIILINESZ]=(char)0;
+    return l ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Remove blanks at the beginning and the end of a string.
+  @param    s   String to parse.
+  @return   ptr to statically allocated string.
+
+  This function returns a pointer to a statically allocated string,
+  which is identical to the input string, except that all blank
+  characters at the end and the beg. of the string have been removed.
+  Do not free or modify the returned string! Since the returned string
+  is statically allocated, it will be modified at each function call
+  (not re-entrant).
+ */
+/*--------------------------------------------------------------------------*/
+static char * strstrip(const char * s)
+{
+    /* Shared result buffer: overwritten by every call (not re-entrant). */
+    static char l[ASCIILINESZ+1];
+    char * last;
+
+    if (s==NULL) return NULL ;
+
+    /*
+     * Skip leading blanks. The cast matters: isspace() is only defined for
+     * EOF and values representable as unsigned char, and a plain char can
+     * be negative for bytes >= 0x80.
+     */
+    while (*s && isspace((unsigned char)*s)) s++;
+    memset(l, 0, ASCIILINESZ+1);
+    /* At most ASCIILINESZ characters are kept; l[ASCIILINESZ] stays 0. */
+    strncpy(l, s, ASCIILINESZ);
+    /* Walk back from the end over trailing blanks, then cut there. */
+    last = l + strlen(l);
+    while (last > l) {
+        if (!isspace((unsigned char)*(last-1)))
+            break ;
+        last -- ;
+    }
+    *last = (char)0;
+    return (char*)l ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get number of sections in a dictionary
+  @param    d   Dictionary to examine
+  @return   int Number of sections found in dictionary
+
+  This function returns the number of sections found in a dictionary.
+  The test to recognize sections is done on the string stored in the
+  dictionary: a section name is given as "section" whereas a key is
+  stored as "section:key", thus the test looks for entries that do not
+  contain a colon.
+
+  This clearly fails in the case a section name contains a colon, but
+  this should simply be avoided.
+
+  This function returns -1 in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getnsec(dictionary * d)
+{
+    int idx ;
+    int count = 0 ;
+
+    if (d==NULL)
+        return -1 ;
+
+    /* A section is any used slot whose key has no "section:key" colon. */
+    for (idx=0 ; idx<d->size ; idx++) {
+        if (d->key[idx]!=NULL && strchr(d->key[idx], ':')==NULL)
+            count++ ;
+    }
+    return count ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get name for section n in a dictionary.
+  @param    d   Dictionary to examine
+  @param    n   Section number (from 0 to nsec-1).
+  @return   Pointer to char string
+
+  This function locates the n-th section in a dictionary and returns
+  its name as a pointer to a string statically allocated inside the
+  dictionary. Do not free or modify the returned string!
+
+  This function returns NULL in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+char * iniparser_getsecname(dictionary * d, int n)
+{
+    int idx ;
+    int remaining ;
+
+    if (d==NULL || n<0)
+        return NULL ;
+
+    /* Number of section entries still to skip before the requested one. */
+    remaining = n ;
+    for (idx=0 ; idx<d->size ; idx++) {
+        /* Unused slots and "section:key" entries are not sections. */
+        if (d->key[idx]==NULL || strchr(d->key[idx], ':')!=NULL)
+            continue ;
+        if (remaining==0)
+            return d->key[idx] ;
+        remaining-- ;
+    }
+    /* Fewer than n+1 sections in the dictionary. */
+    return NULL ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dump a dictionary to an opened file pointer.
+  @param    d   Dictionary to dump.
+  @param    f   Opened file pointer to dump to.
+  @return   void
+
+  This function prints out the contents of a dictionary, one element by
+  line, onto the provided file pointer. It is OK to specify @c stderr
+  or @c stdout as output files. This function is meant for debugging
+  purposes mostly.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_dump(dictionary * d, FILE * f)
+{
+    int idx ;
+
+    if (d==NULL || f==NULL)
+        return ;
+
+    for (idx=0 ; idx<d->size ; idx++) {
+        const char * name = d->key[idx] ;
+
+        /* Unused slot. */
+        if (name==NULL)
+            continue ;
+        /* Entries without a value are reported as UNDEF. */
+        if (d->val[idx]==NULL)
+            fprintf(f, "[%s]=UNDEF\n", name);
+        else
+            fprintf(f, "[%s]=[%s]\n", name, d->val[idx]);
+    }
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Save a dictionary to a loadable ini file
+  @param    d   Dictionary to dump
+  @param    f   Opened file pointer to dump to
+  @return   void
+
+  This function dumps a given dictionary into a loadable ini file.
+  It is Ok to specify @c stderr or @c stdout as output files.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_dump_ini(dictionary * d, FILE * f)
+{
+    int     i ;
+    int     nsec ;
+    char *  secname ;
+
+    if (d==NULL || f==NULL) return ;
+
+    nsec = iniparser_getnsec(d);
+    if (nsec<1) {
+        /* No section in file: dump all keys as they are */
+        for (i=0 ; i<d->size ; i++) {
+            if (d->key[i]==NULL)
+                continue ;
+            /*
+             * A value may legitimately be NULL (iniparser_set() accepts
+             * it); passing NULL to %s is undefined, so print it as empty,
+             * the same way iniparser_dumpsection_ini() does.
+             */
+            fprintf(f, "%s = %s\n", d->key[i], d->val[i] ? d->val[i] : "");
+        }
+        return ;
+    }
+    /* Otherwise emit each section in dictionary order. */
+    for (i=0 ; i<nsec ; i++) {
+        secname = iniparser_getsecname(d, i) ;
+        iniparser_dumpsection_ini(d, secname, f) ;
+    }
+    fprintf(f, "\n");
+    return ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Save a dictionary section to a loadable ini file
+  @param    d   Dictionary to dump
+  @param    s   Section name of dictionary to dump
+  @param    f   Opened file pointer to dump to
+  @return   void
+
+  This function dumps a given section of a given dictionary into a loadable ini
+  file.  It is Ok to specify @c stderr or @c stdout as output files.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_dumpsection_ini(dictionary * d, char * s, FILE * f)
+{
+    int     j ;
+    size_t  seclen ;
+
+    if (d==NULL || f==NULL) return ;
+    /* Also rejects s==NULL: the lookup then reports "not found". */
+    if (! iniparser_find_entry(d, s)) return ;
+
+    seclen  = strlen(s);
+    fprintf(f, "\n[%s]\n", s);
+    for (j=0 ; j<d->size ; j++) {
+        if (d->key[j]==NULL)
+            continue ;
+        /*
+         * Match keys of the form "<s>:<name>". The prefix is compared
+         * in place instead of being formatted into a fixed-size buffer,
+         * so an over-long section name cannot overflow anything. When the
+         * first seclen characters match, key[seclen] is a valid index.
+         */
+        if (!strncmp(d->key[j], s, seclen) && d->key[j][seclen]==':') {
+            fprintf(f,
+                    "%-30s = %s\n",
+                    d->key[j]+seclen+1,
+                    d->val[j] ? d->val[j] : "");
+        }
+    }
+    fprintf(f, "\n");
+    return ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the number of keys in a section of a dictionary.
+  @param    d   Dictionary to examine
+  @param    s   Section name of dictionary to examine
+  @return   Number of keys in section
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getsecnkeys(dictionary * d, char * s)
+{
+    size_t  seclen ;
+    int     nkeys ;
+    int     j ;
+
+    nkeys = 0;
+
+    if (d==NULL) return nkeys;
+    /* Also rejects s==NULL: the lookup then reports "not found". */
+    if (! iniparser_find_entry(d, s)) return nkeys;
+
+    seclen  = strlen(s);
+
+    for (j=0 ; j<d->size ; j++) {
+        if (d->key[j]==NULL)
+            continue ;
+        /*
+         * Count keys of the form "<s>:<name>". The prefix is compared in
+         * place rather than formatted into a fixed-size buffer, so an
+         * over-long section name cannot overflow anything.
+         */
+        if (!strncmp(d->key[j], s, seclen) && d->key[j][seclen]==':')
+            nkeys++;
+    }
+
+    return nkeys;
+
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the keys in a section of a dictionary.
+  @param    d   Dictionary to examine
+  @param    s   Section name of dictionary to examine
+  @return   malloc'ed array of pointers to the key strings (free the array only)
+
+  This function queries a dictionary and finds all keys in a given section.
+  Each pointer in the returned char pointer-to-pointer is pointing to
+  a string allocated in the dictionary; do not free or modify them.
+
+  This function returns NULL in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+char ** iniparser_getseckeys(dictionary * d, char * s)
+{
+    char ** keys;
+    int     i, j ;
+    size_t  seclen ;
+    int     nkeys ;
+
+    if (d==NULL) return NULL;
+    /* Also rejects s==NULL: the lookup then reports "not found". */
+    if (! iniparser_find_entry(d, s)) return NULL;
+
+    nkeys = iniparser_getsecnkeys(d, s);
+
+    /*
+     * The array itself is heap-allocated (the strings it points to belong
+     * to the dictionary). Report allocation failure as NULL instead of
+     * writing through a NULL pointer below.
+     */
+    keys = (char**) malloc(nkeys*sizeof(char*));
+    if (keys==NULL) return NULL;
+
+    seclen  = strlen(s);
+
+    i = 0;
+
+    for (j=0 ; j<d->size ; j++) {
+        if (d->key[j]==NULL)
+            continue ;
+        /*
+         * Collect keys of the form "<s>:<name>". The prefix is compared in
+         * place rather than formatted into a fixed-size buffer; i<nkeys
+         * keeps the writes inside the allocation.
+         */
+        if (i<nkeys
+            && !strncmp(d->key[j], s, seclen) && d->key[j][seclen]==':') {
+            keys[i] = d->key[j];
+            i++;
+        }
+    }
+
+    return keys;
+
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key
+  @param    d       Dictionary to search
+  @param    key     Key string to look for
+  @param    def     Default value to return if key not found.
+  @return   pointer to statically allocated character string
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the pointer passed as 'def' is returned.
+  The returned char pointer is pointing to a string allocated in
+  the dictionary, do not free or modify it.
+ */
+/*--------------------------------------------------------------------------*/
+char * iniparser_getstring(dictionary * d, const char * key, char * def)
+{
+    if (d!=NULL && key!=NULL) {
+        /* The lookup is done on the lowercased form of the key. */
+        return dictionary_get(d, strlwc(key), def);
+    }
+    /* Missing dictionary or key: hand back the caller's default. */
+    return def ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to an int
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   integer
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+
+  Supported values for integers include the usual C notation
+  so decimal, octal (starting with 0) and hexadecimal (starting with 0x)
+  are supported. Examples:
+
+  "42"      ->  42
+  "042"     ->  34 (octal -> decimal)
+  "0x42"    ->  66 (hexa  -> decimal)
+
+  Warning: the conversion may overflow in various ways. Conversion is
+  totally outsourced to strtol(), see the associated man page for overflow
+  handling.
+
+  Credits: Thanks to A. Becker for suggesting strtol()
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getint(dictionary * d, const char * key, int notfound)
+{
+    char * text = iniparser_getstring(d, key, INI_INVALID_KEY);
+
+    /* The sentinel comes back unchanged when the key is absent. */
+    if (text!=INI_INVALID_KEY) {
+        /* Base 0: strtol() picks decimal, octal or hex from the prefix. */
+        return (int)strtol(text, NULL, 0);
+    }
+    return notfound ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a long
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   long
+
+  Credits: This function bases completely on int iniparser_getint and was
+  slightly modified to return long instead of int.
+ */
+/*--------------------------------------------------------------------------*/
+long iniparser_getlint(dictionary * d, const char * key, int notfound)
+{
+    char * text = iniparser_getstring(d, key, INI_INVALID_KEY);
+
+    /* The sentinel comes back unchanged when the key is absent. */
+    if (text!=INI_INVALID_KEY) {
+        /* Base 0: strtol() picks decimal, octal or hex from the prefix. */
+        return strtol(text, NULL, 0);
+    }
+    return notfound ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a double
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   double
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+ */
+/*--------------------------------------------------------------------------*/
+double iniparser_getdouble(dictionary * d, const char * key, double notfound)
+{
+    char * text = iniparser_getstring(d, key, INI_INVALID_KEY);
+
+    /* The sentinel comes back unchanged when the key is absent. */
+    if (text!=INI_INVALID_KEY) {
+        return atof(text);
+    }
+    return notfound ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a boolean
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   integer
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+
+  A true boolean is found if one of the following is matched:
+
+  - A string starting with 'y'
+  - A string starting with 'Y'
+  - A string starting with 't'
+  - A string starting with 'T'
+  - A string starting with '1'
+
+  A false boolean is found if one of the following is matched:
+
+  - A string starting with 'n'
+  - A string starting with 'N'
+  - A string starting with 'f'
+  - A string starting with 'F'
+  - A string starting with '0'
+
+  The notfound value returned if no boolean is identified, does not
+  necessarily have to be 0 or 1.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getboolean(dictionary * d, const char * key, int notfound)
+{
+    char * text = iniparser_getstring(d, key, INI_INVALID_KEY);
+
+    if (text==INI_INVALID_KEY)
+        return notfound ;
+
+    /* Only the first character of the value decides the outcome. */
+    switch (text[0]) {
+        case 'y': case 'Y':
+        case 't': case 'T':
+        case '1':
+            return 1 ;
+
+        case 'n': case 'N':
+        case 'f': case 'F':
+        case '0':
+            return 0 ;
+
+        default:
+            /* Not recognizable as a boolean. */
+            return notfound ;
+    }
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Finds out if a given entry exists in a dictionary
+  @param    ini     Dictionary to search
+  @param    entry   Name of the entry to look for
+  @return   integer 1 if entry exists, 0 otherwise
+
+  Finds out if a given entry exists in the dictionary. Since sections
+  are stored as keys with NULL associated values, this is the only way
+  of querying for the presence of sections in a dictionary.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_find_entry(
+    dictionary  *   ini,
+    const char  *   entry
+)
+{
+    /*
+     * Look the entry up with a sentinel default: getting the sentinel
+     * back means the entry is absent (a stored value may itself be NULL).
+     */
+    char * hit = iniparser_getstring(ini, entry, INI_INVALID_KEY);
+
+    return (hit!=INI_INVALID_KEY) ? 1 : 0 ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Set an entry in a dictionary.
+  @param    ini     Dictionary to modify.
+  @param    entry   Entry to modify (entry name)
+  @param    val     New value to associate to the entry.
+  @return   int 0 if Ok, -1 otherwise.
+
+  If the given entry can be found in the dictionary, it is modified to
+  contain the provided value. If it cannot be found, -1 is returned.
+  It is Ok to set val to NULL.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_set(dictionary * ini, const char * entry, const char * val)
+{
+    /* The entry name is lowercased before it is handed to the dictionary. */
+    char * lc_entry = strlwc(entry);
+
+    return dictionary_set(ini, lc_entry, val) ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete an entry in a dictionary
+  @param    ini     Dictionary to modify
+  @param    entry   Entry to delete (entry name)
+  @return   void
+
+  If the given entry can be found, it is deleted from the dictionary.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_unset(dictionary * ini, const char * entry)
+{
+    /* The entry name is lowercased before it is handed to the dictionary. */
+    char * lc_entry = strlwc(entry);
+
+    dictionary_unset(ini, lc_entry);
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Load a single line from an INI file
+  @param    input_line  Input line, may be concatenated multi-line input
+  @param    section     Output space to store section
+  @param    key         Output space to store key
+  @param    value       Output space to store value
+  @return   line_status value
+ */
+/*--------------------------------------------------------------------------*/
+static line_status iniparser_line(
+    const char * input_line,
+    char * section,
+    char * key,
+    char * value)
+{
+    line_status sta ;
+    char        line[ASCIILINESZ+1];
+    int         len ;
+
+    /*
+     * Work on a private, blank-stripped copy of the input. strstrip()
+     * returns a static buffer, so its result is copied out right away.
+     */
+    memset(line, 0, ASCIILINESZ + 1);
+    len = (int)strlen(strstrip(input_line));
+    if (len > ASCIILINESZ)
+        len = ASCIILINESZ;
+    strncpy(line, strstrip(input_line), len);
+    len = (int)strlen(line);
+
+    /*
+     * Classify the line. The sscanf() patterns below are tried in order
+     * and the first one that matches wins, so their order is significant.
+     */
+    sta = LINE_UNPROCESSED ;
+    if (len<1) {
+        /* Empty line */
+        sta = LINE_EMPTY ;
+    } else if (line[0]=='#' || line[0]==';') {
+        /* Comment line */
+        sta = LINE_COMMENT ;
+    } else if (line[0]=='[' && line[len-1]==']') {
+        /* Section name: stored stripped and lowercased */
+        sscanf(line, "[%[^]]", section);
+        strcpy(section, strstrip(section));
+        strcpy(section, strlwc(section));
+        sta = LINE_SECTION ;
+    } else if (sscanf (line, "%[^=] = \"%[^\"]\"", key, value) == 2
+           ||  sscanf (line, "%[^=] = '%[^\']'",   key, value) == 2
+           ||  sscanf (line, "%[^=] = %[^;#]",     key, value) == 2) {
+        /*
+         * Usual key=value, with or without comments: double-quoted value,
+         * single-quoted value, then bare value up to the first ';' or '#'.
+         * The key is stored stripped and lowercased, the value stripped.
+         */
+        strcpy(key, strstrip(key));
+        strcpy(key, strlwc(key));
+        strcpy(value, strstrip(value));
+        /*
+         * sscanf cannot handle '' or "" as empty values
+         * this is done here
+         */
+        if (!strcmp(value, "\"\"") || (!strcmp(value, "''"))) {
+            value[0]=0 ;
+        }
+        sta = LINE_VALUE ;
+    } else if (sscanf(line, "%[^=] = %[;#]", key, value)==2
+           ||  sscanf(line, "%[^=] %[=]", key, value) == 2) {
+        /*
+         * Special cases:
+         * key=
+         * key=;
+         * key=#
+         * All of them produce an empty value.
+         */
+        strcpy(key, strstrip(key));
+        strcpy(key, strlwc(key));
+        value[0]=0 ;
+        sta = LINE_VALUE ;
+    } else {
+        /* Generate syntax error */
+        sta = LINE_ERROR ;
+        /* Echoes the raw and the stripped line on stdout. */
+        printf("===== > %s   ===> %s\n", input_line, line);
+    }
+    return sta ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Parse an ini file and return an allocated dictionary object
+  @param    ininame Name of the ini file to read.
+  @return   Pointer to newly allocated dictionary
+
+  This is the parser for ini files. This function is called, providing
+  the name of the file to be read. It returns a dictionary object that
+  should not be accessed directly, but through accessor functions
+  instead.
+
+  The returned dictionary must be freed using iniparser_freedict().
+ */
+/*--------------------------------------------------------------------------*/
+dictionary * iniparser_load(const char * ininame)
+{
+    FILE * in ;
+
+    char line    [ASCIILINESZ+1] ;
+    char section [ASCIILINESZ+1] ;
+    char key     [ASCIILINESZ+1] ;
+    /* Holds "section:key"; each part can be up to ASCIILINESZ characters. */
+    char tmp     [(ASCIILINESZ*2)+2] ;
+    char val     [ASCIILINESZ+1] ;
+
+    int  last=0 ;
+    int  len ;
+    int  lineno=0 ;
+    int  errs=0;
+
+    dictionary * dict ;
+
+    if ((in=fopen(ininame, "r"))==NULL) {
+        fprintf(stderr, "iniparser: cannot open %s\n", ininame);
+        return NULL ;
+    }
+
+    dict = dictionary_new(0) ;
+    if (!dict) {
+        fclose(in);
+        return NULL ;
+    }
+
+    memset(line,    0, ASCIILINESZ);
+    memset(section, 0, ASCIILINESZ);
+    memset(key,     0, ASCIILINESZ);
+    memset(val,     0, ASCIILINESZ);
+    last=0 ;
+
+    /* 'last' is where a continued ('\'-terminated) line resumes in 'line'. */
+    while (fgets(line+last, ASCIILINESZ-last, in)!=NULL) {
+        lineno++ ;
+        len = (int)strlen(line)-1;
+        if (len==0)
+            continue;
+        /*
+         * Safety check against buffer overflows: a chunk that does not end
+         * in '\n' means the line did not fit, unless we simply reached the
+         * end of a file whose last line has no trailing newline.
+         */
+        if (line[len]!='\n' && !feof(in)) {
+            fprintf(stderr,
+                    "iniparser: input line too long in %s (%d)\n",
+                    ininame,
+                    lineno);
+            dictionary_del(dict);
+            fclose(in);
+            return NULL ;
+        }
+        /* Get rid of \n and spaces at end of line */
+        while ((len>=0) &&
+                ((line[len]=='\n') || (isspace((unsigned char)line[len])))) {
+            line[len]=0 ;
+            len-- ;
+        }
+        /* An all-blank line leaves len at -1: do not index before line[0] */
+        if (len<0) {
+            len = 0 ;
+        }
+        /* Detect multi-line */
+        if (line[len]=='\\') {
+            /* Multi-line value: next read overwrites the backslash */
+            last=len ;
+            continue ;
+        } else {
+            last=0 ;
+        }
+        /*
+         * NOTE(review): errs is overwritten by the dictionary_set() result,
+         * so an earlier syntax error is forgotten once a later line is
+         * stored successfully. Left as is to keep existing files loading.
+         */
+        switch (iniparser_line(line, section, key, val)) {
+            case LINE_EMPTY:
+            case LINE_COMMENT:
+            break ;
+
+            case LINE_SECTION:
+            errs = dictionary_set(dict, section, NULL);
+            break ;
+
+            case LINE_VALUE:
+            /* Bounded: 'section' persists across lines, so the combined
+             * length is not limited by the length of the current line. */
+            snprintf(tmp, sizeof(tmp), "%s:%s", section, key);
+            errs = dictionary_set(dict, tmp, val) ;
+            break ;
+
+            case LINE_ERROR:
+            fprintf(stderr, "iniparser: syntax error in %s (%d):\n",
+                    ininame,
+                    lineno);
+            fprintf(stderr, "-> %s\n", line);
+            errs++ ;
+            break;
+
+            default:
+            break ;
+        }
+        memset(line, 0, ASCIILINESZ);
+        last=0;
+        if (errs<0) {
+            fprintf(stderr, "iniparser: memory allocation failure\n");
+            break ;
+        }
+    }
+    /* Any pending error discards the partially built dictionary. */
+    if (errs) {
+        dictionary_del(dict);
+        dict = NULL ;
+    }
+    fclose(in);
+    return dict ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Free all memory associated to an ini dictionary
+  @param    d Dictionary to free
+  @return   void
+
+  Free all memory associated to an ini dictionary.
+  It is mandatory to call this function before the dictionary object
+  gets out of the current context.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_freedict(dictionary * d)
+{
+    /* Pure forwarder: all the work is done by dictionary_del(). */
+    dictionary_del(d);
+}
+
+/* vim: set ts=4 et sw=4 tw=75 */
diff --git a/deps/SZ/sz/src/pastri.c b/deps/SZ/sz/src/pastri.c
new file mode 100644
index 0000000000000000000000000000000000000000..7c6908b5f35360351c74bd994e4cf540cf54953c
--- /dev/null
+++ b/deps/SZ/sz/src/pastri.c
@@ -0,0 +1,87 @@
+#include "pastri.h"
+#include "pastriD.h"
+#include "pastriF.h"
+
+//Read the seven whitespace-separated PaSTRI parameters (bf[0..3], originalEb,
+//dataSize, numBlocks) from the text file paramsFilename into *paramsPtr.
+//Problems are reported on stdout; *paramsPtr is left untouched if the file
+//cannot be opened.
+void SZ_pastriReadParameters(char paramsFilename[512],pastri_params *paramsPtr){
+  FILE *paramsF;
+  int nRead; //Number of fields fscanf managed to convert
+  paramsF=fopen(paramsFilename,"r");
+
+  if(paramsF==NULL){
+    printf("ERROR: Parameters file cannot be opened.\n");
+    printf("Filename: %s\n",paramsFilename);
+    assert(0);
+    return; //assert() is compiled out under NDEBUG: never pass a NULL stream to fscanf
+  }
+
+  nRead=fscanf(paramsF,"%d %d %d %d %lf %d %d",&paramsPtr->bf[0],&paramsPtr->bf[1],&paramsPtr->bf[2],&paramsPtr->bf[3],&paramsPtr->originalEb,&paramsPtr->dataSize,&paramsPtr->numBlocks);
+  if(nRead!=7){
+    //Fields that were not converted keep whatever value they had before the call
+    printf("ERROR: Parameters file is malformed: expected 7 values, read %d.\n",nRead);
+    printf("Filename: %s\n",paramsFilename);
+  }
+  fclose(paramsF);
+}
+
+//Fill in the derived fields of *p (idxRange, sbSize, sbNum, bSize, usedEb)
+//from the values that were read from the parameters file (bf, originalEb).
+void SZ_pastriPreprocessParameters(pastri_params *p){
+  int k;
+
+  //idxRange[k] = (bf[k]+1)*(bf[k]+2)/2 for each of the four indices
+  for(k=0;k<4;k++){
+    p->idxRange[k]=(p->bf[k]+1)*(p->bf[k]+2)/2;
+  }
+
+  //Sub-block size comes from indices 2 and 3, sub-block count from 0 and 1
+  p->sbSize=p->idxRange[2]*p->idxRange[3];
+  p->sbNum=p->idxRange[0]*p->idxRange[1];
+  p->bSize=p->sbSize*p->sbNum;
+
+  //Slightly tighter bound than requested, just to absorb rounding errors.
+  //It has almost no effect on compression rate/ratios.
+  p->usedEb=p->originalEb*0.999;
+}
+
+//Compress p->numBlocks consecutive blocks of originalBuf into a newly
+//allocated buffer returned through *compressedBufP. The output starts with a
+//raw copy of *p, followed by the compressed blocks back to back.
+//*compressedBytes receives the number of bytes written; on allocation
+//failure *compressedBufP is NULL and *compressedBytes is 0.
+void SZ_pastriCompressBatch(pastri_params *p,unsigned char *originalBuf, unsigned char** compressedBufP,size_t *compressedBytes){
+  //Size of one uncompressed block, computed in size_t so large batches do not overflow int
+  size_t blockBytes=(size_t)p->bSize*(size_t)p->dataSize;
+  size_t bytePos=0; //Current byte pos in the outBuf
+  int bytes; //bytes for this block
+  int i;
+
+  *compressedBytes=0;
+  //Room for the parameter header plus every block at its uncompressed size
+  (*compressedBufP) = (unsigned char*)calloc(sizeof(pastri_params)+(size_t)p->numBlocks*blockBytes,sizeof(char));
+  if((*compressedBufP)==NULL){
+    printf("ERROR: Cannot allocate memory for the compressed buffer.\n");
+    return;
+  }
+
+  memcpy(*compressedBufP, p, sizeof(pastri_params));
+  bytePos+=sizeof(pastri_params);
+
+  for(i=0;i<p->numBlocks;i++){
+    bytes=0; //Stays 0 when dataSize is neither 8 nor 4, so bytePos never advances by garbage
+    if(p->dataSize==8){
+      pastri_double_Compress(originalBuf + ((size_t)i*blockBytes),p,(*compressedBufP) + bytePos,&bytes);
+    }else if(p->dataSize==4){
+      pastri_float_Compress(originalBuf + ((size_t)i*blockBytes),p,(*compressedBufP) + bytePos,&bytes);
+    }
+    bytePos+=bytes;
+  }
+  *compressedBytes=bytePos;
+}
+
+//Inverse of SZ_pastriCompressBatch: restore *p from the header at the start
+//of compressedBuf, then decompress p->numBlocks blocks into a newly allocated
+//buffer returned through *decompressedBufP. *decompressedBytes receives the
+//size of that buffer; on allocation failure *decompressedBufP is NULL and
+//*decompressedBytes is 0.
+void SZ_pastriDecompressBatch(unsigned char*compressedBuf, pastri_params *p, unsigned char** decompressedBufP ,size_t *decompressedBytes){
+  size_t bytePos=0; //Current byte pos in the compressed input
+  size_t blockBytes; //Size of one decompressed block
+  size_t totalBytes; //Size of the whole decompressed batch
+  int bytes; //bytes for this block
+  int i;
+
+  //The parameters travel in front of the compressed blocks
+  memcpy(p, compressedBuf, sizeof(pastri_params));
+  bytePos+=sizeof(pastri_params);
+
+  //Sizes are computed in size_t so large batches do not overflow int
+  blockBytes=(size_t)p->bSize*(size_t)p->dataSize;
+  totalBytes=(size_t)p->numBlocks*blockBytes;
+
+  *decompressedBytes=0;
+  (*decompressedBufP) = (unsigned char*)malloc(totalBytes*sizeof(char));
+  if((*decompressedBufP)==NULL){
+    if(totalBytes!=0){ //malloc(0) may legitimately return NULL
+      printf("ERROR: Cannot allocate memory for the decompressed buffer.\n");
+    }
+    return;
+  }
+
+  for(i=0;i<p->numBlocks;i++){
+    bytes=0; //Stays 0 when dataSize is neither 8 nor 4, so bytePos never advances by garbage
+    if(p->dataSize==8){
+      pastri_double_Decompress(compressedBuf + bytePos,p->dataSize,p,(*decompressedBufP) + ((size_t)i*blockBytes),&bytes);
+    }else if(p->dataSize==4){
+      pastri_float_Decompress(compressedBuf + bytePos,p->dataSize,p,(*decompressedBufP) + ((size_t)i*blockBytes),&bytes);
+    }
+
+    bytePos += bytes;
+  }
+  *decompressedBytes=totalBytes;
+}
+
+//Run the per-block check on every block of the batch, pairing each original
+//block with the decompressed block at the same byte offset.
+void SZ_pastriCheckBatch(pastri_params *p,unsigned char*originalBuf,unsigned char*decompressedBuf){
+  int blk;
+  unsigned char *origBlock;
+  unsigned char *decBlock;
+
+  for(blk=0;blk<p->numBlocks;blk++){
+    //Only 8-byte (double) and 4-byte (float) elements have a checker
+    if(p->dataSize!=8 && p->dataSize!=4){
+      continue;
+    }
+    origBlock=originalBuf+(blk*p->bSize*p->dataSize);
+    decBlock=decompressedBuf+(blk*p->bSize*p->dataSize);
+    if(p->dataSize==8){
+      pastri_double_Check(origBlock,p->dataSize,decBlock,p);
+    }else{
+      pastri_float_Check(origBlock,p->dataSize,decBlock,p);
+    }
+  }
+}
diff --git a/deps/SZ/sz/src/rw.c b/deps/SZ/sz/src/rw.c
new file mode 100644
index 0000000000000000000000000000000000000000..c023645597dcbe2bbaede608e53d9eb7bbc529b0
--- /dev/null
+++ b/deps/SZ/sz/src/rw.c
@@ -0,0 +1,1070 @@
+/**
+ *  @file rw.c
+ *  @author Sheng Di
+ *  @date April, 2015
+ *  @brief io interface for fortrance
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#include "rw.h"
+#include "sz.h"
+
+/* Report whether filePath exists: returns 1 when access(filePath, F_OK)
+ * succeeds and 0 otherwise. */
+int checkFileExistance(char* filePath)
+{
+	/* access() yields -1 when the path cannot be found */
+	return (access( filePath, F_OK ) == -1) ? 0 : 1;
+}
+
+/* Allocate an m x n float matrix as an array of m independently malloc'ed
+ * rows of n floats. Release it with free2DArray_float(data, m). */
+float** create2DArray_float(size_t m, size_t n)
+{
+	float **rows = (float**)malloc(sizeof(float*)*m);
+	size_t r;
+	for(r=0;r<m;r++)
+	{
+		rows[r] = (float*)malloc(sizeof(float)*n);
+	}
+	return rows;
+}
+
+/* Free a float matrix made of m separately allocated rows, then the row
+ * table itself. */
+void free2DArray_float(float** data, size_t m)
+{
+	size_t r = 0;
+	while(r<m)
+	{
+		free(data[r]);
+		r++;
+	}
+	free(data);
+}
+
+/**
+ * Allocate a p x m x n float array as nested pointer tables.
+ *
+ * data[i] holds m row pointers and data[i][j] holds n floats, so the array
+ * is addressed as data[i][j][k] with i < p, j < m, k < n.
+ * Release it with free3DArray_float(data, p, m).
+ *
+ * @param p size of the first (outermost) dimension
+ * @param m size of the second dimension
+ * @param n size of the third (innermost) dimension
+ * @return the top-level pointer table
+ */
+float*** create3DArray_float(size_t p, size_t m, size_t n)
+{
+	size_t i = 0, j = 0;
+	/* The top-level table is indexed by i < p, so it needs p entries. */
+	float ***data = (float***)malloc(sizeof(float**)*p);
+	for(i=0;i<p;i++)
+	{
+		/* Each plane is indexed by j < m, so it needs m row pointers. */
+		data[i] = (float**)malloc(sizeof(float*)*m);
+		for(j=0;j<m;j++)
+			data[i][j] = (float*)malloc(sizeof(float)*n);
+	}
+	return data;
+}
+
+/* Free a p x m nested float array: every innermost row, then each plane's
+ * row table, then the top-level table. */
+void free3DArray_float(float*** data, size_t p, size_t m)
+{
+	size_t plane = 0;
+	while(plane<p)
+	{
+		size_t row = 0;
+		while(row<m)
+		{
+			free(data[plane][row]);
+			row++;
+		}
+		free(data[plane]);
+		plane++;
+	}
+	free(data);
+}
+
+/* Allocate an m x n double matrix as an array of m independently malloc'ed
+ * rows of n doubles. Release it with free2DArray_double(data, m). */
+double** create2DArray_double(size_t m, size_t n)
+{
+	double **rows = (double**)malloc(sizeof(double*)*m);
+	size_t r;
+	for(r=0;r<m;r++)
+	{
+		rows[r] = (double*)malloc(sizeof(double)*n);
+	}
+	return rows;
+}
+
+/* Free a double matrix made of m separately allocated rows, then the row
+ * table itself. */
+void free2DArray_double(double** data, size_t m)
+{
+	size_t r = 0;
+	while(r<m)
+	{
+		free(data[r]);
+		r++;
+	}
+	free(data);
+}
+
+/**
+ * Allocate a p x m x n double array as nested pointer tables.
+ *
+ * data[i] holds m row pointers and data[i][j] holds n doubles, so the array
+ * is addressed as data[i][j][k] with i < p, j < m, k < n.
+ * Release it with free3DArray_double(data, p, m).
+ *
+ * @param p size of the first (outermost) dimension
+ * @param m size of the second dimension
+ * @param n size of the third (innermost) dimension
+ * @return the top-level pointer table
+ */
+double*** create3DArray_double(size_t p, size_t m, size_t n)
+{
+	size_t i = 0, j = 0;
+	/* The top-level table is indexed by i < p, so it needs p entries. */
+	double ***data = (double***)malloc(sizeof(double**)*p);
+	for(i=0;i<p;i++)
+	{
+		/* Each plane is indexed by j < m, so it needs m row pointers. */
+		data[i] = (double**)malloc(sizeof(double*)*m);
+		for(j=0;j<m;j++)
+			data[i][j] = (double*)malloc(sizeof(double)*n);
+	}
+	return data;
+}
+
+/* Free a p x m nested double array: every innermost row, then each plane's
+ * row table, then the top-level table. */
+void free3DArray_double(double*** data, size_t p, size_t m)
+{
+	size_t plane = 0;
+	while(plane<p)
+	{
+		size_t row = 0;
+		while(row<m)
+		{
+			free(data[plane][row]);
+			row++;
+		}
+		free(data[plane]);
+		plane++;
+	}
+	free(data);
+}
+
+/**
+ * Return the size of a file in bytes.
+ *
+ * @param srcFilePath path of the file to measure
+ * @param status      receives SZ_SCES on success, SZ_FERR when the file
+ *                    cannot be opened or its size cannot be determined
+ * @return the file size, or (size_t)-1 on failure
+ */
+size_t checkFileSize(char *srcFilePath, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return -1;
+	}
+	/* ftell() reports failure as -1; keep it in a signed variable so the
+	 * error is not silently turned into a huge size_t. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	fclose(pFile);
+	if (fileLen < 0)
+	{
+		printf("Failed to determine the size of the input file.\n");
+		*status = SZ_FERR;
+		return -1;
+	}
+	*status = SZ_SCES;
+	return (size_t)fileLen;
+}
+
+/**
+ * Read an entire file into a newly allocated byte buffer.
+ *
+ * The file is opened once: its length is measured by seeking to the end,
+ * then the stream is rewound and read in full.
+ *
+ * @param srcFilePath path of the file to read
+ * @param byteLength  receives the file length in bytes (written only once
+ *                    the length has been determined)
+ * @param status      receives SZ_SCES on success, SZ_FERR on any failure
+ * @return malloc'ed buffer owned by the caller, or 0 (NULL) on failure
+ */
+unsigned char *readByteData(char *srcFilePath, size_t *byteLength, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return 0;
+	}
+	/* ftell() reports failure as -1, so measure into a signed variable. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen < 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		printf("Failed to determine the size of the input file.\n");
+		fclose(pFile);
+		*status = SZ_FERR;
+		return 0;
+	}
+	*byteLength = (size_t)fileLen;
+
+	unsigned char *byteBuf = (unsigned char *)malloc((*byteLength)*sizeof(unsigned char)); //sizeof(char)==1
+	/* malloc(0) may legitimately return NULL, so only a non-empty request counts as failure. */
+	if (byteBuf == NULL && *byteLength > 0)
+	{
+		printf("Failed to allocate memory for the input file.\n");
+		fclose(pFile);
+		*status = SZ_FERR;
+		return 0;
+	}
+
+	size_t nRead = 0;
+	if (*byteLength > 0)
+		nRead = fread(byteBuf, 1, *byteLength, pFile);
+	fclose(pFile);
+	if (nRead != *byteLength)
+	{
+		/* A short read would hand the caller partly uninitialized data. */
+		printf("Failed to read input file.\n");
+		free(byteBuf);
+		*status = SZ_FERR;
+		return 0;
+	}
+	*status = SZ_SCES;
+	return byteBuf;
+}
+
+/**
+ * Read a binary file of doubles, honouring the configured data endianness.
+ *
+ * When dataEndianType equals sysEndianType the file is loaded directly by
+ * readDoubleData_systemEndian. Otherwise the raw bytes are loaded and every
+ * 8-byte element is passed through symTransform_8bytes (defined elsewhere;
+ * presumably a byte-order reversal) before being stored.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the number of elements read
+ * @param status      receives SZ_SCES on success or SZ_FERR on file error
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+double *readDoubleData(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		double *daBuf = readDoubleData_systemEndian(srcFilePath, nbEle,&state);
+		*status = state;
+		return daBuf;
+	}
+	else
+	{
+		size_t i,j;
+
+		size_t byteLength;
+		unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+		if(state==SZ_FERR)
+		{
+			*status = SZ_FERR;
+			return NULL;
+		}
+		double *daBuf = (double *)malloc(byteLength);
+		*nbEle = byteLength/8;
+
+		ldouble buf;
+		for(i = 0;i<*nbEle;i++)
+		{
+			j = i*8;
+			memcpy(buf.byte, bytes+j, 8);
+			symTransform_8bytes(buf.byte);
+			daBuf[i] = buf.value;
+		}
+		free(bytes);
+		/* Report the outcome on this path too; callers must not see a stale status. */
+		*status = state;
+		return daBuf;
+	}
+}
+
+
+/* Load a file of 8-bit signed integers by forwarding to
+ * readInt8Data_systemEndian (no endianness branch is needed here) and
+ * relaying the status it reports. */
+int8_t *readInt8Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int rc = SZ_SCES;
+	int8_t *values = readInt8Data_systemEndian(srcFilePath, nbEle, &rc);
+	*status = rc;
+	return values;
+}
+
+/**
+ * Read a binary file of int16_t values, honouring the configured data endianness.
+ *
+ * When dataEndianType equals sysEndianType the file is loaded directly by
+ * readInt16Data_systemEndian. Otherwise the raw bytes are loaded and every
+ * 2-byte element is passed through symTransform_2bytes (defined elsewhere;
+ * presumably a byte-order reversal) before being stored.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the number of elements read
+ * @param status      receives SZ_SCES on success or SZ_FERR on file error
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+int16_t *readInt16Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		int16_t *daBuf = readInt16Data_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+	else
+	{
+		size_t i,j;
+
+		size_t byteLength;
+		unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+		if(state == SZ_FERR)
+		{
+			*status = SZ_FERR;
+			return NULL;
+		}
+		int16_t *daBuf = (int16_t *)malloc(byteLength);
+		*nbEle = byteLength/2;
+
+		lint16 buf;
+		for(i = 0;i<*nbEle;i++)
+		{
+			j = i << 1;//*2
+			memcpy(buf.byte, bytes+j, 2);
+			symTransform_2bytes(buf.byte);
+			daBuf[i] = buf.svalue;
+		}
+		free(bytes);
+		/* Report the outcome on this path too; callers must not see a stale status. */
+		*status = state;
+		return daBuf;
+	}
+}
+
+/**
+ * Read a binary file of uint16_t values, honouring the configured data endianness.
+ *
+ * When dataEndianType equals sysEndianType the file is loaded directly by
+ * readUInt16Data_systemEndian. Otherwise the raw bytes are loaded and every
+ * 2-byte element is passed through symTransform_2bytes (defined elsewhere;
+ * presumably a byte-order reversal) before being stored.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the number of elements read
+ * @param status      receives SZ_SCES on success or SZ_FERR on file error
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+uint16_t *readUInt16Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		uint16_t *daBuf = readUInt16Data_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+	else
+	{
+		size_t i,j;
+
+		size_t byteLength;
+		unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+		if(state == SZ_FERR)
+		{
+			*status = SZ_FERR;
+			return NULL;
+		}
+		uint16_t *daBuf = (uint16_t *)malloc(byteLength);
+		*nbEle = byteLength/2;
+
+		lint16 buf;
+		for(i = 0;i<*nbEle;i++)
+		{
+			j = i << 1;//*2
+			memcpy(buf.byte, bytes+j, 2);
+			symTransform_2bytes(buf.byte);
+			daBuf[i] = buf.usvalue;
+		}
+		free(bytes);
+		/* Report the outcome on this path too; callers must not see a stale status. */
+		*status = state;
+		return daBuf;
+	}
+}
+
+/**
+ * Read a binary file of int32_t values, honouring the configured data endianness.
+ *
+ * When dataEndianType equals sysEndianType the file is loaded directly by
+ * readInt32Data_systemEndian. Otherwise the raw bytes are loaded and every
+ * 4-byte element is passed through symTransform_4bytes (defined elsewhere;
+ * presumably a byte-order reversal) before being stored.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the number of elements read
+ * @param status      receives SZ_SCES on success or SZ_FERR on file error
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+int32_t *readInt32Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		int32_t *daBuf = readInt32Data_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+	else
+	{
+		size_t i,j;
+
+		size_t byteLength;
+		unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+		if(state == SZ_FERR)
+		{
+			*status = SZ_FERR;
+			return NULL;
+		}
+		int32_t *daBuf = (int32_t *)malloc(byteLength);
+		*nbEle = byteLength/4;
+
+		lint32 buf;
+		for(i = 0;i<*nbEle;i++)
+		{
+			j = i*4;
+			memcpy(buf.byte, bytes+j, 4);
+			symTransform_4bytes(buf.byte);
+			daBuf[i] = buf.ivalue;
+		}
+		free(bytes);
+		/* Report the outcome on this path too; callers must not see a stale status. */
+		*status = state;
+		return daBuf;
+	}
+}
+
+/**
+ * Read a binary file of uint32_t values, honouring the configured data endianness.
+ *
+ * When dataEndianType equals sysEndianType the file is loaded directly by
+ * readUInt32Data_systemEndian. Otherwise the raw bytes are loaded and every
+ * 4-byte element is passed through symTransform_4bytes (defined elsewhere;
+ * presumably a byte-order reversal) before being stored.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the number of elements read
+ * @param status      receives SZ_SCES on success or SZ_FERR on file error
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+uint32_t *readUInt32Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		uint32_t *daBuf = readUInt32Data_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+	else
+	{
+		size_t i,j;
+
+		size_t byteLength;
+		unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+		if(state == SZ_FERR)
+		{
+			*status = SZ_FERR;
+			return NULL;
+		}
+		uint32_t *daBuf = (uint32_t *)malloc(byteLength);
+		*nbEle = byteLength/4;
+
+		lint32 buf;
+		for(i = 0;i<*nbEle;i++)
+		{
+			j = i << 2; //*4
+			memcpy(buf.byte, bytes+j, 4);
+			symTransform_4bytes(buf.byte);
+			daBuf[i] = buf.uivalue;
+		}
+		free(bytes);
+		/* Report the outcome on this path too; callers must not see a stale status. */
+		*status = state;
+		return daBuf;
+	}
+}
+
+/**
+ * Read a binary file of int64_t values, honouring the configured data endianness.
+ *
+ * When dataEndianType equals sysEndianType the file is loaded directly by
+ * readInt64Data_systemEndian. Otherwise the raw bytes are loaded and every
+ * 8-byte element is passed through symTransform_8bytes (defined elsewhere;
+ * presumably a byte-order reversal) before being stored.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the number of elements read
+ * @param status      receives SZ_SCES on success or SZ_FERR on file error
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+int64_t *readInt64Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		int64_t *daBuf = readInt64Data_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+	else
+	{
+		size_t i,j;
+
+		size_t byteLength;
+		unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+		if(state == SZ_FERR)
+		{
+			*status = SZ_FERR;
+			return NULL;
+		}
+		int64_t *daBuf = (int64_t *)malloc(byteLength);
+		*nbEle = byteLength/8;
+
+		lint64 buf;
+		for(i = 0;i<*nbEle;i++)
+		{
+			j = i << 3; //*8
+			memcpy(buf.byte, bytes+j, 8);
+			symTransform_8bytes(buf.byte);
+			daBuf[i] = buf.lvalue;
+		}
+		free(bytes);
+		/* Report the outcome on this path too; callers must not see a stale status. */
+		*status = state;
+		return daBuf;
+	}
+}
+
+/**
+ * Read a binary file of uint64_t values, honouring the configured data endianness.
+ *
+ * When dataEndianType equals sysEndianType the file is loaded directly by
+ * readUInt64Data_systemEndian. Otherwise the raw bytes are loaded and every
+ * 8-byte element is passed through symTransform_8bytes (defined elsewhere;
+ * presumably a byte-order reversal) before being stored.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the number of elements read
+ * @param status      receives SZ_SCES on success or SZ_FERR on file error
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+uint64_t *readUInt64Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		uint64_t *daBuf = readUInt64Data_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+	else
+	{
+		size_t i,j;
+
+		size_t byteLength;
+		unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+		if(state == SZ_FERR)
+		{
+			*status = SZ_FERR;
+			return NULL;
+		}
+		uint64_t *daBuf = (uint64_t *)malloc(byteLength);
+		*nbEle = byteLength/8;
+
+		lint64 buf;
+		for(i = 0;i<*nbEle;i++)
+		{
+			j = i << 3; //*8
+			memcpy(buf.byte, bytes+j, 8);
+			symTransform_8bytes(buf.byte);
+			daBuf[i] = buf.ulvalue;
+		}
+		free(bytes);
+		/* Report the outcome on this path too; callers must not see a stale status. */
+		*status = state;
+		return daBuf;
+	}
+}
+
+
+/**
+ * Read a binary file of floats, honouring the configured data endianness.
+ *
+ * When dataEndianType equals sysEndianType the file is loaded directly by
+ * readFloatData_systemEndian. Otherwise the raw bytes are loaded and every
+ * 4-byte element is passed through symTransform_4bytes (defined elsewhere;
+ * presumably a byte-order reversal) before being stored.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the number of elements read
+ * @param status      receives SZ_SCES on success or SZ_FERR on file error
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+float *readFloatData(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		float *daBuf = readFloatData_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+	else
+	{
+		size_t i,j;
+
+		size_t byteLength;
+		unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+		if(state == SZ_FERR)
+		{
+			*status = SZ_FERR;
+			return NULL;
+		}
+		float *daBuf = (float *)malloc(byteLength);
+		*nbEle = byteLength/4;
+
+		lfloat buf;
+		for(i = 0;i<*nbEle;i++)
+		{
+			j = i*4;
+			memcpy(buf.byte, bytes+j, 4);
+			symTransform_4bytes(buf.byte);
+			daBuf[i] = buf.value;
+		}
+		free(bytes);
+		/* Report the outcome on this path too; callers must not see a stale status. */
+		*status = state;
+		return daBuf;
+	}
+}
+
+/**
+ * Read a file of raw doubles without any byte-order conversion.
+ *
+ * The file is opened once: its length is measured by seeking to the end,
+ * then the stream is rewound and read. An empty or unmeasurable file is
+ * rejected, matching the size check in the other *_systemEndian readers.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the element count (file size / 8); 0 when
+ *                    the file size is rejected
+ * @param status      receives SZ_SCES on success, SZ_FERR on any failure
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+double *readDoubleData_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* ftell() reports failure as -1, so measure into a signed variable. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/8; //only support double in this version
+
+	double *daBuf = (double *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 8, *nbEle, pFile) != *nbEle)
+	{
+		/* Allocation failure or short read: release everything and report it. */
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+
+/**
+ * Read a file of raw int8_t values.
+ *
+ * The file is opened once: its length is measured by seeking to the end,
+ * then the stream is rewound and read. An empty or unmeasurable file is
+ * reported as an error.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the element count (file size in bytes); 0
+ *                    when the file size is rejected
+ * @param status      receives SZ_SCES on success, SZ_FERR on any failure
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+int8_t *readInt8Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* ftell() reports failure as -1, so measure into a signed variable. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* Stop here so the error status is not overwritten by a later success. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize;
+
+	int8_t *daBuf = (int8_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 1, *nbEle, pFile) != *nbEle)
+	{
+		/* Allocation failure or short read: release everything and report it. */
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+
+/**
+ * Read a file of raw int16_t values without any byte-order conversion.
+ *
+ * The file is opened once: its length is measured by seeking to the end,
+ * then the stream is rewound and read. An empty or unmeasurable file is
+ * reported as an error.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the element count (file size / 2); 0 when
+ *                    the file size is rejected
+ * @param status      receives SZ_SCES on success, SZ_FERR on any failure
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+int16_t *readInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* ftell() reports failure as -1, so measure into a signed variable. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* Stop here so the error status is not overwritten by a later success. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/2;
+
+	int16_t *daBuf = (int16_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 2, *nbEle, pFile) != *nbEle)
+	{
+		/* Allocation failure or short read: release everything and report it. */
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Read a file of raw uint16_t values without any byte-order conversion.
+ *
+ * The file is opened once: its length is measured by seeking to the end,
+ * then the stream is rewound and read. An empty or unmeasurable file is
+ * reported as an error.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the element count (file size / 2); 0 when
+ *                    the file size is rejected
+ * @param status      receives SZ_SCES on success, SZ_FERR on any failure
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+uint16_t *readUInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* ftell() reports failure as -1, so measure into a signed variable. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* Stop here so the error status is not overwritten by a later success. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/2;
+
+	uint16_t *daBuf = (uint16_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 2, *nbEle, pFile) != *nbEle)
+	{
+		/* Allocation failure or short read: release everything and report it. */
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Read a file of raw int32_t values without any byte-order conversion.
+ *
+ * The file is opened once: its length is measured by seeking to the end,
+ * then the stream is rewound and read. An empty or unmeasurable file is
+ * reported as an error.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the element count (file size / 4); 0 when
+ *                    the file size is rejected
+ * @param status      receives SZ_SCES on success, SZ_FERR on any failure
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+int32_t *readInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* ftell() reports failure as -1, so measure into a signed variable. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* Stop here so the error status is not overwritten by a later success. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/4;
+
+	int32_t *daBuf = (int32_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 4, *nbEle, pFile) != *nbEle)
+	{
+		/* Allocation failure or short read: release everything and report it. */
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Read a file of raw uint32_t values without any byte-order conversion.
+ *
+ * The file is opened once: its length is measured by seeking to the end,
+ * then the stream is rewound and read. An empty or unmeasurable file is
+ * reported as an error.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the element count (file size / 4); 0 when
+ *                    the file size is rejected
+ * @param status      receives SZ_SCES on success, SZ_FERR on any failure
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+uint32_t *readUInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* ftell() reports failure as -1, so measure into a signed variable. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* Stop here so the error status is not overwritten by a later success. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/4;
+
+	uint32_t *daBuf = (uint32_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 4, *nbEle, pFile) != *nbEle)
+	{
+		/* Allocation failure or short read: release everything and report it. */
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Read a file of raw int64_t values without any byte-order conversion.
+ *
+ * The file is opened once: its length is measured by seeking to the end,
+ * then the stream is rewound and read. An empty or unmeasurable file is
+ * reported as an error.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the element count (file size / 8); 0 when
+ *                    the file size is rejected
+ * @param status      receives SZ_SCES on success, SZ_FERR on any failure
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+int64_t *readInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* ftell() reports failure as -1, so measure into a signed variable. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* Stop here so the error status is not overwritten by a later success. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/8;
+
+	int64_t *daBuf = (int64_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 8, *nbEle, pFile) != *nbEle)
+	{
+		/* Allocation failure or short read: release everything and report it. */
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Read a file of raw uint64_t values without any byte-order conversion.
+ *
+ * The file is opened once: its length is measured by seeking to the end,
+ * then the stream is rewound and read. An empty or unmeasurable file is
+ * reported as an error.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the element count (file size / 8); 0 when
+ *                    the file size is rejected
+ * @param status      receives SZ_SCES on success, SZ_FERR on any failure
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+uint64_t *readUInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* ftell() reports failure as -1, so measure into a signed variable. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* Stop here so the error status is not overwritten by a later success. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/8;
+
+	uint64_t *daBuf = (uint64_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 8, *nbEle, pFile) != *nbEle)
+	{
+		/* Allocation failure or short read: release everything and report it. */
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Read a file of raw floats without any byte-order conversion.
+ *
+ * The file is opened once: its length is measured by seeking to the end,
+ * then the stream is rewound and read. An empty or unmeasurable file is
+ * reported as an error.
+ *
+ * @param srcFilePath path of the file to read
+ * @param nbEle       receives the element count (file size / 4); 0 when
+ *                    the file size is rejected
+ * @param status      receives SZ_SCES on success, SZ_FERR on any failure
+ * @return malloc'ed array owned by the caller, or NULL on failure
+ */
+float *readFloatData_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* ftell() reports failure as -1, so measure into a signed variable. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* Stop here so the error status is not overwritten by a later success. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/4;
+
+	float *daBuf = (float *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 4, *nbEle, pFile) != *nbEle)
+	{
+		/* Allocation failure or short read: release everything and report it. */
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Write a byte buffer to a file, replacing any existing content.
+ *
+ * @param bytes       data to write
+ * @param byteLength  number of bytes to write
+ * @param tgtFilePath path of the output file (opened in "wb" mode)
+ * @param status      receives SZ_SCES on success; SZ_FERR when the file
+ *                    cannot be opened, the write is short, or closing the
+ *                    file fails
+ */
+void writeByteData(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status)
+{
+	FILE *pFile = fopen(tgtFilePath, "wb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 3\n");
+		*status = SZ_FERR;
+		return;
+	}
+
+	size_t written = fwrite(bytes, 1, byteLength, pFile); //write outSize bytes
+	/* fclose() flushes buffered data, so its result is part of the write outcome. */
+	int closeRc = fclose(pFile);
+	if (written != byteLength || closeRc != 0)
+	{
+		printf("Failed to write output file.\n");
+		*status = SZ_FERR;
+		return;
+	}
+	*status = SZ_SCES;
+}
+
+/* Write nbEle doubles to tgtFilePath as text, one value per line, each
+ * formatted with "%.20G". *status receives SZ_FERR if the file cannot be
+ * opened and SZ_SCES otherwise. */
+void writeDoubleData(double *data, size_t nbEle, char *tgtFilePath, int *status)
+{
+	FILE *out = fopen(tgtFilePath, "wb");
+	if (out == NULL)
+	{
+		printf("Failed to open input file. 3\n");
+		*status = SZ_FERR;
+		return;
+	}
+
+	size_t k;
+	for(k = 0;k<nbEle;k++)
+		fprintf(out,"%.20G\n",data[k]);
+
+	fclose(out);
+	*status = SZ_SCES;
+}
+
+/* Write nbEle floats to tgtFilePath as text, one value per line, each
+ * formatted with "%.30G". *status receives SZ_FERR if the file cannot be
+ * opened and SZ_SCES otherwise. */
+void writeFloatData(float *data, size_t nbEle, char *tgtFilePath, int *status)
+{
+	FILE *out = fopen(tgtFilePath, "wb");
+	if (out == NULL)
+	{
+		printf("Failed to open input file. 3\n");
+		*status = SZ_FERR;
+		return;
+	}
+
+	size_t k;
+	for(k = 0;k<nbEle;k++)
+		fprintf(out,"%.30G\n",data[k]);
+
+	fclose(out);
+	*status = SZ_SCES;
+}
+
+/* Dispatch a text dump of data to writeFloatData or writeDoubleData
+ * according to dataType (SZ_FLOAT or SZ_DOUBLE). Any other type prints an
+ * error and sets *status to SZ_TERR; otherwise *status receives the
+ * writer's result. */
+void writeDataSZ(void *data, int dataType, size_t nbEle, char *tgtFilePath, int *status)
+{
+	int rc = SZ_SCES;
+	if(dataType != SZ_FLOAT && dataType != SZ_DOUBLE)
+	{
+		printf("Error: data type cannot be the types other than SZ_FLOAT or SZ_DOUBLE\n");
+		*status = SZ_TERR; //wrong type
+		return;
+	}
+
+	if(dataType == SZ_FLOAT)
+		writeFloatData((float *)data, nbEle, tgtFilePath, &rc);
+	else
+		writeDoubleData((double *)data, nbEle, tgtFilePath, &rc);
+	*status = rc;
+}
+
+/* Copy nbEle floats into a byte buffer (4 bytes per element, taken from the
+ * lfloat union in storage order) and write the buffer to tgtFilePath.
+ * *status receives the result reported by writeByteData. */
+void writeFloatData_inBytes(float *data, size_t nbEle, char* tgtFilePath, int *status)
+{
+	int rc = SZ_SCES;
+	size_t byteLength = nbEle*sizeof(float);
+	unsigned char* bytes = (unsigned char*)malloc(byteLength);
+	unsigned char* cursor = bytes;
+	lfloat cell;
+	size_t k;
+	for(k=0;k<nbEle;k++)
+	{
+		cell.value = data[k];
+		memcpy(cursor, cell.byte, 4);
+		cursor += 4;
+	}
+
+	writeByteData(bytes, byteLength, tgtFilePath, &rc);
+	free(bytes);
+	*status = rc;
+}
+
+/* Copy nbEle doubles into a byte buffer (8 bytes per element, taken from
+ * the ldouble union in storage order) and write the buffer to tgtFilePath.
+ * *status receives the result reported by writeByteData. */
+void writeDoubleData_inBytes(double *data, size_t nbEle, char* tgtFilePath, int *status)
+{
+	int rc = SZ_SCES;
+	size_t byteLength = nbEle*sizeof(double);
+	unsigned char* bytes = (unsigned char*)malloc(byteLength);
+	unsigned char* cursor = bytes;
+	ldouble cell;
+	size_t k;
+	for(k=0;k<nbEle;k++)
+	{
+		cell.value = data[k];
+		memcpy(cursor, cell.byte, 8);
+		cursor += 8;
+	}
+
+	writeByteData(bytes, byteLength, tgtFilePath, &rc);
+	free(bytes);
+	*status = rc;
+}
+
+/* Pack stateLength shorts into a buffer of 2 bytes per element with
+ * convertShortArrayToBytes and write that buffer to tgtFilePath.
+ * *status receives the result reported by writeByteData. */
+void writeShortData_inBytes(short *states, size_t stateLength, char *tgtFilePath, int *status)
+{
+	int rc = SZ_SCES;
+	const size_t nBytes = stateLength*2;
+	unsigned char* packed = (unsigned char*)malloc(nBytes*sizeof(char));
+
+	convertShortArrayToBytes(states, stateLength, packed);
+	writeByteData(packed, nBytes, tgtFilePath, &rc);
+
+	free(packed);
+	*status = rc;
+}
+
+/* Pack stateLength unsigned shorts into a buffer of 2 bytes per element
+ * with convertUShortArrayToBytes and write that buffer to tgtFilePath.
+ * *status receives the result reported by writeByteData. */
+void writeUShortData_inBytes(unsigned short *states, size_t stateLength, char *tgtFilePath, int *status)
+{
+	int rc = SZ_SCES;
+	const size_t nBytes = stateLength*2;
+	unsigned char* packed = (unsigned char*)malloc(nBytes*sizeof(char));
+
+	convertUShortArrayToBytes(states, stateLength, packed);
+	writeByteData(packed, nBytes, tgtFilePath, &rc);
+
+	free(packed);
+	*status = rc;
+}
+
+/* Pack stateLength ints into a buffer of 4 bytes per element with
+ * convertIntArrayToBytes and write that buffer to tgtFilePath.
+ * *status receives the result reported by writeByteData. */
+void writeIntData_inBytes(int *states, size_t stateLength, char *tgtFilePath, int *status)
+{
+	int rc = SZ_SCES;
+	const size_t nBytes = stateLength*4;
+	unsigned char* packed = (unsigned char*)malloc(nBytes*sizeof(char));
+
+	convertIntArrayToBytes(states, stateLength, packed);
+	writeByteData(packed, nBytes, tgtFilePath, &rc);
+
+	free(packed);
+	*status = rc;
+}
+
+/* Pack stateLength unsigned ints into a buffer of 4 bytes per element with
+ * convertUIntArrayToBytes and write that buffer to tgtFilePath.
+ * *status receives the result reported by writeByteData. */
+void writeUIntData_inBytes(unsigned int *states, size_t stateLength, char *tgtFilePath, int *status)
+{
+	int rc = SZ_SCES;
+	const size_t nBytes = stateLength*4;
+	unsigned char* packed = (unsigned char*)malloc(nBytes*sizeof(char));
+
+	convertUIntArrayToBytes(states, stateLength, packed);
+	writeByteData(packed, nBytes, tgtFilePath, &rc);
+
+	free(packed);
+	*status = rc;
+}
+
+/* Pack stateLength int64_t values into a buffer of 8 bytes per element with
+ * convertLongArrayToBytes and write that buffer to tgtFilePath.
+ * *status receives the result reported by writeByteData. */
+void writeLongData_inBytes(int64_t *states, size_t stateLength, char *tgtFilePath, int *status)
+{
+	int rc = SZ_SCES;
+	const size_t nBytes = stateLength*8;
+	unsigned char* packed = (unsigned char*)malloc(nBytes*sizeof(char));
+
+	convertLongArrayToBytes(states, stateLength, packed);
+	writeByteData(packed, nBytes, tgtFilePath, &rc);
+
+	free(packed);
+	*status = rc;
+}
+
+/* Pack stateLength uint64_t values into a buffer of 8 bytes per element
+ * with convertULongArrayToBytes and write that buffer to tgtFilePath.
+ * *status receives the result reported by writeByteData. */
+void writeULongData_inBytes(uint64_t *states, size_t stateLength, char *tgtFilePath, int *status)
+{
+	int rc = SZ_SCES;
+	const size_t nBytes = stateLength*8;
+	unsigned char* packed = (unsigned char*)malloc(nBytes*sizeof(char));
+
+	convertULongArrayToBytes(states, stateLength, packed);
+	writeByteData(packed, nBytes, tgtFilePath, &rc);
+
+	free(packed);
+	*status = rc;
+}
+
+/**
+ * Read a file and convert its bytes to an array of unsigned shorts.
+ *
+ * The raw bytes are loaded with readByteData and converted by
+ * convertByteDataToUShortArray (defined elsewhere).
+ *
+ * @param srcFilePath path of the file to read
+ * @param dataLength  receives the element count (byte length / 2); 0 when
+ *                    the read fails
+ * @param status      receives the status reported by readByteData
+ * @return array produced by convertByteDataToUShortArray, or NULL when the
+ *         file could not be read
+ */
+unsigned short* readShortData(char *srcFilePath, size_t *dataLength, int *status)
+{
+	size_t byteLength = 0;
+	int state = SZ_SCES;
+	unsigned char * bytes = readByteData(srcFilePath, &byteLength, &state);
+	if(state == SZ_FERR)
+	{
+		/* Nothing was read: do not hand a missing buffer to the converter. */
+		*dataLength = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+	*dataLength = byteLength/2;
+	unsigned short* states = convertByteDataToUShortArray(bytes, byteLength);
+	free(bytes);
+	*status = state;
+	return states;
+}
+
+/**
+ * Write an array of strings to a file, one string per line.
+ *
+ * Each string is written directly to the stream followed by a newline, so
+ * there is no limit on the length of an individual string.
+ *
+ * @param nbStr       number of strings in str; values <= 0 write nothing
+ * @param str         strings to write
+ * @param tgtFilePath path of the output file (opened in "wb" mode)
+ * @param status      receives SZ_FERR if the file cannot be opened,
+ *                    SZ_SCES otherwise
+ */
+void writeStrings(int nbStr, char *str[], char *tgtFilePath, int *status)
+{
+	/* Signed index: an unsigned one would turn a negative nbStr into a huge bound. */
+	int i;
+	FILE *pFile = fopen(tgtFilePath, "wb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 3\n");
+		*status = SZ_FERR;
+		return;
+	}
+
+	for(i = 0;i<nbStr;i++)
+	{
+		/* Stream the string itself instead of formatting it into a fixed-size buffer. */
+		fputs(str[i], pFile);
+		fputc('\n', pFile);
+	}
+
+	fclose(pFile);
+	*status = SZ_SCES;
+}
+
+/*
+//@deprecated
+//binToPFM_float is to convert the floating-point data to PFM supported by Jpeg XT
+//But wrong version!
+//In order to do the conversion, we need to use https://github.com/thorfdbg/difftest_ng according to Thomas Richter.
+
+
+void convertToPFM_float(float *data, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, int endianType, char *tgtFilePath, int *status)
+{
+	size_t i, nbEle = computeDataLength(r5, r4, r3, r2, r1);
+	int dim = computeDimension(r5, r4, r3, r2, r1);
+	
+	FILE *pFile = fopen(tgtFilePath, "wb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 3\n");
+		*status = SZ_NSCS;
+		return;
+	}	
+	fputs("PF\n", pFile);
+	char strBuf[256];
+	switch(dim)
+	{
+	case 1: 
+		sprintf(strBuf, "%zu\n", r1);
+		break;
+	case 2:
+		sprintf(strBuf, "%zu %zu\n", r1, r2);
+		break;
+	case 3:
+		sprintf(strBuf, "%zu %zu %zu\n", r1, r2, r3);
+		break;
+	case 4:
+		sprintf(strBuf, "%zu %zu %zu %zu\n", r1, r2, r3, r4);
+		break;
+	case 5:
+		sprintf(strBuf, "%zu %zu %zu %zu %zu\n", r1, r2, r3, r4, r5);
+		break;
+	}
+	fputs(strBuf, pFile);
+	if(endianType==LITTLE_ENDIAN)
+		fputs("-1.0\n", pFile);
+	else
+		fputs("1.0\n", pFile);
+
+	size_t byteLength = nbEle*sizeof(float);	
+	lfloat buf;	
+	unsigned char* bytes = (unsigned char*)malloc(byteLength);
+	for(i=0;i<nbEle;i++)
+	{
+		buf.value = data[i];
+		bytes[i*4+0] = buf.byte[0];
+		bytes[i*4+1] = buf.byte[1];
+		bytes[i*4+2] = buf.byte[2];
+		bytes[i*4+3] = buf.byte[3];
+	}
+	
+	fwrite(bytes, 1, byteLength, pFile); //write outSize bytes
+	fclose(pFile);
+	
+	free(bytes);
+	*status = SZ_SCES;
+}*/
diff --git a/deps/SZ/sz/src/rw_interface.F90 b/deps/SZ/sz/src/rw_interface.F90
new file mode 100644
index 0000000000000000000000000000000000000000..b7e8726bbca1b27a8ff43e2ccce2bc4eee091ad6
--- /dev/null
+++ b/deps/SZ/sz/src/rw_interface.F90
@@ -0,0 +1,205 @@
+!  @file   rw_interface.F90
+!  @author Sheng Di (disheng222@gmail.com)
+!  @date   Aug., 2014
+!  @ Mathematics and Computer Science (MCS)
+!  @ Argonne National Laboratory, Lemont, USA.
+!  @brief  The key Fortran binding file to connect C language and Fortran (Fortran part)
+
+
+! Fortran front end for the C file-I/O helpers in rwf.c.
+! Provides generic writeData/readData interfaces; each specific procedure
+! forwards to an external C routine, passing the length of the trimmed path.
+MODULE RW
+	use :: ISO_C_BINDING
+
+	! Generic binary writer: one byte-array variant plus REAL(4)/REAL(8)
+	! variants for ranks 1 to 5.
+	INTERFACE writeData
+		MODULE PROCEDURE WriteData_inBinary_d1_INTEGER_K1
+		MODULE PROCEDURE WriteData_inBinary_d1_REAL_K4
+		MODULE PROCEDURE WriteData_inBinary_d2_REAL_K4
+		MODULE PROCEDURE WriteData_inBinary_d3_REAL_K4
+		MODULE PROCEDURE WriteData_inBinary_d4_REAL_K4
+		MODULE PROCEDURE WriteData_inBinary_d5_REAL_K4
+		MODULE PROCEDURE WriteData_inBinary_d1_REAL_K8
+		MODULE PROCEDURE WriteData_inBinary_d2_REAL_K8
+		MODULE PROCEDURE WriteData_inBinary_d3_REAL_K8
+		MODULE PROCEDURE WriteData_inBinary_d4_REAL_K8
+		MODULE PROCEDURE WriteData_inBinary_d5_REAL_K8
+	END INTERFACE writeData
+
+	! Generic reader: resolves on the type of the allocatable output array.
+	INTERFACE readData
+		MODULE PROCEDURE readByteData
+		MODULE PROCEDURE readFloatData
+		MODULE PROCEDURE readDoubleData
+	END INTERFACE readData
+
+	CONTAINS
+
+	!Bytes here could be an "allocatable" array, so it requires an extra "byteLength" to indicate the length (can't use size(Bytes))
+	SUBROUTINE WriteData_inBinary_d1_INTEGER_K1(Bytes, byteLength, FILE_PATH)
+		implicit none
+		INTEGER(KIND=1), DIMENSION(:) :: Bytes
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER(KIND=C_SIZE_T) :: byteLength
+
+		CALL writeByteFile(Bytes, byteLength, FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d1_INTEGER_K1
+
+!write data in binary for K4 data
+!The element count is converted to C_SIZE_T before the call.
+!NOTE(review): assumes writefloatfile_ takes size_t* like writedoublefile_ - confirm.
+
+	SUBROUTINE WriteData_inBinary_d1_REAL_K4(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=4), DIMENSION(:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+
+		CALL writeFloatFile(VAR, INT(nbEle, KIND=C_SIZE_T), FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d1_REAL_K4
+
+	SUBROUTINE WriteData_inBinary_d2_REAL_K4(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+
+		! Flatten to rank 1 before handing the data to C.
+		CALL writeFloatFile(RESHAPE(VAR,(/nbEle/)), INT(nbEle, KIND=C_SIZE_T), FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d2_REAL_K4
+
+	SUBROUTINE WriteData_inBinary_d3_REAL_K4(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+
+		CALL writeFloatFile(RESHAPE(VAR,(/nbEle/)), INT(nbEle, KIND=C_SIZE_T), FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d3_REAL_K4
+
+	SUBROUTINE WriteData_inBinary_d4_REAL_K4(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+
+		CALL writeFloatFile(RESHAPE(VAR,(/nbEle/)), INT(nbEle, KIND=C_SIZE_T), FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d4_REAL_K4
+
+	SUBROUTINE WriteData_inBinary_d5_REAL_K4(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:,:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+
+		CALL writeFloatFile(RESHAPE(VAR,(/nbEle/)), INT(nbEle, KIND=C_SIZE_T), FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d5_REAL_K4
+
+!write data in binary for K8 data
+!The C routine writedoublefile_ takes the element count as size_t*, so the
+!default-kind count is converted to C_SIZE_T before the call.
+
+	SUBROUTINE WriteData_inBinary_d1_REAL_K8(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=8), DIMENSION(:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+
+		CALL writeDoubleFile(VAR, INT(nbEle, KIND=C_SIZE_T), FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d1_REAL_K8
+
+	SUBROUTINE WriteData_inBinary_d2_REAL_K8(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+
+		CALL writeDoubleFile(RESHAPE(VAR,(/nbEle/)), INT(nbEle, KIND=C_SIZE_T), FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d2_REAL_K8
+
+	! Unlike its siblings this variant takes no nbEle argument, so the
+	! element count is derived from the array itself.
+	SUBROUTINE WriteData_inBinary_d3_REAL_K8(VAR, FILE_PATH)
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+
+		nbEle = SIZE(VAR)
+		CALL writeDoubleFile(RESHAPE(VAR,(/nbEle/)), INT(nbEle, KIND=C_SIZE_T), FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d3_REAL_K8
+
+	SUBROUTINE WriteData_inBinary_d4_REAL_K8(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+
+		CALL writeDoubleFile(RESHAPE(VAR,(/nbEle/)), INT(nbEle, KIND=C_SIZE_T), FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d4_REAL_K8
+
+	SUBROUTINE WriteData_inBinary_d5_REAL_K8(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:,:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+
+		CALL writeDoubleFile(RESHAPE(VAR,(/nbEle/)), INT(nbEle, KIND=C_SIZE_T), FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d5_REAL_K8
+
+!Check file size
+	! Returns the size of FILE_PATH in BYTESIZE, as reported by checkFileSizeC.
+	SUBROUTINE checkFileSize(FILE_PATH, BYTESIZE)
+		implicit none
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER(kind=C_SIZE_T) :: BYTESIZE
+
+		CALL checkFileSizeC(FILE_PATH, len(trim(FILE_PATH)), BYTESIZE)
+	END SUBROUTINE checkFileSize
+
+!Read data
+	! Reads the whole file into Bytes (allocated here); outSize is its length in bytes.
+	SUBROUTINE readByteData(FILE_PATH, Bytes, outSize)
+		implicit none
+		INTEGER(KIND=1), DIMENSION(:), allocatable :: temp
+		INTEGER(KIND=1), DIMENSION(:), allocatable :: Bytes
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER(kind=C_SIZE_T) :: COUNTER
+		INTEGER(kind=C_SIZE_T), intent(out) :: outSize !in bytes
+		
+		CALL checkFileSize(FILE_PATH, outSize)
+		allocate(temp(outSize))
+
+		CALL readByteFile(FILE_PATH, len(trim(FILE_PATH)), temp, outSize)
+		allocate(Bytes(outSize))
+		DO COUNTER=1,outSize,1
+			Bytes(COUNTER) = temp(COUNTER)
+		END DO
+		deallocate(temp)
+	END SUBROUTINE readByteData
+
+	! Reads the whole file into VAR (allocated here) as REAL(4) values;
+	! nbEle is the element count (file size / 4).
+	SUBROUTINE readFloatData(FILE_PATH, VAR, nbEle)
+		implicit none
+		REAL(KIND=4), DIMENSION(:), allocatable :: temp
+		REAL(KIND=4), DIMENSION(:), allocatable :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER(kind=C_SIZE_T) :: COUNTER, fileSize
+		INTEGER(kind=C_SIZE_T), intent(out) :: nbEle
+
+		CALL checkFileSize(FILE_PATH, fileSize)
+		nbEle = fileSize/4
+		allocate(temp(nbEle))
+		
+		CALL readFloatFile(FILE_PATH, len(trim(FILE_PATH)), temp, nbEle)
+		allocate(VAR(nbEle))
+		! Both arrays hold nbEle elements; looping to fileSize would run past their end.
+		DO COUNTER=1,nbEle,1
+			VAR(COUNTER) = temp(COUNTER)
+		END DO		
+		deallocate(temp)
+	END SUBROUTINE readFloatData
+
+	! Reads the whole file into VAR (allocated here) as REAL(8) values;
+	! nbEle is the element count (file size / 8).
+	SUBROUTINE readDoubleData(FILE_PATH, VAR, nbEle)
+		implicit none
+		REAL(KIND=8), DIMENSION(:), allocatable :: temp
+		REAL(KIND=8), DIMENSION(:), allocatable :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER(kind=C_SIZE_T) :: COUNTER, fileSize
+		INTEGER(kind=C_SIZE_T), intent(out) :: nbEle
+
+		CALL checkFileSize(FILE_PATH, fileSize)
+		nbEle = fileSize/8
+		allocate(temp(nbEle))
+	
+		CALL readDoubleFile(FILE_PATH, len(trim(FILE_PATH)), temp, nbEle)
+		allocate(VAR(nbEle))
+		! Both arrays hold nbEle elements; looping to fileSize would run past their end.
+		DO COUNTER=1,nbEle,1
+			VAR(COUNTER) = temp(COUNTER)
+		END DO		
+		deallocate(temp)		
+	END SUBROUTINE readDoubleData
+
+END MODULE RW
diff --git a/deps/SZ/sz/src/rwf.c b/deps/SZ/sz/src/rwf.c
new file mode 100644
index 0000000000000000000000000000000000000000..17e0fb4b40d31af385e3fdda0ee72f178605ec15
--- /dev/null
+++ b/deps/SZ/sz/src/rwf.c
@@ -0,0 +1,96 @@
+/**
+ *  @file rwf.c
+ *  @author Sheng Di
+ *  @date April, 2015
+ *  @brief I/O interface for Fortran
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "rw.h"
+
+void checkfilesizec_(char *srcFilePath, int *len, size_t *filesize)
+{
+	/* The path arrives as a buffer plus explicit length, so make a
+	 * NUL-terminated copy before passing it to checkFileSize(). */
+	int status; /* filled in by checkFileSize(); not examined here */
+	char path[*len+1];
+	memcpy(path, srcFilePath, *len);
+	path[*len] = '\0';
+	*filesize = checkFileSize(path, &status);
+}
+
+void readbytefile_(char *srcFilePath, int *len, unsigned char *bytes, size_t *byteLength)
+{
+	/* Load the file named by the length-delimited srcFilePath into bytes. */
+	int ierr; /* status out-param required by readByteData() */
+	char s[*len+1];
+	memcpy(s, srcFilePath, *len); /* replaces the index loop that compared a size_t with an int */
+	s[*len]='\0';
+	unsigned char *tmp_bytes = readByteData(s, byteLength, &ierr);
+	if(tmp_bytes != NULL) /* skip the copy if readByteData() returned no buffer */
+		memcpy(bytes, tmp_bytes, *byteLength);
+	free(tmp_bytes);
+}
+
+void readdoublefile_(char *srcFilePath, int *len, double *data, size_t *nbEle)
+{
+	/* Load doubles from the file named by the length-delimited srcFilePath into data. */
+	int ierr; /* status out-param required by readDoubleData() */
+	char s[*len+1];
+	memcpy(s, srcFilePath, *len);
+	s[*len]='\0';
+	double *tmp_data = readDoubleData(s, nbEle, &ierr);
+	if(tmp_data != NULL) /* skip the copy if readDoubleData() returned no buffer */
+		memcpy(data, tmp_data, *nbEle * sizeof(double)); /* *nbEle counts elements; memcpy needs bytes */
+	free(tmp_data);
+}
+
+void readfloatfile_(char *srcFilePath, int *len, float *data, size_t *nbEle)
+{
+	/* Load floats from the file named by the length-delimited srcFilePath into data. */
+	int ierr; /* status out-param required by readFloatData() */
+	char s[*len+1];
+	memcpy(s, srcFilePath, *len);
+	s[*len]='\0';
+	float *tmp_data = readFloatData(s, nbEle, &ierr);
+	if(tmp_data != NULL) /* skip the copy if readFloatData() returned no buffer */
+		memcpy(data, tmp_data, *nbEle * sizeof(float)); /* *nbEle counts elements; memcpy needs bytes */
+	free(tmp_data);
+}
+
+void writebytefile_(unsigned char *bytes, size_t *byteLength, char *tgtFilePath, int *len)
+{
+	/* Forward bytes and *byteLength to writeByteData(), using a
+	 * NUL-terminated copy of the length-delimited tgtFilePath. */
+	int ierr; /* status out-param required by writeByteData() */
+	char target[*len+1];
+	memcpy(target, tgtFilePath, *len);
+	target[*len] = '\0';
+	writeByteData(bytes, *byteLength, target, &ierr);
+}
+
+void writedoublefile_(double *data, size_t *nbEle, char *tgtFilePath, int *len)
+{
+	/* Forward data and *nbEle to writeDoubleData(), using a
+	 * NUL-terminated copy of the length-delimited tgtFilePath. */
+	int ierr; /* status out-param required by writeDoubleData() */
+	char target[*len+1];
+	memcpy(target, tgtFilePath, *len);
+	target[*len] = '\0';
+	writeDoubleData(data, *nbEle, target, &ierr);
+}
+
+void writefloatfile_(float *data, size_t *nbEle, char *tgtFilePath, int *len)
+{
+	/* Forward data and *nbEle to writeFloatData(), using a
+	 * NUL-terminated copy of the length-delimited tgtFilePath. */
+	int ierr; /* status out-param required by writeFloatData() */
+	char target[*len+1];
+	memcpy(target, tgtFilePath, *len);
+	target[*len] = '\0';
+	writeFloatData(data, *nbEle, target, &ierr);
+}
diff --git a/deps/SZ/sz/src/sz.c b/deps/SZ/sz/src/sz.c
new file mode 100644
index 0000000000000000000000000000000000000000..0567a6fc4486bdced2180a8175e5fea292c5ce4e
--- /dev/null
+++ b/deps/SZ/sz/src/sz.c
@@ -0,0 +1,1353 @@
+/**
+ *  @file sz.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief SZ_Init, Compression and Decompression functions
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageD.h"
+#include "TightDataPointStorageF.h"
+#include "zlib.h"
+#include "rw.h"
+#include "Huffman.h"
+#include "conf.h"
+#include "utility.h"
+#include "exafelSZ.h"
+//#include "CurveFillingCompressStorage.h"
+
+int versionNumber[4] = {SZ_VER_MAJOR,SZ_VER_MINOR,SZ_VER_BUILD,SZ_VER_REVISION};
+//int SZ_SIZE_TYPE = 8;
+
+int dataEndianType = LITTLE_ENDIAN_DATA; //endian type of the data read from disk
+int sysEndianType; //not initialised here; assigned at run time (e.g. by the endianness probe in SZ_decompress)
+
+//the confparams are kept separate for compression and decompression, so that alternating compression/decompression calls do not affect each other
+sz_params *confparams_cpr = NULL; //used for compression
+sz_params *confparams_dec = NULL; //used for decompression 
+
+sz_exedata *exe_params = NULL; //shared by the compression and decompression paths in this file
+
+/*the following global variables are designed for time-series based compression*/
+/*sz_varset is not used in the single-snapshot data compression*/
+SZ_VarSet* sz_varset = NULL;
+sz_multisteps *multisteps = NULL;
+sz_tsc_metadata *sz_tsc = NULL;
+
+//only for Pastri compressor
+#ifdef PASTRI
+pastri_params pastri_par;
+#endif
+
+HuffmanTree* SZ_Reset() /* returns whatever createDefaultHuffmanTree() produces */
+{
+	return createDefaultHuffmanTree(); /* no other state is touched here */
+}
+
+int SZ_Init(const char *configFilePath)
+{
+	/* Load the configuration, then finish global setup. Returns SZ_NSCS
+	 * when SZ_LoadConf() reports SZ_NSCS, SZ_SCES otherwise. */
+	if(SZ_LoadConf(configFilePath) == SZ_NSCS)
+		return SZ_NSCS;
+
+	/* Record the width of size_t used by this build. */
+	exe_params->SZ_SIZE_TYPE = sizeof(size_t);
+
+	if(SZ_TEMPORAL_COMPRESSION == confparams_cpr->szMode)
+		initSZ_TSC(); /* only in the temporal-compression mode */
+	return SZ_SCES;
+}
+
+int SZ_Init_Params(sz_params *params)
+{
+	/* Initialise SZ via SZ_Init(NULL), then overwrite the compression settings
+	 * with *params (which is normalised in place first). Returns SZ_NSCS if
+	 * initialisation fails or quantization_intervals is odd, else SZ_SCES. */
+	if(params == NULL || SZ_Init(NULL) == SZ_NSCS || confparams_cpr == NULL)
+		return SZ_NSCS; /* nothing valid to copy from / into */
+
+	if(params->losslessCompressor!=GZIP_COMPRESSOR && params->losslessCompressor!=ZSTD_COMPRESSOR)
+		params->losslessCompressor = ZSTD_COMPRESSOR;
+	if(params->max_quant_intervals > 0)
+		params->maxRangeRadius = params->max_quant_intervals/2;
+	memcpy(confparams_cpr, params, sizeof(sz_params)); /* copied before the check below, as before */
+	if(params->quantization_intervals%2!=0)
+	{
+		printf("Error: quantization_intervals must be an even number!\n");
+		return SZ_NSCS;
+	}
+	return SZ_SCES;
+}
+
+int computeDimension(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	/*
+	 * Number of dimensions described by the extents: the count of leading
+	 * non-zero values when scanning r1, r2, r3, r4, r5 in that order.
+	 * The first zero ends the scan, so later extents are not looked at:
+	 *   r1 == 0 -> 0,  r2 == 0 -> 1,  r3 == 0 -> 2,
+	 *   r4 == 0 -> 3,  r5 == 0 -> 4,  otherwise 5.
+	 * Note the argument order: r5 comes first, r1 last.
+	 */
+
+	if(r1 == 0)
+		return 0;
+
+	if(r2 == 0)
+		return 1;
+
+	if(r3 == 0)
+		return 2;
+
+	if(r4 == 0)
+		return 3;
+
+	if(r5 == 0)
+		return 4;
+
+	/* every extent is non-zero */
+	return 5;
+}
+
+size_t computeDataLength(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	/*
+	 * Total element count: the product of the leading non-zero extents,
+	 * scanning r1, r2, r3, r4, r5 in that order and stopping at the first
+	 * zero. With r1 == 0 the result is 0. The product is accumulated in
+	 * size_t, so it wraps like any other unsigned multiplication.
+	 * Note the argument order: r5 comes first, r1 last.
+	 */
+	size_t dataLength = r1;
+
+	if(r1 == 0 || r2 == 0)
+		return dataLength;
+	dataLength *= r2;
+
+	if(r3 == 0)
+		return dataLength;
+	dataLength *= r3;
+
+	if(r4 == 0)
+		return dataLength;
+	dataLength *= r4;
+
+	if(r5 == 0)
+		return dataLength;
+	dataLength *= r5;
+
+	return dataLength;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+    @brief      Compress data, dispatching to the SZ_compress_args_<type> routine selected by dataType
+    @param      dataType,data  element type tag (SZ_FLOAT, SZ_DOUBLE, SZ_[U]INT8/16/32/64) and the data to be compressed
+    @param      outSize        the size (in bytes) after compression
+    @param      errBoundMode,absErrBound,relBoundRatio,pwrBoundRatio  error-bound settings forwarded to the type routine (pwrBoundRatio: float/double only)
+    @param		r5,r4,r3,r2,r1	the sizes of each dimension (supporting only 5 dimensions at most in this version).
+    @return     compressed data (in binary stream) or NULL(0) if any errors
+ **/
+/*-------------------------------------------------------------------------*/
+unsigned char* SZ_compress_args(int dataType, void *data, size_t *outSize, int errBoundMode, double absErrBound, 
+double relBoundRatio, double pwrBoundRatio, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	if(confparams_cpr == NULL)
+		SZ_Init(NULL);
+	else if(exe_params == NULL)
+		exe_params = (sz_exedata*)calloc(1, sizeof(sz_exedata)); /* zero-filled, like the former malloc+memset */
+	/* Bail out instead of dereferencing NULL when SZ_Init() or the allocation failed. */
+	if(confparams_cpr == NULL || exe_params == NULL)
+		return NULL;
+	if(exe_params->intvCapacity == 0)
+	{
+		exe_params->intvCapacity = confparams_cpr->maxRangeRadius*2;
+		exe_params->intvRadius = confparams_cpr->maxRangeRadius;
+		exe_params->optQuantMode = 1;		
+	}
+	
+	confparams_cpr->dataType = dataType; /* recorded even when dataType turns out to be unsupported */
+	if(dataType==SZ_FLOAT)
+	{
+		unsigned char *newByteData = NULL;
+		/* every branch starts from NULL so a routine that stores nothing cannot leak an indeterminate pointer */
+		SZ_compress_args_float(-1, confparams_cpr->withRegression, &newByteData, (float *)data, r5, r4, r3, r2, r1, 
+		outSize, errBoundMode, absErrBound, relBoundRatio, pwrBoundRatio);
+		
+		return newByteData;
+	}
+	else if(dataType==SZ_DOUBLE)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_double(-1, confparams_cpr->withRegression, &newByteData, (double *)data, r5, r4, r3, r2, r1, 
+		outSize, errBoundMode, absErrBound, relBoundRatio, pwrBoundRatio);
+		
+		return newByteData;
+	}
+	else if(dataType==SZ_INT64)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_int64(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;
+	}		
+	else if(dataType==SZ_INT32) //int type
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_int32(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;
+	}
+	else if(dataType==SZ_INT16)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_int16(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;		
+	}
+	else if(dataType==SZ_INT8)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_int8(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;
+	}
+	else if(dataType==SZ_UINT64)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_uint64(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;
+	}		
+	else if(dataType==SZ_UINT32) //int type
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_uint32(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;
+	}
+	else if(dataType==SZ_UINT16)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_uint16(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;		
+	}
+	else if(dataType==SZ_UINT8)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_uint8(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;
+	} 	
+	else
+	{
+		printf("Error: dataType can only be SZ_FLOAT, SZ_DOUBLE, SZ_INT8/16/32/64 or SZ_UINT8/16/32/64.\n");
+		return NULL;
+	}
+}
+
+int SZ_compress_args2(int dataType, void *data, unsigned char* compressed_bytes, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, double pwrBoundRatio, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	/* Compress into the caller-provided compressed_bytes; returns SZ_NSCS when SZ_compress_args() yields no data, else SZ_SCES. */
+	unsigned char* bytes = SZ_compress_args(dataType, data, outSize, errBoundMode, absErrBound, relBoundRatio, pwrBoundRatio, r5, r4, r3, r2, r1);
+	if(bytes == NULL) return SZ_NSCS; /* e.g. unsupported dataType: there is nothing to copy */
+	memcpy(compressed_bytes, bytes, *outSize);
+	free(bytes);
+	return SZ_SCES;
+}
+
+int SZ_compress_args3(int dataType, void *data, unsigned char* compressed_bytes, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1)
+{
+	/* Sub-block compression entry point: forwards the r*, s* and e* argument
+	 * groups unchanged to the float or double sub-block routine chosen by
+	 * dataType. Returns SZ_SCES after dispatching, or SZ_NSCS (with a message)
+	 * for any other dataType. */
+	confparams_cpr->dataType = dataType;
+
+	switch(dataType)
+	{
+	case SZ_FLOAT:
+		SZ_compress_args_float_subblock(compressed_bytes, (float *)data, 
+		r5, r4, r3, r2, r1,
+		s5, s4, s3, s2, s1,
+		e5, e4, e3, e2, e1,
+		outSize, errBoundMode, absErrBound, relBoundRatio);
+		return SZ_SCES;
+	case SZ_DOUBLE:
+		SZ_compress_args_double_subblock(compressed_bytes, (double *)data, 
+		r5, r4, r3, r2, r1,
+		s5, s4, s3, s2, s1,
+		e5, e4, e3, e2, e1,
+		outSize, errBoundMode, absErrBound, relBoundRatio);
+		return SZ_SCES;
+	default:
+		printf("Error (in SZ_compress_args3): dataType can only be SZ_FLOAT or SZ_DOUBLE.\n");
+		return SZ_NSCS;
+	}
+}
+
+unsigned char *SZ_compress(int dataType, void *data, size_t *outSize, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{	/* Compress using the error-bound settings stored in confparams_cpr; returns NULL on failure. */
+	if(confparams_cpr == NULL) SZ_Init(NULL); /* same fallback SZ_compress_args() uses, needed here because the fields are read below */
+	if(confparams_cpr == NULL) return NULL; /* initialisation did not provide a configuration */
+	return SZ_compress_args(dataType, data, outSize, confparams_cpr->errorBoundMode, confparams_cpr->absErrBound, confparams_cpr->relBoundRatio, confparams_cpr->pw_relBoundRatio, r5, r4, r3, r2, r1);
+}
+
+//////////////////
+/*-------------------------------------------------------------------------*/
+/**
+    @brief      Placeholder for compression with a reserved value (not implemented)
+    @param      data           data to be compressed (unused)
+    @param		reservedValue  the reserved value (unused)
+    @param      outSize        the size (in bytes) after compression (never written)
+    @param		r5,r4,r3,r2,r1	the sizes of each dimension (unused)
+    @return     does not return: prints a notice and calls exit(0)
+
+ **/
+/*-------------------------------------------------------------------------*/
+unsigned char *SZ_compress_rev_args(int dataType, void *data, void *reservedValue, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	/* Not implemented yet: announce it and terminate the process. */
+	printf("SZ compression with reserved data is TO BE DONE LATER.\n");
+	exit(0);
+
+	/* Unreachable; present only so the function has a return statement. */
+	return NULL;
+}
+
+int SZ_compress_rev_args2(int dataType, void *data, void *reservedValue, unsigned char* compressed_bytes, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	/* Copy-out wrapper around SZ_compress_rev_args(): records dataType, copies *outSize bytes into compressed_bytes, frees the temporary, returns 0. */
+	confparams_cpr->dataType = dataType;
+	unsigned char* revBytes = SZ_compress_rev_args(dataType, data, reservedValue, outSize, errBoundMode, absErrBound, relBoundRatio, r5, r4, r3, r2, r1);
+	memcpy(compressed_bytes, revBytes, *outSize);
+	free(revBytes);
+	return 0;
+}
+
+unsigned char *SZ_compress_rev(int dataType, void *data, void *reservedValue, size_t *outSize, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	/* Not implemented yet: announce it and terminate the process. */
+	printf("SZ compression with reserved data is TO BE DONE LATER.\n");
+	exit(0);
+
+	/* Unreachable; present only so the function has a return statement. */
+	return NULL;
+}
+
+void *SZ_decompress(int dataType, unsigned char *bytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	/* Decompress bytes into a buffer produced by the SZ_decompress_args_<type> routine selected by dataType.
+	 * Resets the globals confparams_dec and exe_params on every call; returns NULL for an unknown dataType. */
+	if(!confparams_dec)
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+	memset(confparams_dec, 0, sizeof(sz_params));
+	if(!exe_params)
+		exe_params = (sz_exedata*)malloc(sizeof(sz_exedata));
+	memset(exe_params, 0, sizeof(sz_exedata));
+	exe_params->SZ_SIZE_TYPE = 8;
+
+	/* Endianness probe: look at the first byte of an int holding 1. */
+	const int probe = 1;
+	sysEndianType = (*(const char*)&probe == 1) ? LITTLE_ENDIAN_SYSTEM : BIG_ENDIAN_SYSTEM;
+
+	switch(dataType)
+	{
+	case SZ_FLOAT:
+	{
+		float *decoded;
+		SZ_decompress_args_float(&decoded, r5, r4, r3, r2, r1, bytes, byteLength, 0, NULL);
+		return decoded;
+	}
+	case SZ_DOUBLE:
+	{
+		double *decoded;
+		SZ_decompress_args_double(&decoded, r5, r4, r3, r2, r1, bytes, byteLength, 0, NULL);
+		return decoded;
+	}
+	case SZ_INT8:
+	{
+		int8_t *decoded;
+		SZ_decompress_args_int8(&decoded, r5, r4, r3, r2, r1, bytes, byteLength);
+		return decoded;
+	}
+	case SZ_INT16:
+	{
+		int16_t *decoded;
+		SZ_decompress_args_int16(&decoded, r5, r4, r3, r2, r1, bytes, byteLength);
+		return decoded;
+	}
+	case SZ_INT32:
+	{
+		int32_t *decoded;
+		SZ_decompress_args_int32(&decoded, r5, r4, r3, r2, r1, bytes, byteLength);
+		return decoded;
+	}
+	case SZ_INT64:
+	{
+		int64_t *decoded;
+		SZ_decompress_args_int64(&decoded, r5, r4, r3, r2, r1, bytes, byteLength);
+		return decoded;
+	}
+	case SZ_UINT8:
+	{
+		uint8_t *decoded;
+		SZ_decompress_args_uint8(&decoded, r5, r4, r3, r2, r1, bytes, byteLength);
+		return decoded;
+	}
+	case SZ_UINT16:
+	{
+		uint16_t *decoded;
+		SZ_decompress_args_uint16(&decoded, r5, r4, r3, r2, r1, bytes, byteLength);
+		return decoded;
+	}
+	case SZ_UINT32:
+	{
+		uint32_t *decoded;
+		SZ_decompress_args_uint32(&decoded, r5, r4, r3, r2, r1, bytes, byteLength);
+		return decoded;
+	}
+	case SZ_UINT64:
+	{
+		uint64_t *decoded;
+		SZ_decompress_args_uint64(&decoded, r5, r4, r3, r2, r1, bytes, byteLength);
+		return decoded;
+	}
+	default:
+		printf("Error: data type cannot be the types other than SZ_FLOAT or SZ_DOUBLE\n");
+		return NULL;	
+	}
+}
+
+/**
+ * Decompress bytes into the caller-provided decompressed_array.
+ *
+ * The element count is computeDataLength(r5,r4,r3,r2,r1); that many elements
+ * of the type named by dataType are copied into decompressed_array, and the
+ * intermediate buffer returned by SZ_decompress() is freed.
+ *
+ * return number of elements, or SZ_NSCS (converted to size_t) for an unknown dataType
+ * */
+size_t SZ_decompress_args(int dataType, unsigned char *bytes, size_t byteLength, void* decompressed_array, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	size_t nbEle = computeDataLength(r5,r4,r3,r2,r1);
+
+	switch(dataType)
+	{
+	case SZ_FLOAT:
+	{
+		float* decoded = (float *)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		memcpy(decompressed_array, decoded, nbEle*sizeof(float));
+		free(decoded); //this free operation seems to not work with BlueG/Q system.
+		break;
+	}
+	case SZ_DOUBLE:
+	{
+		double* decoded = (double *)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		memcpy(decompressed_array, decoded, nbEle*sizeof(double));
+		free(decoded); //this free operation seems to not work with BlueG/Q system.
+		break;
+	}
+	case SZ_INT8:
+	{
+		int8_t* decoded = (int8_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		memcpy(decompressed_array, decoded, nbEle*sizeof(int8_t));
+		free(decoded);
+		break;
+	}
+	case SZ_INT16:
+	{
+		int16_t* decoded = (int16_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		memcpy(decompressed_array, decoded, nbEle*sizeof(int16_t));
+		free(decoded);
+		break;
+	}
+	case SZ_INT32:
+	{
+		int32_t* decoded = (int32_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		memcpy(decompressed_array, decoded, nbEle*sizeof(int32_t));
+		free(decoded);
+		break;
+	}
+	case SZ_INT64:
+	{
+		int64_t* decoded = (int64_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		memcpy(decompressed_array, decoded, nbEle*sizeof(int64_t));
+		free(decoded);
+		break;
+	}
+	case SZ_UINT8:
+	{
+		uint8_t* decoded = (uint8_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		memcpy(decompressed_array, decoded, nbEle*sizeof(uint8_t));
+		free(decoded);
+		break;
+	}
+	case SZ_UINT16:
+	{
+		uint16_t* decoded = (uint16_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		memcpy(decompressed_array, decoded, nbEle*sizeof(uint16_t));
+		free(decoded);
+		break;
+	}
+	case SZ_UINT32:
+	{
+		uint32_t* decoded = (uint32_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		memcpy(decompressed_array, decoded, nbEle*sizeof(uint32_t));
+		free(decoded);
+		break;
+	}
+	case SZ_UINT64:
+	{
+		uint64_t* decoded = (uint64_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		memcpy(decompressed_array, decoded, nbEle*sizeof(uint64_t));
+		free(decoded);
+		break;
+	}
+	default:
+		printf("Error: data type cannot be the types other than SZ_FLOAT or SZ_DOUBLE\n");
+		return SZ_NSCS; //indicating error		
+	}
+
+	return nbEle;
+}
+
+
+sz_metadata* SZ_getMetadata(unsigned char* bytes) /* Parse the header at the start of a compressed stream into a newly malloc'ed sz_metadata. */
+{
+	int index = 0, i, isConstant, isLossless; /* index = read offset into bytes */
+	size_t dataSeriesLength = 0;
+	int versions[3] = {0,0,0};
+	for (i = 0; i < 3; i++)
+		versions[i] = bytes[index++]; //3 version bytes
+	unsigned char sameRByte = bytes[index++]; //1 flag byte, decoded bit by bit below
+	isConstant = sameRByte & 0x01;
+	//confparams_dec->szMode = (sameRByte & 0x06)>>1;
+	isLossless = (sameRByte & 0x10)>>4;
+	
+	int isRegressionBased = (sameRByte >> 7) & 0x01;
+	/* Side effect: the globals exe_params and confparams_dec are allocated on demand and then written to. */
+	if(exe_params==NULL)
+	{
+		exe_params = (sz_exedata *)malloc(sizeof(struct sz_exedata));
+		memset(exe_params, 0, sizeof(struct sz_exedata));
+	}
+	exe_params->SZ_SIZE_TYPE = ((sameRByte & 0x40)>>6)==1?8:4; /* bit 6 of the flag byte selects 8- or 4-byte sizes */
+	
+	if(confparams_dec==NULL)
+	{
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+		memset(confparams_dec, 0, sizeof(sz_params));
+	}	
+	
+	convertBytesToSZParams(&(bytes[index]), confparams_dec);
+	/*sz_params* params = convertBytesToSZParams(&(bytes[index]));
+	if(confparams_dec!=NULL)
+		free(confparams_dec);
+	confparams_dec = params;*/	
+	if(confparams_dec->dataType==SZ_FLOAT)
+		index += MetaDataByteLength;
+	else if(confparams_dec->dataType==SZ_DOUBLE)
+		index += MetaDataByteLength_double;
+	/* NOTE(review): for integer types index is advanced by one byte only, not by a MetaDataByteLength - confirm against the writer. */
+	if(confparams_dec->dataType!=SZ_FLOAT && confparams_dec->dataType!= SZ_DOUBLE) //if this type is an Int type
+		index++; //jump to the dataLength info byte address
+	dataSeriesLength = bytesToSize(&(bytes[index]));// 4 or 8	
+	index += exe_params->SZ_SIZE_TYPE;
+	//index += 4; //max_quant_intervals
+
+	sz_metadata* metadata = (sz_metadata*)malloc(sizeof(struct sz_metadata)); /* returned to the caller; not freed in this function */
+	
+	metadata->versionNumber[0] = versions[0];
+	metadata->versionNumber[1] = versions[1];
+	metadata->versionNumber[2] = versions[2];
+	metadata->isConstant = isConstant;
+	metadata->isLossless = isLossless;
+	metadata->sizeType = exe_params->SZ_SIZE_TYPE;
+	metadata->dataSeriesLength = dataSeriesLength;
+	
+	metadata->conf_params = confparams_dec; /* aliases the global, it is not a copy */
+	
+	int defactoNBBins = 0; //real # bins
+	if(isConstant==0 && isLossless==0)
+	{
+		if(isRegressionBased==1)
+		{
+			unsigned char* raBytes = &(bytes[index]);
+			defactoNBBins = bytesToInt_bigEndian(raBytes + sizeof(int) + sizeof(double)); /* skips sizeof(int)+sizeof(double) bytes past index */
+		}
+		else
+		{
+			int radExpoL = 0, segmentL = 0, pwrErrBoundBytesL = 0;
+			if(metadata->conf_params->errorBoundMode >= PW_REL)
+			{
+				radExpoL = 1;
+				segmentL = exe_params->SZ_SIZE_TYPE;
+				pwrErrBoundBytesL = 4;
+			}
+			/* offset_typearray is an absolute offset from the start of bytes; index is not used on this path */
+			int mdbl = confparams_dec->dataType==SZ_FLOAT?MetaDataByteLength:MetaDataByteLength_double;
+			int offset_typearray = 3 + 1 + mdbl + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrErrBoundBytesL + 4 + (4 + confparams_dec->dataType*4) + 1 + 8 
+					+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + 4;
+			defactoNBBins = bytesToInt_bigEndian(bytes+offset_typearray);			
+		}
+
+	}	
+	
+	metadata->defactoNBBins = defactoNBBins; /* stays 0 for constant or lossless streams */
+	return metadata;
+}
+
+void SZ_printMetadata(sz_metadata* metadata) /* Print the fields of metadata, its conf_params and a few globals to stdout. */
+{
+	printf("=================SZ Compression Meta Data=================\n");
+	printf("Version:                        \t %d.%d.%d\n", metadata->versionNumber[0], metadata->versionNumber[1], metadata->versionNumber[2]);
+	printf("Constant data?:                 \t %s\n", metadata->isConstant==1?"YES":"NO");
+	printf("Lossless?:                      \t %s\n", metadata->isLossless==1?"YES":"NO");
+	printf("Size type (size of # elements): \t %d bytes\n", metadata->sizeType); 
+	printf("Num of elements:                \t %zu\n", metadata->dataSeriesLength);
+		
+	sz_params* params = metadata->conf_params;
+
+	if(params->sol_ID == SZ)
+		printf("compressor Name: 		\t SZ\n");
+	else if(params->sol_ID == SZ_Transpose)
+		printf("compressor Name: 		\t SZ_Transpose\n");
+	else
+		printf("compressor Name: 		\t Other compressor\n");
+	switch(params->dataType)
+	{
+	case SZ_FLOAT:
+		printf("Data type:                      \t FLOAT\n");
+		printf("min value of raw data:          \t %f\n", params->fmin);
+		printf("max value of raw data:          \t %f\n", params->fmax);		
+		break;
+	case SZ_DOUBLE:
+		printf("Data type:                      \t DOUBLE\n");
+		printf("min value of raw data:          \t %f\n", params->dmin);
+		printf("max value of raw data:          \t %f\n", params->dmax);	
+		break;
+	case SZ_INT8:
+		printf("Data type:                      \t INT8\n");
+		break;	
+	case SZ_INT16:
+		printf("Data type:                      \t INT16\n");
+		break;
+	case SZ_INT32:
+		printf("Data type:                      \t INT32\n");
+		break;	
+	case SZ_INT64:
+		printf("Data type:                      \t INT64\n");
+		break;	
+	case SZ_UINT8:
+		printf("Data type:                      \t UINT8\n");
+		break;	
+	case SZ_UINT16:
+		printf("Data type:                      \t UINT16\n");
+		break;
+	case SZ_UINT32:
+		printf("Data type:                      \t UINT32\n");
+		break;	
+	case SZ_UINT64:
+		printf("Data type:                      \t UINT64\n");
+		break;				
+	}
+	/* The quantization section depends on the global exe_params, not on metadata. */
+	if(exe_params->optQuantMode==1)
+	{
+		printf("quantization_intervals:         \t 0\n");
+		printf("max_quant_intervals:            \t %d\n", params->max_quant_intervals);
+		printf("actual used # intervals:        \t %d\n", metadata->defactoNBBins);
+	}
+	else
+	{
+		printf("quantization_intervals:         \t %d\n", params->quantization_intervals);
+		printf("max_quant_intervals:            \t - %d\n", params->max_quant_intervals);		
+	}
+	
+	printf("dataEndianType (prior raw data):\t %s\n", dataEndianType==BIG_ENDIAN_DATA?"BIG_ENDIAN":"LITTLE_ENDIAN");
+	printf("sysEndianType (at compression): \t %s\n", sysEndianType==1?"BIG_ENDIAN":"LITTLE_ENDIAN");
+	printf("sampleDistance:                 \t %d\n", params->sampleDistance);
+	printf("predThreshold:                  \t %f\n", params->predThreshold);
+	switch(params->szMode)
+	{
+	case SZ_BEST_SPEED:
+		printf("szMode:                         \t SZ_BEST_SPEED (without Gzip)\n");
+		break;
+	case SZ_BEST_COMPRESSION:
+		printf("szMode:                         \t SZ_BEST_COMPRESSION (with Zstd or Gzip)\n");
+		break;
+	}
+	switch(params->gzipMode)
+	{
+	case Z_BEST_SPEED:
+		printf("gzipMode:                       \t Z_BEST_SPEED\n");
+		break;
+	case Z_DEFAULT_COMPRESSION: /* fixed: this case used to print Z_BEST_SPEED */
+		printf("gzipMode:                       \t Z_DEFAULT_COMPRESSION\n");
+		break;	
+	case Z_BEST_COMPRESSION:
+		printf("gzipMode:                       \t Z_BEST_COMPRESSION\n");
+		break;
+	}
+	
+	switch(params->errorBoundMode)
+	{
+	case ABS:
+		printf("errBoundMode:                   \t ABS\n");
+		printf("absErrBound:                    \t %f\n", params->absErrBound);
+		break;
+	case REL:
+		printf("errBoundMode:                   \t REL (based on value_range extent)\n");
+		printf("relBoundRatio:                  \t %f\n", params->relBoundRatio);
+		break;
+	case ABS_AND_REL:
+		printf("errBoundMode:                   \t ABS_AND_REL\n");
+		printf("absErrBound:                    \t %f\n", params->absErrBound);
+		printf("relBoundRatio:                  \t %f\n", params->relBoundRatio);
+		break;
+	case ABS_OR_REL:
+		printf("errBoundMode:                   \t ABS_OR_REL\n");
+		printf("absErrBound:                    \t %f\n", params->absErrBound);
+		printf("relBoundRatio:                  \t %f\n", params->relBoundRatio);
+		break;
+	case PSNR:
+		printf("errBoundMode:                   \t PSNR\n");
+		printf("psnr:                           \t %f\n", params->psnr);
+		break;
+	case PW_REL:
+		printf("errBoundMode:                   \t PW_REL\n");
+		break;
+	case ABS_AND_PW_REL:
+		printf("errBoundMode:                   \t ABS_AND_PW_REL\n");
+		printf("absErrBound:                    \t %f\n", params->absErrBound);
+		break;
+	case ABS_OR_PW_REL:
+		printf("errBoundMode:                   \t ABS_OR_PW_REL\n");
+		printf("absErrBound:                    \t %f\n", params->absErrBound);
+		break;
+	case REL_AND_PW_REL:
+		printf("errBoundMode:                   \t REL_AND_PW_REL\n");
+		printf("range_relBoundRatio:            \t %f\n", params->relBoundRatio);
+		break;
+	case REL_OR_PW_REL:
+		printf("errBoundMode:                   \t REL_OR_PW_REL\n");
+		printf("range_relBoundRatio:            \t %f\n", params->relBoundRatio);
+		break;
+	}
+	/* Point-wise relative settings are printed only for the PW_REL .. REL_OR_PW_REL range of modes. */
+	if(params->errorBoundMode>=PW_REL && params->errorBoundMode<=REL_OR_PW_REL)
+	{
+		printf("pw_relBoundRatio:               \t %f\n", params->pw_relBoundRatio);
+		//printf("segment_size:                   \t %d\n", params->segment_size);
+		switch(params->pwr_type)
+		{
+		case SZ_PWR_MIN_TYPE:
+			printf("pwrType:                    \t SZ_PWR_MIN_TYPE\n");
+			break;
+		case SZ_PWR_AVG_TYPE:
+			printf("pwrType:                    \t SZ_PWR_AVG_TYPE\n");
+			break;
+		case SZ_PWR_MAX_TYPE:
+			printf("pwrType:                    \t SZ_PWR_MAX_TYPE\n");
+			break;
+		}
+	}
+}
+
+/*-----------------------------------batch data compression--------------------------------------*/
+
+void filloutDimArray(size_t* dim, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	/*
+	 * Store the extents in use into dim[], highest-numbered extent first.
+	 * How many are in use is decided by the first zero among r2..r5
+	 * (r1 is always stored, even when it is 0):
+	 *   r2 == 0   -> dim = {r1}
+	 *   r3 == 0   -> dim = {r2, r1}
+	 *   r4 == 0   -> dim = {r3, r2, r1}
+	 *   r5 == 0   -> dim = {r4, r3, r2, r1}
+	 *   otherwise -> dim = {r5, r4, r3, r2, r1}
+	 * Entries of dim beyond those listed are left untouched.
+	 */
+	const size_t extents[5] = {r1, r2, r3, r4, r5};
+	int used;
+	int k;
+
+	/* find how many extents are in use */
+	if(r2 == 0)
+		used = 1;
+	else if(r3 == 0)
+		used = 2;
+	else if(r4 == 0)
+		used = 3;
+	else if(r5 == 0)
+		used = 4;
+	else
+		used = 5;
+	for(k = 0; k < used; k++)
+		dim[k] = extents[used-1-k]; /* reversed: highest extent first */
+}
+
+size_t compute_total_batch_size()
+{
+	/* Sum the raw byte size of every variable linked after sz_varset->header:
+	 * 4 bytes per element for SZ_FLOAT variables and 8 bytes per element for
+	 * every other dataType. */
+	size_t totalSize = 0;
+	SZ_Variable* var;
+
+	for(var = sz_varset->header->next; var != NULL; var = var->next)
+	{
+		size_t eleNum = computeDataLength(var->r5, var->r4, var->r3, var->r2, var->r1);
+		totalSize += (var->dataType==SZ_FLOAT) ? (eleNum*4) : (eleNum*8);
+	}
+	return totalSize;
+}
+
+void SZ_registerVar(int var_id, char* varName, int dataType, void* data, 
+			int errBoundMode, double absErrBound, double relBoundRatio, double pwRelBoundRatio, 
+			size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	/* Register one variable for batch compression. The time-series state is
+	 * created first if sz_tsc has not been set up yet; every argument is then
+	 * passed through unchanged to SZ_batchAddVar(). */
+	if(!sz_tsc)
+		initSZ_TSC();
+
+	SZ_batchAddVar(var_id, varName, dataType, data, errBoundMode, absErrBound,
+			relBoundRatio, pwRelBoundRatio, r5, r4, r3, r2, r1);
+}
+
+int SZ_deregisterVar_ID(int var_id)
+{
+	/* Remove a variable by id; the result of SZ_batchDelVar_ID() is returned as is. */
+	return SZ_batchDelVar_ID(var_id);
+}
+
+int SZ_deregisterVar(char* varName)
+{
+	/* Remove a variable by name; the result of SZ_batchDelVar() is returned as is. */
+	return SZ_batchDelVar(varName);
+}
+
+#ifdef HAVE_TIMECMPR
+/**
+ * process multiple variables: only those registered variables whose var_id appears in var_ids (at most 256) are compressed and written
+ * */
+int SZ_compress_ts_select_var(int cmprType, unsigned char* var_ids, unsigned char var_count, unsigned char** newByteData, size_t *outSize)
+{
+	confparams_cpr->szMode = SZ_TEMPORAL_COMPRESSION;
+	confparams_cpr->predictionMode = SZ_PREVIOUS_VALUE_ESTIMATE;
+	
+	SZ_VarSet* vset = sz_varset;
+	int i = 0, j = 0; /* j = number of variables actually selected */
+	size_t totalSize = 0; /* size_t: the sum of compressedSize values may not fit in an int */
+	SZ_Variable* vp[256];
+
+	SZ_Variable* v = vset->header->next;	
+	for(i = 0;i<vset->count;i++)
+	{
+		int found = checkVarID(v->var_id, var_ids, var_count);
+		if (found && j < 256) /* never write past the end of vp */
+		{
+			multisteps = v->multisteps;
+			if(v->dataType==SZ_FLOAT)
+			{
+				SZ_compress_args_float(cmprType, confparams_cpr->withRegression, &(v->compressedBytes), (float*)v->data, v->r5, v->r4, v->r3, v->r2, v->r1, &(v->compressedSize), v->errBoundMode, v->absErrBound, v->relBoundRatio, v->pwRelBoundRatio);
+			}
+			else if(v->dataType==SZ_DOUBLE)
+			{
+				SZ_compress_args_double(cmprType, confparams_cpr->withRegression, &(v->compressedBytes), (double*)v->data, v->r5, v->r4, v->r3, v->r2, v->r1, &(v->compressedSize), v->errBoundMode, v->absErrBound, v->relBoundRatio, v->pwRelBoundRatio);
+			}
+		
+			totalSize += v->compressedSize;
+			v->compressType = multisteps->compressionType;
+			vp[j] = v;
+			j++;
+		}
+		v = v->next;
+	}
+	/* Size and fill the stream from the j matched variables: vp[] holds no valid entries beyond them. */
+	*outSize = sizeof(int) + sizeof(unsigned short) + totalSize + j*(3*sizeof(unsigned char)+sizeof(size_t));
+	*newByteData = (unsigned char*)malloc(*outSize); 
+	unsigned char* p = *newByteData;
+	if(p == NULL) return SZ_NSCS; /* allocation failed: nothing can be written */
+	intToBytes_bigEndian(p, sz_tsc->currentStep);
+	p+=4;
+	shortToBytes(p, j); /* the header records how many entries follow */
+	p+=2;
+
+	for(i=0;i<j;i++)
+	{
+		v = vp[i];
+		*p = v->var_id; //1 byte
+		p++;
+		*p = (unsigned char)v->compressType; //1 byte
+		p++;
+		*p = (unsigned char)v->dataType; //1 byte
+		p++;
+		sizeToBytes(p, v->compressedSize); //size_t
+		p += sizeof(size_t);							
+		memcpy(p, v->compressedBytes, v->compressedSize); //outSize_[i]
+		p += v->compressedSize;
+	}
+
+	sz_tsc->currentStep ++;	
+	
+	return SZ_SCES;	
+}
+
+/**
+ * process all variables
+ *
+ * Compresses every variable registered in sz_varset for the current time
+ * step and packs the results into one newly malloc'ed stream (*newByteData,
+ * *outSize bytes). Returns SZ_SCES, or SZ_NSCS if the stream cannot be
+ * allocated.
+ * */
+int SZ_compress_ts(int cmprType, unsigned char** newByteData, size_t *outSize)
+{
+	confparams_cpr->szMode = SZ_TEMPORAL_COMPRESSION;
+	confparams_cpr->predictionMode = SZ_PREVIOUS_VALUE_ESTIMATE;
+
+	SZ_VarSet* vset = sz_varset;
+	int i = 0;
+	size_t totalSize = 0; /* size_t: the sum of compressedSize values may not fit in an int */
+
+	/* Pass 1: compress each variable in place (v->compressedBytes / v->compressedSize)
+	 * and add up the compressed sizes. */
+	SZ_Variable* v = vset->header->next;	
+	for(i=0;i<vset->count;i++)
+	{
+		multisteps = v->multisteps; //assign the v's multisteps to the global variable 'multisteps', which will be used in the following compression.
+
+		if(v->dataType==SZ_FLOAT)
+		{
+			SZ_compress_args_float(cmprType, confparams_cpr->withRegression, &(v->compressedBytes), (float*)v->data, v->r5, v->r4, v->r3, v->r2, v->r1, &(v->compressedSize), v->errBoundMode, v->absErrBound, v->relBoundRatio, v->pwRelBoundRatio);
+		}
+		else if(v->dataType==SZ_DOUBLE)
+		{
+			SZ_compress_args_double(cmprType, confparams_cpr->withRegression, &(v->compressedBytes), (double*)v->data, v->r5, v->r4, v->r3, v->r2, v->r1, &(v->compressedSize), v->errBoundMode, v->absErrBound, v->relBoundRatio, v->pwRelBoundRatio);
+		}
+
+		totalSize += v->compressedSize;
+		v->compressType = multisteps->compressionType;
+		v = v->next;
+	}
+
+	/* Stream layout, as written below:
+	 *   sizeof(int)            current time step
+	 *   sizeof(unsigned short) number of variables
+	 *   per variable: 3 bytes {var_id, compressType, dataType} + sizeof(size_t) compressed size + the compressed bytes */
+	*outSize = sizeof(int) + sizeof(unsigned short) + totalSize + vset->count*(3*sizeof(unsigned char)+sizeof(size_t));
+	*newByteData = (unsigned char*)malloc(*outSize); 
+	unsigned char* p = *newByteData;
+	if(p == NULL)
+		return SZ_NSCS; /* allocation failed: nothing can be written */
+
+	intToBytes_bigEndian(p, sz_tsc->currentStep);
+	p+=4;
+	shortToBytes(p, vset->count);
+	p+=2;
+
+	/* Pass 2: append one record per variable. */
+	v = vset->header->next;
+	for(i=0;i<vset->count;i++)
+	{
+		*p = v->var_id; //1 byte
+		p++;
+		*p = (unsigned char)v->compressType; //1 byte
+		p++;
+		*p = (unsigned char)v->dataType; //1 byte
+		p++;
+		sizeToBytes(p, v->compressedSize); //size_t
+		p += sizeof(size_t);
+
+		memcpy(p, v->compressedBytes, v->compressedSize);
+		p += v->compressedSize;
+		v = v->next;
+	}
+
+	sz_tsc->currentStep ++;	
+	return SZ_SCES;
+}
+
+/**
+ * Decompress one time step produced by SZ_compress_ts into the data buffers of
+ * the registered variables.
+ *
+ * Side effects visible in this function: confparams_dec and exe_params are
+ * (allocated if needed and) zero-filled, sysEndianType is detected and set, and
+ * sz_tsc->currentStep is overwritten with the step stored in the stream.
+ *
+ * For every variable record in the stream, the variable is looked up with
+ * SZ_getVariable(var_id). Records whose variable is not found are skipped;
+ * otherwise the decompressed values are copied into p->data.
+ *
+ * @param unsigned char *bytes : the compressed stream (step, #variables, then the variable records)
+ * @param size_t bytesLength : length of the stream; currently not used by this function
+ * */
+void SZ_decompress_ts(unsigned char *bytes, size_t bytesLength)
+{
+	if(confparams_dec==NULL)
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+	memset(confparams_dec, 0, sizeof(sz_params));
+	confparams_dec->szMode = SZ_TEMPORAL_COMPRESSION;
+	confparams_dec->predictionMode = SZ_PREVIOUS_VALUE_ESTIMATE;
+	
+	if(exe_params==NULL)
+		exe_params = (sz_exedata*)malloc(sizeof(sz_exedata));
+	memset(exe_params, 0, sizeof(sz_exedata));
+	
+	//Detect the byte order of the host by inspecting the first byte of the int 1.
+	int x = 1;
+	char *y = (char*)&x;
+	if(*y==1)
+		sysEndianType = LITTLE_ENDIAN_SYSTEM;
+	else //=0
+		sysEndianType = BIG_ENDIAN_SYSTEM;
+	
+	int i = 0;
+	size_t r5 = 0, r4 = 0, r3 = 0, r2 = 0, r1 = 0;
+	unsigned char* q = bytes;
+	sz_tsc->currentStep = bytesToInt_bigEndian(q); 
+	q += 4;
+	unsigned short nbVars = (unsigned short)bytesToShort(q);
+	q += 2;
+	
+	float *newFloatData = NULL;
+	double *newDoubleData = NULL;	
+	
+	for(i=0;i<nbVars;i++)
+	{
+		//Fixed-size record header: var_id, compression type, data type, compressed size.
+		unsigned char var_id = *(q++);
+		unsigned char compressionType = *(q++);
+		unsigned char dataType = *(q++);
+		size_t cmpSize = bytesToSize(q);
+		q += sizeof(size_t);
+		
+		SZ_Variable* p = SZ_getVariable(var_id);
+		if(p==NULL)
+		{
+			//Variable not found: skip its payload. The lookup result must be checked
+			//before p->multisteps is touched, otherwise this path dereferences NULL.
+			q += cmpSize;
+			continue;
+		}
+		
+		sz_multisteps* steps = p->multisteps;
+		steps->compressionType = compressionType;
+		r5 = p->r5;
+		r4 = p->r4;
+		r3 = p->r3;
+		r2 = p->r2;
+		r1 = p->r1;
+		size_t dataLen = computeDataLength(r5, r4, r3, r2, r1);				
+		
+		unsigned char* cmpBytes = q;			
+		switch(dataType)
+		{
+		case SZ_FLOAT:
+				SZ_decompress_args_float(&newFloatData, r5, r4, r3, r2, r1, cmpBytes, cmpSize, steps->compressionType, steps->hist_data);
+				memcpy(p->data, newFloatData, dataLen*sizeof(float));
+				free(newFloatData);
+				break;
+		case SZ_DOUBLE:
+				SZ_decompress_args_double(&newDoubleData, r5, r4, r3, r2, r1, cmpBytes, cmpSize, steps->compressionType, steps->hist_data);
+				memcpy(p->data, newDoubleData, dataLen*sizeof(double));
+				free(newDoubleData);
+				break;
+		default:
+				//Unsupported type: abort the whole step; the remaining variables are left untouched.
+				printf("Error: data type cannot be the types other than SZ_FLOAT or SZ_DOUBLE\n");
+				return;	
+		}
+		
+		q += cmpSize;			
+	}	
+}
+
+/**
+ * Decompress one time step produced by the time-series compressor, but only for
+ * the variables whose ids are listed in var_ids.
+ *
+ * Side effects visible in this function: confparams_dec and exe_params are
+ * (allocated if needed and) zero-filled, sysEndianType is detected and set, and
+ * sz_tsc->currentStep is overwritten with the step stored in the stream.
+ *
+ * A record is skipped when its variable is not found by SZ_getVariable() or when
+ * checkVarID() returns 0 for it. The compression type of a found variable is
+ * recorded in its multisteps even when the variable is not selected.
+ *
+ * @param unsigned char* var_ids : ids of the variables to decompress
+ * @param unsigned char var_count : number of entries in var_ids
+ * @param unsigned char *bytes : the compressed stream (step, #variables, then the variable records)
+ * @param size_t bytesLength : length of the stream; currently not used by this function
+ * */
+void SZ_decompress_ts_select_var(unsigned char* var_ids, unsigned char var_count, unsigned char *bytes, size_t bytesLength)
+{
+	if(confparams_dec==NULL)
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+	memset(confparams_dec, 0, sizeof(sz_params));
+	confparams_dec->szMode = SZ_TEMPORAL_COMPRESSION;
+	confparams_dec->predictionMode = SZ_PREVIOUS_VALUE_ESTIMATE;
+	
+	if(exe_params==NULL)
+		exe_params = (sz_exedata*)malloc(sizeof(sz_exedata));
+	memset(exe_params, 0, sizeof(sz_exedata));
+	
+	//Detect the byte order of the host by inspecting the first byte of the int 1.
+	int x = 1;
+	char *y = (char*)&x;
+	if(*y==1)
+		sysEndianType = LITTLE_ENDIAN_SYSTEM;
+	else //=0
+		sysEndianType = BIG_ENDIAN_SYSTEM;
+	
+	int i = 0;
+	size_t r5 = 0, r4 = 0, r3 = 0, r2 = 0, r1 = 0;
+	unsigned char* q = bytes;
+	sz_tsc->currentStep = bytesToInt_bigEndian(q); 
+	q += 4;
+	unsigned short nbVars = (unsigned short)bytesToShort(q);
+	q += 2;
+	
+	float *newFloatData = NULL;
+	double *newDoubleData = NULL;	
+	
+	for(i=0;i<nbVars;i++)
+	{
+		//Fixed-size record header: var_id, compression type, data type, compressed size.
+		unsigned char var_id = *(q++);
+		unsigned char compressionType = *(q++);
+		unsigned char dataType = *(q++);
+		size_t cmpSize = bytesToSize(q);
+		q += sizeof(size_t);
+		
+		int selected = checkVarID(var_id, var_ids, var_count);
+		SZ_Variable* p = SZ_getVariable(var_id);
+		
+		//Only touch p->multisteps once the lookup is known to have succeeded;
+		//dereferencing it before the NULL check crashes on unknown variable ids.
+		if(p!=NULL)
+			p->multisteps->compressionType = compressionType;
+		
+		if(p==NULL || selected == 0) //p==NULL: the variable was not found; selected==0: the variable is not selected
+		{
+			q += cmpSize;
+			continue;
+		}
+		
+		// p!=NULL && selected != 0
+		sz_multisteps* steps = p->multisteps;
+		r5 = p->r5;
+		r4 = p->r4;
+		r3 = p->r3;
+		r2 = p->r2;
+		r1 = p->r1;
+		size_t dataLen = computeDataLength(r5, r4, r3, r2, r1);				
+		
+		unsigned char* cmpBytes = q;			
+		switch(dataType)
+		{
+		case SZ_FLOAT:
+				SZ_decompress_args_float(&newFloatData, r5, r4, r3, r2, r1, cmpBytes, cmpSize, steps->compressionType, steps->hist_data);
+				memcpy(p->data, newFloatData, dataLen*sizeof(float));
+				free(newFloatData);
+				break;
+		case SZ_DOUBLE:
+				SZ_decompress_args_double(&newDoubleData, r5, r4, r3, r2, r1, cmpBytes, cmpSize, steps->compressionType, steps->hist_data);
+				memcpy(p->data, newDoubleData, dataLen*sizeof(double));
+				free(newDoubleData);
+				break;
+		default:
+				//Unsupported type: abort the whole step; the remaining variables are left untouched.
+				printf("Error: data type cannot be the types other than SZ_FLOAT or SZ_DOUBLE\n");
+				return;	
+		}
+		
+		q += cmpSize;			
+	}	
+}
+#endif
+
+
+/**
+ * Release the global configuration objects owned by the library.
+ *
+ * With HAVE_TIMECMPR defined, a non-NULL sz_varset is first handed to
+ * SZ_freeVarSet(SZ_MAINTAIN_VAR_DATA). Afterwards confparams_dec,
+ * confparams_cpr and exe_params are freed and reset to NULL, so calling
+ * this function again is harmless for those three pointers.
+ * */
+void SZ_Finalize()
+{
+#ifdef HAVE_TIMECMPR
+	if(sz_varset!=NULL)
+		SZ_freeVarSet(SZ_MAINTAIN_VAR_DATA);
+#endif
+
+	//free(NULL) is a no-op, so the pointers can be released unconditionally.
+	free(confparams_dec);
+	confparams_dec = NULL;
+
+	free(confparams_cpr);
+	confparams_cpr = NULL;
+
+	free(exe_params);
+	exe_params = NULL;
+
+	//The metadata file of sz_tsc is intentionally not closed here (that code is disabled):
+	//#ifdef HAVE_TIMECMPR
+	//	if(sz_tsc!=NULL && sz_tsc->metadata_file!=NULL)
+	//		fclose(sz_tsc->metadata_file);
+	//#endif
+}
+
+
+/**
+ *
+ * Initializes the compressor on behalf of SZ_compress_customize.
+ *
+ * - user parameters given              : SZ_Init_Params(userPara)
+ * - no user parameters, no current ones : SZ_Init(NULL)
+ * - no user parameters, current ones set: nothing is (re)initialized
+ *
+ * @param sz_params* userPara : the user configuration or NULL
+ * @param sz_params* current_params : the configuration currently in use or NULL
+ */
+static void sz_maybe_init_with_user_params(struct sz_params* userPara, struct sz_params* current_params) {
+		if(userPara != NULL)
+		{
+			SZ_Init_Params((sz_params*)userPara);
+			return;
+		}
+		//No user parameters: fall back to the defaults only if nothing is configured yet.
+		if(current_params == NULL)
+			SZ_Init(NULL);
+}
+
+
+/**
+ * 
+ * The interface for the user-customized compression method 
+ * 
+ * Recognized names: "SZ2.0", "SZ2.1", "SZ" (SZ_compress with the current settings),
+ * "SZ1.4" (same, but with confparams_cpr->withRegression forced to SZ_NO_REGRESSION),
+ * "SZ_Transpose" (the data is passed through transposeData() and compressed as a 1D array)
+ * and "ExaFEL" (float data with r5==0 only, enforced by assert).
+ * 
+ * @param char* cmprName : the name of the specific compression approach
+ * @param void* userPara : the pointer of the user-customized data structure containing the customized compressor's required input parameters
+ * @param int dataType : data type (SZ_FLOAT, SZ_DOUBLE, SZ_INT8, SZ_UINT8, SZ_INT16, SZ_UINT16, ....)
+ * @param void* data : input dataset
+ * @param size_t r5 : the size of dimension 5
+ * @param size_t r4 : the size of dimension 4
+ * @param size_t r3 : the size of dimension 3
+ * @param size_t r2 : the size of dimension 2
+ * @param size_t r1 : the size of dimension 1
+ * @param size_t outSize : the number of bytes after compression
+ * @param int *status : the execution status of the compression operation (SZ_SCES for every recognized name, SZ_NSCS for an unknown name)
+ * @return the compressed bytes, or NULL when cmprName is not recognized
+ * 
+ * */
+unsigned char* SZ_compress_customize(const char* cmprName, void* userPara, int dataType, void* data, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, int *status)
+{
+	unsigned char* result = NULL;
+	if(strcmp(cmprName, "SZ2.0")==0 || strcmp(cmprName, "SZ2.1")==0 || strcmp(cmprName, "SZ")==0)
+	{
+		sz_maybe_init_with_user_params(userPara, confparams_cpr);
+		result = SZ_compress(dataType, data, outSize, r5, r4, r3, r2, r1);
+		*status = SZ_SCES;
+	}
+	else if(strcmp(cmprName, "SZ1.4")==0)
+	{
+		sz_maybe_init_with_user_params(userPara, confparams_cpr);
+		confparams_cpr->withRegression = SZ_NO_REGRESSION;
+		
+		result = SZ_compress(dataType, data, outSize, r5, r4, r3, r2, r1);
+		*status = SZ_SCES;		
+	}
+	else if(strcmp(cmprName, "SZ_Transpose")==0)
+	{
+		//NOTE(review): transData is not released here; if transposeData() returns a freshly
+		//allocated buffer this leaks it - confirm the ownership before adding a free().
+		void* transData = transposeData(data, dataType, r5, r4, r3, r2, r1);
+		sz_maybe_init_with_user_params(userPara, confparams_cpr);
+		size_t n = computeDataLength(r5, r4, r3, r2, r1);
+		result = SZ_compress(dataType, transData, outSize, 0, 0, 0, 0, n);
+		//Previously this branch left *status untouched, so callers read an indeterminate value.
+		*status = SZ_SCES;
+	}
+	else if(strcmp(cmprName, "ExaFEL")==0){
+		assert(dataType==SZ_FLOAT);
+		assert(r5==0);
+		result = exafelSZ_Compress(userPara,data, r4, r3, r2, r1,outSize);
+		*status = SZ_SCES;
+	}
+	else
+	{
+		*status = SZ_NSCS;
+	}
+	return result;
+}
+
+/**
+ * Helper of SZ_compress_customize_threadsafe: compress float or double data with
+ * the error-bound settings read from an explicit parameter struct.
+ *
+ * @return SZ_SCES if a compressor was invoked; SZ_NSCS if para is NULL or dataType
+ *         is neither SZ_FLOAT nor SZ_DOUBLE (in which case *result is left untouched)
+ * */
+static int sz_compress_with_explicit_params(int withRegression, struct sz_params* para, int dataType, void* data, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char** result, size_t *outSize)
+{
+	if(para==NULL)
+		return SZ_NSCS;
+	
+	if(dataType==SZ_FLOAT)
+	{	
+		SZ_compress_args_float(-1, withRegression, result, (float *)data, r5, r4, r3, r2, r1, 
+		outSize, para->errorBoundMode, para->absErrBound, para->relBoundRatio, para->pw_relBoundRatio);
+	}
+	else if(dataType==SZ_DOUBLE)
+	{
+		SZ_compress_args_double(-1, withRegression, result, (double *)data, r5, r4, r3, r2, r1, 
+		outSize, para->errorBoundMode, para->absErrBound, para->relBoundRatio, para->pw_relBoundRatio);
+	}
+	else
+		return SZ_NSCS; //nothing was compressed: do not report success with a NULL result
+	
+	return SZ_SCES;
+}
+
+/**
+ * Variant of SZ_compress_customize that takes the error-bound settings from
+ * userPara (interpreted as struct sz_params*) and passes them to the compressors
+ * as arguments, instead of installing them through SZ_Init_Params().
+ *
+ * NOTE(review): the "SZ*" branches still call SZ_Init(NULL); whether that call is
+ * safe from several threads cannot be told from this function - confirm.
+ *
+ * @param char* cmprName : "SZ2.0", "SZ2.1", "SZ", "SZ1.4", "SZ_Transpose" or "ExaFEL"
+ * @param void* userPara : struct sz_params* for the SZ variants (must not be NULL); forwarded as-is to exafelSZ_Compress for "ExaFEL"
+ * @param int dataType : SZ_FLOAT or SZ_DOUBLE for "SZ2.0"/"SZ2.1"/"SZ"/"SZ1.4"; SZ_FLOAT for "ExaFEL"
+ * @param void* data : input dataset
+ * @param size_t r5..r1 : the sizes of dimensions 5..1
+ * @param size_t *outSize : the number of bytes after compression
+ * @param int *status : SZ_SCES on success; SZ_NSCS for an unknown name, a NULL userPara in the SZ variants, or an unsupported dataType
+ * @return the compressed bytes, or NULL on failure
+ * */
+unsigned char* SZ_compress_customize_threadsafe(const char* cmprName, void* userPara, int dataType, void* data, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, int *status)
+{
+	unsigned char* result = NULL;
+	struct sz_params* para = (struct sz_params*)userPara;
+	
+	if(strcmp(cmprName, "SZ2.0")==0 || strcmp(cmprName, "SZ2.1")==0 || strcmp(cmprName, "SZ")==0)
+	{
+		SZ_Init(NULL);
+		*status = sz_compress_with_explicit_params(SZ_WITH_LINEAR_REGRESSION, para, dataType, data, r5, r4, r3, r2, r1, &result, outSize);
+	}
+	else if(strcmp(cmprName, "SZ1.4")==0)
+	{
+		SZ_Init(NULL);
+		*status = sz_compress_with_explicit_params(SZ_NO_REGRESSION, para, dataType, data, r5, r4, r3, r2, r1, &result, outSize);
+	}
+	else if(strcmp(cmprName, "SZ_Transpose")==0)
+	{
+		if(para==NULL)
+		{
+			//The error-bound settings are mandatory here; fail before transposing anything.
+			*status = SZ_NSCS;
+		}
+		else
+		{
+			//NOTE(review): transData is not released here; if transposeData() returns a freshly
+			//allocated buffer this leaks it - confirm the ownership before adding a free().
+			void* transData = transposeData(data, dataType, r5, r4, r3, r2, r1);
+			size_t n = computeDataLength(r5, r4, r3, r2, r1);
+			
+			result = SZ_compress_args(dataType, transData, outSize, para->errorBoundMode, para->absErrBound, para->relBoundRatio, para->pw_relBoundRatio, 0, 0, 0, 0, n);
+			
+			*status = SZ_SCES;
+		}
+	}
+	else if(strcmp(cmprName, "ExaFEL")==0){  //not sure if this part is thread safe!
+		assert(dataType==SZ_FLOAT);
+		assert(r5==0);
+		result = exafelSZ_Compress(userPara,data, r4, r3, r2, r1,outSize);
+		*status = SZ_SCES;
+	}
+	else
+	{
+		*status = SZ_NSCS;
+	}
+	return result;
+}
+
+
+/**
+ * 
+ * The interface for the user-customized decompression method 
+ * 
+ * Recognized names mirror SZ_compress_customize: "SZ2.0", "SZ2.1", "SZ" and "SZ1.4"
+ * (plain SZ_decompress), "SZ_Transpose" (decompress as a 1D array, then detransposeData())
+ * and "ExaFEL" (float data with r5==0 only, enforced by assert).
+ * 
+ * @param char* cmprName : the name of the specific compression approach
+ * @param void* userPara : the pointer of the user-customized data structure containing the customized compressor's required input parameters
+ * @param int dataType : data type (SZ_FLOAT, SZ_DOUBLE, SZ_INT8, SZ_UINT8, SZ_INT16, SZ_UINT16, ....)
+ * @param unsigned char* bytes : input bytes (the compressed data)
+ * @param size_t byteLength : the number of input bytes
+ * @param size_t r5 : the size of dimension 5
+ * @param size_t r4 : the size of dimension 4
+ * @param size_t r3 : the size of dimension 3
+ * @param size_t r2 : the size of dimension 2
+ * @param size_t r1 : the size of dimension 1
+ * @param int *status : the execution status of the decompression operation (SZ_SCES for every recognized name, SZ_NSCS for an unknown name)
+ * @return the decompressed data, or NULL when cmprName is not recognized
+ * 
+ * */
+void* SZ_decompress_customize(const char* cmprName, void* userPara, int dataType, unsigned char* bytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, int *status)
+{
+	void* result = NULL;
+	//"SZ2.1" is accepted by the compression interface, so it has to be accepted here as well;
+	//otherwise data compressed under that name could not be decompressed under the same name.
+	if(strcmp(cmprName, "SZ2.0")==0 || strcmp(cmprName, "SZ2.1")==0 || strcmp(cmprName, "SZ")==0 || strcmp(cmprName, "SZ1.4")==0)
+	{
+		result = SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		* status = SZ_SCES;
+	}
+	else if(strcmp(cmprName, "SZ_Transpose")==0)
+	{
+		size_t n = computeDataLength(r5, r4, r3, r2, r1);
+		//NOTE(review): tmpData is not released here; if detransposeData() returns a new
+		//buffer rather than tmpData itself this leaks it - confirm the ownership.
+		void* tmpData = SZ_decompress(dataType, bytes, byteLength, 0, 0, 0, 0, n);
+		result = detransposeData(tmpData, dataType, r5, r4, r3, r2, r1);
+		//Previously this branch left *status untouched, so callers read an indeterminate value.
+		*status = SZ_SCES;
+	}
+	else if(strcmp(cmprName, "ExaFEL")==0){
+		assert(dataType==SZ_FLOAT);
+		assert(r5==0);
+		result = exafelSZ_Decompress(userPara,bytes, r4, r3, r2, r1,byteLength);
+		*status = SZ_SCES;
+	}
+	else
+	{
+		*status = SZ_NSCS;
+	}
+	return result;	
+}
+
+
+/**
+ * Counterpart of SZ_compress_customize_threadsafe for decompression.
+ * It forwards every argument unchanged to SZ_decompress_customize() and returns its result.
+ * NOTE(review): because it only forwards, it is exactly as thread-safe as SZ_decompress_customize() - confirm.
+ * */
+void* SZ_decompress_customize_threadsafe(const char* cmprName, void* userPara, int dataType, unsigned char* bytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, int *status)
+{
+	void* decompressed = SZ_decompress_customize(cmprName, userPara, dataType, bytes, byteLength, r5, r4, r3, r2, r1, status);
+	return decompressed;
+}
diff --git a/deps/SZ/sz/src/sz_double.c b/deps/SZ/sz/src/sz_double.c
new file mode 100644
index 0000000000000000000000000000000000000000..483f6fb7eceeb83371c18a0c5bec94820921638d
--- /dev/null
+++ b/deps/SZ/sz/src/sz_double.c
@@ -0,0 +1,6836 @@
+/**
+ *  @file sz_double.c
+ *  @author Sheng Di, Dingwen Tao, Xin Liang, Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang
+ *  @date Aug, 2016
+ *  @brief SZ_Init, Compression and Decompression functions
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageD.h"
+#include "sz_double.h"
+#include "sz_double_pwr.h"
+#include "szd_double.h"
+#include "szd_double_pwr.h"
+#include "zlib.h"
+#include "rw.h"
+#include "sz_double_ts.h"
+#include "utility.h"
+#include "CacheTable.h"
+#include "MultiLevelCacheTableWideInterval.h"
+#include "sz_stats.h"
+
+/**
+ * "Compression" that simply copies the raw doubles into a newly allocated buffer.
+ *
+ * @param double* data : the input values
+ * @param size_t dataLength : the number of values in data
+ * @param size_t* outSize : receives dataLength*sizeof(double), or 0 if the buffer could not be allocated
+ * @return a malloc'ed copy of the input bytes (to be released with free()), or NULL if the allocation fails
+ * */
+unsigned char* SZ_skip_compress_double(double* data, size_t dataLength, size_t* outSize)
+{
+	const size_t byteCount = dataLength*sizeof(double);
+	unsigned char* out = (unsigned char*)malloc(byteCount);
+	if(out==NULL)
+	{
+		//Allocation failed (or a zero-sized request returned NULL): never memcpy into NULL.
+		*outSize = 0;
+		return NULL;
+	}
+	memcpy(out, data, byteCount);
+	*outSize = byteCount;
+	return out;
+}
+
+/**
+ * Derive the number of bits to keep per value from the precision and the radius exponent.
+ *
+ * The result is 12 + radExpo - getPrecisionReqLength_double(realPrecision), clamped to [12, 64].
+ * When the upper clamp applies, *medianValue is additionally reset to 0.
+ *
+ * @param double realPrecision : the precision passed to getPrecisionReqLength_double()
+ * @param short radExpo : the exponent of the value-range radius
+ * @param int* reqLength : receives the required length in bits
+ * @param double* medianValue : set to 0 only when the length is capped at 64
+ * */
+inline void computeReqLength_double(double realPrecision, short radExpo, int* reqLength, double* medianValue)
+{
+	const short precisionExpo = getPrecisionReqLength_double(realPrecision);
+	int bits = 12+radExpo - precisionExpo; //radExpo-precisionExpo == reqMantiLength
+
+	if(bits<12)
+		bits = 12;
+	else if(bits>64)
+	{
+		bits = 64;
+		*medianValue = 0;
+	}
+	*reqLength = bits;
+}
+
+/**
+ * MSST19 variant of the required-length computation.
+ *
+ * @param double realPrecision : the precision passed to getPrecisionReqLength_double()
+ * @return 12 minus the exponent returned by getPrecisionReqLength_double(); no clamping is applied
+ * */
+inline short computeReqLength_double_MSST19(double realPrecision)
+{
+	const short precisionExpo = getPrecisionReqLength_double(realPrecision);
+	const short reqLength = 12-precisionExpo;
+	return reqLength;
+}
+
+/**
+ * Estimate the number of quantization intervals for a 1D double array.
+ *
+ * Every element whose index is a multiple of confparams_cpr->sampleDistance (from index 2 on)
+ * is predicted by its predecessor; the prediction error, measured in units of 2*realPrecision,
+ * is counted in a histogram with confparams_cpr->maxRangeRadius buckets (larger errors go to the last bucket).
+ * The first radius whose cumulative count exceeds sampleCount*predThreshold is then selected.
+ *
+ * @param double *oriData : the input values
+ * @param size_t dataLength : the number of values
+ * @param double realPrecision : the absolute error bound
+ * @return roundUpToPowerOf2(2*(radius+1)), but never less than 32
+ * */
+unsigned int optimize_intervals_double_1D(double *oriData, size_t dataLength, double realPrecision)
+{	
+	size_t pos = 0, bucket;
+	double predicted = 0, absErr;
+	const size_t histBytes = confparams_cpr->maxRangeRadius*sizeof(size_t);
+	size_t *histogram = (size_t*)malloc(histBytes);
+	memset(histogram, 0, histBytes);
+	size_t sampleCount = dataLength/confparams_cpr->sampleDistance;
+
+	//Build the histogram of the sampled prediction errors.
+	for(pos=2;pos<dataLength;pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue; //not a sampling point
+
+		predicted = oriData[pos-1];
+		absErr = fabs(predicted - oriData[pos]);
+		bucket = (unsigned long)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	//Find the first radius that covers more than the requested share of the samples.
+	size_t wanted = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	pos = 0;
+	while(pos<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[pos];
+		if(covered>wanted)
+			break;
+		pos++;
+	}
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+
+	free(histogram);
+
+	unsigned int accIntervals = 2*(pos+1);
+	unsigned int rounded = roundUpToPowerOf2(accIntervals);
+	return rounded<32 ? 32 : rounded;
+}
+
+/**
+ * Estimate the number of quantization intervals for a 2D double array of r1 x r2 values.
+ *
+ * Cells (i,j) with i>=1, j>=1 and (i+j) a multiple of confparams_cpr->sampleDistance are predicted
+ * from their left, upper and upper-left neighbours (left + up - up-left); the prediction error,
+ * measured in units of 2*realPrecision, is counted in a histogram with confparams_cpr->maxRangeRadius
+ * buckets (larger errors go to the last bucket). The first radius whose cumulative count exceeds
+ * sampleCount*predThreshold is then selected.
+ *
+ * @param double *oriData : the input values (element (i,j) is read at index i*r2+j)
+ * @param size_t r1 : the size of the first dimension
+ * @param size_t r2 : the size of the second dimension
+ * @param double realPrecision : the absolute error bound
+ * @return roundUpToPowerOf2(2*(radius+1)), but never less than 32
+ * */
+unsigned int optimize_intervals_double_2D(double *oriData, size_t r1, size_t r2, double realPrecision)
+{	
+	size_t row, col, cell;
+	size_t bucket;
+	double predicted = 0, absErr;
+	const size_t histBytes = confparams_cpr->maxRangeRadius*sizeof(size_t);
+	size_t *histogram = (size_t*)malloc(histBytes);
+	memset(histogram, 0, histBytes);
+	size_t sampleCount = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+
+	//Build the histogram of the sampled prediction errors.
+	for(row=1;row<r1;row++)
+	{
+		for(col=1;col<r2;col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue; //not a sampling point
+
+			cell = row*r2+col;
+			predicted = oriData[cell-1] + oriData[cell-r2] - oriData[cell-r2-1];
+			absErr = fabs(predicted - oriData[cell]);
+			bucket = (unsigned long)((absErr/realPrecision+1)/2);
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1;
+			histogram[bucket]++;
+		}
+	}
+
+	//Find the first radius that covers more than the requested share of the samples.
+	size_t wanted = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;	
+
+	free(histogram);
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(accIntervals);
+	return rounded<32 ? 32 : rounded;
+}
+
+/**
+ * Estimate the number of quantization intervals for a 3D double array of r1 x r2 x r3 values.
+ *
+ * Cells (i,j,k) with all indices >=1 and (i+j+k) a multiple of confparams_cpr->sampleDistance are
+ * predicted from their seven already-visited neighbours (three face neighbours added, three edge
+ * neighbours subtracted, the corner neighbour added); the prediction error, measured in units of
+ * 2*realPrecision, is counted in a histogram with confparams_cpr->maxRangeRadius buckets (larger
+ * errors go to the last bucket). The first radius whose cumulative count exceeds
+ * sampleCount*predThreshold is then selected.
+ *
+ * @param double *oriData : the input values (element (i,j,k) is read at index i*r2*r3+j*r3+k)
+ * @param size_t r1 : the size of the first dimension
+ * @param size_t r2 : the size of the second dimension
+ * @param size_t r3 : the size of the third dimension
+ * @param double realPrecision : the absolute error bound
+ * @return roundUpToPowerOf2(2*(radius+1)), but never less than 32
+ * */
+unsigned int optimize_intervals_double_3D(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{	
+	size_t plane, row, col, cell;
+	size_t bucket;
+	const size_t planeStride = r2*r3;
+	double predicted = 0, absErr;
+	const size_t histBytes = confparams_cpr->maxRangeRadius*sizeof(size_t);
+	size_t *histogram = (size_t*)malloc(histBytes);
+	memset(histogram, 0, histBytes);
+	size_t sampleCount = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	//Build the histogram of the sampled prediction errors.
+	for(plane=1;plane<r1;plane++)
+	{
+		for(row=1;row<r2;row++)
+		{
+			for(col=1;col<r3;col++)
+			{
+				if((plane+row+col)%confparams_cpr->sampleDistance!=0)
+					continue; //not a sampling point
+
+				cell = plane*planeStride+row*r3+col;
+				predicted = oriData[cell-1] + oriData[cell-r3] + oriData[cell-planeStride] 
+				- oriData[cell-1-planeStride] - oriData[cell-r3-1] - oriData[cell-r3-planeStride] + oriData[cell-r3-planeStride-1];
+				absErr = fabs(predicted - oriData[cell]);
+				bucket = (absErr/realPrecision+1)/2;
+				if(bucket>=confparams_cpr->maxRangeRadius)
+					bucket = confparams_cpr->maxRangeRadius - 1;
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	//Find the first radius that covers more than the requested share of the samples.
+	size_t wanted = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	free(histogram);
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(accIntervals);
+	return rounded<32 ? 32 : rounded;
+}
+
+/**
+ * Estimate the number of quantization intervals for a 4D double array of r1 x r2 x r3 x r4 values.
+ *
+ * Cells (i,j,k,l) with all indices >=1 and (i+j+k+l) a multiple of confparams_cpr->sampleDistance
+ * are predicted from seven neighbours taken along the three innermost dimensions (strides 1, r4
+ * and r3*r4): three face neighbours added, three edge neighbours subtracted, the corner neighbour
+ * added. The prediction error, measured in units of 2*realPrecision, is counted in a histogram with
+ * confparams_cpr->maxRangeRadius buckets (larger errors go to the last bucket). The first radius
+ * whose cumulative count exceeds totalSampleSize*predThreshold is then selected.
+ *
+ * @param double *oriData : the input values (element (i,j,k,l) is read at index i*r2*r3*r4+j*r3*r4+k*r4+l)
+ * @param size_t r1 : the size of the first dimension
+ * @param size_t r2 : the size of the second dimension
+ * @param size_t r3 : the size of the third dimension
+ * @param size_t r4 : the size of the fourth (innermost) dimension
+ * @param double realPrecision : the absolute error bound
+ * @return roundUpToPowerOf2(2*(radius+1)), but never less than 32
+ * */
+unsigned int optimize_intervals_double_4D(double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	double pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						//The neighbour one step back along k sits r4 elements earlier. The previous
+						//code read oriData[index-r3] here, which is a different element whenever
+						//r3!=r4 and disagrees with the r4-based terms subtracted below.
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = fabs(pred_value - oriData[index]);
+						radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number: first radius whose cumulative count exceeds the target
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+// Compress a 1D double array with previous-value prediction plus quantization.
+// For every element a code is stored in type[]: 0 means the value was stored through
+// compressSingleDoubleValue()/addExactData(); any other code is intvRadius +/- the number
+// of quantization steps between the running prediction and the value.
+// Returns a TightDataPointStorageD built by new_TightDataPointStorageD().
+// NOTE(review): elements 0 and 1 are accessed unconditionally below, so this
+// assumes dataLength >= 2 - confirm that callers guarantee it.
+TightDataPointStorageD* SZ_compress_double_1D_MDQ(double *oriData, 
+size_t dataLength, double realPrecision, double valueRangeSize, double medianValue_d)
+{
+#ifdef HAVE_TIMECMPR
+	// In temporal-compression mode the reconstructed values are also written to multisteps->hist_data.
+	double* decData = NULL;	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (double*)(multisteps->hist_data);
+#endif	
+	
+	// Number of quantization intervals: estimated from the data when optQuantMode==1,
+	// otherwise taken from exe_params->intvCapacity.
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_double_1D_opt(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	//updateQuantizationInfo(quantization_intervals);	
+	int intvRadius = quantization_intervals/2;
+
+	size_t i;
+	int reqLength;
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+
+	// Sets reqLength (clamped to [12, 64]); may reset medianValue to 0.
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+		
+	double* spaceFillingValue = oriData; //
+	
+	// Growable buffers that collect the pieces of the values stored with code 0.
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	// Bytes of the previously stored value; starts as all zeros.
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+	
+	// reqLength split into whole bytes and leftover bits.
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	double last3CmprsData[3] = {0};
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));			
+				
+	//add the first data	
+	type[0] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_double(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[0] = vce->data;
+#endif		
+		
+	//add the second data
+	type[1] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_double(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = vce->data;
+#endif
+	int state;
+	double checkRadius;
+	double curData;
+	// Initial prediction for element 2.
+	// NOTE(review): presumably listAdd_double() keeps the most recent value at index 0, which
+	// would make this the reconstructed value of element 1 - confirm against its definition.
+	double pred = last3CmprsData[0];
+	double predAbsErr;
+	// Errors below checkRadius can be expressed as a quantization code.
+	checkRadius = (quantization_intervals-1)*realPrecision;
+	double interval = 2*realPrecision;
+
+	double recip_realPrecision = 1/realPrecision;
+	for(i=2;i<dataLength;i++)
+	{				
+		//printf("%.30G\n",last3CmprsData[0]);
+		curData = spaceFillingValue[i];
+		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		//pred = last3CmprsData[0];
+		predAbsErr = fabs(curData - pred);	
+		if(predAbsErr<checkRadius)
+		{
+			// Predictable: number of whole 2*realPrecision steps (truncated to int on assignment).
+			state = (predAbsErr*recip_realPrecision+1)*0.5;
+			if(curData>=pred)
+			{
+				type[i] = intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = intvRadius-state;
+				pred = pred - state*interval;
+			}
+			// pred now holds the value reconstructed from the code; it is the prediction for the next element.
+			//listAdd_double(last3CmprsData, pred);
+#ifdef HAVE_TIMECMPR					
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[i] = pred;			
+#endif	
+			continue;
+		}
+		
+		//unpredictable data processing
+		type[i] = 0;		
+		compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+							
+		//listAdd_double(last3CmprsData, vce->data);
+		// The value just stored becomes the prediction for the next element.
+		pred = vce->data;
+		
+#ifdef HAVE_TIMECMPR
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[i] = vce->data;
+#endif	
+		
+	}//end of for
+		
+	// One lead-number entry exists per value stored with code 0.
+	size_t exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageD* tdps;
+			
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+	
+//	printf("exactDataNum=%d, expSegmentsInBytes_size=%d, exactMidByteArray->size=%d\n", 
+//			exactDataNum, expSegmentsInBytes_size, exactMidByteArray->size);
+	
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);	
+	// Only the wrapper struct is freed here; exactMidByteArray->array was handed to new_TightDataPointStorageD above.
+	// NOTE(review): presumably that array is released later by free_TightDataPointStorageD(tdps) - confirm.
+	free(exactMidByteArray);
+	
+	return tdps;	
+}
+
+/**
+ * Store the original doubles uncompressed, behind the regular stream header.
+ *
+ * Bytes written, in order: 3 version bytes, 1 flag byte (16 when SZ_SIZE_TYPE is 4, otherwise 80),
+ * MetaDataByteLength_double bytes produced by convertSZParamsToBytes(), SZ_SIZE_TYPE bytes holding
+ * dataLength, and finally the values themselves (copied verbatim on big-endian hosts, converted
+ * one by one with doubleToBytes() otherwise).
+ *
+ * The buffer is NOT allocated here: *newByteData must already point to at least *outSize bytes.
+ *
+ * @param double* oriData : the values to store
+ * @param size_t dataLength : the number of values
+ * @param unsigned char** newByteData : pointer to the pre-allocated output buffer
+ * @param size_t *outSize : receives the total number of bytes of the stream
+ * */
+void SZ_compress_args_double_StoreOriData(double* oriData, size_t dataLength, unsigned char** newByteData, size_t *outSize)
+{	
+	const size_t payloadBytes = sizeof(double)*dataLength;
+	const size_t totalBytes = 3 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + 1 + payloadBytes;
+	/*No malloc here: the caller's buffer is expected to hold at least totalBytes already.*/
+	unsigned char* cursor = *newByteData;
+	unsigned char lengthBytes[8];
+	size_t idx;
+
+	//version number: 3 bytes
+	for(idx=0;idx<3;idx++)
+		*cursor++ = versionNumber[idx];
+
+	//flag byte: 00010000 for SZ_SIZE_TYPE=4, 01010000 otherwise (01000000 indicates SZ_SIZE_TYPE=8)
+	*cursor++ = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	//compression parameters
+	convertSZParamsToBytes(confparams_cpr, cursor);
+	cursor += MetaDataByteLength_double;
+
+	//number of values: 4 or 8 bytes
+	sizeToBytes(lengthBytes, dataLength);
+	for(idx=0;idx<exe_params->SZ_SIZE_TYPE;idx++)
+		*cursor++ = lengthBytes[idx];
+
+	//payload: cursor now sits at offset 4+MetaDataByteLength_double+SZ_SIZE_TYPE
+	if(sysEndianType==BIG_ENDIAN_SYSTEM)
+	{
+		memcpy(cursor, oriData, payloadBytes);
+	}
+	else
+	{
+		for(idx=0;idx<dataLength;idx++)
+		{
+			doubleToBytes(cursor, oriData[idx]);
+			cursor += sizeof(double);
+		}
+	}
+
+	*outSize = totalBytes;
+}
+
+
+/**
+ * Compress a 1D double array and flatten the result into *newByteData.
+ *
+ * With HAVE_TIMECMPR and szMode == SZ_TEMPORAL_COMPRESSION, cmprType selects the compressor:
+ *   - SZ_PERIO_TEMPORAL_COMPRESSION : snapshot compression when the current step is a multiple of
+ *                                     snapshotCmprStep, time-series compression otherwise
+ *   - SZ_FORCE_SNAPSHOT_COMPRESSION : always snapshot compression
+ *   - SZ_FORCE_TEMPORAL_COMPRESSION : always time-series compression
+ * Snapshot compression also records the step in multisteps->lastSnapshotStep.
+ * In every other configuration plain snapshot compression (SZ_compress_double_1D_MDQ) is used.
+ *
+ * If the flattened stream is larger than the raw-storage stream would be, the output is
+ * replaced by SZ_compress_args_double_StoreOriData().
+ *
+ * @return 1 if time-series based compression was used, 0 otherwise
+ * */
+char SZ_compress_args_double_NoCkRngeNoGzip_1D(int cmprType, unsigned char** newByteData, double *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d)
+{
+	char usedType = 0;	
+	TightDataPointStorageD* storage = NULL; 	
+#ifdef HAVE_TIMECMPR
+	if(confparams_cpr->szMode != SZ_TEMPORAL_COMPRESSION)
+	{
+		storage = SZ_compress_double_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, medianValue_d);
+	}
+	else
+	{
+		int step = sz_tsc->currentStep;
+		int choice = -1; //-1: no compressor selected, 0: snapshot, 1: time series
+
+		if(cmprType == SZ_PERIO_TEMPORAL_COMPRESSION)
+			choice = (step % confparams_cpr->snapshotCmprStep != 0) ? 1 : 0;
+		else if(cmprType == SZ_FORCE_SNAPSHOT_COMPRESSION)
+			choice = 0;
+		else if(cmprType == SZ_FORCE_TEMPORAL_COMPRESSION)
+			choice = 1;
+
+		if(choice == 1)
+		{
+			storage = SZ_compress_double_1D_MDQ_ts(oriData, dataLength, multisteps, realPrecision, valueRangeSize, medianValue_d);
+			usedType = 1; //time-series based compression 
+		}
+		else if(choice == 0)
+		{
+			storage = SZ_compress_double_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, medianValue_d);
+			usedType = 0; //snapshot-based compression
+			multisteps->lastSnapshotStep = step;
+		}
+	}
+#else
+	storage = SZ_compress_double_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, medianValue_d);			
+#endif
+	
+	convertTDPStoFlatBytes_double(storage, newByteData, outSize);
+	
+	//Fall back to raw storage when compression did not pay off.
+	if(*outSize>3 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + 1 + sizeof(double)*dataLength)
+		SZ_compress_args_double_StoreOriData(oriData, dataLength, newByteData, outSize);
+	
+	free_TightDataPointStorageD(storage);	
+	return usedType;
+}
+
+/**
+ * Compress a row-major 2D array of doubles (r1 rows of r2 values each) into a
+ * TightDataPointStorageD by prediction followed by quantization.
+ *
+ * Every value is predicted from values that were already reconstructed:
+ *   - row 0, element 0   : always stored exactly;
+ *   - row 0, element 1   : previous element;
+ *   - row 0, element j>=2: linear extrapolation 2*P1[j-1] - P1[j-2];
+ *   - row i, element 0   : element directly above;
+ *   - row i, element j>=1: left + above - above-left.
+ * If the prediction error fits into the quantization range, the value is
+ * replaced by an interval code (type[] != 0) and reconstructed as
+ * pred + 2*(code - intvRadius)*realPrecision. Otherwise type[] is 0 and the
+ * value is stored through compressSingleDoubleValue()/addExactData().
+ *
+ * @param oriData         input values; at least two elements are read (see note below)
+ * @param r1              number of rows (high dimension)
+ * @param r2              number of values per row (low dimension)
+ * @param realPrecision   quantization half-step; intervals are 2*realPrecision wide
+ * @param valueRangeSize  value range of the data, used to derive the required bit length
+ * @param medianValue_d   median value handed to computeReqLength_double()
+ * @return storage object created by new_TightDataPointStorageD(); the callers in
+ *         this file release it with free_TightDataPointStorageD()
+ */
+TightDataPointStorageD* SZ_compress_double_2D_MDQ(double *oriData, size_t r1, size_t r2, double realPrecision, double valueRangeSize, double medianValue_d)
+{
+#ifdef HAVE_TIMECMPR
+	/* In temporal mode every reconstructed value is also recorded in multisteps->hist_data. */
+	double* decData = NULL;
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (double*)(multisteps->hist_data);
+#endif
+
+	double recip_realPrecision = 1/realPrecision;
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_2D_opt(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;
+
+	size_t i,j;
+	int reqLength;
+	double pred1D, pred2D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+
+	size_t dataLength = r1*r2;
+
+	/* P1 holds the previous reconstructed row, P0 the row being built.
+	 * Element 1 of row 0 is written unconditionally below, so the buffers must
+	 * hold at least two values even when r2 == 1; sizing them with r2 alone
+	 * overflowed the heap in that case. */
+	size_t rowBufLen = (r2 < 2) ? 2 : r2;
+	P0 = (double*)malloc(rowBufLen*sizeof(double));
+	memset(P0, 0, rowBufLen*sizeof(double));
+	P1 = (double*)malloc(rowBufLen*sizeof(double));
+	memset(P1, 0, rowBufLen*sizeof(double));
+
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	/* type[1] is written unconditionally as well, hence the minimum of two entries. */
+	size_t typeBufLen = (dataLength < 2) ? 2 : dataLength;
+	int* type = (int*) malloc(typeBufLen*sizeof(int));
+
+	double* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	/* Bytes of the previously stored exact value, used by updateLossyCompElement_Double(). */
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	/* Process Row-0 data 0: no prediction available, always stored exactly */
+	type[0] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+#ifdef HAVE_TIMECMPR
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[0] = vce->data;
+#endif
+
+	/* Process Row-0 data 1: predicted by the previous element.
+	 * NOTE(review): spaceFillingValue[1] is read even when dataLength == 1;
+	 * callers are expected to pass at least two values. */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum =  fabs(diff)*recip_realPrecision + 1;
+
+	if (itvNum < quantization_intervals)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - intvRadius) * realPrecision;
+	}
+	else
+	{
+		type[1] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+#ifdef HAVE_TIMECMPR
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = P1[1];
+#endif
+
+	/* Process Row-0 data 2 --> data r2-1: linear extrapolation from the two previous elements */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[j], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[j] = P1[j];
+#endif
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{
+		/* Process row-i data 0: predicted by the element directly above */
+		index = i*r2;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P0[0];
+#endif
+
+		/* Process row-i data 1 --> r2-1: left + above - above-left */
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[j];
+#endif
+		}
+
+		/* The row just built becomes the "previous row" of the next iteration. */
+		double *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	/* Both row buffers are always valid allocations now, so both are released. */
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //only the wrapper is freed here; exactMidByteArray->array is released in free_TightDataPointStorageD(tdps)
+
+	return tdps;
+}
+
+/**
+ * Compress a 2D double array and serialize the result into *newByteData.
+ *
+ * Without HAVE_TIMECMPR, or when szMode is not SZ_TEMPORAL_COMPRESSION, the
+ * data is always compressed with SZ_compress_double_2D_MDQ(). In temporal
+ * mode cmprType selects between snapshot compression (2D predictor) and
+ * time-series compression (SZ_compress_double_1D_MDQ_ts); the periodic mode
+ * takes a snapshot whenever currentStep is a multiple of snapshotCmprStep.
+ *
+ * If the serialized size exceeds the threshold computed below, the original
+ * data is stored instead via SZ_compress_args_double_StoreOriData().
+ *
+ * Note: @r1 is high dimension
+ * 		 @r2 is low dimension
+ *
+ * @return 0 for snapshot-based compression, 1 for time-series based compression
+ * */
+char SZ_compress_args_double_NoCkRngeNoGzip_2D(int cmprType, unsigned char** newByteData, double *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d)
+{
+	size_t dataLength = r1*r2;
+	char compressionType = 0;
+	TightDataPointStorageD* tdps = NULL;
+#ifdef HAVE_TIMECMPR
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+	{
+		int timestep = sz_tsc->currentStep;
+		if(cmprType == SZ_PERIO_TEMPORAL_COMPRESSION)
+		{
+			if(timestep % confparams_cpr->snapshotCmprStep != 0)
+			{
+				tdps = SZ_compress_double_1D_MDQ_ts(oriData, dataLength, multisteps, realPrecision, valueRangeSize, medianValue_d);
+				compressionType = 1; //time-series based compression
+			}
+			else
+			{
+				tdps = SZ_compress_double_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, medianValue_d);
+				compressionType = 0; //snapshot-based compression
+				multisteps->lastSnapshotStep = timestep;
+			}
+		}
+		else if(cmprType == SZ_FORCE_SNAPSHOT_COMPRESSION)
+		{
+			tdps = SZ_compress_double_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, medianValue_d);
+			compressionType = 0; //snapshot-based compression
+			multisteps->lastSnapshotStep = timestep;
+		}
+		else if(cmprType == SZ_FORCE_TEMPORAL_COMPRESSION)
+		{
+			tdps = SZ_compress_double_1D_MDQ_ts(oriData, dataLength, multisteps, realPrecision, valueRangeSize, medianValue_d);
+			compressionType = 1; //time-series based compression
+		}
+	}
+	else
+#endif
+		tdps = SZ_compress_double_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, medianValue_d);
+
+	/* tdps stays NULL when temporal mode is active and cmprType matches none of
+	 * the branches above; skip serialization in that case (same guard as the
+	 * 3D variant) instead of dereferencing a NULL storage object. */
+	if(tdps!=NULL)
+	{
+		convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+
+		if(*outSize>3 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + 1 + sizeof(double)*dataLength)
+			SZ_compress_args_double_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+		free_TightDataPointStorageD(tdps);
+	}
+	return compressionType;
+}
+
+/**
+ * Compress a 3D array of doubles (r1 layers, r2 rows per layer, r3 values per
+ * row, r3 varying fastest) into a TightDataPointStorageD by prediction
+ * followed by quantization.
+ *
+ * Layer 0 is processed like a 2D array (1D prediction on its first row and
+ * first column, left + above - above-left elsewhere). For later layers the
+ * previous reconstructed layer (P1) is used as well: 1D/2D prediction on the
+ * layer's first element, first row and first column, and the 7-neighbour
+ * 3D predictor for interior values.
+ * If the prediction error fits into the quantization range, the value is
+ * replaced by an interval code (type[] != 0) and reconstructed as
+ * pred + 2*(code - intvRadius)*realPrecision. Otherwise type[] is 0 and the
+ * value is stored through compressSingleDoubleValue()/addExactData().
+ *
+ * @param oriData         input values; at least two elements are read (see note below)
+ * @param r1              number of layers (highest dimension)
+ * @param r2              number of rows per layer
+ * @param r3              number of values per row (lowest dimension)
+ * @param realPrecision   quantization half-step; intervals are 2*realPrecision wide
+ * @param valueRangeSize  value range of the data, used to derive the required bit length
+ * @param medianValue_d   median value handed to computeReqLength_double()
+ * @return storage object created by new_TightDataPointStorageD(); the callers in
+ *         this file release it with free_TightDataPointStorageD()
+ */
+TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, double valueRangeSize, double medianValue_d)
+{
+#ifdef HAVE_TIMECMPR
+	/* In temporal mode every reconstructed value is also recorded in multisteps->hist_data. */
+	double* decData = NULL;
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (double*)(multisteps->hist_data);
+#endif
+
+	double recip_realPrecision = 1/realPrecision;
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_3D_opt(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;
+
+	size_t i,j,k;
+	int reqLength;
+	double pred1D, pred2D, pred3D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+
+	size_t dataLength = r1*r2*r3;
+
+	size_t r23 = r2*r3;
+
+	/* P1 holds the previous reconstructed layer, P0 the layer being built.
+	 * Element 1 of layer 0 is written unconditionally below, so the buffers
+	 * must hold at least two values even when r2*r3 == 1; sizing them with r23
+	 * alone overflowed the heap in that case. */
+	size_t planeBufLen = (r23 < 2) ? 2 : r23;
+	P0 = (double*)malloc(planeBufLen*sizeof(double));
+	P1 = (double*)malloc(planeBufLen*sizeof(double));
+
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	/* type[1] is written unconditionally as well, hence the minimum of two entries. */
+	size_t typeBufLen = (dataLength < 2) ? 2 : dataLength;
+	int* type = (int*) malloc(typeBufLen*sizeof(int));
+
+	double* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	/* Bytes of the previously stored exact value, used by updateLossyCompElement_Double(). */
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0: no prediction available, always stored exactly */
+	type[0] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+#ifdef HAVE_TIMECMPR
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[0] = P1[0];
+#endif
+
+	/* Process Row-0 data 1: predicted by the previous element.
+	 * NOTE(review): spaceFillingValue[1] is read even when dataLength == 1;
+	 * callers are expected to pass at least two values. */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum = fabs(diff)*recip_realPrecision + 1;
+
+	if (itvNum < quantization_intervals)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - intvRadius) * realPrecision;
+	}
+	else
+	{
+		type[1] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+#ifdef HAVE_TIMECMPR
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = P1[1];
+#endif
+
+	/* Process Row-0 data 2 --> data r3-1: linear extrapolation from the two previous elements */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[j], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[j] = P1[j];
+#endif
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0: predicted by the element directly above */
+		index = i*r3;
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P1[index];
+#endif
+
+		/* Process row-i data 1 --> data r3-1: left + above - above-left */
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P1[index];
+#endif
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0: predicted by the same position in the previous layer */
+		index = k*r23;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P0[0];
+#endif
+
+		/* Process Row-0 data 1 --> data r3-1: 2D prediction across the layer boundary */
+		for (j = 1; j < r3; j++)
+		{
+			/* index walks k*r2*r3 + j */
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[j];
+#endif
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0: above (this layer) + same position (previous layer) - above (previous layer) */
+			index = k*r23 + i*r3;
+			index2D = i*r3;
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[index2D];
+#endif
+
+			/* Process Row-i data 1 --> data r3-1: full 3D predictor */
+			for (j = 1; j < r3; j++)
+			{
+				/* index walks k*r2*r3 + i*r3 + j */
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = fabs(diff)*recip_realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					decData[index] = P0[index2D];
+#endif
+			}
+		}
+
+		/* The layer just built becomes the "previous layer" of the next iteration. */
+		double *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	/* Both layer buffers are always valid allocations now, so both are released. */
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //only the wrapper is freed here; exactMidByteArray->array is released in free_TightDataPointStorageD(tdps)
+
+	return tdps;
+}
+
+
+/**
+ * Compress a 3D double array and serialize the result into *newByteData.
+ *
+ * Outside temporal mode the data always goes through SZ_compress_double_3D_MDQ().
+ * In temporal mode (HAVE_TIMECMPR builds only) cmprType picks either a
+ * time-series step or a snapshot step; a snapshot uses the regression-based
+ * compressor, which writes *newByteData / *outSize itself, unless
+ * withRegression is SZ_NO_REGRESSION.
+ *
+ * @return 0 for snapshot-based compression, 1 for time-series based compression
+ */
+char SZ_compress_args_double_NoCkRngeNoGzip_3D(int cmprType, unsigned char** newByteData, double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d)
+{
+	TightDataPointStorageD* storage = NULL;
+	char resultKind = 0;
+	size_t nbEle = r1*r2*r3;
+
+#ifdef HAVE_TIMECMPR
+	if(confparams_cpr->szMode != SZ_TEMPORAL_COMPRESSION)
+	{
+		storage = SZ_compress_double_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_d);
+	}
+	else
+	{
+		int curStep = sz_tsc->currentStep;
+		int wantSnapshot = 0;
+		int wantTimeSeries = 0;
+
+		/* First decide which kind of step this is ... */
+		if(cmprType == SZ_PERIO_TEMPORAL_COMPRESSION)
+		{
+			if(curStep % confparams_cpr->snapshotCmprStep != 0)
+				wantTimeSeries = 1;
+			else
+				wantSnapshot = 1;
+		}
+		else if(cmprType == SZ_FORCE_SNAPSHOT_COMPRESSION)
+			wantSnapshot = 1;
+		else if(cmprType == SZ_FORCE_TEMPORAL_COMPRESSION)
+			wantTimeSeries = 1;
+
+		/* ... then perform it. An unrecognised cmprType does nothing. */
+		if(wantTimeSeries)
+		{
+			storage = SZ_compress_double_1D_MDQ_ts(oriData, nbEle, multisteps, realPrecision, valueRangeSize, medianValue_d);
+			resultKind = 1; //time-series based compression
+		}
+		else if(wantSnapshot)
+		{
+			if(confparams_cpr->withRegression == SZ_NO_REGRESSION)
+				storage = SZ_compress_double_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_d);
+			else
+				*newByteData = SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(oriData, r1, r2, r3, realPrecision, outSize);
+			resultKind = 0; //snapshot-based compression
+			multisteps->lastSnapshotStep = curStep;
+		}
+	}
+#else
+	storage = SZ_compress_double_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_d);
+#endif
+
+	/* Nothing to serialize: either the regression path already produced the
+	 * output bytes or no compression branch was taken. */
+	if(storage == NULL)
+		return resultKind;
+
+	convertTDPStoFlatBytes_double(storage, newByteData, outSize);
+	/* Fall back to storing the original data when the compressed form is larger than the threshold. */
+	if(*outSize>3 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + 1 + sizeof(double)*nbEle)
+		SZ_compress_args_double_StoreOriData(oriData, nbEle, newByteData, outSize);
+	free_TightDataPointStorageD(storage);
+
+	return resultKind;
+}
+
+/**
+ * Compress a 4D array of doubles (dimensions r1 x r2 x r3 x r4, r4 varying
+ * fastest) into a TightDataPointStorageD by prediction followed by
+ * quantization.
+ *
+ * The array is treated as r1 independent 3D blocks of r2*r3*r4 values. Inside
+ * each block the first r3 x r4 layer is predicted with 1D/2D predictors and
+ * the remaining layers additionally use the previous reconstructed layer
+ * (3D predictor for interior values). The first value of every block is
+ * always stored exactly.
+ * If the prediction error fits into the quantization range, the value is
+ * replaced by an interval code (type[] != 0) and reconstructed as
+ * pred + 2*(code - intvRadius)*realPrecision. Otherwise type[] is 0 and the
+ * value is stored through compressSingleDoubleValue()/addExactData().
+ *
+ * @param oriData         input values
+ * @param r1              size of the highest dimension (number of 3D blocks)
+ * @param r2              number of layers per block
+ * @param r3              number of rows per layer
+ * @param r4              number of values per row (lowest dimension)
+ * @param realPrecision   quantization half-step; intervals are 2*realPrecision wide
+ * @param valueRangeSize  value range of the data, used to derive the required bit length
+ * @param medianValue_d   median value handed to computeReqLength_double()
+ * @return storage object created by new_TightDataPointStorageD(); the caller in
+ *         this file releases it with free_TightDataPointStorageD()
+ */
+TightDataPointStorageD* SZ_compress_double_4D_MDQ(double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, double valueRangeSize, double medianValue_d)
+{
+	double recip_realPrecision = 1/realPrecision;
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;
+
+	size_t i,j,k;
+	int reqLength;
+	double pred1D, pred2D, pred3D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+
+	size_t dataLength = r1*r2*r3*r4;
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	/* P1 holds the previous reconstructed layer, P0 the layer being built.
+	 * Element 1 of every block's first layer is written unconditionally below,
+	 * so the buffers must hold at least two values even when r3*r4 == 1;
+	 * sizing them with r34 alone overflowed the heap in that case. */
+	size_t planeBufLen = (r34 < 2) ? 2 : r34;
+	P0 = (double*)malloc(planeBufLen*sizeof(double));
+	P1 = (double*)malloc(planeBufLen*sizeof(double));
+
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	/* One spare entry: type[l*r234+1] is written unconditionally for every
+	 * block, which reaches index dataLength on the last block when r234 == 1. */
+	int* type = (int*) malloc((dataLength+1)*sizeof(int));
+
+	double* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	/* Bytes of the previously stored exact value, used by updateLossyCompElement_Double(). */
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: first value of the block, always stored exactly */
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[index2D] = vce->data;
+
+		/* Process Row-0 data 1: predicted by the previous element.
+		 * NOTE(review): spaceFillingValue[index] is read even when the block
+		 * holds a single value (r234 == 1). */
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[index] - pred1D;
+
+		/* NOTE(review): this is the only site dividing by realPrecision instead
+		 * of multiplying by recip_realPrecision; left as is so the produced
+		 * codes stay bit-identical. */
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + intvRadius;
+			P1[index2D] = pred1D + 2 * (type[index] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+
+		/* Process Row-0 data 2 --> data r4-1: linear extrapolation from the two previous elements */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0: predicted by the element directly above */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+
+			/* Process row-i data 1 --> data r4-1: left + above - above-left */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = fabs(diff)*recip_realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + intvRadius;
+					P1[index2D] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P1[index2D] = vce->data;
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0: predicted by the same position in the previous layer */
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P0[index2D] = pred1D + 2 * (type[index] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+
+			/* Process Row-0 data 1 --> data r4-1: 2D prediction across the layer boundary */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = fabs(diff)*recip_realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0: above (this layer) + same position (previous layer) - above (previous layer) */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = fabs(diff)*recip_realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+
+				/* Process Row-i data 1 --> data r4-1: full 3D predictor */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					diff = spaceFillingValue[index] - pred3D;
+
+
+					itvNum = fabs(diff)*recip_realPrecision + 1;
+
+					if (itvNum < quantization_intervals)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + intvRadius;
+						P0[index2D] = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
+					}
+					else
+					{
+						type[index] = 0;
+						compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+						updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+						memcpy(preDataBytes,vce->curBytes,8);
+						addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+						P0[index2D] = vce->data;
+					}
+				}
+			}
+
+			/* The layer just built becomes the "previous layer" of the next iteration. */
+			double *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //only the wrapper is freed here; exactMidByteArray->array is released in free_TightDataPointStorageD(tdps)
+
+	return tdps;
+}
+
+
+/* Compress a 4-D double array with SZ_compress_double_4D_MDQ and flatten it into *newByteData / *outSize; always returns 0. */
+char SZ_compress_args_double_NoCkRngeNoGzip_4D(unsigned char** newByteData, double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d)
+{
+	const size_t nbEle = r1*r2*r3*r4;
+	TightDataPointStorageD* storage = SZ_compress_double_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, medianValue_d);
+	convertTDPStoFlatBytes_double(storage, newByteData, outSize);
+	/* Larger than the fixed terms plus the raw doubles: store the original data instead. NOTE(review): confirm the buffer already in *newByteData is not leaked. */
+	const size_t rawLimit = 3 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + 1 + sizeof(double)*nbEle;
+	if(*outSize > rawLimit)
+		SZ_compress_args_double_StoreOriData(oriData, nbEle, newByteData, outSize);
+	free_TightDataPointStorageD(storage);
+	return 0;
+}
+
+/*
+ * MSST19 1-D compression of a double array.
+ * Each value is either predicted from the previous reconstructed value via a
+ * ratio lookup (type[i] = nonzero state) or stored through
+ * compressSingleDoubleValue_MSST19 (type[i] = 0); the first two values are
+ * always stored. Returns the TightDataPointStorageD built from the results.
+ * valueRangeSize is not used in this function.
+ * NOTE(review): indices 0 and 1 are accessed unconditionally, so
+ * dataLength >= 2 is presumably required - confirm against callers.
+ */
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_MSST19(double *oriData, 
+size_t dataLength, double realPrecision, double valueRangeSize, double medianValue_f)
+{
+#ifdef HAVE_TIMECMPR	
+	double* decData = NULL;
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (double*)(multisteps->hist_data);
+#endif	
+	/* Number of quantization intervals: optimized from the data or taken from exe_params. */
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_double_1D_opt_MSST19(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	/* updateQuantizationInfo(quantization_intervals) is left disabled in this variant. */
+	int intvRadius = quantization_intervals/2;
+	/* precisionTable[i] = (1+realPrecision)^(inv*(i-intvRadius)): the factor applied to the prediction for state i. */
+	double* precisionTable = (double*)malloc(sizeof(double) * quantization_intervals);
+	double inv = 2.0-pow(2, -(confparams_cpr->plus_bits));
+    for(int i=0; i<quantization_intervals; i++){
+        double test = pow((1+realPrecision), inv*(i - intvRadius));
+        precisionTable[i] = test;
+    }
+	/* Build the multi-level table that is indexed below by the ratio's exponent and mantissa bits. */
+	struct TopLevelTableWideInterval levelTable;
+    MultiLevelCacheTableWideIntervalBuild(&levelTable, precisionTable, quantization_intervals, realPrecision, confparams_cpr->plus_bits);
+
+	size_t i;
+	int reqLength;
+	double medianValue = medianValue_f;
+	
+	reqLength = computeReqLength_double_MSST19(realPrecision);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+		
+	double* spaceFillingValue = oriData; // alias of the input array
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, dataLength/2/8);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, dataLength/2);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	unsigned char preDataBytes[8];
+	intToBytes_bigEndian(preDataBytes, 0); /* NOTE(review): the 3D variant uses longToBytes_bigEndian for this 8-byte buffer - confirm every byte that is read gets initialized */
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	double last3CmprsData[3] = {0};
+
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+				
+	/* The first value is always stored (type 0). */
+	type[0] = 0;
+	compressSingleDoubleValue_MSST19(vce, spaceFillingValue[0], realPrecision, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_double(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[0] = vce->data;
+#endif		
+		
+	/* The second value is always stored as well (type 0). */
+	type[1] = 0;
+	compressSingleDoubleValue_MSST19(vce, spaceFillingValue[1], realPrecision, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_double(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = vce->data;
+#endif
+	int state;
+	double curData;
+	double pred = vce->data;
+
+    double predRelErrRatio;
+
+	const uint64_t top = levelTable.topIndex, base = levelTable.baseIndex;
+	const uint64_t range = top - base;
+	const int bits = levelTable.bits;
+	uint64_t* const buffer = (uint64_t*)&predRelErrRatio;
+	const int shift = 52-bits;
+	uint64_t expoIndex, mantiIndex;
+	uint16_t* tables[range+1];
+	for(int i=0; i<=range; i++){
+		tables[i] = levelTable.subTables[i].table;
+	}
+	/* Remaining values: look up a state from curData/pred, otherwise store the value. */
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		predRelErrRatio = curData / pred;
+		/* Exponent bits of the ratio (sign masked off) relative to base; being unsigned, an exponent below base wraps and fails the range test. */
+		expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+		if(expoIndex <= range){
+			mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+			state = tables[expoIndex][mantiIndex];
+		}else{
+			state = 0;
+		}
+
+		if(state)
+		{
+			type[i] = state;
+			pred *= precisionTable[state];
+			//hit++;
+			continue;
+		}
+
+		//unpredictable data processing
+		type[i] = 0;
+		compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		pred =  vce->data;
+#ifdef HAVE_TIMECMPR
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[i] = vce->data;
+#endif	
+		
+	}//end of for
+		
+//	printf("miss:%d, hit:%d\n", miss, hit);
+
+	size_t exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageD* tdps;
+			
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+    tdps->plus_bits = confparams_cpr->plus_bits;
+	
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //only the wrapper struct is freed here; its ->array was passed to new_TightDataPointStorageD above
+	free(precisionTable);
+	freeTopLevelTableWideInterval(&levelTable);
+	return tdps;
+}
+
+/*
+ * MSST19 2-D compression of an r1 x r2 double array (index = i*r2 + j).
+ * P1 holds the previously reconstructed row and P0 the row being built; the
+ * two buffers are swapped after every row. A point is predicted as a ratio
+ * of reconstructed neighbours; if the lookup yields a nonzero state it is
+ * recorded in type[], otherwise the value is stored through
+ * compressSingleDoubleValue_MSST19 with type[] = 0.
+ * valueRangeSize is not used in this function.
+ * NOTE(review): elements 0 and 1 of row 0 are accessed unconditionally, so
+ * r2 >= 2 is presumably required - confirm against callers.
+ */
+TightDataPointStorageD* SZ_compress_double_2D_MDQ_MSST19(double *oriData, size_t r1, size_t r2, double realPrecision, double valueRangeSize, double medianValue_f)
+{
+#ifdef HAVE_TIMECMPR
+	double* decData = NULL;	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (double*)(multisteps->hist_data);
+#endif	
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_2D_opt_MSST19(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;
+	/* precisionTable[i] = (1+realPrecision)^(inv*(i-intvRadius)): the factor applied to the prediction for state i. */
+	double* precisionTable = (double*)malloc(sizeof(double) * quantization_intervals);
+	double inv = 2.0-pow(2, -(confparams_cpr->plus_bits));
+	for(int i=0; i<quantization_intervals; i++){
+		double test = pow((1+realPrecision), inv*(i - intvRadius));
+		precisionTable[i] = test;
+	}
+	struct TopLevelTableWideInterval levelTable;
+	MultiLevelCacheTableWideIntervalBuild(&levelTable, precisionTable, quantization_intervals, realPrecision, confparams_cpr->plus_bits);
+
+	size_t i,j; 
+	int reqLength;
+	double pred1D, pred2D;
+	double *P0, *P1;
+	double predRelErrRatio;
+	size_t dataLength = r1*r2;	
+	P0 = (double*)malloc(r2*sizeof(double));
+	memset(P0, 0, r2*sizeof(double));
+	P1 = (double*)malloc(r2*sizeof(double));
+	memset(P1, 0, r2*sizeof(double));
+	double medianValue = medianValue_f;
+	reqLength = computeReqLength_double_MSST19(realPrecision);	
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	double* spaceFillingValue = oriData; // alias of the input array
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	type[0] = 0;
+	unsigned char preDataBytes[8];
+	intToBytes_bigEndian(preDataBytes, 0); /* NOTE(review): the 3D variant uses longToBytes_bigEndian for this 8-byte buffer - confirm every byte that is read gets initialized */
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+	/* Table geometry used below to turn the ratio's exponent/mantissa bits into a state. */
+    const uint64_t top = levelTable.topIndex, base = levelTable.baseIndex;
+    const uint64_t range = top - base;
+    const int bits = levelTable.bits;
+    uint64_t* const buffer = (uint64_t*)&predRelErrRatio;
+    const int shift = 52-bits;
+    uint64_t expoIndex, mantiIndex;
+    uint16_t* tables[range+1];
+    for(int i=0; i<=range; i++){
+        tables[i] = levelTable.subTables[i].table;
+    }
+			
+	/* Process Row-0 data 0*/
+	type[0] = 0;
+	compressSingleDoubleValue_MSST19(vce, spaceFillingValue[0], realPrecision, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[0] = vce->data;
+#endif	
+
+	double curData;
+	int state;
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+
+	curData = spaceFillingValue[1];
+	predRelErrRatio = curData / pred1D;
+	/* Exponent bits of the ratio (sign masked off) relative to base; being unsigned, an exponent below base wraps and fails the range test. */
+	expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+	if(expoIndex <= range){
+		mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+		state = tables[expoIndex][mantiIndex];
+	}else{
+		state = 0;
+	}
+
+	if (state)
+	{
+		type[1] = state;
+		P1[1] = fabs(pred1D) * precisionTable[state];
+	}
+	else
+	{
+		type[1] = 0;
+		compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = P1[1];
+#endif
+
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = P1[j-1] * P1[j-1] / P1[j-2];
+		curData = spaceFillingValue[j];
+		predRelErrRatio = curData / pred1D;
+
+		expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+		if(expoIndex <= range){
+			mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+			state = tables[expoIndex][mantiIndex];
+		}else{
+			state = 0;
+		}
+
+		if (state)
+		{
+			type[j] = state;
+			P1[j] = fabs(pred1D) * precisionTable[state];
+		}
+		else
+		{
+			type[j] = 0;
+			compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[j] = P1[j];
+#endif		
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		index = i*r2;
+		pred1D = P1[0];
+		curData = spaceFillingValue[index];
+		predRelErrRatio = curData / pred1D;
+
+		expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+		if(expoIndex <= range){
+			mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+			state = tables[expoIndex][mantiIndex];
+		}else{
+			state = 0;
+		}
+
+		if (state)
+		{
+			type[index] = state;
+			P0[0] = fabs(pred1D) * precisionTable[state];
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P0[0];
+#endif
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] * P1[j] / P1[j-1];
+
+			curData = spaceFillingValue[index];
+			predRelErrRatio = curData / pred2D;
+
+			expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+			if(expoIndex <= range){
+				mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+				state = tables[expoIndex][mantiIndex];
+			}else{
+				state = 0;
+			}
+
+			if (state)
+			{
+				type[index] = state;
+				P0[j] = fabs(pred2D) * precisionTable[state];
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[j];
+#endif			
+		}
+
+		double *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	
+	if(r2!=1) /* NOTE(review): P0 is not released when r2 == 1 - confirm this is intended */
+		free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageD* tdps;
+			
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+	tdps->plus_bits = confparams_cpr->plus_bits;
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //only the wrapper struct is freed here; its ->array was passed to new_TightDataPointStorageD above
+	free(precisionTable);
+	freeTopLevelTableWideInterval(&levelTable);
+	return tdps;	
+}
+
+/*
+ * MSST19 3-D compression of an r1 x r2 x r3 double array (index = k*r2*r3 + i*r3 + j).
+ * Layer 0 is reconstructed into P1; for every later layer P1 is the previous
+ * reconstructed layer and P0 the one being built, swapped after each layer.
+ * A point is predicted as a ratio of reconstructed neighbours; a nonzero
+ * lookup state is recorded in type[], otherwise the value is stored through
+ * compressSingleDoubleValue_MSST19 with type[] = 0.
+ * valueRangeSize is not used in this function.
+ * NOTE(review): elements 0 and 1 of the first row are accessed
+ * unconditionally, so r3 >= 2 is presumably required - confirm with callers.
+ */
+TightDataPointStorageD* SZ_compress_double_3D_MDQ_MSST19(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, double valueRangeSize, double medianValue_f)
+{
+#ifdef HAVE_TIMECMPR	
+	double* decData = NULL;
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (double*)(multisteps->hist_data);
+#endif		
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_3D_opt_MSST19(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;
+	/* precisionTable[i] = (1+realPrecision)^(inv*(i-intvRadius)): the factor applied to the prediction for state i. */
+    double* precisionTable = (double*)malloc(sizeof(double) * quantization_intervals);
+    double inv = 2.0-pow(2, -(confparams_cpr->plus_bits));
+    for(int i=0; i<quantization_intervals; i++){
+        double test = pow((1+realPrecision), inv*(i - intvRadius));
+        precisionTable[i] = test;
+    }
+    struct TopLevelTableWideInterval levelTable;
+    MultiLevelCacheTableWideIntervalBuild(&levelTable, precisionTable, quantization_intervals, realPrecision, confparams_cpr->plus_bits);
+
+    size_t i,j,k;
+	int reqLength;
+	double pred1D, pred2D, pred3D;
+	double *P0, *P1;
+    double predRelErrRatio;
+
+	size_t dataLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	P0 = (double*)malloc(r23*sizeof(double));
+	P1 = (double*)malloc(r23*sizeof(double));
+
+	double medianValue = medianValue_f;
+	reqLength = computeReqLength_double_MSST19(realPrecision);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	double* spaceFillingValue = oriData; // alias of the input array
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+	/* Table geometry used below to turn the ratio's exponent/mantissa bits into a state. */
+    const uint64_t top = levelTable.topIndex, base = levelTable.baseIndex;
+    const uint64_t range = top - base;
+    const int bits = levelTable.bits;
+    uint64_t* const buffer = (uint64_t*)&predRelErrRatio;
+    const int shift = 52-bits;
+    uint64_t expoIndex, mantiIndex;
+    uint16_t* tables[range+1];
+    for(int i=0; i<=range; i++){
+        tables[i] = levelTable.subTables[i].table;
+    }
+    int state;
+
+    double temp, temp2;
+
+    ///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+	type[0] = 0;
+	compressSingleDoubleValue_MSST19(vce, spaceFillingValue[0], realPrecision, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+	//miss++;
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[0] = P1[0];
+#endif
+
+	double curData;
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	curData = spaceFillingValue[1];
+    predRelErrRatio = curData / pred1D;
+	/* Exponent bits of the ratio (sign masked off) relative to base; being unsigned, an exponent below base wraps and fails the range test. */
+    expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+    if(expoIndex <= range){
+        mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+        state = tables[expoIndex][mantiIndex];
+    }else{
+        state = 0;
+    }
+
+	if (state)
+	{
+		type[1] = state;
+		P1[1] = fabs(pred1D) * precisionTable[state];
+		//hit++;
+	}
+	else
+	{
+		type[1] = 0;
+		compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+		//miss++;
+	}
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = P1[1];
+#endif
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		temp = P1[j-1];
+		pred1D = temp * temp / P1[j-2];
+		curData = spaceFillingValue[j];
+        predRelErrRatio = curData / pred1D;
+
+        expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+        if(expoIndex <= range){
+            mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+            state = tables[expoIndex][mantiIndex];
+        }else{
+            state = 0;
+        }
+
+        if (state)
+		{
+			type[j] = state;
+			P1[j] = fabs(pred1D) * precisionTable[state];
+			//hit++;
+		}
+		else
+		{
+			type[j] = 0;
+			compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);;
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+			//miss++;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[j] = P1[j];
+#endif		
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		curData = spaceFillingValue[index];
+        predRelErrRatio = curData / pred1D;
+
+        expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+        if(expoIndex <= range){
+            mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+            state = tables[expoIndex][mantiIndex];
+        }else{
+            state = 0;
+        }
+
+		if (state)
+		{
+			type[index] = state;
+			P1[index] = pred1D * precisionTable[state]; /* NOTE(review): no fabs() here, unlike the other predicted branches - confirm */
+			//hit++;
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index] = vce->data;
+			//miss++;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P1[index];
+#endif		
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			temp = P1[index-1];
+			pred2D = temp * P1[index-r3] / P1[index-r3-1];
+
+			curData = spaceFillingValue[index];
+            predRelErrRatio = curData / pred2D;
+
+            expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+            if(expoIndex <= range){
+                mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+                state = tables[expoIndex][mantiIndex];
+            }else{
+                state = 0;
+            }
+
+			if (state)
+			{
+				type[index] = state;
+				P1[index] = fabs(pred2D) * precisionTable[state];
+				//hit++;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index] = vce->data;
+				//miss++;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P1[index];
+#endif			
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		pred1D = P1[0];
+		curData = spaceFillingValue[index];
+        predRelErrRatio = curData / pred1D;
+
+        expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+        if(expoIndex <= range){
+            mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+            state = tables[expoIndex][mantiIndex];
+        }else{
+            state = 0;
+        }
+
+		if (state)
+		{
+			type[index] = state;
+			P0[0] = fabs(pred1D) * precisionTable[state];
+			//hit++;
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+			//miss++;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P0[0];
+#endif
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			temp = P0[j-1];
+			pred2D = temp * P1[j] / P1[j-1];
+			curData = spaceFillingValue[index];
+            predRelErrRatio = curData / pred2D;
+
+            expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+            if(expoIndex <= range){
+                mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+                state = tables[expoIndex][mantiIndex];
+            }else{
+                state = 0;
+            }
+
+			if (state)
+			{
+				type[index] = state;
+				P0[j] = fabs(pred2D) * precisionTable[state];
+				//hit++;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+				//miss++;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[j];
+#endif			
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			index2D = i*r3;
+			temp = P0[index2D-r3];
+			pred2D = temp * P1[index2D] / P1[index2D-r3];
+			curData = spaceFillingValue[index];
+            predRelErrRatio = curData / pred2D;
+
+            expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+            if(expoIndex <= range){
+                mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+                state = tables[expoIndex][mantiIndex];
+            }else{
+                state = 0;
+            }
+
+			if (state)
+			{
+				type[index] = state;
+				P0[index2D] = fabs(pred2D) * precisionTable[state];
+				//hit++;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+				//miss++;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[index2D];
+#endif			
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+				index ++;
+				index2D = i*r3 + j;
+				//pred3D = P0[index2D-1] * P0[index2D-r3] * P1[index2D] / P0[index2D-r3-1] / P1[index2D-r3] / P1[index2D-1] * P1[index2D-r3-1];
+				temp = P0[index2D-1];
+				temp2 = P0[index2D-r3-1];
+                pred3D = temp * P0[index2D-r3] * P1[index2D] * P1[index2D-r3-1] / (temp2 * P1[index2D-r3] * P1[index2D-1]);
+
+				curData = spaceFillingValue[index];
+                predRelErrRatio = curData / pred3D;
+
+                expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+                if(expoIndex <= range){
+                    mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+                    state = tables[expoIndex][mantiIndex];
+                }else{
+                    state = 0;
+                }
+
+				if (state)
+				{
+					type[index] = state;
+					P0[index2D] = fabs(pred3D) * precisionTable[state];
+					//hit++;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleDoubleValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+					//miss++;
+				}
+#ifdef HAVE_TIMECMPR	
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					decData[index] = P0[index2D];
+#endif				
+			}
+		}
+
+		double *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1) /* NOTE(review): P0 is not released when r2*r3 == 1 - confirm this is intended */
+		free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+	tdps->plus_bits = confparams_cpr->plus_bits;
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //only the wrapper struct is freed here; its ->array was passed to new_TightDataPointStorageD above
+	free(precisionTable);
+	freeTopLevelTableWideInterval(&levelTable);	
+	return tdps;	
+}
+/* Build an "all same data" container that records only oriData[0] and serialize it into *newByteData / *outSize. */
+void SZ_compress_args_double_withinRange(unsigned char** newByteData, double *oriData, size_t dataLength, size_t *outSize)
+{
+	size_t flatSize;
+	TightDataPointStorageD* storage = (TightDataPointStorageD*) malloc(sizeof(TightDataPointStorageD));
+	/* Flag the series as constant and record its length. */
+	storage->allSameData = 1;
+	storage->isLossless = 0;
+	storage->dataSeriesLength = dataLength;
+
+	/* No per-point arrays are attached in this mode. */
+	storage->typeArray = NULL;
+	storage->rtypeArray = NULL;
+	storage->leadNumArray = NULL;
+	storage->residualMidBits = NULL;
+	storage->pwrErrBoundBytes = NULL;
+
+	/* The only payload is the 8-byte encoding of the first value. */
+	storage->exactMidBytes = (unsigned char*)malloc(sizeof(unsigned char)*8);
+	storage->exactMidBytes_size = 8;
+	doubleToBytes(storage->exactMidBytes, oriData[0]);
+
+	/* Serialize, report the produced size, then release the temporary storage. */
+	convertTDPStoFlatBytes_double(storage, newByteData, &flatSize);
+	*outSize = flatSize;
+	free_TightDataPointStorageD(storage);
+}
+
+/*int SZ_compress_args_double_wRngeNoGzip(unsigned char** newByteData, double *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwrErrRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	double valueRangeSize = 0, medianValue = 0;
+	
+	double min = computeRangeSize_double(oriData, dataLength, &valueRangeSize, &medianValue);
+	double max = min+valueRangeSize;
+	double realPrecision = getRealPrecision_double(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_double_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		if(r5==0&&r4==0&&r3==0&&r2==0)
+		{
+			if(errBoundMode>=PW_REL)
+			{
+				SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr(newByteData, oriData, pwrErrRatio, r1, outSize, min, max);
+				//SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(newByteData, oriData, r1, absErr_Bound, relBoundRatio, pwrErrRatio, valueRangeSize, medianValue, outSize);				
+			}
+			else
+				SZ_compress_args_double_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, medianValue);
+		}
+		else if(r5==0&&r4==0&&r3==0)
+		{
+			if(errBoundMode>=PW_REL)
+				SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr(newByteData, oriData, realPrecision, r2, r1, outSize, min, max);
+			else
+				SZ_compress_args_double_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
+		}
+		else if(r5==0&&r4==0)
+		{
+			if(errBoundMode>=PW_REL)
+				SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr(newByteData, oriData, realPrecision, r3, r2, r1, outSize, min, max);
+			else
+				SZ_compress_args_double_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
+		}
+		else if(r5==0)
+		{
+			if(errBoundMode>=PW_REL)
+				SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr(newByteData, oriData, realPrecision, r4*r3, r2, r1, outSize, min, max);
+			else
+				SZ_compress_args_double_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
+		}
+	}
+	return status;
+}*/
+
+/**
+ * Compress an array of doubles according to the requested error-bound mode.
+ *
+ * The function records the error-bound settings in the global confparams_cpr
+ * (errorBoundMode, pw_relBoundRatio, accelerate_pw_rel_compression, dmin, dmax,
+ * absErrBound), measures the value range of the data, derives the absolute
+ * precision and dispatches to the compressor matching the dimension sizes.
+ * Unless szMode is SZ_BEST_SPEED the intermediate bytes are then passed
+ * through sz_lossless_compress().
+ *
+ * @param cmprType         forwarded unchanged to the 1D/2D/3D NoCkRngeNoGzip compressors
+ * @param withRegression   SZ_NO_REGRESSION selects the NoCkRngeNoGzip compressors for 2D and
+ *                         above; any other value selects the *_with_blocked_regression ones
+ * @param newByteData      [out] receives the compressed buffer (NULL on a dimension error)
+ * @param oriData          input data
+ * @param r5,r4,r3,r2,r1   dimension sizes; an unused higher dimension is 0
+ * @param outSize          [out] size of the compressed buffer (0 on a dimension error)
+ * @param errBoundMode     error-bound mode, stored into confparams_cpr->errorBoundMode
+ * @param absErr_Bound     forwarded to getRealPrecision_double()
+ * @param relBoundRatio    forwarded to getRealPrecision_double()
+ * @param pwRelBoundRatio  point-wise relative bound; stored in confparams_cpr when errBoundMode==PW_REL
+ * @return SZ_SCES unless getRealPrecision_double() changed the status, SZ_DERR when all five
+ *         dimensions are non-zero, SZ_MERR when confparams_cpr->szMode is not a known mode.
+ */
+int SZ_compress_args_double(int cmprType, int withRegression, unsigned char** newByteData, double *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	if(errBoundMode==PW_REL)
+	{
+		confparams_cpr->pw_relBoundRatio = pwRelBoundRatio;	
+	}
+		
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	
+	//Too few elements: hand the data to SZ_skip_compress_double() instead of compressing it.
+	if(dataLength <= MIN_NUM_OF_ELEMENTS)
+	{
+		*newByteData = SZ_skip_compress_double(oriData, dataLength, outSize);
+		return status;
+	}
+	
+	double valueRangeSize = 0, medianValue = 0;
+	
+	unsigned char * signs = NULL;
+	bool positive = true;
+	double nearZero = 0.0;
+	double min = 0;
+	//A very small point-wise ratio switches the accelerated path off (note: this edits the global configuration).
+	if(pwRelBoundRatio < 0.000009999)
+		confparams_cpr->accelerate_pw_rel_compression = 0;
+		
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression == 1)
+	{
+		signs = (unsigned char *) malloc(dataLength);
+		memset(signs, 0, dataLength);
+		min = computeRangeSize_double_MSST19(oriData, dataLength, &valueRangeSize, &medianValue, signs, &positive, &nearZero);
+	}
+	else
+		min = computeRangeSize_double(oriData, dataLength, &valueRangeSize, &medianValue);	
+	double max = min+valueRangeSize;
+	confparams_cpr->dmin = min;
+	confparams_cpr->dmax = max;
+	
+	double realPrecision = 0; 
+	
+	//PSNR and NORM are converted into an equivalent absolute error bound.
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, valueRangeSize);
+	}
+	else if(confparams_cpr->errorBoundMode==NORM) //norm error = sqrt(sum((xi-xi_)^2))
+	{
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromNORM_ERR(confparams_cpr->normErr, dataLength);
+		//printf("realPrecision=%lf\n", realPrecision);				
+	}	
+	else
+	{
+		realPrecision = getRealPrecision_double(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		confparams_cpr->absErrBound = realPrecision;
+	}	
+	if(valueRangeSize <= realPrecision)
+	{
+		//The whole range fits inside the error bound: one value represents the series.
+		if(confparams_cpr->errorBoundMode>=PW_REL && confparams_cpr->accelerate_pw_rel_compression == 1)
+			free(signs);		
+		SZ_compress_args_double_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL; //stays NULL until one of the compressors below produces a buffer
+		
+		//signs is only ever handed to the *_MSST19 compressors (presumably they release it - confirm).
+		//When that path will not be taken nobody else receives the buffer, so release it here.
+		if(signs != NULL && !(confparams_cpr->accelerate_pw_rel_compression && confparams_cpr->maxRangeRadius <= 32768))
+		{
+			free(signs);
+			signs = NULL;
+		}
+		
+		if (r2==0)
+		{
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+			{
+				if(confparams_cpr->accelerate_pw_rel_compression && confparams_cpr->maxRangeRadius <= 32768)
+					SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_pre_log_MSST19(&tmpByteData, oriData, pwRelBoundRatio, r1, &tmpOutSize, valueRangeSize, medianValue, signs, &positive, min, max, nearZero);
+				else
+					SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r1, &tmpOutSize, min, max);
+					//SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(&tmpByteData, oriData, r1, absErr_Bound, relBoundRatio, pwRelBoundRatio, valueRangeSize, medianValue, &tmpOutSize);
+			}
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					multisteps->compressionType = SZ_compress_args_double_NoCkRngeNoGzip_1D(cmprType, &tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+					{
+						SZ_compress_args_double_NoCkRngeNoGzip_1D(cmprType, &tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+						//If the result is not smaller than the raw data plus header, store the original data instead.
+						if(tmpOutSize>=dataLength*sizeof(double) + 3 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + 1)
+							SZ_compress_args_double_StoreOriData(oriData, dataLength, &tmpByteData, &tmpOutSize);
+					}
+		}
+		else
+		if (r3==0)
+		{
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+			{
+				if(confparams_cpr->accelerate_pw_rel_compression && confparams_cpr->maxRangeRadius <= 32768)
+					SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr_pre_log_MSST19(&tmpByteData, oriData, pwRelBoundRatio, r2, r1, &tmpOutSize, valueRangeSize, signs, &positive, min, max, nearZero);
+				else
+					SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r2, r1, &tmpOutSize, min, max);
+			}
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)			
+					multisteps->compressionType = SZ_compress_args_double_NoCkRngeNoGzip_2D(cmprType, &tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+				{	
+					if(withRegression == SZ_NO_REGRESSION)
+						SZ_compress_args_double_NoCkRngeNoGzip_2D(cmprType, &tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+					else 
+					{
+						tmpByteData = SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(oriData, r2, r1, realPrecision, &tmpOutSize);
+						if(tmpOutSize>=dataLength*sizeof(double) + 3 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + 1)
+							SZ_compress_args_double_StoreOriData(oriData, dataLength, &tmpByteData, &tmpOutSize);
+					}
+				}
+		}
+		else
+		if (r4==0)
+		{
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+			{
+				if(confparams_cpr->accelerate_pw_rel_compression && confparams_cpr->maxRangeRadius <= 32768)
+					SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log_MSST19(&tmpByteData, oriData, pwRelBoundRatio, r3, r2, r1, &tmpOutSize, valueRangeSize, signs, &positive, min, max, nearZero);
+				else
+					SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r3, r2, r1, &tmpOutSize, min, max);
+			}
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					multisteps->compressionType = SZ_compress_args_double_NoCkRngeNoGzip_3D(cmprType, &tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+				{
+					if(withRegression == SZ_NO_REGRESSION)
+						SZ_compress_args_double_NoCkRngeNoGzip_3D(cmprType, &tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+					else 
+					{
+						tmpByteData = SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(oriData, r3, r2, r1, realPrecision, &tmpOutSize);
+						if(tmpOutSize>=dataLength*sizeof(double) + 3 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + 1)
+							SZ_compress_args_double_StoreOriData(oriData, dataLength, &tmpByteData, &tmpOutSize);
+					}
+				}
+		}
+		else
+		if (r5==0)
+		{
+			//4D data: the point-wise and regression paths fold r4 and r3 into one dimension and reuse the 3D compressors.
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+			{
+				if(confparams_cpr->accelerate_pw_rel_compression && confparams_cpr->maxRangeRadius <= 32768)
+					SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log_MSST19(&tmpByteData, oriData, pwRelBoundRatio, r4*r3, r2, r1, &tmpOutSize, valueRangeSize, signs, &positive, min, max, nearZero);
+				else
+					SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r4*r3, r2, r1, &tmpOutSize, min, max);
+			}
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)			
+					multisteps->compressionType = SZ_compress_args_double_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif	
+				{
+					if(withRegression == SZ_NO_REGRESSION)
+						SZ_compress_args_double_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+					else 
+					{
+						tmpByteData = SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(oriData, r4*r3, r2, r1, realPrecision, &tmpOutSize);								
+						if(tmpOutSize>=dataLength*sizeof(double) + 3 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + 1)
+							SZ_compress_args_double_StoreOriData(oriData, dataLength, &tmpByteData, &tmpOutSize);
+					}
+				}
+		
+		}
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			//No compressor ran, so there is no intermediate buffer to post-process. Return here
+			//instead of falling through to the lossless stage with an unset pointer.
+			free(signs); //never handed to a compressor on this path; free(NULL) is a no-op
+			*newByteData = NULL;
+			*outSize = 0;
+			return SZ_DERR;
+		}
+				
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;			
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION || confparams_cpr->szMode==SZ_TEMPORAL_COMPRESSION)
+		{
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the double compression.\n");
+			free(tmpByteData); //the intermediate buffer is not handed to the caller on this path
+			status = SZ_MERR;	
+		}
+	}
+
+	return status;
+}
+
+//TODO
+/**
+ * Compress a rectangular sub-block of a double array into a caller-provided buffer.
+ *
+ * The value range of the sub-block is measured with computeRangeSize_double_subblock(),
+ * the precision is derived with getRealPrecision_double(), and the work is delegated to
+ * the 1D/2D/3D/4D *_NoCkRnge_*_subblock routine selected by the first zero dimension.
+ *
+ * @param compressedBytes  output buffer, forwarded to the per-dimension routine
+ * @param oriData          the full data array
+ * @param r5..r1           dimension sizes of the full array; an unused higher dimension is 0
+ * @param s5..s1           start indices of the sub-block
+ * @param e5..e1           end indices of the sub-block
+ * @param outSize          [out] forwarded to the per-dimension routine
+ * @param errBoundMode     error-bound mode; point-wise relative bounds are not supported here
+ * @param absErr_Bound     forwarded to getRealPrecision_double()
+ * @param relBoundRatio    forwarded to getRealPrecision_double()
+ * @return the status left by getRealPrecision_double() (initially SZ_SCES), or SZ_DERR when
+ *         all five dimensions are non-zero.
+ */
+int SZ_compress_args_double_subblock(unsigned char* compressedBytes, double *oriData,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1,
+size_t *outSize, int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	double medianValue = 0;
+	double valueRangeSize = 0;
+	double realPrecision;
+
+	computeRangeSize_double_subblock(oriData, &valueRangeSize, &medianValue, r5, r4, r3, r2, r1, s5, s4, s3, s2, s1, e5, e4, e3, e2, e1);
+	realPrecision = getRealPrecision_double(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		//TODO: SZ_compress_args_double_withinRange_subblock() does not exist yet, so nothing is written in this case.
+		return status;
+	}
+
+	if(r2==0)
+	{
+		//1D
+		if(errBoundMode!=PW_REL)
+			SZ_compress_args_double_NoCkRnge_1D_subblock(compressedBytes, oriData, realPrecision, outSize, valueRangeSize, medianValue, r1, s1, e1);
+		else
+			printf ("Current subblock version does not support point-wise relative error bound.\n"); //TODO: 1D pwr subblock
+	}
+	else if(r3==0)
+	{
+		//2D (this branch rejects every mode at or above PW_REL, not only PW_REL itself)
+		if(errBoundMode<PW_REL)
+			SZ_compress_args_double_NoCkRnge_2D_subblock(compressedBytes, oriData, realPrecision, outSize, valueRangeSize, medianValue, r2, r1, s2, s1, e2, e1);
+		else
+			printf ("Current subblock version does not support point-wise relative error bound.\n"); //TODO: 2D pwr subblock
+	}
+	else if(r4==0)
+	{
+		//3D
+		if(errBoundMode!=PW_REL)
+			SZ_compress_args_double_NoCkRnge_3D_subblock(compressedBytes, oriData, realPrecision, outSize, valueRangeSize, medianValue, r3, r2, r1, s3, s2, s1, e3, e2, e1);
+		else
+			printf ("Current subblock version does not support point-wise relative error bound.\n"); //TODO: 3D pwr subblock
+	}
+	else if(r5==0)
+	{
+		//4D
+		if(errBoundMode!=PW_REL)
+			SZ_compress_args_double_NoCkRnge_4D_subblock(compressedBytes, oriData, realPrecision, outSize, valueRangeSize, medianValue, r4, r3, r2, r1, s4, s3, s2, s1, e4, e3, e2, e1);
+		else
+			printf ("Current subblock version does not support point-wise relative error bound.\n"); //TODO: 4D pwr subblock
+	}
+	else
+	{
+		printf("Error: doesn't support 5 dimensions for now.\n");
+		status = SZ_DERR; //dimension error
+	}
+
+	return status;
+}
+
+/**
+ * Compress a 1D sub-block (elements s1..e1) and write the result into compressedBytes.
+ *
+ * SZ_BEST_SPEED writes the flattened storage directly into compressedBytes; the two
+ * compression modes flatten into a temporary buffer and run zlib_compress3() on it.
+ * Any other szMode only prints an error and leaves *outSize untouched.
+ */
+void SZ_compress_args_double_NoCkRnge_1D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r1, size_t s1, size_t e1)
+{
+	TightDataPointStorageD* storage = SZ_compress_double_1D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_d, r1, s1, e1);
+
+	if (confparams_cpr->szMode==SZ_BEST_SPEED)
+	{
+		convertTDPStoFlatBytes_double_args(storage, compressedBytes, outSize);
+	}
+	else if(confparams_cpr->szMode!=SZ_BEST_COMPRESSION && confparams_cpr->szMode!=SZ_DEFAULT_COMPRESSION)
+	{
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the double compression.\n");
+	}
+	else
+	{
+		size_t flatSize;
+		unsigned char *flatBytes;
+		convertTDPStoFlatBytes_double(storage, &flatBytes, &flatSize);
+		*outSize = zlib_compress3(flatBytes, flatSize, compressedBytes, confparams_cpr->gzipMode);
+		free(flatBytes);
+	}
+
+	//TODO: fall back to SZ_compress_args_double_StoreOriData() when *outSize exceeds dataLength*sizeof(double).
+
+	free_TightDataPointStorageD(storage);
+}
+
+/**
+ * Compress a 2D sub-block and write the result into compressedBytes.
+ *
+ * SZ_BEST_SPEED writes the flattened storage directly into compressedBytes; the two
+ * compression modes flatten into a temporary buffer and run zlib_compress3() on it.
+ * Any other szMode only prints an error and leaves *outSize untouched.
+ */
+void SZ_compress_args_double_NoCkRnge_2D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r2, size_t r1, size_t s2, size_t s1, size_t e2, size_t e1)
+{
+	TightDataPointStorageD* storage = SZ_compress_double_2D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_d, r2, r1, s2, s1, e2, e1);
+
+	if (confparams_cpr->szMode==SZ_BEST_SPEED)
+	{
+		convertTDPStoFlatBytes_double_args(storage, compressedBytes, outSize);
+	}
+	else if(confparams_cpr->szMode!=SZ_BEST_COMPRESSION && confparams_cpr->szMode!=SZ_DEFAULT_COMPRESSION)
+	{
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the double compression.\n");
+	}
+	else
+	{
+		size_t flatSize;
+		unsigned char *flatBytes;
+		convertTDPStoFlatBytes_double(storage, &flatBytes, &flatSize);
+		*outSize = zlib_compress3(flatBytes, flatSize, compressedBytes, confparams_cpr->gzipMode);
+		free(flatBytes);
+	}
+
+	//TODO: fall back to SZ_compress_args_double_StoreOriData() when *outSize exceeds dataLength*sizeof(double).
+
+	free_TightDataPointStorageD(storage);
+}
+
+/**
+ * Compress a 3D sub-block and write the result into compressedBytes.
+ *
+ * SZ_BEST_SPEED writes the flattened storage directly into compressedBytes; the two
+ * compression modes flatten into a temporary buffer and run zlib_compress3() on it.
+ * Any other szMode only prints an error and leaves *outSize untouched.
+ */
+void SZ_compress_args_double_NoCkRnge_3D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r3, size_t r2, size_t r1, size_t s3, size_t s2, size_t s1, size_t e3, size_t e2, size_t e1)
+{
+	TightDataPointStorageD* storage = SZ_compress_double_3D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_d, r3, r2, r1, s3, s2, s1, e3, e2, e1);
+
+	if (confparams_cpr->szMode==SZ_BEST_SPEED)
+	{
+		convertTDPStoFlatBytes_double_args(storage, compressedBytes, outSize);
+	}
+	else if(confparams_cpr->szMode!=SZ_BEST_COMPRESSION && confparams_cpr->szMode!=SZ_DEFAULT_COMPRESSION)
+	{
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the double compression.\n");
+	}
+	else
+	{
+		size_t flatSize;
+		unsigned char *flatBytes;
+		convertTDPStoFlatBytes_double(storage, &flatBytes, &flatSize);
+		*outSize = zlib_compress3(flatBytes, flatSize, compressedBytes, confparams_cpr->gzipMode);
+		free(flatBytes);
+	}
+
+	//TODO: fall back to SZ_compress_args_double_StoreOriData() when *outSize exceeds dataLength*sizeof(double).
+
+	free_TightDataPointStorageD(storage);
+}
+
+/**
+ * Compress a 4D sub-block and write the result into compressedBytes.
+ *
+ * SZ_BEST_SPEED writes the flattened storage directly into compressedBytes; the two
+ * compression modes flatten into a temporary buffer and run zlib_compress3() on it.
+ * Any other szMode only prints an error and leaves *outSize untouched.
+ */
+void SZ_compress_args_double_NoCkRnge_4D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r4, size_t r3, size_t r2, size_t r1, size_t s4, size_t s3, size_t s2, size_t s1, size_t e4, size_t e3, size_t e2, size_t e1)
+{
+	TightDataPointStorageD* storage = SZ_compress_double_4D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_d, r4, r3, r2, r1, s4, s3, s2, s1, e4, e3, e2, e1);
+
+	if (confparams_cpr->szMode==SZ_BEST_SPEED)
+	{
+		convertTDPStoFlatBytes_double_args(storage, compressedBytes, outSize);
+	}
+	else if(confparams_cpr->szMode!=SZ_BEST_COMPRESSION && confparams_cpr->szMode!=SZ_DEFAULT_COMPRESSION)
+	{
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the double compression.\n");
+	}
+	else
+	{
+		size_t flatSize;
+		unsigned char *flatBytes;
+		convertTDPStoFlatBytes_double(storage, &flatBytes, &flatSize);
+		*outSize = zlib_compress3(flatBytes, flatSize, compressedBytes, confparams_cpr->gzipMode);
+		free(flatBytes);
+	}
+
+	//TODO: fall back to SZ_compress_args_double_StoreOriData() when *outSize exceeds dataLength*sizeof(double).
+
+	free_TightDataPointStorageD(storage);
+}
+
+
+/**
+ * Estimate the number of quantization intervals for a 1D sub-block.
+ *
+ * Every sampleDistance-th element of oriData[s1..e1] is predicted from its two
+ * predecessors; the prediction errors, expressed in units of 2*realPrecision, are
+ * collected in a histogram capped at maxRangeRadius. The result is the smallest
+ * radius whose cumulative count exceeds predThreshold of the samples, doubled,
+ * passed through roundUpToPowerOf2() and clamped to at least 32.
+ *
+ * @param r1 unused
+ */
+unsigned int optimize_intervals_double_1D_subblock(double *oriData, double realPrecision, size_t r1, size_t s1, size_t e1)
+{
+	size_t blockLength = e1 - s1 + 1;
+	double *block = oriData + s1;
+
+	size_t pos;
+	unsigned long bucket;
+	double predicted = 0, predError;
+	int *histogram = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	size_t sampleCount = blockLength/confparams_cpr->sampleDistance;
+
+	pos = 2;
+	while(pos<blockLength)
+	{
+		if(pos%confparams_cpr->sampleDistance==0)
+		{
+			predicted = 2*block[pos-1] - block[pos-2];
+			predError = fabs(predicted - block[pos]);
+			bucket = (unsigned long)((predError/realPrecision+1)/2);
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1; //everything beyond the cap lands in the last bucket
+			histogram[bucket]++;
+		}
+		pos++;
+	}
+
+	//find the first radius whose cumulative count exceeds the target
+	size_t targetCount = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>targetCount)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	free(histogram);
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/**
+ * Estimate the number of quantization intervals for a 2D sub-block.
+ *
+ * Points (i,j) with s1<i<=e1 and s2<j<=e2 whose index sum is a multiple of
+ * sampleDistance are predicted from their left, upper and upper-left neighbours
+ * (row stride r2). The prediction errors, in units of 2*realPrecision, fill a
+ * histogram capped at maxRangeRadius; the result is the smallest radius whose
+ * cumulative count exceeds predThreshold of the samples, doubled, passed through
+ * roundUpToPowerOf2() and clamped to at least 32.
+ *
+ * @param r1 unused
+ */
+unsigned int optimize_intervals_double_2D_subblock(double *oriData, double realPrecision, size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2)
+{
+	size_t rows = e1 - s1 + 1;
+	size_t cols = e2 - s2 + 1;
+
+	size_t row, col, pos;
+	unsigned long bucket;
+	double predicted = 0, predError;
+	int *histogram = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	size_t sampleCount = rows*cols/confparams_cpr->sampleDistance;
+
+	for(row=s1+1;row<=e1;row++)
+	{
+		for(col=s2+1;col<=e2;col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue;
+
+			pos = row*r2+col;
+			predicted = oriData[pos-1] + oriData[pos-r2] - oriData[pos-r2-1];
+			predError = fabs(predicted - oriData[pos]);
+			bucket = (unsigned long)((predError/realPrecision+1)/2);
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1; //everything beyond the cap lands in the last bucket
+			histogram[bucket]++;
+		}
+	}
+
+	//find the first radius whose cumulative count exceeds the target
+	size_t targetCount = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>targetCount)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	free(histogram);
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/**
+ * Estimate the number of quantization intervals for a 3D sub-block.
+ *
+ * Points (i,j,k) strictly after the start index in every dimension, whose index
+ * sum is a multiple of sampleDistance, are predicted from their seven preceding
+ * neighbours (strides 1, r3 and r2*r3). The prediction errors, in units of
+ * 2*realPrecision, fill a histogram capped at maxRangeRadius; the result is the
+ * smallest radius whose cumulative count exceeds predThreshold of the samples,
+ * doubled, passed through roundUpToPowerOf2() and clamped to at least 32.
+ *
+ * @param r1 unused
+ */
+unsigned int optimize_intervals_double_3D_subblock(double *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3)
+{
+	size_t len1 = e1 - s1 + 1;
+	size_t len2 = e2 - s2 + 1;
+	size_t len3 = e3 - s3 + 1;
+
+	size_t planeStride = r2*r3;
+
+	size_t a, b, c, pos;
+	unsigned long bucket;
+	double predicted = 0, predError;
+	int *histogram = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	size_t sampleCount = len1*len2*len3/confparams_cpr->sampleDistance;
+
+	for(a=s1+1;a<=e1;a++)
+	{
+		for(b=s2+1;b<=e2;b++)
+		{
+			for(c=s3+1;c<=e3;c++)
+			{
+				if((a+b+c)%confparams_cpr->sampleDistance!=0)
+					continue;
+
+				pos = a*planeStride+b*r3+c;
+				predicted = oriData[pos-1] + oriData[pos-r3] + oriData[pos-planeStride]
+				- oriData[pos-1-planeStride] - oriData[pos-r3-1] - oriData[pos-r3-planeStride] + oriData[pos-r3-planeStride-1];
+				predError = fabs(predicted - oriData[pos]);
+				bucket = (unsigned long)((predError/realPrecision+1)/2);
+				if(bucket>=confparams_cpr->maxRangeRadius)
+					bucket = confparams_cpr->maxRangeRadius - 1; //everything beyond the cap lands in the last bucket
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	//find the first radius whose cumulative count exceeds the target
+	size_t targetCount = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>targetCount)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	free(histogram);
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/**
+ * Estimate the number of quantization intervals for a 4D sub-block.
+ *
+ * Points (i,j,k,l) strictly after the start index in every dimension, whose index
+ * sum is a multiple of sampleDistance, are predicted from seven preceding
+ * neighbours taken along the three innermost strides only (1, r4 and r3*r4); the
+ * outermost stride r2*r3*r4 is used for addressing but not for prediction. The
+ * prediction errors, in units of 2*realPrecision, fill a histogram capped at
+ * maxRangeRadius; the result is the smallest radius whose cumulative count exceeds
+ * predThreshold of the samples, doubled, passed through roundUpToPowerOf2() and
+ * clamped to at least 32.
+ *
+ * @param r1 unused
+ */
+unsigned int optimize_intervals_double_4D_subblock(double *oriData, double realPrecision,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4)
+{
+	size_t len1 = e1 - s1 + 1;
+	size_t len2 = e2 - s2 + 1;
+	size_t len3 = e3 - s3 + 1;
+	size_t len4 = e4 - s4 + 1;
+
+	size_t planeStride = r3*r4;
+	size_t cubeStride = r2*r3*r4;
+
+	size_t a, b, c, d, pos;
+	unsigned long bucket;
+	double predicted = 0, predError;
+	int *histogram = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	size_t sampleCount = len1*len2*len3*len4/confparams_cpr->sampleDistance;
+
+	for(a=s1+1;a<=e1;a++)
+	{
+		for(b=s2+1;b<=e2;b++)
+		{
+			for(c=s3+1;c<=e3;c++)
+			{
+				for(d=s4+1;d<=e4;d++)
+				{
+					if((a+b+c+d)%confparams_cpr->sampleDistance!=0)
+						continue;
+
+					pos = a*cubeStride+b*planeStride+c*r4+d;
+					predicted = oriData[pos-1] + oriData[pos-r4] + oriData[pos-planeStride]
+							- oriData[pos-1-planeStride] - oriData[pos-r4-1] - oriData[pos-r4-planeStride] + oriData[pos-r4-planeStride-1];
+					predError = fabs(predicted - oriData[pos]);
+					bucket = (unsigned long)((predError/realPrecision+1)/2);
+					if(bucket>=confparams_cpr->maxRangeRadius)
+						bucket = confparams_cpr->maxRangeRadius - 1; //everything beyond the cap lands in the last bucket
+					histogram[bucket]++;
+				}
+			}
+		}
+	}
+
+	//find the first radius whose cumulative count exceeds the target
+	size_t targetCount = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>targetCount)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	free(histogram);
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/**
+ * Quantize the 1D sub-block oriData[s1..e1] and return it as a TightDataPointStorageD.
+ *
+ * The first two elements are always stored through the exact-value path. Every later
+ * element is predicted as 2*prev - prevprev from the two most recently reconstructed
+ * values; when the prediction error is within (quantization_intervals-1)*realPrecision
+ * the element is stored as a quantization code in type[], otherwise type[] is 0 and the
+ * value goes through the exact-value path.
+ *
+ * NOTE(review): elements 0 and 1 are accessed unconditionally, so the sub-block
+ * presumably has to contain at least two elements - confirm against callers.
+ *
+ * @param oriData         full data array; the sub-block starts at oriData + s1
+ * @param realPrecision   absolute error bound used for quantization
+ * @param valueRangeSize  value range of the data, used to derive the required bit length
+ * @param medianValue_d   median value; a local copy is passed to computeReqLength_double()
+ * @param r1              only forwarded to optimize_intervals_double_1D_subblock()
+ * @param s1, e1          first and last index (inclusive) of the sub-block
+ * @return storage object created by new_TightDataPointStorageD()
+ */
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t s1, size_t e1)
+{
+	size_t dataLength = e1 - s1 + 1;
+
+	//Either estimate the interval count from the data or take the configured capacity.
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_double_1D_subblock(oriData, realPrecision, r1, s1, e1);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	//updateQuantizationInfo(quantization_intervals);
+	int intvRadius = quantization_intervals/2;
+
+	size_t i; 
+	int reqLength;
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	//One quantization code per element; 0 marks an element stored through the exact-value path.
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	double* spaceFillingValue = oriData + s1; //first element of the sub-block
+
+	//Growable arrays that collect the exactly stored values.
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	type[0] = 0;
+
+	//Bytes of the previously stored exact value; starts as all zeros.
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+
+	//Split the required bit length into whole bytes and leftover bits.
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	//Most recently reconstructed values; [0] and [1] feed the predictor below.
+	double last3CmprsData[3] = {0};
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	//add the first data
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_double(last3CmprsData, vce->data);
+
+	//add the second data
+	type[1] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_double(last3CmprsData, vce->data);
+
+	int state;
+	double checkRadius;
+	double curData;
+	double pred;
+	double predAbsErr;
+	//Largest prediction error that can still be expressed as a quantization code.
+	checkRadius = (quantization_intervals-1)*realPrecision;
+	//Width of one quantization bin.
+	double interval = 2*realPrecision;
+
+	for(i=2;i<dataLength;i++)
+	{
+		//printf("%.30G\n",last3CmprsData[0]);
+		curData = spaceFillingValue[i];
+		//Linear extrapolation from the two latest reconstructed values.
+		pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		//pred = last3CmprsData[0];
+		predAbsErr = fabs(curData - pred);
+		if(predAbsErr<=checkRadius)
+		{
+			//Predictable: store the signed bin offset relative to intvRadius and
+			//remember the reconstructed value (prediction shifted by whole bins).
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = intvRadius-state;
+				pred = pred - state*interval;
+			}
+			listAdd_double(last3CmprsData, pred);
+			continue;
+		}
+
+		//unpredictable data processing
+		type[i] = 0;
+		compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+
+		listAdd_double(last3CmprsData, vce->data);
+	}//end of for
+
+	//Number of values that went through the exact-value path.
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	//Only the wrapper struct is freed here; exactMidByteArray->array was passed to tdps above
+	//and, per the original note, is released later together with the tdps storage
+	//(presumably by free_TightDataPointStorageD - confirm).
+	free(exactMidByteArray);
+
+	return tdps;
+}
+
+
+TightDataPointStorageD* SZ_compress_double_2D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2)
+{
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_2D_subblock(oriData, realPrecision, r1, r2, s1, s2, e1, e2);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;
+
+	size_t i,j; 
+	int reqLength;
+	double pred1D, pred2D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t dataLength = R1*R2;
+
+	P0 = (double*)malloc(R2*sizeof(double));
+	memset(P0, 0, R2*sizeof(double));
+	P1 = (double*)malloc(R2*sizeof(double));
+	memset(P1, 0, R2*sizeof(double));
+
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	double* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	/* Process Row-s1 data s2*/
+	size_t gIndex;
+	size_t lIndex;
+
+	gIndex = s1*r2+s2;
+	lIndex = 0;
+
+	type[lIndex] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+
+	/* Process Row-s1 data s2+1*/
+	gIndex = s1*r2+(s2+1);
+	lIndex = 1;
+
+	pred1D = P1[0];
+	diff = spaceFillingValue[gIndex] - pred1D;
+
+	itvNum =  fabs(diff)/realPrecision + 1;
+
+	if (itvNum < quantization_intervals)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[lIndex] = (int) (itvNum/2) + intvRadius;
+		P1[1] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+	}
+	else
+	{
+		type[lIndex] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+
+    /* Process Row-s1 data s2+2 --> data e2 */
+	for (j = 2; j < R2; j++)
+	{
+		gIndex = s1*r2+(s2+j);
+		lIndex = j;
+
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P1[j] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+	}
+
+	/* Process Row-s1+1 --> Row-e1 */
+	for (i = 1; i < R1; i++)
+	{
+		/* Process row-s1+i data s2 */
+		gIndex = (s1+i)*r2+s2;
+		lIndex = i*R2;
+
+		pred1D = P1[0];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P0[0] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+
+		/* Process row-s1+i data s2+1 --> e2 */
+		for (j = 1; j < R2; j++)
+		{
+			gIndex = (s1+i)*r2+(s2+j);
+			lIndex = i*R2+j;
+
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P0[j] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+		}
+
+		double *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+
+	return tdps;
+}
+
+TightDataPointStorageD* SZ_compress_double_3D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3)
+{	/* Compresses the sub-block [s1..e1] x [s2..e2] x [s3..e3] of the r1 x r2 x r3 array oriData (dimension 3 varies fastest) and returns the storage object filled in by new_TightDataPointStorageD. */
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)	/* mode 1: derive the interval count from this sub-block's data */
+	{
+		quantization_intervals = optimize_intervals_double_3D_subblock(oriData, realPrecision, r1, r2, r3, s1, s2, s3, e1, e2, e3);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;	/* bias added to the signed quantization code before it is stored in type[] */
+	/* loop counters: k = layer (dimension 1), i = row (dimension 2), j = element within a row (dimension 3) */
+	size_t i,j,k; 
+	int reqLength;
+	double pred1D, pred2D, pred3D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+	/* extents of the sub-block along each dimension; the s* and e* bounds are both inclusive */
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t R3 = e3 - s3 + 1;
+	size_t dataLength = R1*R2*R3;
+	/* layer strides: r23 in the full array, R23 in the sub-block */
+	size_t r23 = r2*r3;
+	size_t R23 = R2*R3;
+	/* P1: reconstructed values of the most recently finished layer (the first layer is written straight into it); P0: layer being processed */
+	P0 = (double*)malloc(R23*sizeof(double));
+	P1 = (double*)malloc(R23*sizeof(double));
+	/* computeReqLength_double fills reqLength and receives medianValue by pointer, so it may adjust it */
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+	/* one quantization code per sub-block element; 0 marks a value that was stored exactly */
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	double* spaceFillingValue = oriData; //alias for the input array
+	/* growable arrays that collect the encoding of the exactly-stored values (filled by addExactData) */
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	/* bytes of the previously stored exact value, initialised from 0 and refreshed after every exact store */
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+	/* split reqLength (a bit count) into whole bytes and leftover bits */
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	/* scratch elements reused for every exactly-stored value */
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+
+	///////////////////////////	Process layer-s1 ///////////////////////////
+	/* Process Row-s2 data s3: the very first element has nothing to be predicted from and is always stored exactly */
+	size_t gIndex; 	//global index
+	size_t lIndex; 	//local index
+	size_t index2D; 	//local 2D index
+
+	gIndex = s1*r23+s2*r3+s3;
+	lIndex = 0;
+	index2D = 0;
+
+	type[lIndex] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[index2D] = vce->data;
+
+	/* Process Row-s2 data s3+1: predicted by the previous reconstructed value. NOTE(review): runs unconditionally, so R3 >= 2 appears to be assumed -- confirm with callers */
+	gIndex = s1*r23+s2*r3+s3+1;
+	lIndex = 1;
+	index2D = 1;
+
+	pred1D = P1[index2D-1];
+	diff = spaceFillingValue[gIndex] - pred1D;
+
+	itvNum = fabs(diff)/realPrecision + 1;
+	/* pattern used for every element below: quantize the prediction error if it fits the interval range, otherwise store the value exactly */
+	if (itvNum < quantization_intervals)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[lIndex] = (int) (itvNum/2) + intvRadius;
+		P1[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;	/* keep the reconstructed value, not the original, for later predictions */
+	}
+	else
+	{
+		type[lIndex] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[index2D] = vce->data;
+	}
+
+    /* Process Row-s2 data s3+2 --> data e3: linear extrapolation from the two previous reconstructed values */
+	for (j = 2; j < R3; j++)
+	{
+		gIndex = s1*r23+s2*r3+s3+j;
+		lIndex = j;
+		index2D = j;
+
+		pred1D = 2*P1[index2D-1] - P1[index2D-2];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P1[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+	}
+
+	/* Process Row-s2+1 --> Row-e2 */
+	for (i = 1; i < R2; i++)
+	{
+		/* Process row-s2+i data s3: predicted by the element directly above it (same column, previous row) */
+		gIndex = s1*r23+(s2+i)*r3+s3;
+		lIndex = i*R3;
+		index2D = i*R3;
+
+		pred1D  = P1[index2D-R3];
+		diff    = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum  = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P1[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+
+		/* Process row-s2+i data s3+1 --> data e3: 2D prediction left + above - above-left */
+		for (j = 1; j < R3; j++)
+		{
+			gIndex = s1*r23+(s2+i)*r3+s3+j;
+			lIndex = i*R3+j;
+			index2D = i*R3+j;
+
+			pred2D  = P1[index2D-1] + P1[index2D-R3] - P1[index2D-R3-1];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P1[index2D] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+		}
+	}
+
+
+	///////////////////////////	Process layer-s1+1 --> layer-e1 ///////////////////////////
+	/* from here on P1 is read as the previous layer and P0 is written as the current one */
+	for (k = 1; k < R1; k++)
+	{
+		/* Process Row-s2 data s3: predicted by the same position in the previous layer */
+		gIndex = (s1+k)*r23+s2*r3+s3;
+		lIndex = k*R23;
+		index2D = 0;
+
+		pred1D = P1[index2D];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P0[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[index2D] = vce->data;
+		}
+
+
+	    /* Process Row-s2 data s3+1 --> data e3: 2D prediction from the left neighbour and the previous layer */
+		for (j = 1; j < R3; j++)
+		{
+			gIndex = (s1+k)*r23+s2*r3+s3+j;
+			lIndex = k*R23+j;
+			index2D = j;
+
+			pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P0[index2D] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+		}
+
+	    /* Process Row-s2+1 --> Row-e2 */
+		for (i = 1; i < R2; i++)
+		{
+			/* Process Row-s2+i data s3: 2D prediction from the row above and the previous layer */
+			gIndex = (s1+k)*r23+(s2+i)*r3+s3;
+			lIndex = k*R23+i*R3;
+			index2D = i*R3;
+
+			pred2D = P0[index2D-R3] + P1[index2D] - P1[index2D-R3];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P0[index2D] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+			/* Process Row-s2+i data s3+1 --> data e3: 3D prediction from the seven already-reconstructed neighbours in this layer (P0) and the previous one (P1) */
+			for (j = 1; j < R3; j++)
+			{
+				gIndex = (s1+k)*r23+(s2+i)*r3+s3+j;
+				lIndex = k*R23+i*R3+j;
+				index2D = i*R3+j;
+
+				pred3D = P0[index2D-1] + P0[index2D-R3]+ P1[index2D] - P0[index2D-R3-1] - P1[index2D-R3] - P1[index2D-1] + P1[index2D-R3-1];
+				diff = spaceFillingValue[gIndex] - pred3D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + intvRadius;
+					P0[index2D] = pred3D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+		}
+		/* swap the buffers: the layer just finished becomes the previous layer of the next iteration */
+		double *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;	/* number of values that were stored exactly */
+
+	TightDataPointStorageD* tdps;
+	/* hand the quantization codes and the exact-value arrays to the storage constructor */
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //only the wrapper struct is freed; ->array was handed to new_TightDataPointStorageD above (presumably owned by tdps now -- confirm)
+
+	return tdps;
+}
+
+TightDataPointStorageD* SZ_compress_double_4D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4)
+{	/* Compresses the sub-block [s1..e1] x [s2..e2] x [s3..e3] x [s4..e4] of the r1 x r2 x r3 x r4 array oriData (dimension 4 varies fastest) and returns the storage object filled in by new_TightDataPointStorageD. */
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)	/* mode 1: derive the interval count from this sub-block's data */
+	{
+		quantization_intervals = optimize_intervals_double_4D_subblock(oriData, realPrecision, r1, r2, r3, r4, s1, s2, s3, s4, e1, e2, e3, e4);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;	/* bias added to the signed quantization code before it is stored in type[] */
+	/* loop counters: k = layer (dimension 2), i = row (dimension 3), j = element within a row (dimension 4); dimension 1 uses l, declared below */
+	size_t i,j,k; 
+	int reqLength;
+	double pred1D, pred2D, pred3D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+	/* extents of the sub-block along each dimension; the s* and e* bounds are both inclusive */
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t R3 = e3 - s3 + 1;
+	size_t R4 = e4 - s4 + 1;
+
+	size_t dataLength = R1*R2*R3*R4;
+	/* strides: lower-case r* address the full array, upper-case R* address the sub-block */
+	size_t r34 = r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t R34 = R3*R4;
+	size_t R234 = R2*R3*R4;
+	/* P1: reconstructed values of the most recently finished layer (a first layer is written straight into it); P0: layer being processed */
+	P0 = (double*)malloc(R34*sizeof(double));
+	P1 = (double*)malloc(R34*sizeof(double));
+	/* computeReqLength_double fills reqLength and receives medianValue by pointer, so it may adjust it */
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+	/* one quantization code per sub-block element; 0 marks a value that was stored exactly */
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	double* spaceFillingValue = oriData; //alias for the input array
+	/* growable arrays that collect the encoding of the exactly-stored values (filled by addExactData) */
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	/* bytes of the previously stored exact value, initialised from 0 and refreshed after every exact store */
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+	/* split reqLength (a bit count) into whole bytes and leftover bits */
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	/* scratch elements reused for every exactly-stored value */
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	size_t l;	/* slice index along dimension 1 */
+	for (l = 0; l < R1; l++)
+	{
+		/* each slice l is handled as an independent 3D volume: its first element is stored exactly and no prediction reads values from another slice */
+		///////////////////////////	Process layer-s2 ///////////////////////////
+		/* Process Row-s3 data s4: always stored exactly */
+		size_t gIndex; 	//global index
+		size_t lIndex; 	//local index
+		size_t index2D; 	//local 2D index
+
+		gIndex = (s1+l)*r234+s2*r34+s3*r4+s4;
+		lIndex = l*R234;
+		index2D = 0;
+
+		type[lIndex] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[index2D] = vce->data;
+
+		/* Process Row-s3 data s4+1: predicted by the previous reconstructed value. NOTE(review): runs unconditionally, so R4 >= 2 appears to be assumed -- confirm with callers */
+		gIndex = (s1+l)*r234+s2*r34+s3*r4+s4+1;
+		lIndex = l*R234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+		/* pattern used for every element below: quantize the prediction error if it fits the interval range, otherwise store the value exactly */
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P1[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;	/* keep the reconstructed value, not the original, for later predictions */
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+
+		/* Process Row-s3 data s4+2 --> data e4: linear extrapolation from the two previous reconstructed values */
+		for (j = 2; j < R4; j++)
+		{
+			gIndex = (s1+l)*r234+s2*r34+s3*r4+s4+j;
+			lIndex = l*R234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[gIndex] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P1[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+		}
+
+		/* Process Row-s3+1 --> Row-e3 */
+		for (i = 1; i < R3; i++)
+		{
+			/* Process row-s3+i data s4: predicted by the element directly above it (same column, previous row) */
+			gIndex = (s1+l)*r234+s2*r34+(s3+i)*r4+s4;
+			lIndex = l*R234+i*R4;
+			index2D = i*R4;
+
+			pred1D  = P1[index2D-R4];
+			diff    = spaceFillingValue[gIndex] - pred1D;
+
+			itvNum  = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P1[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+
+			/* Process row-s3+i data s4+1 --> data e4: 2D prediction left + above - above-left */
+			for (j = 1; j < R4; j++)
+			{
+				gIndex = (s1+l)*r234+s2*r34+(s3+i)*r4+s4+j;
+				lIndex = l*R234+i*R4+j;
+				index2D = i*R4+j;
+
+				pred2D  = P1[index2D-1] + P1[index2D-R4] - P1[index2D-R4-1];
+				diff = spaceFillingValue[gIndex] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + intvRadius;
+					P1[index2D] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P1[index2D] = vce->data;
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-s2+1 --> layer-e2 ///////////////////////////
+		/* from here on P1 is read as the previous layer and P0 is written as the current one */
+		for (k = 1; k < R2; k++)
+		{
+			/* Process Row-s3 data s4: predicted by the same position in the previous layer */
+			gIndex = (s1+l)*r234+(s2+k)*r34+s3*r4+s4;
+			lIndex = l*R234+k*R34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[gIndex] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P0[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+
+			/* Process Row-s3 data s4+1 --> data e4: 2D prediction from the left neighbour and the previous layer */
+			for (j = 1; j < R4; j++)
+			{
+				gIndex = (s1+l)*r234+(s2+k)*r34+s3*r4+s4+j;
+				lIndex = l*R234+k*R34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[gIndex] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + intvRadius;
+					P0[index2D] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+
+			/* Process Row-s3+1 --> Row-e3 */
+			for (i = 1; i < R3; i++)
+			{
+				/* Process Row-s3+i data s4: 2D prediction from the row above and the previous layer */
+				gIndex = (s1+l)*r234+(s2+k)*r34+(s3+i)*r4+s4;
+				lIndex = l*R234+k*R34+i*R4;
+				index2D = i*R4;
+
+				pred2D = P0[index2D-R4] + P1[index2D] - P1[index2D-R4];
+				diff = spaceFillingValue[gIndex] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + intvRadius;
+					P0[index2D] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+
+				/* Process Row-s3+i data s4+1 --> data e4: 3D prediction from the seven already-reconstructed neighbours in this layer (P0) and the previous one (P1) */
+				for (j = 1; j < R4; j++)
+				{
+					gIndex = (s1+l)*r234+(s2+k)*r34+(s3+i)*r4+s4+j;
+					lIndex = l*R234+k*R34+i*R4+j;
+					index2D = i*R4+j;
+
+//					printf ("global index = %d, local index = %d\n", gIndex, lIndex);
+
+					pred3D = P0[index2D-1] + P0[index2D-R4]+ P1[index2D] - P0[index2D-R4-1] - P1[index2D-R4] - P1[index2D-1] + P1[index2D-R4-1];
+					diff = spaceFillingValue[gIndex] - pred3D;
+
+					itvNum = fabs(diff)/realPrecision + 1;
+
+					if (itvNum < quantization_intervals)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[lIndex] = (int) (itvNum/2) + intvRadius;
+						P0[index2D] = pred3D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+					}
+					else
+					{
+						type[lIndex] = 0;
+						compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+						updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+						memcpy(preDataBytes,vce->curBytes,8);
+						addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+						P0[index2D] = vce->data;
+					}
+				}
+			}
+			/* swap the buffers: the layer just finished becomes the previous layer of the next iteration */
+			double *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;	/* number of values that were stored exactly */
+
+	TightDataPointStorageD* tdps;
+	/* hand the quantization codes and the exact-value arrays to the storage constructor */
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //only the wrapper struct is freed; ->array was handed to new_TightDataPointStorageD above (presumably owned by tdps now -- confirm)
+
+	return tdps;
+}
+
+unsigned int optimize_intervals_double_1D_opt_MSST19(double *oriData, size_t dataLength, double realPrecision)
+{	/* Chooses the number of quantization intervals for 1D data by sampling oriData; the result comes from */
+	/* roundUpToPowerOf2 and is never below 64. Every sampleDistance-th element from index 2 on is sampled */
+	/* (exact zeros are skipped), binned by |log2(sample/predecessor)/divider + 0.5|, and the result covers */
+	/* the first radius whose cumulative histogram count exceeds totalSampleSize*predThreshold. */
+	size_t i = 0, radiusIndex;
+	const size_t maxRadius = confparams_cpr->maxRangeRadius;
+	size_t *intervals = (size_t*)calloc(maxRadius, sizeof(size_t)); /* zeroed histogram of radius indices */
+	size_t totalSampleSize = 0;
+	double divider = log2(1+realPrecision)*2;
+	double binPos;
+	size_t pos; /* index-based walk, so no pointer beyond the end of oriData is ever formed */
+
+	for(pos = 2; pos < dataLength; pos += confparams_cpr->sampleDistance)
+	{
+		if(oriData[pos] == 0)
+			continue;
+		totalSampleSize++;
+		/* the predecessor is the prediction; the error is measured as a ratio on a log2 scale */
+		binPos = fabs(log2(fabs(oriData[pos] / oriData[pos-1]))/divider+0.5);
+		/* A zero predecessor or zero divider yields inf/NaN here, and converting inf, NaN or an out-of-range */
+		/* value to an integer is undefined behaviour, so clamp before converting; in-range values are unchanged. */
+		if(!(binPos < (double)maxRadius))
+			radiusIndex = maxRadius - 1;
+		else
+			radiusIndex = (size_t)binPos;
+		intervals[radiusIndex]++;
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<maxRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=maxRadius)
+		i = maxRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<64)
+		powerOf2 = 64;
+
+	free(intervals);
+	return powerOf2;
+}
+
+unsigned int optimize_intervals_double_2D_opt_MSST19(double *oriData, size_t r1, size_t r2, double realPrecision)
+{	/* Chooses the number of quantization intervals for r1 x r2 data by sampling oriData and building a histogram of prediction-error radii; the result comes from roundUpToPowerOf2 and is never below 64. */
+	size_t i;
+	size_t radiusIndex;
+	double pred_value = 0, pred_err, binPos;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t)); /* histogram of radius indices */
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = 0;
+
+	size_t offset_count = confparams_cpr->sampleDistance - 1; // count r2 offset
+	size_t offset_count_2;
+	double * data_pos = oriData + r2 + offset_count; /* first sample sits at index r2 + sampleDistance - 1 */
+	double divider = log2(1+realPrecision)*2;
+	size_t n1_count = 1; // count i sum
+	size_t len = r1 * r2;
+	while(data_pos - oriData < len){
+		if(*data_pos == 0){ /* exact zeros are not sampled (the ratio below divides by the sample) */
+        		data_pos += confparams_cpr->sampleDistance;
+        		continue;
+		}			
+		totalSampleSize++;
+		pred_value = data_pos[-1] + data_pos[-r2] - data_pos[-r2-1]; /* left + above - above-left */
+		pred_err = fabs(pred_value / *data_pos); /* NOTE(review): prediction/sample here, but sample/prediction in the 1D and 3D MSST19 variants -- confirm which is intended */
+		binPos = fabs(log2(pred_err)/divider+0.5);
+		if(!(binPos < (double)confparams_cpr->maxRangeRadius)) /* clamp first: pred_value == 0 gives inf, and converting inf/NaN/out-of-range values to an integer is undefined */
+			radiusIndex = confparams_cpr->maxRangeRadius - 1;
+		else radiusIndex = (size_t)binPos;
+		intervals[radiusIndex]++;
+		offset_count += confparams_cpr->sampleDistance;
+		if(offset_count >= r2){ /* next sample would leave the row: move to the next row at column sampleDistance - (n1_count % sampleDistance) */
+			n1_count ++;
+			offset_count_2 = n1_count % confparams_cpr->sampleDistance;
+			data_pos += (r2 + confparams_cpr->sampleDistance - offset_count) + (confparams_cpr->sampleDistance - offset_count_2);
+			offset_count = (confparams_cpr->sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else data_pos += confparams_cpr->sampleDistance;
+	}
+
+	//compute the appropriate number: first radius whose cumulative count exceeds totalSampleSize*predThreshold
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<64)
+		powerOf2 = 64;
+
+	free(intervals);
+	return powerOf2;
+}
+
+unsigned int optimize_intervals_double_3D_opt_MSST19(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{	/* Chooses the number of quantization intervals for r1 x r2 x r3 data by sampling oriData and building a histogram of prediction-error radii; the result comes from roundUpToPowerOf2 and is never below 64. */
+	size_t i;
+	size_t radiusIndex;
+	size_t r23=r2*r3;
+	double pred_value = 0, pred_err, binPos;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t)); /* histogram of radius indices */
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = 0;
+
+	size_t offset_count = confparams_cpr->sampleDistance - 2; // count r3 offset
+	size_t offset_count_2;
+	double * data_pos = oriData + r23 + r3 + offset_count; /* first sample sits at index r23 + r3 + sampleDistance - 2 */
+	double divider = log2(1+realPrecision)*2;
+	size_t n1_count = 1, n2_count = 1; // count i,j sum
+	size_t len = r1 * r2 * r3;
+	while(data_pos - oriData < len){
+		if(*data_pos == 0){ /* exact zeros are not sampled */
+        		data_pos += confparams_cpr->sampleDistance;
+        		continue;
+		}	
+		totalSampleSize++;
+		pred_value = data_pos[-1] + data_pos[-r3] + data_pos[-r23] - data_pos[-1-r23] - data_pos[-r3-1] - data_pos[-r3-r23] + data_pos[-r3-r23-1];
+		pred_err = fabs(*data_pos / pred_value); /* error measured as the ratio sample/prediction */
+		binPos = fabs(log2(pred_err)/divider+0.5);
+		/* clamp first: pred_value == 0 gives inf, and converting inf/NaN/out-of-range values to an integer is undefined */
+		if(!(binPos < (double)confparams_cpr->maxRangeRadius))
+			radiusIndex = confparams_cpr->maxRangeRadius - 1;
+		else radiusIndex = (size_t)binPos;
+		intervals[radiusIndex]++;
+		offset_count += confparams_cpr->sampleDistance;
+		if(offset_count >= r3){ /* next sample would leave the row: move on to the next row */
+			n2_count ++;
+			if(n2_count == r2){ /* row counter reached r2: count a new layer and skip one extra row */
+				n1_count ++;
+				n2_count = 1;
+				data_pos += r3;
+			}
+			offset_count_2 = (n1_count + n2_count) % confparams_cpr->sampleDistance;
+			data_pos += (r3 + confparams_cpr->sampleDistance - offset_count) + (confparams_cpr->sampleDistance - offset_count_2);
+			offset_count = (confparams_cpr->sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else data_pos += confparams_cpr->sampleDistance;
+	}	
+	//compute the appropriate number: first radius whose cumulative count exceeds totalSampleSize*predThreshold
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<64)
+		powerOf2 = 64;
+	free(intervals);
+	return powerOf2;
+}
+unsigned int optimize_intervals_double_3D_opt(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision){	/* Chooses the number of quantization intervals for r1 x r2 x r3 data from a sampled histogram of absolute prediction errors; the result comes from roundUpToPowerOf2 and is never below 32. */
+	size_t i;
+	size_t radiusIndex;
+	size_t r23=r2*r3;
+	double pred_value = 0, pred_err, binPos;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t)); /* histogram of radius indices */
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = 0;
+
+	size_t offset_count = confparams_cpr->sampleDistance - 2; // count r3 offset
+	size_t offset_count_2;
+	double * data_pos = oriData + r23 + r3 + offset_count; /* first sample sits at index r23 + r3 + sampleDistance - 2 */
+	size_t n1_count = 1, n2_count = 1; // count i,j sum
+	size_t len = r1 * r2 * r3;
+	while(data_pos - oriData < len){
+		totalSampleSize++;
+		pred_value = data_pos[-1] + data_pos[-r3] + data_pos[-r23] - data_pos[-1-r23] - data_pos[-r3-1] - data_pos[-r3-r23] + data_pos[-r3-r23-1];
+		pred_err = fabs(pred_value - *data_pos);
+		binPos = (pred_err/realPrecision+1)/2; /* error expressed in quantization steps of 2*realPrecision */
+		/* clamp first: a large error or inf/NaN data puts binPos outside size_t, and converting such a value is undefined */
+		if(!(binPos < (double)confparams_cpr->maxRangeRadius))
+			radiusIndex = confparams_cpr->maxRangeRadius - 1;
+		else radiusIndex = (size_t)binPos;
+		intervals[radiusIndex]++;
+		offset_count += confparams_cpr->sampleDistance;
+		if(offset_count >= r3){ /* next sample would leave the row: move on to the next row */
+			n2_count ++;
+			if(n2_count == r2){ /* row counter reached r2: count a new layer and skip one extra row */
+				n1_count ++;
+				n2_count = 1;
+				data_pos += r3;
+			}
+			offset_count_2 = (n1_count + n2_count) % confparams_cpr->sampleDistance;
+			data_pos += (r3 + confparams_cpr->sampleDistance - offset_count) + (confparams_cpr->sampleDistance - offset_count_2);
+			offset_count = (confparams_cpr->sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else data_pos += confparams_cpr->sampleDistance;
+	}	
+	//compute the appropriate number: first radius whose cumulative count exceeds totalSampleSize*predThreshold
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+	free(intervals);
+	return powerOf2;
+}
+
+/*
+ * Quantizes one block_dim_0 x block_dim_1 x block_dim_2 block of doubles that sits inside a
+ * larger array whose row stride is dim_2 and whose layer stride is dim_1*dim_2.
+ * block_ori_data     - first element of the block inside the larger array
+ * realPrecision      - error bound: every reconstructed value stays within realPrecision of the original
+ * mean               - mean[0] is overwritten with block_ori_data[0] and predicts the first element
+ * type               - output, one code per element in block-local order (k*r2*r3 + i*r3 + j);
+ *                      0 marks an element whose raw value was appended to unpredictable_data
+ * P0, P1             - scratch layers of reconstructed values, indexed up to r2*r3-1; swapped after each layer
+ * unpredictable_data - output, raw values of the elements coded as 0, in processing order
+ * Returns the number of values written to unpredictable_data. dim_0 is not used.
+ */
+size_t SZ_compress_double_3D_MDQ_RA_block(double * block_ori_data, double * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, double * P0, double * P1, int * type, double * unpredictable_data)
+{
+	double recip_realPrecision = 1/realPrecision;
+	size_t dim0_offset = dim_1 * dim_2; /* layer stride of the enclosing array */
+	size_t dim1_offset = dim_2; /* row stride of the enclosing array */
+
+	mean[0] = block_ori_data[0];
+
+	size_t unpredictable_count = 0;
+	size_t r1, r2, r3;
+	r1 = block_dim_0;
+	r2 = block_dim_1;
+	r3 = block_dim_2;
+
+	double * cur_data_pos = block_ori_data;
+	double curData;
+	double pred1D, pred2D, pred3D;
+	double itvNum;
+	double diff;
+	size_t i, j, k;
+	size_t r23 = r2*r3;
+	// Process Row-0 data 0: predicted by mean[0], which was just set to this very value
+	pred1D = mean[0];
+	curData = *cur_data_pos;
+	diff = curData - pred1D;
+	itvNum = fabs(diff)*recip_realPrecision + 1;
+	if (itvNum < exe_params->intvCapacity){
+		if (diff < 0) itvNum = -itvNum;
+		type[0] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[0] = pred1D + 2 * (type[0] - exe_params->intvRadius) * realPrecision;
+		//guarantee the compression error bound despite floating-point rounding (machine epsilon)
+		if(fabs(curData-P1[0])>realPrecision){
+			type[0] = 0;
+			P1[0] = curData;
+			unpredictable_data[unpredictable_count ++] = curData;
+		}
+	}
+	else{
+		type[0] = 0;
+		P1[0] = curData;
+		unpredictable_data[unpredictable_count ++] = curData;
+	}
+	/* NOTE(review): data 1 is read and written unconditionally, so this presumably expects block_dim_2 >= 2 - confirm with callers. */
+	/* Process Row-0 data 1: predicted by the reconstructed data 0 */
+	pred1D = P1[0];
+	curData = cur_data_pos[1];
+	diff = curData - pred1D;
+	itvNum = fabs(diff)*recip_realPrecision + 1;
+	if (itvNum < exe_params->intvCapacity){
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+		//guarantee the compression error bound despite floating-point rounding (machine epsilon)
+		if(fabs(curData-P1[1])>realPrecision){
+			type[1] = 0;
+			P1[1] = curData;
+			unpredictable_data[unpredictable_count ++] = curData;
+		}
+	}
+	else{
+		type[1] = 0;
+		P1[1] = curData;
+		unpredictable_data[unpredictable_count ++] = curData;
+	}
+	/* Process Row-0 data 2 --> data r3-1: linear extrapolation from the two previous reconstructed values */
+	for (j = 2; j < r3; j++){
+		pred1D = 2*P1[j-1] - P1[j-2];
+		curData = cur_data_pos[j];
+		diff = curData - pred1D;
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+		if (itvNum < exe_params->intvCapacity){
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			//guarantee the compression error bound despite floating-point rounding (machine epsilon)
+			if(fabs(curData-P1[j])>realPrecision){
+				type[j] = 0;
+				P1[j] = curData;
+				unpredictable_data[unpredictable_count ++] = curData;
+			}
+		}
+		else{
+			type[j] = 0;
+			P1[j] = curData;
+			unpredictable_data[unpredictable_count ++] = curData;
+		}
+	}
+	cur_data_pos += dim1_offset;
+
+	/* Process Row-1 --> Row-r2-1 of layer 0 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0: predicted by the reconstructed element directly above it */
+		index = i*r3;
+		pred1D = P1[index-r3];
+		curData = *cur_data_pos;
+		diff = curData - pred1D;
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			//guarantee the compression error bound despite floating-point rounding (machine epsilon)
+			if(fabs(curData-P1[index])>realPrecision)
+			{
+				type[index] = 0;
+				P1[index] = curData;
+				unpredictable_data[unpredictable_count ++] = curData;
+			}
+		}
+		else
+		{
+			type[index] = 0;
+			P1[index] = curData;
+			unpredictable_data[unpredictable_count ++] = curData;
+		}
+
+		/* Process row-i data 1 --> data r3-1: 2D prediction, left + above - above-left */
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+			curData = cur_data_pos[j];
+			diff = curData - pred2D;
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				//guarantee the compression error bound despite floating-point rounding (machine epsilon)
+				if(fabs(curData-P1[index])>realPrecision)
+				{
+					type[index] = 0;
+					P1[index] = curData;
+					unpredictable_data[unpredictable_count ++] = curData;
+				}
+			}
+			else
+			{
+				type[index] = 0;
+				P1[index] = curData;
+				unpredictable_data[unpredictable_count ++] = curData;
+			}
+		}
+		cur_data_pos += dim1_offset;
+	}
+	cur_data_pos += dim0_offset - r2 * dim1_offset; /* move to the block's first row in the next layer */
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0: predicted by the same position in the previous layer (P1) */
+		index = k*r23;
+		pred1D = P1[0];
+		curData = *cur_data_pos;
+		diff = curData - pred1D;
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			//guarantee the compression error bound despite floating-point rounding (machine epsilon)
+			if(fabs(curData-P0[0])>realPrecision)
+			{
+				type[index] = 0;
+				P0[0] = curData;
+				unpredictable_data[unpredictable_count ++] = curData;
+			}
+		}
+		else
+		{
+			type[index] = 0;
+			P0[0] = curData;
+			unpredictable_data[unpredictable_count ++] = curData;
+		}
+		/* Process Row-0 data 1 --> data r3-1: left (this layer) + same position - left (previous layer) */
+		for (j = 1; j < r3; j++)
+		{
+			// index equals k*r2*r3+j here; it is advanced incrementally
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			curData = cur_data_pos[j];
+			diff = curData - pred2D;
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				//guarantee the compression error bound despite floating-point rounding (machine epsilon)
+				if(fabs(curData-P0[j])>realPrecision)
+				{
+					type[index] = 0;
+					P0[j] = curData;
+					unpredictable_data[unpredictable_count ++] = curData;
+				}
+			}
+			else
+			{
+				type[index] = 0;
+				P0[j] = curData;
+				unpredictable_data[unpredictable_count ++] = curData;
+			}
+		}
+
+		cur_data_pos += dim1_offset;
+		/* Process Row-1 --> Row-r2-1 of layer k */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0: above (this layer) + same position - above (previous layer) */
+			index = k*r23 + i*r3;
+			index2D = i*r3;
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			curData = *cur_data_pos;
+			diff = curData - pred2D;
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				//guarantee the compression error bound despite floating-point rounding (machine epsilon)
+				if(fabs(curData-P0[index2D])>realPrecision)
+				{
+					type[index] = 0;
+					P0[index2D] = curData;
+					unpredictable_data[unpredictable_count ++] = curData;
+				}
+			}
+			else
+			{
+				type[index] = 0;
+				P0[index2D] = curData;
+				unpredictable_data[unpredictable_count ++] = curData;
+			}
+
+			/* Process Row-i data 1 --> data r3-1: 3D prediction from the seven already reconstructed neighbors */
+			for (j = 1; j < r3; j++)
+			{
+				// index equals k*r2*r3 + i*r3 + j here; it is advanced incrementally
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				curData = cur_data_pos[j];
+				diff = curData - pred3D;
+				itvNum = fabs(diff)*recip_realPrecision + 1;
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					//guarantee the compression error bound despite floating-point rounding (machine epsilon)
+					if(fabs(curData-P0[index2D])>realPrecision)
+					{
+						type[index] = 0;
+						P0[index2D] = curData;
+						unpredictable_data[unpredictable_count ++] = curData;
+					}
+				}
+				else
+				{
+					type[index] = 0;
+					P0[index2D] = curData;
+					unpredictable_data[unpredictable_count ++] = curData;
+				}
+			}
+			cur_data_pos += dim1_offset;
+		}
+		cur_data_pos += dim0_offset - r2 * dim1_offset; /* move to the block's first row in the next layer */
+		double *Pt; /* swap roles: the layer just written (P0) becomes the previous layer (P1) */
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	return unpredictable_count;
+}
+
+/* Estimates the quantization interval count for r1 rows of r2 contiguous doubles by sampling 2D neighbor-prediction errors (left + above - above-left). */
+/* Returns roundUpToPowerOf2(2*(i+1)), raised to 32 if smaller, where i is the first radius bucket whose cumulative count exceeds predThreshold of the samples. */
+unsigned int optimize_intervals_double_2D_opt(double *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	size_t i, radiusIndex;
+	const ptrdiff_t s2 = (ptrdiff_t)r2; /* signed row stride: -r2 on a size_t would wrap around */
+	double pred_value = 0, pred_err, scaled_err;
+	/* intervals[b] counts the samples whose prediction error maps to radius bucket b. */
+	size_t *intervals = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t totalSampleSize = 0;
+	size_t offset_count = confparams_cpr->sampleDistance - 1; // count r2 offset
+	size_t offset_count_2;
+	double * data_pos = oriData + r2 + offset_count;
+	size_t n1_count = 1; // count i sum
+	size_t len = r1 * r2;
+	while((size_t)(data_pos - oriData) < len){
+		totalSampleSize++;
+		pred_value = data_pos[-1] + data_pos[-s2] - data_pos[-s2-1];
+		pred_err = fabs(pred_value - *data_pos);
+		/* Clamp while still in floating point: converting an out-of-range or NaN double to an integer is undefined. */
+		scaled_err = (pred_err/realPrecision+1)/2;
+		if(scaled_err < confparams_cpr->maxRangeRadius)
+			radiusIndex = (size_t)scaled_err;
+		else
+			radiusIndex = confparams_cpr->maxRangeRadius - 1;
+		intervals[radiusIndex]++;
+		offset_count += confparams_cpr->sampleDistance;
+		if(offset_count >= r2){
+			n1_count ++;
+			offset_count_2 = n1_count % confparams_cpr->sampleDistance;
+			data_pos += (r2 + confparams_cpr->sampleDistance - offset_count) + (confparams_cpr->sampleDistance - offset_count_2);
+			offset_count = (confparams_cpr->sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else data_pos += confparams_cpr->sampleDistance;
+	}
+
+	// find the smallest radius whose cumulative sample count exceeds the predThreshold share
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius) i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32) powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+unsigned int optimize_intervals_double_1D_opt(double *oriData, size_t dataLength, double realPrecision)
+{	
+	size_t i = 0, radiusIndex;
+	double pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = 0;
+
+	double * data_pos = oriData + 2;
+	while(data_pos - oriData < dataLength){
+		totalSampleSize++;
+		pred_value = data_pos[-1];
+		pred_err = fabs(pred_value - *data_pos);
+		radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+		if(radiusIndex>=confparams_cpr->maxRangeRadius)
+			radiusIndex = confparams_cpr->maxRangeRadius - 1;			
+		intervals[radiusIndex]++;
+
+		data_pos += confparams_cpr->sampleDistance;
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+		
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	
+	if(powerOf2<32)
+		powerOf2 = 32;
+	
+	free(intervals);
+	return powerOf2;
+}
+
+/* The code above is for SZ 1.4.13; the code below is for SZ 2.0. */
+/*
+ * Samples r1 rows of r2 contiguous doubles once and derives:
+ *   return     - roundUpToPowerOf2(2*(i+1)), raised to 32 if smaller, where i is the first radius
+ *                bucket whose cumulative count exceeds predThreshold of the samples;
+ *   *max_freq  - share of samples whose 2D neighbor prediction error is below realPrecision;
+ *   *dense_pos - boundary between the two adjacent realPrecision-wide value bins (laid out around a
+ *                coarse mean) that together hold the most samples;
+ *   *mean_freq - share of samples falling into those two bins.
+ * When nothing is sampled both shares are 0.0 rather than the NaN that 0/0 would give.
+ */
+unsigned int optimize_intervals_double_2D_with_freq_and_dense_pos(double *oriData, size_t r1, size_t r2, double realPrecision, double * dense_pos, double * max_freq, double * mean_freq)
+{
+	double mean = 0.0;
+	size_t len = r1 * r2;
+	size_t mean_distance = (int) (sqrt(len));
+	const ptrdiff_t s2 = (ptrdiff_t)r2; /* signed row stride: -r2 on a size_t would wrap around */
+	/* Coarse mean: average of every mean_distance-th element. */
+	double * data_pos = oriData;
+	size_t mean_count = 0;
+	while((size_t)(data_pos - oriData) < len){
+		mean += *data_pos;
+		mean_count ++;
+		data_pos += mean_distance;
+	}
+	if(mean_count > 0) mean /= mean_count;
+	size_t range = 8192;
+	size_t radius = 4096;
+	/* freq_intervals[b]: samples whose offset from the mean, in units of realPrecision, lands in bin b. */
+	size_t * freq_intervals = (size_t *) calloc(range, sizeof(size_t));
+	unsigned int maxRangeRadius = confparams_cpr->maxRangeRadius;
+	int sampleDistance = confparams_cpr->sampleDistance;
+	double predThreshold = confparams_cpr->predThreshold;
+	size_t i, k, radiusIndex;
+	double pred_value = 0, pred_err, scaled_err;
+	/* intervals[b]: samples whose prediction error maps to radius bucket b. */
+	size_t *intervals = (size_t*)calloc(maxRangeRadius, sizeof(size_t));
+	double mean_diff, scaled_diff;
+	ptrdiff_t freq_index;
+	size_t freq_count = 0;
+	size_t n1_count = 1;
+	size_t offset_count = sampleDistance - 1;
+	size_t offset_count_2 = 0;
+	size_t sample_count = 0;
+	data_pos = oriData + r2 + offset_count;
+	while((size_t)(data_pos - oriData) < len){
+		pred_value = data_pos[-1] + data_pos[-s2] - data_pos[-s2-1];
+		pred_err = fabs(pred_value - *data_pos);
+		if(pred_err < realPrecision) freq_count ++;
+		/* Clamp while still in floating point: converting an out-of-range or NaN double to an integer is undefined. */
+		scaled_err = (pred_err/realPrecision+1)/2;
+		if(scaled_err < maxRangeRadius) radiusIndex = (size_t)scaled_err;
+		else radiusIndex = maxRangeRadius - 1;
+		intervals[radiusIndex]++;
+
+		mean_diff = *data_pos - mean;
+		scaled_diff = mean_diff/realPrecision;
+		/* Values at or beyond +/-radius bins go straight to the edge bins, so the cast below always sees a representable value. */
+		if(!(scaled_diff > -(double)radius)) freq_index = 0; /* far below the mean, or NaN */
+		else if(!(scaled_diff < (double)radius)) freq_index = (ptrdiff_t)range - 1; /* far above the mean */
+		else if(mean_diff > 0) freq_index = (ptrdiff_t)scaled_diff + radius;
+		else freq_index = (ptrdiff_t)scaled_diff - 1 + radius;
+		if(freq_index <= 0) freq_intervals[0] ++;
+		else if((size_t)freq_index >= range) freq_intervals[range - 1] ++;
+		else freq_intervals[freq_index] ++;
+		offset_count += sampleDistance;
+		if(offset_count >= r2){
+			n1_count ++;
+			offset_count_2 = n1_count % sampleDistance;
+			data_pos += (r2 + sampleDistance - offset_count) + (sampleDistance - offset_count_2);
+			offset_count = (sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else data_pos += sampleDistance;
+		sample_count ++;
+	}
+	*max_freq = sample_count > 0 ? freq_count * 1.0 / sample_count : 0.0;
+
+	// find the smallest radius whose cumulative sample count exceeds the predThreshold share
+	size_t targetCount = sample_count*predThreshold;
+	size_t sum = 0;
+	for(i=0;i<maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=maxRangeRadius) i = maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32) powerOf2 = 32;
+
+	// find the pair of adjacent value bins that together hold the most samples
+	size_t max_sum = 0, max_index = 0, tmp_sum;
+	size_t * freq_pos = freq_intervals + 1;
+	for(k=1; k<range-2; k++){ /* k rather than a second i, so the outer index is not shadowed */
+		tmp_sum = freq_pos[0] + freq_pos[1];
+		if(tmp_sum > max_sum){
+			max_sum = tmp_sum;
+			max_index = k;
+		}
+		freq_pos ++;
+	}
+	*dense_pos = mean + realPrecision * (ptrdiff_t)(max_index + 1 - radius);
+	*mean_freq = sample_count > 0 ? max_sum * 1.0 / sample_count : 0.0;
+	free(freq_intervals);
+	free(intervals);
+	return powerOf2;
+}
+
+#define MIN(a, b) (((a) < (b)) ? (a) : (b)) /* fully parenthesized so it expands safely inside larger expressions; an argument may be evaluated twice */
+unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(double *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size){
+
+	double recip_realPrecision = 1/realPrecision;
+	unsigned int quantization_intervals;
+	double sz_sample_correct_freq = -1;//0.5; //-1
+	double dense_pos;
+	double mean_flush_freq;
+	unsigned char use_mean = 0;
+
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_2D_with_freq_and_dense_pos(oriData, r1, r2, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+		if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	// calculate block dims
+	size_t num_x, num_y;
+	size_t block_size = 16;
+
+	SZ_COMPUTE_2D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_2D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+
+	size_t split_index_x, split_index_y;
+	size_t early_blockcount_x, early_blockcount_y;
+	size_t late_blockcount_x, late_blockcount_y;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+
+	size_t max_num_block_elements = early_blockcount_x * early_blockcount_y;
+	size_t num_blocks = num_x * num_y;
+	size_t num_elements = r1 * r2;
+
+	size_t dim0_offset = r2;	
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	double * result_unpredictable_data = (double *) malloc(unpred_data_max_size * sizeof(double) * num_blocks);
+	size_t total_unpred = 0;
+	size_t unpredictable_count;
+	double * data_pos = oriData;
+	int * type = result_type;
+	size_t offset_x, offset_y;
+	size_t current_blockcount_x, current_blockcount_y;
+
+	double * reg_params = (double *) malloc(num_blocks * 4 * sizeof(double));
+	double * reg_params_pos = reg_params;
+	// move regression part out
+	size_t params_offset_b = num_blocks;
+	size_t params_offset_c = 2*num_blocks;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+
+			data_pos = oriData + offset_x * dim0_offset + offset_y;
+
+			{
+				double * cur_data_pos = data_pos;
+				double fx = 0.0;
+				double fy = 0.0;
+				double f = 0;
+				double sum_x; 
+				double curData;
+				for(size_t i=0; i<current_blockcount_x; i++){
+					sum_x = 0;
+					for(size_t j=0; j<current_blockcount_y; j++){
+						curData = *cur_data_pos;
+						sum_x += curData;
+						fy += curData * j;
+						cur_data_pos ++;
+					}
+					fx += sum_x * i;
+					f += sum_x;
+					cur_data_pos += dim0_offset - current_blockcount_y;
+				}
+				double coeff = 1.0 / (current_blockcount_x * current_blockcount_y);
+				reg_params_pos[0] = (2 * fx / (current_blockcount_x - 1) - f) * 6 * coeff / (current_blockcount_x + 1);
+				reg_params_pos[params_offset_b] = (2 * fy / (current_blockcount_y - 1) - f) * 6 * coeff / (current_blockcount_y + 1);
+				reg_params_pos[params_offset_c] = f * coeff - ((current_blockcount_x - 1) * reg_params_pos[0] / 2 + (current_blockcount_y - 1) * reg_params_pos[params_offset_b] / 2);
+			}
+
+			reg_params_pos ++;
+		}
+	}
+
+	//Compress coefficient arrays
+	double precision_a, precision_b, precision_c;
+	double rel_param_err = 0.15/3;
+	precision_a = rel_param_err * realPrecision / late_blockcount_x;
+	precision_b = rel_param_err * realPrecision / late_blockcount_y;
+	precision_c = rel_param_err * realPrecision;
+
+	double mean = 0;
+	use_mean = 0;
+	if(use_mean){
+		// compute mean
+		double sum = 0.0;
+		size_t mean_count = 0;
+		for(size_t i=0; i<num_elements; i++){
+			if(fabs(oriData[i] - dense_pos) < realPrecision){
+				sum += oriData[i];
+				mean_count ++;
+			}
+		}
+		if(mean_count > 0) mean = sum / mean_count;
+	}
+
+	// use two prediction buffers for higher performance
+	double * unpredictable_data = result_unpredictable_data;
+	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+	memset(indicator, 0, num_blocks * sizeof(unsigned char));
+	size_t reg_count = 0;
+	size_t strip_dim_0 = early_blockcount_x + 1;
+	size_t strip_dim_1 = r2 + 1;
+	size_t strip_dim0_offset = strip_dim_1;
+	unsigned char * indicator_pos = indicator;
+	size_t prediction_buffer_size = strip_dim_0 * strip_dim0_offset * sizeof(double);
+	double * prediction_buffer_1 = (double *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_1, 0, prediction_buffer_size);
+	double * prediction_buffer_2 = (double *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_2, 0, prediction_buffer_size);
+	double * cur_pb_buf = prediction_buffer_1;
+	double * next_pb_buf = prediction_buffer_2;
+	double * cur_pb_buf_pos;
+	double * next_pb_buf_pos;
+	int intvCapacity = quantization_intervals; //exe_params->intvCapacity;
+	int intvRadius = intvCapacity/2; //exe_params->intvRadius;
+	int use_reg = 0;
+
+	reg_params_pos = reg_params;
+	// compress the regression coefficients on the fly
+	double last_coeffcients[3] = {0.0};
+	int coeff_intvCapacity_sz = 65536;
+	int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+	int * coeff_type[3];
+	int * coeff_result_type = (int *) malloc(num_blocks*3*sizeof(int));
+	double * coeff_unpred_data[3];
+	double * coeff_unpredictable_data = (double *) malloc(num_blocks*3*sizeof(double));
+	double precision[3], recip_precision[3];
+	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c;
+	recip_precision[0] = 1/precision_a, recip_precision[1] = 1/precision_b, recip_precision[2] = 1/precision_c;
+	for(int i=0; i<3; i++){
+		coeff_type[i] = coeff_result_type + i * num_blocks;
+		coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
+	}
+	int coeff_index = 0;
+	unsigned int coeff_unpredictable_count[3] = {0};
+	double noise = realPrecision * 0.81;
+	if(use_mean){
+		type = result_type;
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			data_pos = oriData + offset_x * dim0_offset;
+
+			cur_pb_buf_pos = cur_pb_buf + strip_dim0_offset + 1;
+			next_pb_buf_pos = next_pb_buf + 1;
+			double * pb_pos = cur_pb_buf_pos;
+			double * next_pb_pos = next_pb_buf_pos;
+
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				
+				/*sampling: decide which predictor to use (regression or lorenzo)*/
+				{
+					double * cur_data_pos;
+					double curData;
+					double pred_reg, pred_sz;
+					double err_sz = 0.0, err_reg = 0.0;
+					// [1, 1] [3, 3] [5, 5] [7, 7] [9, 9]
+					// [1, 9] [3, 7]		[7, 3] [9, 1]
+					int bmi = 0;
+					int block_size = MIN(current_blockcount_x, current_blockcount_y);
+					for(int i=1; i<block_size; i++){
+						cur_data_pos = data_pos + i * dim0_offset + i;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c];							
+						err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+						err_reg += fabs(pred_reg - curData);
+
+						bmi = block_size - i;
+						cur_data_pos = data_pos + i*dim0_offset + bmi;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c];							
+						err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+						err_reg += fabs(pred_reg - curData);								
+					}
+					use_reg = (err_reg < err_sz);
+				}
+				if(use_reg)
+				{
+					{
+						/*predict coefficients in current block via previous reg_block*/
+						double cur_coeff;
+						double diff, itvNum;
+						for(int e=0; e<3; e++){
+							cur_coeff = reg_params_pos[e*num_blocks];
+							diff = cur_coeff - last_coeffcients[e];
+							itvNum = fabs(diff)*recip_precision[e] + 1;
+							if (itvNum < coeff_intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+								last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;	
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}					
+							}
+							else{
+								coeff_type[e][coeff_index] = 0;
+								last_coeffcients[e] = cur_coeff;
+								coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+							}
+						}
+						coeff_index ++;
+					}
+					double curData;
+					double pred;
+					double itvNum;
+					double diff;
+					size_t index = 0;
+					size_t block_unpredictable_count = 0;
+					double * cur_data_pos = data_pos;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						size_t ii = current_blockcount_x - 1;
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+
+							index ++;	
+							cur_data_pos ++;
+						}
+					} // end ii == -1
+					unpredictable_count = block_unpredictable_count;
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;					
+					reg_count ++;
+				}// end use_reg
+				else{
+					// use SZ
+					// SZ predication
+					unpredictable_count = 0;
+					double * cur_pb_pos = pb_pos;
+					double * cur_data_pos = data_pos;
+					double curData;
+					double pred2D;
+					double itvNum, diff;
+					size_t index = 0;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+							if(fabs(curData - mean) <= realPrecision){
+								// adjust type[index] to intvRadius for coherence with freq in reg
+								type[index] = intvRadius;
+								*cur_pb_pos = mean;
+							}
+							else
+							{
+								pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+								diff = curData - pred2D;
+								itvNum = fabs(diff)*recip_realPrecision + 1;
+								if (itvNum < intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									type[index] = (int) (itvNum/2) + intvRadius;
+									*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+									if(type[index] <= intvRadius) type[index] -= 1;
+									//ganrantee comporession error against the case of machine-epsilon
+									if(fabs(curData - *cur_pb_pos)>realPrecision){	
+										type[index] = 0;
+										*cur_pb_pos = curData;	
+										unpredictable_data[unpredictable_count ++] = curData;
+									}					
+								}
+								else{
+									type[index] = 0;
+									*cur_pb_pos = curData;
+									unpredictable_data[unpredictable_count ++] = curData;
+								}
+							}
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+						cur_pb_pos += strip_dim0_offset - current_blockcount_y;
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						// ii == current_blockcount_x - 1
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+							if(fabs(curData - mean) <= realPrecision){
+								// adjust type[index] to intvRadius for coherence with freq in reg
+								type[index] = intvRadius;
+								*cur_pb_pos = mean;
+							}
+							else
+							{
+								pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+								diff = curData - pred2D;
+								itvNum = fabs(diff)*recip_realPrecision + 1;
+								if (itvNum < intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									type[index] = (int) (itvNum/2) + intvRadius;
+									*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+									if(type[index] <= intvRadius) type[index] -= 1;
+									//ganrantee comporession error against the case of machine-epsilon
+									if(fabs(curData - *cur_pb_pos)>realPrecision){	
+										type[index] = 0;
+										*cur_pb_pos = curData;	
+										unpredictable_data[unpredictable_count ++] = curData;
+									}					
+								}
+								else{
+									type[index] = 0;
+									*cur_pb_pos = curData;
+									unpredictable_data[unpredictable_count ++] = curData;
+								}
+							}
+							next_pb_pos[jj] = *cur_pb_pos;
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+					}
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;
+					// change indicator
+					indicator_pos[j] = 1;
+				}// end SZ
+				reg_params_pos ++;
+				data_pos += current_blockcount_y;
+				pb_pos += current_blockcount_y;
+				next_pb_pos += current_blockcount_y;
+				type += current_blockcount_x * current_blockcount_y;
+			}// end j
+			indicator_pos += num_y;
+			double * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}// end i
+	}// end use mean
+	else{
+		type = result_type;
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			data_pos = oriData + offset_x * dim0_offset;
+
+			cur_pb_buf_pos = cur_pb_buf + strip_dim0_offset + 1;
+			next_pb_buf_pos = next_pb_buf + 1;
+			double * pb_pos = cur_pb_buf_pos;
+			double * next_pb_pos = next_pb_buf_pos;
+
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				/*sampling*/
+				{
+					// sample [2i + 1, 2i + 1] [2i + 1, bs - 2i]
+					double * cur_data_pos;
+					double curData;
+					double pred_reg, pred_sz;
+					double err_sz = 0.0, err_reg = 0.0;
+					// [1, 1] [3, 3] [5, 5] [7, 7] [9, 9]
+					// [1, 9] [3, 7]		[7, 3] [9, 1]
+					int bmi = 0;
+					int block_size = MIN(current_blockcount_x, current_blockcount_y);
+					for(int i=1; i<block_size; i++){
+						cur_data_pos = data_pos + i * dim0_offset + i;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c];							
+						err_sz += fabs(pred_sz - curData) + noise;
+						err_reg += fabs(pred_reg - curData);
+
+						bmi = block_size - i;
+						cur_data_pos = data_pos + i*dim0_offset + bmi;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c];							
+						err_sz += fabs(pred_sz - curData) + noise;
+						err_reg += fabs(pred_reg - curData);								
+					}
+					use_reg = (err_reg < err_sz);
+				}
+				if(use_reg)
+				{
+					{
+						/*predict coefficients in current block via previous reg_block*/
+						double cur_coeff;
+						double diff, itvNum;
+						for(int e=0; e<3; e++){
+							cur_coeff = reg_params_pos[e*num_blocks];
+							diff = cur_coeff - last_coeffcients[e];
+							itvNum = fabs(diff)*recip_precision[e] + 1;
+							if (itvNum < coeff_intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+								last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;	
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}					
+							}
+							else{
+								coeff_type[e][coeff_index] = 0;
+								last_coeffcients[e] = cur_coeff;
+								coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+							}
+						}
+						coeff_index ++;
+					}
+					double curData;
+					double pred;
+					double itvNum;
+					double diff;
+					size_t index = 0;
+					size_t block_unpredictable_count = 0;
+					double * cur_data_pos = data_pos;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							// jj == current_blockcount_y - 1
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						size_t ii = current_blockcount_x - 1;
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							// jj == current_blockcount_y - 1
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+
+							index ++;	
+							cur_data_pos ++;
+						}
+					} // end ii == -1
+					unpredictable_count = block_unpredictable_count;
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;					
+					reg_count ++;
+				}// end use_reg
+				else{
+					// use SZ
+					// SZ predication
+					unpredictable_count = 0;
+					double * cur_pb_pos = pb_pos;
+					double * cur_data_pos = data_pos;
+					double curData;
+					double pred2D;
+					double itvNum, diff;
+					size_t index = 0;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+
+							pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+							diff = curData - pred2D;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - *cur_pb_pos)>realPrecision){	
+									type[index] = 0;
+									*cur_pb_pos = curData;	
+									unpredictable_data[unpredictable_count ++] = curData;
+								}					
+							}
+							else{
+								type[index] = 0;
+								*cur_pb_pos = curData;
+								unpredictable_data[unpredictable_count ++] = curData;
+							}
+
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+						cur_pb_pos += strip_dim0_offset - current_blockcount_y;
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						// ii == current_blockcount_x - 1
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+
+							pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+							diff = curData - pred2D;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - *cur_pb_pos)>realPrecision){	
+									type[index] = 0;
+									*cur_pb_pos = curData;	
+									unpredictable_data[unpredictable_count ++] = curData;
+								}					
+							}
+							else{
+								type[index] = 0;
+								*cur_pb_pos = curData;
+								unpredictable_data[unpredictable_count ++] = curData;
+							}
+							next_pb_pos[jj] = *cur_pb_pos;
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+					}
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;
+					// change indicator
+					indicator_pos[j] = 1;
+				}// end SZ
+				reg_params_pos ++;
+				data_pos += current_blockcount_y;
+				pb_pos += current_blockcount_y;
+				next_pb_pos += current_blockcount_y;
+				type += current_blockcount_x * current_blockcount_y;
+			}// end j
+			indicator_pos += num_y;
+			double * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}// end i		
+	}
+	free(prediction_buffer_1);
+	free(prediction_buffer_2);
+
+	int stateNum = 2*quantization_intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	size_t nodeCount = 0;
+	size_t i = 0;
+	init(huffmanTree, result_type, num_elements);
+	for (i = 0; i < stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength_double;
+	// total size 										metadata		  # elements   real precision		intervals	nodeCount		huffman 	 	block index 						unpredicatable count						mean 					 	unpred size 				elements
+	unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + 5*treeByteSize + 3*num_blocks*sizeof(int) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(double) + total_unpred * sizeof(double) + num_elements * sizeof(int), 1);
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	result_pos += meta_data_offset;
+
+	sizeToBytes(result_pos, num_elements);
+	result_pos += exe_params->SZ_SIZE_TYPE;
+	
+	intToBytes_bigEndian(result_pos, block_size);
+	result_pos += sizeof(int);
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += sizeof(double);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += sizeof(int);
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+	free(treeBytes);
+
+	memcpy(result_pos, &use_mean, sizeof(unsigned char));
+	result_pos += sizeof(unsigned char);
+	memcpy(result_pos, &mean, sizeof(double));
+	result_pos += sizeof(double);
+
+	size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+	result_pos += indicator_size;
+	
+	//convert the lead/mid/resi to byte stream 	
+	if(reg_count>0){
+		for(int e=0; e<3; e++){
+			int stateNum = 2*coeff_intvCapacity_sz;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+			size_t nodeCount = 0;
+			init(huffmanTree, coeff_type[e], reg_count);
+			size_t i = 0;
+			for (i = 0; i < huffmanTree->stateNum; i++)
+				if (huffmanTree->code[i]) nodeCount++; 
+			nodeCount = nodeCount*2-1;
+			unsigned char *treeBytes;
+			unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+			doubleToBytes(result_pos, precision[e]);
+			result_pos += sizeof(double);
+			intToBytes_bigEndian(result_pos, coeff_intvRadius);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, treeByteSize);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, nodeCount);
+			result_pos += sizeof(int);
+			memcpy(result_pos, treeBytes, treeByteSize);		
+			result_pos += treeByteSize;
+			free(treeBytes);
+			size_t typeArray_size = 0;
+			encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+			sizeToBytes(result_pos, typeArray_size);
+			result_pos += sizeof(size_t) + typeArray_size;
+			intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+			result_pos += sizeof(int);
+			memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(double));
+			result_pos += coeff_unpredictable_count[e]*sizeof(double);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	free(coeff_result_type);
+	free(coeff_unpredictable_data);
+
+	//record the number of unpredictable data and also store them
+	memcpy(result_pos, &total_unpred, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(double));
+	result_pos += total_unpred * sizeof(double);
+	size_t typeArray_size = 0;
+	encode(huffmanTree, result_type, num_elements, result_pos, &typeArray_size);
+	result_pos += typeArray_size;
+#ifdef HAVE_WRITESTATS
+	writeHuffmanInfo(treeByteSize, typeArray_size, num_elements*sizeof(float), nodeCount);
+	writeBlockInfo(use_mean, block_size, reg_count, num_blocks);
+	writeUnpredictDataCounts(total_unpred, num_elements);
+#endif	
+
+	size_t totalEncodeSize = result_pos - result;
+	free(indicator);
+	free(result_unpredictable_data);
+	free(result_type);
+	free(reg_params);
+	
+	SZ_ReleaseHuffman(huffmanTree);
+	*comp_size = totalEncodeSize;
+
+
+
+	return result;
+}
+/*
+ * Samples a 3D double array (r1 x r2 x r3, with r3 the fastest-varying
+ * dimension, as implied by the r2*r3 / r3 / 1 strides used below) and derives
+ * the quantization settings used by the 3D compressor.
+ *
+ * Parameters:
+ *   oriData        input data, r1*r2*r3 doubles
+ *   r1, r2, r3     dimensions of the array
+ *   realPrecision  absolute error bound; also the histogram bin width
+ *   dense_pos      [out] centre of the two adjacent realPrecision-wide bins
+ *                  (positioned relative to the sampled mean) that together
+ *                  hold the most sampled points
+ *   max_freq       [out] fraction of sampled points whose 3D neighbour-based
+ *                  prediction error is below realPrecision
+ *   mean_freq      [out] fraction of sampled points falling in the two bins
+ *                  described under dense_pos
+ *
+ * Returns the number of quantization intervals: 2*(i+1) rounded up to a power
+ * of two (minimum 32), where i is the first error-radius index at which the
+ * cumulative sample count exceeds sample_count * predThreshold.
+ *
+ * maxRangeRadius, sampleDistance and predThreshold are read from the global
+ * confparams_cpr.
+ *
+ * If no point is sampled (the first sample position is already past the end
+ * of the array), *max_freq and *mean_freq are set to 0.0 rather than to the
+ * NaN that 0/0 would produce.
+ *
+ * NOTE(review): allocation results are not checked, and the code appears to
+ * assume sampleDistance >= 2 and maxRangeRadius >= 1 -- confirm with callers.
+ */
+unsigned int optimize_intervals_double_3D_with_freq_and_dense_pos(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, double * dense_pos, double * max_freq, double * mean_freq)
+{
+	/* Pass 1: rough mean, visiting about every sqrt(len)-th element. */
+	double mean = 0.0;
+	size_t len = r1 * r2 * r3;
+	size_t mean_distance = (int) (sqrt(len));
+	double * data_pos = oriData;
+	size_t offset_count = 0;
+	size_t offset_count_2 = 0;
+	size_t mean_count = 0;
+	while(data_pos - oriData < len){
+		mean += *data_pos;
+		mean_count ++;
+		data_pos += mean_distance;
+		offset_count += mean_distance;
+		offset_count_2 += mean_distance;
+		// step back by one each time a row (r3) or a plane (r2*r3) worth of
+		// offset has accumulated, so the sample positions drift
+		if(offset_count >= r3){
+			offset_count = 0;
+			data_pos -= 1;
+		}
+		if(offset_count_2 >= r2 * r3){
+			offset_count_2 = 0;
+			data_pos -= 1;
+		}
+	}
+	if(mean_count > 0) mean /= mean_count;
+
+	// histogram of (value - mean) in realPrecision-wide bins; bin `radius`
+	// starts at the mean, out-of-range values are clamped to the end bins
+	size_t range = 8192;
+	size_t radius = 4096;
+	size_t * freq_intervals = (size_t *) calloc(range, sizeof(size_t));
+
+	unsigned int maxRangeRadius = confparams_cpr->maxRangeRadius;
+	int sampleDistance = confparams_cpr->sampleDistance;
+	double predThreshold = confparams_cpr->predThreshold;
+
+	size_t i;
+	size_t radiusIndex;
+	size_t r23=r2*r3;
+	double pred_value = 0, pred_err;
+	// histogram of quantized prediction-error radii
+	size_t *intervals = (size_t*) calloc(maxRangeRadius, sizeof(size_t));
+
+	double mean_diff;
+	ptrdiff_t freq_index;
+	size_t freq_count = 0;
+	size_t sample_count = 0;
+
+	/* Pass 2: sample interior points, starting one plane and one row in so
+	 * that all seven neighbours used by the predictor exist. */
+	offset_count = confparams_cpr->sampleDistance - 2; // count r3 offset
+	data_pos = oriData + r23 + r3 + offset_count;
+	size_t n1_count = 1, n2_count = 1; // count i,j sum
+
+	while(data_pos - oriData < len){
+
+		// predict the point from its seven already-visited neighbours
+		pred_value = data_pos[-1] + data_pos[-r3] + data_pos[-r23] - data_pos[-1-r23] - data_pos[-r3-1] - data_pos[-r3-r23] + data_pos[-r3-r23-1];
+		pred_err = fabs(pred_value - *data_pos);
+		if(pred_err < realPrecision) freq_count ++;
+		radiusIndex = (pred_err/realPrecision+1)/2;
+		if(radiusIndex>=maxRangeRadius)
+		{
+			radiusIndex = maxRangeRadius - 1;
+		}
+		intervals[radiusIndex]++;
+
+		// bin the distance from the mean; the "- 1" on the non-positive side
+		// makes the truncating cast behave like a floor
+		mean_diff = *data_pos - mean;
+		if(mean_diff > 0) freq_index = (ptrdiff_t)(mean_diff/realPrecision) + radius;
+		else freq_index = (ptrdiff_t)(mean_diff/realPrecision) - 1 + radius;
+		if(freq_index <= 0){
+			freq_intervals[0] ++;
+		}
+		else if(freq_index >= range){
+			freq_intervals[range - 1] ++;
+		}
+		else{
+			freq_intervals[freq_index] ++;
+		}
+
+		// advance to the next sample; on leaving the current row, move to the
+		// next row (skipping the first row of a new plane) with a start
+		// offset that depends on (n1_count + n2_count)
+		offset_count += sampleDistance;
+		if(offset_count >= r3){
+			n2_count ++;
+			if(n2_count == r2){
+				n1_count ++;
+				n2_count = 1;
+				data_pos += r3;
+			}
+			offset_count_2 = (n1_count + n2_count) % sampleDistance;
+			data_pos += (r3 + sampleDistance - offset_count) + (sampleDistance - offset_count_2);
+			offset_count = (sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else data_pos += sampleDistance;
+		sample_count ++;
+	}
+	// guard against 0/0 when the array is too small for any sample
+	*max_freq = (sample_count > 0) ? freq_count * 1.0/ sample_count : 0.0;
+
+	//compute the appropriate number
+	size_t targetCount = sample_count*predThreshold;
+	size_t sum = 0;
+	for(i=0;i<maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=maxRangeRadius)
+		i = maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	// collect frequency: find the pair of adjacent bins (bin, bin + 1) with
+	// the largest combined count; the first and last bins are never part of
+	// a pair
+	size_t max_sum = 0;
+	size_t max_index = 0;
+	size_t tmp_sum;
+	for(size_t bin=1; bin<range-2; bin++){
+		tmp_sum = freq_intervals[bin] + freq_intervals[bin + 1];
+		if(tmp_sum > max_sum){
+			max_sum = tmp_sum;
+			max_index = bin;
+		}
+	}
+	// boundary shared by the two winning bins, i.e. the centre of the pair
+	*dense_pos = mean + realPrecision * (ptrdiff_t)(max_index + 1 - radius);
+	*mean_freq = (sample_count > 0) ? max_sum * 1.0 / sample_count : 0.0;
+
+	free(freq_intervals);
+	free(intervals);
+	return powerOf2;
+}
+
+
+
+unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size){
+
+#ifdef HAVE_TIMECMPR	
+	double* decData = NULL;
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (double*)(multisteps->hist_data);
+#endif
+
+	double recip_realPrecision = 1/realPrecision;
+	//printf("recip_realPrecision = %.20G\n", recip_realPrecision);
+
+	unsigned int quantization_intervals;
+	double sz_sample_correct_freq = -1;//0.5; //-1
+	double dense_pos;
+	double mean_flush_freq;
+	unsigned char use_mean = 0;
+
+	// calculate block dims
+	size_t num_x, num_y, num_z;
+	size_t block_size = 6;
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r3, num_z, block_size);
+
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t max_num_block_elements = early_blockcount_x * early_blockcount_y * early_blockcount_z;
+	size_t num_blocks = num_x * num_y * num_z;
+	size_t num_elements = r1 * r2 * r3;
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;	
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	memset(result_type, 0, num_elements*sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	double * result_unpredictable_data = (double *) malloc(unpred_data_max_size * sizeof(double) * num_blocks);
+	size_t total_unpred = 0;
+	size_t unpredictable_count;
+	size_t max_unpred_count = 0;
+	double * data_pos = oriData;
+	int * type = result_type;
+	size_t type_offset;
+	size_t offset_x, offset_y, offset_z;
+	size_t current_blockcount_x, current_blockcount_y, current_blockcount_z;
+
+	double * reg_params = (double *) malloc(num_blocks * 4 * sizeof(double));
+	double * reg_params_pos = reg_params;
+	// move regression part out
+	size_t params_offset_b = num_blocks;
+	size_t params_offset_c = 2*num_blocks;
+	size_t params_offset_d = 3*num_blocks;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			for(size_t k=0; k<num_z; k++){
+				current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+				offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+	
+				data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+				/*Calculate regression coefficients*/
+				{
+					double * cur_data_pos = data_pos;
+					double fx = 0.0;
+					double fy = 0.0;
+					double fz = 0.0;
+					double f = 0;
+					double sum_x, sum_y; 
+					double curData;
+					for(size_t i=0; i<current_blockcount_x; i++){
+						sum_x = 0;
+						for(size_t j=0; j<current_blockcount_y; j++){
+							sum_y = 0;
+							for(size_t k=0; k<current_blockcount_z; k++){
+								curData = *cur_data_pos;
+								// f += curData;
+								// fx += curData * i;
+								// fy += curData * j;
+								// fz += curData * k;
+								sum_y += curData;
+								fz += curData * k;
+								cur_data_pos ++;
+							}
+							fy += sum_y * j;
+							sum_x += sum_y;
+							cur_data_pos += dim1_offset - current_blockcount_z;
+						}
+						fx += sum_x * i;
+						f += sum_x;
+						cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+					}
+					double coeff = 1.0 / (current_blockcount_x * current_blockcount_y * current_blockcount_z);
+					reg_params_pos[0] = (2 * fx / (current_blockcount_x - 1) - f) * 6 * coeff / (current_blockcount_x + 1);
+					reg_params_pos[params_offset_b] = (2 * fy / (current_blockcount_y - 1) - f) * 6 * coeff / (current_blockcount_y + 1);
+					reg_params_pos[params_offset_c] = (2 * fz / (current_blockcount_z - 1) - f) * 6 * coeff / (current_blockcount_z + 1);
+					reg_params_pos[params_offset_d] = f * coeff - ((current_blockcount_x - 1) * reg_params_pos[0] / 2 + (current_blockcount_y - 1) * reg_params_pos[params_offset_b] / 2 + (current_blockcount_z - 1) * reg_params_pos[params_offset_c] / 2);
+				}
+				reg_params_pos ++;
+			}
+		}
+	}
+	
+	//Compress coefficient arrays
+	double precision_a, precision_b, precision_c, precision_d;
+	double rel_param_err = 0.025;
+	precision_a = rel_param_err * realPrecision / late_blockcount_x;
+	precision_b = rel_param_err * realPrecision / late_blockcount_y;
+	precision_c = rel_param_err * realPrecision / late_blockcount_z;
+	precision_d = rel_param_err * realPrecision;
+
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_3D_with_freq_and_dense_pos(oriData, r1, r2, r3, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+		if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	double mean = 0;
+	if(use_mean){
+		// compute mean
+		double sum = 0.0;
+		size_t mean_count = 0;
+		for(size_t i=0; i<num_elements; i++){
+			if(fabs(oriData[i] - dense_pos) < realPrecision){
+				sum += oriData[i];
+				mean_count ++;
+			}
+		}
+		if(mean_count > 0) mean = sum / mean_count;
+	}
+
+	// use two prediction buffers for higher performance
+	double * unpredictable_data = result_unpredictable_data;
+	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+	memset(indicator, 0, num_blocks * sizeof(unsigned char));
+	size_t reg_count = 0;
+	size_t strip_dim_0 = early_blockcount_x + 1;
+	size_t strip_dim_1 = r2 + 1;
+	size_t strip_dim_2 = r3 + 1;
+	size_t strip_dim0_offset = strip_dim_1 * strip_dim_2;
+	size_t strip_dim1_offset = strip_dim_2;
+	unsigned char * indicator_pos = indicator;
+
+	size_t prediction_buffer_size = strip_dim_0 * strip_dim0_offset * sizeof(double);
+	double * prediction_buffer_1 = (double *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_1, 0, prediction_buffer_size);
+	double * prediction_buffer_2 = (double *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_2, 0, prediction_buffer_size);
+	double * cur_pb_buf = prediction_buffer_1;
+	double * next_pb_buf = prediction_buffer_2;
+	double * cur_pb_buf_pos;
+	double * next_pb_buf_pos;
+	int intvCapacity = quantization_intervals;// exe_params->intvCapacity;
+	int intvRadius = intvCapacity/2; //exe_params->intvRadius;	
+	int use_reg = 0;
+	double noise = realPrecision * 1.22;
+
+	reg_params_pos = reg_params;
+	// compress the regression coefficients on the fly
+	double last_coeffcients[4] = {0.0};
+	int coeff_intvCapacity_sz = 65536;
+	int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+	int * coeff_type[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+	double * coeff_unpred_data[4];
+	double * coeff_unpredictable_data = (double *) malloc(num_blocks*4*sizeof(double));
+	double precision[4], recip_precision[4];
+	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c, precision[3] = precision_d;
+	recip_precision[0] = 1/precision_a, recip_precision[1] = 1/precision_b, recip_precision[2] = 1/precision_c, recip_precision[3] = 1/precision_d;
+	
+	for(int i=0; i<4; i++){
+		coeff_type[i] = coeff_result_type + i * num_blocks;
+		coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
+	}
+	int coeff_index = 0;
+	unsigned int coeff_unpredictable_count[4] = {0};
+
+	if(use_mean){
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset;
+				type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset;
+				type = result_type + type_offset;
+
+				// prediction buffer is (current_block_count_x + 1) * (current_block_count_y + 1) * (current_block_count_z + 1)
+				cur_pb_buf_pos = cur_pb_buf + offset_y * strip_dim1_offset + strip_dim0_offset + strip_dim1_offset + 1;
+				next_pb_buf_pos = next_pb_buf + offset_y * strip_dim1_offset + strip_dim1_offset + 1;
+
+				size_t current_blockcount_z;
+				double * pb_pos = cur_pb_buf_pos;
+				double * next_pb_pos = next_pb_buf_pos;
+				size_t strip_unpredictable_count = 0;
+				for(size_t k=0; k<num_z; k++){
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+#ifdef HAVE_TIMECMPR
+					size_t offset_z = 0;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					size_t block_offset = offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+#endif
+
+					/*sampling and decide which predictor*/
+					{
+						// sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+						double * cur_data_pos;
+						double curData;
+						double pred_reg, pred_sz;
+						double err_sz = 0.0, err_reg = 0.0;
+						int bmi = 0;
+						int block_size = MIN(current_blockcount_x, (MIN(current_blockcount_y, current_blockcount_z)));
+						for(int i=1; i<block_size; i++){
+							cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);
+
+							bmi = block_size - i;
+							cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);
+						}
+						use_reg = (err_reg < err_sz);
+					}
+					if(use_reg){
+						{
+							/*predict coefficients in current block via previous reg_block*/
+							double cur_coeff;
+							double diff, itvNum;
+							for(int e=0; e<4; e++){
+								cur_coeff = reg_params_pos[e*num_blocks];
+								diff = cur_coeff - last_coeffcients[e];
+								itvNum = fabs(diff)*recip_precision[e] + 1;
+								if (itvNum < coeff_intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+									last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+									//guarantee compression error against the case of machine-epsilon
+									if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+										coeff_type[e][coeff_index] = 0;
+										last_coeffcients[e] = cur_coeff;	
+										coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+									}					
+								}
+								else{
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}
+							}
+							coeff_index ++;
+						}
+						double curData;
+						double pred;
+						double itvNum;
+						double diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						double * cur_data_pos = data_pos;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)*recip_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+										//guarantee compression error against the case of machine-epsilon
+										if(fabs(curData - pred)>realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+									
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = pred;
+#endif									
+									
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							size_t ii = current_blockcount_x - 1;
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)*recip_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+										//guarantee compression error against the case of machine-epsilon
+										if(fabs(curData - pred)>realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = pred;
+#endif		
+
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									// assign value to next prediction buffer
+									next_pb_pos[jj * strip_dim1_offset + kk] = pred;
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						unpredictable_count = block_unpredictable_count;
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						
+						reg_count ++;
+					}
+					else{
+						// use SZ
+						// SZ prediction
+						unpredictable_count = 0;
+						double * cur_pb_pos = pb_pos;
+						double * cur_data_pos = data_pos;
+						double curData;
+						double pred3D;
+						double itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									if(fabs(curData - mean) <= realPrecision){
+										// adjust type[index] to intvRadius for coherence with freq in reg
+										type[index] = intvRadius;
+										*cur_pb_pos = mean;
+									}
+									else
+									{
+										pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+												 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+										diff = curData - pred3D;
+										itvNum = fabs(diff)*recip_realPrecision + 1;
+										if (itvNum < intvCapacity_sz){
+											if (diff < 0) itvNum = -itvNum;
+											type[index] = (int) (itvNum/2) + intvRadius;
+											*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
+											if(type[index] <= intvRadius) type[index] -= 1;
+											//guarantee compression error against the case of machine-epsilon
+											if(fabs(curData - *cur_pb_pos)>realPrecision){	
+												type[index] = 0;
+												*cur_pb_pos = curData;	
+												unpredictable_data[unpredictable_count ++] = curData;
+											}					
+										}
+										else{
+											type[index] = 0;
+											*cur_pb_pos = curData;
+											unpredictable_data[unpredictable_count ++] = curData;
+										}
+									}
+									
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = *cur_pb_pos;
+#endif											
+									
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_pb_pos += strip_dim0_offset - current_blockcount_y * strip_dim1_offset;
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									if(fabs(curData - mean) <= realPrecision){
+										// adjust type[index] to intvRadius for coherence with freq in reg
+										type[index] = intvRadius;
+										*cur_pb_pos = mean;
+									}
+									else
+									{
+										pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+												 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+										diff = curData - pred3D;
+										itvNum = fabs(diff)*recip_realPrecision + 1;
+										if (itvNum < intvCapacity_sz){
+											if (diff < 0) itvNum = -itvNum;
+											type[index] = (int) (itvNum/2) + intvRadius;
+											*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
+											if(type[index] <= intvRadius) type[index] -= 1;
+											//guarantee compression error against the case of machine-epsilon
+											if(fabs(curData - *cur_pb_pos)>realPrecision){	
+												type[index] = 0;
+												*cur_pb_pos = curData;	
+												unpredictable_data[unpredictable_count ++] = curData;
+											}					
+										}
+										else{
+											type[index] = 0;
+											*cur_pb_pos = curData;
+											unpredictable_data[unpredictable_count ++] = curData;
+										}
+									}
+									
+#ifdef HAVE_TIMECMPR
+									size_t ii = current_blockcount_x - 1;
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = *cur_pb_pos;
+#endif										
+									
+									next_pb_pos[jj * strip_dim1_offset + kk] = *cur_pb_pos;
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						// change indicator
+						indicator_pos[k] = 1;
+					}// end SZ
+					
+					reg_params_pos ++;
+					data_pos += current_blockcount_z;
+					pb_pos += current_blockcount_z;
+					next_pb_pos += current_blockcount_z;
+					type += current_blockcount_x * current_blockcount_y * current_blockcount_z;
+
+				} // end k
+
+				if(strip_unpredictable_count > max_unpred_count){
+					max_unpred_count = strip_unpredictable_count;
+				}
+				total_unpred += strip_unpredictable_count;
+				indicator_pos += num_z;
+			}// end j
+			double * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}// end i
+	}
+	else{
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset;
+				// copy bottom plane from plane buffer
+				// memcpy(prediction_buffer, bottom_buffer + offset_y * strip_dim1_offset, (current_blockcount_y + 1) * strip_dim1_offset * sizeof(double));
+				type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset;
+				type = result_type + type_offset;
+
+				// prediction buffer is (current_block_count_x + 1) * (current_block_count_y + 1) * (current_block_count_z + 1)
+				cur_pb_buf_pos = cur_pb_buf + offset_y * strip_dim1_offset + strip_dim0_offset + strip_dim1_offset + 1;
+				next_pb_buf_pos = next_pb_buf + offset_y * strip_dim1_offset + strip_dim1_offset + 1;
+
+				size_t current_blockcount_z;
+				double * pb_pos = cur_pb_buf_pos;
+				double * next_pb_pos = next_pb_buf_pos;
+				size_t strip_unpredictable_count = 0;
+				for(size_t k=0; k<num_z; k++){
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+#ifdef HAVE_TIMECMPR
+					size_t offset_z = 0;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					size_t block_offset = offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+#endif							
+					
+					/*sampling*/
+					{
+						// sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+						double * cur_data_pos;
+						double curData;
+						double pred_reg, pred_sz;
+						double err_sz = 0.0, err_reg = 0.0;
+						int bmi = 0;
+						int block_size = MIN(current_blockcount_x, (MIN(current_blockcount_y, current_blockcount_z)));
+						for(int i=1; i<block_size; i++){
+							cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);
+
+							bmi = block_size - i;
+							cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);
+						}
+						use_reg = (err_reg < err_sz);
+
+					}
+					if(use_reg)
+					{
+						{
+							/*predict coefficients in current block via previous reg_block*/
+							double cur_coeff;
+							double diff, itvNum;
+							for(int e=0; e<4; e++){
+								cur_coeff = reg_params_pos[e*num_blocks];
+								diff = cur_coeff - last_coeffcients[e];
+								itvNum = fabs(diff)*recip_precision[e] + 1;
+								if (itvNum < coeff_intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+									last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+									//guarantee compression error against the case of machine-epsilon
+									if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+										coeff_type[e][coeff_index] = 0;
+										last_coeffcients[e] = cur_coeff;	
+										coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+									}					
+								}
+								else{
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}
+							}
+							coeff_index ++;
+						}
+						double curData;
+						double pred;
+						double itvNum;
+						double diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						double * cur_data_pos = data_pos;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)*recip_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+										//guarantee compression error against the case of machine-epsilon
+										if(fabs(curData - pred)>realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = pred;
+#endif			
+
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							size_t ii = current_blockcount_x - 1;
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)*recip_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+										//guarantee compression error against the case of machine-epsilon
+										if(fabs(curData - pred)>realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = pred;
+#endif
+
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									// assign value to next prediction buffer
+									next_pb_pos[jj * strip_dim1_offset + kk] = pred;
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						unpredictable_count = block_unpredictable_count;
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;						
+						reg_count ++;
+					}
+					else{
+						// use SZ
+						// SZ prediction
+						unpredictable_count = 0;
+						double * cur_pb_pos = pb_pos;
+						double * cur_data_pos = data_pos;
+						double curData;
+						double pred3D;
+						double itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+											 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+									diff = curData - pred3D;
+									itvNum = fabs(diff)*recip_realPrecision + 1;
+									if (itvNum < intvCapacity_sz){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
+										//guarantee compression error against the case of machine-epsilon
+										if(fabs(curData - *cur_pb_pos)>realPrecision){	
+											type[index] = 0;
+											*cur_pb_pos = curData;	
+											unpredictable_data[unpredictable_count ++] = curData;
+										}					
+									}
+									else{
+										type[index] = 0;
+										*cur_pb_pos = curData;
+										unpredictable_data[unpredictable_count ++] = curData;
+									}
+									
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = *cur_pb_pos;
+#endif	
+									
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_pb_pos += strip_dim0_offset - current_blockcount_y * strip_dim1_offset;
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+											 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+									diff = curData - pred3D;
+									itvNum = fabs(diff)*recip_realPrecision + 1;
+									if (itvNum < intvCapacity_sz){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
+										//guarantee compression error against the case of machine-epsilon
+										if(fabs(curData - *cur_pb_pos)>realPrecision){	
+											type[index] = 0;
+											*cur_pb_pos = curData;	
+											unpredictable_data[unpredictable_count ++] = curData;
+										}					
+									}
+									else{
+										type[index] = 0;
+										*cur_pb_pos = curData;
+										unpredictable_data[unpredictable_count ++] = curData;
+									}
+									
+#ifdef HAVE_TIMECMPR
+									size_t ii = current_blockcount_x - 1;
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = *cur_pb_pos;
+#endif											
+									
+									// assign value to next prediction buffer
+									next_pb_pos[jj * strip_dim1_offset + kk] = *cur_pb_pos;
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						// change indicator
+						indicator_pos[k] = 1;
+					}// end SZ
+					
+					reg_params_pos ++;
+					data_pos += current_blockcount_z;
+					pb_pos += current_blockcount_z;
+					next_pb_pos += current_blockcount_z;
+					type += current_blockcount_x * current_blockcount_y * current_blockcount_z;
+
+				}
+
+				if(strip_unpredictable_count > max_unpred_count){
+					max_unpred_count = strip_unpredictable_count;
+				}
+				total_unpred += strip_unpredictable_count;
+				indicator_pos += num_z;
+			}
+			double * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}
+	}
+
+	free(prediction_buffer_1);
+	free(prediction_buffer_2);
+
+	int stateNum = 2*quantization_intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	size_t nodeCount = 0;
+	init(huffmanTree, result_type, num_elements);
+	size_t i = 0;
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength_double;
+	// total size 										metadata		  # elements     real precision		intervals	nodeCount		huffman 	 	block index 						unpredicatable count						mean 					 	unpred size 				elements
+	unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + 5*treeByteSize + 4*num_blocks*sizeof(int)+ num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(double) + total_unpred * sizeof(double) + num_elements * sizeof(int), 1);
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	
+	result_pos += meta_data_offset;
+	
+	sizeToBytes(result_pos,num_elements); //SZ_SIZE_TYPE: 4 or 8
+	result_pos += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(result_pos, block_size);
+	result_pos += sizeof(int);
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += sizeof(double);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += sizeof(int);
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+	free(treeBytes);
+
+	memcpy(result_pos, &use_mean, sizeof(unsigned char));
+	result_pos += sizeof(unsigned char);
+	memcpy(result_pos, &mean, sizeof(double));
+	result_pos += sizeof(double);
+	size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+	result_pos += indicator_size;
+	
+	//encode the four regression-coefficient streams (precision, Huffman tree, encoded type codes, unpredictable values) into the byte stream
+	if(reg_count > 0){
+		for(int e=0; e<4; e++){
+			int stateNum = 2*coeff_intvCapacity_sz;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+			size_t nodeCount = 0;
+			init(huffmanTree, coeff_type[e], reg_count);
+			size_t i = 0;
+			for (i = 0; i < huffmanTree->stateNum; i++)
+				if (huffmanTree->code[i]) nodeCount++; 
+			nodeCount = nodeCount*2-1;
+			unsigned char *treeBytes;
+			unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+			doubleToBytes(result_pos, precision[e]);
+			result_pos += sizeof(double);
+			intToBytes_bigEndian(result_pos, coeff_intvRadius);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, treeByteSize);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, nodeCount);
+			result_pos += sizeof(int);
+			memcpy(result_pos, treeBytes, treeByteSize);		
+			result_pos += treeByteSize;
+			free(treeBytes);
+			size_t typeArray_size = 0;
+			encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+			sizeToBytes(result_pos, typeArray_size);
+			result_pos += sizeof(size_t) + typeArray_size;
+			intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+			result_pos += sizeof(int);
+			memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(double));
+			result_pos += coeff_unpredictable_count[e]*sizeof(double);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	free(coeff_result_type);
+	free(coeff_unpredictable_data);
+	
+	//record the number of unpredictable data and also store them
+	memcpy(result_pos, &total_unpred, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(double));
+	result_pos += total_unpred * sizeof(double);
+	size_t typeArray_size = 0;
+	encode(huffmanTree, result_type, num_elements, result_pos, &typeArray_size);
+	result_pos += typeArray_size;
+	size_t totalEncodeSize = result_pos - result;
+	free(indicator);
+	free(result_unpredictable_data);
+	free(result_type);
+	free(reg_params);
+
+#ifdef HAVE_WRITESTATS
+	writeHuffmanInfo(treeByteSize, typeArray_size, num_elements*sizeof(float), nodeCount);
+	writeBlockInfo(use_mean, block_size, reg_count, num_blocks);
+	writeUnpredictDataCounts(total_unpred, num_elements);
+#endif	
+	
+	SZ_ReleaseHuffman(huffmanTree);
+	*comp_size = totalEncodeSize;
+	return result;
+}
diff --git a/deps/SZ/sz/src/sz_double_pwr.c b/deps/SZ/sz/src/sz_double_pwr.c
new file mode 100644
index 0000000000000000000000000000000000000000..dc037db7fbb2280df74c77aa324e91d42259d4d3
--- /dev/null
+++ b/deps/SZ/sz/src/sz_double_pwr.c
@@ -0,0 +1,2067 @@
+/**
+ *  @file sz_double_pwr.c
+ *  @author Sheng Di, Dingwen Tao, Xin Liang, Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang
+ *  @date Aug, 2016
+ *  @brief SZ_Init, Compression and Decompression functions
+ * This file contains the compression/decompression functions related to point-wise relative errors
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageD.h"
+#include "sz_double.h"
+#include "sz_double_pwr.h"
+#include "zlib.h"
+#include "rw.h"
+#include "utility.h"
+
+/**
+ * Compute the point-wise relative error bound of each segment of a 1D double array.
+ *
+ * The array is cut into consecutive segments of confparams_cpr->segment_size elements.
+ * For every segment a statistic of the absolute values of its non-zero elements is
+ * collected (minimum, average or maximum, chosen by confparams_cpr->pwr_type) and
+ * multiplied by confparams_cpr->pw_relBoundRatio. In the *_AND_PW_REL modes the result
+ * is capped by globalPrecision, in the *_OR_PW_REL modes it is raised to at least
+ * globalPrecision. Only the two leading bytes of the bound are kept, so the value
+ * written to pwrErrBound is the one obtained from those two bytes followed by zeros.
+ *
+ * The ratio is applied exactly once per segment, the last (possibly partial) segment
+ * included, in the same way compute_segment_precisions_double_2D treats its blocks.
+ *
+ * @param oriData          input values; at least one element is read
+ * @param dataLength       number of elements in oriData
+ * @param pwrErrBound      output, one truncated bound per segment
+ * @param pwrErrBoundBytes output, two bytes per segment
+ * @param globalPrecision  bound combined with the segment bound in the AND/OR modes
+ */
+void compute_segment_precisions_double_1D(double *oriData, size_t dataLength, double* pwrErrBound, unsigned char* pwrErrBoundBytes, double globalPrecision)
+{
+	size_t i = 0, j = 0, k = 0;
+	//unscaled statistic of the current segment; a leading zero counts as magnitude 1
+	double realPrecision = oriData[0]!=0?fabs(oriData[0]):1;
+	double approxPrecision;
+	unsigned char realPrecBytes[8];
+	double curPrecision;
+	double curValue;
+	double sum = 0;
+	for(i=0;i<dataLength;i++)
+	{
+		curValue = oriData[i];
+		if(i%confparams_cpr->segment_size==0&&i>0)
+		{
+			//a segment has just been completed: turn its statistic into an error bound
+			if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+			{
+				realPrecision = sum/confparams_cpr->segment_size;
+				sum = 0;
+			}
+			realPrecision *= confparams_cpr->pw_relBoundRatio;
+			if(confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+				realPrecision = realPrecision<globalPrecision?realPrecision:globalPrecision;
+			else if(confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+				realPrecision = realPrecision<globalPrecision?globalPrecision:realPrecision;
+
+			//keep only the two leading bytes of the bound
+			doubleToBytes(realPrecBytes, realPrecision);
+			memset(&realPrecBytes[2], 0, 6);
+			approxPrecision = bytesToDouble(realPrecBytes);
+			//put the realPrecision in double* pwrErBound
+			pwrErrBound[j++] = approxPrecision;
+			//put the two bytes in pwrErrBoundBytes
+			pwrErrBoundBytes[k++] = realPrecBytes[0];
+			pwrErrBoundBytes[k++] = realPrecBytes[1];
+
+			//restart the statistic with the first value of the new segment
+			realPrecision = fabs(curValue);
+		}
+
+		if(curValue!=0)
+		{
+			curPrecision = fabs(curValue);
+
+			switch(confparams_cpr->pwr_type)
+			{
+			case SZ_PWR_MIN_TYPE:
+				if(realPrecision>curPrecision)
+					realPrecision = curPrecision;
+				break;
+			case SZ_PWR_AVG_TYPE:
+				sum += curPrecision;
+				break;
+			case SZ_PWR_MAX_TYPE:
+				if(realPrecision<curPrecision)
+					realPrecision = curPrecision;
+				break;
+			}
+		}
+	}
+
+	//flush the last (possibly partial) segment
+	if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+	{
+		int size = dataLength%confparams_cpr->segment_size==0?confparams_cpr->segment_size:dataLength%confparams_cpr->segment_size;
+		realPrecision = sum/size;
+	}
+	//the statistic is still unscaled here, exactly as at the in-loop segment boundaries
+	realPrecision *= confparams_cpr->pw_relBoundRatio;
+	if(confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+		realPrecision = realPrecision<globalPrecision?realPrecision:globalPrecision;
+	else if(confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+		realPrecision = realPrecision<globalPrecision?globalPrecision:realPrecision;
+	doubleToBytes(realPrecBytes, realPrecision);
+	memset(&realPrecBytes[2], 0, 6);
+	approxPrecision = bytesToDouble(realPrecBytes);
+	//put the realPrecision in double* pwrErBound
+	pwrErrBound[j++] = approxPrecision;
+	//put the two bytes in pwrErrBoundBytes
+	pwrErrBoundBytes[k++] = realPrecBytes[0];
+	pwrErrBoundBytes[k++] = realPrecBytes[1];
+}
+
+/**
+ * Estimate the number of quantization intervals for the 1D point-wise relative mode.
+ *
+ * Starting at index 2, every element whose index is a multiple of
+ * confparams_cpr->sampleDistance is predicted by its predecessor. The prediction
+ * error, expressed in units of twice the error bound of the segment the element
+ * belongs to, is counted in a histogram of confparams_cpr->maxRangeRadius bins
+ * (larger errors go to the last bin). The first radius whose cumulative count exceeds
+ * predThreshold times the sample count is doubled and passed to roundUpToPowerOf2().
+ *
+ * @param oriData     input values
+ * @param dataLength  number of elements in oriData
+ * @param pwrErrBound per-segment error bounds, one per segment_size elements
+ * @return the rounded interval count, never smaller than 32
+ */
+unsigned int optimize_intervals_double_1D_pwr(double *oriData, size_t dataLength, double* pwrErrBound)
+{
+	size_t i = 0, j = 0;
+	double realPrecision = pwrErrBound[j++];
+	unsigned long radiusIndex;
+	double pred_value = 0, pred_err, radius;
+	int *intervals = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	//size_t, like the 2D/3D variants, so that large inputs do not overflow an int
+	size_t totalSampleSize = dataLength/confparams_cpr->sampleDistance;
+	for(i=2;i<dataLength;i++)
+	{
+		if(i%confparams_cpr->segment_size==0)
+			realPrecision = pwrErrBound[j++];
+		if(i%confparams_cpr->sampleDistance==0)
+		{
+			//pred_value = 2*oriData[i-1] - oriData[i-2];
+			pred_value = oriData[i-1];
+			pred_err = fabs(pred_value - oriData[i]);
+			radius = (pred_err/realPrecision+1)/2;
+			//clamp while still in floating point: a zero bound yields inf/NaN, and
+			//converting such a value to an integer type is undefined behaviour
+			if(!(radius<confparams_cpr->maxRangeRadius))
+				radiusIndex = confparams_cpr->maxRangeRadius - 1;
+			else
+				radiusIndex = (unsigned long)radius;
+			intervals[radiusIndex]++;
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	//printf("accIntervals=%d, powerOf2=%d\n", accIntervals, powerOf2);
+	return powerOf2;
+}
+
+/**
+ * Compute the point-wise relative error bound of each edgeSize x edgeSize block of a
+ * 2D double array stored row by row (r1 rows of r2 elements).
+ *
+ * statAbsValues holds one running statistic per block column (R2 entries): the
+ * minimum, sum or maximum of the absolute values of the non-zero elements seen so far
+ * in the current row of blocks, depending on confparams_cpr->pwr_type. Whenever a
+ * block is complete its statistic is multiplied by confparams_cpr->pw_relBoundRatio
+ * (the sum is first divided by the block area), combined with globalPrecision in the
+ * *_AND_PW_REL / *_OR_PW_REL modes, cut down to its two leading bytes and appended to
+ * the outputs; the statistic of that block column is then reset.
+ *
+ * @param oriData          input values, r1*r2 elements
+ * @param pwrErrBound      output, one truncated bound per block
+ * @param r1               number of rows
+ * @param r2               number of columns
+ * @param R2               number of block columns (size of the statistic array)
+ * @param edgeSize         block edge length
+ * @param pwrErrBoundBytes output, two bytes per block
+ * @param Min              smallest input value (used to seed the statistics)
+ * @param Max              largest input value (used to seed the statistics)
+ * @param globalPrecision  bound combined with the block bound in the AND/OR modes
+ */
+void compute_segment_precisions_double_2D(double *oriData, double* pwrErrBound, 
+size_t r1, size_t r2, size_t R2, size_t edgeSize, unsigned char* pwrErrBoundBytes, double Min, double Max, double globalPrecision)
+{
+	size_t i = 0, j = 0, k = 0, p = 0, index = 0, J = 0; //I=-1,J=-1 if they are needed
+	double realPrecision;
+	double approxPrecision;
+	unsigned char realPrecBytes[8];
+	double curValue, curAbsValue;
+	double* statAbsValues = (double*)malloc(R2*sizeof(double));
+
+	double max = fabs(Min)<fabs(Max)?fabs(Max):fabs(Min); //get the max abs value.
+	double min = fabs(Min)<fabs(Max)?fabs(Min):fabs(Max);
+	for(i=0;i<R2;i++)
+	{
+		if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+			statAbsValues[i] = max;
+		else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+			statAbsValues[i] = min;
+		else
+			statAbsValues[i] = 0; //for SZ_PWR_AVG_TYPE
+	}
+	for(i=0;i<r1;i++)
+	{
+		for(j=0;j<r2;j++)
+		{
+			index = i*r2+j;
+			curValue = oriData[index];
+			//flush the block that was just completed: either the previous block column
+			//on the last row of a block row, or the last block column of the previous block row
+			if(((i%edgeSize==edgeSize-1 || i==r1-1) &&j%edgeSize==0&&j>0) || (i%edgeSize==0&&j==0&&i>0))
+			{
+				if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+				{
+					//a x b is the element count used for the average (edge blocks may be partial)
+					int a = edgeSize, b = edgeSize;
+					if(j==0)
+					{
+						if(r2%edgeSize==0)
+							b = edgeSize;
+						else
+							b = r2%edgeSize;
+					}
+					if(i==r1-1)
+					{
+						if(r1%edgeSize==0)
+							a = edgeSize;
+						else
+							a = r1%edgeSize;
+					}
+					realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J]/(a*b);
+				}
+				else
+					realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J];
+
+				if(confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+					realPrecision = realPrecision<globalPrecision?realPrecision:globalPrecision;
+				else if(confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+					realPrecision = realPrecision<globalPrecision?globalPrecision:realPrecision;
+
+				doubleToBytes(realPrecBytes, realPrecision);
+				memset(&realPrecBytes[2], 0, 6);
+				approxPrecision = bytesToDouble(realPrecBytes);
+				//put the realPrecision in double* pwrErBound
+				pwrErrBound[p++] = approxPrecision;
+				//put the two bytes in pwrErrBoundBytes
+				pwrErrBoundBytes[k++] = realPrecBytes[0];
+				pwrErrBoundBytes[k++] = realPrecBytes[1];
+
+				if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+					statAbsValues[J] = max;
+				else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+					statAbsValues[J] = min;
+				else
+					statAbsValues[J] = 0; //for SZ_PWR_AVG_TYPE
+			}
+			if(j==0)
+				J = 0;
+			else if(j%edgeSize==0)
+				J++;
+			if(curValue!=0)
+			{
+				curAbsValue = fabs(curValue);
+
+				switch(confparams_cpr->pwr_type)
+				{
+				case SZ_PWR_MIN_TYPE:
+					if(statAbsValues[J]>curAbsValue)
+						statAbsValues[J] = curAbsValue;
+					break;
+				case SZ_PWR_AVG_TYPE:
+					statAbsValues[J] += curAbsValue;
+					break;
+				case SZ_PWR_MAX_TYPE:
+					if(statAbsValues[J]<curAbsValue)
+						statAbsValues[J] = curAbsValue;
+					break;
+				}
+			}
+		}
+	}
+
+	//flush the very last block (bottom-right corner)
+	if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+	{
+		int a = edgeSize, b = edgeSize;
+		if(r2%edgeSize==0)
+			b = edgeSize;
+		else
+			b = r2%edgeSize;
+		if(r1%edgeSize==0)
+			a = edgeSize;
+		else
+			a = r1%edgeSize;
+		realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J]/(a*b);
+	}
+	else
+		realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J];
+
+	if(confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+		realPrecision = realPrecision<globalPrecision?realPrecision:globalPrecision;
+	else if(confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+		realPrecision = realPrecision<globalPrecision?globalPrecision:realPrecision;
+
+	doubleToBytes(realPrecBytes, realPrecision);
+	//a double has 8 bytes: clear all six trailing bytes (not just bytes 2 and 3) so that
+	//the bound used here equals the one rebuilt from the two stored bytes
+	memset(&realPrecBytes[2], 0, 6);
+	approxPrecision = bytesToDouble(realPrecBytes);
+	//put the realPrecision in double* pwrErBound
+	pwrErrBound[p++] = approxPrecision;
+	//put the two bytes in pwrErrBoundBytes
+	pwrErrBoundBytes[k++] = realPrecBytes[0];
+	pwrErrBoundBytes[k++] = realPrecBytes[1];
+
+	free(statAbsValues);
+}
+
+/**
+ * Estimate the number of quantization intervals for the 2D point-wise relative mode.
+ *
+ * Elements with i>=1, j>=1 and (i+j) a multiple of confparams_cpr->sampleDistance are
+ * predicted from their left, upper and upper-left neighbours. The prediction error, in
+ * units of twice the error bound of the block containing the element, is counted in a
+ * histogram of confparams_cpr->maxRangeRadius bins (larger errors go to the last bin).
+ * The first radius whose cumulative count exceeds predThreshold times the sample count
+ * is doubled and passed to roundUpToPowerOf2().
+ *
+ * @param oriData     input values, r1 rows of r2 elements
+ * @param r1          number of rows
+ * @param r2          number of columns
+ * @param R2          number of block columns, i.e. the row stride of pwrErrBound
+ * @param edgeSize    block edge length
+ * @param pwrErrBound per-block error bounds, indexed as [I*R2+J]
+ * @return the rounded interval count, never smaller than 32
+ */
+unsigned int optimize_intervals_double_2D_pwr(double *oriData, size_t r1, size_t r2, size_t R2, size_t edgeSize, double* pwrErrBound)
+{
+	size_t i = 0,j = 0, index, I=0, J=0;
+	double realPrecision = pwrErrBound[0];
+	unsigned long radiusIndex;
+	double pred_value = 0, pred_err, radius;
+	int *intervals = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	size_t totalSampleSize = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+	size_t ir2;
+	for(i=1;i<r1;i++)
+	{
+		ir2 = i*r2;
+		if(i%edgeSize==0)
+			I++;
+		//the block column restarts on every row; resetting it only when I changes let J
+		//grow past R2 and index pwrErrBound outside the current block row
+		J = 0;
+		for(j=1;j<r2;j++)
+		{
+			index = ir2+j;
+			if(j%edgeSize==0)
+				J++;
+
+			if((i+j)%confparams_cpr->sampleDistance==0)
+			{
+				realPrecision = pwrErrBound[I*R2+J];
+				pred_value = oriData[index-1] + oriData[index-r2] - oriData[index-r2-1];
+				pred_err = fabs(pred_value - oriData[index]);
+				radius = (pred_err/realPrecision+1)/2;
+				//clamp while still in floating point: a zero bound yields inf/NaN, and
+				//converting such a value to an integer type is undefined behaviour
+				if(!(radius<confparams_cpr->maxRangeRadius))
+					radiusIndex = confparams_cpr->maxRangeRadius - 1;
+				else
+					radiusIndex = (unsigned long)radius;
+				intervals[radiusIndex]++;
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	//printf("confparams_cpr->maxRangeRadius = %d, accIntervals=%d, powerOf2=%d\n", confparams_cpr->maxRangeRadius, accIntervals, powerOf2);
+	return powerOf2;
+}
+
+/**
+ * Compute the point-wise relative error bound of each cubic block of a 3D double array
+ * (r1 x r2 x r3 elements, the r3 index varying fastest).
+ *
+ * statAbsValues is an R2 x R3 table holding, per block column of the current slab of
+ * blocks, the minimum (SZ_PWR_MIN_TYPE) or maximum (SZ_PWR_MAX_TYPE) absolute value of
+ * the non-zero elements seen so far. When a block is complete, its statistic times
+ * confparams_cpr->pw_relBoundRatio is cut down to its two leading bytes and appended
+ * to pwrErrBound / pwrErrBoundBytes, and the table entry is re-seeded.
+ *
+ * NOTE(review): globalPrecision is not read anywhere in this function, so the
+ * *_AND_PW_REL / *_OR_PW_REL handling of the 1D/2D variants has no counterpart here.
+ * NOTE(review): there is no accumulation branch for SZ_PWR_AVG_TYPE; with that type the
+ * table entries stay at their initial 0 - confirm whether this is intended.
+ *
+ * @param oriData          input values, r1*r2*r3 elements
+ * @param pwrErrBound      output, one truncated bound per flushed block
+ * @param r1,r2,r3         extents of the three dimensions
+ * @param R2,R3            dimensions of the statistic table
+ * @param edgeSize         block edge length
+ * @param pwrErrBoundBytes output, two bytes per flushed block
+ * @param Min,Max          extreme input values, used to seed the statistics
+ * @param globalPrecision  unused (see note above)
+ */
+void compute_segment_precisions_double_3D(double *oriData, double* pwrErrBound, 
+size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, unsigned char* pwrErrBoundBytes, double Min, double Max, double globalPrecision)
+{
+	size_t i = 0, j = 0, k = 0, p = 0, q = 0, index = 0, J = 0, K = 0; //I=-1,J=-1 if they are needed
+	size_t r23 = r2*r3, ir, jr;
+	double realPrecision; 
+	double approxPrecision;
+	unsigned char realPrecBytes[8];
+	double curValue, curAbsValue;
+	
+	double** statAbsValues = create2DArray_double(R2, R3);
+	double max = fabs(Min)<fabs(Max)?fabs(Max):fabs(Min); //get the max abs value.	
+	double min = fabs(Min)<fabs(Max)?fabs(Min):fabs(Max);
+	for(i=0;i<R2;i++)
+		for(j=0;j<R3;j++)
+		{
+			if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+				statAbsValues[i][j] = max;
+			else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+				statAbsValues[i][j] = min;
+			else
+				statAbsValues[i][j] = 0;
+		}
+	for(i=0;i<r1;i++)
+	{
+		ir = i*r23;		
+		if(i%edgeSize==0&&i>0) /* first plane of a new slab: flush the entry J,K still point at */
+		{
+			realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+			doubleToBytes(realPrecBytes, realPrecision);
+			memset(&realPrecBytes[2], 0, 6); /* keep only the two leading bytes */
+			approxPrecision = bytesToDouble(realPrecBytes);
+			//put the realPrecision in double* pwrErBound
+			pwrErrBound[p++] = approxPrecision;
+			//put the two bytes in pwrErrBoundBytes
+			//printf("q=%d, i=%d, j=%d, k=%d\n",q,i,j,k);
+			pwrErrBoundBytes[q++] = realPrecBytes[0];
+			pwrErrBoundBytes[q++] = realPrecBytes[1];
+			if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+				statAbsValues[J][K] = max;
+			else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+				statAbsValues[J][K] = min;
+		}		
+		for(j=0;j<r2;j++)
+		{
+			jr = j*r3;
+			if((i%edgeSize==edgeSize-1 || i == r1-1)&&j%edgeSize==0&&j>0) /* last plane of the slab, new block row: flush before J advances */
+			{
+				realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+				doubleToBytes(realPrecBytes, realPrecision);
+				memset(&realPrecBytes[2], 0, 6);
+				approxPrecision = bytesToDouble(realPrecBytes);
+				//put the realPrecision in double* pwrErBound
+				pwrErrBound[p++] = approxPrecision;
+				//put the two bytes in pwrErrBoundBytes
+				//printf("q=%d, i=%d, j=%d, k=%d\n",q,i,j,k);
+				pwrErrBoundBytes[q++] = realPrecBytes[0];
+				pwrErrBoundBytes[q++] = realPrecBytes[1];
+				if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+					statAbsValues[J][K] = max;
+				else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+					statAbsValues[J][K] = min;			
+			}
+			
+			if(j==0)
+				J = 0;
+			else if(j%edgeSize==0)
+				J++;					
+			
+			for(k=0;k<r3;k++)
+			{
+				index = ir+jr+k;				
+				curValue = oriData[index];				
+				if((i%edgeSize==edgeSize-1 || i == r1-1)&&(j%edgeSize==edgeSize-1||j==r2-1)&&k%edgeSize==0&&k>0) /* last plane and last row of a block: flush before K advances */
+				{
+					realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+					doubleToBytes(realPrecBytes, realPrecision);
+					memset(&realPrecBytes[2], 0, 6);
+					approxPrecision = bytesToDouble(realPrecBytes);
+					//put the realPrecision in double* pwrErBound
+					pwrErrBound[p++] = approxPrecision;
+					//put the two bytes in pwrErrBoundBytes
+					//printf("q=%d, i=%d, j=%d, k=%d\n",q,i,j,k);
+					pwrErrBoundBytes[q++] = realPrecBytes[0];
+					pwrErrBoundBytes[q++] = realPrecBytes[1];
+					
+					if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+						statAbsValues[J][K] = max;
+					else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+						statAbsValues[J][K] = min;	
+				}	
+
+				if(k==0)
+					K = 0;
+				else if(k%edgeSize==0)
+					K++;
+					
+				if(curValue!=0) /* zeros do not contribute to the statistic */
+				{
+					curAbsValue = fabs(curValue);
+					if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+					{
+						if(statAbsValues[J][K]>curAbsValue)
+						{
+							statAbsValues[J][K] = curAbsValue;
+						}
+					}
+					else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+					{
+						if(statAbsValues[J][K]<curAbsValue)
+						{
+							statAbsValues[J][K] = curAbsValue;
+						}
+					}
+				}
+			}			
+		}
+	}	
+	
+	realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K]; /* flush the entry of the very last block */
+	doubleToBytes(realPrecBytes, realPrecision);
+	memset(&realPrecBytes[2], 0, 6);
+	approxPrecision = bytesToDouble(realPrecBytes);
+	//put the realPrecision in double* pwrErBound
+	pwrErrBound[p++] = approxPrecision;
+	//put the two bytes in pwrErrBoundBytes
+	pwrErrBoundBytes[q++] = realPrecBytes[0];
+	pwrErrBoundBytes[q++] = realPrecBytes[1];
+	
+	free2DArray_double(statAbsValues, R2);
+}
+
+/**
+ * Estimate the number of quantization intervals for the 3D point-wise relative mode.
+ *
+ * Elements with i,j,k>=1 and (i+j+k) a multiple of confparams_cpr->sampleDistance are
+ * predicted from their seven already-visited neighbours. The prediction error, in
+ * units of twice the error bound of the block containing the element, is counted in a
+ * histogram of confparams_cpr->maxRangeRadius bins (larger errors go to the last bin).
+ * The first radius whose cumulative count exceeds predThreshold times the sample count
+ * is doubled and passed to roundUpToPowerOf2().
+ *
+ * @param oriData     input values, r1*r2*r3 elements, the r3 index varying fastest
+ * @param r1,r2,r3    extents of the three dimensions
+ * @param R2,R3       number of blocks along the second and third dimension
+ * @param edgeSize    block edge length
+ * @param pwrErrBound per-block error bounds, indexed as [I*R2*R3 + J*R3 + K]
+ * @return the rounded interval count, never smaller than 32
+ */
+unsigned int optimize_intervals_double_3D_pwr(double *oriData, size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, double* pwrErrBound)
+{
+	size_t i,j,k, ir,jr,index, I = 0,J=0,K=0;
+	double realPrecision = pwrErrBound[0];
+	unsigned long radiusIndex;
+	size_t r23=r2*r3;
+	size_t R23 = R2*R3;
+	double pred_value = 0, pred_err, radius;
+	int *intervals = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		ir = i*r23;
+		if(i%edgeSize==0)
+			I++;
+		//block indices of the inner dimensions restart with every pass over them;
+		//otherwise they keep growing and address bounds of unrelated blocks
+		J = 0;
+		for(j=1;j<r2;j++)
+		{
+			jr = j*r3;
+			if(j%edgeSize==0)
+				J++;
+			K = 0;
+			for(k=1;k<r3;k++)
+			{
+				index = ir+jr+k;
+				if(k%edgeSize==0)
+					K++;
+				if((i+j+k)%confparams_cpr->sampleDistance==0)
+				{
+					//bounds are stored block by block with K varying fastest, so the stride of J is R3
+					realPrecision = pwrErrBound[I*R23+J*R3+K];
+					pred_value = oriData[index-1] + oriData[index-r3] + oriData[index-r23] 
+					- oriData[index-1-r23] - oriData[index-r3-1] - oriData[index-r3-r23] + oriData[index-r3-r23-1];
+					pred_err = fabs(pred_value - oriData[index]);
+					radius = (pred_err/realPrecision+1)/2;
+					//clamp while still in floating point: a zero bound yields inf/NaN, and
+					//converting such a value to an integer type is undefined behaviour
+					if(!(radius<confparams_cpr->maxRangeRadius))
+						radiusIndex = confparams_cpr->maxRangeRadius - 1;
+					else
+						radiusIndex = (unsigned long)radius;
+					intervals[radiusIndex]++;
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	//printf("accIntervals=%d, powerOf2=%d\n", accIntervals, powerOf2);
+	return powerOf2;
+}
+
+/**
+ * Compress a 1D double array under per-segment (point-wise relative) error bounds.
+ *
+ * Steps:
+ *  1. compute_segment_precisions_double_1D() derives one bound per segment;
+ *  2. the quantization interval count is either optimised from the data
+ *     (exe_params->optQuantMode==1) or taken from exe_params->intvCapacity;
+ *  3. elements 0 and 1 are always stored through compressSingleDoubleValue();
+ *  4. every further element is predicted by the previously reconstructed value; if the
+ *     error is below checkRadius it is stored as a quantization code in type[],
+ *     otherwise type[] is 0 and the value goes through compressSingleDoubleValue();
+ *  5. everything is packed with new_TightDataPointStorageD2() and serialised by
+ *     convertTDPStoFlatBytes_double().
+ * If the serialised stream is larger than the raw data, the output is replaced by a
+ * small header (3 version bytes, one flag byte, the element count in
+ * exe_params->SZ_SIZE_TYPE bytes) followed by the values themselves.
+ *
+ * @param newByteData     output: pointer to the resulting byte stream
+ * @param oriData         input values; elements 0 and 1 are read unconditionally
+ * @param globalPrecision bound forwarded to compute_segment_precisions_double_1D()
+ * @param dataLength      number of elements in oriData
+ * @param outSize         output: size of the byte stream
+ * @param min             smallest input value
+ * @param max             largest input value
+ */
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, 
+size_t dataLength, size_t *outSize, double min, double max)
+{
+	size_t pwrLength = dataLength%confparams_cpr->segment_size==0?dataLength/confparams_cpr->segment_size:dataLength/confparams_cpr->segment_size+1;
+	double* pwrErrBound = (double*)malloc(sizeof(double)*pwrLength);
+	size_t pwrErrBoundBytes_size = sizeof(unsigned char)*pwrLength*2;
+	unsigned char* pwrErrBoundBytes = (unsigned char*)malloc(pwrErrBoundBytes_size);
+
+	compute_segment_precisions_double_1D(oriData, dataLength, pwrErrBound, pwrErrBoundBytes, globalPrecision);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_1D_pwr(oriData, dataLength, pwrErrBound);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i = 0, j = 0;
+	int reqLength;
+	double realPrecision = pwrErrBound[j++];
+	double medianValue = 0;
+	double radius = fabs(max)<fabs(min)?fabs(min):fabs(max);
+	short radExpo = getExponent_double(radius);
+
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;
+
+	double* spaceFillingValue = oriData; //
+
+	DynamicByteArray *resiBitLengthArray;
+	new_DBA(&resiBitLengthArray, DynArrayInitLen);
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	type[0] = 0;
+
+	unsigned char preDataBytes[8] = {0};
+	intToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	double last3CmprsData[3] = {0};
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	//add the first data
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_double(last3CmprsData, vce->data);
+	//printf("%.30G\n",last3CmprsData[0]);
+
+	//add the second data
+	type[1] = 0;
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+	compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_double(last3CmprsData, vce->data);
+	//printf("%.30G\n",last3CmprsData[0]);
+
+	int state;
+	double checkRadius;
+	double curData;
+	double pred;
+	double predAbsErr;
+	checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double interval = 2*realPrecision;
+	int updateReqLength = 0; //a marker: 1 means already updated
+
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		if(i%confparams_cpr->segment_size==0)
+		{
+			//entering a new segment: switch to its error bound
+			realPrecision = pwrErrBound[j++];
+			checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+			interval = 2*realPrecision;
+			updateReqLength = 0;
+		}
+		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		pred = last3CmprsData[0];
+		predAbsErr = fabs(curData - pred);
+		if(predAbsErr<checkRadius)
+		{
+			//predictable: store the signed interval index, keep the reconstructed value
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			listAdd_double(last3CmprsData, pred);
+			continue;
+		}
+
+		//unpredictable data processing
+		if(updateReqLength==0)
+		{
+			//the required bit length depends on the bound; recompute it lazily once per segment
+			computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+			reqBytesLength = reqLength/8;
+			resiBitsLength = reqLength%8;
+			updateReqLength = 1;
+		}
+
+		type[i] = 0;
+		addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+
+		compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+
+		listAdd_double(last3CmprsData, vce->data);
+	}//end of for
+
+	int exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD2(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitLengthArray->array, resiBitLengthArray->size, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, pwrErrBoundBytes, pwrErrBoundBytes_size, radExpo);
+
+	//free memory
+	free_DBA(resiBitLengthArray);
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+
+	convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+
+	int doubleSize=sizeof(double);
+	if(*outSize>dataLength*doubleSize)
+	{
+		//the encoded stream is larger than the input: store the raw values instead
+		//NOTE(review): *newByteData is overwritten below; if convertTDPStoFlatBytes_double()
+		//allocated the previous buffer, it is no longer reachable - confirm ownership
+		size_t k = 0;
+		tdps->isLossless = 1;
+		size_t totalByteLength = 3 + exe_params->SZ_SIZE_TYPE + 1 + doubleSize*dataLength;
+		*newByteData = (unsigned char*)malloc(totalByteLength);
+
+		//the element count occupies SZ_SIZE_TYPE (4 or 8) bytes; write all of them so
+		//that no uninitialised byte ends up in the header when the size type is 8
+		unsigned char dsLengthBytes[8] = {0};
+		if(exe_params->SZ_SIZE_TYPE==4)
+			intToBytes_bigEndian(dsLengthBytes, dataLength);//4
+		else
+			longToBytes_bigEndian(dsLengthBytes, dataLength);//8
+		for (i = 0; i < 3; i++)//3
+			(*newByteData)[k++] = versionNumber[i];
+
+		if(exe_params->SZ_SIZE_TYPE==4)
+		{
+			(*newByteData)[k++] = 16;	//=00010000
+		}
+		else
+		{
+			(*newByteData)[k++] = 80;
+		}
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)//4 or 8
+			(*newByteData)[k++] = dsLengthBytes[i];
+
+
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+			memcpy((*newByteData)+4+exe_params->SZ_SIZE_TYPE, oriData, dataLength*doubleSize);
+		else
+		{
+			unsigned char* p = (*newByteData)+4+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=doubleSize)
+				doubleToBytes(p, oriData[i]);
+		}
+		*outSize = totalByteLength;
+	}
+
+	free(pwrErrBound);
+
+	free(vce);
+	free(lce);
+	free_TightDataPointStorageD(tdps);
+	//only the wrapper struct is released here; its ->array was handed to tdps above
+	free(exactMidByteArray);
+}
+
+
+/**
+ * Compress a 2D double array under per-block (point-wise relative) error bounds.
+ *
+ * Note: @r1 is high dimension
+ * 		 @r2 is low dimension 
+ *
+ * The data is covered by R1 x R2 square blocks of edge blockEdgeSize, each with its own
+ * bound from compute_segment_precisions_double_2D(). P1 holds the reconstructed values
+ * of the previous row and P0 those of the current row; the two are swapped after each
+ * row. Element (0,0) is stored through compressSingleDoubleValue(). Every other
+ * element is predicted (row 0: from its left neighbour(s); column 0: from the element
+ * above; elsewhere: left + above - above-left). If the prediction error fits into the
+ * quantization range, an interval code is written to type[]; otherwise type[] is 0 and
+ * the value goes through compressSingleDoubleValue(). The result is packed with
+ * new_TightDataPointStorageD2() and serialised by convertTDPStoFlatBytes_double().
+ *
+ * NOTE(review): element 1 of row 0 is processed unconditionally (spaceFillingValue[1],
+ * type[1], P1[1]); with r2 == 1 this looks like an access past the r2-element buffers -
+ * confirm that callers never pass r2 == 1.
+ * NOTE(review): P0 is released only when r2 != 1 although it is always allocated.
+ *
+ * @param newByteData     output: pointer to the resulting byte stream
+ * @param oriData         input values, r1 rows of r2 elements
+ * @param globalPrecision bound forwarded to compute_segment_precisions_double_2D()
+ * @param outSize         output: size of the byte stream
+ * @param min             smallest input value
+ * @param max             largest input value
+ * */
+void SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, size_t r1, size_t r2,
+size_t *outSize, double min, double max)
+{
+	size_t dataLength=r1*r2;
+	int blockEdgeSize = computeBlockEdgeSize_2D(confparams_cpr->segment_size);
+	size_t R1 = 1+(r1-1)/blockEdgeSize; /* number of block rows */
+	size_t R2 = 1+(r2-1)/blockEdgeSize; /* number of block columns */
+	double* pwrErrBound = (double*)malloc(sizeof(double)*R1*R2);
+	size_t pwrErrBoundBytes_size = sizeof(unsigned char)*R1*R2*2;
+	unsigned char* pwrErrBoundBytes = (unsigned char*)malloc(pwrErrBoundBytes_size);
+	
+	compute_segment_precisions_double_2D(oriData, pwrErrBound, r1, r2, R2, blockEdgeSize, pwrErrBoundBytes, min, max, globalPrecision);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_2D_pwr(oriData, r1, r2, R2, blockEdgeSize, pwrErrBound);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;	
+	//printf("quantization_intervals=%d\n",quantization_intervals);
+	
+	size_t i=0,j=0,I=0,J=0; 
+	int reqLength;
+	double realPrecision = pwrErrBound[I*R2+J];	
+	double pred1D, pred2D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+	
+	P0 = (double*)malloc(r2*sizeof(double));
+	memset(P0, 0, r2*sizeof(double));
+	P1 = (double*)malloc(r2*sizeof(double));
+	memset(P1, 0, r2*sizeof(double));
+		
+	double medianValue = 0;
+	double radius = fabs(max)<fabs(min)?fabs(min):fabs(max);
+	short radExpo = getExponent_double(radius);
+	int updateReqLength = 1; /* 0 = reqLength must be recomputed for the current bound before the next exact value */
+	
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;
+		
+	double* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *resiBitLengthArray;
+	new_DBA(&resiBitLengthArray, DynArrayInitLen);
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	type[0] = 0;
+	
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+			
+	/* Process Row-0 data 0: always stored through compressSingleDoubleValue() */
+	type[0] = 0;
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+
+	/* Process Row-0 data 1: predicted by the reconstructed element 0 */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum =  fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{		
+		type[1] = 0;
+
+		addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+		compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+
+    /* Process Row-0 data 2 --> data r2-1: predicted by linear extrapolation of the two left neighbours */
+	for (j = 2; j < r2; j++)
+	{
+		if(j%blockEdgeSize==0)
+		{
+			J++;
+			realPrecision = pwrErrBound[I*R2+J];
+			updateReqLength = 0;
+		}
+
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}
+
+			type[j] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleDoubleValue(vce, spaceFillingValue[j], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0: predicted by the reconstructed element above it */
+		index = i*r2;
+		J = 0;
+		if(i%blockEdgeSize==0)
+			I++;
+		realPrecision = pwrErrBound[I*R2+J]; //J==0
+		updateReqLength = 0;
+		
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}
+			
+			type[index] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+									
+		/* Process row-i data 1 --> r2-1: predicted as left + above - above-left */
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			if(j%blockEdgeSize==0)
+			{
+				J++;
+				realPrecision = pwrErrBound[I*R2+J];
+				updateReqLength = 0;
+			}
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}
+
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+		}
+
+		double *Pt; /* swap the row buffers: the row just finished becomes the "previous" row */
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+		
+	if(r2!=1)	
+		free(P0);
+	free(P1);
+	int exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageD* tdps;
+			
+	new_TightDataPointStorageD2(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitLengthArray->array, resiBitLengthArray->size, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, pwrErrBoundBytes, pwrErrBoundBytes_size, radExpo);
+	
+	//free memory
+	free_DBA(resiBitLengthArray);
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+		
+	convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+
+	free(pwrErrBound);
+	
+	free(vce);
+	free(lce);
+	free_TightDataPointStorageD(tdps);	
+	free(exactMidByteArray); /* frees the wrapper struct only; its ->array was passed to tdps above */
+}
+
+void SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, 
+size_t r1, size_t r2, size_t r3, size_t *outSize, double min, double max)
+{
+	size_t dataLength=r1*r2*r3;
+	/* Compress an r1 x r2 x r3 array of doubles (r3 is the fastest-varying index) using one error bound per block of edge blockEdgeSize; the result is serialized through convertTDPStoFlatBytes_double into newByteData/outSize. */
+	int blockEdgeSize = computeBlockEdgeSize_3D(confparams_cpr->segment_size);
+	size_t R1 = 1+(r1-1)/blockEdgeSize;
+	size_t R2 = 1+(r2-1)/blockEdgeSize;
+	size_t R3 = 1+(r3-1)/blockEdgeSize;
+	double* pwrErrBound = (double*)malloc(sizeof(double)*R1*R2*R3);
+	size_t pwrErrBoundBytes_size = sizeof(unsigned char)*R1*R2*R3*2;
+	unsigned char* pwrErrBoundBytes = (unsigned char*)malloc(pwrErrBoundBytes_size);	
+	/* pwrErrBound has one entry per block (R1*R2*R3); pwrErrBoundBytes reserves 2 bytes per block and is handed to the storage object below. Both are passed to compute_segment_precisions_double_3D to be filled. */
+	compute_segment_precisions_double_3D(oriData, pwrErrBound, r1, r2, r3, R2, R3, blockEdgeSize, pwrErrBoundBytes, min, max, globalPrecision);	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_3D_pwr(oriData, r1, r2, r3, R2, R3, blockEdgeSize, pwrErrBound);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i=0,j=0,k=0, I = 0, J = 0, K = 0;
+	int reqLength;
+	double realPrecision = pwrErrBound[0];		
+	double pred1D, pred2D, pred3D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+	/* P0/P1 are two scratch layers of r2*r3 values: inside the layer loop P1 is the previously processed layer and P0 the one being written; they are swapped after every layer. */
+	size_t r23 = r2*r3;
+	size_t R23 = R2*R3;
+	P0 = (double*)malloc(r23*sizeof(double));
+	P1 = (double*)malloc(r23*sizeof(double));
+	double radius = fabs(max)<fabs(min)?fabs(min):fabs(max);
+	double medianValue = 0;
+	short radExpo = getExponent_double(radius);
+	int updateReqLength = 0;
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[] receives one integer code per data point; it is set to 0 for points written through the exact-value path.
+
+	double* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *resiBitLengthArray;
+	new_DBA(&resiBitLengthArray, DynArrayInitLen);
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	type[0] = 0;
+	/* preDataBytes carries the bytes of the most recently stored exact value (copied from vce->curBytes after each exact store). */
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+	type[0] = 0;
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+	/* NOTE(review): element 1 of row 0 is processed unconditionally (type[1], P1[1]), so this code assumes r3 >= 2 - confirm callers guarantee it. */
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+	/* Express the prediction error in units of realPrecision; values below intvCapacity are recorded as a code in type[], everything else takes the exact-value path. */
+	itvNum = fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		if(updateReqLength==0)
+		{
+			computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+			reqBytesLength = reqLength/8;
+			resiBitsLength = reqLength%8;
+			updateReqLength = 1;
+		}		
+		/* Exact-value path: mark the point with type 0, append its bytes to the exact arrays, and use the stored value (vce->data) as the reference for later predictions. */
+		type[1] = 0;
+
+		addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+		compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		if(j%blockEdgeSize==0)
+		{
+			J++;
+			realPrecision = pwrErrBound[J];
+			updateReqLength = 0;
+		}		
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}			
+
+			type[j] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleDoubleValue(vce, spaceFillingValue[j], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	K = 0;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+		/* I, J, K are the block indices along r2, r3 and r1; realPrecision is re-read from pwrErrBound whenever a block boundary is crossed, and updateReqLength is cleared so reqLength is recomputed on the next exact store. */
+		J = 0;
+		if(i%blockEdgeSize==0)
+			I++;
+		realPrecision = pwrErrBound[I*R3+J]; //J==0
+		updateReqLength = 0;
+		/* First element of a row: predicted from the element at the same column in the previous row. */
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}		
+						
+			type[index] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index] = vce->data;
+		}
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++) //note that this j refers to fastest dimension (lowest order)
+		{
+			index = i*r3+j;		
+			if(j%blockEdgeSize==0)
+			{
+				J++;
+				realPrecision = pwrErrBound[I*R3+J];
+				updateReqLength = 0;
+			}			
+			/* Interior of layer 0: predicted from the left, upper and upper-left neighbours in the same layer. */
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}						
+				
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index] = vce->data;
+			}
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+	/* From here on P1 holds the previous layer and P0 receives the current one; index addresses the full array, index2D the position inside a layer. */
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;			
+		I = 0;
+		J = 0;
+		if(k%blockEdgeSize==0)
+			K++;
+		realPrecision = pwrErrBound[K*R23]; //J==0
+		updateReqLength = 0;
+		/* First element of the layer: predicted from the same position in the previous layer. */
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}					
+			
+			type[index] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			index = k*r23+j;	
+
+			if(j%blockEdgeSize==0)
+			{
+				J++;
+				realPrecision = pwrErrBound[K*R23+J];
+				updateReqLength = 0;			
+			}					
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}						
+				
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+
+			J = 0;
+			if(i%blockEdgeSize==0)
+				I++;
+			realPrecision = pwrErrBound[K*R23+I*R3+J]; //J==0
+			updateReqLength = 0;			
+			/* Row start inside a layer: uses the previous row of this layer (P0) and the same/previous row of the previous layer (P1). */
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}						
+				
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+				index = k*r23 + i*r3 + j;
+				/* General case: the prediction combines the seven already-processed neighbours from the current (P0) and previous (P1) layers. */
+				if(j%blockEdgeSize==0)
+				{
+					J++;
+					realPrecision = pwrErrBound[K*R23+I*R3+J];
+					updateReqLength = 0;			
+				}							
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					if(updateReqLength==0)
+					{
+						computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+						reqBytesLength = reqLength/8;
+						resiBitsLength = reqLength%8;
+						updateReqLength = 1;
+					}							
+					
+					type[index] = 0;
+
+					addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+					compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+		}
+
+		double *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	/* P0 and P1 are both malloc'd unconditionally above, so both are always released; freeing P0 only when r23!=1 would leak it for a one-element layer. */
+	free(P0);
+	free(P1);
+	int exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD2(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitLengthArray->array, resiBitLengthArray->size, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, pwrErrBoundBytes, pwrErrBoundBytes_size, radExpo);
+
+	convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+
+	//free memory
+	free_DBA(resiBitLengthArray);
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	/* NOTE(review): pwrErrBoundBytes and exactMidByteArray->array are not freed here; presumably tdps takes ownership and free_TightDataPointStorageD releases them - confirm. Only the exactMidByteArray wrapper struct is freed below. */
+	free(pwrErrBound);
+
+	free(vce);
+	free(lce);
+	free_TightDataPointStorageD(tdps);
+	free(exactMidByteArray);
+}
+
+void createRangeGroups_double(double** posGroups, double** negGroups, int** posFlags, int** negFlags)
+{
+	/* Allocate four zero-filled tables of GROUP_COUNT entries each (two of doubles, two of ints) and return them through the out-pointers. */
+	const size_t groupBytes = GROUP_COUNT*sizeof(double), flagBytes = GROUP_COUNT*sizeof(int);
+	double* posTable = (double*)malloc(groupBytes);
+	double* negTable = (double*)malloc(groupBytes);
+	int* posMarks = (int*)malloc(flagBytes);
+	int* negMarks = (int*)malloc(flagBytes);
+	memset(posTable, 0, groupBytes); *posGroups = posTable;
+	memset(negTable, 0, groupBytes); *negGroups = negTable;
+	memset(posMarks, 0, flagBytes); *posFlags = posMarks;
+	memset(negMarks, 0, flagBytes); *negFlags = negMarks;
+}
+
+void compressGroupIDArray_double(char* groupID, TightDataPointStorageD* tdps)
+{
+	/* Turn the group IDs into offset differences, encode them with encode_withTree, and attach the encoded bytes to tdps. */
+	const size_t count = tdps->dataSeriesLength;
+	const int deltaShift = 2*(GROUP_COUNT + 2); /* added to every difference */
+	int* shifted = (int*)malloc(count*sizeof(int));
+	unsigned char* encoded = NULL;
+	size_t encodedSize;
+	size_t pos;
+
+	/* The first entry has no predecessor: store the raw ID plus GROUP_COUNT so that it would not be negative. */
+	shifted[0] = groupID[0]+GROUP_COUNT;
+
+	/* Every later entry stores its difference from the previous ID, shifted by deltaShift. */
+	for(pos=1; pos<count; pos++)
+		shifted[pos] = (groupID[pos] - groupID[pos-1]) + deltaShift;
+
+	/* Encode with the tree obtained from SZ_Reset(), which is released right after use. */
+	HuffmanTree* tree = SZ_Reset();
+	encode_withTree(tree, shifted, count, &encoded, &encodedSize);
+	SZ_ReleaseHuffman(tree);
+	free(shifted);
+
+	/* The pwrErrBoundBytes fields of tdps carry the encoded group-ID array. */
+	tdps->pwrErrBoundBytes = encoded;
+	tdps->pwrErrBoundBytes_size = encodedSize;
+
+}
+
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_pwrGroup(double* oriData, size_t dataLength, int errBoundMode, 
+double absErrBound, double relBoundRatio, double pwrErrRatio, double valueRangeSize, double medianValue_f)
+{
+	size_t i;
+	double *posGroups, *negGroups, *groups;
+	double pos_01_group = 0, neg_01_group = 0; //[0,1] and [-1,0]
+	int *posFlags, *negFlags, *flags;
+	int pos_01_flag = 0, neg_01_flag = 0;
+	createRangeGroups_double(&posGroups, &negGroups, &posFlags, &negFlags);
+	size_t nbBins = (size_t)(1/pwrErrRatio);
+	if(nbBins%2==1)
+		nbBins++;
+	exe_params->intvRadius = nbBins;
+	/* 1D compressor that predicts each value from the last value recorded for its group (selected by sign and computeGroupNum_double); returns a newly built TightDataPointStorageD that the caller must release. */
+	int reqLength, status;
+	double medianValue = medianValue_f;
+	double realPrecision = (double)getRealPrecision_double(valueRangeSize, errBoundMode, absErrBound, relBoundRatio, &status);
+	if(realPrecision<0)
+		realPrecision = pwrErrRatio;
+	double realGroupPrecision; //precision (error) based on group ID
+	getPrecisionReqLength_double(realPrecision);
+	short radExpo = getExponent_double(valueRangeSize/2);
+	short lastGroupNum = 0, groupNum, grpNum = 0;
+	/* The intvRadius stored from nbBins above is overwritten here with the value derived from the per-group error bounds. */
+	double* groupErrorBounds = generateGroupErrBounds(errBoundMode, realPrecision, pwrErrRatio);
+	exe_params->intvRadius = generateGroupMaxIntervalCount(groupErrorBounds);
+	
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	char *groupID = (char*) malloc(dataLength*sizeof(char));
+	char *gp = groupID;
+	/* gp walks groupID in step with the data; listAdd_double_group receives it for every point (presumably to record that point's group ID - confirm). */
+	double* spaceFillingValue = oriData; 
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	/* Initialise the whole 8-byte buffer with longToBytes_bigEndian, as the other double compressors in this file do; it is compared against vce->curBytes on the first exact store. */
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+			
+	int state;
+	double curData, decValue;
+	double pred;
+	double predAbsErr;
+	double interval = 0;
+	
+	//add the first data	
+	type[0] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	/* NOTE(review): the first value's group number is taken from the stored value (vce->data), while the loop below uses the original value - confirm this is intended. */
+	curData = spaceFillingValue[0];
+	groupNum = computeGroupNum_double(vce->data);
+	/* Select the table for this value: pos/neg tables when groupNum >= 0, otherwise the single-slot [0,1] / [-1,0] entries. */
+	if(curData > 0 && groupNum >= 0)
+	{
+		groups = posGroups;
+		flags = posFlags;
+		grpNum = groupNum;
+	}
+	else if(curData < 0 && groupNum >= 0)
+	{
+		groups = negGroups;
+		flags = negFlags;
+		grpNum = groupNum;
+	}
+	else if(curData >= 0 && groupNum == -1)
+	{
+		groups = &pos_01_group;
+		flags = &pos_01_flag;
+		grpNum = 0;
+	}
+	else //curData < 0 && groupNum == -1
+	{
+		groups = &neg_01_group;
+		flags = &neg_01_flag;
+		grpNum = 0;
+	}
+		
+	listAdd_double_group(groups, flags, groupNum, spaceFillingValue[0], vce->data, gp);
+	gp++;
+	
+	for(i=1;i<dataLength;i++)
+	{
+		curData = oriData[i];
+		//printf("i=%d, posGroups[3]=%f, negGroups[3]=%f\n", i, posGroups[3], negGroups[3]);
+		
+		groupNum = computeGroupNum_double(curData);
+		
+		if(curData > 0 && groupNum >= 0)
+		{
+			groups = posGroups;
+			flags = posFlags;
+			grpNum = groupNum;
+		}
+		else if(curData < 0 && groupNum >= 0)
+		{
+			groups = negGroups;
+			flags = negFlags;
+			grpNum = groupNum;
+		}
+		else if(curData >= 0 && groupNum == -1)
+		{
+			groups = &pos_01_group;
+			flags = &pos_01_flag;
+			grpNum = 0;
+		}
+		else //curData < 0 && groupNum == -1
+		{
+			groups = &neg_01_group;
+			flags = &neg_01_flag;
+			grpNum = 0;
+		}
+		/* Three cases follow: group number beyond the tables (always exact), group not yet flagged (exact), group flagged (quantize against the group's last value). */
+		if(groupNum>=GROUP_COUNT)
+		{
+			type[i] = 0;
+			compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			listAdd_double_group(groups, flags, lastGroupNum, curData, vce->data, gp);	//set the group number to be last one in order to get the groupID array as smooth as possible.		
+		}
+		else if(flags[grpNum]==0) //the dec value may not be in the same group
+		{	
+			type[i] = 0;
+			compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			//decGroupNum = computeGroupNum_double(vce->data);
+			
+			//if(decGroupNum < groupNum)
+			//	decValue = curData>0?pow(2, groupNum):-pow(2, groupNum);
+			//else if(decGroupNum > groupNum)
+			//	decValue = curData>0?pow(2, groupNum+1):-pow(2, groupNum+1);
+			//else
+			//	decValue = vce->data;
+			
+			decValue = vce->data;	
+			listAdd_double_group(groups, flags, groupNum, curData, decValue, gp);
+			lastGroupNum = curData>0?groupNum + 2: -(groupNum+2);
+		}
+		else //if flags[groupNum]==1, the dec value must be in the same group
+		{
+			pred = groups[grpNum];
+			predAbsErr = fabs(curData - pred);
+			realGroupPrecision = groupErrorBounds[grpNum]; //compute real error bound
+			interval = realGroupPrecision*2;
+			state = (predAbsErr/realGroupPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				decValue = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				decValue = pred - state*interval;
+			}
+			//decGroupNum = computeGroupNum_double(pred);
+			/* A reconstructed value whose sign disagrees with the original is replaced by 0. */
+			if((decValue>0&&curData<0)||(decValue<0&&curData>=0))
+				decValue = 0;
+			//else
+			//{
+			//	if(decGroupNum < groupNum)
+			//		decValue = curData>0?pow(2, groupNum):-pow(2, groupNum);
+			//	else if(decGroupNum > groupNum)
+			//		decValue = curData>0?pow(2, groupNum+1):-pow(2, groupNum+1);
+			//	else
+			//		decValue = pred;				
+			//}
+			/* If the reconstruction misses the group's error bound, fall back to the exact-value path for this point. */
+			if(fabs(curData-decValue)>realGroupPrecision)
+			{	
+				type[i] = 0;
+				compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+
+				decValue = vce->data;	
+			}
+			
+			listAdd_double_group(groups, flags, groupNum, curData, decValue, gp);			
+			lastGroupNum = curData>=0?groupNum + 2: -(groupNum+2);			
+		}
+		gp++;	
+
+	}
+	
+	int exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageD* tdps;
+			
+	//combineTypeAndGroupIDArray(nbBins, dataLength, &type, groupID);
+
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, nbBins, NULL, 0, radExpo);	
+	/* The encoded group-ID array is stored in the pwrErrBoundBytes fields of tdps. */
+	compressGroupIDArray_double(groupID, tdps);
+	
+	free(posGroups);
+	free(negGroups);
+	free(posFlags);
+	free(negFlags);
+	free(groupID);
+	free(groupErrorBounds);
+	
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageD(tdps);	
+	
+	return tdps;
+}
+
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(unsigned char** newByteData, double *oriData,
+size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio, double valueRangeSize, double medianValue_f, size_t *outSize)
+{
+        /* Run the 1D group-based compressor, serialize its result, and hand over to StoreOriData when the output exceeds the size limit. */
+        TightDataPointStorageD* packed = SZ_compress_double_1D_MDQ_pwrGroup(oriData, dataLength, confparams_cpr->errorBoundMode,
+                absErrBound, relBoundRatio, pwrErrRatio, valueRangeSize, medianValue_f);
+        convertTDPStoFlatBytes_double(packed, newByteData, outSize);
+
+        /* Size limit: a few fixed bytes plus the raw doubles themselves. */
+        size_t rawLimit = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(double)*dataLength;
+        if(*outSize > rawLimit)
+                SZ_compress_args_double_StoreOriData(oriData, dataLength, newByteData, outSize);
+        free_TightDataPointStorageD(packed);
+}
+
+#include <stdbool.h>
+
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_pre_log(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, double min, double max){
+	/* Compress log2 of the magnitudes with SZ_compress_double_1D_MDQ, using a bound derived from log2(1+pwrErrRatio), and record the signs separately. */
+	double * logMag = (double *) malloc(dataLength * sizeof(double));
+	unsigned char * signBits = (unsigned char *) malloc(dataLength);
+	memset(signBits, 0, dataLength);
+
+	/* Seed the running extremes from the caller-supplied min/max (an endpoint equal to 0 is skipped). */
+	double absLogPeak;
+	if(min == 0) absLogPeak = fabs(log2(fabs(max)));
+	else if(max == 0) absLogPeak = fabs(log2(fabs(min)));
+	else{
+		double fromMin = fabs(log2(fabs(min)));
+		double fromMax = fabs(log2(fabs(max)));
+		absLogPeak = fromMin > fromMax ? fromMin : fromMax;
+	}
+	double lowestLog = absLogPeak;
+	bool allNonNegative = true;
+	size_t idx;
+	for(idx=0; idx<dataLength; idx++){
+		double mag = oriData[idx];
+		if(mag < 0){
+			signBits[idx] = 1;
+			mag = -mag;
+			allNonNegative = false;
+		}
+		if(mag > 0){
+			mag = log2(mag);
+			if(mag > absLogPeak) absLogPeak = mag;
+			if(mag < lowestLog) lowestLog = mag;
+		}
+		logMag[idx] = mag; /* non-positive magnitudes are stored unchanged; zeros are replaced below */
+	}
+
+	double valueRangeSize, medianValue_f;
+	computeRangeSize_double(logMag, dataLength, &valueRangeSize, &medianValue_f);
+	if(fabs(lowestLog) > absLogPeak) absLogPeak = fabs(lowestLog);
+	double logBound = log2(1.0 + pwrErrRatio) - absLogPeak * 2.23e-16;
+	/* Zeros have no logarithm: give them a marker value below the smallest log seen. */
+	const double zeroMarker = lowestLog - 2.0001*logBound;
+	for(idx=0; idx<dataLength; idx++)
+		if(oriData[idx] == 0) logMag[idx] = zeroMarker;
+	TightDataPointStorageD* tdps = SZ_compress_double_1D_MDQ(logMag, dataLength, logBound, valueRangeSize, medianValue_f);
+	tdps->minLogValue = lowestLog - 1.0001*logBound;
+	free(logMag);
+
+	/* The sign array is compressed and attached only when at least one input was negative. */
+	unsigned char * packedSigns = NULL;
+	unsigned long packedSize = 0;
+	if(!allNonNegative)
+		packedSize = sz_lossless_compress(ZSTD_COMPRESSOR, 3, signBits, dataLength, &packedSigns);
+	tdps->pwrErrBoundBytes = packedSigns;
+	tdps->pwrErrBoundBytes_size = packedSize;
+	free(signBits);
+
+	convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+	if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(double)*dataLength)
+		SZ_compress_args_double_StoreOriData(oriData, dataLength, newByteData, outSize);
+	free_TightDataPointStorageD(tdps);
+}
+
+void SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr_pre_log(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t *outSize, double min, double max){
+	/* Compress log2 of the magnitudes with SZ_compress_double_2D_MDQ, using a bound derived from log2(1+pwrErrRatio), and record the signs separately. */
+	size_t dataLength = r1 * r2;
+	double * logMag = (double *) malloc(dataLength * sizeof(double));
+	unsigned char * signBits = (unsigned char *) malloc(dataLength);
+	memset(signBits, 0, dataLength);
+
+	/* Seed the running extremes from the caller-supplied min/max (an endpoint equal to 0 is skipped). */
+	double absLogPeak;
+	if(min == 0) absLogPeak = fabs(log2(fabs(max)));
+	else if(max == 0) absLogPeak = fabs(log2(fabs(min)));
+	else{
+		double fromMin = fabs(log2(fabs(min)));
+		double fromMax = fabs(log2(fabs(max)));
+		absLogPeak = fromMin > fromMax ? fromMin : fromMax;
+	}
+	double lowestLog = absLogPeak;
+	bool allNonNegative = true;
+	size_t idx;
+	for(idx=0; idx<dataLength; idx++){
+		double mag = oriData[idx];
+		if(mag < 0){
+			signBits[idx] = 1;
+			mag = -mag;
+			allNonNegative = false;
+		}
+		if(mag > 0){
+			mag = log2(mag);
+			if(mag > absLogPeak) absLogPeak = mag;
+			if(mag < lowestLog) lowestLog = mag;
+		}
+		logMag[idx] = mag; /* non-positive magnitudes are stored unchanged; zeros are replaced below */
+	}
+
+	double valueRangeSize, medianValue_f;
+	computeRangeSize_double(logMag, dataLength, &valueRangeSize, &medianValue_f);
+	if(fabs(lowestLog) > absLogPeak) absLogPeak = fabs(lowestLog);
+	double logBound = log2(1.0 + pwrErrRatio) - absLogPeak * 2.23e-16;
+	/* Zeros have no logarithm: give them a marker value below the smallest log seen. */
+	const double zeroMarker = lowestLog - 2.0001*logBound;
+	for(idx=0; idx<dataLength; idx++)
+		if(oriData[idx] == 0) logMag[idx] = zeroMarker;
+
+	TightDataPointStorageD* tdps = SZ_compress_double_2D_MDQ(logMag, r1, r2, logBound, valueRangeSize, medianValue_f);
+	tdps->minLogValue = lowestLog - 1.0001*logBound;
+	free(logMag);
+
+	/* The sign array is compressed and attached only when at least one input was negative. */
+	unsigned char * packedSigns = NULL;
+	unsigned long packedSize = 0;
+	if(!allNonNegative)
+		packedSize = sz_lossless_compress(ZSTD_COMPRESSOR, 3, signBits, dataLength, &packedSigns);
+	tdps->pwrErrBoundBytes = packedSigns;
+	tdps->pwrErrBoundBytes_size = packedSize;
+	free(signBits);
+
+	convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+	if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(double)*dataLength)
+		SZ_compress_args_double_StoreOriData(oriData, dataLength, newByteData, outSize);
+	free_TightDataPointStorageD(tdps);
+}
+
+void SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t r3, size_t *outSize, double min, double max){
+	/* Compress log2 of the magnitudes with SZ_compress_double_3D_MDQ, using a bound derived from log2(1+pwrErrRatio), and record the signs separately. */
+	size_t dataLength = r1 * r2 * r3;
+	double * logMag = (double *) malloc(dataLength * sizeof(double));
+	unsigned char * signBits = (unsigned char *) malloc(dataLength);
+	memset(signBits, 0, dataLength);
+
+	/* Seed the running extremes from the caller-supplied min/max (an endpoint equal to 0 is skipped). */
+	double absLogPeak;
+	if(min == 0) absLogPeak = fabs(log2(fabs(max)));
+	else if(max == 0) absLogPeak = fabs(log2(fabs(min)));
+	else{
+		double fromMin = fabs(log2(fabs(min)));
+		double fromMax = fabs(log2(fabs(max)));
+		absLogPeak = fromMin > fromMax ? fromMin : fromMax;
+	}
+	double lowestLog = absLogPeak;
+	bool allNonNegative = true;
+	size_t idx;
+	for(idx=0; idx<dataLength; idx++){
+		double mag = oriData[idx];
+		if(mag < 0){
+			signBits[idx] = 1;
+			mag = -mag;
+			allNonNegative = false;
+		}
+		if(mag > 0){
+			mag = log2(mag);
+			if(mag > absLogPeak) absLogPeak = mag;
+			if(mag < lowestLog) lowestLog = mag;
+		}
+		logMag[idx] = mag; /* non-positive magnitudes are stored unchanged; zeros are replaced below */
+	}
+
+	double valueRangeSize, medianValue_f;
+	computeRangeSize_double(logMag, dataLength, &valueRangeSize, &medianValue_f);
+	if(fabs(lowestLog) > absLogPeak) absLogPeak = fabs(lowestLog);
+	double logBound = log2(1.0 + pwrErrRatio) - absLogPeak * 2.23e-16;
+	/* Zeros have no logarithm: give them a marker value below the smallest log seen. */
+	const double zeroMarker = lowestLog - 2.0001*logBound;
+	for(idx=0; idx<dataLength; idx++)
+		if(oriData[idx] == 0) logMag[idx] = zeroMarker;
+	TightDataPointStorageD* tdps = SZ_compress_double_3D_MDQ(logMag, r1, r2, r3, logBound, valueRangeSize, medianValue_f);
+	tdps->minLogValue = lowestLog - 1.0001*logBound;
+	free(logMag);
+
+	/* The sign array is compressed and attached only when at least one input was negative. */
+	unsigned char * packedSigns = NULL;
+	unsigned long packedSize = 0;
+	if(!allNonNegative)
+		packedSize = sz_lossless_compress(ZSTD_COMPRESSOR, 3, signBits, dataLength, &packedSigns);
+	tdps->pwrErrBoundBytes = packedSigns;
+	tdps->pwrErrBoundBytes_size = packedSize;
+	free(signBits);
+
+	convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+	if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(double)*dataLength)
+		SZ_compress_args_double_StoreOriData(oriData, dataLength, newByteData, outSize);
+	free_TightDataPointStorageD(tdps);
+}
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_pre_log_MSST19(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, double valueRangeSize, double medianValue_f,
+																unsigned char* signs, bool* positive, double min, double max, double nearZero){ // 1D MSST19 variant: modifies oriData in place and frees the caller-supplied signs buffer; medianValue_f and min are not used
+	double multiplier = pow((1+pwrErrRatio), -3.0001); // factor that places the zero substitute below nearZero
+	for(size_t i=0; i<dataLength; i++){ // size_t index matches dataLength; an int index overflows once dataLength exceeds INT_MAX
+		if(oriData[i] == 0){
+			oriData[i] = nearZero * multiplier; // exact zeros are overwritten in the caller's array
+		}
+	}
+
+	double median_log = sqrt(fabs(nearZero * max)); // geometric mean of |nearZero| and |max|, passed on as the median value
+
+	TightDataPointStorageD* tdps = SZ_compress_double_1D_MDQ_MSST19(oriData, dataLength, pwrErrRatio, valueRangeSize, median_log);
+
+	tdps->minLogValue = nearZero / ((1+pwrErrRatio)*(1+pwrErrRatio)); // lies between the zero substitute and nearZero
+	if(!(*positive)){
+		unsigned char * comp_signs;
+		// compress signs
+		unsigned long signSize = sz_lossless_compress(ZSTD_COMPRESSOR, 3, signs, dataLength, &comp_signs);
+		tdps->pwrErrBoundBytes = comp_signs; // the compressed sign map is stored in the pwrErrBoundBytes field
+		tdps->pwrErrBoundBytes_size = signSize;
+	}
+	else{
+		tdps->pwrErrBoundBytes = NULL; // *positive set: no sign map is stored
+		tdps->pwrErrBoundBytes_size = 0;
+	}
+	free(signs);
+
+	convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+	if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(double)*dataLength) // result larger than header + raw doubles: store oriData (with zeros already substituted) instead
+		SZ_compress_args_double_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+	free_TightDataPointStorageD(tdps);
+}
+
+void SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr_pre_log_MSST19(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t *outSize, double valueRangeSize,
+																unsigned char* signs, bool* positive, double min, double max, double nearZero){
+	// 2D MSST19 variant: modifies oriData in place and frees the caller-supplied signs buffer; min is not used.
+	size_t dataLength = r1 * r2;
+
+	double multiplier = pow((1+pwrErrRatio), -3.0001); // factor that places the zero substitute below nearZero
+	for(size_t i=0; i<dataLength; i++){ // size_t index matches dataLength; an int index overflows once dataLength exceeds INT_MAX
+		if(oriData[i] == 0){
+			oriData[i] = nearZero * multiplier; // exact zeros are overwritten in the caller's array
+		}
+	}
+
+	double median_log = sqrt(fabs(nearZero * max)); // geometric mean of |nearZero| and |max|, passed on as the median value
+
+    TightDataPointStorageD* tdps = SZ_compress_double_2D_MDQ_MSST19(oriData, r1, r2, pwrErrRatio, valueRangeSize, median_log);
+    tdps->minLogValue = nearZero / ((1+pwrErrRatio)*(1+pwrErrRatio)); // lies between the zero substitute and nearZero
+
+    if(!*positive){
+	    unsigned char * comp_signs;
+		// compress signs
+		unsigned long signSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, signs, dataLength, &comp_signs);
+		tdps->pwrErrBoundBytes = comp_signs; // the compressed sign map is stored in the pwrErrBoundBytes field
+		tdps->pwrErrBoundBytes_size = signSize;
+	}
+	else{
+		tdps->pwrErrBoundBytes = NULL; // *positive set: no sign map is stored
+		tdps->pwrErrBoundBytes_size = 0;
+	}
+	free(signs);
+
+    convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+    if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(double)*dataLength) // result larger than header + raw doubles: store oriData (with zeros already substituted) instead
+            SZ_compress_args_double_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+    free_TightDataPointStorageD(tdps);
+}
+
+void SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log_MSST19(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t r3, size_t *outSize, double valueRangeSize, unsigned char* signs, bool* positive, double min, double max, double nearZero){
+	// 3D MSST19 variant: modifies oriData in place and frees the caller-supplied signs buffer; min is not used.
+	size_t dataLength = r1 * r2 * r3;
+
+	double multiplier = pow((1+pwrErrRatio), -3.0001); // factor that places the zero substitute below nearZero
+	for(size_t i=0; i<dataLength; i++){ // size_t index matches dataLength; an int index overflows once dataLength exceeds INT_MAX
+		if(oriData[i] == 0){
+			oriData[i] = nearZero * multiplier; // exact zeros are overwritten in the caller's array
+		}
+	}
+
+	double median_log = sqrt(fabs(nearZero * max)); // geometric mean of |nearZero| and |max|, passed on as the median value
+
+	TightDataPointStorageD* tdps = SZ_compress_double_3D_MDQ_MSST19(oriData, r1, r2, r3, pwrErrRatio, valueRangeSize, median_log);
+	tdps->minLogValue =  nearZero / ((1+pwrErrRatio)*(1+pwrErrRatio)); // lies between the zero substitute and nearZero
+
+	if(!*positive){
+		unsigned char * comp_signs;
+		// compress signs
+		unsigned long signSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, signs, dataLength, &comp_signs);
+		tdps->pwrErrBoundBytes = comp_signs; // the compressed sign map is stored in the pwrErrBoundBytes field
+		tdps->pwrErrBoundBytes_size = signSize;
+	}
+	else{
+		tdps->pwrErrBoundBytes = NULL; // *positive set: no sign map is stored
+		tdps->pwrErrBoundBytes_size = 0;
+	}
+	free(signs);
+
+	// serialize; fall back to storing oriData (with zeros already substituted) when that is smaller
+	convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+	if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(double)*dataLength)
+		SZ_compress_args_double_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+	free_TightDataPointStorageD(tdps);
+}
diff --git a/deps/SZ/sz/src/sz_double_ts.c b/deps/SZ/sz/src/sz_double_ts.c
new file mode 100644
index 0000000000000000000000000000000000000000..3c9b184ee6f515a2188b980e052575c0e0635d93
--- /dev/null
+++ b/deps/SZ/sz/src/sz_double_ts.c
@@ -0,0 +1,191 @@
+/**
+ *  @file sz_double_ts.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief 1D compression of double data predicted from the previous time step's stored values
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageD.h"
+#include "zlib.h"
+#include "rw.h"
+#include "sz_double_ts.h"
+
+unsigned int optimize_intervals_double_1D_ts(double *oriData, size_t dataLength, double* preData, double realPrecision)
+{
+	/* Picks a quantization-interval count (a power of two, at least 32) from a sampled
+	 * histogram of the prediction error |preData[i] - oriData[i]|. */
+	size_t pos, bucket;
+	double absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = dataLength/confparams_cpr->sampleDistance;
+	for(pos=2;pos<dataLength;pos++)
+	{
+		/* only every sampleDistance-th element is sampled */
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+		absErr = fabs(preData[pos] - oriData[pos]);
+		bucket = (unsigned long)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1; /* clamp large errors into the last bucket */
+		histogram[bucket]++;
+	}
+
+	/* smallest radius whose cumulative count exceeds the predThreshold share of the samples */
+	size_t wanted = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	pos = 0;
+	while(pos<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[pos];
+		if(covered>wanted)
+			break;
+		pos++;
+	}
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int doubled = 2*(pos+1);
+	unsigned int rounded = roundUpToPowerOf2(doubled);
+	free(histogram);
+	return (rounded<32) ? 32 : rounded;
+}
+
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_ts(double *oriData, size_t dataLength, sz_multisteps* multisteps,
+double realPrecision, double valueRangeSize, double medianValue_d)
+{	// Quantizes oriData against the values held in multisteps->hist_data and returns a new TightDataPointStorageD; elements 0 and 1 are accessed unconditionally, so dataLength must be >= 2.
+	double* preStepData = (double*)(multisteps->hist_data);
+	//store the decompressed data
+	//double* decData = (double*)malloc(sizeof(double)*dataLength);
+	//memset(decData, 0, sizeof(double)*dataLength);
+	double* decData = preStepData; // reconstructed values are written over the history buffer in place
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_double_1D_ts(oriData, dataLength, preStepData, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	// presumably refreshes exe_params->intvCapacity / intvRadius, which are read below - confirm
+
+	size_t i;
+	int reqLength;
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); // per-element code: 0 = stored exactly, otherwise intvRadius +/- state
+		
+	double* spaceFillingValue = oriData; //
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[8]; // bytes of the previously stored exact value, initialised to zero
+	longToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8; // whole bytes kept per exact value
+	int resiBitsLength = reqLength%8; // leftover bits kept per exact value
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));			
+				
+	//add the first data	
+	type[0] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	decData[0] = vce->data;
+		
+	//add the second data
+	type[1] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	decData[1] = vce->data;	
+	
+	int state = 0;
+	double checkRadius = 0;
+	double curData = 0;
+	double pred = 0;
+	double predAbsErr = 0;
+	checkRadius = (exe_params->intvCapacity-1)*realPrecision; // largest prediction error that is still quantized
+	double interval = 2*realPrecision; // width of one quantization step
+
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		pred = preStepData[i]; // history value at this index, read before decData[i] (same buffer) overwrites it
+		predAbsErr = fabs(curData - pred);	
+		if(predAbsErr<=checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2; // number of quantization steps between pred and curData
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			decData[i] = pred;	
+			continue;
+		}
+		
+		//unpredictable data processing
+		type[i] = 0;		
+		compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		decData[i] = vce->data;
+	}//end of for
+		
+	size_t exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageD* tdps;
+			
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+	
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //only the wrapper is freed; exactMidByteArray->array was passed to tdps (presumably released by free_TightDataPointStorageD(tdps) - confirm)
+		
+	//memcpy(preStepData, decData, dataLength*sizeof(double)); //update the data
+	//free(decData);
+	
+	return tdps;
+}
+
+
diff --git a/deps/SZ/sz/src/sz_float.c b/deps/SZ/sz/src/sz_float.c
new file mode 100644
index 0000000000000000000000000000000000000000..118bf11c346e52a74ed8c3c96571b23a375ae2cc
--- /dev/null
+++ b/deps/SZ/sz/src/sz_float.c
@@ -0,0 +1,10079 @@
+/**
+ *  @file sz_float.c
+ *  @author Sheng Di, Dingwen Tao, Xin Liang, Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang
+ *  @date Aug, 2016
+ *  @brief SZ_Init, Compression and Decompression functions
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageF.h"
+#include "sz_float.h"
+#include "sz_float_pwr.h"
+#include "szd_float.h"
+#include "szd_float_pwr.h"
+#include "zlib.h"
+#include "rw.h"
+#include "sz_float_ts.h"
+#include "utility.h"
+#include "CacheTable.h"
+#include "MultiLevelCacheTableWideInterval.h"
+#include "sz_stats.h"
+
+unsigned char* SZ_skip_compress_float(float* data, size_t dataLength, size_t* outSize)
+{	/* No compression: returns a malloc'ed byte-for-byte copy of data and reports its size in *outSize. */
+	const size_t byteCount = dataLength*sizeof(float);
+	*outSize = byteCount;
+	unsigned char* rawCopy = (unsigned char*)memcpy(malloc(byteCount), data, byteCount);
+	return rawCopy;
+}
+
+void computeReqLength_float(double realPrecision, short radExpo, int* reqLength, float* medianValue)
+{	/* Writes to *reqLength the value 9 + (radExpo - precExpo + 1), clamped to [9, 32]; on the upper clamp *medianValue is reset to 0. */
+	short precExpo = getPrecisionReqLength_double(realPrecision);
+	int bits = 9+radExpo - precExpo+1; /* radExpo-precExpo == required mantissa length */
+	if(bits<9) bits = 9;
+	else if(bits>32)
+	{
+		bits = 32;
+		*medianValue = 0;
+	}
+	*reqLength = bits;
+}
+
+inline short computeReqLength_float_MSST19(double realPrecision)
+{	/* Returns 9 minus the value getPrecisionReqLength_float yields for realPrecision. */
+	short precExpo = getPrecisionReqLength_float(realPrecision);
+	return (short)(9 - precExpo);
+}
+
+unsigned int optimize_intervals_float_1D(float *oriData, size_t dataLength, double realPrecision)
+{
+	/* Picks a quantization-interval count (a power of two, at least 32) from a sampled
+	 * histogram of the previous-neighbour prediction error |oriData[i-1] - oriData[i]|. */
+	size_t pos, bucket;
+	float absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = dataLength/confparams_cpr->sampleDistance;
+
+	for(pos=2;pos<dataLength;pos++)
+	{
+		/* only every sampleDistance-th element is sampled */
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+		absErr = fabs(oriData[pos-1] - oriData[pos]);
+		bucket = (unsigned long)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1; /* clamp large errors into the last bucket */
+		histogram[bucket]++;
+	}
+
+	/* smallest radius whose cumulative count exceeds the predThreshold share of the samples */
+	size_t wanted = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	pos = 0;
+	while(pos<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[pos];
+		if(covered>wanted)
+			break;
+		pos++;
+	}
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int doubled = 2*(pos+1);
+	unsigned int rounded = roundUpToPowerOf2(doubled);
+
+	free(histogram);
+	return (rounded<32) ? 32 : rounded;
+}
+
+unsigned int optimize_intervals_float_2D(float *oriData, size_t r1, size_t r2, double realPrecision)
+{	// Estimates the quantization-interval count (power of two, >= 32) for an r1 x r2 array from sampled 2D prediction errors.
+	size_t i,j, index;
+	size_t radiusIndex;
+	float pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t)); // histogram of error radii
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+
+	//float max = oriData[0];
+	//float min = oriData[0];
+
+	for(i=1;i<r1;i++) // row 0 and column 0 are skipped: the predictor needs the i-1 and j-1 neighbours
+	{
+		for(j=1;j<r2;j++)
+		{
+			if((i+j)%confparams_cpr->sampleDistance==0) // sample only points whose index sum is a multiple of sampleDistance
+			{
+				index = i*r2+j;
+				pred_value = oriData[index-1] + oriData[index-r2] - oriData[index-r2-1]; // left + upper - upper-left neighbour
+				pred_err = fabs(pred_value - oriData[index]);
+				radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+				if(radiusIndex>=confparams_cpr->maxRangeRadius)
+					radiusIndex = confparams_cpr->maxRangeRadius - 1; // clamp large errors into the last bucket
+				intervals[radiusIndex]++;
+
+			//	if (max < oriData[index]) max = oriData[index];
+			//	if (min > oriData[index]) min = oriData[index];
+			}			
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++) // find the first radius whose cumulative count exceeds targetCount
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	//	struct timeval costStart, costEnd;
+	//	double cost_est = 0;
+	//
+	//	gettimeofday(&costStart, NULL);
+	//
+	//	//compute estimate of bit-rate and distortion
+	//	double est_br = 0;
+	//	double est_psnr = 0;
+	//	double c1 = log2(targetCount)+1;
+	//	double c2 = -20.0*log10(realPrecision) + 20.0*log10(max-min) + 10.0*log10(3);
+	//
+	//	for (i = 0; i < powerOf2/2; i++)
+	//	{
+	//		int count = intervals[i];
+	//		if (count != 0)
+	//			est_br += count*log2(count);
+	//		est_psnr += count;
+	//	}
+	//
+	//	//compute estimate of bit-rate
+	//	est_br -= c1*est_psnr;
+	//	est_br /= totalSampleSize;
+	//	est_br = -est_br;
+	//
+	//	//compute estimate of psnr
+	//	est_psnr /= totalSampleSize;
+	//	printf ("sum of P(i) = %lf\n", est_psnr);
+	//	est_psnr = -10.0*log10(est_psnr);
+	//	est_psnr += c2;
+	//
+	//	printf ("estimate bitrate = %.2f\n", est_br);
+	//	printf ("estimate psnr = %.2f\n",est_psnr);
+	//
+	//	gettimeofday(&costEnd, NULL);
+	//	cost_est = ((costEnd.tv_sec*1000000+costEnd.tv_usec)-(costStart.tv_sec*1000000+costStart.tv_usec))/1000000.0;
+	//
+	//	printf ("analysis time = %f\n", cost_est);
+
+	free(intervals);
+	//printf("confparams_cpr->maxRangeRadius = %d, accIntervals=%d, powerOf2=%d\n", confparams_cpr->maxRangeRadius, accIntervals, powerOf2);
+	return powerOf2;
+}
+
+unsigned int optimize_intervals_float_3D(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{	// Estimates the quantization-interval count (power of two, >= 32) for an r1 x r2 x r3 array from sampled 3D prediction errors.
+	size_t i,j,k, index;
+	size_t radiusIndex;
+	size_t r23=r2*r3; // stride of the first dimension
+	float pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t)); // histogram of error radii
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	//float max = oriData[0];
+	//float min = oriData[0];
+
+	for(i=1;i<r1;i++) // index 0 is skipped in every dimension: the predictor needs the -1 neighbours
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{			
+				if((i+j+k)%confparams_cpr->sampleDistance==0) // sample only points whose index sum is a multiple of sampleDistance
+				{
+					index = i*r23+j*r3+k;
+					pred_value = oriData[index-1] + oriData[index-r3] + oriData[index-r23] 
+					- oriData[index-1-r23] - oriData[index-r3-1] - oriData[index-r3-r23] + oriData[index-r3-r23-1]; // three face neighbours minus three edge neighbours plus the corner
+					pred_err = fabs(pred_value - oriData[index]);
+					radiusIndex = (pred_err/realPrecision+1)/2; // implicit double -> size_t conversion (the sibling functions cast explicitly)
+					if(radiusIndex>=confparams_cpr->maxRangeRadius)
+					{
+						radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						//printf("radiusIndex=%d\n", radiusIndex);
+					}
+					intervals[radiusIndex]++;
+
+					//	if (max < oriData[index]) max = oriData[index];
+					//	if (min > oriData[index]) min = oriData[index];
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++) // find the first radius whose cumulative count exceeds targetCount
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+	
+	//	struct timeval costStart, costEnd;
+	//	double cost_est = 0;
+	//
+	//	gettimeofday(&costStart, NULL);
+	//
+	//	//compute estimate of bit-rate and distortion
+	//	double est_br = 0;
+	//	double est_psnr = 0;
+	//	double c1 = log2(targetCount)+1;
+	//	double c2 = -20.0*log10(realPrecision) + 20.0*log10(max-min) + 10.0*log10(3);
+	//
+	//	for (i = 0; i < powerOf2/2; i++)
+	//	{
+	//		int count = intervals[i];
+	//		if (count != 0)
+	//			est_br += count*log2(count);
+	//		est_psnr += count;
+	//	}
+	//
+	//	//compute estimate of bit-rate
+	//	est_br -= c1*est_psnr;
+	//	est_br /= totalSampleSize;
+	//	est_br = -est_br;
+	//
+	//	//compute estimate of psnr
+	//	est_psnr /= totalSampleSize;
+	//	printf ("sum of P(i) = %lf\n", est_psnr);
+	//	est_psnr = -10.0*log10(est_psnr);
+	//	est_psnr += c2;
+	//
+	//	printf ("estimate bitrate = %.2f\n", est_br);
+	//	printf ("estimate psnr = %.2f\n",est_psnr);
+	//
+	//	gettimeofday(&costEnd, NULL);
+	//	cost_est = ((costEnd.tv_sec*1000000+costEnd.tv_usec)-(costStart.tv_sec*1000000+costStart.tv_usec))/1000000.0;
+	//
+	//	printf ("analysis time = %f\n", cost_est);
+
+	free(intervals);
+	//printf("targetCount=%d, sum=%d, totalSampleSize=%d, ratio=%f, accIntervals=%d, powerOf2=%d\n", targetCount, sum, totalSampleSize, (double)sum/(double)totalSampleSize, accIntervals, powerOf2);
+	return powerOf2;
+}
+
+
+unsigned int optimize_intervals_float_4D(float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{	// Estimates the quantization-interval count (power of two, >= 32) for an r1 x r2 x r3 x r4 array; the predictor works over the last three dimensions only.
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4; // stride of i
+	size_t r34=r3*r4; // stride of j (the stride of k is r4, of l is 1)
+	float pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t)); // histogram of error radii
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0) // sample only points whose index sum is a multiple of sampleDistance
+					{
+						index = i*r234+j*r34+k*r4+l;
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34] // k-1 neighbour is at index-r4 (was index-r3, which is not a grid neighbour)
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = fabs(pred_value - oriData[index]);
+						radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1; // clamp large errors into the last bucket
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++) // find the first radius whose cumulative count exceeds targetCount
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+TightDataPointStorageF* SZ_compress_float_1D_MDQ(float *oriData, 
+size_t dataLength, float realPrecision, float valueRangeSize, float medianValue_f)
+{	// Quantizes a 1D float array using the previous reconstructed value as predictor and returns a new TightDataPointStorageF; elements 0 and 1 are accessed unconditionally, so dataLength must be >= 2.
+#ifdef HAVE_TIMECMPR	
+	float* decData = NULL;
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (float*)(multisteps->hist_data); // in temporal mode the reconstructed values are also written into the history buffer
+#endif	
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_float_1D_opt(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	//updateQuantizationInfo(quantization_intervals);	
+	int intvRadius = quantization_intervals/2; // local radius; the shared quantization info is not updated here (call above is disabled)
+
+	size_t i;
+	int reqLength;
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); // per-element code: 0 = stored exactly, otherwise intvRadius +/- state
+		
+	float* spaceFillingValue = oriData; //
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	unsigned char preDataBytes[4]; // bytes of the previously stored exact value, initialised to zero
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8; // whole bytes kept per exact value
+	int resiBitsLength = reqLength%8; // leftover bits kept per exact value
+	float last3CmprsData[3] = {0};
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+				
+	//add the first data	
+	type[0] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_float(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[0] = vce->data;
+#endif		
+		
+	//add the second data
+	type[1] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_float(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = vce->data;
+#endif
+	int state;
+	float checkRadius;
+	float curData;
+	float pred = last3CmprsData[0]; // presumably the most recently added value (element 1) - depends on listAdd_float, confirm
+	float predAbsErr;
+	checkRadius = (quantization_intervals-1)*realPrecision; // prediction errors below this are quantized
+	float interval = 2*realPrecision; // width of one quantization step
+	
+	float recip_precision = 1/realPrecision; // reciprocal hoisted out of the loop
+	
+	for(i=2;i<dataLength;i++)
+	{	
+		curData = spaceFillingValue[i];
+		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		//pred = last3CmprsData[0];
+		predAbsErr = fabsf(curData - pred);	// pred carries the previous iteration's reconstructed value
+		if(predAbsErr<checkRadius)
+		{
+			state = ((int)(predAbsErr*recip_precision+1))>>1; // number of quantization steps between pred and curData
+			if(curData>=pred)
+			{
+				type[i] = intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = intvRadius-state;
+				pred = pred - state*interval;
+			}
+				
+			//double-check the prediction error in case of machine-epsilon impact	
+			if(fabs(curData-pred)>realPrecision)
+			{	
+				type[i] = 0;				// quantized value missed the bound: store this element exactly instead
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);		
+				
+				//listAdd_float(last3CmprsData, vce->data);	
+				pred = vce->data;
+#ifdef HAVE_TIMECMPR					
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					decData[i] = vce->data;
+#endif					
+			}
+			else
+			{
+				//listAdd_float(last3CmprsData, pred);
+#ifdef HAVE_TIMECMPR					
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					decData[i] = pred;			
+#endif	
+			}	
+			continue;
+		}
+		
+		//unpredictable data processing		
+		type[i] = 0;		
+		compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+
+		//listAdd_float(last3CmprsData, vce->data);
+		pred = vce->data;
+#ifdef HAVE_TIMECMPR
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[i] = vce->data;
+#endif	
+		
+	}//end of for
+		
+//	char* expSegmentsInBytes;
+//	int expSegmentsInBytes_size = convertESCToBytes(esc, &expSegmentsInBytes);
+	size_t exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageF* tdps;
+			
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+//sdi:Debug
+/*	int sum =0;
+	for(i=0;i<dataLength;i++)
+		if(type[i]==0) sum++;
+	printf("opt_quantizations=%d, exactDataNum=%zu, sum=%d\n",quantization_intervals, exactDataNum, sum);
+*/	
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //only the wrapper is freed; exactMidByteArray->array was passed to tdps (presumably released by free_TightDataPointStorageF(tdps) - confirm)
+	
+	return tdps;
+}
+
+void SZ_compress_args_float_StoreOriData(float* oriData, size_t dataLength, unsigned char** newByteData, size_t *outSize)
+{	/* Writes a header (version, flag byte, parameters, element count) followed by the uncompressed floats into *newByteData. */
+	const int bytesPerValue = sizeof(float);
+	size_t idx;
+	size_t totalByteLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + bytesPerValue*dataLength;
+	/* Nothing is allocated here: *newByteData is expected to already provide at least totalByteLength bytes. */
+	unsigned char* cursor = *newByteData;
+	unsigned char lengthBytes[8];
+
+	/* 3 bytes: version number */
+	for (idx = 0; idx < 3; idx++)
+		*cursor++ = versionNumber[idx];
+
+	/* 1 byte: 16 = 00010000, or 80 = 01010000 where 01000000 indicates SZ_SIZE_TYPE=8 */
+	*cursor++ = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	/* MetaDataByteLength bytes: parameters written by convertSZParamsToBytes */
+	convertSZParamsToBytes(confparams_cpr, cursor);
+	cursor += MetaDataByteLength;
+
+	/* SZ_SIZE_TYPE (4 or 8) bytes: element count */
+	sizeToBytes(lengthBytes,dataLength);
+	for (idx = 0; idx < exe_params->SZ_SIZE_TYPE; idx++)
+		*cursor++ = lengthBytes[idx];
+	/* payload: one bulk copy on big-endian hosts, per-value floatToBytes otherwise */
+	if(sysEndianType!=BIG_ENDIAN_SYSTEM)
+	{
+		for(idx=0;idx<dataLength;idx++,cursor+=bytesPerValue)
+			floatToBytes(cursor, oriData[idx]);
+	}
+	else
+		memcpy(cursor, oriData, dataLength*bytesPerValue);
+	*outSize = totalByteLength;
+}
+
+char SZ_compress_args_float_NoCkRngeNoGzip_1D(int cmprType, unsigned char** newByteData, float *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f)
+{		// Compresses a 1D float array into *newByteData / *outSize; returns 1 when the time-series compressor was used, 0 for snapshot compression.
+	char compressionType = 0;	
+	TightDataPointStorageF* tdps = NULL;	
+
+#ifdef HAVE_TIMECMPR
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+	{
+		int timestep = sz_tsc->currentStep;
+		if(cmprType == SZ_PERIO_TEMPORAL_COMPRESSION)
+		{
+			if(timestep % confparams_cpr->snapshotCmprStep != 0) // snapshot on every snapshotCmprStep-th step, time-series compression otherwise
+			{
+				tdps = SZ_compress_float_1D_MDQ_ts(oriData, dataLength, multisteps, realPrecision, valueRangeSize, medianValue_f);
+				compressionType = 1; //time-series based compression 
+			}
+			else
+			{	
+				tdps = SZ_compress_float_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, medianValue_f);
+				compressionType = 0; //snapshot-based compression
+				multisteps->lastSnapshotStep = timestep;
+			}
+		}
+		else if(cmprType == SZ_FORCE_SNAPSHOT_COMPRESSION)
+		{
+			tdps = SZ_compress_float_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, medianValue_f);
+			compressionType = 0; //snapshot-based compression
+			multisteps->lastSnapshotStep = timestep;			
+		}
+		else if(cmprType == SZ_FORCE_TEMPORAL_COMPRESSION)
+		{
+			tdps = SZ_compress_float_1D_MDQ_ts(oriData, dataLength, multisteps, realPrecision, valueRangeSize, medianValue_f);
+			compressionType = 1; //time-series based compression 			
+		}		// NOTE(review): any other cmprType leaves tdps NULL, and it is then passed to convertTDPStoFlatBytes_float below
+	}
+	else
+#endif
+		tdps = SZ_compress_float_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, medianValue_f);	// the only path when HAVE_TIMECMPR is undefined; otherwise the else-branch of the szMode test
+
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+	
+	if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(float)*dataLength) // result larger than header + raw floats: store the original data instead
+		SZ_compress_args_float_StoreOriData(oriData, dataLength, newByteData, outSize);
+	
+	free_TightDataPointStorageF(tdps);
+	return compressionType;
+}
+/* Prediction/quantization stage for an r1 x r2 float array: every value is predicted from already reconstructed neighbours and is either quantized (type code) or stored exactly; returns a newly built TightDataPointStorageF. */
+TightDataPointStorageF* SZ_compress_float_2D_MDQ(float *oriData, size_t r1, size_t r2, float realPrecision, float valueRangeSize, float medianValue_f)
+{
+#ifdef HAVE_TIMECMPR
+	float* decData = NULL;	//temporal mode only: receives a copy of every reconstructed value
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (float*)(multisteps->hist_data);
+#endif	
+	//the reciprocal lets the per-point code below multiply instead of divide by realPrecision
+	float recip_realPrecision = 1/realPrecision;
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_2D_opt(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2; //offset added to the signed bin index to form a type code
+	
+	size_t i,j; 
+	int reqLength;
+	float pred1D, pred2D;
+	float diff = 0.0;
+	float itvNum = 0;
+	float *P0, *P1;
+	//P1: reconstructed values of the previous row; P0: those of the row being processed (swapped after each row)
+	size_t dataLength = r1*r2;	
+	
+	P0 = (float*)malloc(r2*sizeof(float));
+	memset(P0, 0, r2*sizeof(float));
+	P1 = (float*)malloc(r2*sizeof(float));
+	memset(P1, 0, r2*sizeof(float));
+		
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);	
+	//type[n] is the quantization code of point n; 0 marks a point that is stored exactly instead
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;
+		
+	float* spaceFillingValue = oriData; //alias of the input, read in row-major order (index i*r2+j)
+	//the three dynamic arrays below are filled by addExactData() for every exactly stored point
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	type[0] = 0;
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	//reqLength (presumably a bit count - TODO confirm) is split into whole bytes and leftover bits
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+			
+	/* Process Row-0 data 0: it has no predecessor, so it is always stored exactly (type 0) */
+	type[0] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[0] = vce->data;
+#endif	
+	//NOTE(review): the "data 1" step below touches index 1 of oriData/type/P1 unconditionally, so it presumes r2 >= 2 - confirm with callers
+	float curData;
+
+	/* Process Row-0 data 1: predicted by the reconstructed value of data 0 */
+	pred1D = P1[0];
+	curData = spaceFillingValue[1];
+	diff = curData - pred1D;
+	//itvNum = |diff|/realPrecision + 1; the point is quantized only if this stays below quantization_intervals
+	itvNum =  fabs(diff)*recip_realPrecision + 1;
+
+	if (itvNum < quantization_intervals)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - intvRadius) * realPrecision;	
+
+		//guarantee the compression error bound against the case of machine-epsilon
+		if(fabs(spaceFillingValue[1]-P1[1])>realPrecision)
+		{	
+			type[1] = 0;			
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+			
+			P1[1] = vce->data;
+		}		
+	}
+	else
+	{
+		type[1] = 0;
+		compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = P1[1];
+#endif
+
+    /* Process Row-0 data 2 --> data r2-1: predicted by linear extrapolation of the two previous reconstructed values */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		curData = spaceFillingValue[j];
+		diff = curData - pred1D;
+
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - intvRadius) * realPrecision;
+		
+			//guarantee the compression error bound against the case of machine-epsilon
+			if(fabs(curData-P1[j])>realPrecision)
+			{	
+				type[j] = 0;				
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+				
+				P1[j] = vce->data;	
+			}
+		}
+		else
+		{
+			type[j] = 0;
+			compressSingleFloatValue(vce,curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[j] = P1[j];
+#endif		
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0: predicted by the first reconstructed value of the previous row (P1[0]) */
+		index = i*r2;
+		pred1D = P1[0];
+		curData = spaceFillingValue[index];
+		diff = curData - pred1D;
+
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - intvRadius) * realPrecision;
+
+			//guarantee the compression error bound against the case of machine-epsilon
+			if(fabs(curData-P0[0])>realPrecision)
+			{	
+				type[index] = 0;				
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+				
+				P0[0] = vce->data;	
+			}
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P0[0];
+#endif
+									
+		/* Process row-i data 1 --> r2-1: predicted from the left (P0[j-1]), upper (P1[j]) and upper-left (P1[j-1]) reconstructed values */
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			curData = spaceFillingValue[index];
+			diff = curData - pred2D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+			
+				//guarantee the compression error bound against the case of machine-epsilon
+				if(fabs(curData-P0[j])>realPrecision)
+				{	
+					type[index] = 0;					
+					compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+					
+					P0[j] = vce->data;	
+				}			
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[j];
+#endif			
+		}
+		//swap the buffers: the row just finished becomes the previous row of the next iteration
+		float *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	//NOTE(review): P0 is malloc'ed unconditionally above, so with r2==1 this guard skips its free - confirm intent
+	if(r2!=1)
+		free(P0);
+	free(P1);			
+	size_t exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageF* tdps;
+	//hand the type codes and the exact-value arrays over to the storage object that is returned
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+//	printf("exactDataNum=%d, expSegmentsInBytes_size=%d, exactMidByteArray->size=%d\n", 
+//			exactDataNum, expSegmentsInBytes_size, exactMidByteArray->size);
+	
+//	for(i = 3800;i<3844;i++)
+//		printf("exactLeadNumArray->array[%d]=%d\n",i,exactLeadNumArray->array[i]);
+	
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;	
+}
+
+/**
+ * Compresses the r1 x r2 float array oriData into *newByteData (*outSize bytes).
+ * Returns the compression kind used: 1 = time-series based, 0 = snapshot based.
+ * Note: @r1 is high dimension, @r2 is low dimension; cmprType only matters in HAVE_TIMECMPR builds.
+ * */
+char SZ_compress_args_float_NoCkRngeNoGzip_2D(int cmprType, unsigned char** newByteData, float *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f)
+{
+	size_t dataLength = r1*r2;
+	char compressionType = 0; //0: snapshot-based, 1: time-series based
+	TightDataPointStorageF* tdps = NULL;
+
+#ifdef HAVE_TIMECMPR
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+	{
+		int timestep = sz_tsc->currentStep;
+		if(cmprType == SZ_PERIO_TEMPORAL_COMPRESSION)
+		{
+			if(timestep % confparams_cpr->snapshotCmprStep != 0)
+			{
+				tdps = SZ_compress_float_1D_MDQ_ts(oriData, dataLength, multisteps, realPrecision, valueRangeSize, medianValue_f);
+				compressionType = 1; //time-series based compression 
+			}
+			else
+			{
+				tdps = SZ_compress_float_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, medianValue_f);
+				compressionType = 0; //snapshot-based compression
+				multisteps->lastSnapshotStep = timestep;
+			}
+		}
+		else if(cmprType == SZ_FORCE_SNAPSHOT_COMPRESSION)
+		{
+			tdps = SZ_compress_float_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, medianValue_f);
+			compressionType = 0; //snapshot-based compression
+			multisteps->lastSnapshotStep = timestep;
+		}
+		else if(cmprType == SZ_FORCE_TEMPORAL_COMPRESSION)
+		{
+			tdps = SZ_compress_float_1D_MDQ_ts(oriData, dataLength, multisteps, realPrecision, valueRangeSize, medianValue_f);
+			compressionType = 1; //time-series based compression
+		}
+	}
+	else
+#endif
+		tdps = SZ_compress_float_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, medianValue_f);
+
+	//tdps stays NULL when cmprType matches none of the temporal modes: leave the outputs untouched (same guard as the 3D variant)
+	if(tdps == NULL)
+		return compressionType;
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+	if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(float)*dataLength)
+		SZ_compress_args_float_StoreOriData(oriData, dataLength, newByteData, outSize); //raw floats are smaller
+	free_TightDataPointStorageF(tdps);
+	return compressionType;
+}
+/* Prediction/quantization stage for an r1 x r2 x r3 float array: every value is predicted from already reconstructed neighbours and is either quantized (type code) or stored exactly; returns a newly built TightDataPointStorageF. */
+TightDataPointStorageF* SZ_compress_float_3D_MDQ(float *oriData, size_t r1, size_t r2, size_t r3, float realPrecision, float valueRangeSize, float medianValue_f)
+{
+#ifdef HAVE_TIMECMPR	
+	float* decData = NULL; //temporal mode only: receives a copy of every reconstructed value
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (float*)(multisteps->hist_data);
+#endif		
+	//the reciprocal lets the per-point code below multiply instead of divide by realPrecision
+	float recip_realPrecision = 1/realPrecision;
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_3D_opt(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;	//offset added to the signed bin index to form a type code
+		
+	size_t i,j,k; 
+	int reqLength;
+	float pred1D, pred2D, pred3D;
+	float diff = 0.0;
+	float itvNum = 0;
+	float *P0, *P1;
+	//P1: reconstructed values of the previous layer (an r2*r3 plane); P0: those of the layer being processed (swapped after each layer)
+	size_t dataLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	P0 = (float*)malloc(r23*sizeof(float));
+	P1 = (float*)malloc(r23*sizeof(float));
+
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);	
+	//type[n] is the quantization code of point n; 0 marks a point that is stored exactly instead
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	float* spaceFillingValue = oriData; //alias of the input, read with index k*r2*r3 + i*r3 + j
+	//the three dynamic arrays below are filled by addExactData() for every exactly stored point
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	type[0] = 0;
+
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+	//reqLength (presumably a bit count - TODO confirm) is split into whole bytes and leftover bits
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+	//layer 0 is reconstructed directly into P1, so it already is the "previous layer" when the layer loop starts
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0: it has no predecessor, so it is always stored exactly (type 0) */
+	type[0] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[0] = P1[0];
+#endif
+	//NOTE(review): the "data 1" step below touches index 1 of oriData/type/P1 unconditionally, so it presumes r3 >= 2 - confirm with callers
+	float curData;
+
+	/* Process Row-0 data 1: predicted by the reconstructed value of data 0 */
+	pred1D = P1[0];
+	curData = spaceFillingValue[1];
+	diff = curData - pred1D;
+	//itvNum = |diff|/realPrecision + 1; the point is quantized only if this stays below quantization_intervals
+	itvNum = fabs(diff)*recip_realPrecision + 1;
+
+	if (itvNum < quantization_intervals)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - intvRadius) * realPrecision;
+		
+		//guarantee the compression error bound against the case of machine-epsilon
+		if(fabs(curData-P1[1])>realPrecision)
+		{	
+			type[1] = 0;			
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+			
+			P1[1] = vce->data;	
+		}				
+	}
+	else
+	{
+		type[1] = 0;
+		compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = P1[1];
+#endif
+
+    /* Process Row-0 data 2 --> data r3-1: predicted by linear extrapolation of the two previous reconstructed values */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		curData = spaceFillingValue[j];
+		diff = curData - pred1D;
+
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - intvRadius) * realPrecision;
+			
+			//guarantee the compression error bound against the case of machine-epsilon
+			if(fabs(curData-P1[j])>realPrecision)
+			{	
+				type[j] = 0;				
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+				
+				P1[j] = vce->data;	
+			}			
+		}
+		else
+		{
+			type[j] = 0;
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[j] = P1[j];
+#endif		
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0: predicted by the reconstructed value one row up (P1[index-r3]) */
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		curData = spaceFillingValue[index];
+		diff = curData - pred1D;
+
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - intvRadius) * realPrecision;
+			
+			//guarantee the compression error bound against the case of machine-epsilon
+			if(fabs(curData-P1[index])>realPrecision)
+			{	
+				type[index] = 0;				
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+				
+				P1[index] = vce->data;	
+			}			
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P1[index];
+#endif		
+
+		/* Process row-i data 1 --> data r3-1: predicted from the left, upper and upper-left reconstructed values of this layer */
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			curData = spaceFillingValue[index];
+			diff = curData - pred2D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+				
+				//guarantee the compression error bound against the case of machine-epsilon
+				if(fabs(curData-P1[index])>realPrecision)
+				{	
+					type[index] = 0;					
+					compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+					
+					P1[index] = vce->data;	
+				}				
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P1[index];
+#endif			
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0: predicted by the value at the same position of the previous layer (P1[0]) */
+		index = k*r23;
+		pred1D = P1[0];
+		curData = spaceFillingValue[index];
+		diff = curData - pred1D;
+
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - intvRadius) * realPrecision;
+			
+			//guarantee the compression error bound against the case of machine-epsilon
+			if(fabs(curData-P0[0])>realPrecision)
+			{	
+				type[index] = 0;				
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+				
+				P0[0] = vce->data;	
+			}			
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P0[0];
+#endif
+
+	    /* Process Row-0 data 1 --> data r3-1: predicted from P0[j-1] (this layer) and P1[j], P1[j-1] (previous layer) */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			curData = spaceFillingValue[index];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+				//guarantee the compression error bound against the case of machine-epsilon
+				if(fabs(curData-P0[j])>realPrecision)
+				{	
+					type[index] = 0;					
+					compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+					
+					P0[j] = vce->data;	
+				}
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[j];
+#endif			
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0: predicted from the row above in this layer and from the previous layer */
+			index = k*r23 + i*r3;
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			curData = spaceFillingValue[index];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+				//guarantee the compression error bound against the case of machine-epsilon
+				if(fabs(curData-P0[index2D])>realPrecision)
+				{	
+					type[index] = 0;					
+					compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+					
+					P0[index2D] = vce->data;	
+				}				
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[index2D];
+#endif			
+
+			/* Process Row-i data 1 --> data r3-1: predicted from seven reconstructed neighbours (three in P0, four in P1) */
+			for (j = 1; j < r3; j++)
+			{
+//				if(k==63&&i==43&&j==27)
+//					printf("i=%d\n", i);
+				//index = k*r2*r3 + i*r3 + j;			
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				curData = spaceFillingValue[index];
+				diff = curData - pred3D;
+
+				itvNum = fabs(diff)*recip_realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
+					
+					//guarantee the compression error bound against the case of machine-epsilon
+					if(fabs(curData-P0[index2D])>realPrecision)
+					{	
+						type[index] = 0;						
+						compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+						updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+						memcpy(preDataBytes,vce->curBytes,4);
+						addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+						
+						P0[index2D] = vce->data;	
+					}					
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+#ifdef HAVE_TIMECMPR	
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					decData[index] = P0[index2D];
+#endif				
+			}
+		}
+		//swap the buffers: the layer just finished becomes the previous layer of the next iteration
+		float *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1) //NOTE(review): P0 is malloc'ed unconditionally above, so r23==1 skips its free - confirm intent
+		free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageF* tdps;
+	//hand the type codes and the exact-value arrays over to the storage object that is returned
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+//sdi:Debug
+/*	int sum =0;
+	for(i=0;i<dataLength;i++)
+		if(type[i]==0) sum++;
+	printf("opt_quantizations=%d, exactDataNum=%d, sum=%d\n",quantization_intervals, exactDataNum, sum);*/
+
+
+//	printf("exactDataNum=%d, expSegmentsInBytes_size=%d, exactMidByteArray->size=%d\n",
+//			exactDataNum, expSegmentsInBytes_size, exactMidByteArray->size);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;	
+}
+
+/**
+ * Compresses the r1 x r2 x r3 float array oriData and returns the kind of compression applied
+ * (1: time-series based, 0: snapshot based).
+ * @cmprType compressionType (SZ_FORCE_SNAPSHOT_COMPRESSION, SZ_FORCE_TEMPORAL_COMPRESSION or SZ_PERIO_TEMPORAL_COMPRESSION)
+ * */
+char SZ_compress_args_float_NoCkRngeNoGzip_3D(int cmprType, unsigned char** newByteData, float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f)
+{
+	TightDataPointStorageF* storage = NULL;
+	char usedType = 0; //0: snapshot-based, 1: time-series based
+	size_t nbEle = r1*r2*r3;
+
+#ifdef HAVE_TIMECMPR
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+	{
+		int curStep = sz_tsc->currentStep;
+		int takeSnapshot = 0, takeTemporal = 0;
+
+		//first decide which of the two compressors the requested mode maps to
+		if(cmprType == SZ_PERIO_TEMPORAL_COMPRESSION)
+		{
+			if(curStep % confparams_cpr->snapshotCmprStep != 0)
+				takeTemporal = 1;
+			else
+				takeSnapshot = 1;
+		}
+		else if(cmprType == SZ_FORCE_SNAPSHOT_COMPRESSION)
+			takeSnapshot = 1;
+		else if(cmprType == SZ_FORCE_TEMPORAL_COMPRESSION)
+			takeTemporal = 1;
+
+		if(takeTemporal)
+		{
+			storage = SZ_compress_float_1D_MDQ_ts(oriData, nbEle, multisteps, realPrecision, valueRangeSize, medianValue_f);
+			usedType = 1; //time-series based compression
+		}
+		else if(takeSnapshot)
+		{
+			//the regression path returns the byte stream directly, so storage stays NULL there
+			if(confparams_cpr->withRegression != SZ_NO_REGRESSION)
+				*newByteData = SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(oriData, r1, r2, r3, realPrecision, outSize);
+			else
+				storage = SZ_compress_float_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_f);
+			usedType = 0; //snapshot-based compression
+			multisteps->lastSnapshotStep = curStep;
+		}
+	}
+	else
+#endif
+		storage = SZ_compress_float_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_f);
+
+	if(storage == NULL)
+		return usedType;
+
+	convertTDPStoFlatBytes_float(storage, newByteData, outSize);
+	//store the raw floats instead when the flattened stream exceeds header + raw data size
+	if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(float)*nbEle)
+		SZ_compress_args_float_StoreOriData(oriData, nbEle, newByteData, outSize);
+	free_TightDataPointStorageF(storage);
+
+	return usedType;
+}
+
+TightDataPointStorageF* SZ_compress_float_4D_MDQ(float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, float valueRangeSize, float medianValue_f)
+{	/*
+	 * Prediction + linear-quantization pass over a 4D float array. Elements are
+	 * addressed as oriData[l*r2*r3*r4 + k*r3*r4 + i*r4 + j], so r4 is the
+	 * fastest-varying dimension. Each of the r1 outermost entries is handled as
+	 * its own r2 x r3 x r4 volume: slice k=0 is predicted only from values already
+	 * reconstructed in the same slice, later slices also use the previous slice.
+	 *
+	 * For every element the prediction error is mapped to a bin of width
+	 * 2*realPrecision. If the bin number is below quantization_intervals the
+	 * element gets a code in type[]; otherwise type[] is set to 0 and the value is
+	 * encoded through compressSingleFloatValue()/addExactData().
+	 *
+	 * oriData         input values, r1*r2*r3*r4 floats (only read in this function)
+	 * r1..r4          dimension sizes, r1 slowest, r4 fastest
+	 * realPrecision   precision used as the quantization step for every element
+	 * valueRangeSize  only used to derive the exponent handed to computeReqLength_float()
+	 * medianValue_f   starting median; computeReqLength_float() gets a pointer to the local copy
+	 *
+	 * Returns the TightDataPointStorageF filled in by new_TightDataPointStorageF().
+	 *
+	 * NOTE(review): no malloc() result is checked, and "Row-0 data 1" below is
+	 * processed unconditionally, so the code appears to assume r4 >= 2 - confirm
+	 * that callers guarantee this.
+	 */
+	float recip_realPrecision = 1/realPrecision; /* kept as float; multiplied instead of dividing in most loops below */
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2; /* offset added to the signed bin number before it is stored in type[] */
+
+	size_t i,j,k; 
+	int reqLength;
+	float pred1D, pred2D, pred3D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1; /* two scratch slices of reconstructed values (r3*r4 floats each) */
+
+	size_t dataLength = r1*r2*r3*r4;
+
+	size_t r234 = r2*r3*r4; /* element stride between consecutive l entries */
+	size_t r34 = r3*r4; /* element stride between consecutive k slices */
+
+	P0 = (float*)malloc(r34*sizeof(float));
+	P1 = (float*)malloc(r34*sizeof(float));
+
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); /* one code per element; 0 marks the exact-data path */
+
+	float* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[4]; /* bytes of the most recently stored exact value, initialised from 0 */
+	intToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8; /* whole bytes of reqLength */
+	int resiBitsLength = reqLength%8; /* leftover bits of reqLength */
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+		/* Every l entry restarts the prediction: its first element is always stored through the exact path. */
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: no neighbour available, store via the exact path */
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[index2D] = vce->data; /* keep the value produced by compressSingleFloatValue() for later predictions */
+
+		/* Process Row-0 data 1: predicted by the single preceding reconstructed value */
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1; /* NOTE(review): only this site divides by realPrecision; the others multiply by the float reciprocal */
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + intvRadius; /* signed bin number (truncated toward zero) shifted by intvRadius */
+			P1[index2D] = pred1D + 2 * (type[index] - intvRadius) * realPrecision; /* reconstructed value: prediction plus bin number times 2*realPrecision */
+		}
+		else
+		{
+			/* prediction error does not fit the available intervals: exact path */
+			type[index] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+
+		/* Process Row-0 data 2 --> data r4-1: linear extrapolation from the two preceding reconstructed values */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0: predicted by the element directly above (previous row, same column) */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+
+			/* Process row-i data 1 --> data r4-1: prediction = left + upper - upper-left neighbour */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = fabs(diff)*recip_realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + intvRadius;
+					P1[index2D] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P1[index2D] = vce->data;
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+		/* From here on P1 holds the previous slice and P0 receives the slice being processed. */
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0: predicted by the same position in the previous slice */
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + intvRadius;
+				P0[index2D] = pred1D + 2 * (type[index] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1: left (current slice) + same position (previous slice) - left (previous slice) */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = fabs(diff)*recip_realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0: upper (current slice) + same position (previous slice) - upper (previous slice) */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = fabs(diff)*recip_realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+
+				/* Process Row-i data 1 --> data r4-1: predictor built from all seven already-reconstructed neighbours in the two slices */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					diff = spaceFillingValue[index] - pred3D;
+
+
+					itvNum = fabs(diff)*recip_realPrecision + 1;
+
+					if (itvNum < quantization_intervals)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + intvRadius;
+						P0[index2D] = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
+					}
+					else
+					{
+						type[index] = 0;
+						compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+						updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+						memcpy(preDataBytes,vce->curBytes,4);
+						addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+						P0[index2D] = vce->data;
+					}
+				}
+			}
+			/* Swap the scratch slices: the slice just finished becomes the "previous slice" for the next k. */
+			float *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size; /* number of elements that went through the exact path */
+
+	TightDataPointStorageF* tdps;
+
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+
+	return tdps;
+}
+
+char SZ_compress_args_float_NoCkRngeNoGzip_4D(unsigned char** newByteData, float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f)
+{	/*
+	 * Compresses a 4D float array: runs SZ_compress_float_4D_MDQ() and flattens
+	 * the result into *newByteData / *outSize with convertTDPStoFlatBytes_float().
+	 * If the flattened size exceeds
+	 *   3 + MetaDataByteLength + SZ_SIZE_TYPE + 1 + sizeof(float)*dataLength
+	 * SZ_compress_args_float_StoreOriData() is called on the same output
+	 * (presumably switching to raw storage of oriData - confirm in that helper).
+	 *
+	 * newByteData     out: receives the output byte buffer
+	 * oriData         input values, r1*r2*r3*r4 floats
+	 * r1..r4          dimension sizes
+	 * realPrecision, valueRangeSize, medianValue_f
+	 *                 forwarded unchanged to SZ_compress_float_4D_MDQ()
+	 * outSize         out: receives the size of *newByteData
+	 *
+	 * Always returns 0.
+	 */
+	TightDataPointStorageF* tdps = SZ_compress_float_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, medianValue_f);
+
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+
+	/* size_t, not int: the product of four size_t extents can exceed INT_MAX, and a
+	 * truncated/negative int would turn into a bogus huge value once widened in the
+	 * size comparison and in the element count passed on below. */
+	size_t dataLength = r1*r2*r3*r4;
+	if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(float)*dataLength)
+		SZ_compress_args_float_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+	free_TightDataPointStorageF(tdps);
+	
+	return 0;
+}
+
+/*MSST19*/
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_MSST19(float *oriData, 
+size_t dataLength, double realPrecision, float valueRangeSize, float medianValue_f)
+{	/*
+	 * Ratio-based prediction/quantization pass over a 1D float array.
+	 *
+	 * precisionTable[s] = (1+realPrecision)^(inv*(s-intvRadius)) with
+	 * inv = 2 - 2^(-plus_bits). For every element from index 2 on, the ratio
+	 * curData/pred is looked up in the multi-level table built from
+	 * precisionTable; a non-zero result is stored as the element's code in
+	 * type[] and the running prediction is multiplied by precisionTable[code].
+	 * A zero result sends the element through
+	 * compressSingleFloatValue_MSST19()/addExactData() and restarts the
+	 * prediction from the value that helper produced.
+	 *
+	 * oriData         input values (only read here)
+	 * dataLength      number of elements; indices 0 and 1 are accessed
+	 *                 unconditionally, so at least 2 elements are required
+	 * realPrecision   precision that defines precisionTable and reqLength
+	 * valueRangeSize  not used by this function
+	 * medianValue_f   passed through unchanged to new_TightDataPointStorageF()
+	 *
+	 * Returns the TightDataPointStorageF filled in by new_TightDataPointStorageF(),
+	 * with plus_bits copied from confparams_cpr.
+	 *
+	 * With HAVE_TIMECMPR and szMode == SZ_TEMPORAL_COMPRESSION every reconstructed
+	 * value is also written to multisteps->hist_data.
+	 *
+	 * NOTE(review): malloc() results are not checked.
+	 */
+#ifdef HAVE_TIMECMPR	
+	float* decData = NULL;
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (float*)(multisteps->hist_data);
+#endif	
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_float_1D_opt_MSST19(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	/* NOTE(review): unlike the 2D variant, updateQuantizationInfo() is not called here
+	 * (the call was already commented out) - confirm this is intentional. */
+	//updateQuantizationInfo(quantization_intervals);
+	int intvRadius = quantization_intervals/2;
+	
+	/* Multiplicative factor associated with each quantization code. */
+	double* precisionTable = (double*)malloc(sizeof(double) * quantization_intervals);
+	double inv = 2.0-pow(2, -(confparams_cpr->plus_bits));
+    for(int i=0; i<quantization_intervals; i++){
+        precisionTable[i] = pow((1+realPrecision), inv*(i - intvRadius));
+    }
+	struct TopLevelTableWideInterval levelTable;
+    MultiLevelCacheTableWideIntervalBuild(&levelTable, precisionTable, quantization_intervals, realPrecision, confparams_cpr->plus_bits);
+
+	size_t i;
+	int reqLength;
+	float medianValue = medianValue_f;
+	
+	reqLength = computeReqLength_float_MSST19(realPrecision);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); /* one code per element; 0 marks the exact-data path */
+		
+	float* spaceFillingValue = oriData; //
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, dataLength/2/8);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, dataLength/2);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	unsigned char preDataBytes[4]; /* bytes of the most recently stored exact value, initialised from 0 */
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8; /* whole bytes of reqLength */
+	int resiBitsLength = reqLength%8; /* leftover bits of reqLength */
+	float last3CmprsData[3] = {0};
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+				
+	//add the first data	
+	type[0] = 0;
+	compressSingleFloatValue_MSST19(vce, spaceFillingValue[0], realPrecision, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_float(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[0] = vce->data;
+#endif		
+		
+	//add the second data
+	type[1] = 0;
+	compressSingleFloatValue_MSST19(vce, spaceFillingValue[1], realPrecision, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_float(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = vce->data;
+#endif
+	int state;
+	float curData;
+	float pred = vce->data; /* running prediction, starts at the second stored value */
+
+    double predRelErrRatio;
+
+	const uint64_t top = levelTable.topIndex, base = levelTable.baseIndex;
+	const uint64_t range = top - base;
+	const int bits = levelTable.bits;
+	/* The ratio is inspected through its raw 64-bit pattern (assumes IEEE-754 binary64 doubles). */
+	uint64_t* const buffer = (uint64_t*)&predRelErrRatio;
+	const int shift = 52-bits; /* keeps the top `bits` bits of the 52-bit mantissa */
+	uint64_t expoIndex, mantiIndex;
+	/* Local copy of the sub-table pointers so the hot loop indexes a flat array. */
+	uint16_t* tables[range+1];
+	for(uint64_t t=0; t<=range; t++){
+		tables[t] = levelTable.subTables[t].table;
+	}
+
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		predRelErrRatio = curData / pred;
+
+		/* Biased exponent (sign bit masked off) relative to the table's base index.
+		 * The subtraction is unsigned, so an exponent below base wraps to a large
+		 * value and fails the range check just like one above top. */
+		expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+		if(expoIndex <= range){
+			mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+			state = tables[expoIndex][mantiIndex];
+		}else{
+			state = 0;
+		}
+
+		if(state)
+		{
+			type[i] = state;
+			pred *= precisionTable[state];
+#ifdef HAVE_TIMECMPR
+			/* Record the reconstructed value for predicted elements as well; previously
+			 * only the exact path below updated the history buffer, leaving stale
+			 * entries for every predicted element. */
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[i] = pred;
+#endif
+			continue;
+		}
+
+		//unpredictable data processing
+		type[i] = 0;
+		compressSingleFloatValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		pred =  vce->data;
+#ifdef HAVE_TIMECMPR
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[i] = vce->data;
+#endif	
+		
+	}//end of for
+
+	size_t exactDataNum = exactLeadNumArray->size; /* number of elements that went through the exact path */
+	
+	TightDataPointStorageF* tdps;
+			
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+    tdps->plus_bits = confparams_cpr->plus_bits;
+	
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	free(precisionTable);
+	freeTopLevelTableWideInterval(&levelTable);
+	return tdps;
+}
+
+TightDataPointStorageF* SZ_compress_float_2D_MDQ_MSST19(float *oriData, size_t r1, size_t r2, double realPrecision, float valueRangeSize, float medianValue_f)
+{	/*
+	 * Ratio-based prediction/quantization pass over a 2D float array of r1 rows
+	 * by r2 columns (element (i,j) is oriData[i*r2+j]).
+	 *
+	 * precisionTable[s] = (1+realPrecision)^(inv*(s-intvRadius)) with
+	 * inv = 2 - 2^(-plus_bits). For each element the ratio curData/prediction is
+	 * looked up in the multi-level table built from precisionTable. A non-zero
+	 * result becomes the element's code in type[] and the reconstructed value is
+	 * fabs(prediction)*precisionTable[code]; a zero result sends the element
+	 * through compressSingleFloatValue_MSST19()/addExactData().
+	 * Predictions are multiplicative: previous^2/one-before-previous on row 0,
+	 * the element above for column 0, and left*upper/upper-left elsewhere.
+	 *
+	 * oriData         input values, r1*r2 floats (only read here)
+	 * r1, r2          number of rows / columns
+	 * realPrecision   precision that defines precisionTable and reqLength
+	 * valueRangeSize  not used by this function
+	 * medianValue_f   passed through unchanged to new_TightDataPointStorageF()
+	 *
+	 * Returns the TightDataPointStorageF filled in by new_TightDataPointStorageF(),
+	 * with plus_bits copied from confparams_cpr.
+	 *
+	 * With HAVE_TIMECMPR and szMode == SZ_TEMPORAL_COMPRESSION every reconstructed
+	 * value is also written to multisteps->hist_data.
+	 *
+	 * NOTE(review): malloc() results are not checked, and "Row-0 data 1" is
+	 * processed unconditionally (P1[1], type[1]), so the code appears to assume
+	 * r2 >= 2 - confirm that callers guarantee this.
+	 */
+#ifdef HAVE_TIMECMPR
+	float* decData = NULL;	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (float*)(multisteps->hist_data);
+#endif	
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_2D_opt_MSST19(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;
+
+	double* precisionTable = (double*)malloc(sizeof(double) * quantization_intervals); /* multiplicative factor for each quantization code */
+	double inv = 2.0-pow(2, -(confparams_cpr->plus_bits));
+	for(int i=0; i<quantization_intervals; i++){
+		double test = pow((1+realPrecision), inv*(i - intvRadius));
+		precisionTable[i] = test;
+	}
+	//double smallest_precision = precisionTable[0], largest_precision = precisionTable[quantization_intervals-1];
+	struct TopLevelTableWideInterval levelTable;
+	MultiLevelCacheTableWideIntervalBuild(&levelTable, precisionTable, quantization_intervals, realPrecision, confparams_cpr->plus_bits);
+
+	size_t i,j; 
+	int reqLength;
+	float pred1D, pred2D;
+	//float diff = 0.0;
+	//double itvNum = 0;
+	float *P0, *P1; /* reconstructed rows: P1 = previous row, P0 = row being built (row 0 itself is built in P1) */
+	double predRelErrRatio;
+		
+	size_t dataLength = r1*r2;	
+	
+	P0 = (float*)malloc(r2*sizeof(float));
+	memset(P0, 0, r2*sizeof(float));
+	P1 = (float*)malloc(r2*sizeof(float));
+	memset(P1, 0, r2*sizeof(float));
+		
+	float medianValue = medianValue_f;
+	//float medianValueInverse = 1 / medianValue_f;
+	//short radExpo = getExponent_float(valueRangeSize/2);
+	reqLength = computeReqLength_double_MSST19(realPrecision);	/* NOTE(review): the 1D float variant in this file calls computeReqLength_float_MSST19(); confirm the double helper is intended here */
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); /* one code per element; 0 marks the exact-data path */
+	//type[dataLength]=0;
+		
+	float* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	type[0] = 0;
+	unsigned char preDataBytes[4]; /* bytes of the most recently stored exact value, initialised from 0 */
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8; /* whole bytes of reqLength */
+	int resiBitsLength = reqLength%8; /* leftover bits of reqLength */
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+    const uint64_t top = levelTable.topIndex, base = levelTable.baseIndex;
+    const uint64_t range = top - base;
+    const int bits = levelTable.bits;
+    uint64_t* const buffer = (uint64_t*)&predRelErrRatio; /* raw 64-bit view of the ratio (assumes IEEE-754 binary64 doubles) */
+    const int shift = 52-bits; /* keeps the top `bits` bits of the 52-bit mantissa */
+    uint64_t expoIndex, mantiIndex;
+    uint16_t* tables[range+1]; /* local copy of the sub-table pointers */
+    for(int i=0; i<=range; i++){ /* this int i shadows the size_t i declared above */
+        tables[i] = levelTable.subTables[i].table;
+    }
+			
+	/* Process Row-0 data 0: no neighbour available, store via the exact path */
+	type[0] = 0;
+	compressSingleFloatValue_MSST19(vce, spaceFillingValue[0], realPrecision, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[0] = vce->data;
+#endif	
+
+	float curData;
+	int state;
+
+	/* Process Row-0 data 1: predicted by the single preceding reconstructed value */
+	pred1D = P1[0];
+
+	curData = spaceFillingValue[1];
+	predRelErrRatio = curData / pred1D;
+	/* Biased exponent (sign bit masked off) relative to base; the subtraction is unsigned, so exponents below base wrap and fail the range check too. */
+	expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+	if(expoIndex <= range){
+		mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+		state = tables[expoIndex][mantiIndex];
+	}else{
+		state = 0;
+	}
+
+	if (state)
+	{
+		type[1] = state;
+		P1[1] = fabs(pred1D) * precisionTable[state]; /* reconstructed value uses the absolute value of the prediction */
+	}
+	else
+	{
+		/* table lookup gave no usable code: exact path */
+		type[1] = 0;
+		compressSingleFloatValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = P1[1];
+#endif
+
+    /* Process Row-0 data 2 --> data r2-1: prediction = previous^2 / one-before-previous */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = P1[j-1] * P1[j-1] / P1[j-2];
+		curData = spaceFillingValue[j];
+		predRelErrRatio = curData / pred1D;
+
+		expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+		if(expoIndex <= range){
+			mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+			state = tables[expoIndex][mantiIndex];
+		}else{
+			state = 0;
+		}
+
+		if (state)
+		{
+			type[j] = state;
+			P1[j] = fabs(pred1D) * precisionTable[state];
+		}
+		else
+		{
+			type[j] = 0;
+			compressSingleFloatValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[j] = P1[j];
+#endif		
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0: predicted by the element directly above (previous row, column 0) */
+		index = i*r2;
+		pred1D = P1[0];
+		curData = spaceFillingValue[index];
+		predRelErrRatio = curData / pred1D;
+
+		expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+		if(expoIndex <= range){
+			mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+			state = tables[expoIndex][mantiIndex];
+		}else{
+			state = 0;
+		}
+
+		if (state)
+		{
+			type[index] = state;
+			P0[0] = fabs(pred1D) * precisionTable[state];
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleFloatValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P0[0];
+#endif
+									
+		/* Process row-i data 1 --> r2-1: prediction = left * upper / upper-left */
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] * P1[j] / P1[j-1];
+
+			curData = spaceFillingValue[index];
+			predRelErrRatio = curData / pred2D;
+
+			expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+			if(expoIndex <= range){
+				mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+				state = tables[expoIndex][mantiIndex];
+			}else{
+				state = 0;
+			}
+
+			if (state)
+			{
+				type[index] = state;
+				P0[j] = fabs(pred2D) * precisionTable[state];
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[j];
+#endif			
+		}
+		/* Swap the row buffers: the row just finished becomes the "previous row" for the next i. */
+		float *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	
+	if(r2!=1) /* NOTE(review): P0 is not freed when r2 == 1 - confirm whether this is a deliberate workaround or a leak */
+		free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size; /* number of elements that went through the exact path */
+	
+	TightDataPointStorageF* tdps;
+			
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+	tdps->plus_bits = confparams_cpr->plus_bits;
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	free(precisionTable);
+	freeTopLevelTableWideInterval(&levelTable);	
+	return tdps;	
+}
+
+TightDataPointStorageF* SZ_compress_float_3D_MDQ_MSST19(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float valueRangeSize, float medianValue_f)
+{
+#ifdef HAVE_TIMECMPR	
+	float* decData = NULL;
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (float*)(multisteps->hist_data);
+#endif		
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_3D_opt_MSST19(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;
+
+    double* precisionTable = (double*)malloc(sizeof(double) * quantization_intervals);
+    double inv = 2.0-pow(2, -(confparams_cpr->plus_bits));
+    for(int i=0; i<quantization_intervals; i++){
+        double test = pow((1+realPrecision), inv*(i - intvRadius));
+        precisionTable[i] = test;
+    }
+    //double smallest_precision = precisionTable[0], largest_precision = precisionTable[quantization_intervals-1];
+    struct TopLevelTableWideInterval levelTable;
+    MultiLevelCacheTableWideIntervalBuild(&levelTable, precisionTable, quantization_intervals, realPrecision, confparams_cpr->plus_bits);
+
+    size_t i,j,k;
+	int reqLength;
+	float pred1D, pred2D, pred3D;
+	//float diff = 0.0;
+	//double itvNum = 0;
+	float *P0, *P1;
+    double predRelErrRatio;
+
+	size_t dataLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	P0 = (float*)malloc(r23*sizeof(float));
+	P1 = (float*)malloc(r23*sizeof(float));
+
+	float medianValue = medianValue_f;
+	//float medianValueInverse = 1/ medianValue_f;
+	//short radExpo = getExponent_float(valueRangeSize/2);
+	reqLength = computeReqLength_float_MSST19(realPrecision);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	float* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+    const uint64_t top = levelTable.topIndex, base = levelTable.baseIndex;
+    const uint64_t range = top - base;
+    const int bits = levelTable.bits;
+    uint64_t* const buffer = (uint64_t*)&predRelErrRatio;
+    const int shift = 52-bits;
+    uint64_t expoIndex, mantiIndex;
+    uint16_t* tables[range+1];
+    for(int i=0; i<=range; i++){
+        tables[i] = levelTable.subTables[i].table;
+    }
+    int state;
+
+    double temp, temp2;
+
+
+    //size_t miss=0, hit=0;
+
+    ///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+	type[0] = 0;
+	compressSingleFloatValue_MSST19(vce, spaceFillingValue[0], realPrecision, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+	//miss++;
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[0] = P1[0];
+#endif
+
+	float curData;
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	curData = spaceFillingValue[1];
+    predRelErrRatio = curData / pred1D;
+
+    expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+    if(expoIndex <= range){
+        mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+        state = tables[expoIndex][mantiIndex];
+    }else{
+        state = 0;
+    }
+
+	if (state)
+	{
+		type[1] = state;
+		P1[1] = fabsf(pred1D) * precisionTable[state];
+		//hit++;
+	}
+	else
+	{
+		type[1] = 0;
+		compressSingleFloatValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+		//miss++;
+	}
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = P1[1];
+#endif
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		temp = P1[j-1];
+		pred1D = temp * temp / P1[j-2];
+		curData = spaceFillingValue[j];
+        predRelErrRatio = curData / pred1D;
+
+        expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+        if(expoIndex <= range){
+            mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+            state = tables[expoIndex][mantiIndex];
+        }else{
+            state = 0;
+        }
+
+        if (state)
+		{
+			type[j] = state;
+			P1[j] = fabsf(pred1D) * precisionTable[state];
+			//hit++;
+		}
+		else
+		{
+			type[j] = 0;
+			compressSingleFloatValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+			//miss++;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[j] = P1[j];
+#endif		
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		curData = spaceFillingValue[index];
+        predRelErrRatio = curData / pred1D;
+
+        expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+        if(expoIndex <= range){
+            mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+            state = tables[expoIndex][mantiIndex];
+        }else{
+            state = 0;
+        }
+
+		if (state)
+		{
+			type[index] = state;
+			P1[index] = pred1D * precisionTable[state];
+			//hit++;
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleFloatValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index] = vce->data;
+			//miss++;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P1[index];
+#endif		
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			temp = P1[index-1];
+			pred2D = temp * P1[index-r3] / P1[index-r3-1];
+			//float a = P1[index-1];
+			//float b = P1[index-r3];
+			//float c = P1[index-r3-1];
+
+			curData = spaceFillingValue[index];
+            predRelErrRatio = curData / pred2D;
+
+            expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+            if(expoIndex <= range){
+                mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+                state = tables[expoIndex][mantiIndex];
+            }else{
+                state = 0;
+            }
+
+			if (state)
+			{
+				type[index] = state;
+				//float temp1 = precisionTable[state];
+				//float temp = fabsf(pred2D) * precisionTable[state];
+				P1[index] = fabsf(pred2D) * precisionTable[state];
+				//hit++;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index] = vce->data;
+				//miss++;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P1[index];
+#endif			
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		pred1D = P1[0];
+		curData = spaceFillingValue[index];
+        predRelErrRatio = curData / pred1D;
+
+        expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+        if(expoIndex <= range){
+            mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+            state = tables[expoIndex][mantiIndex];
+        }else{
+            state = 0;
+        }
+
+		if (state)
+		{
+			type[index] = state;
+			P0[0] = fabsf(pred1D) * precisionTable[state];
+			//hit++;
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleFloatValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+			//miss++;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P0[0];
+#endif
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			temp = P0[j-1];
+			pred2D = temp * P1[j] / P1[j-1];
+			curData = spaceFillingValue[index];
+            predRelErrRatio = curData / pred2D;
+
+            expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+            if(expoIndex <= range){
+                mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+                state = tables[expoIndex][mantiIndex];
+            }else{
+                state = 0;
+            }
+
+			if (state)
+			{
+				type[index] = state;
+				P0[j] = fabsf(pred2D) * precisionTable[state];
+				//hit++;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+				//miss++;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[j];
+#endif			
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			index2D = i*r3;
+			temp = P0[index2D-r3];
+			pred2D = temp * P1[index2D] / P1[index2D-r3];
+			curData = spaceFillingValue[index];
+            predRelErrRatio = curData / pred2D;
+
+            expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+            if(expoIndex <= range){
+                mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+                state = tables[expoIndex][mantiIndex];
+            }else{
+                state = 0;
+            }
+
+			if (state)
+			{
+				type[index] = state;
+				P0[index2D] = fabsf(pred2D) * precisionTable[state];
+				//hit++;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+				//miss++;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[index2D];
+#endif			
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+				index ++;
+				index2D = i*r3 + j;
+				//pred3D = P0[index2D-1] * P0[index2D-r3] * P1[index2D] / P0[index2D-r3-1] / P1[index2D-r3] / P1[index2D-1] * P1[index2D-r3-1];
+				temp = P0[index2D-1];
+				temp2 = P0[index2D-r3-1];
+                pred3D = temp * P0[index2D-r3] * P1[index2D] * P1[index2D-r3-1] / (temp2 * P1[index2D-r3] * P1[index2D-1]);
+
+				curData = spaceFillingValue[index];
+                predRelErrRatio = curData / pred3D;
+
+                expoIndex = ((*buffer & 0x7fffffffffffffff) >> 52) - base;
+                if(expoIndex <= range){
+                    mantiIndex = (*buffer & 0x000fffffffffffff) >> shift;
+                    state = tables[expoIndex][mantiIndex];
+                }else{
+                    state = 0;
+                }
+
+				if (state)
+				{
+					type[index] = state;
+					P0[index2D] = fabsf(pred3D) * precisionTable[state];
+					//hit++;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleFloatValue_MSST19(vce, curData, realPrecision, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+					//miss++;
+				}
+#ifdef HAVE_TIMECMPR	
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					decData[index] = P0[index2D];
+#endif				
+			}
+		}
+
+		float *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1)
+		free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageF* tdps;
+
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+	tdps->plus_bits = confparams_cpr->plus_bits;
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	free(precisionTable);
+	freeTopLevelTableWideInterval(&levelTable);	
+	return tdps;	
+}
+
+
+/*
+ * Encode a float series for which the caller decided that a single value is
+ * enough (SZ_compress_args_float() calls this when valueRangeSize <= realPrecision):
+ * only oriData[0] is stored and the storage is flagged with allSameData = 1.
+ *
+ * newByteData - out: buffer filled in by convertTDPStoFlatBytes_float()
+ * oriData     - input values; only oriData[0] is read
+ * dataLength  - recorded as dataSeriesLength
+ * outSize     - out: size reported by convertTDPStoFlatBytes_float()
+ *
+ * If either allocation fails, *newByteData is set to NULL and *outSize to 0.
+ */
+void SZ_compress_args_float_withinRange(unsigned char** newByteData, float *oriData, size_t dataLength, size_t *outSize)
+{
+	/* calloc: every field that is not assigned below starts out zeroed instead of indeterminate */
+	TightDataPointStorageF* tdps = (TightDataPointStorageF*) calloc(1, sizeof(TightDataPointStorageF));
+	unsigned char* valueBytes = (unsigned char*)malloc(sizeof(unsigned char)*4);
+	if(tdps == NULL || valueBytes == NULL)
+	{
+		/* free(NULL) is a no-op, so both can be released unconditionally */
+		free(valueBytes);
+		free(tdps);
+		*newByteData = NULL;
+		*outSize = 0;
+		return;
+	}
+
+	tdps->rtypeArray = NULL;
+	tdps->typeArray = NULL;
+	tdps->leadNumArray = NULL;
+	tdps->residualMidBits = NULL;
+
+	tdps->allSameData = 1;
+	tdps->dataSeriesLength = dataLength;
+	tdps->exactMidBytes = valueBytes;
+	tdps->pwrErrBoundBytes = NULL;
+	tdps->isLossless = 0;
+	float value = oriData[0];
+	floatToBytes(tdps->exactMidBytes, value);
+	tdps->exactMidBytes_size = 4;
+
+	size_t tmpOutSize;
+	convertTDPStoFlatBytes_float(tdps, newByteData, &tmpOutSize);
+
+	*outSize = tmpOutSize; //8+SZ_SIZE_TYPE; //8==3+1+4(float_size)
+	/* tdps now owns valueBytes (stored in exactMidBytes), so it is handed to the storage destructor */
+	free_TightDataPointStorageF(tdps);
+}
+
+/*
+int SZ_compress_args_float_wRngeNoGzip(unsigned char** newByteData, float *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwrErrRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	float valueRangeSize = 0, medianValue = 0;
+	
+	float min = computeRangeSize_float(oriData, dataLength, &valueRangeSize, &medianValue);
+	float max = min+valueRangeSize;
+	double realPrecision = getRealPrecision_float(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_float_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+//		SZ_compress_args_float_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize);
+		if(r5==0&&r4==0&&r3==0&&r2==0)
+		{
+			if(errBoundMode>=PW_REL)
+			{	
+				SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log(newByteData, oriData, pwrErrRatio, r1, outSize, min, max);
+				//SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(newByteData, oriData, r1, absErr_Bound, relBoundRatio, pwrErrRatio, valueRangeSize, medianValue, outSize);
+			}
+			else
+				SZ_compress_args_float_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, medianValue);
+		}
+		else if(r5==0&&r4==0&&r3==0)
+		{
+			if(errBoundMode>=PW_REL)
+				SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log(newByteData, oriData, pwrErrRatio, r2, r1, outSize, min, max);
+			else
+				SZ_compress_args_float_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
+		}
+		else if(r5==0&&r4==0)
+		{
+			if(errBoundMode>=PW_REL)
+				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(newByteData, oriData, pwrErrRatio, r3, r2, r1, outSize, min, max);
+			else
+				SZ_compress_args_float_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
+		}
+		else if(r5==0)
+		{
+			if(errBoundMode>=PW_REL)
+				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(newByteData, oriData, pwrErrRatio, r4*r3, r2, r1, outSize, min, max);
+			else
+				SZ_compress_args_float_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
+		}
+	}
+	return status;
+}
+*/
+
+/*
+ * Top-level float compression entry point: selects a 1D..4D compressor from the
+ * dimension sizes and then, depending on confparams_cpr->szMode, either returns
+ * that output directly or passes it through sz_lossless_compress().
+ *
+ * cmprType, withRegression - forwarded to / used to choose between the compressors
+ * newByteData, outSize     - out: resulting buffer and its size
+ * oriData                  - input values
+ * r5..r1                   - dimension sizes; the first zero among r2, r3, r4, r5
+ *                            selects the 1D, 2D, 3D or 4D path respectively
+ * errBoundMode, absErr_Bound, relBoundRatio, pwRelBoundRatio - error-bound settings
+ *
+ * Returns the status left by getRealPrecision_float() (initially SZ_SCES),
+ * SZ_DERR when r2..r5 are all non-zero (five dimensions), or SZ_MERR when
+ * confparams_cpr->szMode is not one of the handled modes.
+ * On SZ_DERR *newByteData is set to NULL and *outSize to 0; on SZ_MERR the
+ * outputs are left untouched.
+ *
+ * Side effects: writes confparams_cpr->errorBoundMode, fmin, fmax, absErrBound,
+ * and conditionally pw_relBoundRatio and accelerate_pw_rel_compression.
+ */
+int SZ_compress_args_float(int cmprType, int withRegression, unsigned char** newByteData, float *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode; //this is used to print the metadata if needed...
+	if(errBoundMode==PW_REL)
+	{
+		confparams_cpr->pw_relBoundRatio = pwRelBoundRatio;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+
+	if(dataLength <= MIN_NUM_OF_ELEMENTS)
+	{
+		*newByteData = SZ_skip_compress_float(oriData, dataLength, outSize);
+		return status;
+	}
+
+	float valueRangeSize = 0, medianValue = 0;
+
+	/* signs stays NULL unless the accelerated PW_REL path below allocates it */
+	unsigned char * signs = NULL;
+	bool positive = true;
+	float nearZero = 0.0;
+	float min = 0;
+	if(pwRelBoundRatio < 0.000009999)
+		confparams_cpr->accelerate_pw_rel_compression = 0;
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+	{
+		signs = (unsigned char *) malloc(dataLength);
+		memset(signs, 0, dataLength);
+		min = computeRangeSize_float_MSST19(oriData, dataLength, &valueRangeSize, &medianValue, signs, &positive, &nearZero);
+	}
+	else
+		min = computeRangeSize_float(oriData, dataLength, &valueRangeSize, &medianValue);
+	float max = min+valueRangeSize;
+	confparams_cpr->fmin = min;
+	confparams_cpr->fmax = max;
+
+	double realPrecision = 0;
+
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else if(confparams_cpr->errorBoundMode==NORM) //norm error = sqrt(sum((xi-xi_)^2))
+	{
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromNORM_ERR(confparams_cpr->normErr, dataLength);
+	}
+	else
+	{
+		realPrecision = getRealPrecision_float(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		confparams_cpr->absErrBound = realPrecision;
+	}
+	if(valueRangeSize <= realPrecision)
+	{
+		/* signs is not used on this path; free(NULL) is a no-op when it was never allocated */
+		free(signs);
+		SZ_compress_args_float_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL;
+
+		/*
+		 * signs is only ever passed to the *_MSST19 compressors below. When the
+		 * condition that selects them does not hold, nobody else receives the
+		 * buffer, so release it here instead of leaking it. What the *_MSST19
+		 * callees do with signs is not visible here and is left unchanged.
+		 */
+		if(signs != NULL && !(confparams_cpr->accelerate_pw_rel_compression && confparams_cpr->maxRangeRadius <= 32768))
+		{
+			free(signs);
+			signs = NULL;
+		}
+
+		if (r2==0)
+		{
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+			{
+				if(confparams_cpr->accelerate_pw_rel_compression && confparams_cpr->maxRangeRadius <= 32768)
+					SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log_MSST19(&tmpByteData, oriData, pwRelBoundRatio, r1, &tmpOutSize, valueRangeSize, medianValue, signs, &positive, min, max, nearZero);
+				else
+					SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r1, &tmpOutSize, min, max);
+			}
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_1D(cmprType, &tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+					{
+#ifdef HAVE_RANDOMACCESS
+						if(confparams_cpr->randomAccess == 0)
+						{
+#endif
+							SZ_compress_args_float_NoCkRngeNoGzip_1D(cmprType, &tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+							/* compressed output is not smaller than the raw data plus header: store the raw data instead */
+							if(tmpOutSize>=dataLength*sizeof(float) + 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1)
+								SZ_compress_args_float_StoreOriData(oriData, dataLength, &tmpByteData, &tmpOutSize);
+#ifdef HAVE_RANDOMACCESS
+						}
+						else
+							tmpByteData = SZ_compress_float_1D_MDQ_decompression_random_access_with_blocked_regression(oriData, r1, realPrecision, &tmpOutSize);
+#endif
+					}
+		}
+		else
+		if (r3==0)
+		{
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+			{
+				if(confparams_cpr->accelerate_pw_rel_compression && confparams_cpr->maxRangeRadius <= 32768)
+					SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log_MSST19(&tmpByteData, oriData, pwRelBoundRatio, r2, r1, &tmpOutSize, valueRangeSize, signs, &positive, min, max, nearZero);
+				else
+					SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r2, r1, &tmpOutSize, min, max);
+			}
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_2D(cmprType, &tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+				{
+#ifdef HAVE_RANDOMACCESS
+					if(confparams_cpr->randomAccess == 0)
+					{
+#endif
+						if(withRegression == SZ_NO_REGRESSION)
+							SZ_compress_args_float_NoCkRngeNoGzip_2D(cmprType, &tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+						else // SZ 2.1 (2D)
+						{
+							tmpByteData = SZ_compress_float_2D_MDQ_nonblocked_with_blocked_regression(oriData, r2, r1, realPrecision, &tmpOutSize);//SZ 2.1 (2D)
+							if(tmpOutSize>=dataLength*sizeof(float) + 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1)
+								SZ_compress_args_float_StoreOriData(oriData, dataLength, &tmpByteData, &tmpOutSize);
+						}
+#ifdef HAVE_RANDOMACCESS
+					}
+					else
+						tmpByteData = SZ_compress_float_2D_MDQ_decompression_random_access_with_blocked_regression(oriData, r2, r1, realPrecision, &tmpOutSize);
+#endif
+				}
+		}
+		else
+		if (r4==0)
+		{
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+			{
+				if(confparams_cpr->accelerate_pw_rel_compression && confparams_cpr->maxRangeRadius <= 32768)
+					SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log_MSST19(&tmpByteData, oriData, pwRelBoundRatio, r3, r2, r1, &tmpOutSize, valueRangeSize, signs, &positive, min, max, nearZero);
+				else
+					SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r3, r2, r1, &tmpOutSize, min, max);
+			}
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+						multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_3D(cmprType, &tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+				{
+#ifdef HAVE_RANDOMACCESS
+					if(confparams_cpr->randomAccess == 0)
+					{
+#endif
+						if(withRegression == SZ_NO_REGRESSION)
+							SZ_compress_args_float_NoCkRngeNoGzip_3D(cmprType, &tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+						else  //SZ 2.1 (3D)
+						{
+							tmpByteData = SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(oriData, r3, r2, r1, realPrecision, &tmpOutSize); //SZ 2.1 (3D)
+							if(tmpOutSize>=dataLength*sizeof(float) + 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1)
+								SZ_compress_args_float_StoreOriData(oriData, dataLength, &tmpByteData, &tmpOutSize);
+						}
+#ifdef HAVE_RANDOMACCESS
+					}
+					else
+						tmpByteData = SZ_compress_float_3D_MDQ_decompression_random_access_with_blocked_regression(oriData, r3, r2, r1, realPrecision, &tmpOutSize);
+#endif
+				}
+		}
+		else
+		if (r5==0)
+		{
+			/* 4D input: the PW_REL and regression paths reuse the 3D routines with r4*r3 as the first dimension */
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+			{
+				if(confparams_cpr->accelerate_pw_rel_compression && confparams_cpr->maxRangeRadius <= 32768)
+					SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log_MSST19(&tmpByteData, oriData, pwRelBoundRatio, r4*r3, r2, r1, &tmpOutSize, valueRangeSize, signs, &positive, min, max, nearZero);
+				else
+					SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r4*r3, r2, r1, &tmpOutSize, min, max);
+			}
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+				{
+					if(withRegression == SZ_NO_REGRESSION)
+						SZ_compress_args_float_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+					else
+					{
+						tmpByteData = SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(oriData, r4*r3, r2, r1, realPrecision, &tmpOutSize); //SZ 2.1 4D
+						if(tmpOutSize>=dataLength*sizeof(float) + 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1)
+							SZ_compress_args_float_StoreOriData(oriData, dataLength, &tmpByteData, &tmpOutSize);
+					}
+				}
+		}
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			/*
+			 * No compressor ran, so there is no intermediate buffer to post-process
+			 * and signs was never handed to anyone. Return now rather than feeding
+			 * an unset tmpByteData to the lossless stage.
+			 */
+			free(signs);
+			*newByteData = NULL;
+			*outSize = 0;
+			return SZ_DERR; //dimension error
+		}
+		//Call Zstd or Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION || confparams_cpr->szMode==SZ_TEMPORAL_COMPRESSION)
+		{
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the float compression.\n");
+			status = SZ_MERR; //mode error
+			/* the intermediate buffer is not returned to the caller on this path, so release it */
+			free(tmpByteData);
+		}
+	}
+
+	return status;
+}
+
+//TODO
+/*
+ * Dispatch sub-block compression of a float array to the 1D..4D sub-block
+ * routine selected by the first zero among r2, r3, r4, r5.
+ *
+ * compressedBytes - caller-provided output buffer, forwarded to the selected routine
+ * oriData         - input values
+ * r5..r1          - dimension sizes of the whole array
+ * s5..s1, e5..e1  - sub-block bounds, forwarded to the callees
+ * outSize         - out: forwarded to the selected routine
+ * errBoundMode, absErr_Bound, relBoundRatio - error-bound settings
+ *
+ * Returns the status left by getRealPrecision_float() (initially SZ_SCES), or
+ * SZ_DERR when r2..r5 are all non-zero. Nothing is compressed when the value
+ * range is within the error bound (TODO) or when errBoundMode >= PW_REL.
+ */
+int SZ_compress_args_float_subblock(unsigned char* compressedBytes, float *oriData,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1,
+size_t *outSize, int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int result = SZ_SCES;
+	float rangeSize = 0, median = 0;
+	computeRangeSize_float_subblock(oriData, &rangeSize, &median, r5, r4, r3, r2, r1, s5, s4, s3, s2, s1, e5, e4, e3, e2, e1);
+
+	double errBound = getRealPrecision_float(rangeSize, errBoundMode, absErr_Bound, relBoundRatio, &result);
+
+	/* TODO: the within-range case (SZ_compress_args_float_withinRange_subblock) is not implemented; nothing is written */
+	if(rangeSize <= errBound)
+		return result;
+
+	/* every higher dimension in use means a five-dimensional request */
+	if(r2!=0 && r3!=0 && r4!=0 && r5!=0)
+	{
+		printf("Error: doesn't support 5 dimensions for now.\n");
+		return SZ_DERR; //dimension error
+	}
+
+	/* TODO: no *_pwr_subblock routine exists yet for any dimensionality */
+	if(errBoundMode>=PW_REL)
+	{
+		printf ("Current subblock version does not support point-wise relative error bound.\n");
+		return result;
+	}
+
+	if(r2==0)
+		SZ_compress_args_float_NoCkRnge_1D_subblock(compressedBytes, oriData, errBound, outSize, rangeSize, median, r1, s1, e1);
+	else if(r3==0)
+		SZ_compress_args_float_NoCkRnge_2D_subblock(compressedBytes, oriData, errBound, outSize, rangeSize, median, r2, r1, s2, s1, e2, e1);
+	else if(r4==0)
+		SZ_compress_args_float_NoCkRnge_3D_subblock(compressedBytes, oriData, errBound, outSize, rangeSize, median, r3, r2, r1, s3, s2, s1, e3, e2, e1);
+	else
+		SZ_compress_args_float_NoCkRnge_4D_subblock(compressedBytes, oriData, errBound, outSize, rangeSize, median, r4, r3, r2, r1, s4, s3, s2, s1, e4, e3, e2, e1);
+
+	return result;
+}
+
+/*
+ * Compress a sub-block of a 1D float array (s1/e1 are forwarded to
+ * SZ_compress_float_1D_MDQ_subblock; presumably inclusive start/end indices)
+ * into the caller-provided buffer compressedBytes.
+ *
+ * SZ_BEST_SPEED hands compressedBytes directly to convertTDPStoFlatBytes_float_args();
+ * SZ_BEST_COMPRESSION / SZ_DEFAULT_COMPRESSION flatten into a temporary buffer
+ * and run it through zlib_compress3(), whose return value becomes *outSize.
+ * Any other szMode only prints an error; compressedBytes and *outSize are then
+ * left untouched.
+ */
+void SZ_compress_args_float_NoCkRnge_1D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r1, size_t s1, size_t e1)
+{
+	TightDataPointStorageF* tdps = SZ_compress_float_1D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_f, r1, s1, e1);
+
+	if (confparams_cpr->szMode==SZ_BEST_SPEED)
+		convertTDPStoFlatBytes_float_args(tdps, compressedBytes, outSize);
+	else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+	{
+		unsigned char *tmpCompBytes;
+		size_t tmpOutSize;
+		convertTDPStoFlatBytes_float(tdps, &tmpCompBytes, &tmpOutSize);
+		*outSize = zlib_compress3(tmpCompBytes, tmpOutSize, compressedBytes, confparams_cpr->gzipMode);
+		free(tmpCompBytes);
+	}
+	else
+	{
+		/* this is the float code path; the message used to say "double" */
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the float compression.\n");
+	}
+
+	//TODO: fall back to storing the original data when the output is larger than the input
+//	if(*outSize>dataLength*sizeof(float))
+//		SZ_compress_args_float_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+	free_TightDataPointStorageF(tdps);
+}
+
+/*
+ * Compress a sub-block of a 2D float array (the s and e bounds are forwarded to
+ * SZ_compress_float_2D_MDQ_subblock) into the caller-provided buffer compressedBytes.
+ *
+ * SZ_BEST_SPEED hands compressedBytes directly to convertTDPStoFlatBytes_float_args();
+ * SZ_BEST_COMPRESSION / SZ_DEFAULT_COMPRESSION flatten into a temporary buffer
+ * and run it through zlib_compress3(), whose return value becomes *outSize.
+ * Any other szMode only prints an error; compressedBytes and *outSize are then
+ * left untouched.
+ */
+void SZ_compress_args_float_NoCkRnge_2D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r2, size_t r1, size_t s2, size_t s1, size_t e2, size_t e1)
+{
+	TightDataPointStorageF* tdps = SZ_compress_float_2D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_f, r2, r1, s2, s1, e2, e1);
+
+	if (confparams_cpr->szMode==SZ_BEST_SPEED)
+		convertTDPStoFlatBytes_float_args(tdps, compressedBytes, outSize);
+	else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+	{
+		unsigned char *tmpCompBytes;
+		size_t tmpOutSize;
+		convertTDPStoFlatBytes_float(tdps, &tmpCompBytes, &tmpOutSize);
+		*outSize = zlib_compress3(tmpCompBytes, tmpOutSize, compressedBytes, confparams_cpr->gzipMode);
+		free(tmpCompBytes);
+	}
+	else
+	{
+		/* this is the float code path; the message used to say "double" */
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the float compression.\n");
+	}
+
+	//TODO: fall back to storing the original data when the output is larger than the input
+//	if(*outSize>dataLength*sizeof(float))
+//		SZ_compress_args_float_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+	free_TightDataPointStorageF(tdps);
+}
+
+/*
+ * Compress a sub-block of a 3D float array (the s and e bounds are forwarded to
+ * SZ_compress_float_3D_MDQ_subblock) into the caller-provided buffer compressedBytes.
+ *
+ * SZ_BEST_SPEED hands compressedBytes directly to convertTDPStoFlatBytes_float_args();
+ * SZ_BEST_COMPRESSION / SZ_DEFAULT_COMPRESSION flatten into a temporary buffer
+ * and run it through zlib_compress3(), whose return value becomes *outSize.
+ * Any other szMode only prints an error; compressedBytes and *outSize are then
+ * left untouched.
+ */
+void SZ_compress_args_float_NoCkRnge_3D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r3, size_t r2, size_t r1, size_t s3, size_t s2, size_t s1, size_t e3, size_t e2, size_t e1)
+{
+	TightDataPointStorageF* tdps = SZ_compress_float_3D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_f, r3, r2, r1, s3, s2, s1, e3, e2, e1);
+
+	if (confparams_cpr->szMode==SZ_BEST_SPEED)
+		convertTDPStoFlatBytes_float_args(tdps, compressedBytes, outSize);
+	else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+	{
+		unsigned char *tmpCompBytes;
+		size_t tmpOutSize;
+		convertTDPStoFlatBytes_float(tdps, &tmpCompBytes, &tmpOutSize);
+		*outSize = zlib_compress3(tmpCompBytes, tmpOutSize, compressedBytes, confparams_cpr->gzipMode);
+		free(tmpCompBytes);
+	}
+	else
+	{
+		/* this is the float code path; the message used to say "double" */
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the float compression.\n");
+	}
+
+	//TODO: fall back to storing the original data when the output is larger than the input
+//	if(*outSize>dataLength*sizeof(float))
+//		SZ_compress_args_float_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+	free_TightDataPointStorageF(tdps);
+}
+
+/*
+ * Compress a sub-block of a 4D float array (the s and e bounds are forwarded to
+ * SZ_compress_float_4D_MDQ_subblock) into the caller-provided buffer compressedBytes.
+ *
+ * SZ_BEST_SPEED hands compressedBytes directly to convertTDPStoFlatBytes_float_args();
+ * SZ_BEST_COMPRESSION / SZ_DEFAULT_COMPRESSION flatten into a temporary buffer
+ * and run it through zlib_compress3(), whose return value becomes *outSize.
+ * Any other szMode only prints an error; compressedBytes and *outSize are then
+ * left untouched.
+ */
+void SZ_compress_args_float_NoCkRnge_4D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r4, size_t r3, size_t r2, size_t r1, size_t s4, size_t s3, size_t s2, size_t s1, size_t e4, size_t e3, size_t e2, size_t e1)
+{
+	TightDataPointStorageF* tdps = SZ_compress_float_4D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_f, r4, r3, r2, r1, s4, s3, s2, s1, e4, e3, e2, e1);
+
+	if (confparams_cpr->szMode==SZ_BEST_SPEED)
+		convertTDPStoFlatBytes_float_args(tdps, compressedBytes, outSize);
+	else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+	{
+		unsigned char *tmpCompBytes;
+		size_t tmpOutSize;
+		convertTDPStoFlatBytes_float(tdps, &tmpCompBytes, &tmpOutSize);
+		*outSize = zlib_compress3(tmpCompBytes, tmpOutSize, compressedBytes, confparams_cpr->gzipMode);
+		free(tmpCompBytes);
+	}
+	else
+	{
+		/* this is the float code path; the message used to say "double" */
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the float compression.\n");
+	}
+
+	//TODO: fall back to storing the original data when the output is larger than the input
+//	if(*outSize>dataLength*sizeof(float))
+//		SZ_compress_args_float_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+	free_TightDataPointStorageF(tdps);
+}
+
+/*
+ * Pick an interval count for the elements s1..e1 (inclusive) of a 1D float array.
+ *
+ * Every position that is a multiple of confparams_cpr->sampleDistance (starting
+ * at offset 2 within the slice) is predicted from its two predecessors, and the
+ * prediction error, measured in units of realPrecision, is counted in a
+ * histogram with confparams_cpr->maxRangeRadius buckets. The smallest radius
+ * whose cumulative count exceeds predThreshold times the sample count decides
+ * the result.
+ *
+ * r1 is not used. Returns roundUpToPowerOf2(2*(radius+1)), raised to 32 when smaller.
+ */
+unsigned int optimize_intervals_float_1D_subblock(float *oriData, double realPrecision, size_t r1, size_t s1, size_t e1)
+{
+	size_t sliceLen = e1 - s1 + 1;
+	float *slice = oriData + s1;
+
+	/* zero-initialised histogram of quantised prediction errors */
+	int *histogram = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	size_t sampleCount = sliceLen/confparams_cpr->sampleDistance;
+
+	size_t pos;
+	for(pos = 2; pos < sliceLen; pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance != 0)
+			continue;
+
+		/* linear extrapolation from the two preceding elements */
+		float predicted = 2*slice[pos-1] - slice[pos-2];
+		float absErr = fabs(predicted - slice[pos]);
+		unsigned long bucket = (unsigned long)((absErr/realPrecision+1)/2);
+		/* errors beyond the last bucket are clamped into it */
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	//compute the appropriate number
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius < confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered > threshold)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	free(histogram);
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(accIntervals);
+
+	return (rounded < 32) ? 32 : rounded;
+}
+
+/*
+ * Pick an interval count for the sub-block rows s1..e1, columns s2..e2 of a 2D
+ * float array stored with row stride r2.
+ *
+ * Elements with row > s1 and column > s2 whose (row + column) is a multiple of
+ * confparams_cpr->sampleDistance are predicted from their left, upper and
+ * upper-left neighbours; the prediction error, measured in units of
+ * realPrecision, is counted in a histogram with confparams_cpr->maxRangeRadius
+ * buckets. The smallest radius whose cumulative count exceeds predThreshold
+ * times the sample count decides the result.
+ *
+ * r1 is not used. Returns roundUpToPowerOf2(2*(radius+1)), raised to 32 when smaller.
+ */
+unsigned int optimize_intervals_float_2D_subblock(float *oriData, double realPrecision, size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2)
+{
+	size_t rows = e1 - s1 + 1;
+	size_t cols = e2 - s2 + 1;
+
+	/* zero-initialised histogram of quantised prediction errors */
+	int *histogram = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	size_t sampleCount = rows*cols/confparams_cpr->sampleDistance;
+
+	size_t row, col;
+	for(row = s1+1; row <= e1; row++)
+	{
+		for(col = s2+1; col <= e2; col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance != 0)
+				continue;
+
+			size_t cur = row*r2+col;
+			/* left + upper - upper-left neighbour */
+			float predicted = oriData[cur-1] + oriData[cur-r2] - oriData[cur-r2-1];
+			float absErr = fabs(predicted - oriData[cur]);
+			unsigned long bucket = (unsigned long)((absErr/realPrecision+1)/2);
+			/* errors beyond the last bucket are clamped into it */
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1;
+			histogram[bucket]++;
+		}
+	}
+
+	//compute the appropriate number
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius < confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered > threshold)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	free(histogram);
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(accIntervals);
+
+	return (rounded < 32) ? 32 : rounded;
+}
+
+/*
+ * Estimate the quantization-interval count for the sub-block [s1..e1]x[s2..e2]x[s3..e3] of a
+ * row-major 3D float array (strides r2*r3, r3, 1; r1 is unused). Points with every index above
+ * its start and (i+j+k) divisible by sampleDistance are predicted from their seven neighbours.
+ * Returns roundUpToPowerOf2(2*(radius+1)), raised to 32 when smaller; 32 if allocation fails.
+ */
+unsigned int optimize_intervals_float_3D_subblock(float *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3)
+{
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t R3 = e3 - s3 + 1;
+	size_t r23 = r2*r3;
+	size_t i,j,k, index;
+	unsigned long radiusIndex;
+	float pred_value = 0, pred_err;
+	int *intervals = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int)); //zeroed histogram
+	if(intervals == NULL) return 32; //allocation failed: fall back to the minimum interval count
+	size_t totalSampleSize = R1*R2*R3/confparams_cpr->sampleDistance;
+	for(i=s1+1;i<=e1;i++)
+	{
+		for(j=s2+1;j<=e2;j++)
+		{
+			for(k=s3+1;k<=e3;k++)
+			{
+				if((i+j+k)%confparams_cpr->sampleDistance==0)
+				{
+					index = i*r23+j*r3+k; //offset into the whole array, not the sub-block
+					pred_value = oriData[index-1] + oriData[index-r3] + oriData[index-r23]
+					- oriData[index-1-r23] - oriData[index-r3-1] - oriData[index-r3-r23] + oriData[index-r3-r23-1];
+					pred_err = fabs(pred_value - oriData[index]);
+					radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+					if(radiusIndex>=confparams_cpr->maxRangeRadius) //clamp into the last bin
+						radiusIndex = confparams_cpr->maxRangeRadius - 1;
+					intervals[radiusIndex]++;
+				}
+			}
+		}
+	}
+	//find the first radius whose cumulative count exceeds the target
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	free(intervals);
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/*
+ * Estimate the quantization-interval count for the sub-block [s1..e1]x[s2..e2]x[s3..e3]x[s4..e4]
+ * of a row-major 4D float array (strides r2*r3*r4, r3*r4, r4, 1; r1 is unused). Sampled points
+ * (i+j+k+l divisible by sampleDistance, every index above its start) are predicted from the
+ * seven neighbours reached with offsets 1, r4 and r34, i.e. along the last three dimensions.
+ * Returns roundUpToPowerOf2(2*(radius+1)), raised to 32 when smaller; 32 if allocation fails.
+ */
+unsigned int optimize_intervals_float_4D_subblock(float *oriData, double realPrecision,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4)
+{
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t R3 = e3 - s3 + 1;
+	size_t R4 = e4 - s4 + 1;
+	size_t r34 = r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t i,j,k,l, index;
+	unsigned long radiusIndex;
+	float pred_value = 0, pred_err;
+	int *intervals = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int)); //zeroed histogram
+	if(intervals == NULL) return 32; //allocation failed: fall back to the minimum interval count
+	size_t totalSampleSize = R1*R2*R3*R4/confparams_cpr->sampleDistance;
+	for(i=s1+1;i<=e1;i++)
+	{
+		for(j=s2+1;j<=e2;j++)
+		{
+			for(k=s3+1;k<=e3;k++)
+			{
+				for (l=s4+1;l<=e4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l; //offset into the whole array
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+									- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = fabs(pred_value - oriData[index]);
+						radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius) //clamp into the last bin
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//find the first radius whose cumulative count exceeds the target
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	free(intervals);
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/*
+ * Compress elements s1..e1 of the 1D float array oriData into a new TightDataPointStorageF.
+ * The first two elements always take the exact-value path (type code 0). Each later element
+ * is predicted as 2*last3CmprsData[0] - last3CmprsData[1]; when the absolute prediction error
+ * is at most (quantization_intervals-1)*realPrecision its type code is intvRadius +/- state,
+ * otherwise it also takes the exact-value path.
+ * A one-element sub-block stores element 0 only: the second-element step used to read
+ * spaceFillingValue[1] and write type[1] beyond the sub-block. r1 is unused.
+ */
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t s1, size_t e1)
+{
+	size_t dataLength = e1 - s1 + 1;
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_float_1D_subblock(oriData, realPrecision, r1, s1, e1);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	//updateQuantizationInfo(quantization_intervals);
+	int intvRadius = quantization_intervals/2; //type code assigned when state is 0
+
+	size_t i;
+	int reqLength;
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+	int* type = (int*) malloc(dataLength*sizeof(int)); //one type code per element
+	float* spaceFillingValue = oriData + s1; //index 0 refers to element s1
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	float last3CmprsData[3] = {0};
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	//add the first data
+	type[0] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_float(last3CmprsData, vce->data);
+
+	//add the second data, which exists only when the sub-block holds at least two elements
+	if(dataLength>1)
+	{
+		type[1] = 0;
+		compressSingleFloatValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		listAdd_float(last3CmprsData, vce->data);
+	}
+
+	int state;
+	double checkRadius;
+	float curData;
+	float pred;
+	float predAbsErr;
+	checkRadius = (quantization_intervals-1)*realPrecision; //largest error still quantized
+	double interval = 2*realPrecision; //width of one quantization bin
+
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		predAbsErr = fabs(curData - pred);
+		if(predAbsErr<=checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else
+			{
+				type[i] = intvRadius-state;
+				pred = pred - state*interval;
+			}
+			listAdd_float(last3CmprsData, pred); //record the quantized value, not curData
+			continue;
+		}
+
+		//unpredictable data processing
+		type[i] = 0;
+		compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		listAdd_float(last3CmprsData, vce->data);
+	}
+
+	size_t exactDataNum = exactLeadNumArray->size;
+	TightDataPointStorageF* tdps;
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+
+	return tdps;
+}
+
+/*
+ * Compress rows s1..e1, columns s2..e2 of the row-major 2D float array oriData (row stride r2)
+ * into a new TightDataPointStorageF; r1 is unused. P1 keeps the values recorded for the
+ * previous row and P0 those of the row being processed; the two are swapped after each row.
+ * A point with fabs(diff)/realPrecision + 1 < quantization_intervals gets type code
+ * intvRadius + (int)(itvNum/2), itvNum carrying the sign of diff; any other point gets type
+ * code 0 and is stored through the exact-value path.
+ * A one-column sub-block (R2 == 1) skips the second-element step, which used to write P1[1]
+ * past the end of P1 and to process a point outside the sub-block.
+ */
+TightDataPointStorageF* SZ_compress_float_2D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2)
+{
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_2D_subblock(oriData, realPrecision, r1, r2, s1, s2, e1, e2);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;
+
+	size_t i,j;
+	int reqLength;
+	float pred1D, pred2D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1;
+
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t dataLength = R1*R2;
+
+	P0 = (float*)malloc(R2*sizeof(float));
+	memset(P0, 0, R2*sizeof(float));
+	P1 = (float*)malloc(R2*sizeof(float));
+	memset(P1, 0, R2*sizeof(float));
+
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); //one type code per sub-block point
+	float* spaceFillingValue = oriData; //indexed with whole-array offsets (gIndex)
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	/* Process Row-s1 data s2*/
+	size_t gIndex; //offset into oriData
+	size_t lIndex; //offset into type
+	gIndex = s1*r2+s2;
+	lIndex = 0;
+
+	type[lIndex] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+
+	/* Process Row-s1 data s2+1 (present only when the sub-block has a second column) */
+	if (R2 > 1)
+	{
+		gIndex = s1*r2+(s2+1);
+		lIndex = 1;
+		pred1D = P1[0];
+		diff = spaceFillingValue[gIndex] - pred1D;
+		itvNum =  fabs(diff)/realPrecision + 1;
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P1[1] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[1] = vce->data;
+		}
+	}
+
+	/* Process Row-s1 data s2+2 --> data e2 */
+	for (j = 2; j < R2; j++)
+	{
+		gIndex = s1*r2+(s2+j);
+		lIndex = j;
+
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[gIndex] - pred1D;
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P1[j] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+	}
+
+	/* Process Row-s1+1 --> Row-e1 */
+	for (i = 1; i < R1; i++)
+	{
+		/* Process row-s1+i data s2 */
+		gIndex = (s1+i)*r2+s2;
+		lIndex = i*R2;
+
+		pred1D = P1[0];
+		diff = spaceFillingValue[gIndex] - pred1D;
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P0[0] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+
+		/* Process row-s1+i data s2+1 --> e2 */
+		for (j = 1; j < R2; j++)
+		{
+			gIndex = (s1+i)*r2+(s2+j);
+			lIndex = i*R2+j;
+
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[gIndex] - pred2D;
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P0[j] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+		}
+
+		float *Pt; //swap the row buffers: the finished row becomes the previous row
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+	TightDataPointStorageF* tdps;
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+
+	return tdps;
+}
+
+/*
+ * Compress the sub-block [s1..e1]x[s2..e2]x[s3..e3] of the row-major 3D float array oriData
+ * (strides r2*r3, r3, 1; r1 is unused) into a new TightDataPointStorageF.
+ * P1 keeps the values recorded for the previous layer and P0 those of the layer being
+ * processed; the two are swapped after each layer. A point with
+ * fabs(diff)/realPrecision + 1 < quantization_intervals gets type code
+ * intvRadius + (int)(itvNum/2); any other point gets type code 0 and the exact-value path.
+ * When the sub-block is one element wide (R3 == 1) the second-element step is skipped; it
+ * used to process a point outside the sub-block.
+ */
+TightDataPointStorageF* SZ_compress_float_3D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3)
+{
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_3D_subblock(oriData, realPrecision, r1, r2, r3, s1, s2, s3, e1, e2, e3);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;
+
+	size_t i,j,k;
+	int reqLength;
+	float pred1D, pred2D, pred3D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1;
+
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t R3 = e3 - s3 + 1;
+	size_t dataLength = R1*R2*R3;
+
+	size_t r23 = r2*r3; //layer stride of the whole array
+	size_t R23 = R2*R3; //layer size of the sub-block
+
+	P0 = (float*)malloc(R23*sizeof(float));
+	P1 = (float*)malloc(R23*sizeof(float));
+
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); //one type code per sub-block point
+	float* spaceFillingValue = oriData; //indexed with whole-array offsets (gIndex)
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	///////////////////////////	Process layer-s1 ///////////////////////////
+	/* Process Row-s2 data s3*/
+	size_t gIndex; 	//global index
+	size_t lIndex; 	//local index
+	size_t index2D; 	//local 2D index
+	gIndex = s1*r23+s2*r3+s3;
+	lIndex = 0;
+	index2D = 0;
+	type[lIndex] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[index2D] = vce->data;
+
+	/* Process Row-s2 data s3+1 (present only when the sub-block is at least two elements wide) */
+	if (R3 > 1)
+	{
+		gIndex = s1*r23+s2*r3+s3+1;
+		lIndex = 1;
+		index2D = 1;
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[gIndex] - pred1D;
+		itvNum = fabs(diff)/realPrecision + 1;
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P1[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+	}
+
+	/* Process Row-s2 data s3+2 --> data e3 */
+	for (j = 2; j < R3; j++)
+	{
+		gIndex = s1*r23+s2*r3+s3+j;
+		lIndex = j;
+		index2D = j;
+
+		pred1D = 2*P1[index2D-1] - P1[index2D-2];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P1[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+	}
+
+	/* Process Row-s2+1 --> Row-e2 */
+	for (i = 1; i < R2; i++)
+	{
+		/* Process row-s2+i data s3 */
+		gIndex = s1*r23+(s2+i)*r3+s3;
+		lIndex = i*R3;
+		index2D = i*R3;
+
+		pred1D  = P1[index2D-R3];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P1[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+
+		/* Process row-s2+i data s3+1 --> data e3*/
+		for (j = 1; j < R3; j++)
+		{
+			gIndex = s1*r23+(s2+i)*r3+s3+j;
+			lIndex = i*R3+j;
+			index2D = i*R3+j;
+
+			pred2D  = P1[index2D-1] + P1[index2D-R3] - P1[index2D-R3-1];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P1[index2D] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-s1+1 --> layer-e1 ///////////////////////////
+	for (k = 1; k < R1; k++)
+	{
+		/* Process Row-s2 data s3*/
+		gIndex = (s1+k)*r23+s2*r3+s3;
+		lIndex = k*R23;
+		index2D = 0;
+
+		pred1D = P1[index2D];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P0[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[index2D] = vce->data;
+		}
+
+		/* Process Row-s2 data s3+1 --> data e3 */
+		for (j = 1; j < R3; j++)
+		{
+			gIndex = (s1+k)*r23+s2*r3+s3+j;
+			lIndex = k*R23+j;
+			index2D = j;
+
+			pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P0[index2D] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+		}
+
+		/* Process Row-s2+1 --> Row-e2 */
+		for (i = 1; i < R2; i++)
+		{
+			/* Process Row-s2+i data s3 */
+			gIndex = (s1+k)*r23+(s2+i)*r3+s3;
+			lIndex = k*R23+i*R3;
+			index2D = i*R3;
+
+			pred2D = P0[index2D-R3] + P1[index2D] - P1[index2D-R3];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P0[index2D] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+			/* Process Row-s2+i data s3+1 --> data e3 */
+			for (j = 1; j < R3; j++)
+			{
+				gIndex = (s1+k)*r23+(s2+i)*r3+s3+j;
+				lIndex = k*R23+i*R3+j;
+				index2D = i*R3+j;
+
+				pred3D = P0[index2D-1] + P0[index2D-R3]+ P1[index2D] - P0[index2D-R3-1] - P1[index2D-R3] - P1[index2D-1] + P1[index2D-R3-1];
+				diff = spaceFillingValue[gIndex] - pred3D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + intvRadius;
+					P0[index2D] = pred3D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+		}
+
+		float *Pt; //swap the layer buffers: the finished layer becomes the previous layer
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageF* tdps;
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+
+	return tdps;
+}
+
+TightDataPointStorageF* SZ_compress_float_4D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4)
+{
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_4D_subblock(oriData, realPrecision, r1, r2, r3, r4, s1, s2, s3, s4, e1, e2, e3, e4);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	int intvRadius = quantization_intervals/2;
+
+	size_t i,j,k; 
+	int reqLength;
+	float pred1D, pred2D, pred3D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1;
+
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t R3 = e3 - s3 + 1;
+	size_t R4 = e4 - s4 + 1;
+
+	size_t dataLength = R1*R2*R3*R4;
+
+	size_t r34 = r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t R34 = R3*R4;
+	size_t R234 = R2*R3*R4;
+
+	P0 = (float*)malloc(R34*sizeof(float));
+	P1 = (float*)malloc(R34*sizeof(float));
+
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	float* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+
+	size_t l;
+	for (l = 0; l < R1; l++)
+	{
+
+		///////////////////////////	Process layer-s2 ///////////////////////////
+		/* Process Row-s3 data s4*/
+		size_t gIndex; 	//global index
+		size_t lIndex; 	//local index
+		size_t index2D; 	//local 2D index
+
+		gIndex = (s1+l)*r234+s2*r34+s3*r4+s4;
+		lIndex = l*R234;
+		index2D = 0;
+
+		type[lIndex] = 0;
+		compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[index2D] = vce->data;
+
+		/* Process Row-s3 data s4+1*/
+		gIndex = (s1+l)*r234+s2*r34+s3*r4+s4+1;
+		lIndex = l*R234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < quantization_intervals)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + intvRadius;
+			P1[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+
+		/* Process Row-s3 data s4+2 --> data e4 */
+		for (j = 2; j < R4; j++)
+		{
+			gIndex = (s1+l)*r234+s2*r34+s3*r4+s4+j;
+			lIndex = l*R234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[gIndex] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P1[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+		}
+
+		/* Process Row-s3+1 --> Row-e3 */
+		for (i = 1; i < R3; i++)
+		{
+			/* Process row-s2+i data s3 */
+			gIndex = (s1+l)*r234+s2*r34+(s3+i)*r4+s4;
+			lIndex = l*R234+i*R4;
+			index2D = i*R4;
+
+			pred1D  = P1[index2D-R4];
+			diff = spaceFillingValue[gIndex] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P1[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+
+			/* Process row-s3+i data s4+1 --> data e4*/
+			for (j = 1; j < R4; j++)
+			{
+				gIndex = (s1+l)*r234+s2*r34+(s3+i)*r4+s4+j;
+				lIndex = l*R234+i*R4+j;
+				index2D = i*R4+j;
+
+				pred2D  = P1[index2D-1] + P1[index2D-R4] - P1[index2D-R4-1];
+				diff = spaceFillingValue[gIndex] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + intvRadius;
+					P1[index2D] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P1[index2D] = vce->data;
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-s2+1 --> layer-e2 ///////////////////////////
+
+		for (k = 1; k < R2; k++)
+		{
+			/* Process Row-s3 data s4*/
+			gIndex = (s1+l)*r234+(s2+k)*r34+s3*r4+s4;
+			lIndex = l*R234+k*R34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[gIndex] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < quantization_intervals)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + intvRadius;
+				P0[index2D] = pred1D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+			/* Process Row-s3 data s4+1 --> data e4 */
+			for (j = 1; j < R4; j++)
+			{
+				gIndex = (s1+l)*r234+(s2+k)*r34+s3*r4+s4+j;
+				lIndex = l*R234+k*R34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[gIndex] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + intvRadius;
+					P0[index2D] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+
+			/* Process Row-s3+1 --> Row-e3 */
+			for (i = 1; i < R3; i++)
+			{
+				/* Process Row-s3+i data s4 */
+				gIndex = (s1+l)*r234+(s2+k)*r34+(s3+i)*r4+s4;
+				lIndex = l*R234+k*R34+i*R4;
+				index2D = i*R4;
+
+				pred2D = P0[index2D-R4] + P1[index2D] - P1[index2D-R4];
+				diff = spaceFillingValue[gIndex] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < quantization_intervals)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + intvRadius;
+					P0[index2D] = pred2D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+
+				/* Process Row-s3+i data s4+1 --> data e4 */
+				for (j = 1; j < R4; j++)
+				{
+					gIndex = (s1+l)*r234+(s2+k)*r34+(s3+i)*r4+s4+j;
+					lIndex = l*R234+k*R34+i*R4+j;
+					index2D = i*R4+j;
+
+//					printf ("global index = %d, local index = %d\n", gIndex, lIndex);
+
+					pred3D = P0[index2D-1] + P0[index2D-R4]+ P1[index2D] - P0[index2D-R4-1] - P1[index2D-R4] - P1[index2D-1] + P1[index2D-R4-1];
+					diff = spaceFillingValue[gIndex] - pred3D;
+
+					itvNum = fabs(diff)/realPrecision + 1;
+
+					if (itvNum < quantization_intervals)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[lIndex] = (int) (itvNum/2) + intvRadius;
+						P0[index2D] = pred3D + 2 * (type[lIndex] - intvRadius) * realPrecision;
+					}
+					else
+					{
+						type[lIndex] = 0;
+						compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+						updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+						memcpy(preDataBytes,vce->curBytes,4);
+						addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+						P0[index2D] = vce->data;
+					}
+				}
+			}
+
+			float *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageF* tdps;
+
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+
+	return tdps;
+}
+
+/*
+ * Estimate the number of quantization intervals for 1D float data using
+ * ratio-based (log-domain) prediction errors.
+ *
+ * Starting at element 2 and stepping by confparams_cpr->sampleDistance, every
+ * non-zero sample is compared with its left neighbour. The value
+ * |log2(|sample/neighbour|)/divider + 0.5| is used as a histogram bucket
+ * (clamped to maxRangeRadius-1). The smallest bucket index i whose cumulative
+ * count exceeds totalSampleSize*predThreshold yields 2*(i+1), which is passed
+ * through roundUpToPowerOf2() and floored at 32.
+ *
+ * oriData       : input samples
+ * dataLength    : number of elements in oriData
+ * realPrecision : precision used to derive the bucket width
+ * returns       : interval count (never less than 32)
+ */
+unsigned int optimize_intervals_float_1D_opt_MSST19(float *oriData, size_t dataLength, double realPrecision)
+{
+	size_t i = 0, radiusIndex;
+	float pred_value = 0;
+	double pred_err;
+	double scaledErr;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = 0;
+
+	float * data_pos = oriData + 2;
+	float divider = log2(1+realPrecision)*2;
+	/* data_pos never moves below oriData+2, so the difference is non-negative */
+	while((size_t)(data_pos - oriData) < dataLength){
+		/* a zero sample would give log2(0); it is skipped and not counted */
+		if(*data_pos == 0){
+			data_pos += confparams_cpr->sampleDistance;
+			continue;
+		}
+		totalSampleSize++;
+		pred_value = data_pos[-1];
+		pred_err = fabs((double)*data_pos / pred_value);
+		scaledErr = fabs(log2(pred_err)/divider+0.5);
+		/*
+		 * Range-check in floating point BEFORE converting: a zero neighbour
+		 * makes the ratio infinite, and converting inf/NaN (or any value that
+		 * does not fit) to an integer is undefined behaviour. Everything that
+		 * is not strictly below maxRangeRadius lands in the last bucket.
+		 */
+		if(scaledErr < (double)confparams_cpr->maxRangeRadius)
+			radiusIndex = (size_t)scaledErr;
+		else
+			radiusIndex = confparams_cpr->maxRangeRadius - 1;
+		intervals[radiusIndex]++;
+
+		data_pos += confparams_cpr->sampleDistance;
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/*
+ * Estimate the number of quantization intervals for 2D float data (r1 rows of
+ * r2 elements) using ratio-based (log-domain) prediction errors.
+ *
+ * Sampled, non-zero elements are predicted from their left, upper and
+ * upper-left neighbours; |log2(|pred/sample|)/divider + 0.5| selects a
+ * histogram bucket (clamped to maxRangeRadius-1). The smallest bucket index i
+ * whose cumulative count exceeds totalSampleSize*predThreshold yields 2*(i+1),
+ * which is passed through roundUpToPowerOf2() and floored at 32.
+ *
+ * NOTE(review): the first sample sits at oriData[r2 + sampleDistance - 1] and
+ * reads data_pos[-r2-1], so sampleDistance is presumably >= 2 - confirm.
+ * NOTE(review): the zero-skip path advances data_pos without updating
+ * offset_count - confirm this is intended.
+ * NOTE(review): the 1D/3D MSST19 variants use sample/pred while this one uses
+ * pred/sample - confirm which ratio is intended.
+ */
+unsigned int optimize_intervals_float_2D_opt_MSST19(float *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	size_t i;
+	size_t radiusIndex;
+	float pred_value = 0, pred_err;
+	double scaledErr;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = 0;
+
+	size_t offset_count = confparams_cpr->sampleDistance - 1; // count r2 offset
+	size_t offset_count_2;
+	float * data_pos = oriData + r2 + offset_count;
+	float divider = log2(1+realPrecision)*2;
+	size_t n1_count = 1; // count i sum
+	size_t len = r1 * r2;
+	/* data_pos only ever moves forward from oriData, so the difference is non-negative */
+	while((size_t)(data_pos - oriData) < len){
+		/* zero samples are skipped (division by zero) and not counted */
+		if(*data_pos == 0){
+			data_pos += confparams_cpr->sampleDistance;
+			continue;
+		}
+		totalSampleSize++;
+		pred_value = data_pos[-1] + data_pos[-r2] - data_pos[-r2-1];
+		pred_err = fabs(pred_value / *data_pos);
+		scaledErr = fabs(log2(pred_err)/divider+0.5);
+		/*
+		 * Range-check in floating point BEFORE converting: a zero prediction
+		 * gives log2(0) = -inf, and converting inf/NaN (or any value that does
+		 * not fit) to an integer is undefined behaviour. Everything that is
+		 * not strictly below maxRangeRadius lands in the last bucket.
+		 */
+		if(scaledErr < (double)confparams_cpr->maxRangeRadius)
+			radiusIndex = (size_t)scaledErr;
+		else
+			radiusIndex = confparams_cpr->maxRangeRadius - 1;
+		intervals[radiusIndex]++;
+
+		offset_count += confparams_cpr->sampleDistance;
+		if(offset_count >= r2){
+			/* ran past the end of the row: move to the next row with a shifted start column */
+			n1_count ++;
+			offset_count_2 = n1_count % confparams_cpr->sampleDistance;
+			data_pos += (r2 + confparams_cpr->sampleDistance - offset_count) + (confparams_cpr->sampleDistance - offset_count_2);
+			offset_count = (confparams_cpr->sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else data_pos += confparams_cpr->sampleDistance;
+	}
+
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/*
+ * Estimate the number of quantization intervals for 3D float data
+ * (r1 x r2 x r3, r3 fastest) using ratio-based (log-domain) prediction errors.
+ *
+ * Sampled, non-zero elements are predicted from their seven already-visited
+ * neighbours; |log2(|sample/pred|)/divider + 0.5| selects a histogram bucket
+ * (clamped to maxRangeRadius-1). The smallest bucket index i whose cumulative
+ * count exceeds totalSampleSize*predThreshold yields 2*(i+1), which is passed
+ * through roundUpToPowerOf2() and floored at 32.
+ *
+ * NOTE(review): the first sample sits at oriData[r23 + r3 + sampleDistance - 2]
+ * and reads data_pos[-r3-r23-1], so sampleDistance is presumably >= 3 - confirm.
+ * NOTE(review): the zero-skip path advances data_pos without updating
+ * offset_count - confirm this is intended.
+ */
+unsigned int optimize_intervals_float_3D_opt_MSST19(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	size_t i;
+	size_t radiusIndex;
+	size_t r23=r2*r3;
+	float pred_value = 0, pred_err;
+	double scaledErr;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = 0;
+
+	size_t offset_count = confparams_cpr->sampleDistance - 2; // count r3 offset
+	size_t offset_count_2;
+	float * data_pos = oriData + r23 + r3 + offset_count;
+	float divider = log2(1+realPrecision)*2;
+	size_t n1_count = 1, n2_count = 1; // count i,j sum
+	size_t len = r1 * r2 * r3;
+	/* data_pos only ever moves forward from oriData, so the difference is non-negative */
+	while((size_t)(data_pos - oriData) < len){
+		/* zero samples are skipped and not counted */
+		if(*data_pos == 0){
+			data_pos += confparams_cpr->sampleDistance;
+			continue;
+		}
+		totalSampleSize++;
+		pred_value = data_pos[-1] + data_pos[-r3] + data_pos[-r23] - data_pos[-1-r23] - data_pos[-r3-1] - data_pos[-r3-r23] + data_pos[-r3-r23-1];
+		pred_err = fabsf(*data_pos / pred_value);
+		scaledErr = fabs(log2(pred_err)/divider+0.5);
+		/*
+		 * Range-check in floating point BEFORE converting: a zero prediction
+		 * makes the ratio infinite, and converting inf/NaN (or any value that
+		 * does not fit) to an integer is undefined behaviour. Everything that
+		 * is not strictly below maxRangeRadius lands in the last bucket.
+		 */
+		if(scaledErr < (double)confparams_cpr->maxRangeRadius)
+			radiusIndex = (size_t)scaledErr;
+		else
+			radiusIndex = confparams_cpr->maxRangeRadius - 1;
+		intervals[radiusIndex]++;
+		offset_count += confparams_cpr->sampleDistance;
+		if(offset_count >= r3){
+			/* ran past the end of the row: advance to the next row (and layer when the row counter reaches r2) */
+			n2_count ++;
+			if(n2_count == r2){
+				n1_count ++;
+				n2_count = 1;
+				data_pos += r3;
+			}
+			offset_count_2 = (n1_count + n2_count) % confparams_cpr->sampleDistance;
+			data_pos += (r3 + confparams_cpr->sampleDistance - offset_count) + (confparams_cpr->sampleDistance - offset_count_2);
+			offset_count = (confparams_cpr->sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else data_pos += confparams_cpr->sampleDistance;
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+	free(intervals);
+	return powerOf2;
+}
+
+
+/*
+ * Estimate the number of quantization intervals for 3D float data
+ * (r1 x r2 x r3, r3 fastest) from sampled absolute prediction errors.
+ *
+ * Each sampled element is predicted from its seven already-visited
+ * neighbours; (|pred - sample|/realPrecision + 1)/2 selects a histogram
+ * bucket (clamped to maxRangeRadius-1). The smallest bucket whose cumulative
+ * count exceeds sampleCount*predThreshold gives 2*(bucket+1), which is passed
+ * through roundUpToPowerOf2() and floored at 32.
+ */
+unsigned int optimize_intervals_float_3D_opt(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	size_t bucket;
+	size_t slot;
+	size_t r23 = r2*r3;
+	size_t total = r1 * r2 * r3;
+	size_t sampleCount = 0;
+	size_t layerNo = 1, rowNo = 1;	/* layer / row counters of the current sample */
+	size_t colOffset = confparams_cpr->sampleDistance - 2;	/* offset inside the current row */
+	size_t phase;
+	float predicted = 0, absErr;
+	float *cursor = oriData + r23 + r3 + colOffset;
+
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+
+	while (cursor - oriData < total)
+	{
+		sampleCount++;
+
+		/* 3D Lorenzo-style prediction from the seven preceding neighbours */
+		predicted = cursor[-1] + cursor[-r3] + cursor[-r23] - cursor[-1-r23] - cursor[-r3-1] - cursor[-r3-r23] + cursor[-r3-r23-1];
+		absErr = fabs(predicted - *cursor);
+
+		slot = (absErr/realPrecision+1)/2;
+		if (slot >= confparams_cpr->maxRangeRadius)
+			slot = confparams_cpr->maxRangeRadius - 1;
+		histogram[slot]++;
+
+		colOffset += confparams_cpr->sampleDistance;
+		if (colOffset < r3)
+		{
+			/* still inside the same row */
+			cursor += confparams_cpr->sampleDistance;
+			continue;
+		}
+
+		/* ran past the row end: go to the next row, or the next layer when the row counter reaches r2 */
+		rowNo++;
+		if (rowNo == r2)
+		{
+			layerNo++;
+			rowNo = 1;
+			cursor += r3;
+		}
+		phase = (layerNo + rowNo) % confparams_cpr->sampleDistance;
+		cursor += (r3 + confparams_cpr->sampleDistance - colOffset) + (confparams_cpr->sampleDistance - phase);
+		colOffset = (confparams_cpr->sampleDistance - phase);
+		if (colOffset == 0)
+			colOffset++;
+	}
+
+	/* find the first bucket at which the cumulative count passes the threshold */
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t cumulative = 0;
+	bucket = 0;
+	while (bucket < confparams_cpr->maxRangeRadius)
+	{
+		cumulative += histogram[bucket];
+		if (cumulative > threshold)
+			break;
+		bucket++;
+	}
+	if (bucket >= confparams_cpr->maxRangeRadius)
+		bucket = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int intervalCount = 2*(bucket+1);
+	unsigned int rounded = roundUpToPowerOf2(intervalCount);
+
+	free(histogram);
+	return (rounded < 32) ? 32 : rounded;
+}
+
+/*
+ * Quantize a single sample against its prediction.
+ *
+ * On success *typeSlot receives the non-zero quantization code and *reconSlot
+ * the reconstructed value. If the quantization code would fall outside
+ * exe_params->intvCapacity, or the reconstructed value misses the error bound
+ * (machine-epsilon effects), the sample is stored verbatim instead:
+ * *typeSlot = 0, *reconSlot = curData, and curData is appended to
+ * unpredictable_data (advancing *unpredictable_count).
+ */
+static inline void SZ_quantize_float_3D_RA_point(float curData, float pred, float realPrecision, float recip_realPrecision, int * typeSlot, float * reconSlot, float * unpredictable_data, size_t * unpredictable_count)
+{
+	float diff = curData - pred;
+	float itvNum = fabsf(diff)*recip_realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		*typeSlot = (int) (itvNum/2) + exe_params->intvRadius;
+		*reconSlot = pred + 2 * (*typeSlot - exe_params->intvRadius) * realPrecision;
+		/* guarantee the compression error bound against machine-epsilon effects */
+		if (!(fabsf(curData - *reconSlot) > realPrecision))
+			return;
+	}
+
+	*typeSlot = 0;
+	*reconSlot = curData;
+	unpredictable_data[(*unpredictable_count)++] = curData;
+}
+
+/*
+ * Predict and quantize one 3D block of float data.
+ *
+ * block_ori_data     : first element of the block inside a larger array whose
+ *                      row stride is dim_2 and layer stride is dim_1*dim_2
+ * mean               : mean[0] is set to block_ori_data[0] and used as the
+ *                      prediction for the very first element
+ * dim_0              : not used by this function
+ * block_dim_0/1/2    : block extents (layers, rows, elements per row)
+ * realPrecision      : error bound
+ * P0, P1             : scratch buffers holding the reconstructed values of the
+ *                      current / previous layer; indices up to
+ *                      block_dim_1*block_dim_2-1 are used
+ * type               : receives one quantization code per element in block
+ *                      order (0 marks an element stored verbatim)
+ * unpredictable_data : receives the verbatim values
+ * returns            : number of values written to unpredictable_data
+ *
+ * NOTE(review): elements 0 and 1 of the first row are processed
+ * unconditionally, so block_dim_2 is presumably >= 2 - confirm against callers.
+ */
+size_t SZ_compress_float_3D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, float realPrecision, float * P0, float * P1, int * type, float * unpredictable_data){
+
+	float recip_realPrecision = 1/realPrecision;
+	size_t dim0_offset = dim_1 * dim_2;
+	size_t dim1_offset = dim_2;
+
+	mean[0] = block_ori_data[0];
+
+	size_t unpredictable_count = 0;
+	size_t r1 = block_dim_0;
+	size_t r2 = block_dim_1;
+	size_t r3 = block_dim_2;
+	size_t r23 = r2*r3;
+	size_t i, j, k;
+	size_t index, index2D;
+	float * cur_data_pos = block_ori_data;
+	float pred;
+	float * Pt;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+
+	/* Row-0 data 0: predicted from mean[0] */
+	pred = mean[0];
+	SZ_quantize_float_3D_RA_point(cur_data_pos[0], pred, realPrecision, recip_realPrecision, &type[0], &P1[0], unpredictable_data, &unpredictable_count);
+
+	/* Row-0 data 1: predicted from the previous reconstructed value */
+	pred = P1[0];
+	SZ_quantize_float_3D_RA_point(cur_data_pos[1], pred, realPrecision, recip_realPrecision, &type[1], &P1[1], unpredictable_data, &unpredictable_count);
+
+	/* Row-0 data 2 --> data r3-1: linear extrapolation from the two previous values */
+	for (j = 2; j < r3; j++)
+	{
+		pred = 2*P1[j-1] - P1[j-2];
+		SZ_quantize_float_3D_RA_point(cur_data_pos[j], pred, realPrecision, recip_realPrecision, &type[j], &P1[j], unpredictable_data, &unpredictable_count);
+	}
+	cur_data_pos += dim1_offset;
+
+	/* Row-1 --> Row-r2-1 */
+	for (i = 1; i < r2; i++)
+	{
+		/* row-i data 0: predicted from the value directly above */
+		index = i*r3;
+		pred = P1[index-r3];
+		SZ_quantize_float_3D_RA_point(cur_data_pos[0], pred, realPrecision, recip_realPrecision, &type[index], &P1[index], unpredictable_data, &unpredictable_count);
+
+		/* row-i data 1 --> data r3-1: 2D prediction (left + above - above-left) */
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+			SZ_quantize_float_3D_RA_point(cur_data_pos[j], pred, realPrecision, recip_realPrecision, &type[index], &P1[index], unpredictable_data, &unpredictable_count);
+		}
+		cur_data_pos += dim1_offset;
+	}
+	/* skip from the end of this block's layer to the start of its next layer */
+	cur_data_pos += dim0_offset - r2 * dim1_offset;
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Row-0 data 0: predicted from the same position in the previous layer */
+		index = k*r23;
+		pred = P1[0];
+		SZ_quantize_float_3D_RA_point(cur_data_pos[0], pred, realPrecision, recip_realPrecision, &type[index], &P0[0], unpredictable_data, &unpredictable_count);
+
+		/* Row-0 data 1 --> data r3-1: 2D prediction across the layer boundary */
+		for (j = 1; j < r3; j++)
+		{
+			index = k*r23 + j;
+			pred = P0[j-1] + P1[j] - P1[j-1];
+			SZ_quantize_float_3D_RA_point(cur_data_pos[j], pred, realPrecision, recip_realPrecision, &type[index], &P0[j], unpredictable_data, &unpredictable_count);
+		}
+		cur_data_pos += dim1_offset;
+
+		/* Row-1 --> Row-r2-1 */
+		for (i = 1; i < r2; i++)
+		{
+			/* Row-i data 0: 2D prediction (above + previous layer - previous layer above) */
+			index = k*r23 + i*r3;
+			index2D = i*r3;
+			pred = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			SZ_quantize_float_3D_RA_point(cur_data_pos[0], pred, realPrecision, recip_realPrecision, &type[index], &P0[index2D], unpredictable_data, &unpredictable_count);
+
+			/* Row-i data 1 --> data r3-1: full 3D prediction from seven neighbours */
+			for (j = 1; j < r3; j++)
+			{
+				index = k*r23 + i*r3 + j;
+				index2D = i*r3 + j;
+				pred = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				SZ_quantize_float_3D_RA_point(cur_data_pos[j], pred, realPrecision, recip_realPrecision, &type[index], &P0[index2D], unpredictable_data, &unpredictable_count);
+			}
+			cur_data_pos += dim1_offset;
+		}
+		cur_data_pos += dim0_offset - r2 * dim1_offset;
+
+		/* the layer just written becomes the "previous layer" for the next iteration */
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	return unpredictable_count;
+}
+
+
+/*
+ * Estimate the number of quantization intervals for 2D float data (r1 rows of
+ * r2 elements) from sampled absolute prediction errors.
+ *
+ * Each sampled element is predicted as left + above - above-left;
+ * (|pred - sample|/realPrecision + 1)/2 selects a histogram bucket (clamped
+ * to maxRangeRadius-1). The smallest bucket whose cumulative count exceeds
+ * sampleCount*predThreshold gives 2*(bucket+1), which is passed through
+ * roundUpToPowerOf2() and floored at 32.
+ */
+unsigned int optimize_intervals_float_2D_opt(float *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	size_t bucket;
+	size_t slot;
+	size_t total = r1 * r2;
+	size_t sampleCount = 0;
+	size_t rowNo = 1;	/* row counter of the current sample */
+	size_t colOffset = confparams_cpr->sampleDistance - 1;	/* offset inside the current row */
+	size_t phase;
+	float predicted = 0, absErr;
+	float *cursor = oriData + r2 + colOffset;
+
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+
+	while (cursor - oriData < total)
+	{
+		sampleCount++;
+
+		predicted = cursor[-1] + cursor[-r2] - cursor[-r2-1];
+		absErr = fabs(predicted - *cursor);
+
+		slot = (unsigned long)((absErr/realPrecision+1)/2);
+		if (slot >= confparams_cpr->maxRangeRadius)
+			slot = confparams_cpr->maxRangeRadius - 1;
+		histogram[slot]++;
+
+		colOffset += confparams_cpr->sampleDistance;
+		if (colOffset < r2)
+		{
+			/* still inside the same row */
+			cursor += confparams_cpr->sampleDistance;
+			continue;
+		}
+
+		/* ran past the row end: continue in the next row with a shifted start offset */
+		rowNo++;
+		phase = rowNo % confparams_cpr->sampleDistance;
+		cursor += (r2 + confparams_cpr->sampleDistance - colOffset) + (confparams_cpr->sampleDistance - phase);
+		colOffset = (confparams_cpr->sampleDistance - phase);
+		if (colOffset == 0)
+			colOffset++;
+	}
+
+	/* find the first bucket at which the cumulative count passes the threshold */
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t cumulative = 0;
+	bucket = 0;
+	while (bucket < confparams_cpr->maxRangeRadius)
+	{
+		cumulative += histogram[bucket];
+		if (cumulative > threshold)
+			break;
+		bucket++;
+	}
+	if (bucket >= confparams_cpr->maxRangeRadius)
+		bucket = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int intervalCount = 2*(bucket+1);
+	unsigned int rounded = roundUpToPowerOf2(intervalCount);
+
+	free(histogram);
+	return (rounded < 32) ? 32 : rounded;
+}
+
+/*
+ * Estimate the number of quantization intervals for 1D float data from
+ * sampled absolute prediction errors.
+ *
+ * Starting at element 2 and stepping by confparams_cpr->sampleDistance, each
+ * sample is predicted by its left neighbour;
+ * (|pred - sample|/realPrecision + 1)/2 selects a histogram bucket (clamped
+ * to maxRangeRadius-1). The smallest bucket whose cumulative count exceeds
+ * sampleCount*predThreshold gives 2*(bucket+1), which is passed through
+ * roundUpToPowerOf2() and floored at 32.
+ */
+unsigned int optimize_intervals_float_1D_opt(float *oriData, size_t dataLength, double realPrecision)
+{
+	size_t bucket = 0;
+	size_t slot;
+	size_t sampleCount = 0;
+	float neighbour = 0, absErr;
+	float *cursor;
+
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+
+	for (cursor = oriData + 2; cursor - oriData < dataLength; cursor += confparams_cpr->sampleDistance)
+	{
+		sampleCount++;
+
+		neighbour = cursor[-1];
+		absErr = fabs(neighbour - *cursor);
+
+		slot = (unsigned long)((absErr/realPrecision+1)/2);
+		if (slot >= confparams_cpr->maxRangeRadius)
+			slot = confparams_cpr->maxRangeRadius - 1;
+		histogram[slot]++;
+	}
+
+	/* find the first bucket at which the cumulative count passes the threshold */
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t cumulative = 0;
+	bucket = 0;
+	while (bucket < confparams_cpr->maxRangeRadius)
+	{
+		cumulative += histogram[bucket];
+		if (cumulative > threshold)
+			break;
+		bucket++;
+	}
+	if (bucket >= confparams_cpr->maxRangeRadius)
+		bucket = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int intervalCount = 2*(bucket+1);
+	unsigned int rounded = roundUpToPowerOf2(intervalCount);
+
+	free(histogram);
+	return (rounded < 32) ? 32 : rounded;
+}
+
+
+
+/*
+ * Predict and quantize one 1D block of float data.
+ *
+ * Every element is predicted by the previous reconstructed value (the first
+ * one by mean[0], which is set to block_ori_data[0]). Elements whose
+ * quantization code would fall outside exe_params->intvCapacity, or whose
+ * reconstruction misses the error bound, are stored verbatim.
+ *
+ * block_ori_data     : first element of the block
+ * mean               : mean[0] receives block_ori_data[0]
+ * dim_0              : not used by this function
+ * block_dim_0        : number of elements in the block
+ * realPrecision      : error bound
+ * type               : receives block_dim_0 quantization codes (0 marks an
+ *                      element stored verbatim)
+ * unpredictable_data : receives the verbatim values
+ * returns            : number of values written to unpredictable_data
+ */
+size_t SZ_compress_float_1D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t block_dim_0, double realPrecision, int * type, float * unpredictable_data){
+
+	mean[0] = block_ori_data[0];
+	/*
+	 * size_t, not unsigned short: a 16-bit counter wraps after 65535
+	 * unpredictable values, overwriting the start of unpredictable_data and
+	 * returning a wrong count for large blocks.
+	 */
+	size_t unpredictable_count = 0;
+
+	float curData;
+	double itvNum;
+	double diff;
+	float last_over_thres = mean[0];	/* reconstructed value of the previous element */
+	float pred1D;
+	for(size_t i=0; i<block_dim_0; i++){
+		curData = block_ori_data[i];
+
+		pred1D = last_over_thres;
+		diff = curData - pred1D;
+		itvNum = fabs(diff)/realPrecision + 1;
+		if (itvNum < exe_params->intvCapacity){
+			if (diff < 0) itvNum = -itvNum;
+			type[i] = (int) (itvNum/2) + exe_params->intvRadius;
+			last_over_thres = pred1D + 2 * (type[i] - exe_params->intvRadius) * realPrecision;
+			/* guarantee the compression error bound against machine-epsilon effects */
+			if(fabs(curData-last_over_thres)>realPrecision){
+				type[i] = 0;
+				last_over_thres = curData;
+				unpredictable_data[unpredictable_count ++] = curData;
+			}
+		}
+		else{
+			type[i] = 0;
+			unpredictable_data[unpredictable_count ++] = curData;
+			last_over_thres = curData;
+		}
+	}
+	return unpredictable_count;
+
+}
+
+size_t SZ_compress_float_2D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t block_dim_0, size_t block_dim_1, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data){
+
+	size_t dim0_offset = dim_1;
+	mean[0] = block_ori_data[0];
+
+	size_t unpredictable_count = 0;
+	size_t r1, r2;
+	r1 = block_dim_0;
+	r2 = block_dim_1;
+
+	float * cur_data_pos = block_ori_data;
+	float curData;
+	float pred1D, pred2D;
+	double itvNum;
+	double diff;
+	size_t i, j;
+	/* Process Row-0 data 0*/
+	curData = *cur_data_pos;
+	pred1D = mean[0];
+	diff = curData - pred1D;
+	itvNum = fabs(diff)/realPrecision + 1;
+	if (itvNum < exe_params->intvCapacity){
+		if (diff < 0) itvNum = -itvNum;
+		type[0] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[0] = pred1D + 2 * (type[0] - exe_params->intvRadius) * realPrecision;
+		//ganrantee comporession error against the case of machine-epsilon
+		if(fabs(curData-P1[0])>realPrecision){	
+			type[0] = 0;
+			P1[0] = curData;
+			unpredictable_data[unpredictable_count ++] = curData;
+		}		
+	}
+	else{
+		type[0] = 0;
+		P1[0] = curData;
+		unpredictable_data[unpredictable_count ++] = curData;
+	}
+
+	/* Process Row-0 data 1*/
+	curData = cur_data_pos[1];
+	pred1D = P1[0];
+	diff = curData - pred1D;
+	itvNum = fabs(diff)/realPrecision + 1;
+	if (itvNum < exe_params->intvCapacity){
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+		//ganrantee comporession error against the case of machine-epsilon
+		if(fabs(curData-P1[1])>realPrecision){	
+			type[1] = 0;
+			P1[1] = curData;	
+			unpredictable_data[unpredictable_count ++] = curData;
+		}		
+	}
+	else{
+		type[1] = 0;
+		P1[1] = curData;
+		unpredictable_data[unpredictable_count ++] = curData;
+	}
+
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		curData = cur_data_pos[j];
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = curData - pred1D;
+		itvNum = fabs(diff)/realPrecision + 1;
+		if (itvNum < exe_params->intvCapacity){
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			//ganrantee comporession error against the case of machine-epsilon
+			if(fabs(curData-P1[j])>realPrecision){	
+				type[j] = 0;
+				P1[j] = curData;	
+				unpredictable_data[unpredictable_count ++] = curData;
+			}			
+		}
+		else{
+			type[j] = 0;
+			P1[j] = curData;
+			unpredictable_data[unpredictable_count ++] = curData;
+		}
+	}
+	cur_data_pos += dim0_offset;
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		index = i*r2;
+		curData = *cur_data_pos;
+		pred1D = P1[0];
+		diff = curData - pred1D;
+		itvNum = fabs(diff)/realPrecision + 1;
+		if (itvNum < exe_params->intvCapacity){
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			//ganrantee comporession error against the case of machine-epsilon
+			if(fabs(curData-P0[0])>realPrecision){	
+				type[index] = 0;
+				P0[0] = curData;	
+				unpredictable_data[unpredictable_count ++] = curData;
+			}			
+		}
+		else{
+			type[index] = 0;
+			P0[0] = curData;
+			unpredictable_data[unpredictable_count ++] = curData;
+		}
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			curData = cur_data_pos[j];
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = curData - pred2D;
+			itvNum = fabs(diff)/realPrecision + 1;
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				
+				//ganrantee comporession error against the case of machine-epsilon
+				if(fabs(curData-P0[j])>realPrecision)
+				{	
+					type[index] = 0;
+					P0[j] = curData;	
+					unpredictable_data[unpredictable_count ++] = curData;
+				}				
+			}
+			else
+			{
+				type[index] = 0;
+				P0[j] = curData;
+				unpredictable_data[unpredictable_count ++] = curData;
+			}
+		}
+		cur_data_pos += dim0_offset;
+
+		float *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	return unpredictable_count;
+}
+
+/*The above code is for sz 1.4.13; the following code is for sz 2.0*/
+/**
+ * Samples a 1D float array to pick the quantization interval count and to
+ * find the densest value region around the sampled mean.
+ *
+ * @param oriData        input array of r1 floats
+ * @param r1             number of elements in oriData
+ * @param realPrecision  error bound; also the width of one histogram bin
+ * @param dense_pos      [out] mean + realPrecision * (index of the densest bin pair + 1 - radius)
+ * @param max_freq       [out] share of samples within realPrecision of the preceding element (0 if none)
+ * @param mean_freq      [out] share of samples in the densest pair of adjacent bins (0 if none)
+ * @return roundUpToPowerOf2() of the chosen interval count, raised to at least 32
+ */
+static unsigned int optimize_intervals_float_1D_with_freq_and_dense_pos(float *oriData, size_t r1, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq)
+{
+	float mean = 0.0;
+	size_t len = r1;
+	size_t mean_distance = (int) (sqrt(len));
+
+	/* Coarse mean: visit one element out of every sqrt(len). */
+	float * data_pos = oriData;
+	size_t mean_count = 0;
+	while(data_pos - oriData < len){
+		mean += *data_pos;
+		mean_count ++;
+		data_pos += mean_distance;
+	}
+	if(mean_count > 0) mean /= mean_count;
+
+	/* Histogram of (value - mean) in realPrecision-wide bins, shifted by radius. */
+	size_t range = 8192;
+	size_t radius = 4096;
+	size_t * freq_intervals = (size_t *) calloc(range, sizeof(size_t));
+
+	unsigned int maxRangeRadius = confparams_cpr->maxRangeRadius;
+	int sampleDistance = confparams_cpr->sampleDistance;
+	float predThreshold = confparams_cpr->predThreshold;
+
+	/* Histogram of prediction errors, indexed by quantization radius. */
+	size_t *intervals = (size_t*) calloc(maxRangeRadius, sizeof(size_t));
+	size_t radiusIndex;
+	float pred_value = 0, pred_err, mean_diff;
+	ptrdiff_t freq_index;
+	size_t freq_count = 0, sample_count = 0;
+
+	/* Sample every sampleDistance-th element, starting at index 1. */
+	for(data_pos = oriData + 1; data_pos - oriData < len; data_pos += sampleDistance){
+		pred_value = data_pos[-1];
+		pred_err = fabs(pred_value - *data_pos);
+		if(pred_err < realPrecision) freq_count ++;
+		radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+		if(radiusIndex>=maxRangeRadius) radiusIndex = maxRangeRadius - 1;
+		intervals[radiusIndex]++;
+
+		mean_diff = *data_pos - mean;
+		if(mean_diff > 0) freq_index = (ptrdiff_t)(mean_diff/realPrecision) + radius;
+		else freq_index = (ptrdiff_t)(mean_diff/realPrecision) - 1 + radius;
+		/* Differences outside the histogram are counted in its first or last bin. */
+		if(freq_index <= 0) freq_index = 0;
+		else if(freq_index >= range) freq_index = range - 1;
+		freq_intervals[freq_index] ++;
+		sample_count ++;
+	}
+	/* With no samples the ratio would be 0/0 (NaN); report 0 instead. */
+	*max_freq = (sample_count > 0) ? freq_count * 1.0/ sample_count : 0;
+
+	/* The smallest radius whose cumulative error count exceeds
+	 * predThreshold of the samples sets the interval count. */
+	size_t targetCount = sample_count*predThreshold;
+	size_t sum = 0;
+	size_t i;
+	for(i=0;i<maxRangeRadius;i++){
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=maxRangeRadius) i = maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32) powerOf2 = 32;
+
+	/* Find the pair of adjacent bins with the highest combined count. */
+	size_t max_sum = 0;
+	size_t max_index = 0;
+	for(size_t k=1; k<range-2; k++){
+		size_t pair_sum = freq_intervals[k] + freq_intervals[k+1];
+		if(pair_sum > max_sum){
+			max_sum = pair_sum;
+			max_index = k;
+		}
+	}
+	*dense_pos = mean + realPrecision * (ptrdiff_t)(max_index + 1 - radius);
+	*mean_freq = (sample_count > 0) ? max_sum * 1.0 / sample_count : 0;
+
+	free(freq_intervals);
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Samples a 2D float array (rows of r2 elements) to pick the quantization
+ * interval count and to find the densest value region around the sampled mean.
+ *
+ * @param oriData        input array of r1 * r2 floats
+ * @param r1             number of rows
+ * @param r2             row length, i.e. the distance to the element one row up
+ * @param realPrecision  error bound; also the width of one histogram bin
+ * @param dense_pos      [out] mean + realPrecision * (index of the densest bin pair + 1 - radius)
+ * @param max_freq       [out] share of samples predicted within realPrecision (0 if none)
+ * @param mean_freq      [out] share of samples in the densest pair of adjacent bins (0 if none)
+ * @return roundUpToPowerOf2() of the chosen interval count, raised to at least 32
+ */
+unsigned int optimize_intervals_float_2D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq)
+{
+	float mean = 0.0;
+	size_t len = r1 * r2;
+	size_t mean_distance = (int) (sqrt(len));
+
+	/* Coarse mean: visit one element out of every sqrt(len). */
+	float * data_pos = oriData;
+	size_t mean_count = 0;
+	while(data_pos - oriData < len){
+		mean += *data_pos;
+		mean_count ++;
+		data_pos += mean_distance;
+	}
+	if(mean_count > 0) mean /= mean_count;
+
+	/* Histogram of (value - mean) in realPrecision-wide bins, shifted by radius. */
+	size_t range = 8192;
+	size_t radius = 4096;
+	size_t * freq_intervals = (size_t *) calloc(range, sizeof(size_t));
+
+	unsigned int maxRangeRadius = confparams_cpr->maxRangeRadius;
+	int sampleDistance = confparams_cpr->sampleDistance;
+	float predThreshold = confparams_cpr->predThreshold;
+
+	/* Histogram of prediction errors, indexed by quantization radius. */
+	size_t *intervals = (size_t*) calloc(maxRangeRadius, sizeof(size_t));
+	size_t radiusIndex;
+	float pred_value = 0, pred_err, mean_diff;
+	ptrdiff_t freq_index;
+	size_t freq_count = 0, sample_count = 0;
+	size_t n1_count = 1, offset_count = sampleDistance - 1, offset_count_2 = 0;
+	/* Never sample column 0: there data_pos[-r2-1] would lie before oriData (same guard as in the loop). */
+	if(offset_count == 0) offset_count ++;
+	/* Start in the second row; offsets are cast to signed because negating the unsigned r2 relies on wrap-around. */
+	data_pos = oriData + r2 + offset_count;
+	while(data_pos - oriData < len){
+		pred_value = data_pos[-1] + data_pos[-(ptrdiff_t)r2] - data_pos[-(ptrdiff_t)r2 - 1];
+		pred_err = fabs(pred_value - *data_pos);
+		if(pred_err < realPrecision) freq_count ++;
+		radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+		if(radiusIndex>=maxRangeRadius) radiusIndex = maxRangeRadius - 1;
+		intervals[radiusIndex]++;
+
+		mean_diff = *data_pos - mean;
+		if(mean_diff > 0) freq_index = (ptrdiff_t)(mean_diff/realPrecision) + radius;
+		else freq_index = (ptrdiff_t)(mean_diff/realPrecision) - 1 + radius;
+		/* Differences outside the histogram are counted in its first or last bin. */
+		if(freq_index <= 0) freq_index = 0;
+		else if(freq_index >= range) freq_index = range - 1;
+		freq_intervals[freq_index] ++;
+		/* Advance along the row; past its end, jump to the next row with a shifted start column. */
+		offset_count += sampleDistance;
+		if(offset_count >= r2){
+			n1_count ++;
+			offset_count_2 = n1_count % sampleDistance;
+			data_pos += (r2 + sampleDistance - offset_count) + (sampleDistance - offset_count_2);
+			offset_count = (sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else data_pos += sampleDistance;
+		sample_count ++;
+	}
+	/* With no samples the ratio would be 0/0 (NaN); report 0 instead. */
+	*max_freq = (sample_count > 0) ? freq_count * 1.0/ sample_count : 0;
+
+	/* The smallest radius covering predThreshold of the samples sets the interval count. */
+	size_t targetCount = sample_count*predThreshold;
+	size_t sum = 0, i;
+	for(i=0;i<maxRangeRadius;i++){
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=maxRangeRadius) i = maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32) powerOf2 = 32;
+
+	/* Find the pair of adjacent bins with the highest combined count. */
+	size_t max_sum = 0, max_index = 0;
+	for(size_t k=1; k<range-2; k++){
+		size_t pair_sum = freq_intervals[k] + freq_intervals[k+1];
+		if(pair_sum > max_sum){
+			max_sum = pair_sum;
+			max_index = k;
+		}
+	}
+	*dense_pos = mean + realPrecision * (ptrdiff_t)(max_index + 1 - radius);
+	*mean_freq = (sample_count > 0) ? max_sum * 1.0 / sample_count : 0;
+
+	free(freq_intervals);
+	free(intervals);
+	return powerOf2;
+}
+
+// 2D:  modified for higher performance
+#define MIN(a, b) ((a) < (b) ? (a) : (b)) /* smaller of a and b; fully parenthesised, arguments may be evaluated twice */
+unsigned char * SZ_compress_float_2D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, float realPrecision, size_t * comp_size){
+
+	float recip_realPrecision = 1/realPrecision;
+	unsigned int quantization_intervals;
+	float sz_sample_correct_freq = -1;//0.5; //-1
+	float dense_pos;
+	float mean_flush_freq;
+	unsigned char use_mean = 0;
+
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_2D_with_freq_and_dense_pos(oriData, r1, r2, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+		if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	// calculate block dims
+	size_t num_x, num_y;
+	size_t block_size = 16;
+
+	SZ_COMPUTE_2D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_2D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+
+	size_t split_index_x, split_index_y;
+	size_t early_blockcount_x, early_blockcount_y;
+	size_t late_blockcount_x, late_blockcount_y;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+
+	size_t max_num_block_elements = early_blockcount_x * early_blockcount_y;
+	size_t num_blocks = num_x * num_y;
+	size_t num_elements = r1 * r2;
+
+	size_t dim0_offset = r2;	
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+	size_t total_unpred = 0;
+	size_t unpredictable_count;
+	float * data_pos = oriData;
+	int * type = result_type;
+	size_t offset_x, offset_y;
+	size_t current_blockcount_x, current_blockcount_y;
+
+	float * reg_params = (float *) malloc(num_blocks * 4 * sizeof(float));
+	float * reg_params_pos = reg_params;
+	// move regression part out
+	size_t params_offset_b = num_blocks;
+	size_t params_offset_c = 2*num_blocks;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+
+			data_pos = oriData + offset_x * dim0_offset + offset_y;
+
+			{
+				float * cur_data_pos = data_pos;
+				float fx = 0.0;
+				float fy = 0.0;
+				float f = 0;
+				float sum_x; 
+				float curData;
+				for(size_t i=0; i<current_blockcount_x; i++){
+					sum_x = 0;
+					for(size_t j=0; j<current_blockcount_y; j++){
+						curData = *cur_data_pos;
+						sum_x += curData;
+						fy += curData * j;
+						cur_data_pos ++;
+					}
+					fx += sum_x * i;
+					f += sum_x;
+					cur_data_pos += dim0_offset - current_blockcount_y;
+				}
+				float coeff = 1.0 / (current_blockcount_x * current_blockcount_y);
+				reg_params_pos[0] = (2 * fx / (current_blockcount_x - 1) - f) * 6 * coeff / (current_blockcount_x + 1);
+				reg_params_pos[params_offset_b] = (2 * fy / (current_blockcount_y - 1) - f) * 6 * coeff / (current_blockcount_y + 1);
+				reg_params_pos[params_offset_c] = f * coeff - ((current_blockcount_x - 1) * reg_params_pos[0] / 2 + (current_blockcount_y - 1) * reg_params_pos[params_offset_b] / 2);
+			}
+
+			reg_params_pos ++;
+		}
+	}
+
+	//Compress coefficient arrays
+	float precision_a, precision_b, precision_c;
+	float rel_param_err = 0.15/3;
+	precision_a = rel_param_err * realPrecision / late_blockcount_x;
+	precision_b = rel_param_err * realPrecision / late_blockcount_y;
+	precision_c = rel_param_err * realPrecision;
+
+	float mean = 0;
+	use_mean = 0;
+	if(use_mean){
+		// compute mean
+		float sum = 0.0;
+		size_t mean_count = 0;
+		for(size_t i=0; i<num_elements; i++){
+			if(fabsf(oriData[i] - dense_pos) < realPrecision){
+				sum += oriData[i];
+				mean_count ++;
+			}
+		}
+		if(mean_count > 0) mean = sum / mean_count;
+	}
+
+
+	float tmp_realPrecision = realPrecision;
+
+	// use two prediction buffers for higher performance
+	float * unpredictable_data = result_unpredictable_data;
+	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+	memset(indicator, 0, num_blocks * sizeof(unsigned char));
+	size_t reg_count = 0;
+	size_t strip_dim_0 = early_blockcount_x + 1;
+	size_t strip_dim_1 = r2 + 1;
+	size_t strip_dim0_offset = strip_dim_1;
+	unsigned char * indicator_pos = indicator;
+	size_t prediction_buffer_size = strip_dim_0 * strip_dim0_offset * sizeof(float);
+	float * prediction_buffer_1 = (float *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_1, 0, prediction_buffer_size);
+	float * prediction_buffer_2 = (float *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_2, 0, prediction_buffer_size);
+	float * cur_pb_buf = prediction_buffer_1;
+	float * next_pb_buf = prediction_buffer_2;
+	float * cur_pb_buf_pos;
+	float * next_pb_buf_pos;
+	int intvCapacity = quantization_intervals; //exe_params->intvCapacity;
+	int intvRadius = intvCapacity/2; //exe_params->intvRadius;
+	int use_reg = 0;
+
+	reg_params_pos = reg_params;
+	// compress the regression coefficients on the fly
+	float last_coeffcients[3] = {0.0};
+	int coeff_intvCapacity_sz = 65536;
+	int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+	int * coeff_type[3];
+	int * coeff_result_type = (int *) malloc(num_blocks*3*sizeof(int));
+	float * coeff_unpred_data[3];
+	float * coeff_unpredictable_data = (float *) malloc(num_blocks*3*sizeof(float));
+	float precision[3], recip_precision[3];
+	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c;
+	//compute the recip_precision
+	recip_precision[0] = 1/precision_a, recip_precision[1] = 1/precision_b, recip_precision[2] = 1/precision_c;	
+	
+	for(int i=0; i<3; i++){
+		coeff_type[i] = coeff_result_type + i * num_blocks;
+		coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
+	}
+	int coeff_index = 0;
+	unsigned int coeff_unpredictable_count[3] = {0};
+	float noise = realPrecision * 0.81;
+	if(use_mean){
+		type = result_type;
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			data_pos = oriData + offset_x * dim0_offset;
+
+			cur_pb_buf_pos = cur_pb_buf + strip_dim0_offset + 1;
+			next_pb_buf_pos = next_pb_buf + 1;
+			float * pb_pos = cur_pb_buf_pos;
+			float * next_pb_pos = next_pb_buf_pos;
+
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				
+				/*sampling: decide which predictor to use (regression or lorenzo)*/
+				{
+					float * cur_data_pos;
+					float curData;
+					float pred_reg, pred_sz;
+					float err_sz = 0.0, err_reg = 0.0;
+					// [1, 1] [3, 3] [5, 5] [7, 7] [9, 9]
+					// [1, 9] [3, 7]		[7, 3] [9, 1]
+					int bmi = 0;
+					int block_size = MIN(current_blockcount_x, current_blockcount_y);
+					for(int i=1; i<block_size; i++){
+						cur_data_pos = data_pos + i * dim0_offset + i;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c];							
+						err_sz += MIN(fabsf(pred_sz - curData) + noise, fabsf(mean - curData));
+						err_reg += fabsf(pred_reg - curData);
+
+						bmi = block_size - i;
+						cur_data_pos = data_pos + i*dim0_offset + bmi;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c];							
+						err_sz += MIN(fabsf(pred_sz - curData) + noise, fabsf(mean - curData));
+						err_reg += fabsf(pred_reg - curData);								
+					}
+					use_reg = (err_reg < err_sz);
+				}
+				if(use_reg)
+				{
+					{
+						/*predict coefficients in current block via previous reg_block*/
+						float cur_coeff;
+						float diff, itvNum;
+						for(int e=0; e<3; e++){
+							cur_coeff = reg_params_pos[e*num_blocks];
+							diff = cur_coeff - last_coeffcients[e];
+							itvNum = fabsf(diff)*recip_precision[e] + 1;
+							if (itvNum < coeff_intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+								last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabsf(cur_coeff - last_coeffcients[e])>precision[e]){	
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;	
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}					
+							}
+							else{
+								coeff_type[e][coeff_index] = 0;
+								last_coeffcients[e] = cur_coeff;
+								coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+							}
+						}
+						coeff_index ++;
+					}
+					float curData;
+					float pred;
+					float itvNum;
+					float diff;
+					size_t index = 0;
+					size_t block_unpredictable_count = 0;
+					float * cur_data_pos = data_pos;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabsf(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabsf(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabsf(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabsf(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						size_t ii = current_blockcount_x - 1;
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabsf(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabsf(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabsf(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabsf(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+
+							index ++;	
+							cur_data_pos ++;
+						}
+					} // end ii == -1
+					unpredictable_count = block_unpredictable_count;
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;					
+					reg_count ++;
+				}// end use_reg
+				else{
+					// use SZ
+					// SZ predication
+					unpredictable_count = 0;
+					float * cur_pb_pos = pb_pos;
+					float * cur_data_pos = data_pos;
+					float curData;
+					float pred2D;
+					float itvNum, diff;
+					size_t index = 0;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+							if(fabsf(curData - mean) <= realPrecision){
+								// adjust type[index] to intvRadius for coherence with freq in reg
+								type[index] = intvRadius;
+								*cur_pb_pos = mean;
+							}
+							else
+							{
+								pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+								diff = curData - pred2D;
+								itvNum = fabsf(diff)*recip_realPrecision + 1;
+								if (itvNum < intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									type[index] = (int) (itvNum/2) + intvRadius;
+									*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+									if(type[index] <= intvRadius) type[index] -= 1;
+									//ganrantee comporession error against the case of machine-epsilon
+									if(fabsf(curData - *cur_pb_pos)>tmp_realPrecision){	
+										type[index] = 0;
+										*cur_pb_pos = curData;	
+										unpredictable_data[unpredictable_count ++] = curData;
+									}					
+								}
+								else{
+									type[index] = 0;
+									*cur_pb_pos = curData;
+									unpredictable_data[unpredictable_count ++] = curData;
+								}
+							}
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+						cur_pb_pos += strip_dim0_offset - current_blockcount_y;
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						// ii == current_blockcount_x - 1
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+							if(fabsf(curData - mean) <= realPrecision){
+								// adjust type[index] to intvRadius for coherence with freq in reg
+								type[index] = intvRadius;
+								*cur_pb_pos = mean;
+							}
+							else
+							{
+								pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+								diff = curData - pred2D;
+								itvNum = fabsf(diff)*recip_realPrecision + 1;
+								if (itvNum < intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									type[index] = (int) (itvNum/2) + intvRadius;
+									*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+									if(type[index] <= intvRadius) type[index] -= 1;
+									//ganrantee comporession error against the case of machine-epsilon
+									if(fabsf(curData - *cur_pb_pos)>tmp_realPrecision){	
+										type[index] = 0;
+										*cur_pb_pos = curData;	
+										unpredictable_data[unpredictable_count ++] = curData;
+									}					
+								}
+								else{
+									type[index] = 0;
+									*cur_pb_pos = curData;
+									unpredictable_data[unpredictable_count ++] = curData;
+								}
+							}
+							next_pb_pos[jj] = *cur_pb_pos;
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+					}
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;
+					// change indicator
+					indicator_pos[j] = 1;
+				}// end SZ
+				reg_params_pos ++;
+				data_pos += current_blockcount_y;
+				pb_pos += current_blockcount_y;
+				next_pb_pos += current_blockcount_y;
+				type += current_blockcount_x * current_blockcount_y;
+			}// end j
+			indicator_pos += num_y;
+			float * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}// end i
+	}// end use mean
+	else{
+		type = result_type;
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			data_pos = oriData + offset_x * dim0_offset;
+
+			cur_pb_buf_pos = cur_pb_buf + strip_dim0_offset + 1;
+			next_pb_buf_pos = next_pb_buf + 1;
+			float * pb_pos = cur_pb_buf_pos;
+			float * next_pb_pos = next_pb_buf_pos;
+
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				/*sampling*/
+				{
+					// sample [2i + 1, 2i + 1] [2i + 1, bs - 2i]
+					float * cur_data_pos;
+					float curData;
+					float pred_reg, pred_sz;
+					float err_sz = 0.0, err_reg = 0.0;
+					// [1, 1] [3, 3] [5, 5] [7, 7] [9, 9]
+					// [1, 9] [3, 7]		[7, 3] [9, 1]
+					int bmi = 0;
+					int block_size = MIN(current_blockcount_x, current_blockcount_y);
+					for(int i=1; i<block_size; i++){
+						cur_data_pos = data_pos + i * dim0_offset + i;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c];							
+						err_sz += fabsf(pred_sz - curData) + noise;
+						err_reg += fabsf(pred_reg - curData);
+
+						bmi = block_size - i;
+						cur_data_pos = data_pos + i*dim0_offset + bmi;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c];							
+						err_sz += fabsf(pred_sz - curData) + noise;
+						err_reg += fabsf(pred_reg - curData);								
+					}
+					use_reg = (err_reg < err_sz);
+				}
+				if(use_reg)
+				{
+					{
+						/*predict coefficients in current block via previous reg_block*/
+						float cur_coeff;
+						float diff, itvNum;
+						for(int e=0; e<3; e++){
+							cur_coeff = reg_params_pos[e*num_blocks];
+							diff = cur_coeff - last_coeffcients[e];
+							itvNum = fabsf(diff)/precision[e] + 1;
+							if (itvNum < coeff_intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+								last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabsf(cur_coeff - last_coeffcients[e])>precision[e]){	
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;	
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}					
+							}
+							else{
+								coeff_type[e][coeff_index] = 0;
+								last_coeffcients[e] = cur_coeff;
+								coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+							}
+						}
+						coeff_index ++;
+					}
+					float curData;
+					float pred;
+					float itvNum;
+					float diff;
+					size_t index = 0;
+					size_t block_unpredictable_count = 0;
+					float * cur_data_pos = data_pos;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabsf(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabsf(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							// jj == current_blockcount_y - 1
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabsf(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabsf(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						size_t ii = current_blockcount_x - 1;
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabsf(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabsf(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							// jj == current_blockcount_y - 1
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabsf(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabsf(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+
+							index ++;	
+							cur_data_pos ++;
+						}
+					} // end ii == -1
+					unpredictable_count = block_unpredictable_count;
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;					
+					reg_count ++;
+				}// end use_reg
+				else{
+					// use SZ
+					// SZ predication
+					unpredictable_count = 0;
+					float * cur_pb_pos = pb_pos;
+					float * cur_data_pos = data_pos;
+					float curData;
+					float pred2D;
+					float itvNum, diff;
+					size_t index = 0;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+
+							pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+							diff = curData - pred2D;
+							itvNum = fabsf(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabsf(curData - *cur_pb_pos)>tmp_realPrecision){	
+									type[index] = 0;
+									*cur_pb_pos = curData;	
+									unpredictable_data[unpredictable_count ++] = curData;
+								}					
+							}
+							else{
+								type[index] = 0;
+								*cur_pb_pos = curData;
+								unpredictable_data[unpredictable_count ++] = curData;
+							}
+
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+						cur_pb_pos += strip_dim0_offset - current_blockcount_y;
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						// ii == current_blockcount_x - 1
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+
+							pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+							diff = curData - pred2D;
+							itvNum = fabsf(diff)*recip_realPrecision + 1;
+							if (itvNum < intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabsf(curData - *cur_pb_pos)>tmp_realPrecision){	
+									type[index] = 0;
+									*cur_pb_pos = curData;	
+									unpredictable_data[unpredictable_count ++] = curData;
+								}					
+							}
+							else{
+								type[index] = 0;
+								*cur_pb_pos = curData;
+								unpredictable_data[unpredictable_count ++] = curData;
+							}
+							next_pb_pos[jj] = *cur_pb_pos;
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+					}
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;
+					// change indicator
+					indicator_pos[j] = 1;
+				}// end SZ
+				reg_params_pos ++;
+				data_pos += current_blockcount_y;
+				pb_pos += current_blockcount_y;
+				next_pb_pos += current_blockcount_y;
+				type += current_blockcount_x * current_blockcount_y;
+			}// end j
+			indicator_pos += num_y;
+			float * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}// end i		
+	}
+	free(prediction_buffer_1);
+	free(prediction_buffer_2);
+
+	int stateNum = 2*quantization_intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	size_t nodeCount = 0;
+	size_t i = 0;
+	init(huffmanTree, result_type, num_elements);
+	for (i = 0; i < stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	// total size 										metadata		  # elements   real precision		intervals	nodeCount		huffman 	 	block index 						unpredicatable count						mean 					 	unpred size 				elements
+	unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(float) + sizeof(int) + sizeof(int) + 5*treeByteSize + 3*num_blocks*sizeof(int) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(float) + total_unpred * sizeof(float) + num_elements * sizeof(int), 1);
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	result_pos += meta_data_offset;
+
+	sizeToBytes(result_pos, num_elements);
+	result_pos += exe_params->SZ_SIZE_TYPE;
+	
+	intToBytes_bigEndian(result_pos, block_size);
+	result_pos += sizeof(int);
+	floatToBytes(result_pos, realPrecision);
+	result_pos += sizeof(float);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += sizeof(int);
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+	free(treeBytes);
+
+	memcpy(result_pos, &use_mean, sizeof(unsigned char));
+	result_pos += sizeof(unsigned char);
+	memcpy(result_pos, &mean, sizeof(float));
+	result_pos += sizeof(float);
+
+	size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+	result_pos += indicator_size;
+	
+	//convert the lead/mid/resi to byte stream 	
+	if(reg_count>0){
+		for(int e=0; e<3; e++){
+			int stateNum = 2*coeff_intvCapacity_sz;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+			size_t nodeCount = 0;
+			init(huffmanTree, coeff_type[e], reg_count);
+			size_t i = 0;
+			for (i = 0; i < huffmanTree->stateNum; i++)
+				if (huffmanTree->code[i]) nodeCount++; 
+			nodeCount = nodeCount*2-1;
+			unsigned char *treeBytes;
+			unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+			floatToBytes(result_pos, precision[e]);
+			result_pos += sizeof(float);
+			intToBytes_bigEndian(result_pos, coeff_intvRadius);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, treeByteSize);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, nodeCount);
+			result_pos += sizeof(int);
+			memcpy(result_pos, treeBytes, treeByteSize);		
+			result_pos += treeByteSize;
+			free(treeBytes);
+			size_t typeArray_size = 0;
+			encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+			sizeToBytes(result_pos, typeArray_size);
+			result_pos += sizeof(size_t) + typeArray_size;
+			intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+			result_pos += sizeof(int);
+			memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(float));
+			result_pos += coeff_unpredictable_count[e]*sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	free(coeff_result_type);
+	free(coeff_unpredictable_data);
+
+	//record the number of unpredictable data and also store them
+	memcpy(result_pos, &total_unpred, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(float));
+	result_pos += total_unpred * sizeof(float);
+	size_t typeArray_size = 0;
+	encode(huffmanTree, result_type, num_elements, result_pos, &typeArray_size);
+	result_pos += typeArray_size;
+	
+#ifdef HAVE_WRITESTATS
+	writeHuffmanInfo(treeByteSize, typeArray_size, num_elements*sizeof(float), nodeCount);
+	writeBlockInfo(use_mean, block_size, reg_count, num_blocks);
+	writeUnpredictDataCounts(total_unpred, num_elements);
+#endif	
+
+	size_t totalEncodeSize = result_pos - result;
+	free(indicator);
+	free(result_unpredictable_data);
+	free(result_type);
+	free(reg_params);
+	
+	SZ_ReleaseHuffman(huffmanTree);
+	*comp_size = totalEncodeSize;
+
+	return result;
+}
+
+
+
+/*
+ * Samples a 3D float array (r1 x r2 x r3, r3 varying fastest) to choose a quantization
+ * interval count and to locate the value around which the data is densest.
+ *   realPrecision  : scale unit of both histograms; expected to be > 0.
+ *   dense_pos (out): boundary shared by the two adjacent realPrecision-wide bins, laid
+ *                    out around the sampled mean, that together hold the most samples.
+ *   max_freq  (out): share of samples whose 7-neighbour prediction error < realPrecision.
+ *   mean_freq (out): share of samples inside that densest bin pair.
+ * Both shares are 0 when no point could be sampled. Returns roundUpToPowerOf2() of the
+ * chosen count, raised to at least 32 (plain 32 if the histograms cannot be allocated).
+ */
+unsigned int optimize_intervals_float_3D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq)
+{
+	float mean = 0.0;
+	size_t len = r1 * r2 * r3;
+	size_t mean_distance = (int) (sqrt(len));
+	size_t offset_count = 0;
+	size_t offset_count_2 = 0;
+	size_t mean_count = 0;
+	size_t mean_pos = 0;
+	// Pass 1: estimate the mean from a sparse sample (stride of about sqrt(len)).
+	while(mean_pos < len){
+		size_t backoff = 0; // one step back each time the accumulated stride reaches r3 / r2*r3
+		mean += oriData[mean_pos];
+		mean_count ++;
+		offset_count += mean_distance;
+		offset_count_2 += mean_distance;
+		if(offset_count >= r3){ offset_count = 0; backoff ++; }
+		if(offset_count_2 >= r2 * r3){ offset_count_2 = 0; backoff ++; }
+		// Advance by at least one element: with a stride of 1 or 2 the back-offs could cancel the whole step, so the scan would stall or step backwards.
+		mean_pos += (mean_distance > backoff) ? mean_distance - backoff : 1;
+	}
+	if(mean_count > 0) mean /= mean_count;
+
+	size_t range = 8192;
+	size_t radius = 4096;
+	unsigned int maxRangeRadius = confparams_cpr->maxRangeRadius;
+	int sampleDistance = confparams_cpr->sampleDistance;
+	float predThreshold = confparams_cpr->predThreshold;
+
+	// Zero-initialised histograms: value offsets from the mean, and prediction-error radii.
+	size_t * freq_intervals = (size_t *) calloc(range, sizeof(size_t));
+	size_t * intervals = (size_t *) calloc(maxRangeRadius, sizeof(size_t));
+	if(freq_intervals == NULL || intervals == NULL){
+		free(freq_intervals);
+		free(intervals);
+		*dense_pos = mean;
+		*max_freq = *mean_freq = 0;
+		return 32;
+	}
+
+	size_t i;
+	size_t r23=r2*r3;
+	float pred_value = 0, pred_err;
+	double scaled; // a sampled quantity expressed in units of realPrecision
+	size_t freq_count = 0;
+	size_t sample_count = 0;
+
+	offset_count = confparams_cpr->sampleDistance - 2; // count r3 offset
+	float * data_pos = oriData + r23 + r3 + offset_count;
+	size_t n1_count = 1, n2_count = 1; // count i,j sum
+
+	// Pass 2: histogram the prediction error and the offset from the mean at each sample point.
+	while(data_pos - oriData < len){
+		pred_value = data_pos[-1] + data_pos[-r3] + data_pos[-r23] - data_pos[-1-r23] - data_pos[-r3-1] - data_pos[-r3-r23] + data_pos[-r3-r23-1];
+		pred_err = fabs(pred_value - *data_pos);
+		if(pred_err < realPrecision) freq_count ++;
+		// Clamp before converting so NaN, infinity or a huge ratio never reaches the integer cast.
+		scaled = (pred_err/realPrecision+1)/2;
+		size_t radiusIndex = (scaled < maxRangeRadius) ? (size_t) scaled : maxRangeRadius - 1;
+		intervals[radiusIndex]++;
+
+		float mean_diff = *data_pos - mean;
+		scaled = mean_diff/realPrecision;
+		if(scaled >= (double) radius) freq_intervals[range - 1] ++;
+		else if(!(scaled > -(double) radius)) freq_intervals[0] ++; // also catches NaN
+		else{
+			// |scaled| < radius here, so the cast is defined and the index lies in [0, range - 1].
+			freq_intervals[(ptrdiff_t) scaled + (ptrdiff_t) radius - ((mean_diff > 0) ? 0 : 1)] ++;
+		}
+		offset_count += sampleDistance;
+		if(offset_count >= r3){
+			n2_count ++;
+			if(n2_count == r2){
+				n1_count ++;
+				n2_count = 1;
+				data_pos += r3;
+			}
+			offset_count_2 = (n1_count + n2_count) % sampleDistance;
+			data_pos += (r3 + sampleDistance - offset_count) + (sampleDistance - offset_count_2);
+			offset_count = (sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else data_pos += sampleDistance;
+		sample_count ++;
+	}
+	// Very small volumes yield no sample point; report 0 rather than the NaN of 0/0.
+	*max_freq = (sample_count > 0) ? freq_count * 1.0/ sample_count : 0;
+
+	//compute the appropriate number
+	size_t targetCount = sample_count*predThreshold;
+	size_t sum = 0;
+	for(i=0;i<maxRangeRadius;i++){
+		sum += intervals[i];
+		if(sum>targetCount) break;
+	}
+	if(i>=maxRangeRadius) i = maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32) powerOf2 = 32;
+
+	// collect frequency: find the pair of adjacent bins holding the most samples
+	size_t max_sum = 0;
+	size_t max_index = 0;
+	for(size_t bin=1; bin<range-2; bin++){
+		size_t tmp_sum = freq_intervals[bin] + freq_intervals[bin + 1];
+		if(tmp_sum > max_sum){
+			max_sum = tmp_sum;
+			max_index = bin;
+		}
+	}
+	*dense_pos = mean + realPrecision * (ptrdiff_t)(max_index + 1 - radius);
+	*mean_freq = (sample_count > 0) ? max_sum * 1.0 / sample_count : 0;
+
+	free(freq_intervals);
+	free(intervals);
+	return powerOf2;
+}
+
+
+// 3D:  modified for higher performance
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, float realPrecision, size_t * comp_size){
+
+#ifdef HAVE_TIMECMPR	
+	float* decData = NULL;
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (float*)(multisteps->hist_data);
+#endif
+
+	float recip_realPrecision = 1/realPrecision;
+	//printf("recip_realPrecision = %.20G\n", recip_realPrecision);
+
+	unsigned int quantization_intervals;
+	float sz_sample_correct_freq = -1;//0.5; //-1
+	float dense_pos;
+	float mean_flush_freq;
+	unsigned char use_mean = 0;
+
+	// calculate block dims
+	size_t num_x, num_y, num_z;
+	size_t block_size = 6;
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r3, num_z, block_size);
+
+	//printf("num_x=%zu, num_y=%zu, num_z=%zu\n", num_x, num_y, num_z);
+
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t max_num_block_elements = early_blockcount_x * early_blockcount_y * early_blockcount_z;
+	size_t num_blocks = num_x * num_y * num_z;
+	size_t num_elements = r1 * r2 * r3;
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;	
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	memset(result_type, 0, num_elements*sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+	size_t total_unpred = 0;
+	size_t unpredictable_count;
+	size_t max_unpred_count = 0;
+	float * data_pos = oriData;
+	int * type = result_type;
+	size_t type_offset;
+	size_t offset_x, offset_y, offset_z;
+	size_t current_blockcount_x, current_blockcount_y, current_blockcount_z;
+
+	float * reg_params = (float *) malloc(num_blocks * 4 * sizeof(float));
+	float * reg_params_pos = reg_params;
+	// move regression part out
+	size_t params_offset_b = num_blocks;
+	size_t params_offset_c = 2*num_blocks;
+	size_t params_offset_d = 3*num_blocks;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			for(size_t k=0; k<num_z; k++){
+				current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+				offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+	
+				data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+				/*Calculate regression coefficients*/
+				{
+					float * cur_data_pos = data_pos;
+					float fx = 0.0;
+					float fy = 0.0;
+					float fz = 0.0;
+					float f = 0;
+					float sum_x, sum_y; 
+					float curData;
+					for(size_t i=0; i<current_blockcount_x; i++){
+						sum_x = 0;
+						for(size_t j=0; j<current_blockcount_y; j++){
+							sum_y = 0;
+							for(size_t k=0; k<current_blockcount_z; k++){
+								curData = *cur_data_pos;
+								// f += curData;
+								// fx += curData * i;
+								// fy += curData * j;
+								// fz += curData * k;
+								sum_y += curData;
+								fz += curData * k;
+								cur_data_pos ++;
+							}
+							fy += sum_y * j;
+							sum_x += sum_y;
+							cur_data_pos += dim1_offset - current_blockcount_z;
+						}
+						fx += sum_x * i;
+						f += sum_x;
+						cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+					}
+					float coeff = 1.0 / (current_blockcount_x * current_blockcount_y * current_blockcount_z);
+					reg_params_pos[0] = (2 * fx / (current_blockcount_x - 1) - f) * 6 * coeff / (current_blockcount_x + 1);
+					reg_params_pos[params_offset_b] = (2 * fy / (current_blockcount_y - 1) - f) * 6 * coeff / (current_blockcount_y + 1);
+					reg_params_pos[params_offset_c] = (2 * fz / (current_blockcount_z - 1) - f) * 6 * coeff / (current_blockcount_z + 1);
+					reg_params_pos[params_offset_d] = f * coeff - ((current_blockcount_x - 1) * reg_params_pos[0] / 2 + (current_blockcount_y - 1) * reg_params_pos[params_offset_b] / 2 + (current_blockcount_z - 1) * reg_params_pos[params_offset_c] / 2);
+				}
+				reg_params_pos ++;
+			}
+		}
+	}
+	
+	//Compress coefficient arrays
+	float precision_a, precision_b, precision_c, precision_d;
+	float rel_param_err = 0.025;
+	precision_a = rel_param_err * realPrecision / late_blockcount_x;
+	precision_b = rel_param_err * realPrecision / late_blockcount_y;
+	precision_c = rel_param_err * realPrecision / late_blockcount_z;
+	precision_d = rel_param_err * realPrecision;
+
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_3D_with_freq_and_dense_pos(oriData, r1, r2, r3, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+		if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	float mean = 0;
+	if(use_mean){
+		// compute mean
+		float sum = 0.0;
+		size_t mean_count = 0;
+		for(size_t i=0; i<num_elements; i++){
+			if(fabsf(oriData[i] - dense_pos) < realPrecision){
+				sum += oriData[i];
+				mean_count ++;
+			}
+		}
+		if(mean_count > 0) mean = sum / mean_count;
+	}
+
+	float tmp_realPrecision = realPrecision;
+
+	// use two prediction buffers for higher performance
+	float * unpredictable_data = result_unpredictable_data;
+	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+	memset(indicator, 0, num_blocks * sizeof(unsigned char));
+	size_t reg_count = 0;
+	size_t strip_dim_0 = early_blockcount_x + 1;
+	size_t strip_dim_1 = r2 + 1;
+	size_t strip_dim_2 = r3 + 1;
+	size_t strip_dim0_offset = strip_dim_1 * strip_dim_2;
+	size_t strip_dim1_offset = strip_dim_2;
+	unsigned char * indicator_pos = indicator;
+
+	size_t prediction_buffer_size = strip_dim_0 * strip_dim0_offset * sizeof(float);
+	float * prediction_buffer_1 = (float *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_1, 0, prediction_buffer_size);
+	float * prediction_buffer_2 = (float *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_2, 0, prediction_buffer_size);
+	float * cur_pb_buf = prediction_buffer_1;
+	float * next_pb_buf = prediction_buffer_2;
+	float * cur_pb_buf_pos;
+	float * next_pb_buf_pos;
+	int intvCapacity = quantization_intervals;// exe_params->intvCapacity;
+	int intvRadius = intvCapacity/2; //exe_params->intvRadius;	
+	int use_reg = 0;
+	float noise = realPrecision * 1.22;
+
+	reg_params_pos = reg_params;
+	// compress the regression coefficients on the fly
+	float last_coeffcients[4] = {0.0};
+	int coeff_intvCapacity_sz = 65536;
+	int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+	int * coeff_type[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+	float * coeff_unpred_data[4];
+	float * coeff_unpredictable_data = (float *) malloc(num_blocks*4*sizeof(float));
+	float precision[4], recip_precision[4];
+	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c, precision[3] = precision_d;
+	recip_precision[0] = 1/precision_a, recip_precision[1] = 1/precision_b, recip_precision[2] = 1/precision_c, recip_precision[3] = 1/precision_d;
+	
+	for(int i=0; i<4; i++){
+		coeff_type[i] = coeff_result_type + i * num_blocks;
+		coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
+	}
+	int coeff_index = 0;
+	unsigned int coeff_unpredictable_count[4] = {0};
+
+	if(use_mean){
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset;
+				type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset;
+				type = result_type + type_offset;
+
+				// prediction buffer is (current_block_count_x + 1) * (current_block_count_y + 1) * (current_block_count_z + 1)
+				cur_pb_buf_pos = cur_pb_buf + offset_y * strip_dim1_offset + strip_dim0_offset + strip_dim1_offset + 1;
+				next_pb_buf_pos = next_pb_buf + offset_y * strip_dim1_offset + strip_dim1_offset + 1;
+
+				size_t current_blockcount_z;
+				float * pb_pos = cur_pb_buf_pos;
+				float * next_pb_pos = next_pb_buf_pos;
+				size_t strip_unpredictable_count = 0;
+				for(size_t k=0; k<num_z; k++){
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+#ifdef HAVE_TIMECMPR
+					size_t offset_z = 0;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					size_t block_offset = offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+#endif
+					/*sampling and decide which predictor*/
+					{
+						// sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+						float * cur_data_pos;
+						float curData;
+						float pred_reg, pred_sz;
+						float err_sz = 0.0, err_reg = 0.0;
+						int bmi = 0;
+						int block_size = MIN(current_blockcount_x, (MIN(current_blockcount_y, current_blockcount_z)));
+						for(int i=1; i<block_size; i++){
+							cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabsf(pred_sz - curData) + noise, fabsf(mean - curData));
+							err_reg += fabsf(pred_reg - curData);
+
+							bmi = block_size - i;
+							cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabsf(pred_sz - curData) + noise, fabsf(mean - curData));
+							err_reg += fabsf(pred_reg - curData);								
+
+							cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabsf(pred_sz - curData) + noise, fabsf(mean - curData));
+							err_reg += fabsf(pred_reg - curData);								
+
+							cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabsf(pred_sz - curData) + noise, fabsf(mean - curData));
+							err_reg += fabsf(pred_reg - curData);
+						}
+						use_reg = (err_reg < err_sz);
+					}
+					if(use_reg){
+						{
+							/*predict coefficients in current block via previous reg_block*/
+							float cur_coeff;
+							float diff, itvNum;
+							for(int e=0; e<4; e++){
+								cur_coeff = reg_params_pos[e*num_blocks];
+								diff = cur_coeff - last_coeffcients[e];
+								itvNum = fabsf(diff)*recip_precision[e] + 1;
+								if (itvNum < coeff_intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+									last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+									//ganrantee comporession error against the case of machine-epsilon
+									if(fabsf(cur_coeff - last_coeffcients[e])>precision[e]){	
+										coeff_type[e][coeff_index] = 0;
+										last_coeffcients[e] = cur_coeff;	
+										coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+									}					
+								}
+								else{
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}
+							}
+							coeff_index ++;
+						}
+						float curData;
+						float pred;
+						float itvNum;
+						float diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						float * cur_data_pos = data_pos;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabsf(diff)*recip_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//ganrantee comporession error against the case of machine-epsilon
+										if(fabsf(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+									
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = pred;
+#endif									
+									
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							size_t ii = current_blockcount_x - 1;
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabsf(diff)*recip_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+										//guarantee compression error bound against the case of machine-epsilon
+										if(fabsf(curData - pred)>realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = pred;
+#endif									
+
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									// assign value to next prediction buffer
+									next_pb_pos[jj * strip_dim1_offset + kk] = pred;
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						unpredictable_count = block_unpredictable_count;
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						
+						reg_count ++;
+					}
+					else{
+						// use SZ
+						// SZ prediction
+						unpredictable_count = 0;
+						float * cur_pb_pos = pb_pos;
+						float * cur_data_pos = data_pos;
+						float curData;
+						float pred3D;
+						float itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									if(fabsf(curData - mean) <= realPrecision){
+										// adjust type[index] to intvRadius for coherence with freq in reg
+										type[index] = intvRadius;
+										*cur_pb_pos = mean;
+									}
+									else
+									{
+										pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+												 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+										diff = curData - pred3D;
+										itvNum = fabsf(diff)*recip_realPrecision + 1;
+										if (itvNum < intvCapacity_sz){
+											if (diff < 0) itvNum = -itvNum;
+											type[index] = (int) (itvNum/2) + intvRadius;
+											*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
+											if(type[index] <= intvRadius) type[index] -= 1;
+											//guarantee compression error bound against the case of machine-epsilon
+											if(fabsf(curData - *cur_pb_pos)>realPrecision){	
+												type[index] = 0;
+												*cur_pb_pos = curData;	
+												unpredictable_data[unpredictable_count ++] = curData;
+											}					
+										}
+										else{
+											type[index] = 0;
+											*cur_pb_pos = curData;
+											unpredictable_data[unpredictable_count ++] = curData;
+										}
+									}
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = *cur_pb_pos;
+#endif																		
+									
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_pb_pos += strip_dim0_offset - current_blockcount_y * strip_dim1_offset;
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									if(fabsf(curData - mean) <= realPrecision){
+										// adjust type[index] to intvRadius for coherence with freq in reg
+										type[index] = intvRadius;
+										*cur_pb_pos = mean;
+									}
+									else
+									{
+										pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+												 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+										diff = curData - pred3D;
+										itvNum = fabsf(diff)*recip_realPrecision + 1;
+										if (itvNum < intvCapacity_sz){
+											if (diff < 0) itvNum = -itvNum;
+											type[index] = (int) (itvNum/2) + intvRadius;
+											*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
+											if(type[index] <= intvRadius) type[index] -= 1;
+											//guarantee compression error bound against the case of machine-epsilon
+											if(fabsf(curData - *cur_pb_pos)>realPrecision){	
+												type[index] = 0;
+												*cur_pb_pos = curData;	
+												unpredictable_data[unpredictable_count ++] = curData;
+											}					
+										}
+										else{
+											type[index] = 0;
+											*cur_pb_pos = curData;
+											unpredictable_data[unpredictable_count ++] = curData;
+										}
+									}
+#ifdef HAVE_TIMECMPR
+									size_t ii = current_blockcount_x - 1;
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = *cur_pb_pos;
+#endif																		
+									
+									next_pb_pos[jj * strip_dim1_offset + kk] = *cur_pb_pos;
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						// change indicator
+						indicator_pos[k] = 1;
+					}// end SZ
+					
+					reg_params_pos ++;
+					data_pos += current_blockcount_z;
+					pb_pos += current_blockcount_z;
+					next_pb_pos += current_blockcount_z;
+					type += current_blockcount_x * current_blockcount_y * current_blockcount_z;
+
+				} // end k
+
+				if(strip_unpredictable_count > max_unpred_count){
+					max_unpred_count = strip_unpredictable_count;
+				}
+				total_unpred += strip_unpredictable_count;
+				indicator_pos += num_z;
+			}// end j
+			float * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}// end i
+	}
+	else{
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset;
+				// copy bottom plane from plane buffer
+				// memcpy(prediction_buffer, bottom_buffer + offset_y * strip_dim1_offset, (current_blockcount_y + 1) * strip_dim1_offset * sizeof(float));
+				type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset;
+				type = result_type + type_offset;
+
+				// prediction buffer is (current_block_count_x + 1) * (current_block_count_y + 1) * (current_block_count_z + 1)
+				cur_pb_buf_pos = cur_pb_buf + offset_y * strip_dim1_offset + strip_dim0_offset + strip_dim1_offset + 1;
+				next_pb_buf_pos = next_pb_buf + offset_y * strip_dim1_offset + strip_dim1_offset + 1;
+
+				size_t current_blockcount_z;
+				float * pb_pos = cur_pb_buf_pos;
+				float * next_pb_pos = next_pb_buf_pos;
+				size_t strip_unpredictable_count = 0;
+				for(size_t k=0; k<num_z; k++){
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+#ifdef HAVE_TIMECMPR
+					size_t offset_z = 0;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					size_t block_offset = offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+#endif														
+					/*sampling*/
+					{
+						// sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+						float * cur_data_pos;
+						float curData;
+						float pred_reg, pred_sz;
+						float err_sz = 0.0, err_reg = 0.0;
+						int bmi = 0;
+						int block_size = MIN(current_blockcount_x, (MIN(current_blockcount_y, current_blockcount_z)));
+						for(int i=1; i<block_size; i++){
+							cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+							err_sz += fabsf(pred_sz - curData) + noise;
+							err_reg += fabsf(pred_reg - curData);
+
+							bmi = block_size - i;
+							cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += fabsf(pred_sz - curData) + noise;
+							err_reg += fabsf(pred_reg - curData);								
+
+							cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+							err_sz += fabsf(pred_sz - curData) + noise;
+							err_reg += fabsf(pred_reg - curData);								
+
+							cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += fabsf(pred_sz - curData) + noise;
+							err_reg += fabsf(pred_reg - curData);
+						}
+						use_reg = (err_reg < err_sz);
+
+					}
+					if(use_reg)
+					{
+						{
+							/*predict coefficients in current block via previous reg_block*/
+							float cur_coeff;
+							float diff, itvNum;
+							for(int e=0; e<4; e++){
+								cur_coeff = reg_params_pos[e*num_blocks];
+								diff = cur_coeff - last_coeffcients[e];
+								itvNum = fabsf(diff)/precision[e] + 1;
+								if (itvNum < coeff_intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+									last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+									//guarantee compression error bound against the case of machine-epsilon
+									if(fabsf(cur_coeff - last_coeffcients[e])>precision[e]){	
+										coeff_type[e][coeff_index] = 0;
+										last_coeffcients[e] = cur_coeff;	
+										coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+									}					
+								}
+								else{
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}
+							}
+							coeff_index ++;
+						}
+						float curData;
+						float pred;
+						float itvNum;
+						float diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						float * cur_data_pos = data_pos;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabsf(diff)*recip_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+										//guarantee compression error bound against the case of machine-epsilon
+										if(fabsf(curData - pred)>realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = pred;
+#endif																		
+
+
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							size_t ii = current_blockcount_x - 1;
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabsf(diff)*recip_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+										//guarantee compression error bound against the case of machine-epsilon
+										if(fabsf(curData - pred)>realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+									
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = pred;
+#endif																											
+
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									// assign value to next prediction buffer
+									next_pb_pos[jj * strip_dim1_offset + kk] = pred;
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						unpredictable_count = block_unpredictable_count;
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;						
+						reg_count ++;
+					}
+					else{
+						// use SZ
+						// SZ prediction
+						unpredictable_count = 0;
+						float * cur_pb_pos = pb_pos;
+						float * cur_data_pos = data_pos;
+						float curData;
+						float pred3D;
+						float itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+											 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+									diff = curData - pred3D;
+									itvNum = fabsf(diff)*recip_realPrecision + 1;
+									if (itvNum < intvCapacity_sz){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
+										//guarantee compression error bound against the case of machine-epsilon
+										if(fabsf(curData - *cur_pb_pos)>realPrecision){	
+											type[index] = 0;
+											*cur_pb_pos = curData;	
+											unpredictable_data[unpredictable_count ++] = curData;
+										}					
+									}
+									else{
+										type[index] = 0;
+										*cur_pb_pos = curData;
+										unpredictable_data[unpredictable_count ++] = curData;
+									}
+									
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = *cur_pb_pos;
+#endif																											
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_pb_pos += strip_dim0_offset - current_blockcount_y * strip_dim1_offset;
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+											 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+									diff = curData - pred3D;
+									itvNum = fabsf(diff)*recip_realPrecision + 1;
+									if (itvNum < intvCapacity_sz){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
+										//guarantee compression error bound against the case of machine-epsilon
+										if(fabsf(curData - *cur_pb_pos)>realPrecision){	
+											type[index] = 0;
+											*cur_pb_pos = curData;	
+											unpredictable_data[unpredictable_count ++] = curData;
+										}					
+									}
+									else{
+										type[index] = 0;
+										*cur_pb_pos = curData;
+										unpredictable_data[unpredictable_count ++] = curData;
+									}
+									
+#ifdef HAVE_TIMECMPR
+									size_t ii = current_blockcount_x - 1;
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = *cur_pb_pos;
+#endif																											
+									
+									// assign value to next prediction buffer
+									next_pb_pos[jj * strip_dim1_offset + kk] = *cur_pb_pos;
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						// change indicator
+						indicator_pos[k] = 1;
+					}// end SZ
+					
+					reg_params_pos ++;
+					data_pos += current_blockcount_z;
+					pb_pos += current_blockcount_z;
+					next_pb_pos += current_blockcount_z;
+					type += current_blockcount_x * current_blockcount_y * current_blockcount_z;
+
+				}
+
+				if(strip_unpredictable_count > max_unpred_count){
+					max_unpred_count = strip_unpredictable_count;
+				}
+				total_unpred += strip_unpredictable_count;
+				indicator_pos += num_z;
+			}
+			float * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}
+	}
+
+	free(prediction_buffer_1);
+	free(prediction_buffer_2);
+
+	int stateNum = 2*quantization_intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	size_t nodeCount = 0;
+	init(huffmanTree, result_type, num_elements);
+	size_t i = 0;
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	// total size 										metadata		  # elements     real precision		intervals	nodeCount		huffman 	 	block index 						unpredicatable count						mean 					 	unpred size 				elements
+	unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(float) + sizeof(int) + sizeof(int) + 5*treeByteSize + 4*num_blocks*sizeof(int) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(float) + total_unpred * sizeof(float) + num_elements * sizeof(int), 1);
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	
+	result_pos += meta_data_offset;
+	
+	sizeToBytes(result_pos,num_elements); //SZ_SIZE_TYPE: 4 or 8
+	result_pos += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(result_pos, block_size);
+	result_pos += sizeof(int);
+	floatToBytes(result_pos, realPrecision);
+	result_pos += sizeof(float);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += sizeof(int);
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+	free(treeBytes);
+
+	memcpy(result_pos, &use_mean, sizeof(unsigned char));
+	result_pos += sizeof(unsigned char);
+	memcpy(result_pos, &mean, sizeof(float));
+	result_pos += sizeof(float);
+		
+	size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+	result_pos += indicator_size;
+	
+	//convert the lead/mid/resi to byte stream
+	
+	//printf("reg_count = %d, num_blocks = %d\n", reg_count, num_blocks);
+	if(reg_count > 0){
+		for(int e=0; e<4; e++){
+			int stateNum = 2*coeff_intvCapacity_sz;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+			size_t nodeCount = 0;
+			init(huffmanTree, coeff_type[e], reg_count);
+			size_t i = 0;
+			for (i = 0; i < huffmanTree->stateNum; i++)
+				if (huffmanTree->code[i]) nodeCount++; 
+			nodeCount = nodeCount*2-1;
+			unsigned char *treeBytes;
+			unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+			floatToBytes(result_pos, precision[e]);
+			result_pos += sizeof(float);
+			intToBytes_bigEndian(result_pos, coeff_intvRadius);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, treeByteSize);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, nodeCount);
+			result_pos += sizeof(int);
+			memcpy(result_pos, treeBytes, treeByteSize);		
+			result_pos += treeByteSize;
+			free(treeBytes);
+			size_t typeArray_size = 0;
+			encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+			sizeToBytes(result_pos, typeArray_size);
+			result_pos += sizeof(size_t) + typeArray_size;
+			intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+			result_pos += sizeof(int);
+			memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(float));
+			result_pos += coeff_unpredictable_count[e]*sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	free(coeff_result_type);
+	free(coeff_unpredictable_data);
+	
+	//record the number of unpredictable data and also store them
+	memcpy(result_pos, &total_unpred, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(float));
+	result_pos += total_unpred * sizeof(float);
+	size_t typeArray_size = 0;
+	encode(huffmanTree, result_type, num_elements, result_pos, &typeArray_size);
+	result_pos += typeArray_size;
+	size_t totalEncodeSize = result_pos - result;
+	free(indicator);
+	free(result_unpredictable_data);
+	free(result_type);
+	free(reg_params);
+
+#ifdef HAVE_WRITESTATS
+	writeHuffmanInfo(treeByteSize, typeArray_size, num_elements*sizeof(float), nodeCount);
+	writeBlockInfo(use_mean, block_size, reg_count, num_blocks);
+	writeUnpredictDataCounts(total_unpred, num_elements);
+#endif	
+	
+	SZ_ReleaseHuffman(huffmanTree);
+	*comp_size = totalEncodeSize;
+	return result;
+}
+
+
+unsigned char * SZ_compress_float_3D_MDQ_random_access_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size){
+
+	unsigned int quantization_intervals;
+	float sz_sample_correct_freq = -1;//0.5; //-1
+	float dense_pos;
+	float mean_flush_freq;
+	unsigned char use_mean = 0;
+
+	// calculate block dims
+	size_t num_x, num_y, num_z;
+	size_t block_size = 6;
+	num_x = (r1 - 1) / block_size + 1;
+	num_y = (r2 - 1) / block_size + 1;
+	num_z = (r3 - 1) / block_size + 1;
+
+	size_t max_num_block_elements = block_size * block_size * block_size;
+	size_t num_blocks = num_x * num_y * num_z;
+	size_t num_elements = r1 * r2 * r3;
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;	
+
+	int * result_type = (int *) malloc(num_blocks*max_num_block_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+	size_t total_unpred = 0;
+	size_t unpredictable_count;
+	float * data_pos = oriData;
+	int * type = result_type;
+	float * reg_params = (float *) malloc(num_blocks * 4 * sizeof(float));
+	float * reg_params_pos = reg_params;
+	// move regression part out
+	size_t params_offset_b = num_blocks;
+	size_t params_offset_c = 2*num_blocks;
+	size_t params_offset_d = 3*num_blocks;
+	float * pred_buffer = (float *) malloc((block_size+1)*(block_size+1)*(block_size+1)*sizeof(float));
+	float * pred_buffer_pos = NULL;
+	float * block_data_pos_x = NULL;
+	float * block_data_pos_y = NULL;
+	float * block_data_pos_z = NULL;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			for(size_t k=0; k<num_z; k++){
+				data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+				pred_buffer_pos = pred_buffer;
+				block_data_pos_x = data_pos;
+				// use the buffer as block_size*block_size*block_size
+				for(int ii=0; ii<block_size; ii++){
+					block_data_pos_y = block_data_pos_x;
+					for(int jj=0; jj<block_size; jj++){
+						block_data_pos_z = block_data_pos_y;
+						for(int kk=0; kk<block_size; kk++){
+							*pred_buffer_pos = *block_data_pos_z;
+							if(k*block_size + kk + 1 < r3) block_data_pos_z ++;
+							pred_buffer_pos ++;
+						}
+						if(j*block_size + jj + 1 < r2) block_data_pos_y += dim1_offset;
+					}
+					if(i*block_size + ii + 1 < r1) block_data_pos_x += dim0_offset;
+				}
+				/*Calculate regression coefficients*/
+				{
+					float * cur_data_pos = pred_buffer;
+					float fx = 0.0;
+					float fy = 0.0;
+					float fz = 0.0;
+					float f = 0;
+					float sum_x, sum_y; 
+					float curData;
+					for(size_t i=0; i<block_size; i++){
+						sum_x = 0;
+						for(size_t j=0; j<block_size; j++){
+							sum_y = 0;
+							for(size_t k=0; k<block_size; k++){
+								curData = *cur_data_pos;
+								sum_y += curData;
+								fz += curData * k;
+								cur_data_pos ++;
+							}
+							fy += sum_y * j;
+							sum_x += sum_y;
+						}
+						fx += sum_x * i;
+						f += sum_x;
+					}
+					float coeff = 1.0 / (block_size * block_size * block_size);
+					reg_params_pos[0] = (2 * fx / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+					reg_params_pos[params_offset_b] = (2 * fy / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+					reg_params_pos[params_offset_c] = (2 * fz / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+					reg_params_pos[params_offset_d] = f * coeff - ((block_size - 1) * reg_params_pos[0] / 2 + (block_size - 1) * reg_params_pos[params_offset_b] / 2 + (block_size - 1) * reg_params_pos[params_offset_c] / 2);
+				}
+				reg_params_pos ++;
+			}
+		}
+	}
+	
+	//Compress coefficient arrays
+	double precision_a, precision_b, precision_c, precision_d;
+	float rel_param_err = 0.025;
+	precision_a = rel_param_err * realPrecision / block_size;
+	precision_b = rel_param_err * realPrecision / block_size;
+	precision_c = rel_param_err * realPrecision / block_size;
+	precision_d = rel_param_err * realPrecision;
+
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_3D_with_freq_and_dense_pos(oriData, r1, r2, r3, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+		if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	float mean = 0;
+	if(use_mean){
+		// compute mean
+		double sum = 0.0;
+		size_t mean_count = 0;
+		for(size_t i=0; i<num_elements; i++){
+			if(fabs(oriData[i] - dense_pos) < realPrecision){
+				sum += oriData[i];
+				mean_count ++;
+			}
+		}
+		if(mean_count > 0) mean = sum / mean_count;
+	}
+
+	double tmp_realPrecision = realPrecision;
+
+	// use two prediction buffers for higher performance
+	float * unpredictable_data = result_unpredictable_data;
+	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+	memset(indicator, 0, num_blocks * sizeof(unsigned char));
+	size_t reg_count = 0;
+	unsigned char * indicator_pos = indicator;
+
+	int intvCapacity = quantization_intervals; //exe_params->intvCapacity;
+	int intvRadius = intvCapacity/2; //exe_params->intvRadius;	
+	int use_reg = 0;
+	float noise = realPrecision * 1.22;
+
+	reg_params_pos = reg_params;
+	// compress the regression coefficients on the fly
+	float last_coeffcients[4] = {0.0};
+	int coeff_intvCapacity_sz = 65536;
+	int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+	int * coeff_type[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+	float * coeff_unpred_data[4];
+	float * coeff_unpredictable_data = (float *) malloc(num_blocks*4*sizeof(float));
+	double precision[4];
+	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c, precision[3] = precision_d;
+	for(int i=0; i<4; i++){
+		coeff_type[i] = coeff_result_type + i * num_blocks;
+		coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
+	}
+	int coeff_index = 0;
+	unsigned int coeff_unpredictable_count[4] = {0};
+
+	memset(pred_buffer, 0, (block_size+1)*(block_size+1)*(block_size+1)*sizeof(float));
+	int pred_buffer_block_size = block_size + 1;
+	int strip_dim0_offset = pred_buffer_block_size * pred_buffer_block_size;
+	int strip_dim1_offset = pred_buffer_block_size;
+
+	if(use_mean){
+		int intvCapacity_sz = intvCapacity - 2;
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					// add 1 in x, y, z offset
+					pred_buffer_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+					block_data_pos_x = data_pos;
+					for(int ii=0; ii<block_size; ii++){
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								*pred_buffer_pos = *block_data_pos_z;
+								if(k*block_size + kk + 1< r3) block_data_pos_z ++;
+								pred_buffer_pos ++;
+							}
+							// add 1 in z offset
+							pred_buffer_pos ++;
+							if(j*block_size + jj + 1< r2) block_data_pos_y += dim1_offset;
+						}
+						// add 1 in y offset
+						pred_buffer_pos += pred_buffer_block_size;
+						if(i*block_size + ii + 1< r1) block_data_pos_x += dim0_offset;
+					}
+					/*sampling and decide which predictor*/
+					{
+						// sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+						float * cur_data_pos;
+						float curData;
+						float pred_reg, pred_sz;
+						float err_sz = 0.0, err_reg = 0.0;
+						int bmi = 0;
+						for(int i=2; i<=block_size; i++){
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * (i-1) + reg_params_pos[params_offset_c] * (i-1) + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);
+
+							bmi = block_size - i + 1;
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * (i-1) + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + bmi*pred_buffer_block_size + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * (i-1) + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + bmi*pred_buffer_block_size + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);
+						}
+						
+						use_reg = (err_reg < err_sz);
+					}
+					if(use_reg){
+						{
+							/*predict coefficients in current block via previous reg_block*/
+							float cur_coeff;
+							double diff, itvNum;
+							for(int e=0; e<4; e++){
+								cur_coeff = reg_params_pos[e*num_blocks];
+								diff = cur_coeff - last_coeffcients[e];
+								itvNum = fabs(diff)/precision[e] + 1;
+								if (itvNum < coeff_intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+									last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+									// guarantee the compression error bound despite floating-point rounding (machine epsilon)
+									if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+										coeff_type[e][coeff_index] = 0;
+										last_coeffcients[e] = cur_coeff;	
+										coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+									}					
+								}
+								else{
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}
+							}
+							coeff_index ++;
+						}
+						float curData;
+						float pred;
+						double itvNum;
+						double diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										// guarantee the compression error bound despite floating-point rounding (machine epsilon)
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos ++;
+							}
+							cur_data_pos += pred_buffer_block_size;
+						}
+						
+						total_unpred += block_unpredictable_count;
+						unpredictable_data += block_unpredictable_count;						
+						reg_count ++;
+					}
+					else{
+						// use SZ
+						// SZ prediction
+						unpredictable_count = 0;
+						float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+						float curData;
+						float pred3D;
+						double itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+
+									curData = *cur_data_pos;
+									if(fabs(curData - mean) <= realPrecision){
+										type[index] = 1;
+										*cur_data_pos = mean;
+									}
+									else
+									{
+										pred3D = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1]
+												 - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+										diff = curData - pred3D;
+										itvNum = fabs(diff)/realPrecision + 1;
+										if (itvNum < intvCapacity_sz){
+											if (diff < 0) itvNum = -itvNum;
+											type[index] = (int) (itvNum/2) + intvRadius;
+											*cur_data_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+											// guarantee the compression error bound despite floating-point rounding (machine epsilon)
+											if(fabs(curData - *cur_data_pos)>tmp_realPrecision){	
+												type[index] = 0;
+												*cur_data_pos = curData;	
+												unpredictable_data[unpredictable_count ++] = curData;
+											}					
+										}
+										else{
+											type[index] = 0;
+											*cur_data_pos = curData;
+											unpredictable_data[unpredictable_count ++] = curData;
+										}
+									}
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos ++;
+							}
+							cur_data_pos += pred_buffer_block_size;
+						}
+						total_unpred += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						// change indicator
+						indicator_pos[k] = 1;
+					}// end SZ
+					reg_params_pos ++;
+					type += block_size * block_size * block_size;
+				} // end k
+				indicator_pos += num_z;
+			}// end j
+		}// end i
+	}
+	else{
+		int intvCapacity_sz = intvCapacity - 2;
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					// add 1 in x, y, z offset
+					pred_buffer_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+					block_data_pos_x = data_pos;
+					for(int ii=0; ii<block_size; ii++){
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								*pred_buffer_pos = *block_data_pos_z;
+								if(k*block_size + kk < r3) block_data_pos_z ++;
+								pred_buffer_pos ++;
+							}
+							// add 1 in z offset
+							pred_buffer_pos ++;
+							if(j*block_size + jj < r2) block_data_pos_y += dim1_offset;
+						}
+						// add 1 in y offset
+						pred_buffer_pos += pred_buffer_block_size;
+						if(i*block_size + ii < r1) block_data_pos_x += dim0_offset;
+					}
+					/*sampling*/
+					{
+						// sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+						float * cur_data_pos;
+						float curData;
+						float pred_reg, pred_sz;
+						float err_sz = 0.0, err_reg = 0.0;
+						int bmi;
+						for(int i=2; i<=block_size; i++){
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * (i-1) + reg_params_pos[params_offset_c] * (i-1) + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);
+
+							bmi = block_size - i + 1;
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * (i-1) + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + bmi*pred_buffer_block_size + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * (i-1) + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + bmi*pred_buffer_block_size + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);
+						}
+						
+						use_reg = (err_reg < err_sz);
+
+					}
+					if(use_reg)
+					{
+						{
+							/*predict coefficients in current block via previous reg_block*/
+							float cur_coeff;
+							double diff, itvNum;
+							for(int e=0; e<4; e++){
+								cur_coeff = reg_params_pos[e*num_blocks];
+								diff = cur_coeff - last_coeffcients[e];
+								itvNum = fabs(diff)/precision[e] + 1;
+								if (itvNum < coeff_intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+									last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+									// guarantee the compression error bound despite floating-point rounding (machine epsilon)
+									if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+										coeff_type[e][coeff_index] = 0;
+										last_coeffcients[e] = cur_coeff;	
+										coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+									}					
+								}
+								else{
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}
+							}
+							coeff_index ++;
+						}
+						float curData;
+						float pred;
+						double itvNum;
+						double diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										// guarantee the compression error bound despite floating-point rounding (machine epsilon)
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos ++;
+							}
+							cur_data_pos += pred_buffer_block_size;
+						}
+						total_unpred += block_unpredictable_count;
+						unpredictable_data += block_unpredictable_count;						
+						reg_count ++;
+					}
+					else{
+						// use SZ
+						// SZ prediction
+						unpredictable_count = 0;
+						float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+						float curData;
+						float pred3D;
+						double itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									curData = *cur_data_pos;
+									pred3D = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1]
+											 - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+									diff = curData - pred3D;
+									itvNum = fabs(diff)/realPrecision + 1;
+									if (itvNum < intvCapacity_sz){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										*cur_data_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										// guarantee the compression error bound despite floating-point rounding (machine epsilon)
+										if(fabs(curData - *cur_data_pos)>tmp_realPrecision){	
+											type[index] = 0;
+											*cur_data_pos = curData;	
+											unpredictable_data[unpredictable_count ++] = curData;
+										}					
+									}
+									else{
+										type[index] = 0;
+										*cur_data_pos = curData;
+										unpredictable_data[unpredictable_count ++] = curData;
+									}
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos ++;
+							}
+							cur_data_pos += pred_buffer_block_size;
+						}
+						total_unpred += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						// change indicator
+						indicator_pos[k] = 1;
+					}// end SZ					
+					reg_params_pos ++;
+					type += block_size * block_size * block_size;
+				}
+				indicator_pos += num_z;
+			}
+		}
+	}
+	free(pred_buffer);
+	int stateNum = 2*quantization_intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	size_t nodeCount = 0;
+	init(huffmanTree, result_type, num_blocks*max_num_block_elements);
+	size_t i = 0;
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	// total size 										metadata		  # elements     real precision		intervals	nodeCount		huffman 	 	block index 						unpredicatable count						mean 					 	unpred size 				elements
+	unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + 5*treeByteSize + 4*num_blocks*sizeof(int)+ num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(float) + total_unpred * sizeof(float) + num_elements * sizeof(int), 1);
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	
+	result_pos += meta_data_offset;
+	
+	sizeToBytes(result_pos,num_elements); //SZ_SIZE_TYPE: 4 or 8
+	result_pos += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(result_pos, block_size);
+	result_pos += sizeof(int);
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += sizeof(double);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += sizeof(int);
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+	free(treeBytes);
+
+	memcpy(result_pos, &use_mean, sizeof(unsigned char));
+	result_pos += sizeof(unsigned char);
+	memcpy(result_pos, &mean, sizeof(float));
+	result_pos += sizeof(float);
+	size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+	result_pos += indicator_size;
+	
+	//convert the lead/mid/resi to byte stream
+	if(reg_count > 0){
+		for(int e=0; e<4; e++){
+			int stateNum = 2*coeff_intvCapacity_sz;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+			size_t nodeCount = 0;
+			init(huffmanTree, coeff_type[e], reg_count);
+			size_t i = 0;
+			for (i = 0; i < huffmanTree->stateNum; i++)
+				if (huffmanTree->code[i]) nodeCount++; 
+			nodeCount = nodeCount*2-1;
+			unsigned char *treeBytes;
+			unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+			doubleToBytes(result_pos, precision[e]);
+			result_pos += sizeof(double);
+			intToBytes_bigEndian(result_pos, coeff_intvRadius);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, treeByteSize);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, nodeCount);
+			result_pos += sizeof(int);
+			memcpy(result_pos, treeBytes, treeByteSize);		
+			result_pos += treeByteSize;
+			free(treeBytes);
+			size_t typeArray_size = 0;
+			encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+			sizeToBytes(result_pos, typeArray_size);
+			result_pos += sizeof(size_t) + typeArray_size;
+			intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+			result_pos += sizeof(int);
+			memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(float));
+			result_pos += coeff_unpredictable_count[e]*sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	free(coeff_result_type);
+	free(coeff_unpredictable_data);
+	
+	//record the number of unpredictable data and also store them
+	memcpy(result_pos, &total_unpred, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(float));
+	result_pos += total_unpred * sizeof(float);
+	size_t typeArray_size = 0;
+	encode(huffmanTree, result_type, num_blocks*max_num_block_elements, result_pos, &typeArray_size);
+	result_pos += typeArray_size;
+	size_t totalEncodeSize = result_pos - result;
+	free(indicator);
+	free(result_unpredictable_data);
+	free(result_type);
+	free(reg_params);
+
+	
+	SZ_ReleaseHuffman(huffmanTree);
+	*comp_size = totalEncodeSize;
+	return result;
+}
+
+// random access
+unsigned char * SZ_compress_float_1D_MDQ_decompression_random_access_with_blocked_regression(float *oriData, size_t r1, double realPrecision, size_t * comp_size){
+
+	unsigned int quantization_intervals;
+	float sz_sample_correct_freq = -1;//0.5; //-1
+	float dense_pos;
+	float mean_flush_freq;
+	unsigned char use_mean = 0;
+
+	// calculate block dims
+	size_t num_x;
+	size_t block_size = 256;
+	num_x = (r1 - 1) / block_size + 1;
+
+	size_t max_num_block_elements = block_size;
+	size_t num_blocks = num_x;
+	size_t num_elements = r1;
+
+	int * result_type = (int *) malloc(num_blocks*max_num_block_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+	size_t total_unpred = 0;
+	size_t unpredictable_count;
+	float * data_pos = oriData;
+	int * type = result_type;
+	float * reg_params = (float *) malloc(num_blocks * 2 * sizeof(float));
+	float * reg_params_pos = reg_params;
+	// move regression part out
+	size_t params_offset_b = num_blocks;
+	float * pred_buffer = (float *) malloc((block_size+1)*sizeof(float));
+	float * pred_buffer_pos = NULL;
+	float * block_data_pos_x = NULL;
+	for(size_t i=0; i<num_x; i++){
+		data_pos = oriData + i*block_size;
+		pred_buffer_pos = pred_buffer;
+		block_data_pos_x = data_pos;
+		// use the buffer as block_size
+		for(int ii=0; ii<block_size; ii++){
+			*pred_buffer_pos = *block_data_pos_x;
+			pred_buffer_pos ++;
+			if(i*block_size + ii + 1 < r1) block_data_pos_x ++;
+		}
+		/*Calculate regression coefficients*/
+		{
+			float * cur_data_pos = pred_buffer;
+			float fx = 0.0;
+			float f = 0;
+			float curData;
+			for(size_t i=0; i<block_size; i++){
+				curData = *cur_data_pos;
+				fx += curData * i;
+				f += curData;
+				cur_data_pos ++;
+			}
+			float coeff = 1.0 / block_size;
+			reg_params_pos[0] = (2 * fx / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+			reg_params_pos[params_offset_b] = f * coeff - (block_size - 1) * reg_params_pos[0] / 2;
+		}
+		reg_params_pos ++;
+	}
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_1D_with_freq_and_dense_pos(oriData, r1, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+		if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	float mean = 0;
+	if(use_mean){
+		// compute mean
+		double sum = 0.0;
+		size_t mean_count = 0;
+		for(size_t i=0; i<num_elements; i++){
+			if(fabs(oriData[i] - dense_pos) < realPrecision){
+				sum += oriData[i];
+				mean_count ++;
+			}
+		}
+		if(mean_count > 0) mean = sum / mean_count;
+	}
+
+	double tmp_realPrecision = realPrecision;
+
+	// use two prediction buffers for higher performance
+	float * unpredictable_data = result_unpredictable_data;
+	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+	memset(indicator, 0, num_blocks * sizeof(unsigned char));
+	unsigned char * indicator_pos = indicator;
+
+	int intvCapacity = quantization_intervals; //exe_params->intvCapacity;
+	int intvRadius = intvCapacity/2; //exe_params->intvRadius;	
+	float noise = realPrecision * 0.5;
+	reg_params_pos = reg_params;
+
+	memset(pred_buffer, 0, (block_size+1)*sizeof(float));
+	// select
+	int sample_distance = sqrt(block_size) + 1;
+	if(use_mean){
+		for(size_t i=0; i<num_x; i++){
+			data_pos = oriData + i*block_size;
+			// add 1 in x, y offset
+			pred_buffer_pos = pred_buffer + 1;
+			block_data_pos_x = data_pos;
+			for(int ii=0; ii<block_size; ii++){
+				*pred_buffer_pos = *block_data_pos_x;
+				pred_buffer_pos ++;
+				if(i*block_size + ii + 1< r1) block_data_pos_x ++;
+			}
+			/*sampling and decide which predictor*/
+			{
+				float * cur_data_pos;
+				float curData;
+				float pred_reg, pred_sz;
+				float err_sz = 0.0, err_reg = 0.0;
+				for(int i=2; i<=block_size; i+=sample_distance){
+					cur_data_pos = pred_buffer + i;
+					curData = *cur_data_pos;
+					pred_sz = cur_data_pos[-1];
+					pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b];							
+					err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+					err_reg += fabs(pred_reg - curData);								
+				}
+				*indicator_pos = !(err_reg < err_sz);
+			}
+			reg_params_pos ++;
+			indicator_pos ++;
+		}// end i
+	}
+	else{
+		for(size_t i=0; i<num_x; i++){
+			data_pos = oriData + i*block_size;
+			// add 1 in x, y offset
+			pred_buffer_pos = pred_buffer + 1;
+			block_data_pos_x = data_pos;
+			for(int ii=0; ii<block_size; ii++){
+				*pred_buffer_pos = *block_data_pos_x;
+				pred_buffer_pos ++;
+				if(i*block_size + ii + 1< r1) block_data_pos_x ++;
+			}
+			/*sampling and decide which predictor*/
+			{
+				float * cur_data_pos;
+				float curData;
+				float pred_reg, pred_sz;
+				float err_sz = 0.0, err_reg = 0.0;
+				for(int i=2; i<=block_size; i+=sample_distance){
+					cur_data_pos = pred_buffer + i;
+					curData = *cur_data_pos;
+					pred_sz = cur_data_pos[-1];
+					pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b];							
+					err_sz += fabs(pred_sz - curData) + noise;
+					err_reg += fabs(pred_reg - curData);								
+				}
+				*indicator_pos = !(err_reg < err_sz);
+			}
+			reg_params_pos ++;
+			indicator_pos ++;
+		}// end i
+	}
+
+	size_t reg_count = 0;
+	for(int i=0; i<num_blocks; i++){
+		if(!(indicator[i])){
+			reg_params[reg_count] = reg_params[i];
+			reg_params[reg_count + params_offset_b] = reg_params[i + params_offset_b];
+			reg_count ++;
+		}
+	}
+	//Compress coefficient arrays
+	double precision_a, precision_b;
+	float rel_param_err = 0.1/2;
+	precision_a = rel_param_err * realPrecision / block_size;
+	precision_b = rel_param_err * realPrecision;
+	float last_coeffcients[2] = {0.0};
+	int coeff_intvCapacity_sz = 65536;
+	int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+	int * coeff_type[2];
+	int * coeff_result_type = (int *) malloc(reg_count*2*sizeof(int));
+	float * coeff_unpred_data[2];
+	float * coeff_unpredictable_data = (float *) malloc(reg_count*2*sizeof(float));
+	double precision[2];
+	precision[0] = precision_a, precision[1] = precision_b;
+	for(int i=0; i<2; i++){
+		coeff_type[i] = coeff_result_type + i * reg_count;
+		coeff_unpred_data[i] = coeff_unpredictable_data + i * reg_count;
+	}
+	int coeff_index = 0;
+	unsigned int coeff_unpredictable_count[2] = {0};
+
+	float * reg_params_separte[2];
+	for(int i=0; i<2; i++){
+		reg_params_separte[i] = reg_params + i * num_blocks;
+	}
+	for(size_t i=0; i<reg_count; i++){
+		// for each coeff
+		float cur_coeff;
+		double diff, itvNum;
+		for(int e=0; e<2; e++){
+			cur_coeff = reg_params_separte[e][i];
+			diff = cur_coeff - last_coeffcients[e];
+			itvNum = fabs(diff)/precision[e] + 1;
+			if (itvNum < coeff_intvCapacity_sz){
+				if (diff < 0) itvNum = -itvNum;
+				coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+				last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+				//guarantee compression error against the case of machine-epsilon
+				if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+					coeff_type[e][coeff_index] = 0;
+					last_coeffcients[e] = cur_coeff;	
+					coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+				}					
+			}
+			else{
+				coeff_type[e][coeff_index] = 0;
+				last_coeffcients[e] = cur_coeff;
+				coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+			}
+			reg_params_separte[e][i] = last_coeffcients[e];
+		}
+		coeff_index ++;
+	}
+	// pred & quantization
+	int * blockwise_unpred_count = (int *) malloc(num_blocks * sizeof(int));
+	int * blockwise_unpred_count_pos = blockwise_unpred_count;
+	reg_params_pos = reg_params;
+	indicator_pos = indicator;
+	if(use_mean){
+		int intvCapacity_sz = intvCapacity - 2;
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			data_pos = oriData + i*block_size;
+			// add 1 in x, y offset
+			pred_buffer_pos = pred_buffer + 1;
+			block_data_pos_x = data_pos;
+			for(int ii=0; ii<block_size; ii++){
+				*pred_buffer_pos = *block_data_pos_x;
+				pred_buffer_pos ++;
+				if(i*block_size + ii + 1< r1) block_data_pos_x ++;
+			}
+			if(!(*indicator_pos)){
+				float curData;
+				float pred;
+				double itvNum;
+				double diff;
+				size_t index = 0;
+				size_t block_unpredictable_count = 0;
+				float * cur_data_pos = pred_buffer + 1;
+				for(size_t ii=0; ii<block_size; ii++){
+					curData = *cur_data_pos;
+					pred = reg_params_pos[0] * ii + reg_params_pos[params_offset_b];									
+					diff = curData - pred;
+					itvNum = fabs(diff)/tmp_realPrecision + 1;
+					if (itvNum < intvCapacity){
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + intvRadius;
+						pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+						//guarantee compression error against the case of machine-epsilon
+						if(fabs(curData - pred)>tmp_realPrecision){	
+							type[index] = 0;
+							pred = curData;
+							unpredictable_data[block_unpredictable_count ++] = curData;
+						}		
+					}
+					else{
+						type[index] = 0;
+						pred = curData;
+						unpredictable_data[block_unpredictable_count ++] = curData;
+					}
+					index ++;
+					cur_data_pos ++;
+				}
+				reg_params_pos ++;
+				total_unpred += block_unpredictable_count;
+				unpredictable_data += block_unpredictable_count;
+				*blockwise_unpred_count_pos = block_unpredictable_count;
+			}
+			else{
+				// use SZ
+				// SZ prediction
+				unpredictable_count = 0;
+				float * cur_data_pos = pred_buffer + 1;
+				float curData;
+				float pred3D;
+				double itvNum, diff;
+				size_t index = 0;
+				for(size_t ii=0; ii<block_size; ii++){
+					curData = *cur_data_pos;
+					if(fabs(curData - mean) <= realPrecision){
+						type[index] = 1;
+						*cur_data_pos = mean;
+					}
+					else
+					{
+						pred3D = cur_data_pos[-1];
+						diff = curData - pred3D;
+						itvNum = fabs(diff)/realPrecision + 1;
+						if (itvNum < intvCapacity_sz){
+							if (diff < 0) itvNum = -itvNum;
+							type[index] = (int) (itvNum/2) + intvRadius;
+							*cur_data_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+							//guarantee compression error against the case of machine-epsilon
+							if(fabs(curData - *cur_data_pos)>tmp_realPrecision){	
+								type[index] = 0;
+								*cur_data_pos = curData;	
+								unpredictable_data[unpredictable_count ++] = curData;
+							}					
+						}
+						else{
+							type[index] = 0;
+							*cur_data_pos = curData;
+							unpredictable_data[unpredictable_count ++] = curData;
+						}
+					}
+					index ++;
+					cur_data_pos ++;
+				}
+				total_unpred += unpredictable_count;
+				unpredictable_data += unpredictable_count;
+				*blockwise_unpred_count_pos = unpredictable_count;
+			}// end SZ
+			blockwise_unpred_count_pos ++;
+			type += block_size;
+			indicator_pos ++;
+		}// end i
+	}
+	else{
+		int intvCapacity_sz = intvCapacity;
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			data_pos = oriData + i*block_size;
+			// add 1 in x, y offset
+			pred_buffer_pos = pred_buffer + 1;
+			block_data_pos_x = data_pos;
+			for(int ii=0; ii<block_size; ii++){
+				*pred_buffer_pos = *block_data_pos_x;
+				pred_buffer_pos ++;
+				if(i*block_size + ii + 1< r1) block_data_pos_x ++;
+			}
+			if(!(*indicator_pos)){
+				float curData;
+				float pred;
+				double itvNum;
+				double diff;
+				size_t index = 0;
+				size_t block_unpredictable_count = 0;
+				float * cur_data_pos = pred_buffer + 1;
+				for(size_t ii=0; ii<block_size; ii++){
+					curData = *cur_data_pos;
+					pred = reg_params_pos[0] * ii + reg_params_pos[params_offset_b];									
+					diff = curData - pred;
+					itvNum = fabs(diff)/tmp_realPrecision + 1;
+					if (itvNum < intvCapacity){
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + intvRadius;
+						pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+						//guarantee compression error against the case of machine-epsilon
+						if(fabs(curData - pred)>tmp_realPrecision){	
+							type[index] = 0;
+							pred = curData;
+							unpredictable_data[block_unpredictable_count ++] = curData;
+						}		
+					}
+					else{
+						type[index] = 0;
+						pred = curData;
+						unpredictable_data[block_unpredictable_count ++] = curData;
+					}
+					index ++;
+					cur_data_pos ++;
+				}
+				reg_params_pos ++;
+				total_unpred += block_unpredictable_count;
+				unpredictable_data += block_unpredictable_count;
+				*blockwise_unpred_count_pos = block_unpredictable_count;
+			}
+			else{
+				// use SZ
+				// SZ prediction
+				unpredictable_count = 0;
+				float * cur_data_pos = pred_buffer + 1;
+				float curData;
+				float pred3D;
+				double itvNum, diff;
+				size_t index = 0;
+				for(size_t ii=0; ii<block_size; ii++){
+					curData = *cur_data_pos;					
+					pred3D = cur_data_pos[-1];
+					diff = curData - pred3D;
+					itvNum = fabs(diff)/realPrecision + 1;
+					if (itvNum < intvCapacity_sz){
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + intvRadius;
+						*cur_data_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+						//guarantee compression error against the case of machine-epsilon
+						if(fabs(curData - *cur_data_pos)>tmp_realPrecision){	
+							type[index] = 0;
+							*cur_data_pos = curData;	
+							unpredictable_data[unpredictable_count ++] = curData;
+						}					
+					}
+					else{
+						type[index] = 0;
+						*cur_data_pos = curData;
+						unpredictable_data[unpredictable_count ++] = curData;
+					}
+					index ++;
+					cur_data_pos ++;
+				}
+				total_unpred += unpredictable_count;
+				unpredictable_data += unpredictable_count;
+				*blockwise_unpred_count_pos = unpredictable_count;
+			}// end SZ
+			blockwise_unpred_count_pos ++;
+			type += block_size;
+			indicator_pos ++;
+		}// end i
+	}	
+	free(pred_buffer);
+	int stateNum = 2*quantization_intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	size_t nodeCount = 0;
+	init(huffmanTree, result_type, num_blocks*max_num_block_elements);
+	size_t i = 0;
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	// total size 										metadata		  # elements     real precision		intervals	nodeCount		huffman 	 	block index 						unpredicatable count						mean 					 	unpred size 				elements
+	unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + 5*treeByteSize +4*num_blocks*sizeof(int) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(float) + total_unpred * sizeof(float) + num_elements * sizeof(int), 1);
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	
+	result_pos += meta_data_offset;
+	
+	sizeToBytes(result_pos,num_elements); //SZ_SIZE_TYPE: 4 or 8
+	result_pos += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(result_pos, block_size);
+	result_pos += sizeof(int);
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += sizeof(double);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += sizeof(int);
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+	free(treeBytes);
+
+	memcpy(result_pos, &use_mean, sizeof(unsigned char));
+	result_pos += sizeof(unsigned char);
+	memcpy(result_pos, &mean, sizeof(float));
+	result_pos += sizeof(float);
+	size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+	result_pos += indicator_size;
+	
+	//Huffman-encode the quantized regression coefficients and append their unpredictable values to the byte stream
+	if(reg_count > 0){
+		for(int e=0; e<2; e++){
+			int stateNum = 2*coeff_intvCapacity_sz;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+			size_t nodeCount = 0;
+			init(huffmanTree, coeff_type[e], reg_count);
+			size_t i = 0;
+			for (i = 0; i < huffmanTree->stateNum; i++)
+				if (huffmanTree->code[i]) nodeCount++; 
+			nodeCount = nodeCount*2-1;
+			unsigned char *treeBytes;
+			unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+			doubleToBytes(result_pos, precision[e]);
+			result_pos += sizeof(double);
+			intToBytes_bigEndian(result_pos, coeff_intvRadius);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, treeByteSize);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, nodeCount);
+			result_pos += sizeof(int);
+			memcpy(result_pos, treeBytes, treeByteSize);		
+			result_pos += treeByteSize;
+			free(treeBytes);
+			size_t typeArray_size = 0;
+			encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+			sizeToBytes(result_pos, typeArray_size);
+			result_pos += sizeof(size_t) + typeArray_size;
+			intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+			result_pos += sizeof(int);
+			memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(float));
+			result_pos += coeff_unpredictable_count[e]*sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	free(coeff_result_type);
+	free(coeff_unpredictable_data);
+	
+	//record the number of unpredictable data and also store them
+	memcpy(result_pos, &total_unpred, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	// record blockwise unpred data
+	size_t compressed_blockwise_unpred_count_size;
+	unsigned char * compressed_bw_unpred_count = SZ_compress_args(SZ_INT32, blockwise_unpred_count, &compressed_blockwise_unpred_count_size, ABS, 0.5, 0, 0, 0, 0, 0, 0, num_blocks);
+	memcpy(result_pos, &compressed_blockwise_unpred_count_size, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, compressed_bw_unpred_count, compressed_blockwise_unpred_count_size);
+	result_pos += compressed_blockwise_unpred_count_size;
+	free(blockwise_unpred_count);
+	free(compressed_bw_unpred_count);
+	memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(float));
+	result_pos += total_unpred * sizeof(float);
+
+	free(reg_params);
+	free(indicator);
+	free(result_unpredictable_data);
+	// encode type array by block
+	type = result_type;
+	size_t total_type_array_size = 0;
+	unsigned char * type_array_buffer = (unsigned char *) malloc(num_blocks*max_num_block_elements*sizeof(int));
+	unsigned short * type_array_block_size = (unsigned short *) malloc(num_blocks*sizeof(unsigned short));
+	unsigned char * type_array_buffer_pos = type_array_buffer;
+	unsigned short * type_array_block_size_pos = type_array_block_size;
+
+	for(size_t i=0; i<num_x; i++){
+		size_t typeArray_size = 0;
+		encode(huffmanTree, type, max_num_block_elements, type_array_buffer_pos, &typeArray_size);
+		total_type_array_size += typeArray_size;
+		*type_array_block_size_pos = typeArray_size;
+		type_array_buffer_pos += typeArray_size;
+		type += max_num_block_elements;
+		type_array_block_size_pos ++;
+	}
+	size_t compressed_type_array_block_size;
+	unsigned char * compressed_type_array_block = SZ_compress_args(SZ_UINT16, type_array_block_size, &compressed_type_array_block_size, ABS, 0.5, 0, 0, 0, 0, 0, 0, num_blocks);
+	memcpy(result_pos, &compressed_type_array_block_size, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, compressed_type_array_block, compressed_type_array_block_size);
+	result_pos += compressed_type_array_block_size;
+	memcpy(result_pos, type_array_buffer, total_type_array_size);
+	result_pos += total_type_array_size;
+	// size_t typeArray_size = 0;
+	// encode(huffmanTree, result_type, num_blocks*max_num_block_elements, result_pos, &typeArray_size);
+	// result_pos += typeArray_size;
+
+	free(compressed_type_array_block);
+	free(type_array_buffer);
+	free(type_array_block_size);
+	size_t totalEncodeSize = result_pos - result;
+	free(result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+	*comp_size = totalEncodeSize;
+	return result;
+}
+
+unsigned char * SZ_compress_float_2D_MDQ_decompression_random_access_with_blocked_regression(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size){
+
+	unsigned int quantization_intervals;
+	float sz_sample_correct_freq = -1;//0.5; //-1
+	float dense_pos;
+	float mean_flush_freq;
+	unsigned char use_mean = 0;
+
+	// calculate block dims
+	size_t num_x, num_y;
+	size_t block_size = 16;
+	num_x = (r1 - 1) / block_size + 1;
+	num_y = (r2 - 1) / block_size + 1;
+
+	size_t max_num_block_elements = block_size * block_size;
+	size_t num_blocks = num_x * num_y;
+	size_t num_elements = r1 * r2;
+	size_t dim0_offset = r2;
+
+	int * result_type = (int *) malloc(num_blocks*max_num_block_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+	size_t total_unpred = 0;
+	size_t unpredictable_count;
+	float * data_pos = oriData;
+	int * type = result_type;
+	float * reg_params = (float *) malloc(num_blocks * 3 * sizeof(float));
+	float * reg_params_pos = reg_params;
+	// move regression part out
+	size_t params_offset_b = num_blocks;
+	size_t params_offset_c = 2*num_blocks;
+	float * pred_buffer = (float *) malloc((block_size+1)*(block_size+1)*sizeof(float));
+	float * pred_buffer_pos = NULL;
+	float * block_data_pos_x = NULL;
+	float * block_data_pos_y = NULL;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			data_pos = oriData + i*block_size * dim0_offset + j*block_size;
+			pred_buffer_pos = pred_buffer;
+			block_data_pos_x = data_pos;
+			// use the buffer as block_size*block_size
+			for(int ii=0; ii<block_size; ii++){
+				block_data_pos_y = block_data_pos_x;
+				for(int jj=0; jj<block_size; jj++){
+					*pred_buffer_pos = *block_data_pos_y;
+					if(j*block_size + jj + 1< r2) block_data_pos_y ++;
+					pred_buffer_pos ++;
+				}
+				if(i*block_size + ii + 1 < r1) block_data_pos_x += dim0_offset;
+			}
+			/*Calculate regression coefficients*/
+			{
+				float * cur_data_pos = pred_buffer;
+				float fx = 0.0;
+				float fy = 0.0;
+				float f = 0;
+				float sum_x; 
+				float curData;
+				for(size_t i=0; i<block_size; i++){
+					sum_x = 0;
+					for(size_t j=0; j<block_size; j++){
+						curData = *cur_data_pos;
+						sum_x += curData;
+						fy += curData * j;
+						cur_data_pos ++;
+					}
+					fx += sum_x * i;
+					f += sum_x;
+				}
+				float coeff = 1.0 / (block_size * block_size);
+				reg_params_pos[0] = (2 * fx / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+				reg_params_pos[params_offset_b] = (2 * fy / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+				reg_params_pos[params_offset_c] = f * coeff - ((block_size - 1) * reg_params_pos[0] / 2 + (block_size - 1) * reg_params_pos[params_offset_b] / 2);
+			}
+			reg_params_pos ++;
+		}
+	}
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_2D_with_freq_and_dense_pos(oriData, r1, r2, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+		if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	float mean = 0;
+	if(use_mean){
+		// compute mean
+		double sum = 0.0;
+		size_t mean_count = 0;
+		for(size_t i=0; i<num_elements; i++){
+			if(fabs(oriData[i] - dense_pos) < realPrecision){
+				sum += oriData[i];
+				mean_count ++;
+			}
+		}
+		if(mean_count > 0) mean = sum / mean_count;
+	}
+
+	double tmp_realPrecision = realPrecision;
+
+	// use two prediction buffers for higher performance
+	float * unpredictable_data = result_unpredictable_data;
+	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+	memset(indicator, 0, num_blocks * sizeof(unsigned char));
+	unsigned char * indicator_pos = indicator;
+
+	int intvCapacity = quantization_intervals; //exe_params->intvCapacity;
+	int intvRadius = intvCapacity/2; //exe_params->intvRadius;	
+	float noise = realPrecision * 0.81;
+	reg_params_pos = reg_params;
+
+	memset(pred_buffer, 0, (block_size+1)*(block_size+1)*sizeof(float));
+	int pred_buffer_block_size = block_size + 1;
+	int strip_dim0_offset = pred_buffer_block_size;
+
+	// select
+	if(use_mean){
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				data_pos = oriData + i*block_size * dim0_offset + j*block_size;
+				// add 1 in x, y offset
+				pred_buffer_pos = pred_buffer + pred_buffer_block_size + 1;
+				block_data_pos_x = data_pos;
+				for(int ii=0; ii<block_size; ii++){
+					block_data_pos_y = block_data_pos_x;
+					for(int jj=0; jj<block_size; jj++){
+						*pred_buffer_pos = *block_data_pos_y;
+						if(j*block_size + jj + 1< r2) block_data_pos_y ++;
+						pred_buffer_pos ++;
+					}
+					// add 1 in y offset
+					pred_buffer_pos ++;
+					if(i*block_size + ii + 1< r1) block_data_pos_x += dim0_offset;
+				}
+				/*sampling and decide which predictor*/
+				{
+					float * cur_data_pos;
+					float curData;
+					float pred_reg, pred_sz;
+					float err_sz = 0.0, err_reg = 0.0;
+					int bmi = 0;
+					for(int i=2; i<=block_size; i++){
+						cur_data_pos = pred_buffer + i*pred_buffer_block_size + i;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * (i-1) + reg_params_pos[params_offset_c];							
+						err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+						err_reg += fabs(pred_reg - curData);
+
+						bmi = block_size - i + 1;
+						cur_data_pos = pred_buffer + i*pred_buffer_block_size + (bmi+1);
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c];							
+						err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+						err_reg += fabs(pred_reg - curData);								
+					}
+					*indicator_pos = !(err_reg < err_sz);
+				}
+				reg_params_pos ++;
+				indicator_pos ++;
+			}// end j
+		}// end i
+	}
+	else{
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				data_pos = oriData + i*block_size * dim0_offset + j*block_size;
+				// add 1 in x, y offset
+				pred_buffer_pos = pred_buffer + pred_buffer_block_size + 1;
+				block_data_pos_x = data_pos;
+				for(int ii=0; ii<block_size; ii++){
+					block_data_pos_y = block_data_pos_x;
+					for(int jj=0; jj<block_size; jj++){
+						*pred_buffer_pos = *block_data_pos_y;
+						if(j*block_size + jj + 1< r2) block_data_pos_y ++;
+						pred_buffer_pos ++;
+					}
+					// add 1 in y offset
+					pred_buffer_pos ++;
+					if(i*block_size + ii + 1< r1) block_data_pos_x += dim0_offset;
+				}
+				/*sampling and decide which predictor*/
+				{
+					float * cur_data_pos;
+					float curData;
+					float pred_reg, pred_sz;
+					float err_sz = 0.0, err_reg = 0.0;
+					int bmi = 0;
+					for(int i=2; i<=block_size; i++){
+						cur_data_pos = pred_buffer + i*pred_buffer_block_size + i;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * (i-1) + reg_params_pos[params_offset_c];							
+						err_sz += fabs(pred_sz - curData) + noise;
+						err_reg += fabs(pred_reg - curData);
+
+						bmi = block_size - i + 1;
+						cur_data_pos = pred_buffer + i*pred_buffer_block_size + (bmi+1);
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c];							
+						err_sz += fabs(pred_sz - curData) + noise;
+						err_reg += fabs(pred_reg - curData);								
+					}
+					*indicator_pos = !(err_reg < err_sz);
+				}
+				reg_params_pos ++;
+				indicator_pos ++;
+			}// end j
+		}// end i
+	}
+
+	size_t reg_count = 0;
+	for(int i=0; i<num_blocks; i++){
+		if(!(indicator[i])){
+			reg_params[reg_count] = reg_params[i];
+			reg_params[reg_count + params_offset_b] = reg_params[i + params_offset_b];
+			reg_params[reg_count + params_offset_c] = reg_params[i + params_offset_c];
+			reg_count ++;
+		}
+	}
+	//Compress coefficient arrays
+	double precision_a, precision_b, precision_c;
+	float rel_param_err = 0.15/3;
+	precision_a = rel_param_err * realPrecision / block_size;
+	precision_b = rel_param_err * realPrecision / block_size;
+	precision_c = rel_param_err * realPrecision;
+	float last_coeffcients[3] = {0.0};
+	int coeff_intvCapacity_sz = 65536;
+	int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+	int * coeff_type[3];
+	int * coeff_result_type = (int *) malloc(reg_count*3*sizeof(int));
+	float * coeff_unpred_data[3];
+	float * coeff_unpredictable_data = (float *) malloc(reg_count*3*sizeof(float));
+	double precision[3];
+	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c;
+	for(int i=0; i<3; i++){
+		coeff_type[i] = coeff_result_type + i * reg_count;
+		coeff_unpred_data[i] = coeff_unpredictable_data + i * reg_count;
+	}
+	int coeff_index = 0;
+	unsigned int coeff_unpredictable_count[3] = {0};
+
+	float * reg_params_separte[3];
+	for(int i=0; i<3; i++){
+		reg_params_separte[i] = reg_params + i * num_blocks;
+	}
+	for(size_t i=0; i<reg_count; i++){
+		// for each coeff
+		float cur_coeff;
+		double diff, itvNum;
+		for(int e=0; e<3; e++){
+			cur_coeff = reg_params_separte[e][i];
+			diff = cur_coeff - last_coeffcients[e];
+			itvNum = fabs(diff)/precision[e] + 1;
+			if (itvNum < coeff_intvCapacity_sz){
+				if (diff < 0) itvNum = -itvNum;
+				coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+				last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+				//guarantee compression error against the case of machine-epsilon
+				if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+					coeff_type[e][coeff_index] = 0;
+					last_coeffcients[e] = cur_coeff;	
+					coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+				}					
+			}
+			else{
+				coeff_type[e][coeff_index] = 0;
+				last_coeffcients[e] = cur_coeff;
+				coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+			}
+			reg_params_separte[e][i] = last_coeffcients[e];
+		}
+		coeff_index ++;
+	}
+	// pred & quantization
+	int * blockwise_unpred_count = (int *) malloc(num_blocks * sizeof(int));
+	int * blockwise_unpred_count_pos = blockwise_unpred_count;
+	reg_params_pos = reg_params;
+	indicator_pos = indicator;
+	if(use_mean){
+		int intvCapacity_sz = intvCapacity - 2;
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				data_pos = oriData + i*block_size * dim0_offset + j*block_size;
+				// add 1 in x, y offset
+				pred_buffer_pos = pred_buffer + pred_buffer_block_size + 1;
+				block_data_pos_x = data_pos;
+				for(int ii=0; ii<block_size; ii++){
+					block_data_pos_y = block_data_pos_x;
+					for(int jj=0; jj<block_size; jj++){
+						*pred_buffer_pos = *block_data_pos_y;
+						if(j*block_size + jj + 1< r2) block_data_pos_y ++;
+						pred_buffer_pos ++;
+					}
+					// add 1 in y offset
+					pred_buffer_pos ++;
+					if(i*block_size + ii + 1< r1) block_data_pos_x += dim0_offset;
+				}
+				if(!(*indicator_pos)){
+					float curData;
+					float pred;
+					double itvNum;
+					double diff;
+					size_t index = 0;
+					size_t block_unpredictable_count = 0;
+					float * cur_data_pos = pred_buffer + pred_buffer_block_size + 1;
+					for(size_t ii=0; ii<block_size; ii++){
+						for(size_t jj=0; jj<block_size; jj++){
+							curData = *cur_data_pos;
+							pred = reg_params_pos[0] * ii + reg_params_pos[params_offset_b] * jj + reg_params_pos[params_offset_c];									
+							diff = curData - pred;
+							itvNum = fabs(diff)/tmp_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+								//guarantee compression error against the case of machine-epsilon
+								if(fabs(curData - pred)>tmp_realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							index ++;	
+							cur_data_pos ++;
+						}
+						cur_data_pos ++;
+					}
+					reg_params_pos ++;
+					total_unpred += block_unpredictable_count;
+					unpredictable_data += block_unpredictable_count;
+					*blockwise_unpred_count_pos = block_unpredictable_count;
+				}
+				else{
+					// use SZ
+					// SZ prediction
+					unpredictable_count = 0;
+					float * cur_data_pos = pred_buffer + pred_buffer_block_size + 1;
+					float curData;
+					float pred3D;
+					double itvNum, diff;
+					size_t index = 0;
+					for(size_t ii=0; ii<block_size; ii++){
+						for(size_t jj=0; jj<block_size; jj++){
+							curData = *cur_data_pos;
+							if(fabs(curData - mean) <= realPrecision){
+								type[index] = 1;
+								*cur_data_pos = mean;
+							}
+							else
+							{
+								pred3D = cur_data_pos[-1] + cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim0_offset - 1];
+								diff = curData - pred3D;
+								itvNum = fabs(diff)/realPrecision + 1;
+								if (itvNum < intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									type[index] = (int) (itvNum/2) + intvRadius;
+									*cur_data_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+									//guarantee compression error against the case of machine-epsilon
+									if(fabs(curData - *cur_data_pos)>tmp_realPrecision){	
+										type[index] = 0;
+										*cur_data_pos = curData;	
+										unpredictable_data[unpredictable_count ++] = curData;
+									}					
+								}
+								else{
+									type[index] = 0;
+									*cur_data_pos = curData;
+									unpredictable_data[unpredictable_count ++] = curData;
+								}
+							}
+							index ++;
+							cur_data_pos ++;
+						}
+						cur_data_pos ++;
+					}
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;
+					*blockwise_unpred_count_pos = unpredictable_count;
+				}// end SZ
+				blockwise_unpred_count_pos ++;
+				type += block_size * block_size;
+				indicator_pos ++;
+			}// end j
+		}// end i
+	}
+	else{
+		int intvCapacity_sz = intvCapacity;
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				data_pos = oriData + i*block_size * dim0_offset + j*block_size;
+				// add 1 in x, y offset
+				pred_buffer_pos = pred_buffer + pred_buffer_block_size + 1;
+				block_data_pos_x = data_pos;
+				for(int ii=0; ii<block_size; ii++){
+					block_data_pos_y = block_data_pos_x;
+					for(int jj=0; jj<block_size; jj++){
+						*pred_buffer_pos = *block_data_pos_y;
+						if(j*block_size + jj + 1< r2) block_data_pos_y ++;
+						pred_buffer_pos ++;
+					}
+					// add 1 in y offset
+					pred_buffer_pos ++;
+					if(i*block_size + ii + 1< r1) block_data_pos_x += dim0_offset;
+				}
+				if(!(*indicator_pos)){
+					float curData;
+					float pred;
+					double itvNum;
+					double diff;
+					size_t index = 0;
+					size_t block_unpredictable_count = 0;
+					float * cur_data_pos = pred_buffer + pred_buffer_block_size + 1;
+					for(size_t ii=0; ii<block_size; ii++){
+						for(size_t jj=0; jj<block_size; jj++){
+							curData = *cur_data_pos;
+							pred = reg_params_pos[0] * ii + reg_params_pos[params_offset_b] * jj + reg_params_pos[params_offset_c];									
+							diff = curData - pred;
+							itvNum = fabs(diff)/tmp_realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+								//guarantee compression error against the case of machine-epsilon
+								if(fabs(curData - pred)>tmp_realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							index ++;	
+							cur_data_pos ++;
+						}
+						cur_data_pos ++;
+					}
+					reg_params_pos ++;
+					total_unpred += block_unpredictable_count;
+					unpredictable_data += block_unpredictable_count;
+					*blockwise_unpred_count_pos = block_unpredictable_count;
+				}
+				else{
+					// use SZ
+					// SZ prediction
+					unpredictable_count = 0;
+					float * cur_data_pos = pred_buffer + pred_buffer_block_size + 1;
+					float curData;
+					float pred3D;
+					double itvNum, diff;
+					size_t index = 0;
+					for(size_t ii=0; ii<block_size; ii++){
+						for(size_t jj=0; jj<block_size; jj++){
+							curData = *cur_data_pos;
+							pred3D = cur_data_pos[-1] + cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim0_offset - 1];
+							diff = curData - pred3D;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								*cur_data_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+								//guarantee compression error bound against the case of machine-epsilon
+								if(fabs(curData - *cur_data_pos)>tmp_realPrecision){	
+									type[index] = 0;
+									*cur_data_pos = curData;	
+									unpredictable_data[unpredictable_count ++] = curData;
+								}					
+							}
+							else{
+								type[index] = 0;
+								*cur_data_pos = curData;
+								unpredictable_data[unpredictable_count ++] = curData;
+							}
+							index ++;
+							cur_data_pos ++;
+						}
+						cur_data_pos ++;
+					}
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;
+					*blockwise_unpred_count_pos = unpredictable_count;
+				}// end SZ
+				blockwise_unpred_count_pos ++;
+				type += block_size * block_size;
+				indicator_pos ++;
+			}// end j
+		}// end i
+	}	
+
+	free(pred_buffer);
+	int stateNum = 2*quantization_intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	size_t nodeCount = 0;
+	init(huffmanTree, result_type, num_blocks*max_num_block_elements);
+	size_t i = 0;
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	// total size 										metadata		  # elements     real precision		intervals	nodeCount		huffman 	 	block index 						unpredicatable count						mean 					 	unpred size 				elements
+	unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + 5*treeByteSize + 4*num_blocks*sizeof(int) +num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(float) + total_unpred * sizeof(float) + num_elements * sizeof(int), 1);
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	
+	result_pos += meta_data_offset;
+	
+	sizeToBytes(result_pos,num_elements); //SZ_SIZE_TYPE: 4 or 8
+	result_pos += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(result_pos, block_size);
+	result_pos += sizeof(int);
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += sizeof(double);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += sizeof(int);
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+	free(treeBytes);
+
+	memcpy(result_pos, &use_mean, sizeof(unsigned char));
+	result_pos += sizeof(unsigned char);
+	memcpy(result_pos, &mean, sizeof(float));
+	result_pos += sizeof(float);
+	size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+	result_pos += indicator_size;
+	
+	//convert the lead/mid/resi to byte stream
+	if(reg_count > 0){
+		for(int e=0; e<3; e++){
+			int stateNum = 2*coeff_intvCapacity_sz;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+			size_t nodeCount = 0;
+			init(huffmanTree, coeff_type[e], reg_count);
+			size_t i = 0;
+			for (i = 0; i < huffmanTree->stateNum; i++)
+				if (huffmanTree->code[i]) nodeCount++; 
+			nodeCount = nodeCount*2-1;
+			unsigned char *treeBytes;
+			unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+			doubleToBytes(result_pos, precision[e]);
+			result_pos += sizeof(double);
+			intToBytes_bigEndian(result_pos, coeff_intvRadius);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, treeByteSize);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, nodeCount);
+			result_pos += sizeof(int);
+			memcpy(result_pos, treeBytes, treeByteSize);		
+			result_pos += treeByteSize;
+			free(treeBytes);
+			size_t typeArray_size = 0;
+			encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+			sizeToBytes(result_pos, typeArray_size);
+			result_pos += sizeof(size_t) + typeArray_size;
+			intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+			result_pos += sizeof(int);
+			memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(float));
+			result_pos += coeff_unpredictable_count[e]*sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	free(coeff_result_type);
+	free(coeff_unpredictable_data);
+	
+	//record the number of unpredictable data and also store them
+	memcpy(result_pos, &total_unpred, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	// record blockwise unpred data
+	size_t compressed_blockwise_unpred_count_size;
+	unsigned char * compressed_bw_unpred_count = SZ_compress_args(SZ_INT32, blockwise_unpred_count, &compressed_blockwise_unpred_count_size, ABS, 0.5, 0, 0, 0, 0, 0, 0, num_blocks);
+	memcpy(result_pos, &compressed_blockwise_unpred_count_size, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, compressed_bw_unpred_count, compressed_blockwise_unpred_count_size);
+	result_pos += compressed_blockwise_unpred_count_size;
+	free(blockwise_unpred_count);
+	free(compressed_bw_unpred_count);
+	memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(float));
+	result_pos += total_unpred * sizeof(float);
+
+	free(reg_params);
+	free(indicator);
+	free(result_unpredictable_data);
+	// encode type array by block
+	type = result_type;
+	size_t total_type_array_size = 0;
+	unsigned char * type_array_buffer = (unsigned char *) malloc(num_blocks*max_num_block_elements*sizeof(int));
+	unsigned short * type_array_block_size = (unsigned short *) malloc(num_blocks*sizeof(unsigned short));
+	unsigned char * type_array_buffer_pos = type_array_buffer;
+	unsigned short * type_array_block_size_pos = type_array_block_size;
+
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			size_t typeArray_size = 0;
+			encode(huffmanTree, type, max_num_block_elements, type_array_buffer_pos, &typeArray_size);
+			total_type_array_size += typeArray_size;
+			*type_array_block_size_pos = typeArray_size;
+			type_array_buffer_pos += typeArray_size;
+			type += max_num_block_elements;
+			type_array_block_size_pos ++;
+		}
+	}
+	size_t compressed_type_array_block_size;
+	unsigned char * compressed_type_array_block = SZ_compress_args(SZ_UINT16, type_array_block_size, &compressed_type_array_block_size, ABS, 0.5, 0, 0, 0, 0, 0, 0, num_blocks);
+	memcpy(result_pos, &compressed_type_array_block_size, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, compressed_type_array_block, compressed_type_array_block_size);
+	result_pos += compressed_type_array_block_size;
+	memcpy(result_pos, type_array_buffer, total_type_array_size);
+	result_pos += total_type_array_size;
+	// size_t typeArray_size = 0;
+	// encode(huffmanTree, result_type, num_blocks*max_num_block_elements, result_pos, &typeArray_size);
+	// result_pos += typeArray_size;
+
+	free(compressed_type_array_block);
+	free(type_array_buffer);
+	free(type_array_block_size);
+	size_t totalEncodeSize = result_pos - result;
+	free(result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+	*comp_size = totalEncodeSize;
+	return result;
+}
+
+unsigned char * SZ_compress_float_3D_MDQ_decompression_random_access_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size){
+
+	unsigned int quantization_intervals;
+	float sz_sample_correct_freq = -1;//0.5; //-1
+	float dense_pos;
+	float mean_flush_freq;
+	unsigned char use_mean = 0;
+
+	// calculate block dims
+	size_t num_x, num_y, num_z;
+	size_t block_size = 6;
+	num_x = (r1 - 1) / block_size + 1;
+	num_y = (r2 - 1) / block_size + 1;
+	num_z = (r3 - 1) / block_size + 1;
+
+	size_t max_num_block_elements = block_size * block_size * block_size;
+	size_t num_blocks = num_x * num_y * num_z;
+	size_t num_elements = r1 * r2 * r3;
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;	
+
+	int * result_type = (int *) malloc(num_blocks*max_num_block_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+	size_t total_unpred = 0;
+	size_t unpredictable_count;
+	float * data_pos = oriData;
+	int * type = result_type;
+	float * reg_params = (float *) malloc(num_blocks * 4 * sizeof(float));
+	float * reg_params_pos = reg_params;
+	// move regression part out
+	size_t params_offset_b = num_blocks;
+	size_t params_offset_c = 2*num_blocks;
+	size_t params_offset_d = 3*num_blocks;
+	float * pred_buffer = (float *) malloc((block_size+1)*(block_size+1)*(block_size+1)*sizeof(float));
+	float * pred_buffer_pos = NULL;
+	float * block_data_pos_x = NULL;
+	float * block_data_pos_y = NULL;
+	float * block_data_pos_z = NULL;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			for(size_t k=0; k<num_z; k++){
+				data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+				pred_buffer_pos = pred_buffer;
+				block_data_pos_x = data_pos;
+				// use the buffer as block_size*block_size*block_size
+				for(int ii=0; ii<block_size; ii++){
+					block_data_pos_y = block_data_pos_x;
+					for(int jj=0; jj<block_size; jj++){
+						block_data_pos_z = block_data_pos_y;
+						for(int kk=0; kk<block_size; kk++){
+							*pred_buffer_pos = *block_data_pos_z;
+							if(k*block_size + kk + 1 < r3) block_data_pos_z ++;
+							pred_buffer_pos ++;
+						}
+						if(j*block_size + jj + 1 < r2) block_data_pos_y += dim1_offset;
+					}
+					if(i*block_size + ii + 1 < r1) block_data_pos_x += dim0_offset;
+				}
+				/*Calculate regression coefficients*/
+				{
+					float * cur_data_pos = pred_buffer;
+					float fx = 0.0;
+					float fy = 0.0;
+					float fz = 0.0;
+					float f = 0;
+					float sum_x, sum_y; 
+					float curData;
+					for(size_t i=0; i<block_size; i++){
+						sum_x = 0;
+						for(size_t j=0; j<block_size; j++){
+							sum_y = 0;
+							for(size_t k=0; k<block_size; k++){
+								curData = *cur_data_pos;
+								sum_y += curData;
+								fz += curData * k;
+								cur_data_pos ++;
+							}
+							fy += sum_y * j;
+							sum_x += sum_y;
+						}
+						fx += sum_x * i;
+						f += sum_x;
+					}
+					float coeff = 1.0 / (block_size * block_size * block_size);
+					reg_params_pos[0] = (2 * fx / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+					reg_params_pos[params_offset_b] = (2 * fy / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+					reg_params_pos[params_offset_c] = (2 * fz / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+					reg_params_pos[params_offset_d] = f * coeff - ((block_size - 1) * reg_params_pos[0] / 2 + (block_size - 1) * reg_params_pos[params_offset_b] / 2 + (block_size - 1) * reg_params_pos[params_offset_c] / 2);
+				}
+				reg_params_pos ++;
+			}
+		}
+	}
+	
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_3D_with_freq_and_dense_pos(oriData, r1, r2, r3, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+		if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	float mean = 0;
+	if(use_mean){
+		// compute mean
+		double sum = 0.0;
+		size_t mean_count = 0;
+		for(size_t i=0; i<num_elements; i++){
+			if(fabs(oriData[i] - dense_pos) < realPrecision){
+				sum += oriData[i];
+				mean_count ++;
+			}
+		}
+		if(mean_count > 0) mean = sum / mean_count;
+	}
+
+	double tmp_realPrecision = realPrecision;
+
+	// use two prediction buffers for higher performance
+	float * unpredictable_data = result_unpredictable_data;
+	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+	memset(indicator, 0, num_blocks * sizeof(unsigned char));
+	unsigned char * indicator_pos = indicator;
+
+	int intvCapacity = quantization_intervals; //exe_params->intvCapacity;
+	int intvRadius = intvCapacity/2; //exe_params->intvRadius;	
+	float noise = realPrecision * 1.22;
+	reg_params_pos = reg_params;
+
+	memset(pred_buffer, 0, (block_size+1)*(block_size+1)*(block_size+1)*sizeof(float));
+	int pred_buffer_block_size = block_size + 1;
+	int strip_dim0_offset = pred_buffer_block_size * pred_buffer_block_size;
+	int strip_dim1_offset = pred_buffer_block_size;
+
+	// select
+	if(use_mean){
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					// add 1 in x, y, z offset
+					pred_buffer_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+					block_data_pos_x = data_pos;
+					for(int ii=0; ii<block_size; ii++){
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								*pred_buffer_pos = *block_data_pos_z;
+								if(k*block_size + kk + 1< r3) block_data_pos_z ++;
+								pred_buffer_pos ++;
+							}
+							// add 1 in z offset
+							pred_buffer_pos ++;
+							if(j*block_size + jj + 1< r2) block_data_pos_y += dim1_offset;
+						}
+						// add 1 in y offset
+						pred_buffer_pos += pred_buffer_block_size;
+						if(i*block_size + ii + 1< r1) block_data_pos_x += dim0_offset;
+					}
+					/*sampling and decide which predictor*/
+					{
+						// sample points (block-local x, y, z), for each i in [2, block_size] with bmi = block_size - i + 1: [i-1, i-1, i-1] [i-1, i-1, bmi] [i-1, bmi, i-1] [i-1, bmi, bmi]
+						float * cur_data_pos;
+						float curData;
+						float pred_reg, pred_sz;
+						float err_sz = 0.0, err_reg = 0.0;
+						int bmi = 0;
+						for(int i=2; i<=block_size; i++){
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * (i-1) + reg_params_pos[params_offset_c] * (i-1) + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);
+
+							bmi = block_size - i + 1;
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + (bmi+1);
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * (i-1) + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + (bmi+1)*pred_buffer_block_size + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * (i-1) + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + (bmi+1)*pred_buffer_block_size + (bmi+1);
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);
+						}
+						// indicator_pos[k] = (err_sz < err_reg);
+						indicator_pos[k] = !(err_reg < err_sz);
+					}
+					reg_params_pos ++;
+				} // end k
+				indicator_pos += num_z;
+			}// end j
+		}// end i
+	}
+	else{
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					// add 1 in x, y, z offset
+					pred_buffer_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+					block_data_pos_x = data_pos;
+					for(int ii=0; ii<block_size; ii++){
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								*pred_buffer_pos = *block_data_pos_z;
+								if(k*block_size + kk + 1< r3) block_data_pos_z ++;
+								pred_buffer_pos ++;
+							}
+							// add 1 in z offset
+							pred_buffer_pos ++;
+							if(j*block_size + jj + 1< r2) block_data_pos_y += dim1_offset;
+						}
+						// add 1 in y offset
+						pred_buffer_pos += pred_buffer_block_size;
+						if(i*block_size + ii +1 < r1) block_data_pos_x += dim0_offset;
+					}
+					/*sampling*/
+					{
+						// sample points (block-local x, y, z), for each i in [2, block_size] with bmi = block_size - i + 1: [i-1, i-1, i-1] [i-1, i-1, bmi] [i-1, bmi, i-1] [i-1, bmi, bmi]
+						float * cur_data_pos;
+						float curData;
+						float pred_reg, pred_sz;
+						float err_sz = 0.0, err_reg = 0.0;
+						int bmi;
+						for(int i=2; i<=block_size; i++){
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * (i-1) + reg_params_pos[params_offset_c] * (i-1) + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);
+
+							bmi = block_size - i + 1;
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + (bmi+1);
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * (i-1) + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + (bmi+1)*pred_buffer_block_size + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * (i-1) + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + (bmi+1)*pred_buffer_block_size + (bmi+1);
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * (i-1) + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);
+						}
+						// indicator_pos[k] = (err_sz < err_reg);
+						indicator_pos[k] = !(err_reg < err_sz);
+					}
+					reg_params_pos ++;
+				}
+				indicator_pos += num_z;
+			}
+		}
+	}
+
+	size_t reg_count = 0;
+	for(int i=0; i<num_blocks; i++){
+		if(!(indicator[i])){
+			reg_params[reg_count] = reg_params[i];
+			reg_params[reg_count + params_offset_b] = reg_params[i + params_offset_b];
+			reg_params[reg_count + params_offset_c] = reg_params[i + params_offset_c];
+			reg_params[reg_count + params_offset_d] = reg_params[i + params_offset_d];
+			reg_count ++;
+		}
+	}
+	//Compress coefficient arrays
+	double precision_a, precision_b, precision_c, precision_d;
+	float rel_param_err = 0.025;
+	precision_a = rel_param_err * realPrecision / block_size;
+	precision_b = rel_param_err * realPrecision / block_size;
+	precision_c = rel_param_err * realPrecision / block_size;
+	precision_d = rel_param_err * realPrecision;
+	float last_coeffcients[4] = {0.0};
+	int coeff_intvCapacity_sz = 65536;
+	int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+	int * coeff_type[4];
+	int * coeff_result_type = (int *) malloc(reg_count*4*sizeof(int));
+	float * coeff_unpred_data[4];
+	float * coeff_unpredictable_data = (float *) malloc(reg_count*4*sizeof(float));
+	double precision[4];
+	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c, precision[3] = precision_d;
+	for(int i=0; i<4; i++){
+		coeff_type[i] = coeff_result_type + i * reg_count;
+		coeff_unpred_data[i] = coeff_unpredictable_data + i * reg_count;
+	}
+	int coeff_index = 0;
+	unsigned int coeff_unpredictable_count[4] = {0};
+
+	float * reg_params_separte[4];
+	for(int i=0; i<4; i++){
+		reg_params_separte[i] = reg_params + i * num_blocks;
+	}
+	for(size_t i=0; i<reg_count; i++){
+		// for each coeff
+		float cur_coeff;
+		double diff, itvNum;
+		for(int e=0; e<4; e++){
+			cur_coeff = reg_params_separte[e][i];
+			diff = cur_coeff - last_coeffcients[e];
+			itvNum = fabs(diff)/precision[e] + 1;
+			if (itvNum < coeff_intvCapacity_sz){
+				if (diff < 0) itvNum = -itvNum;
+				coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+				last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+				//guarantee compression error bound against the case of machine-epsilon
+				if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+					coeff_type[e][coeff_index] = 0;
+					last_coeffcients[e] = cur_coeff;	
+					coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+				}					
+			}
+			else{
+				coeff_type[e][coeff_index] = 0;
+				last_coeffcients[e] = cur_coeff;
+				coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+			}
+			reg_params_separte[e][i] = last_coeffcients[e];
+		}
+		coeff_index ++;
+	}
+	// pred & quantization
+	int * blockwise_unpred_count = (int *) malloc(num_blocks * sizeof(int));
+	int * blockwise_unpred_count_pos = blockwise_unpred_count;
+	reg_params_pos = reg_params;
+	indicator_pos = indicator;
+	if(use_mean){
+		int intvCapacity_sz = intvCapacity - 2;
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					// add 1 in x, y, z offset
+					pred_buffer_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+					block_data_pos_x = data_pos;
+					for(int ii=0; ii<block_size; ii++){
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								*pred_buffer_pos = *block_data_pos_z;
+								if(k*block_size + kk + 1< r3) block_data_pos_z ++;
+								pred_buffer_pos ++;
+							}
+							// add 1 in z offset
+							pred_buffer_pos ++;
+							if(j*block_size + jj + 1< r2) block_data_pos_y += dim1_offset;
+						}
+						// add 1 in y offset
+						pred_buffer_pos += pred_buffer_block_size;
+						if(i*block_size + ii + 1< r1) block_data_pos_x += dim0_offset;
+					}
+					if(!(indicator_pos[k])){
+						float curData;
+						float pred;
+						double itvNum;
+						double diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									curData = *cur_data_pos;
+									pred = reg_params_pos[0] * ii + reg_params_pos[params_offset_b] * jj + reg_params_pos[params_offset_c] * kk + reg_params_pos[params_offset_d];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//guarantee compression error bound against the case of machine-epsilon
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos ++;
+							}
+							cur_data_pos += pred_buffer_block_size;
+						}
+						reg_params_pos ++;
+						total_unpred += block_unpredictable_count;
+						unpredictable_data += block_unpredictable_count;
+						*blockwise_unpred_count_pos = block_unpredictable_count;
+					}
+					else{
+						// use SZ
+						// SZ prediction
+						unpredictable_count = 0;
+						float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+						float curData;
+						float pred3D;
+						double itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+
+									curData = *cur_data_pos;
+									if(fabs(curData - mean) <= realPrecision){
+										type[index] = 1;
+										*cur_data_pos = mean;
+									}
+									else
+									{
+										pred3D = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1]
+												 - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+										diff = curData - pred3D;
+										itvNum = fabs(diff)/realPrecision + 1;
+										if (itvNum < intvCapacity_sz){
+											if (diff < 0) itvNum = -itvNum;
+											type[index] = (int) (itvNum/2) + intvRadius;
+											*cur_data_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+											//guarantee compression error bound against the case of machine-epsilon
+											if(fabs(curData - *cur_data_pos)>tmp_realPrecision){	
+												type[index] = 0;
+												*cur_data_pos = curData;	
+												unpredictable_data[unpredictable_count ++] = curData;
+											}					
+										}
+										else{
+											type[index] = 0;
+											*cur_data_pos = curData;
+											unpredictable_data[unpredictable_count ++] = curData;
+										}
+									}
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos ++;
+							}
+							cur_data_pos += pred_buffer_block_size;
+						}
+						total_unpred += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						*blockwise_unpred_count_pos = unpredictable_count;
+					}// end SZ
+					blockwise_unpred_count_pos ++;
+					type += block_size * block_size * block_size;
+				} // end k
+				indicator_pos += num_z;
+			}// end j
+		}// end i
+	}
+	else{
+		int intvCapacity_sz = intvCapacity - 2;
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					// add 1 in x, y, z offset
+					pred_buffer_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+					block_data_pos_x = data_pos;
+					for(int ii=0; ii<block_size; ii++){
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								*pred_buffer_pos = *block_data_pos_z;
+								if(k*block_size + kk +1< r3) block_data_pos_z ++;
+								pred_buffer_pos ++;
+							}
+							// add 1 in z offset
+							pred_buffer_pos ++;
+							if(j*block_size + jj +1< r2) block_data_pos_y += dim1_offset;
+						}
+						// add 1 in y offset
+						pred_buffer_pos += pred_buffer_block_size;
+						if(i*block_size + ii +1< r1) block_data_pos_x += dim0_offset;
+					}
+					if(!(indicator_pos[k]))
+					{
+						float curData;
+						float pred;
+						double itvNum;
+						double diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									curData = *cur_data_pos;
+									pred = reg_params_pos[0] * ii + reg_params_pos[params_offset_b] * jj + reg_params_pos[params_offset_c] * kk + reg_params_pos[params_offset_d];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//ganrantee comporession error against the case of machine-epsilon
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos ++;
+							}
+							cur_data_pos += pred_buffer_block_size;
+						}
+						reg_params_pos ++;
+						total_unpred += block_unpredictable_count;
+						unpredictable_data += block_unpredictable_count;						
+						*blockwise_unpred_count_pos = block_unpredictable_count;
+					}
+					else{
+						// use SZ
+						// SZ predication
+						unpredictable_count = 0;
+						float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+						float curData;
+						float pred3D;
+						double itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									curData = *cur_data_pos;
+									pred3D = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1]
+											 - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+									diff = curData - pred3D;
+									itvNum = fabs(diff)/realPrecision + 1;
+									if (itvNum < intvCapacity_sz){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										*cur_data_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//ganrantee comporession error against the case of machine-epsilon
+										if(fabs(curData - *cur_data_pos)>tmp_realPrecision){	
+											type[index] = 0;
+											*cur_data_pos = curData;	
+											unpredictable_data[unpredictable_count ++] = curData;
+										}					
+									}
+									else{
+										type[index] = 0;
+										*cur_data_pos = curData;
+										unpredictable_data[unpredictable_count ++] = curData;
+									}
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos ++;
+							}
+							cur_data_pos += pred_buffer_block_size;
+						}
+						total_unpred += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						*blockwise_unpred_count_pos = unpredictable_count;
+					}// end SZ	
+					blockwise_unpred_count_pos ++;
+					type += block_size * block_size * block_size;
+				}
+				indicator_pos += num_z;
+			}
+		}
+	}	
+
+	free(pred_buffer);
+	int stateNum = 2*quantization_intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	size_t nodeCount = 0;
+	init(huffmanTree, result_type, num_blocks*max_num_block_elements);
+	size_t i = 0;
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	// total size 										metadata		  # elements     real precision		intervals	nodeCount		huffman 	 	block index 						unpredicatable count						mean 					 	unpred size 				elements
+	unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + 5*treeByteSize + 4*num_blocks*sizeof(int)+num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(float) + total_unpred * sizeof(float) + num_elements * sizeof(int), 1);
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	
+	result_pos += meta_data_offset;
+	
+	sizeToBytes(result_pos,num_elements); //SZ_SIZE_TYPE: 4 or 8
+	result_pos += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(result_pos, block_size);
+	result_pos += sizeof(int);
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += sizeof(double);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += sizeof(int);
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+	free(treeBytes);
+
+	memcpy(result_pos, &use_mean, sizeof(unsigned char));
+	result_pos += sizeof(unsigned char);
+	memcpy(result_pos, &mean, sizeof(float));
+	result_pos += sizeof(float);
+	size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+	result_pos += indicator_size;
+	
+	//convert the lead/mid/resi to byte stream
+	if(reg_count > 0){
+		for(int e=0; e<4; e++){
+			int stateNum = 2*coeff_intvCapacity_sz;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+			size_t nodeCount = 0;
+			init(huffmanTree, coeff_type[e], reg_count);
+			size_t i = 0;
+			for (i = 0; i < huffmanTree->stateNum; i++)
+				if (huffmanTree->code[i]) nodeCount++; 
+			nodeCount = nodeCount*2-1;
+			unsigned char *treeBytes;
+			unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+			doubleToBytes(result_pos, precision[e]);
+			result_pos += sizeof(double);
+			intToBytes_bigEndian(result_pos, coeff_intvRadius);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, treeByteSize);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, nodeCount);
+			result_pos += sizeof(int);
+			memcpy(result_pos, treeBytes, treeByteSize);		
+			result_pos += treeByteSize;
+			free(treeBytes);
+			size_t typeArray_size = 0;
+			encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+			sizeToBytes(result_pos, typeArray_size);
+			result_pos += sizeof(size_t) + typeArray_size;
+			intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+			result_pos += sizeof(int);
+			memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(float));
+			result_pos += coeff_unpredictable_count[e]*sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	free(coeff_result_type);
+	free(coeff_unpredictable_data);
+	
+	//record the number of unpredictable data and also store them
+	memcpy(result_pos, &total_unpred, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	// record blockwise unpred data
+	size_t compressed_blockwise_unpred_count_size;
+	unsigned char * compressed_bw_unpred_count = SZ_compress_args(SZ_INT32, blockwise_unpred_count, &compressed_blockwise_unpred_count_size, ABS, 0.5, 0, 0, 0, 0, 0, 0, num_blocks);
+	memcpy(result_pos, &compressed_blockwise_unpred_count_size, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, compressed_bw_unpred_count, compressed_blockwise_unpred_count_size);
+	result_pos += compressed_blockwise_unpred_count_size;
+	free(blockwise_unpred_count);
+	free(compressed_bw_unpred_count);
+	memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(float));
+	result_pos += total_unpred * sizeof(float);
+
+	free(reg_params);
+	free(indicator);
+	free(result_unpredictable_data);
+	// encode type array by block
+	type = result_type;
+	size_t total_type_array_size = 0;
+	unsigned char * type_array_buffer = (unsigned char *) malloc(num_blocks*max_num_block_elements*sizeof(int));
+	unsigned short * type_array_block_size = (unsigned short *) malloc(num_blocks*sizeof(unsigned short));
+	unsigned char * type_array_buffer_pos = type_array_buffer;
+	unsigned short * type_array_block_size_pos = type_array_block_size;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			for(size_t k=0; k<num_z; k++){	
+				size_t typeArray_size = 0;
+				encode(huffmanTree, type, max_num_block_elements, type_array_buffer_pos, &typeArray_size);
+				total_type_array_size += typeArray_size;
+				*type_array_block_size_pos = typeArray_size;
+				type_array_buffer_pos += typeArray_size;
+				type += max_num_block_elements;
+				type_array_block_size_pos ++;
+			}
+		}
+	}
+	size_t compressed_type_array_block_size;
+	unsigned char * compressed_type_array_block = SZ_compress_args(SZ_UINT16, type_array_block_size, &compressed_type_array_block_size, ABS, 0.5, 0, 0, 0, 0, 0, 0, num_blocks);
+	memcpy(result_pos, &compressed_type_array_block_size, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, compressed_type_array_block, compressed_type_array_block_size);
+	result_pos += compressed_type_array_block_size;
+	memcpy(result_pos, type_array_buffer, total_type_array_size);
+	result_pos += total_type_array_size;
+	// size_t typeArray_size = 0;
+	// encode(huffmanTree, result_type, num_blocks*max_num_block_elements, result_pos, &typeArray_size);
+	// result_pos += typeArray_size;
+
+	free(compressed_type_array_block);
+	free(type_array_buffer);
+	free(type_array_block_size);
+	size_t totalEncodeSize = result_pos - result;
+	free(result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+	*comp_size = totalEncodeSize;
+	return result;
+}
diff --git a/deps/SZ/sz/src/sz_float_pwr.c b/deps/SZ/sz/src/sz_float_pwr.c
new file mode 100644
index 0000000000000000000000000000000000000000..3ff1e61e4f61c30cc9f5afdbdb7e3e7a992f66ca
--- /dev/null
+++ b/deps/SZ/sz/src/sz_float_pwr.c
@@ -0,0 +1,2079 @@
+/**
+ *  @file sz_float_pwr.c
+ *  @author Sheng Di, Dingwen Tao, Xin Liang, Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang
+ *  @date Aug, 2016
+ *  @brief SZ_Init, Compression and Decompression functions
+ * This file contains the compression/decompression functions related to point-wise relative errors
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageF.h"
+#include "sz_float.h"
+#include "sz_float_pwr.h"
+#include "zlib.h"
+#include "rw.h"
+#include "utility.h"
+
+/**
+ * Derive one point-wise-relative error bound per 1D segment.
+ *
+ * The input is scanned in runs of confparams_cpr->segment_size elements. For every run a
+ * statistic of the absolute values is gathered according to confparams_cpr->pwr_type
+ * (minimum or maximum of the non-zero values, or the average), multiplied by
+ * confparams_cpr->pw_relBoundRatio and, in the combined error-bound modes, merged with
+ * globalPrecision (the smaller of the two for *_AND_PW_REL, the larger for *_OR_PW_REL).
+ * Only the first two bytes written by floatToBytes() are kept; the bound stored in
+ * pwrErrBound is the float rebuilt from those two bytes with the other two set to 0.
+ *
+ * @param oriData          input values
+ * @param dataLength       number of elements in oriData; nothing is written when it is 0
+ * @param pwrErrBound      output, one truncated bound per segment
+ * @param pwrErrBoundBytes output, the two retained bytes of each bound (two bytes per segment)
+ * @param globalPrecision  global bound, only consulted in the combined error-bound modes
+ */
+void compute_segment_precisions_float_1D(float *oriData, size_t dataLength, float* pwrErrBound, unsigned char* pwrErrBoundBytes, double globalPrecision)
+{
+	size_t i = 0, j = 0, k = 0;
+	float realPrecision;
+	float approxPrecision;
+	unsigned char realPrecBytes[4];
+	float curPrecision;
+	float curValue;
+	float sum = 0;
+
+	//an empty input has no segment to describe; leave before oriData[0] is read
+	if(dataLength==0)
+		return;
+
+	//seed of the first segment: |ratio * first value|, or the bare ratio when the first value is 0
+	//NOTE(review): this seed is already multiplied by the ratio while later segments are seeded with
+	//the raw |value|, so in MIN mode the first segment can end up scaled twice - confirm the intent
+	realPrecision = oriData[0]!=0?fabs(confparams_cpr->pw_relBoundRatio*oriData[0]):confparams_cpr->pw_relBoundRatio;
+	for(i=0;i<dataLength;i++)
+	{
+		curValue = oriData[i];
+		if(i%confparams_cpr->segment_size==0&&i>0)
+		{
+			//a new segment starts at i: finalise and store the bound of the segment that just ended
+			if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+			{
+				realPrecision = sum/confparams_cpr->segment_size;
+				sum = 0;			
+			}
+			realPrecision *= confparams_cpr->pw_relBoundRatio;
+			
+			if(confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+				realPrecision = realPrecision<globalPrecision?realPrecision:globalPrecision; 
+			else if(confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+				realPrecision = realPrecision<globalPrecision?globalPrecision:realPrecision;
+				
+			//keep only the first two bytes of the bound and rebuild the float they represent
+			floatToBytes(realPrecBytes, realPrecision);
+			realPrecBytes[2] = realPrecBytes[3] = 0;
+			approxPrecision = bytesToFloat(realPrecBytes);
+			//put the truncated bound in float* pwrErrBound
+			pwrErrBound[j++] = approxPrecision;
+			//put the two bytes in pwrErrBoundBytes
+			pwrErrBoundBytes[k++] = realPrecBytes[0];
+			pwrErrBoundBytes[k++] = realPrecBytes[1];
+			
+			//seed the statistic of the new segment with its first value
+			realPrecision = fabs(curValue);
+		}
+		
+		//zeros never take part in the min/max/sum statistic
+		if(curValue!=0)
+		{
+			curPrecision = fabs(curValue);
+			
+			switch(confparams_cpr->pwr_type)
+			{
+			case SZ_PWR_MIN_TYPE: 
+				if(realPrecision>curPrecision)
+					realPrecision = curPrecision;	
+				break;
+			case SZ_PWR_AVG_TYPE:
+				sum += curPrecision;
+				break;
+			case SZ_PWR_MAX_TYPE:
+				if(realPrecision<curPrecision)
+					realPrecision = curPrecision;					
+				break;
+			}
+		}
+	}
+	//the last segment is never followed by a boundary inside the loop, so it is finalised here
+	if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+	{
+		//the trailing segment may be shorter than segment_size
+		int size = dataLength%confparams_cpr->segment_size==0?confparams_cpr->segment_size:dataLength%confparams_cpr->segment_size;
+		realPrecision = sum/size;		
+	}	
+	//scale the statistic by the relative ratio exactly as the in-loop path does; without this
+	//the last segment would be bounded by the raw magnitude instead of ratio * magnitude
+	realPrecision *= confparams_cpr->pw_relBoundRatio;
+	if(confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+		realPrecision = realPrecision<globalPrecision?realPrecision:globalPrecision; 
+	else if(confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+		realPrecision = realPrecision<globalPrecision?globalPrecision:realPrecision;	
+	floatToBytes(realPrecBytes, realPrecision);
+	realPrecBytes[2] = realPrecBytes[3] = 0;
+	approxPrecision = bytesToFloat(realPrecBytes);
+	//put the truncated bound in float* pwrErrBound
+	pwrErrBound[j++] = approxPrecision;
+	//put the two bytes in pwrErrBoundBytes
+	pwrErrBoundBytes[k++] = realPrecBytes[0];
+	pwrErrBoundBytes[k++] = realPrecBytes[1];
+}
+
+/**
+ * Estimate how many quantization intervals the 1D point-wise-relative compressor should use.
+ *
+ * Every element whose index (from 2 on) is a multiple of confparams_cpr->sampleDistance is
+ * predicted by its predecessor. The prediction error is rounded to the nearest multiple of
+ * twice the error bound of the segment the element lies in, and that multiple is counted in
+ * a histogram with confparams_cpr->maxRangeRadius slots (larger values go to the last slot).
+ * The first radius at which the cumulative count exceeds predThreshold * (dataLength /
+ * sampleDistance) determines the result.
+ *
+ * @param oriData     input values
+ * @param dataLength  number of elements in oriData
+ * @param pwrErrBound per-segment error bounds, consumed in order each time the index reaches a
+ *                    multiple of confparams_cpr->segment_size (pwrErrBound[0] is always read)
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), but never less than 32
+ */
+unsigned int optimize_intervals_float_1D_pwr(float *oriData, size_t dataLength, float* pwrErrBound)
+{	
+	size_t i = 0, j = 0;
+	float realPrecision = pwrErrBound[j++];	
+	unsigned long radiusIndex;
+	float pred_value = 0, pred_err, radius;
+	int *intervals = (int*)malloc(confparams_cpr->maxRangeRadius*sizeof(int));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(int));
+	int totalSampleSize = dataLength/confparams_cpr->sampleDistance;
+	for(i=2;i<dataLength;i++)
+	{
+		//entering the next segment: switch to its error bound
+		if(i%confparams_cpr->segment_size==0)
+			realPrecision = pwrErrBound[j++];
+		if(i%confparams_cpr->sampleDistance==0)
+		{
+			//pred_value = 2*oriData[i-1] - oriData[i-2];
+			pred_value = oriData[i-1];
+			pred_err = fabs(pred_value - oriData[i]);
+			radius = (pred_err/realPrecision+1)/2;
+			//a zero bound turns the quotient into inf or NaN, and converting those (or any value
+			//beyond the range of unsigned long) to an integer is undefined; the negated comparison
+			//routes all of them to the last slot before any conversion takes place
+			if(!((double)radius<(double)confparams_cpr->maxRangeRadius))
+				radiusIndex = confparams_cpr->maxRangeRadius - 1;
+			else
+				radiusIndex = (unsigned long)radius;
+			intervals[radiusIndex]++;
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	//the threshold was never exceeded: fall back to the largest radius
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	
+	if(powerOf2<32)
+		powerOf2 = 32;
+	
+	free(intervals);
+	//printf("accIntervals=%d, powerOf2=%d\n", accIntervals, powerOf2);
+	return powerOf2;
+}
+
+/**
+ * Derive one point-wise-relative error bound per edgeSize x edgeSize block of a 2D array.
+ *
+ * oriData is read row by row (element (i,j) is oriData[i*r2+j]). statAbsValues keeps one
+ * running statistic per block column: the minimum or maximum of the non-zero absolute
+ * values, or their sum for SZ_PWR_AVG_TYPE. Whenever a block is complete its statistic is
+ * multiplied by confparams_cpr->pw_relBoundRatio (and divided by a*b for the average type),
+ * merged with globalPrecision in the combined error-bound modes, truncated to the first two
+ * bytes written by floatToBytes() and appended to both output arrays.
+ *
+ * @param oriData          input values, r1 rows of r2 elements
+ * @param pwrErrBound      output, one truncated bound per block, appended in completion order
+ * @param r1               number of rows
+ * @param r2               number of columns
+ * @param R2               number of per-block-column accumulators that are allocated
+ *                         (presumably the number of block columns - confirm with the caller)
+ * @param edgeSize         edge length of a block
+ * @param pwrErrBoundBytes output, the two retained bytes of each bound (two bytes per block)
+ * @param Min              used together with Max to seed the MIN/MAX accumulators
+ * @param Max              see Min
+ * @param globalPrecision  global bound, only consulted in the combined error-bound modes
+ */
+void compute_segment_precisions_float_2D(float *oriData, float* pwrErrBound, 
+size_t r1, size_t r2, size_t R2, size_t edgeSize, unsigned char* pwrErrBoundBytes, float Min, float Max, double globalPrecision)
+{
+	size_t i = 0, j = 0, k = 0, p = 0, index = 0, J = 0; //I=-1,J=-1 if they are needed
+	float realPrecision; 
+	float approxPrecision;
+	unsigned char realPrecBytes[4];
+	float curValue, curAbsValue;
+	float* statAbsValues = (float*)malloc(R2*sizeof(float));
+	
+	float max = fabs(Min)<fabs(Max)?fabs(Max):fabs(Min); //get the max abs value.
+	float min = fabs(Min)<fabs(Max)?fabs(Min):fabs(Max);
+	//seed every accumulator: MIN starts from the largest magnitude, MAX from the smallest, AVG from 0
+	for(i=0;i<R2;i++)
+	{
+		if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+			statAbsValues[i] = max;
+		else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+			statAbsValues[i] = min;
+		else
+			statAbsValues[i] = 0; //for SZ_PWR_AVG_TYPE
+	}
+	for(i=0;i<r1;i++)
+	{
+		for(j=0;j<r2;j++)
+		{
+			index = i*r2+j;
+			curValue = oriData[index];				
+			//flush point, evaluated before J is advanced so that J still names the finished block:
+			// - on the last row of a block row (or the very last row), when j enters a new block column
+			// - on the first element of a new block row, for the last block column of the previous block row
+			if(((i%edgeSize==edgeSize-1 || i==r1-1) &&j%edgeSize==0&&j>0) || (i%edgeSize==0&&j==0&&i>0))
+			{
+				if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+				{
+					//a x b is the element count used to turn the accumulated sum into an average
+					int a = edgeSize, b = edgeSize;
+					if(j==0)
+					{
+						//the block being flushed is the last block column, which may be narrower
+						if(r2%edgeSize==0) 
+							b = edgeSize;
+						else
+							b = r2%edgeSize;
+					}
+					//NOTE(review): when the wrap-around flush (i%edgeSize==0, j==0) happens on the very last row,
+					//a is taken from r1%edgeSize although the flushed block belongs to the previous block row - verify
+					if(i==r1-1)
+					{
+						if(r1%edgeSize==0)
+							a = edgeSize;
+						else
+							a = r1%edgeSize;
+					}
+					realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J]/(a*b);
+				}
+				else
+					realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J];
+
+				//combined modes: *_AND_* keeps the smaller bound, *_OR_* keeps the larger one
+				if(confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+					realPrecision = realPrecision<globalPrecision?realPrecision:globalPrecision; 
+				else if(confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+					realPrecision = realPrecision<globalPrecision?globalPrecision:realPrecision;
+					
+				//keep only the first two bytes of the bound and rebuild the float they represent
+				floatToBytes(realPrecBytes, realPrecision);
+				realPrecBytes[2] = realPrecBytes[3] = 0;
+				approxPrecision = bytesToFloat(realPrecBytes);
+				//put the truncated bound in float* pwrErrBound
+				pwrErrBound[p++] = approxPrecision;
+				//put the two bytes in pwrErrBoundBytes
+				pwrErrBoundBytes[k++] = realPrecBytes[0];
+				pwrErrBoundBytes[k++] = realPrecBytes[1];	
+				
+				//re-seed the accumulator so the same slot can serve the next block row
+				if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+					statAbsValues[J] = max;
+				else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+					statAbsValues[J] = min;
+				else
+					statAbsValues[J] = 0; //for SZ_PWR_AVG_TYPE	
+			}	
+			//track the block column of the current element
+			if(j==0)
+				J = 0;
+			else if(j%edgeSize==0)
+				J++;			
+			//zeros never take part in the min/max/sum statistic
+			if(curValue!=0)
+			{
+				curAbsValue = fabs(curValue);
+				
+				switch(confparams_cpr->pwr_type)
+				{
+				case SZ_PWR_MIN_TYPE: 
+					if(statAbsValues[J]>curAbsValue)
+						statAbsValues[J] = curAbsValue;	
+					break;
+				case SZ_PWR_AVG_TYPE:
+					statAbsValues[J] += curAbsValue;
+					break;
+				case SZ_PWR_MAX_TYPE:
+					if(statAbsValues[J]<curAbsValue)
+						statAbsValues[J] = curAbsValue;					
+					break;
+				}
+			}
+		}
+	}
+		
+	//the very last block (last block row, last block column) is never flushed inside the loop
+	if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+	{
+		//both edges of this block may be shorter than edgeSize
+		int a = edgeSize, b = edgeSize;
+		if(r2%edgeSize==0) 
+			b = edgeSize;
+		else
+			b = r2%edgeSize;
+		if(r1%edgeSize==0)
+			a = edgeSize;
+		else
+			a = r1%edgeSize;
+		realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J]/(a*b);
+	}
+	else
+		realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J];		
+
+	if(confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+		realPrecision = realPrecision<globalPrecision?realPrecision:globalPrecision; 
+	else if(confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+		realPrecision = realPrecision<globalPrecision?globalPrecision:realPrecision;
+		
+	floatToBytes(realPrecBytes, realPrecision);
+	realPrecBytes[2] = realPrecBytes[3] = 0;
+	approxPrecision = bytesToFloat(realPrecBytes);
+	//put the truncated bound in float* pwrErrBound
+	pwrErrBound[p++] = approxPrecision;
+	//put the two bytes in pwrErrBoundBytes
+	pwrErrBoundBytes[k++] = realPrecBytes[0];
+	pwrErrBoundBytes[k++] = realPrecBytes[1];	
+	
+	free(statAbsValues);
+}
+
+/**
+ * Estimate how many quantization intervals the 2D point-wise-relative compressor should use.
+ *
+ * Elements (i,j) with i>=1, j>=1 and (i+j) a multiple of confparams_cpr->sampleDistance are
+ * predicted from their left, upper and upper-left neighbours. The prediction error is rounded
+ * to the nearest multiple of twice the error bound of the block containing the element, and
+ * that multiple is counted in a histogram with confparams_cpr->maxRangeRadius slots (larger
+ * values go to the last slot). The first radius at which the cumulative count exceeds
+ * predThreshold * (r1-1)*(r2-1)/sampleDistance determines the result.
+ *
+ * @param oriData     input values, r1 rows of r2 elements
+ * @param r1          number of rows
+ * @param r2          number of columns
+ * @param R2          number of bounds stored per block row in pwrErrBound
+ * @param edgeSize    edge length of a block
+ * @param pwrErrBound per-block bounds; the bound of element (i,j) is read from
+ *                    pwrErrBound[(i/edgeSize)*R2 + j/edgeSize]
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), but never less than 32
+ */
+unsigned int optimize_intervals_float_2D_pwr(float *oriData, size_t r1, size_t r2, size_t R2, size_t edgeSize, float* pwrErrBound)
+{	
+	size_t i = 0,j = 0, index;
+	float realPrecision = pwrErrBound[0];	
+	unsigned long radiusIndex;
+	float pred_value = 0, pred_err, radius;
+	int *intervals = (int*)malloc(confparams_cpr->maxRangeRadius*sizeof(int));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(int));
+	size_t totalSampleSize = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+	size_t ir2;
+	for(i=1;i<r1;i++)
+	{
+		ir2 = i*r2;
+		for(j=1;j<r2;j++)
+		{
+			index = ir2+j;
+				
+			if((i+j)%confparams_cpr->sampleDistance==0)
+			{
+				//the block coordinates are derived from the element coordinates; a running block-column
+				//counter that is not reset on every row would drift to wrong (and out-of-range) entries
+				realPrecision = pwrErrBound[(i/edgeSize)*R2+j/edgeSize];
+				pred_value = oriData[index-1] + oriData[index-r2] - oriData[index-r2-1];
+				pred_err = fabs(pred_value - oriData[index]);
+				radius = (pred_err/realPrecision+1)/2;
+				//a zero bound turns the quotient into inf or NaN, and converting those (or any value
+				//beyond the range of unsigned long) to an integer is undefined; the negated comparison
+				//routes all of them to the last slot before any conversion takes place
+				if(!((double)radius<(double)confparams_cpr->maxRangeRadius))
+					radiusIndex = confparams_cpr->maxRangeRadius - 1;
+				else
+					radiusIndex = (unsigned long)radius;
+				intervals[radiusIndex]++;
+			}			
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	//the threshold was never exceeded: fall back to the largest radius
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	//printf("confparams_cpr->maxRangeRadius = %d, accIntervals=%d, powerOf2=%d\n", confparams_cpr->maxRangeRadius, accIntervals, powerOf2);
+	return powerOf2;
+}
+
+/**
+ * Derive one point-wise-relative error bound per edgeSize^3 block of a 3D array.
+ *
+ * oriData is read in storage order (element (i,j,k) is oriData[i*r2*r3 + j*r3 + k]).
+ * statAbsValues[J][K] keeps the running minimum or maximum of the non-zero absolute values of
+ * the block in block column J / block depth K of the current block slab. Whenever a block is
+ * complete its statistic is multiplied by confparams_cpr->pw_relBoundRatio, truncated to the
+ * first two bytes written by floatToBytes() and appended to both output arrays.
+ *
+ * NOTE(review): globalPrecision is not referenced in this function, so the combined
+ * error-bound modes handled by the 1D/2D variants have no effect here - confirm intent.
+ * NOTE(review): only SZ_PWR_MIN_TYPE and SZ_PWR_MAX_TYPE update the accumulators; for any
+ * other pwr_type they stay at their initial 0 and every emitted bound is 0 - confirm intent.
+ *
+ * @param oriData          input values, r1 x r2 x r3 elements
+ * @param pwrErrBound      output, one truncated bound per block, appended in completion order
+ * @param r1               size of the first (slowest) dimension
+ * @param r2               size of the second dimension
+ * @param r3               size of the third (fastest) dimension
+ * @param R2               first extent of the accumulator table
+ *                         (presumably the block count along the second dimension - confirm)
+ * @param R3               second extent of the accumulator table
+ *                         (presumably the block count along the third dimension - confirm)
+ * @param edgeSize         edge length of a block
+ * @param pwrErrBoundBytes output, the two retained bytes of each bound (two bytes per block)
+ * @param Min              used together with Max to seed the MIN/MAX accumulators
+ * @param Max              see Min
+ * @param globalPrecision  unused
+ */
+void compute_segment_precisions_float_3D(float *oriData, float* pwrErrBound, 
+size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, unsigned char* pwrErrBoundBytes, float Min, float Max, double globalPrecision)
+{
+	size_t i = 0, j = 0, k = 0, p = 0, q = 0, index = 0, J = 0, K = 0; //I=-1,J=-1 if they are needed
+	size_t r23 = r2*r3, ir, jr;
+	float realPrecision; 
+	float approxPrecision;
+	unsigned char realPrecBytes[4];
+	float curValue, curAbsValue;
+	
+	float** statAbsValues = create2DArray_float(R2, R3);
+	float max = fabs(Min)<fabs(Max)?fabs(Max):fabs(Min); //get the max abs value.	
+	float min = fabs(Min)<fabs(Max)?fabs(Min):fabs(Max);
+	
+	//seed every accumulator: MIN starts from the largest magnitude, MAX from the smallest, otherwise 0
+	for(i=0;i<R2;i++)
+		for(j=0;j<R3;j++)
+		{
+			if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+				statAbsValues[i][j] = max;
+			else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+				statAbsValues[i][j] = min;
+			else
+				statAbsValues[i][j] = 0;
+		}
+	for(i=0;i<r1;i++)
+	{
+		ir = i*r23;		
+		//first plane of a new block slab: J and K still name the last block visited in the
+		//previous slab, which is now complete, so flush it
+		if(i%edgeSize==0&&i>0)
+		{
+			realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+			floatToBytes(realPrecBytes, realPrecision);
+			memset(&realPrecBytes[2], 0, 2);
+			approxPrecision = bytesToFloat(realPrecBytes);
+			//put the truncated bound in float* pwrErrBound
+			pwrErrBound[p++] = approxPrecision;
+			//put the two bytes in pwrErrBoundBytes
+			//printf("q=%d, i=%d, j=%d, k=%d\n",q,i,j,k);
+			pwrErrBoundBytes[q++] = realPrecBytes[0];
+			pwrErrBoundBytes[q++] = realPrecBytes[1];
+			//re-seed the slot for the next slab
+			if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+				statAbsValues[J][K] = max;
+			else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+				statAbsValues[J][K] = min;
+			
+		}		
+		for(j=0;j<r2;j++)
+		{
+			jr = j*r3;
+			//last plane of the slab and j enters a new block column: J (not yet advanced) and K
+			//(left over from the previous row) name the last block of the column just left
+			if((i%edgeSize==edgeSize-1 || i == r1-1)&&j%edgeSize==0&&j>0)
+			{
+				realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+				floatToBytes(realPrecBytes, realPrecision);
+				memset(&realPrecBytes[2], 0, 2);
+				approxPrecision = bytesToFloat(realPrecBytes);
+				//put the truncated bound in float* pwrErrBound
+				pwrErrBound[p++] = approxPrecision;
+				//put the two bytes in pwrErrBoundBytes
+				//printf("q=%d, i=%d, j=%d, k=%d\n",q,i,j,k);
+				pwrErrBoundBytes[q++] = realPrecBytes[0];
+				pwrErrBoundBytes[q++] = realPrecBytes[1];
+				if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+					statAbsValues[J][K] = max;
+				else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+					statAbsValues[J][K] = min;			
+			}
+			
+			//track the block column of the current row
+			if(j==0)
+				J = 0;
+			else if(j%edgeSize==0)
+				J++;					
+			
+			for(k=0;k<r3;k++)
+			{
+				index = ir+jr+k;				
+				curValue = oriData[index];				
+				//last plane of the slab, last row of the block column, and k enters a new block:
+				//K (not yet advanced) names the block that has just been completed
+				if((i%edgeSize==edgeSize-1 || i == r1-1)&&(j%edgeSize==edgeSize-1||j==r2-1)&&k%edgeSize==0&&k>0)
+				{
+					realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+					floatToBytes(realPrecBytes, realPrecision);
+					memset(&realPrecBytes[2], 0, 2);
+					approxPrecision = bytesToFloat(realPrecBytes);
+					//put the truncated bound in float* pwrErrBound
+					pwrErrBound[p++] = approxPrecision;
+					//put the two bytes in pwrErrBoundBytes
+					//printf("q=%d, i=%d, j=%d, k=%d\n",q,i,j,k);
+					pwrErrBoundBytes[q++] = realPrecBytes[0];
+					pwrErrBoundBytes[q++] = realPrecBytes[1];
+					
+					if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+						statAbsValues[J][K] = max;
+					else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+						statAbsValues[J][K] = min;	
+				}	
+
+				//track the block depth of the current element
+				if(k==0)
+					K = 0;
+				else if(k%edgeSize==0)
+					K++;
+					
+				//zeros never take part in the min/max statistic
+				if(curValue!=0)
+				{
+					curAbsValue = fabs(curValue);
+					if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+					{
+						if(statAbsValues[J][K]>curAbsValue)
+						{
+							statAbsValues[J][K] = curAbsValue;
+						}
+					}
+					else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+					{
+						if(statAbsValues[J][K]<curAbsValue)
+						{
+							statAbsValues[J][K] = curAbsValue;
+						}
+					}
+				}
+			}			
+		}
+	}	
+	
+	//the very last block is never flushed inside the loops
+	realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+	floatToBytes(realPrecBytes, realPrecision);
+	realPrecBytes[2] = realPrecBytes[3] = 0;
+	approxPrecision = bytesToFloat(realPrecBytes);
+	//put the truncated bound in float* pwrErrBound
+	pwrErrBound[p++] = approxPrecision;
+	//put the two bytes in pwrErrBoundBytes
+	pwrErrBoundBytes[q++] = realPrecBytes[0];
+	pwrErrBoundBytes[q++] = realPrecBytes[1];
+	
+	free2DArray_float(statAbsValues, R2);
+}
+
+/**
+ * Estimate how many quantization intervals the 3D point-wise-relative compressor should use.
+ *
+ * Elements (i,j,k) with i,j,k>=1 and (i+j+k) a multiple of confparams_cpr->sampleDistance are
+ * predicted from their seven preceding neighbours. The prediction error is rounded to the
+ * nearest multiple of twice the error bound of the block containing the element, and that
+ * multiple is counted in a histogram with confparams_cpr->maxRangeRadius slots (larger values
+ * go to the last slot). The first radius at which the cumulative count exceeds
+ * predThreshold * (r1-1)*(r2-1)*(r3-1)/sampleDistance determines the result.
+ *
+ * @param oriData     input values, r1 x r2 x r3 elements with the third dimension fastest
+ * @param r1          size of the first dimension
+ * @param r2          size of the second dimension
+ * @param r3          size of the third dimension
+ * @param R2          number of blocks along the second dimension in pwrErrBound
+ * @param R3          number of blocks along the third dimension in pwrErrBound
+ * @param edgeSize    edge length of a block
+ * @param pwrErrBound per-block bounds; the bound of element (i,j,k) is read from
+ *                    pwrErrBound[(i/edgeSize)*R2*R3 + (j/edgeSize)*R3 + k/edgeSize]
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), but never less than 32
+ */
+unsigned int optimize_intervals_float_3D_pwr(float *oriData, size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, float* pwrErrBound)
+{	
+	size_t i,j,k, ir,jr,index;
+	float realPrecision = pwrErrBound[0];		
+	unsigned long radiusIndex;
+	size_t r23=r2*r3;
+	size_t R23 = R2*R3;
+	float pred_value = 0, pred_err, radius;
+	int *intervals = (int*)malloc(confparams_cpr->maxRangeRadius*sizeof(int));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(int));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		ir = i*r23;
+		for(j=1;j<r2;j++)
+		{
+			jr = j*r3;
+			for(k=1;k<r3;k++)
+			{
+				index = ir+jr+k;
+				if((i+j+k)%confparams_cpr->sampleDistance==0)
+				{
+					//the block coordinates are derived from the element coordinates: running counters that
+					//are not reset on every row drift to wrong (and out-of-range) entries, and with a slab
+					//stride of R2*R3 the stride of the second block coordinate has to be R3, not R2
+					realPrecision = pwrErrBound[(i/edgeSize)*R23+(j/edgeSize)*R3+k/edgeSize];					
+					pred_value = oriData[index-1] + oriData[index-r3] + oriData[index-r23] 
+					- oriData[index-1-r23] - oriData[index-r3-1] - oriData[index-r3-r23] + oriData[index-r3-r23-1];
+					pred_err = fabs(pred_value - oriData[index]);
+					radius = (pred_err/realPrecision+1)/2;
+					//a zero bound turns the quotient into inf or NaN, and converting those (or any value
+					//beyond the range of unsigned long) to an integer is undefined; the negated comparison
+					//routes all of them to the last slot before any conversion takes place
+					if(!((double)radius<(double)confparams_cpr->maxRangeRadius))
+						radiusIndex = confparams_cpr->maxRangeRadius - 1;
+					else
+						radiusIndex = (unsigned long)radius;
+					intervals[radiusIndex]++;
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	//the threshold was never exceeded: fall back to the largest radius
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+	
+	free(intervals);
+	//printf("accIntervals=%d, powerOf2=%d\n", accIntervals, powerOf2);
+	return powerOf2;
+}
+
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, 
+size_t dataLength, size_t *outSize, float min, float max)
+{
+	size_t pwrLength = dataLength%confparams_cpr->segment_size==0?dataLength/confparams_cpr->segment_size:dataLength/confparams_cpr->segment_size+1;
+	float* pwrErrBound = (float*)malloc(sizeof(float)*pwrLength);
+	size_t pwrErrBoundBytes_size = sizeof(unsigned char)*pwrLength*2;
+	unsigned char* pwrErrBoundBytes = (unsigned char*)malloc(pwrErrBoundBytes_size);
+	
+	compute_segment_precisions_float_1D(oriData, dataLength, pwrErrBound, pwrErrBoundBytes, globalPrecision);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_1D_pwr(oriData, dataLength, pwrErrBound);	
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i = 0, j = 0;
+	int reqLength;
+	float realPrecision = pwrErrBound[j++];	
+	float medianValue = 0;
+	float radius = fabs(max)<fabs(min)?fabs(min):fabs(max);
+	short radExpo = getExponent_float(radius);
+	
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;
+		
+	float* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *resiBitLengthArray;
+	new_DBA(&resiBitLengthArray, DynArrayInitLen);
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	type[0] = 0;
+	
+	unsigned char preDataBytes[4] = {0};
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	float last3CmprsData[3] = {0};
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+						
+	//add the first data	
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_float(last3CmprsData, vce->data);
+	//printf("%.30G\n",last3CmprsData[0]);	
+		
+	//add the second data
+	type[1] = 0;
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);			
+	compressSingleFloatValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_float(last3CmprsData, vce->data);
+	//printf("%.30G\n",last3CmprsData[0]);	
+	
+	int state;
+	double checkRadius;
+	float curData;
+	float pred;
+	double predAbsErr;
+	checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double interval = 2*realPrecision;
+	int updateReqLength = 0; //a marker: 1 means already updated
+	
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		if(i%confparams_cpr->segment_size==0)
+		{
+			realPrecision = pwrErrBound[j++];
+			checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+			interval = 2*realPrecision;
+			updateReqLength = 0;
+		}
+		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		pred = last3CmprsData[0];
+		predAbsErr = fabs(curData - pred);	
+		if(predAbsErr<checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			listAdd_float(last3CmprsData, pred);			
+			continue;
+		}
+		
+		//unpredictable data processing		
+		if(updateReqLength==0)
+		{
+			computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+			reqBytesLength = reqLength/8;
+			resiBitsLength = reqLength%8;
+			updateReqLength = 1;		
+		}
+		
+		type[i] = 0;
+		addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+		
+		compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+
+		listAdd_float(last3CmprsData, vce->data);	
+	}//end of for
+		
+//	char* expSegmentsInBytes;
+//	int expSegmentsInBytes_size = convertESCToBytes(esc, &expSegmentsInBytes);
+	int exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageF* tdps;
+			
+	new_TightDataPointStorageF2(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitLengthArray->array, resiBitLengthArray->size, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, pwrErrBoundBytes, pwrErrBoundBytes_size, radExpo);
+
+//sdi:Debug
+/*	int sum =0;
+	for(i=0;i<dataLength;i++)
+		if(type[i]==0) sum++;
+	printf("opt_quantizations=%d, exactDataNum=%d, sum=%d\n",quantization_intervals, exactDataNum, sum);
+*/
+//	writeUShortData(type, dataLength, "compressStateBytes.sb");
+//	unsigned short type_[dataLength];
+//	SZ_Reset();
+//	decode_withTree(tdps->typeArray, tdps->typeArray_size, type_);	
+//	printf("tdps->typeArray_size=%d\n", tdps->typeArray_size);
+	
+	//free memory
+	free_DBA(resiBitLengthArray);
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+	
+	int floatSize=sizeof(float);
+	if(*outSize>dataLength*floatSize)
+	{
+		size_t k = 0, i;
+		tdps->isLossless = 1;
+		size_t totalByteLength = 3 + exe_params->SZ_SIZE_TYPE + 1 + floatSize*dataLength;
+		*newByteData = (unsigned char*)malloc(totalByteLength);
+		
+		unsigned char dsLengthBytes[exe_params->SZ_SIZE_TYPE];
+		intToBytes_bigEndian(dsLengthBytes, dataLength);//4
+		for (i = 0; i < 3; i++)//3
+			(*newByteData)[k++] = versionNumber[i];
+		
+		if(exe_params->SZ_SIZE_TYPE==4)
+		{
+			(*newByteData)[k++] = 16;	//=00010000	
+		}
+		else 
+		{
+			(*newByteData)[k++] = 80;
+		}
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)//4 or 8
+			(*newByteData)[k++] = dsLengthBytes[i];
+
+		
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+			memcpy((*newByteData)+4+exe_params->SZ_SIZE_TYPE, oriData, dataLength*floatSize);
+		else
+		{
+			unsigned char* p = (*newByteData)+4+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=floatSize)
+				floatToBytes(p, oriData[i]);
+		}
+		*outSize = totalByteLength;
+	}
+
+	free(pwrErrBound);
+	
+	free(vce);
+	free(lce);
+	free_TightDataPointStorageF(tdps);
+	free(exactMidByteArray);
+}
+
+/*
+ * Compresses a 2D float array using per-block ("pwr") error bounds and emits
+ * the result as a flat byte stream.
+ *
+ * newByteData     - out: receives the buffer filled in by convertTDPStoFlatBytes_float()
+ * oriData         - input values, addressed as index = i*r2 + j (r2 is the fastest dimension)
+ * globalPrecision - forwarded unchanged to compute_segment_precisions_float_2D()
+ * r1, r2          - array dimensions
+ * outSize         - out: size filled in by convertTDPStoFlatBytes_float()
+ * min, max        - presumably the value range of oriData (TODO confirm with caller);
+ *                   the larger magnitude gives the radius exponent
+ *
+ * The data is scanned row by row. Every value is predicted from already
+ * reconstructed neighbours; when the prediction error fits into the
+ * quantization range the value is stored as an interval code in type[],
+ * otherwise type[] is 0 and the value is stored through
+ * compressSingleFloatValue()/addExactData().
+ *
+ * NOTE(review): elements 0 and 1 of row 0 are handled before any loop, so the
+ * code assumes r2 >= 2 (P1[1] is written unconditionally). Allocation results
+ * are not checked.
+ */
+void SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, size_t r1, size_t r2, 
+size_t *outSize, float min, float max)
+{
+	size_t dataLength=r1*r2;
+	int blockEdgeSize = computeBlockEdgeSize_2D(confparams_cpr->segment_size);
+	/* number of blocks along each dimension (ceiling division) */
+	size_t R1 = 1+(r1-1)/blockEdgeSize;
+	size_t R2 = 1+(r2-1)/blockEdgeSize;
+	/* one float precision per block, plus two bytes per block for its serialized form */
+	float* pwrErrBound = (float*)malloc(sizeof(float)*R1*R2);
+	size_t pwrErrBoundBytes_size = sizeof(unsigned char)*R1*R2*2;
+	unsigned char* pwrErrBoundBytes = (unsigned char*)malloc(pwrErrBoundBytes_size);
+	
+	compute_segment_precisions_float_2D(oriData, pwrErrBound, r1, r2, R2, blockEdgeSize, pwrErrBoundBytes, min, max, globalPrecision);
+		
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{	
+		quantization_intervals = optimize_intervals_float_2D_pwr(oriData, r1, r2, R2, blockEdgeSize, pwrErrBound);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	
+	/* i/j: element row/column; I/J: row/column of the block the element lies in */
+	size_t i=0,j=0,I=0,J=0; 
+	int reqLength;
+	float realPrecision = pwrErrBound[I*R2+J];	
+	float pred1D, pred2D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1;
+	
+	/* P1 holds the reconstructed previous row, P0 the row being produced */
+	P0 = (float*)malloc(r2*sizeof(float));
+	memset(P0, 0, r2*sizeof(float));
+	P1 = (float*)malloc(r2*sizeof(float));
+	memset(P1, 0, r2*sizeof(float));
+		
+	float medianValue = 0;
+	float radius = fabs(max)<fabs(min)?fabs(min):fabs(max);	
+	short radExpo = getExponent_float(radius);
+	/* 1 means reqLength is already up to date for the current realPrecision */
+	int updateReqLength = 1;
+	
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+		
+	float* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *resiBitLengthArray;
+	new_DBA(&resiBitLengthArray, DynArrayInitLen);
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	type[0] = 0;
+	
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+			
+	/* Process Row-0 data 0: always stored as an exact (unpredicted) value */
+	type[0] = 0;
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+
+	/* Process Row-0 data 1: predicted by the previous reconstructed value */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum =  fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		/* predictable: store the signed interval index shifted by intvRadius */
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{		
+		type[1] = 0;
+
+		addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+		compressSingleFloatValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+
+    /* Process Row-0 data 2 --> data r2-1: linear extrapolation from the two previous values */
+	for (j = 2; j < r2; j++)
+	{
+		if(j%blockEdgeSize==0)
+		{
+			/* entering the next block column: switch to its precision */
+			J++;
+			realPrecision = pwrErrBound[I*R2+J];
+			updateReqLength = 0;
+		}
+
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			/* recompute the required bit length lazily, once per precision change */
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}
+
+			type[j] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleFloatValue(vce, spaceFillingValue[j], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0: predicted by the value directly above it */
+		index = i*r2;
+		J = 0;
+		if(i%blockEdgeSize==0)
+			I++;
+		realPrecision = pwrErrBound[I*R2+J]; //J==0
+		updateReqLength = 0;
+		
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}
+			
+			type[index] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+									
+		/* Process row-i data 1 --> r2-1: left + above - above-left */
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			
+			if(j%blockEdgeSize==0)
+			{
+				J++;
+				realPrecision = pwrErrBound[I*R2+J];
+				updateReqLength = 0;
+			}
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}
+
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+		}
+
+		/* the row just produced becomes the "previous row" of the next iteration */
+		float *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	
+	/* P0 and P1 are two separate allocations that are only ever swapped, so
+	 * both must be released whatever r2 is (guarding on r2 would leak P0). */
+	free(P0);
+	free(P1);			
+	int exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageF* tdps;
+			
+	new_TightDataPointStorageF2(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitLengthArray->array, resiBitLengthArray->size, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, pwrErrBoundBytes, pwrErrBoundBytes_size, radExpo);
+	
+	//free memory
+	free_DBA(resiBitLengthArray);
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+	
+	free(pwrErrBound);
+
+	free(vce);
+	free(lce);
+	free_TightDataPointStorageF(tdps);	
+	/* only the wrapper struct is freed here; NOTE(review): presumably its byte
+	 * buffer was handed over to tdps and released with it - confirm. */
+	free(exactMidByteArray);
+}
+
+/*
+ * Compresses a 3D float array using per-block ("pwr") error bounds and emits
+ * the result as a flat byte stream.
+ *
+ * newByteData     - out: receives the buffer filled in by convertTDPStoFlatBytes_float()
+ * oriData         - input values, addressed as index = k*r2*r3 + i*r3 + j
+ *                   (r3 is the fastest dimension, r1 the slowest)
+ * globalPrecision - forwarded unchanged to compute_segment_precisions_float_3D()
+ * r1, r2, r3      - array dimensions
+ * outSize         - out: size filled in by convertTDPStoFlatBytes_float()
+ * min, max        - presumably the value range of oriData (TODO confirm with caller);
+ *                   the larger magnitude gives the radius exponent
+ *
+ * The data is scanned layer by layer (k), row by row (i), element by element
+ * (j). Every value is predicted from already reconstructed neighbours; when
+ * the prediction error fits into the quantization range the value is stored
+ * as an interval code in type[], otherwise type[] is 0 and the value is
+ * stored through compressSingleFloatValue()/addExactData().
+ *
+ * NOTE(review): elements 0 and 1 of the first row are handled before any
+ * loop, so the code assumes r3 >= 2 (P1[1] is written unconditionally).
+ * Allocation results are not checked.
+ */
+void SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, 
+size_t r1, size_t r2, size_t r3, size_t *outSize, float min, float max)
+{
+	size_t dataLength=r1*r2*r3;
+	
+	int blockEdgeSize = computeBlockEdgeSize_3D(confparams_cpr->segment_size);
+	/* number of blocks along each dimension (ceiling division) */
+	size_t R1 = 1+(r1-1)/blockEdgeSize;
+	size_t R2 = 1+(r2-1)/blockEdgeSize;
+	size_t R3 = 1+(r3-1)/blockEdgeSize;
+	/* one float precision per block, plus two bytes per block for its serialized form */
+	float* pwrErrBound = (float*)malloc(sizeof(float)*R1*R2*R3);
+	size_t pwrErrBoundBytes_size = sizeof(unsigned char)*R1*R2*R3*2;
+	unsigned char* pwrErrBoundBytes = (unsigned char*)malloc(pwrErrBoundBytes_size);	
+	
+	compute_segment_precisions_float_3D(oriData, pwrErrBound, r1, r2, r3, R2, R3, blockEdgeSize, pwrErrBoundBytes, min, max, globalPrecision);	
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_3D_pwr(oriData, r1, r2, r3, R2, R3, blockEdgeSize, pwrErrBound);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	/* k/i/j: element layer/row/column; K/I/J: the same coordinates in block units */
+	size_t i=0,j=0,k=0, I = 0, J = 0, K = 0;
+	int reqLength;
+	float realPrecision = pwrErrBound[0];		
+	float pred1D, pred2D, pred3D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1;
+
+	size_t r23 = r2*r3;
+	size_t R23 = R2*R3;
+	/* P1 holds the reconstructed previous layer, P0 the layer being produced */
+	P0 = (float*)malloc(r23*sizeof(float));
+	P1 = (float*)malloc(r23*sizeof(float));
+	float radius = fabs(max)<fabs(min)?fabs(min):fabs(max);
+	float medianValue = 0;
+	short radExpo = getExponent_float(radius);
+	/* 0 means reqLength must be recomputed before the next exact value is stored */
+	int updateReqLength = 0;
+	
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	float* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *resiBitLengthArray;
+	new_DBA(&resiBitLengthArray, DynArrayInitLen);
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	type[0] = 0;
+
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0: always stored as an exact (unpredicted) value */
+	type[0] = 0;
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+
+	/* Process Row-0 data 1: predicted by the previous reconstructed value */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum = fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		/* predictable: store the signed interval index shifted by intvRadius */
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		/* recompute the required bit length lazily, once per precision change */
+		if(updateReqLength==0)
+		{
+			computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+			reqBytesLength = reqLength/8;
+			resiBitsLength = reqLength%8;
+			updateReqLength = 1;
+		}		
+		
+		type[1] = 0;
+
+		addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+		compressSingleFloatValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+
+    /* Process Row-0 data 2 --> data r3-1: linear extrapolation from the two previous values */
+	for (j = 2; j < r3; j++)
+	{
+		if(j%blockEdgeSize==0)
+		{
+			/* entering the next block column: switch to its precision */
+			J++;
+			realPrecision = pwrErrBound[J];
+			updateReqLength = 0;
+		}		
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}			
+
+			type[j] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleFloatValue(vce, spaceFillingValue[j], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 (still layer 0, so P1 is indexed by the 2D position) */
+	size_t index;
+	K = 0;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0: predicted by the value directly above it */
+		index = i*r3;	
+
+		J = 0;
+		if(i%blockEdgeSize==0)
+			I++;
+		realPrecision = pwrErrBound[I*R3+J]; //J==0
+		updateReqLength = 0;
+
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}		
+						
+			type[index] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index] = vce->data;
+		}
+
+		/* Process row-i data 1 --> data r3-1: left + above - above-left */
+		for (j = 1; j < r3; j++) //note that this j refers to fastest dimension (lowest order)
+		{
+			index = i*r3+j;		
+			if(j%blockEdgeSize==0)
+			{
+				J++;
+				realPrecision = pwrErrBound[I*R3+J];
+				updateReqLength = 0;
+			}			
+		
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}						
+				
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index] = vce->data;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0: predicted by the same position in the previous layer */
+		index = k*r23;			
+		I = 0;
+		J = 0;
+		if(k%blockEdgeSize==0)
+			K++;
+		realPrecision = pwrErrBound[K*R23]; //J==0
+		updateReqLength = 0;
+		
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}					
+			
+			type[index] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+
+	    /* Process Row-0 data 1 --> data r3-1: left (this layer) + same - left (previous layer) */
+		for (j = 1; j < r3; j++)
+		{
+			index = k*r23+j;	
+
+			if(j%blockEdgeSize==0)
+			{
+				J++;
+				realPrecision = pwrErrBound[K*R23+J];
+				updateReqLength = 0;			
+			}					
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}						
+				
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0: above (this layer) + same - above (previous layer) */
+			index = k*r23 + i*r3;
+			J = 0;
+			if(i%blockEdgeSize==0)
+				I++;
+			realPrecision = pwrErrBound[K*R23+I*R3+J]; //J==0
+			updateReqLength = 0;			
+			
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}						
+				
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+			/* Process Row-i data 1 --> data r3-1: combines the seven already
+			 * reconstructed neighbours from this layer (P0) and the previous one (P1) */
+			for (j = 1; j < r3; j++)
+			{
+				index = k*r23 + i*r3 + j;
+				if(j%blockEdgeSize==0)
+				{
+					J++;
+					realPrecision = pwrErrBound[K*R23+I*R3+J];
+					updateReqLength = 0;			
+				}							
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					if(updateReqLength==0)
+					{
+						computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+						reqBytesLength = reqLength/8;
+						resiBitsLength = reqLength%8;
+						updateReqLength = 1;
+					}							
+					
+					type[index] = 0;
+
+					addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+					compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+		}
+
+		/* the layer just produced becomes the "previous layer" of the next iteration */
+		float *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	/* P0 and P1 are two separate allocations that are only ever swapped, so
+	 * both must be released whatever r23 is (guarding on r23 would leak P0). */
+	free(P0);
+	free(P1);
+	int exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageF* tdps;
+
+	new_TightDataPointStorageF2(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitLengthArray->array, resiBitLengthArray->size, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, pwrErrBoundBytes, pwrErrBoundBytes_size, radExpo);
+
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+
+	//free memory
+	free_DBA(resiBitLengthArray);
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+
+
+	free(pwrErrBound);
+
+	free(vce);
+	free(lce);
+	free_TightDataPointStorageF(tdps);
+	/* only the wrapper struct is freed here; NOTE(review): presumably its byte
+	 * buffer was handed over to tdps and released with it - confirm. */
+	free(exactMidByteArray);
+}
+
+/*
+ * Allocates and zero-fills the four per-group tables.
+ *
+ * posGroups, negGroups - out: each receives a new buffer of GROUP_COUNT floats
+ * posFlags,  negFlags  - out: each receives a new buffer of GROUP_COUNT ints
+ *
+ * The buffers are returned to the caller through the out-pointers; nothing in
+ * this function releases them. Allocation results are not checked.
+ */
+void createRangeGroups_float(float** posGroups, float** negGroups, int** posFlags, int** negFlags)
+{
+	const size_t groupBytes = GROUP_COUNT*sizeof(float);
+	const size_t flagBytes = GROUP_COUNT*sizeof(int);
+
+	/* value tables: allocate, then clear */
+	*posGroups = (float*)malloc(groupBytes);
+	memset(*posGroups, 0, groupBytes);
+	*negGroups = (float*)malloc(groupBytes);
+	memset(*negGroups, 0, groupBytes);
+
+	/* flag tables: allocate, then clear */
+	*posFlags = (int*)malloc(flagBytes);
+	memset(*posFlags, 0, flagBytes);
+	*negFlags = (int*)malloc(flagBytes);
+	memset(*negFlags, 0, flagBytes);
+}
+
+/*
+ * Encodes the per-value group IDs with a Huffman tree and stores the encoded
+ * bytes in tdps->pwrErrBoundBytes / tdps->pwrErrBoundBytes_size.
+ *
+ * groupID - one char per data point; tdps->dataSeriesLength entries are read
+ * tdps    - supplies the element count and receives the encoded buffer
+ *
+ * The first symbol is groupID[0] shifted by GROUP_COUNT; every later symbol is
+ * the difference to the preceding group ID shifted by 2*(GROUP_COUNT + 2), so
+ * that the values handed to the encoder are not negative.
+ *
+ * NOTE(review): element 0 is written unconditionally, so at least one data
+ * point is assumed. The allocation result is not checked.
+ */
+void compressGroupIDArray_float(char* groupID, TightDataPointStorageF* tdps)
+{
+	const size_t count = tdps->dataSeriesLength;
+	const int deltaBias = 2*(GROUP_COUNT + 2);
+	int* symbols = (int*)malloc(count*sizeof(int));
+	unsigned char* encoded = NULL;
+	size_t encodedSize;
+	size_t n;
+
+	/* first entry is stored as-is (plus an offset), the rest as biased deltas */
+	symbols[0] = groupID[0]+GROUP_COUNT;
+	for(n = 1; n < count; n++)
+		symbols[n] = (groupID[n] - groupID[n-1]) + deltaBias;
+
+	HuffmanTree* tree = SZ_Reset();
+	encode_withTree(tree, symbols, count, &encoded, &encodedSize);
+	SZ_ReleaseHuffman(tree);
+
+	/* the pwrErrBoundBytes fields are reused to carry the encoded group-ID array */
+	tdps->pwrErrBoundBytes = encoded;
+	tdps->pwrErrBoundBytes_size = encodedSize;
+
+	free(symbols);
+}
+
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_pwrGroup(float* oriData, size_t dataLength, int errBoundMode, 
+double absErrBound, double relBoundRatio, double pwrErrRatio, float valueRangeSize, float medianValue_f)
+{
+	size_t i;
+	float *posGroups, *negGroups, *groups;
+	float pos_01_group = 0, neg_01_group = 0; //[0,1] and [-1,0]
+	int *posFlags, *negFlags, *flags;
+	int pos_01_flag = 0, neg_01_flag = 0;
+	createRangeGroups_float(&posGroups, &negGroups, &posFlags, &negFlags);
+	size_t nbBins = (size_t)(1/pwrErrRatio);
+	if(nbBins%2==1)
+		nbBins++;
+	exe_params->intvRadius = nbBins;
+
+	int reqLength, status;
+	float medianValue = medianValue_f;
+	float realPrecision = (float)getRealPrecision_float(valueRangeSize, errBoundMode, absErrBound, relBoundRatio, &status);
+	if(realPrecision<0)
+		realPrecision = pwrErrRatio;
+	float realGroupPrecision; //precision (error) based on group ID
+	getPrecisionReqLength_float(realPrecision);
+	short radExpo = getExponent_float(valueRangeSize/2);
+	short lastGroupNum = 0, groupNum, grpNum = 0;
+	
+	double* groupErrorBounds = generateGroupErrBounds(errBoundMode, realPrecision, pwrErrRatio);
+	exe_params->intvRadius = generateGroupMaxIntervalCount(groupErrorBounds);
+	
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	char *groupID = (char*) malloc(dataLength*sizeof(char));
+	char *gp = groupID;
+		
+	float* spaceFillingValue = oriData; 
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+			
+	int state;
+	float curData, decValue;
+	float pred;
+	float predAbsErr;
+	double interval = 0;
+	
+	//add the first data	
+	type[0] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	
+	curData = spaceFillingValue[0];
+	groupNum = computeGroupNum_float(vce->data);
+
+	if(curData > 0 && groupNum >= 0)
+	{
+		groups = posGroups;
+		flags = posFlags;
+		grpNum = groupNum;
+	}
+	else if(curData < 0 && groupNum >= 0)
+	{
+		groups = negGroups;
+		flags = negFlags;
+		grpNum = groupNum;
+	}
+	else if(curData >= 0 && groupNum == -1)
+	{
+		groups = &pos_01_group;
+		flags = &pos_01_flag;
+		grpNum = 0;
+	}
+	else //curData < 0 && groupNum == -1
+	{
+		groups = &neg_01_group;
+		flags = &neg_01_flag;
+		grpNum = 0;
+	}
+
+	listAdd_float_group(groups, flags, groupNum, spaceFillingValue[0], vce->data, gp);
+	gp++;
+	
+	for(i=1;i<dataLength;i++)
+	{
+		curData = oriData[i];
+		//printf("i=%d, posGroups[3]=%f, negGroups[3]=%f\n", i, posGroups[3], negGroups[3]);
+		
+		groupNum = computeGroupNum_float(curData);
+		
+		if(curData > 0 && groupNum >= 0)
+		{
+			groups = posGroups;
+			flags = posFlags;
+			grpNum = groupNum;
+		}
+		else if(curData < 0 && groupNum >= 0)
+		{
+			groups = negGroups;
+			flags = negFlags;
+			grpNum = groupNum;
+		}
+		else if(curData >= 0 && groupNum == -1)
+		{
+			groups = &pos_01_group;
+			flags = &pos_01_flag;
+			grpNum = 0;
+		}
+		else //curData < 0 && groupNum == -1
+		{
+			groups = &neg_01_group;
+			flags = &neg_01_flag;
+			grpNum = 0;
+		}
+
+		if(groupNum>=GROUP_COUNT)
+		{
+			type[i] = 0;
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			listAdd_float_group(groups, flags, lastGroupNum, curData, vce->data, gp);	//set the group number to be last one in order to get the groupID array as smooth as possible.		
+		}
+		else if(flags[grpNum]==0) //the dec value may not be in the same group
+		{	
+			type[i] = 0;
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			//decGroupNum = computeGroupNum_float(vce->data);
+			
+			//if(decGroupNum < groupNum)
+			//	decValue = curData>0?pow(2, groupNum):-pow(2, groupNum);
+			//else if(decGroupNum > groupNum)
+			//	decValue = curData>0?pow(2, groupNum+1):-pow(2, groupNum+1);
+			//else
+			//	decValue = vce->data;
+			
+			decValue = vce->data;	
+			listAdd_float_group(groups, flags, groupNum, curData, decValue, gp);
+			lastGroupNum = curData>0?groupNum + 2: -(groupNum+2);
+		}
+		else //if flags[groupNum]==1, the dec value must be in the same group
+		{
+			pred = groups[grpNum];
+			predAbsErr = fabs(curData - pred);
+			realGroupPrecision = groupErrorBounds[grpNum]; //compute real error bound
+			interval = realGroupPrecision*2;
+			state = (predAbsErr/realGroupPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				decValue = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				decValue = pred - state*interval;
+			}
+			//decGroupNum = computeGroupNum_float(pred);
+			
+			if((decValue>0&&curData<0)||(decValue<0&&curData>=0))
+				decValue = 0;
+			//else
+			//{
+			//	if(decGroupNum < groupNum)
+			//		decValue = curData>0?pow(2, groupNum):-pow(2, groupNum);
+			//	else if(decGroupNum > groupNum)
+			//		decValue = curData>0?pow(2, groupNum+1):-pow(2, groupNum+1);
+			//	else
+			//		decValue = pred;				
+			//}
+			
+			if(fabs(curData-decValue)>realGroupPrecision)
+			{	
+				type[i] = 0;
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+
+				decValue = vce->data;	
+			}
+			
+			listAdd_float_group(groups, flags, groupNum, curData, decValue, gp);			
+			lastGroupNum = curData>=0?groupNum + 2: -(groupNum+2);			
+		}
+		gp++;	
+
+	}
+	
+	int exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageF* tdps;
+			
+	//combineTypeAndGroupIDArray(nbBins, dataLength, &type, groupID);
+
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, nbBins, NULL, 0, radExpo);	
+	
+	compressGroupIDArray_float(groupID, tdps);
+	
+	free(posGroups);
+	free(negGroups);
+	free(posFlags);
+	free(negFlags);
+	free(groupID);
+	free(groupErrorBounds);
+	
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);	
+	
+	return tdps;
+}
+
+/**
+ * Compress a 1D float array with the group-based point-wise-relative scheme
+ * (SZ_compress_float_1D_MDQ_pwrGroup) and serialize the result.
+ *
+ * @param newByteData    [out] receives the serialized byte buffer
+ * @param oriData        input samples
+ * @param dataLength     number of samples in oriData
+ * @param absErrBound    absolute error bound, forwarded to the compressor
+ * @param relBoundRatio  relative bound ratio, forwarded to the compressor
+ * @param pwrErrRatio    point-wise relative error ratio, forwarded to the compressor
+ * @param valueRangeSize value range of the data, forwarded to the compressor
+ * @param medianValue_f  median of the data, forwarded to the compressor
+ * @param outSize        [out] receives the size of *newByteData in bytes
+ */
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(unsigned char** newByteData, float *oriData,
+size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio, float valueRangeSize, float medianValue_f, size_t *outSize)
+{
+	TightDataPointStorageF* storage = SZ_compress_float_1D_MDQ_pwrGroup(
+		oriData, dataLength, confparams_cpr->errorBoundMode,
+		absErrBound, relBoundRatio, pwrErrRatio,
+		valueRangeSize, medianValue_f);
+
+	convertTDPStoFlatBytes_float(storage, newByteData, outSize);
+
+	// If the serialized form exceeds this limit, fall back to StoreOriData
+	// (presumably a raw copy of the input -- confirm against its definition).
+	size_t rawLimit = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(float)*dataLength;
+	if(rawLimit < *outSize)
+		SZ_compress_args_float_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+	free_TightDataPointStorageF(storage);
+}
+
+#include <stdbool.h>
+
+/**
+ * Shared front half of the 1D/2D/3D *_pwr_pre_log compressors.
+ *
+ * Builds a log-domain copy of the input: every non-zero sample becomes
+ * log2(|sample|); every zero sample is finally mapped to
+ * min_log_data - 2.0001*realPrecision, i.e. below every real log value.
+ * A per-sample sign byte (1 = negative, 0 otherwise) is recorded alongside.
+ *
+ * Note that computeRangeSize_float() is evaluated BEFORE zero samples are
+ * remapped, exactly as the original per-dimension code did.
+ *
+ * @param oriData        input samples; not modified
+ * @param dataLength     number of samples
+ * @param pwrErrRatio    point-wise relative error ratio
+ * @param min            used to seed the running |log2| maximum (presumably the data minimum -- confirm against caller)
+ * @param max            used to seed the running |log2| maximum (presumably the data maximum -- confirm against caller)
+ * @param signs          [out] malloc'ed sign bytes, dataLength long; caller frees
+ * @param positive       [out] false iff at least one sample is negative
+ * @param min_log_data   [out] smallest log2 value found (seeded with the |log2| maximum)
+ * @param realPrecision  [out] error bound to apply in the log domain
+ * @param valueRangeSize [out] filled by computeRangeSize_float
+ * @param medianValue_f  [out] filled by computeRangeSize_float
+ * @return malloc'ed log-domain buffer of dataLength floats; caller frees
+ */
+static float* pwrPreLog_toLogDomain_float(float *oriData, size_t dataLength, double pwrErrRatio, float min, float max,
+		unsigned char **signs, bool *positive, float *min_log_data, double *realPrecision,
+		float *valueRangeSize, float *medianValue_f)
+{
+	float *log_data = (float *) malloc(dataLength * sizeof(float));
+
+	unsigned char *sign_flags = (unsigned char *) malloc(dataLength);
+	memset(sign_flags, 0, dataLength);
+
+	// seed the largest |log2| from the supplied extrema, skipping an extremum that is exactly 0
+	float max_abs_log;
+	if(min == 0) max_abs_log = fabs(log2(fabs(max)));
+	else if(max == 0) max_abs_log = fabs(log2(fabs(min)));
+	else max_abs_log = fabs(log2(fabs(min))) > fabs(log2(fabs(max))) ? fabs(log2(fabs(min))) : fabs(log2(fabs(max)));
+	float min_log = max_abs_log;
+	bool no_negative = true;
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] < 0){
+			sign_flags[i] = 1;
+			log_data[i] = -oriData[i];
+			no_negative = false;
+		}
+		else
+			log_data[i] = oriData[i];
+		// zeros are left untouched here and remapped after the bound is known
+		if(log_data[i] > 0){
+			log_data[i] = log2(log_data[i]);
+			if(log_data[i] > max_abs_log) max_abs_log = log_data[i];
+			if(log_data[i] < min_log) min_log = log_data[i];
+		}
+	}
+
+	computeRangeSize_float(log_data, dataLength, valueRangeSize, medianValue_f);
+	if(fabs(min_log) > max_abs_log) max_abs_log = fabs(min_log);
+	// the 1.2e-7 term shrinks the bound proportionally to the largest |log2| value
+	// (presumably a margin for float rounding of the log values -- confirm)
+	double precision = log2(1.0 + pwrErrRatio) - max_abs_log * 1.2e-7;
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] == 0){
+			log_data[i] = min_log - 2.0001*precision;
+		}
+	}
+
+	*signs = sign_flags;
+	*positive = no_negative;
+	*min_log_data = min_log;
+	*realPrecision = precision;
+	return log_data;
+}
+
+/**
+ * Shared back half of the 1D/2D/3D *_pwr_pre_log compressors.
+ *
+ * Records the zero threshold in tdps, attaches the compressed sign bytes when
+ * negatives are present, serializes tdps into *newByteData, and switches to
+ * SZ_compress_args_float_StoreOriData when the serialized size exceeds the limit.
+ * Frees log_data, signs and tdps.
+ *
+ * @param tdps          storage produced by the dimension-specific compressor
+ * @param log_data      log-domain buffer to release
+ * @param min_log_data  smallest log2 value, as produced by the front half
+ * @param realPrecision log-domain error bound, as produced by the front half
+ * @param signs         per-sample sign bytes to compress (if needed) and release
+ * @param positive      true when no sample was negative (no sign data is stored)
+ * @param oriData       original samples, used only by the StoreOriData fallback
+ * @param dataLength    number of samples
+ * @param newByteData   [out] receives the serialized byte buffer
+ * @param outSize       [out] receives the size of *newByteData in bytes
+ */
+static void pwrPreLog_finish_float(TightDataPointStorageF *tdps, float *log_data, float min_log_data, double realPrecision,
+		unsigned char *signs, bool positive, float *oriData, size_t dataLength,
+		unsigned char **newByteData, size_t *outSize)
+{
+	tdps->minLogValue = min_log_data - 1.0001*realPrecision;
+	free(log_data);
+	if(!positive){
+		unsigned char * comp_signs;
+		// compress signs
+		unsigned long signSize = sz_lossless_compress(ZSTD_COMPRESSOR, 3, signs, dataLength, &comp_signs);
+		tdps->pwrErrBoundBytes = comp_signs;
+		tdps->pwrErrBoundBytes_size = signSize;
+	}
+	else{
+		tdps->pwrErrBoundBytes = NULL;
+		tdps->pwrErrBoundBytes_size = 0;
+	}
+	free(signs);
+
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+	if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(float)*dataLength)
+		SZ_compress_args_float_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+	free_TightDataPointStorageF(tdps);
+}
+
+/**
+ * Point-wise-relative compression of a 1D float array via a log2 transform.
+ *
+ * @param newByteData [out] receives the serialized byte buffer
+ * @param oriData     input samples (dataLength floats); not modified
+ * @param pwrErrRatio point-wise relative error ratio
+ * @param dataLength  number of samples
+ * @param outSize     [out] receives the size of *newByteData in bytes
+ * @param min         seed for the |log2| maximum (see pwrPreLog_toLogDomain_float)
+ * @param max         seed for the |log2| maximum (see pwrPreLog_toLogDomain_float)
+ */
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, float min, float max){
+
+	unsigned char *signs;
+	bool positive;
+	float min_log_data, valueRangeSize, medianValue_f;
+	double realPrecision;
+	float *log_data = pwrPreLog_toLogDomain_float(oriData, dataLength, pwrErrRatio, min, max,
+			&signs, &positive, &min_log_data, &realPrecision, &valueRangeSize, &medianValue_f);
+
+	TightDataPointStorageF* tdps = SZ_compress_float_1D_MDQ(log_data, dataLength, realPrecision, valueRangeSize, medianValue_f);
+
+	pwrPreLog_finish_float(tdps, log_data, min_log_data, realPrecision, signs, positive,
+			oriData, dataLength, newByteData, outSize);
+}
+
+/**
+ * Point-wise-relative compression of a 2D float array (r1 x r2 samples) via a log2 transform.
+ *
+ * @param newByteData [out] receives the serialized byte buffer
+ * @param oriData     input samples (r1*r2 floats); not modified
+ * @param pwrErrRatio point-wise relative error ratio
+ * @param r1          first dimension, forwarded to SZ_compress_float_2D_MDQ
+ * @param r2          second dimension, forwarded to SZ_compress_float_2D_MDQ
+ * @param outSize     [out] receives the size of *newByteData in bytes
+ * @param min         seed for the |log2| maximum (see pwrPreLog_toLogDomain_float)
+ * @param max         seed for the |log2| maximum (see pwrPreLog_toLogDomain_float)
+ */
+void SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t *outSize, float min, float max){
+
+	size_t dataLength = r1 * r2;
+	unsigned char *signs;
+	bool positive;
+	float min_log_data, valueRangeSize, medianValue_f;
+	double realPrecision;
+	float *log_data = pwrPreLog_toLogDomain_float(oriData, dataLength, pwrErrRatio, min, max,
+			&signs, &positive, &min_log_data, &realPrecision, &valueRangeSize, &medianValue_f);
+
+	TightDataPointStorageF* tdps = SZ_compress_float_2D_MDQ(log_data, r1, r2, realPrecision, valueRangeSize, medianValue_f);
+
+	pwrPreLog_finish_float(tdps, log_data, min_log_data, realPrecision, signs, positive,
+			oriData, dataLength, newByteData, outSize);
+}
+
+/**
+ * Point-wise-relative compression of a 3D float array (r1 x r2 x r3 samples) via a log2 transform.
+ *
+ * @param newByteData [out] receives the serialized byte buffer
+ * @param oriData     input samples (r1*r2*r3 floats); not modified
+ * @param pwrErrRatio point-wise relative error ratio
+ * @param r1          first dimension, forwarded to SZ_compress_float_3D_MDQ
+ * @param r2          second dimension, forwarded to SZ_compress_float_3D_MDQ
+ * @param r3          third dimension, forwarded to SZ_compress_float_3D_MDQ
+ * @param outSize     [out] receives the size of *newByteData in bytes
+ * @param min         seed for the |log2| maximum (see pwrPreLog_toLogDomain_float)
+ * @param max         seed for the |log2| maximum (see pwrPreLog_toLogDomain_float)
+ */
+void SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t r3, size_t *outSize, float min, float max){
+
+	size_t dataLength = r1 * r2 * r3;
+	unsigned char *signs;
+	bool positive;
+	float min_log_data, valueRangeSize, medianValue_f;
+	double realPrecision;
+	float *log_data = pwrPreLog_toLogDomain_float(oriData, dataLength, pwrErrRatio, min, max,
+			&signs, &positive, &min_log_data, &realPrecision, &valueRangeSize, &medianValue_f);
+
+	TightDataPointStorageF* tdps = SZ_compress_float_3D_MDQ(log_data, r1, r2, r3, realPrecision, valueRangeSize, medianValue_f);
+
+	pwrPreLog_finish_float(tdps, log_data, min_log_data, realPrecision, signs, positive,
+			oriData, dataLength, newByteData, outSize);
+}
+
+
+/**
+ * MSST19 variant of 1D point-wise-relative compression for float data.
+ *
+ * Every exact zero in oriData is overwritten IN PLACE with
+ * nearZero * (1+pwrErrRatio)^-3.0001 before compression. When *positive is
+ * false, the signs buffer is losslessly compressed (ZSTD_COMPRESSOR, mode 3)
+ * and attached to the result. The signs buffer is freed here in all cases.
+ *
+ * @param newByteData    [out] receives the serialized byte buffer
+ * @param oriData        input samples; zeros are replaced in place
+ * @param pwrErrRatio    point-wise relative error ratio
+ * @param dataLength     number of samples
+ * @param outSize        [out] receives the size of *newByteData in bytes
+ * @param valueRangeSize forwarded to SZ_compress_float_1D_MDQ_MSST19
+ * @param medianValue_f  not used by this function
+ * @param signs          per-sample sign bytes; released with free() before returning
+ * @param positive       *positive == false means sign data must be stored
+ * @param min            not used by this function
+ * @param max            combined with nearZero as sqrt(|nearZero*max|) to form the median passed on
+ * @param nearZero       NOTE(review): presumably the smallest non-zero magnitude in the data -- confirm against caller
+ */
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log_MSST19(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, float valueRangeSize, float medianValue_f,
+																unsigned char* signs, bool* positive, float min, float max, float nearZero){
+	float multiplier = pow((1+pwrErrRatio), -3.0001);
+	// size_t index: dataLength is a size_t, an int counter would overflow on very large inputs
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] == 0){
+			oriData[i] = nearZero * multiplier;
+		}
+	}
+
+	float median_log = sqrt(fabs(nearZero * max));
+
+	TightDataPointStorageF* tdps = SZ_compress_float_1D_MDQ_MSST19(oriData, dataLength, pwrErrRatio, valueRangeSize, median_log);
+
+	tdps->minLogValue = nearZero / ((1+pwrErrRatio)*(1+pwrErrRatio));
+	if(!(*positive)){
+		unsigned char * comp_signs;
+		// compress signs
+		unsigned long signSize = sz_lossless_compress(ZSTD_COMPRESSOR, 3, signs, dataLength, &comp_signs);
+		tdps->pwrErrBoundBytes = comp_signs;
+		tdps->pwrErrBoundBytes_size = signSize;
+	}
+	else{
+		tdps->pwrErrBoundBytes = NULL;
+		tdps->pwrErrBoundBytes_size = 0;
+	}
+	free(signs);
+
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+	// fall back to StoreOriData when the serialized form exceeds the size limit
+	// (note: oriData has already had its zeros replaced at this point)
+	if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(float)*dataLength)
+		SZ_compress_args_float_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+	free_TightDataPointStorageF(tdps);
+}
+
+/**
+ * MSST19 variant of 2D point-wise-relative compression for float data.
+ *
+ * Every exact zero in oriData is overwritten IN PLACE with
+ * nearZero * (1+pwrErrRatio)^-3.0001 before compression. When *positive is
+ * false, the signs buffer is losslessly compressed with the configured
+ * compressor (confparams_cpr->losslessCompressor / gzipMode) and attached to
+ * the result. The signs buffer is freed here in all cases.
+ *
+ * @param newByteData    [out] receives the serialized byte buffer
+ * @param oriData        input samples (r1*r2 floats); zeros are replaced in place
+ * @param pwrErrRatio    point-wise relative error ratio
+ * @param r1             first dimension, forwarded to SZ_compress_float_2D_MDQ_MSST19
+ * @param r2             second dimension, forwarded to SZ_compress_float_2D_MDQ_MSST19
+ * @param outSize        [out] receives the size of *newByteData in bytes
+ * @param valueRangeSize forwarded to SZ_compress_float_2D_MDQ_MSST19
+ * @param signs          per-sample sign bytes; released with free() before returning
+ * @param positive       *positive == false means sign data must be stored
+ * @param min            not used by this function
+ * @param max            combined with nearZero as sqrt(|nearZero*max|) to form the median passed on
+ * @param nearZero       NOTE(review): presumably the smallest non-zero magnitude in the data -- confirm against caller
+ */
+void SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log_MSST19(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t *outSize, float valueRangeSize,
+																unsigned char* signs, bool* positive, float min, float max, float nearZero){
+
+	size_t dataLength = r1 * r2;
+
+	float multiplier = pow((1+pwrErrRatio), -3.0001);
+	// size_t index: dataLength is a size_t, an int counter would overflow on very large inputs
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] == 0){
+			oriData[i] = nearZero * multiplier;
+		}
+	}
+
+	float median_log = sqrt(fabs(nearZero * max));
+
+	TightDataPointStorageF* tdps = SZ_compress_float_2D_MDQ_MSST19(oriData, r1, r2, pwrErrRatio, valueRangeSize, median_log);
+	tdps->minLogValue = nearZero / ((1+pwrErrRatio)*(1+pwrErrRatio));
+
+	if(!*positive){
+		unsigned char * comp_signs;
+		// compress signs
+		unsigned long signSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, signs, dataLength, &comp_signs);
+		tdps->pwrErrBoundBytes = comp_signs;
+		tdps->pwrErrBoundBytes_size = signSize;
+	}
+	else{
+		tdps->pwrErrBoundBytes = NULL;
+		tdps->pwrErrBoundBytes_size = 0;
+	}
+	free(signs);
+
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+	// fall back to StoreOriData when the serialized form exceeds the size limit
+	// (note: oriData has already had its zeros replaced at this point)
+	if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(float)*dataLength)
+		SZ_compress_args_float_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+	free_TightDataPointStorageF(tdps);
+}
+
+/**
+ * MSST19 variant of 3D point-wise-relative compression for float data.
+ *
+ * Every exact zero in oriData is overwritten IN PLACE with
+ * nearZero * (1+pwrErrRatio)^-3.0001 before compression. When *positive is
+ * false, the signs buffer is losslessly compressed with the configured
+ * compressor (confparams_cpr->losslessCompressor / gzipMode) and attached to
+ * the result. The signs buffer is freed here in all cases.
+ *
+ * @param newByteData    [out] receives the serialized byte buffer
+ * @param oriData        input samples (r1*r2*r3 floats); zeros are replaced in place
+ * @param pwrErrRatio    point-wise relative error ratio
+ * @param r1             first dimension, forwarded to SZ_compress_float_3D_MDQ_MSST19
+ * @param r2             second dimension, forwarded to SZ_compress_float_3D_MDQ_MSST19
+ * @param r3             third dimension, forwarded to SZ_compress_float_3D_MDQ_MSST19
+ * @param outSize        [out] receives the size of *newByteData in bytes
+ * @param valueRangeSize forwarded to SZ_compress_float_3D_MDQ_MSST19
+ * @param signs          per-sample sign bytes; released with free() before returning
+ * @param positive       *positive == false means sign data must be stored
+ * @param min            not used by this function
+ * @param max            combined with nearZero as sqrt(|nearZero*max|) to form the median passed on
+ * @param nearZero       NOTE(review): presumably the smallest non-zero magnitude in the data -- confirm against caller
+ */
+void SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log_MSST19(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t r3, size_t *outSize, float valueRangeSize, unsigned char* signs, bool* positive, float min, float max, float nearZero){
+
+	size_t dataLength = r1 * r2 * r3;
+
+	float multiplier = pow((1+pwrErrRatio), -3.0001);
+	// size_t index: dataLength is a size_t, an int counter would overflow on very large inputs
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] == 0){
+			oriData[i] = nearZero * multiplier;
+		}
+	}
+
+	float median_log = sqrt(fabs(nearZero * max));
+
+	TightDataPointStorageF* tdps = SZ_compress_float_3D_MDQ_MSST19(oriData, r1, r2, r3, pwrErrRatio, valueRangeSize, median_log);
+	tdps->minLogValue =  nearZero / ((1+pwrErrRatio)*(1+pwrErrRatio));
+
+	if(!*positive){
+		unsigned char * comp_signs;
+		// compress signs
+		unsigned long signSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, signs, dataLength, &comp_signs);
+		tdps->pwrErrBoundBytes = comp_signs;
+		tdps->pwrErrBoundBytes_size = signSize;
+	}
+	else{
+		tdps->pwrErrBoundBytes = NULL;
+		tdps->pwrErrBoundBytes_size = 0;
+	}
+	free(signs);
+
+
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+	// fall back to StoreOriData when the serialized form exceeds the size limit
+	// (note: oriData has already had its zeros replaced at this point)
+	if(*outSize>3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sizeof(float)*dataLength)
+		SZ_compress_args_float_StoreOriData(oriData, dataLength, newByteData, outSize);
+
+	free_TightDataPointStorageF(tdps);
+}
diff --git a/deps/SZ/sz/src/sz_float_ts.c b/deps/SZ/sz/src/sz_float_ts.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea29245ec5b7ab383c47efd2cc4d48cc5e81128c
--- /dev/null
+++ b/deps/SZ/sz/src/sz_float_ts.c
@@ -0,0 +1,207 @@
+/**
+ *  @file sz_float_ts.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief SZ_Init, Compression and Decompression functions
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageF.h"
+#include "zlib.h"
+#include "rw.h"
+#include "sz_float_ts.h"
+
+/**
+ * Choose the number of quantization intervals for time-step compression by
+ * sampling the error between oriData and preData at the same index.
+ *
+ * Every index i >= 2 that is a multiple of confparams_cpr->sampleDistance is
+ * sampled; its error is binned by (err/realPrecision+1)/2, clamped to the last
+ * bin. The smallest bin prefix whose count exceeds
+ * totalSamples * confparams_cpr->predThreshold determines the interval count.
+ *
+ * @param oriData       samples of the current step
+ * @param dataLength    number of samples in oriData and preData
+ * @param preData       values used as the prediction for each index
+ * @param realPrecision absolute error bound (bin width is 2*realPrecision)
+ * @return interval count rounded up to a power of two, never below 32
+ */
+unsigned int optimize_intervals_float_1D_ts(float *oriData, size_t dataLength, float* preData, double realPrecision)
+{	
+	size_t pos = 0, bucket;
+	float predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = dataLength/confparams_cpr->sampleDistance;
+
+	for(pos=2;pos<dataLength;pos++)
+	{
+		// only every sampleDistance-th element contributes
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+
+		predicted = preData[pos];
+		absErr = fabs(predicted - oriData[pos]);
+		bucket = (unsigned long)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	// find the first bin at which the cumulative count passes the target
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	pos = 0;
+	while(pos<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[pos];
+		if(covered>threshold)
+			break;
+		pos++;
+	}
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int neededIntervals = 2*(pos+1);
+	unsigned int result = roundUpToPowerOf2(neededIntervals);
+	if(result<32)
+		result = 32;
+
+	free(histogram);
+	return result;
+}
+
+/**
+ * Compress one time step of a 1D float series, predicting each value from the
+ * entry at the same index in multisteps->hist_data.
+ *
+ * hist_data is used both as the prediction source and as the destination of
+ * this step's reconstructed values: element i is read as the prediction and
+ * then overwritten, so on return hist_data holds the reconstructed data of
+ * this step.
+ *
+ * The first two samples (fewer if dataLength < 2) are always stored exactly.
+ * Any other sample is quantized when the prediction error fits within the
+ * quantization range and the reconstructed value stays within realPrecision;
+ * otherwise it is stored exactly (type 0).
+ *
+ * @param oriData        samples of the current step
+ * @param dataLength     number of samples
+ * @param multisteps     cross-step state; hist_data is read and written as dataLength floats
+ * @param realPrecision  absolute error bound
+ * @param valueRangeSize value range; half of it feeds getExponent_float
+ * @param medianValue_f  median offset for exact storage (may be adjusted by computeReqLength_float)
+ * @return newly created storage object; NOTE(review): presumably released by the
+ *         caller with free_TightDataPointStorageF -- confirm
+ */
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_ts(float *oriData, size_t dataLength, sz_multisteps* multisteps,
+double realPrecision, float valueRangeSize, float medianValue_f)
+{
+	float* preStepData = (float*)(multisteps->hist_data);
+
+	// reconstructed values are written straight back into the history buffer
+	float* decData = preStepData;
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_float_1D_ts(oriData, dataLength, preStepData, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+
+	size_t i;
+	int reqLength;
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+		
+	float* spaceFillingValue = oriData; //
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	// Store the leading samples exactly. Bounded by dataLength so inputs with
+	// fewer than two samples no longer read or write past the buffers.
+	size_t headCount = dataLength < 2 ? dataLength : 2;
+	for(i=0;i<headCount;i++)
+	{
+		type[i] = 0;
+		compressSingleFloatValue(vce, spaceFillingValue[i], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		decData[i] = vce->data;
+	}
+	
+	int state = 0;
+	double checkRadius = 0;
+	float curData = 0;
+	float pred = 0;
+	float predAbsErr = 0;
+	checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double interval = 2*realPrecision;
+	
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		pred = preStepData[i]; // must be read before decData[i] (same memory) is written
+		predAbsErr = fabs(curData - pred);	
+		int quantized = 0;
+		if(predAbsErr<=checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+				
+			//double-check the prediction error in case of machine-epsilon impact	
+			quantized = !(fabs(curData-pred)>realPrecision);
+		}
+
+		if(quantized)
+		{
+			decData[i] = pred;
+			continue;
+		}
+		
+		//unpredictable data processing (also reached when the double-check above fails)
+		type[i] = 0;		
+		compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		decData[i] = vce->data;
+	}//end of for
+		
+	size_t exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageF* tdps;
+			
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;
+}
+
+
diff --git a/deps/SZ/sz/src/sz_int16.c b/deps/SZ/sz/src/sz_int16.c
new file mode 100644
index 0000000000000000000000000000000000000000..0d0c229930632d3091eab006a5cc5251fa53a16e
--- /dev/null
+++ b/deps/SZ/sz/src/sz_int16.c
@@ -0,0 +1,1385 @@
+/**
+ *  @file sz_int16.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_int16, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_int16.h"
+#include "utility.h"
+
+/**
+ * Choose the number of quantization intervals for 1D int16 data by sampling
+ * the error of a previous-value predictor.
+ *
+ * Every index i >= 2 that is a multiple of confparams_cpr->sampleDistance is
+ * sampled with prediction oriData[i-1]; its error is binned by
+ * (err/realPrecision+1)/2, clamped to the last bin. The smallest bin prefix
+ * whose count exceeds totalSamples * confparams_cpr->predThreshold determines
+ * the interval count.
+ *
+ * @param oriData       input samples
+ * @param dataLength    number of samples
+ * @param realPrecision absolute error bound
+ * @return interval count rounded up to a power of two, never below 32
+ */
+unsigned int optimize_intervals_int16_1D(int16_t *oriData, size_t dataLength, double realPrecision)
+{	
+	size_t pos = 0, bucket;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = dataLength/confparams_cpr->sampleDistance;
+
+	for(pos=2;pos<dataLength;pos++)
+	{
+		// only every sampleDistance-th element contributes
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+
+		predicted = oriData[pos-1];
+		absErr = llabs(predicted - oriData[pos]);
+		bucket = (uint64_t)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	// find the first bin at which the cumulative count passes the target
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	pos = 0;
+	while(pos<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[pos];
+		if(covered>threshold)
+			break;
+		pos++;
+	}
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int neededIntervals = 2*(pos+1);
+	unsigned int result = roundUpToPowerOf2(neededIntervals);
+	if(result<32)
+		result = 32;
+
+	free(histogram);
+	return result;
+}
+
+/**
+ * Choose the number of quantization intervals for 2D int16 data (r1 rows of
+ * r2 elements) by sampling the error of a three-neighbour predictor.
+ *
+ * Interior points (i >= 1, j >= 1) with (i+j) a multiple of
+ * confparams_cpr->sampleDistance are sampled. The prediction is
+ * left + up - upper-left; its error is binned by (err/realPrecision+1)/2,
+ * clamped to the last bin. The smallest bin prefix whose count exceeds
+ * totalSamples * confparams_cpr->predThreshold determines the interval count.
+ *
+ * @param oriData       input samples, indexed as i*r2 + j
+ * @param r1            size of the outer dimension
+ * @param r2            size of the inner dimension
+ * @param realPrecision absolute error bound
+ * @return interval count rounded up to a power of two, never below 32
+ */
+unsigned int optimize_intervals_int16_2D(int16_t *oriData, size_t r1, size_t r2, double realPrecision)
+{	
+	size_t row, col, at;
+	size_t bucket;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+
+	for(row=1;row<r1;row++)
+	{
+		for(col=1;col<r2;col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue;
+
+			at = row*r2+col;
+			predicted = oriData[at-1] + oriData[at-r2] - oriData[at-r2-1];
+			absErr = llabs(predicted - oriData[at]);
+			bucket = (uint64_t)((absErr/realPrecision+1)/2);
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1;
+			histogram[bucket]++;
+		}
+	}
+
+	// find the first bin at which the cumulative count passes the target
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t bin = 0;
+	while(bin<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[bin];
+		if(covered>threshold)
+			break;
+		bin++;
+	}
+	if(bin>=confparams_cpr->maxRangeRadius)
+		bin = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int neededIntervals = 2*(bin+1);
+	unsigned int result = roundUpToPowerOf2(neededIntervals);
+	if(result<32)
+		result = 32;
+
+	free(histogram);
+	return result;
+}
+
+/**
+ * Choose the number of quantization intervals for 3D int16 data
+ * (r1 x r2 x r3) by sampling the error of a seven-neighbour predictor.
+ *
+ * Interior points (i, j, k >= 1) with (i+j+k) a multiple of
+ * confparams_cpr->sampleDistance are sampled. The prediction adds the three
+ * face neighbours, subtracts the three edge neighbours and adds the corner
+ * neighbour; its error is binned by (err/realPrecision+1)/2, clamped to the
+ * last bin. The smallest bin prefix whose count exceeds
+ * totalSamples * confparams_cpr->predThreshold determines the interval count.
+ *
+ * @param oriData       input samples, indexed as i*r2*r3 + j*r3 + k
+ * @param r1            size of the outermost dimension
+ * @param r2            size of the middle dimension
+ * @param r3            size of the innermost dimension
+ * @param realPrecision absolute error bound
+ * @return interval count rounded up to a power of two, never below 32
+ */
+unsigned int optimize_intervals_int16_3D(int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{	
+	size_t plane, row, col, at;
+	size_t bucket;
+	size_t planeStride=r2*r3;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	for(plane=1;plane<r1;plane++)
+	{
+		for(row=1;row<r2;row++)
+		{
+			for(col=1;col<r3;col++)
+			{			
+				if((plane+row+col)%confparams_cpr->sampleDistance!=0)
+					continue;
+
+				at = plane*planeStride+row*r3+col;
+				predicted = oriData[at-1] + oriData[at-r3] + oriData[at-planeStride] 
+				- oriData[at-1-planeStride] - oriData[at-r3-1] - oriData[at-r3-planeStride] + oriData[at-r3-planeStride-1];
+				absErr = llabs(predicted - oriData[at]);
+				bucket = (absErr/realPrecision+1)/2;
+				if(bucket>=confparams_cpr->maxRangeRadius)
+					bucket = confparams_cpr->maxRangeRadius - 1;
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	// find the first bin at which the cumulative count passes the target
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t bin = 0;
+	while(bin<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[bin];
+		if(covered>threshold)
+			break;
+		bin++;
+	}
+	if(bin>=confparams_cpr->maxRangeRadius)
+		bin = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int neededIntervals = 2*(bin+1);
+	unsigned int result = roundUpToPowerOf2(neededIntervals);
+	if(result<32)
+		result = 32;
+	
+	free(histogram);
+	return result;
+}
+
+
+/**
+ * Choose the number of quantization intervals for 4D int16 data
+ * (r1 x r2 x r3 x r4) by sampling the error of a seven-neighbour predictor
+ * applied over the three innermost dimensions.
+ *
+ * Interior points (i, j, k, l >= 1) with (i+j+k+l) a multiple of
+ * confparams_cpr->sampleDistance are sampled. With element strides
+ * 1 (l), r4 (k) and r3*r4 (j), the prediction adds the three face neighbours,
+ * subtracts the three edge neighbours and adds the corner neighbour. The
+ * error is binned by (err/realPrecision+1)/2, clamped to the last bin. The
+ * smallest bin prefix whose count exceeds
+ * totalSamples * confparams_cpr->predThreshold determines the interval count.
+ *
+ * @param oriData       input samples, indexed as i*r2*r3*r4 + j*r3*r4 + k*r4 + l
+ * @param r1            size of the outermost dimension
+ * @param r2            size of the second dimension
+ * @param r3            size of the third dimension
+ * @param r4            size of the innermost dimension
+ * @param realPrecision absolute error bound
+ * @return interval count rounded up to a power of two, never below 32
+ */
+unsigned int optimize_intervals_int16_4D(int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						// the k-1 neighbour lies r4 elements back (the stride of k);
+						// it was previously read at index-r3, which is not a neighbour
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	// no prefix passed the target: fall back to the last bin
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Compress a 1D int16_t series with previous-value prediction and
+ * linear-scaling quantization.
+ *
+ * Each sample gets a code in type[]: 0 marks an "unpredictable" sample whose
+ * exact bytes are appended to exactDataByteArray; any other code is
+ * exe_params->intvRadius plus/minus the number of quantization intervals
+ * (each 2*realPrecision wide) between the prediction and the sample.
+ *
+ * @param oriData         input samples (dataLength elements)
+ * @param dataLength      number of samples
+ * @param realPrecision   error bound; half the width of one quantization interval
+ * @param valueRangeSize  value range of the input, only used to derive byteSize
+ * @param minValue        minimum input value, forwarded to compressInt16Value()
+ * @return newly created storage object; callers in this file release it with
+ *         free_TightDataPointStorageI()
+ */
+TightDataPointStorageI* SZ_compress_int16_1D_MDQ(int16_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_int16_1D(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);
+	size_t i;
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int16_t* spaceFillingValue = oriData;
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	int64_t last3CmprsData[3] = {0,0,0};
+
+	/* The first two samples are always stored exactly; they seed the predictor.
+	 * Bounding the seed by dataLength keeps type[] and oriData[] accesses in
+	 * range for inputs shorter than two samples. */
+	size_t headLength = dataLength<2 ? dataLength : 2;
+	for(i=0;i<headLength;i++)
+	{
+		type[i] = 0;
+		compressInt16Value(spaceFillingValue[i], minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, spaceFillingValue[i]);
+	}
+
+	int state;
+	/* read after updateQuantizationInfo(), which is called above with the chosen interval count */
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	int64_t curData;
+	int64_t pred, predAbsErr;
+	double interval = 2*realPrecision;
+
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		/* prediction = slot 0 of the history (presumably the most recent value
+		 * pushed by listAdd_int -- confirm against its definition) */
+		pred = last3CmprsData[0];
+		predAbsErr = llabs(curData - pred);
+		if(predAbsErr<checkRadius)
+		{
+			/* number of whole intervals between prediction and sample */
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			/* keep the reconstructed value inside the int16 range */
+			if(pred>SZ_INT16_MAX) pred = SZ_INT16_MAX;
+			if(pred<SZ_INT16_MIN) pred = SZ_INT16_MIN;
+			listAdd_int(last3CmprsData, pred);
+			continue;
+		}
+
+		//unpredictable data processing
+		type[i] = 0;
+		compressInt16Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, curData);
+	}//end of for
+
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT16);
+
+	//free memory
+	free(type);
+	/* only the wrapper is freed here; the original comment states that
+	 * exactDataByteArray->array is released together with tdps */
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+/**
+ * Build the raw-storage fallback stream: a header followed by the input
+ * samples written in big-endian byte order.
+ *
+ * Layout: 3 version bytes | 1 flag byte | MetaDataByteLength parameter bytes |
+ * SZ_SIZE_TYPE length bytes | dataLength int16 samples.
+ *
+ * @param oriData      samples to store
+ * @param dataLength   number of samples to store
+ * @param tdps         storage descriptor; its isLossless flag is set to 1
+ * @param newByteData  receives the malloc'ed output buffer
+ * @param outSize      receives the number of bytes in *newByteData
+ */
+void SZ_compress_args_int16_StoreOriData(int16_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	const int sampleBytes = sizeof(int16_t);
+	const size_t totalByteLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + sampleBytes*dataLength;
+	unsigned char lengthBytes[8];
+	unsigned char* cursor;
+	size_t pos = 0, n;
+
+	tdps->isLossless = 1;
+	*newByteData = (unsigned char*)malloc(totalByteLength);
+
+	/* 3 version bytes */
+	for (n = 0; n < 3; n++)
+		(*newByteData)[pos++] = versionNumber[n];
+
+	/* flag byte: 16 = 00010000, 80 = 01010000 (01000000 indicates SZ_SIZE_TYPE=8) */
+	(*newByteData)[pos++] = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	/* compression parameters */
+	convertSZParamsToBytes(confparams_cpr, &((*newByteData)[pos]));
+	pos += MetaDataByteLength;
+
+	/* element count, SZ_SIZE_TYPE (4 or 8) bytes */
+	sizeToBytes(lengthBytes, dataLength);
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		(*newByteData)[pos++] = lengthBytes[n];
+
+	/* payload starts right after the header written above */
+	cursor = (*newByteData)+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+	if(sysEndianType!=BIG_ENDIAN_SYSTEM)
+	{
+		/* convert every sample to big-endian */
+		for(n=0;n<dataLength;n++)
+		{
+			int16ToBytes_bigEndian(cursor, oriData[n]);
+			cursor += sampleBytes;
+		}
+	}
+	else
+	{
+		/* host order already matches the stream order */
+		memcpy(cursor, oriData, dataLength*sampleBytes);
+	}
+
+	*outSize = totalByteLength;
+}
+
+/**
+ * Compress a 1D int16 array without range checking and without the final
+ * lossless (gzip) stage.
+ *
+ * If the quantized stream is larger than the raw input, the raw input is
+ * stored instead (see SZ_compress_args_int16_StoreOriData).
+ *
+ * @param newByteData     receives the heap-allocated output buffer
+ * @param oriData         input samples (dataLength elements)
+ * @param dataLength      number of samples
+ * @param realPrecision   absolute error bound
+ * @param outSize         receives the output size in bytes
+ * @param valueRangeSize  value range of the input
+ * @param minValue        minimum input value
+ */
+void SZ_compress_args_int16_NoCkRngeNoGzip_1D(unsigned char** newByteData, int16_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int16_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_int16_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	if(*outSize > dataLength*sizeof(int16_t))
+	{
+		/* The flat buffer produced above is about to be replaced; release it
+		 * so it does not leak. SZ_compress_args_int16() free()s the buffer
+		 * returned through newByteData, so it is heap-allocated. */
+		free(*newByteData);
+		/* Store exactly dataLength samples: StoreOriData reads
+		 * oriData[0..dataLength-1], so a larger count would read past the
+		 * caller's array. */
+		SZ_compress_args_int16_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Quantize one sample of the 2D int16 compressor against its prediction.
+ *
+ * If the sample lies within exe_params->intvCapacity quantization steps of
+ * pred, *typeSlot receives the interval code (offset by
+ * exe_params->intvRadius) and *reconSlot the reconstructed value clamped to
+ * the int16 range. Otherwise *typeSlot is 0, *reconSlot is the sample itself
+ * and its exact bytes are appended to exactDataByteArray.
+ */
+static void SZ_int16_2D_quantizePoint(int16_t curData, int64_t pred, double realPrecision,
+		int* typeSlot, int16_t* reconSlot, int64_t minValue, int byteSize,
+		unsigned char* bytes, DynamicByteArray* exactDataByteArray)
+{
+	int diff = curData - pred;
+	double itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		int64_t tmp;
+		if (diff < 0) itvNum = -itvNum;
+		*typeSlot = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred + 2 * (*typeSlot - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+			*reconSlot = tmp;
+		else if(tmp < SZ_INT16_MIN)
+			*reconSlot = SZ_INT16_MIN;
+		else
+			*reconSlot = SZ_INT16_MAX;
+	}
+	else
+	{
+		*typeSlot = 0;
+		*reconSlot = curData;
+		compressInt16Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+}
+
+/**
+ * Compress a 2D int16 array (r1 rows of r2 samples, row-major) with
+ * Lorenzo-style prediction and linear-scaling quantization.
+ *
+ * Two row buffers are kept: P1 holds the reconstructed previous row, P0 the
+ * row being processed; they are swapped after each row.
+ *
+ * Sample 1 of row 0 is processed unconditionally, so the input must hold at
+ * least two samples.
+ * NOTE(review): with r2 == 1 that step writes P1[1], one element past the
+ * row buffer; left unchanged here because the decoder presumably mirrors
+ * this traversal -- confirm before altering it.
+ *
+ * @param oriData         input samples (r1*r2 elements)
+ * @param r1              number of rows (high dimension)
+ * @param r2              samples per row (low dimension)
+ * @param realPrecision   error bound; half the width of one quantization interval
+ * @param valueRangeSize  value range of the input, only used to derive byteSize
+ * @param minValue        minimum input value, forwarded to compressInt16Value()
+ * @return newly created storage object; callers in this file release it with
+ *         free_TightDataPointStorageI()
+ */
+TightDataPointStorageI* SZ_compress_int16_2D_MDQ(int16_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int16_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j;
+	size_t index;
+	int64_t pred;
+	int16_t *P0, *P1;
+
+	size_t dataLength = r1*r2;
+
+	P0 = (int16_t*)malloc(r2*sizeof(int16_t));
+	memset(P0, 0, r2*sizeof(int16_t));
+	P1 = (int16_t*)malloc(r2*sizeof(int16_t));
+	memset(P1, 0, r2*sizeof(int16_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int16_t* spaceFillingValue = oriData;
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	/* Row 0, sample 0: always stored exactly */
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressInt16Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Row 0, sample 1: predicted by its left neighbour */
+	pred = P1[0];
+	SZ_int16_2D_quantizePoint(spaceFillingValue[1], pred, realPrecision,
+			&type[1], &P1[1], minValue, byteSize, bytes, exactDataByteArray);
+
+	/* Row 0, samples 2 .. r2-1: linear extrapolation from the two left neighbours */
+	for (j = 2; j < r2; j++)
+	{
+		pred = 2*P1[j-1] - P1[j-2];
+		SZ_int16_2D_quantizePoint(spaceFillingValue[j], pred, realPrecision,
+				&type[j], &P1[j], minValue, byteSize, bytes, exactDataByteArray);
+	}
+
+	/* Rows 1 .. r1-1 */
+	for (i = 1; i < r1; i++)
+	{
+		/* sample 0: predicted by the sample above */
+		index = i*r2;
+		pred = P1[0];
+		SZ_int16_2D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+				&type[index], &P0[0], minValue, byteSize, bytes, exactDataByteArray);
+
+		/* samples 1 .. r2-1: left + above - above-left */
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred = P0[j-1] + P1[j] - P1[j-1];
+			SZ_int16_2D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+					&type[index], &P0[j], minValue, byteSize, bytes, exactDataByteArray);
+		}
+
+		/* the row just written becomes the "previous row" */
+		int16_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	/* NOTE(review): P0 is a separate allocation, so skipping this free when
+	 * r2 == 1 looks like a leak; kept as-is because of the r2 == 1
+	 * out-of-range write noted in the header comment. */
+	if(r2!=1)
+		free(P0);
+	free(P1);
+
+	/* NOTE(review): the 1D variant divides the byte count by byteSize here;
+	 * kept unchanged pending a check of how the decoder uses exactDataNum. */
+	size_t exactDataNum = exactDataByteArray->size;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT16);
+
+	//free memory
+	free(type);
+	/* only the wrapper is freed; the original comment states that the byte
+	 * array itself is released together with tdps */
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+/**
+ * Compress a 2D int16 array without range checking and without the final
+ * lossless (gzip) stage. Falls back to storing the raw input when the
+ * quantized stream would be larger than the input.
+ *
+ * Note: @r1 is high dimension
+ * 		 @r2 is low dimension 
+ *
+ * @param newByteData     receives the heap-allocated output buffer
+ * @param oriData         input samples (r1*r2 elements)
+ * @param realPrecision   absolute error bound
+ * @param outSize         receives the output size in bytes
+ * @param valueRangeSize  value range of the input
+ * @param minValue        minimum input value
+ * */
+void SZ_compress_args_int16_NoCkRngeNoGzip_2D(unsigned char** newByteData, int16_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int16_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_int16_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2;
+	if(*outSize>dataLength*sizeof(int16_t))
+	{
+		/* The flat buffer produced above is about to be replaced; release it
+		 * so it does not leak. SZ_compress_args_int16() free()s the buffer
+		 * returned through newByteData, so it is heap-allocated. */
+		free(*newByteData);
+		SZ_compress_args_int16_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Quantize one sample of the 3D int16 compressor against its prediction.
+ *
+ * If the sample lies within exe_params->intvCapacity quantization steps of
+ * pred, *typeSlot receives the interval code (offset by
+ * exe_params->intvRadius) and *reconSlot the reconstructed value clamped to
+ * the int16 range. Otherwise *typeSlot is 0, *reconSlot is the sample itself
+ * and its exact bytes are appended to exactDataByteArray.
+ */
+static void SZ_int16_3D_quantizePoint(int16_t curData, int64_t pred, double realPrecision,
+		int* typeSlot, int16_t* reconSlot, int64_t minValue, int byteSize,
+		unsigned char* bytes, DynamicByteArray* exactDataByteArray)
+{
+	int diff = curData - pred;
+	double itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		int64_t tmp;
+		if (diff < 0) itvNum = -itvNum;
+		*typeSlot = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred + 2 * (*typeSlot - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+			*reconSlot = tmp;
+		else if(tmp < SZ_INT16_MIN)
+			*reconSlot = SZ_INT16_MIN;
+		else
+			*reconSlot = SZ_INT16_MAX;
+	}
+	else
+	{
+		*typeSlot = 0;
+		*reconSlot = curData;
+		compressInt16Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+}
+
+/**
+ * Compress a 3D int16 array (r1 layers of r2 rows of r3 samples) with
+ * Lorenzo-style prediction and linear-scaling quantization.
+ *
+ * Two layer buffers of r2*r3 samples are kept: P1 holds the reconstructed
+ * previous layer, P0 the layer being processed; they are swapped after each
+ * layer.
+ *
+ * Sample 1 of the first row is processed unconditionally, so the input must
+ * hold at least two samples.
+ * NOTE(review): with r2*r3 == 1 that step writes P1[1], one element past
+ * the layer buffer; left unchanged because the decoder presumably mirrors
+ * this traversal -- confirm before altering it.
+ *
+ * @param oriData         input samples (r1*r2*r3 elements)
+ * @param r1              number of layers (highest dimension)
+ * @param r2              rows per layer
+ * @param r3              samples per row (lowest dimension)
+ * @param realPrecision   error bound; half the width of one quantization interval
+ * @param valueRangeSize  value range of the input, only used to derive byteSize
+ * @param minValue        minimum input value, forwarded to compressInt16Value()
+ * @return newly created storage object; callers in this file release it with
+ *         free_TightDataPointStorageI()
+ */
+TightDataPointStorageI* SZ_compress_int16_3D_MDQ(int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int16_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k;
+	size_t index, index2D;
+	int64_t pred;
+	int16_t *P0, *P1;
+
+	size_t dataLength = r1*r2*r3;
+
+	size_t r23 = r2*r3;
+	P0 = (int16_t*)malloc(r23*sizeof(int16_t));
+	P1 = (int16_t*)malloc(r23*sizeof(int16_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int16_t* spaceFillingValue = oriData;
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+
+	/* Row 0, sample 0: always stored exactly */
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressInt16Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Row 0, sample 1: predicted by its left neighbour */
+	pred = P1[0];
+	SZ_int16_3D_quantizePoint(spaceFillingValue[1], pred, realPrecision,
+			&type[1], &P1[1], minValue, byteSize, bytes, exactDataByteArray);
+
+	/* Row 0, samples 2 .. r3-1: linear extrapolation from the two left neighbours */
+	for (j = 2; j < r3; j++)
+	{
+		pred = 2*P1[j-1] - P1[j-2];
+		SZ_int16_3D_quantizePoint(spaceFillingValue[j], pred, realPrecision,
+				&type[j], &P1[j], minValue, byteSize, bytes, exactDataByteArray);
+	}
+
+	/* Rows 1 .. r2-1 of layer 0 */
+	for (i = 1; i < r2; i++)
+	{
+		/* sample 0: predicted by the sample in the row above */
+		index = i*r3;
+		pred = P1[index-r3];
+		SZ_int16_3D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+				&type[index], &P1[index], minValue, byteSize, bytes, exactDataByteArray);
+
+		/* samples 1 .. r3-1: left + above - above-left */
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+			SZ_int16_3D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+					&type[index], &P1[index], minValue, byteSize, bytes, exactDataByteArray);
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Row 0, sample 0: predicted by the same position in the previous layer */
+		index = k*r23;
+		pred = P1[0];
+		SZ_int16_3D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+				&type[index], &P0[0], minValue, byteSize, bytes, exactDataByteArray);
+
+		/* Row 0, samples 1 .. r3-1: 2D prediction across (sample, layer) */
+		for (j = 1; j < r3; j++)
+		{
+			index = k*r23+j;
+			pred = P0[j-1] + P1[j] - P1[j-1];
+			SZ_int16_3D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+					&type[index], &P0[j], minValue, byteSize, bytes, exactDataByteArray);
+		}
+
+		/* Rows 1 .. r2-1 */
+		for (i = 1; i < r2; i++)
+		{
+			/* sample 0: 2D prediction across (row, layer) */
+			index = k*r23 + i*r3;
+			index2D = i*r3;
+			pred = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			SZ_int16_3D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+					&type[index], &P0[index2D], minValue, byteSize, bytes, exactDataByteArray);
+
+			/* samples 1 .. r3-1: full 3D prediction from the seven neighbours */
+			for (j = 1; j < r3; j++)
+			{
+				index = k*r23 + i*r3 + j;
+				index2D = i*r3 + j;
+				pred = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				SZ_int16_3D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+						&type[index], &P0[index2D], minValue, byteSize, bytes, exactDataByteArray);
+			}
+		}
+
+		/* the layer just written becomes the "previous layer" */
+		int16_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	/* NOTE(review): P0 is a separate allocation, so skipping this free when
+	 * r23 == 1 looks like a leak; kept as-is because of the r23 == 1
+	 * out-of-range write noted in the header comment. */
+	if(r23!=1)
+		free(P0);
+	free(P1);
+
+	/* NOTE(review): the 1D variant divides the byte count by byteSize here;
+	 * kept unchanged pending a check of how the decoder uses exactDataNum. */
+	size_t exactDataNum = exactDataByteArray->size;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT16);
+
+	//free memory
+	free(type);
+	/* only the wrapper is freed; the original comment states that the byte
+	 * array itself is released together with tdps */
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+
+/**
+ * Compress a 3D int16 array without range checking and without the final
+ * lossless (gzip) stage. Falls back to storing the raw input when the
+ * quantized stream would be larger than the input.
+ *
+ * @param newByteData     receives the heap-allocated output buffer
+ * @param oriData         input samples (r1*r2*r3 elements)
+ * @param r1,r2,r3        dimensions, r1 being the highest
+ * @param realPrecision   absolute error bound
+ * @param outSize         receives the output size in bytes
+ * @param valueRangeSize  value range of the input
+ * @param minValue        minimum input value
+ */
+void SZ_compress_args_int16_NoCkRngeNoGzip_3D(unsigned char** newByteData, int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_int16_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2*r3;
+	if(*outSize>dataLength*sizeof(int16_t))
+	{
+		/* The flat buffer produced above is about to be replaced; release it
+		 * so it does not leak. SZ_compress_args_int16() free()s the buffer
+		 * returned through newByteData, so it is heap-allocated. */
+		free(*newByteData);
+		SZ_compress_args_int16_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(tdps);
+}
+
+
+/**
+ * Quantize one sample of the 4D int16 compressor against its prediction.
+ *
+ * If the sample lies within exe_params->intvCapacity quantization steps of
+ * pred, *typeSlot receives the interval code (offset by
+ * exe_params->intvRadius) and *reconSlot the reconstructed value clamped to
+ * the int16 range. Otherwise *typeSlot is 0, *reconSlot is the sample itself
+ * and its exact bytes are appended to exactDataByteArray.
+ */
+static void SZ_int16_4D_quantizePoint(int16_t curData, int64_t pred, double realPrecision,
+		int* typeSlot, int16_t* reconSlot, int64_t minValue, int byteSize,
+		unsigned char* bytes, DynamicByteArray* exactDataByteArray)
+{
+	int diff = curData - pred;
+	double itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		int64_t tmp;
+		if (diff < 0) itvNum = -itvNum;
+		*typeSlot = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred + 2 * (*typeSlot - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+			*reconSlot = tmp;
+		else if(tmp < SZ_INT16_MIN)
+			*reconSlot = SZ_INT16_MIN;
+		else
+			*reconSlot = SZ_INT16_MAX;
+	}
+	else
+	{
+		*typeSlot = 0;
+		*reconSlot = curData;
+		compressInt16Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+}
+
+/**
+ * Compress a 4D int16 array (r1 x r2 x r3 x r4, r4 fastest-varying) with
+ * Lorenzo-style prediction and linear-scaling quantization.
+ *
+ * Each of the r1 slices is compressed independently as a 3D block of
+ * r2 layers x r3 rows x r4 samples. Two layer buffers of r3*r4 samples are
+ * kept: P1 holds the reconstructed previous layer, P0 the layer being
+ * processed; they are swapped after each layer.
+ *
+ * Every sample is quantized against, and stored from, its own value
+ * spaceFillingValue[index]. The previous revision compared sample 1 of each
+ * slice using the slice's first value (so its difference was always 0) and
+ * stored spaceFillingValue[0] for every unpredictable sample after the
+ * slice's first one, which produced wrong reconstructed data.
+ * NOTE(review): assumes the decoder rebuilds values from type[] and the
+ * exact-value stream in this same traversal order -- confirm.
+ *
+ * Sample 1 of every slice is processed unconditionally, so r3*r4 must be at
+ * least 2 for P1[1] to be in range.
+ *
+ * @param oriData         input samples (r1*r2*r3*r4 elements)
+ * @param r1,r2,r3,r4     dimensions, r1 being the highest
+ * @param realPrecision   error bound; half the width of one quantization interval
+ * @param valueRangeSize  value range of the input, only used to derive byteSize
+ * @param minValue        minimum input value, forwarded to compressInt16Value()
+ * @return newly created storage object; callers in this file release it with
+ *         free_TightDataPointStorageI()
+ */
+TightDataPointStorageI* SZ_compress_int16_4D_MDQ(int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int16_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k,l;
+	size_t index, index2D;
+	int64_t pred;
+	int16_t *P0, *P1;
+
+	size_t dataLength = r1*r2*r3*r4;
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	P0 = (int16_t*)malloc(r34*sizeof(int16_t));
+	P1 = (int16_t*)malloc(r34*sizeof(int16_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int16_t* spaceFillingValue = oriData;
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	for (l = 0; l < r1; l++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+
+		/* Row 0, sample 0: always stored exactly */
+		index = l*r234;
+		type[index] = 0;
+		P1[0] = spaceFillingValue[index];
+		compressInt16Value(spaceFillingValue[index], minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Row 0, sample 1: predicted by its left neighbour */
+		index = l*r234+1;
+		pred = P1[0];
+		SZ_int16_4D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+				&type[index], &P1[1], minValue, byteSize, bytes, exactDataByteArray);
+
+		/* Row 0, samples 2 .. r4-1: linear extrapolation from the two left neighbours */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			pred = 2*P1[j-1] - P1[j-2];
+			SZ_int16_4D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+					&type[index], &P1[j], minValue, byteSize, bytes, exactDataByteArray);
+		}
+
+		/* Rows 1 .. r3-1 of layer 0 */
+		for (i = 1; i < r3; i++)
+		{
+			/* sample 0: predicted by the sample in the row above */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+			pred = P1[index2D-r4];
+			SZ_int16_4D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+					&type[index], &P1[index2D], minValue, byteSize, bytes, exactDataByteArray);
+
+			/* samples 1 .. r4-1: left + above - above-left */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+				pred = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+				SZ_int16_4D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+						&type[index], &P1[index2D], minValue, byteSize, bytes, exactDataByteArray);
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Row 0, sample 0: predicted by the same position in the previous layer */
+			index = l*r234+k*r34;
+			pred = P1[0];
+			SZ_int16_4D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+					&type[index], &P0[0], minValue, byteSize, bytes, exactDataByteArray);
+
+			/* Row 0, samples 1 .. r4-1: 2D prediction across (sample, layer) */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				pred = P0[j-1] + P1[j] - P1[j-1];
+				SZ_int16_4D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+						&type[index], &P0[j], minValue, byteSize, bytes, exactDataByteArray);
+			}
+
+			/* Rows 1 .. r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* sample 0: 2D prediction across (row, layer) */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+				pred = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				SZ_int16_4D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+						&type[index], &P0[index2D], minValue, byteSize, bytes, exactDataByteArray);
+
+				/* samples 1 .. r4-1: full 3D prediction from the seven neighbours */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+					pred = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					SZ_int16_4D_quantizePoint(spaceFillingValue[index], pred, realPrecision,
+							&type[index], &P0[index2D], minValue, byteSize, bytes, exactDataByteArray);
+				}
+			}
+
+			/* the layer just written becomes the "previous layer" */
+			int16_t *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+
+	/* NOTE(review): the 1D variant divides the byte count by byteSize here;
+	 * kept unchanged pending a check of how the decoder uses exactDataNum. */
+	size_t exactDataNum = exactDataByteArray->size;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT16);
+
+	//free memory
+	free(type);
+	/* only the wrapper is freed; the original comment states that the byte
+	 * array itself is released together with tdps */
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+/**
+ * Compress a 4D int16 array without range checking and without the final
+ * lossless (gzip) stage. Falls back to storing the raw input when the
+ * quantized stream would be larger than the input.
+ *
+ * @param newByteData     receives the heap-allocated output buffer
+ * @param oriData         input samples (r1*r2*r3*r4 elements)
+ * @param r1,r2,r3,r4     dimensions, r1 being the highest
+ * @param realPrecision   absolute error bound
+ * @param outSize         receives the output size in bytes
+ * @param valueRangeSize  value range of the input
+ * @param minValue        minimum input value
+ */
+void SZ_compress_args_int16_NoCkRngeNoGzip_4D(unsigned char** newByteData, int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_int16_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2*r3*r4;
+	if(*outSize>dataLength*sizeof(int16_t))
+	{
+		/* The flat buffer produced above is about to be replaced; release it
+		 * so it does not leak. SZ_compress_args_int16() free()s the buffer
+		 * returned through newByteData, so it is heap-allocated. */
+		free(*newByteData);
+		SZ_compress_args_int16_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Emit the stream for an input whose value range does not exceed the error
+ * bound: the whole series is represented by its first sample.
+ *
+ * @param newByteData  receives the output buffer produced by
+ *                     convertTDPStoFlatBytes_int()
+ * @param oriData      input samples; only oriData[0] is read
+ * @param dataLength   number of samples, recorded as the series length
+ * @param outSize      receives the output size in bytes
+ */
+void SZ_compress_args_int16_withinRange(unsigned char** newByteData, int16_t *oriData, size_t dataLength, size_t *outSize)
+{
+	size_t flatSize;
+	TightDataPointStorageI* storage = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+
+	/* descriptor flags: all samples identical, nothing quantized */
+	storage->typeArray = NULL;
+	storage->allSameData = 1;
+	storage->isLossless = 0;
+	storage->dataSeriesLength = dataLength;
+	storage->dataTypeSize = convertDataTypeSize(sizeof(int16_t));
+
+	/* a single exact value of two bytes, stored big-endian */
+	storage->exactDataNum = 1;
+	storage->exactDataBytes_size = 2;
+	storage->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char)*2);
+	int16ToBytes_bigEndian(storage->exactDataBytes, oriData[0]);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, &flatSize);
+	*outSize = flatSize;
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Compress an int16 array with a value-range check but without the final
+ * lossless (gzip) stage.
+ *
+ * Unused higher dimensions are passed as 0; r1 is the lowest (fastest)
+ * dimension. If the value range does not exceed the error bound, the series
+ * is emitted as a single value. 4D input is compressed by the 3D routine
+ * with r4 and r3 merged into one dimension.
+ *
+ * @param newByteData    receives the output buffer
+ * @param oriData        input samples
+ * @param r5..r1         dimensions (0 = unused)
+ * @param outSize        receives the output size in bytes
+ * @param errBoundMode   error-bound mode, forwarded to getRealPrecision_int()
+ * @param absErr_Bound   absolute error bound
+ * @param relBoundRatio  value-range-relative error bound ratio
+ * @return status from getRealPrecision_int() (initially SZ_SCES), or SZ_DERR
+ *         when five dimensions are requested; in that case the outputs are
+ *         not written
+ */
+int SZ_compress_args_int16_wRngeNoGzip(unsigned char** newByteData, int16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	int16_t minValue = computeRangeSize_int(oriData, SZ_INT16, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int16_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		if(r5==0&&r4==0&&r3==0&&r2==0)
+		{
+			SZ_compress_args_int16_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0&&r3==0)
+		{
+			SZ_compress_args_int16_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0)
+		{
+			SZ_compress_args_int16_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0)
+		{
+			/* 4D input: the two highest dimensions are merged and the 3D routine is used */
+			SZ_compress_args_int16_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			/* Previously this case fell through silently: nothing was
+			 * compressed, yet success was returned. Report it the same way
+			 * SZ_compress_args_int16() does. */
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			status = SZ_DERR; //dimension error
+		}
+	}
+	return status;
+}
+
+/**
+ * Top-level int16 compression entry point.
+ *
+ * Computes the value range and the effective absolute error bound, runs the
+ * dimension-specific quantization compressor, and then, depending on
+ * confparams_cpr->szMode, either returns that stream directly
+ * (SZ_BEST_SPEED) or passes it through sz_lossless_compress().
+ *
+ * Unused higher dimensions are passed as 0; r1 is the lowest (fastest)
+ * dimension.
+ *
+ * @param newByteData    receives the heap-allocated output buffer
+ * @param oriData        input samples
+ * @param r5..r1         dimensions (0 = unused)
+ * @param outSize        receives the output size in bytes
+ * @param errBoundMode   error-bound mode; modes >= PW_REL are rejected
+ * @param absErr_Bound   absolute error bound
+ * @param relBoundRatio  value-range-relative error bound ratio
+ * @return SZ_SCES or the status set by getRealPrecision_int(); SZ_DERR for
+ *         five dimensions (outputs set to NULL / 0); SZ_MERR for an unknown
+ *         szMode (outputs not written)
+ */
+int SZ_compress_args_int16(unsigned char** newByteData, int16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		exit(0);
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	int16_t minValue = (int16_t)computeRangeSize_int(oriData, SZ_INT16, dataLength, &valueRangeSize);
+	double realPrecision = 0;
+
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		/* PSNR mode is translated into an equivalent absolute error bound */
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int16_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL;
+		if (r2==0)
+		{
+			SZ_compress_args_int16_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r3==0)
+		{
+			SZ_compress_args_int16_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r4==0)
+		{
+			SZ_compress_args_int16_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r5==0)
+		{
+			SZ_compress_args_int16_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			status = SZ_DERR; //dimension error
+			/* Nothing was compressed, so there is no intermediate buffer to
+			 * publish or post-process. Return now instead of handing an
+			 * unset pointer to the caller or to sz_lossless_compress(). */
+			*newByteData = NULL;
+			*outSize = 0;
+			return status;
+		}
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			/* ownership of the intermediate buffer passes to the caller */
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the int16_t compression.\n");
+			status = SZ_MERR; //mode error			
+			/* the intermediate stream is not handed to anyone; release it */
+			free(tmpByteData);
+		}
+	}
+
+	return status;
+}
diff --git a/deps/SZ/sz/src/sz_int32.c b/deps/SZ/sz/src/sz_int32.c
new file mode 100644
index 0000000000000000000000000000000000000000..7b559c94a8461c0eed1fa4014d5fd15def2b2f3a
--- /dev/null
+++ b/deps/SZ/sz/src/sz_int32.c
@@ -0,0 +1,1269 @@
+/**
+ *  @file sz_int32.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_int32, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_int32.h"
+#include "utility.h"
+
+/**
+ * Estimate the number of quantization intervals for a 1D int32_t series.
+ *
+ * Elements whose index is a multiple of confparams_cpr->sampleDistance
+ * (starting from index 2) are predicted by their immediate predecessor. The
+ * prediction errors are binned by quantization radius, and the smallest radius
+ * whose cumulative count exceeds totalSampleSize*predThreshold is selected.
+ *
+ * @param oriData        input series
+ * @param dataLength     number of elements in oriData
+ * @param realPrecision  absolute error bound used to scale the prediction error
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), never less than 32
+ */
+unsigned int optimize_intervals_int32_1D(int32_t *oriData, size_t dataLength, double realPrecision)
+{
+	const size_t histogramBytes = confparams_cpr->maxRangeRadius*sizeof(size_t);
+	size_t *histogram = (size_t*)malloc(histogramBytes);
+	memset(histogram, 0, histogramBytes);
+
+	const size_t sampleCount = dataLength/confparams_cpr->sampleDistance;
+	size_t pos;
+	for(pos=2; pos<dataLength; pos++)
+	{
+		/* only every sampleDistance-th element contributes to the histogram */
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+
+		/* previous-value predictor, evaluated in 64-bit */
+		int64_t predicted = oriData[pos-1];
+		int64_t absError = llabs(predicted - oriData[pos]);
+		size_t bucket = (uint64_t)((absError/realPrecision+1)/2);
+		/* everything beyond the last radius is folded into the last bin */
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	/* find the first radius whose cumulative hit count passes the threshold */
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>threshold)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int result = roundUpToPowerOf2(accIntervals);
+
+	free(histogram);
+	return result<32 ? 32 : result;
+}
+
+/**
+ * Estimate the number of quantization intervals for a 2D int32_t array.
+ *
+ * Interior points (i>=1, j>=1) with (i+j) divisible by
+ * confparams_cpr->sampleDistance are predicted from their left, upper and
+ * upper-left neighbours. The prediction errors are binned by quantization
+ * radius, and the smallest radius whose cumulative count exceeds
+ * totalSampleSize*predThreshold is selected.
+ *
+ * @param oriData        input array, indexed as i*r2+j
+ * @param r1             number of rows (slow dimension)
+ * @param r2             number of columns (fast dimension)
+ * @param realPrecision  absolute error bound used to scale the prediction error
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), never less than 32
+ */
+unsigned int optimize_intervals_int32_2D(int32_t *oriData, size_t r1, size_t r2, double realPrecision)
+{	
+	size_t i,j, index;
+	size_t radiusIndex;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = r1*r2/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			if((i+j)%confparams_cpr->sampleDistance==0)
+			{
+				index = i*r2+j;
+				/* widen the first operand so the whole sum is evaluated in 64-bit;
+				 * in plain int arithmetic it could overflow for large-magnitude data */
+				pred_value = (int64_t)oriData[index-1] + oriData[index-r2] - oriData[index-r2-1];
+				pred_err = llabs(pred_value - oriData[index]);
+				radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+				/* everything beyond the last radius is folded into the last bin */
+				if(radiusIndex>=confparams_cpr->maxRangeRadius)
+					radiusIndex = confparams_cpr->maxRangeRadius - 1;
+				intervals[radiusIndex]++;
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Estimate the number of quantization intervals for a 3D int32_t array.
+ *
+ * Interior points (i,j,k >= 1) with (i+j+k) divisible by
+ * confparams_cpr->sampleDistance are predicted from their seven already-visited
+ * neighbours. The prediction errors are binned by quantization radius, and the
+ * smallest radius whose cumulative count exceeds totalSampleSize*predThreshold
+ * is selected.
+ *
+ * @param oriData        input array, indexed as i*r2*r3 + j*r3 + k
+ * @param r1             slowest dimension
+ * @param r2             middle dimension
+ * @param r3             fastest dimension
+ * @param realPrecision  absolute error bound used to scale the prediction error
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), never less than 32
+ */
+unsigned int optimize_intervals_int32_3D(int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{	
+	size_t i,j,k, index;
+	size_t radiusIndex;
+	size_t r23=r2*r3;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				if((i+j+k)%confparams_cpr->sampleDistance==0)
+				{
+					index = i*r23+j*r3+k;
+					/* widen the first operand so the whole seven-term sum is evaluated
+					 * in 64-bit; in plain int arithmetic it could overflow */
+					pred_value = (int64_t)oriData[index-1] + oriData[index-r3] + oriData[index-r23]
+					- oriData[index-1-r23] - oriData[index-r3-1] - oriData[index-r3-r23] + oriData[index-r3-r23-1];
+					pred_err = llabs(pred_value - oriData[index]);
+					radiusIndex = (pred_err/realPrecision+1)/2;
+					/* everything beyond the last radius is folded into the last bin */
+					if(radiusIndex>=confparams_cpr->maxRangeRadius)
+					{
+						radiusIndex = confparams_cpr->maxRangeRadius - 1;
+					}
+					intervals[radiusIndex]++;
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+	
+	free(intervals);
+	return powerOf2;
+}
+
+
+/**
+ * Estimate the number of quantization intervals for a 4D int32_t array.
+ *
+ * Interior points (i,j,k,l >= 1) with (i+j+k+l) divisible by
+ * confparams_cpr->sampleDistance are predicted from seven neighbours taken
+ * along the three fastest dimensions (strides 1, r4 and r3*r4). The prediction
+ * errors are binned by quantization radius, and the smallest radius whose
+ * cumulative count exceeds totalSampleSize*predThreshold is selected.
+ *
+ * @param oriData        input array, indexed as i*r2*r3*r4 + j*r3*r4 + k*r4 + l
+ * @param r1             slowest dimension
+ * @param r2             second dimension
+ * @param r3             third dimension
+ * @param r4             fastest dimension
+ * @param realPrecision  absolute error bound used to scale the prediction error
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), never less than 32
+ */
+unsigned int optimize_intervals_int32_4D(int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						/* The neighbour one row back is r4 elements away (the index is
+						 * built with k*r4), so the stride here must be r4, not r3.
+						 * The first operand is widened so the sum is evaluated in 64-bit
+						 * and cannot overflow int. */
+						pred_value = (int64_t)oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						/* everything beyond the last radius is folded into the last bin */
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Quantize a 1D int32_t series against a previous-value predictor.
+ *
+ * The first two elements are always stored exactly. Every later element is
+ * predicted by the most recent reconstructed value; if the absolute residual is
+ * below (intvCapacity-1)*realPrecision it is encoded as a quantization code in
+ * type[], otherwise type[] is 0 and the value is appended to the exact-data
+ * byte array via compressInt32Value().
+ *
+ * @param oriData         input series
+ * @param dataLength      number of elements in oriData
+ * @param realPrecision   absolute error bound
+ * @param valueRangeSize  value range, used to pick the per-value byte size
+ * @param minValue        minimum value, forwarded to compressInt32Value()
+ * @return storage object built by new_TightDataPointStorageI(); the caller
+ *         releases it (the wrappers in this file use free_TightDataPointStorageI())
+ */
+TightDataPointStorageI* SZ_compress_int32_1D_MDQ(int32_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_int32_1D(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);
+	size_t i;
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int32_t* spaceFillingValue = oriData;
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	int64_t last3CmprsData[3] = {0,0,0};
+
+	/* The first two elements are stored exactly. Each is guarded so that a
+	 * series shorter than two elements no longer reads past oriData or writes
+	 * past type[]. */
+	if(dataLength > 0)
+	{
+		type[0] = 0;
+		compressInt32Value(spaceFillingValue[0], minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, spaceFillingValue[0]);
+	}
+
+	if(dataLength > 1)
+	{
+		type[1] = 0;
+		compressInt32Value(spaceFillingValue[1], minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, spaceFillingValue[1]);
+	}
+
+	int state;
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	int64_t curData;
+	int32_t pred;
+	/* 64-bit on purpose: |curData - pred| can reach 2^32-1, which does not fit
+	 * in int32_t; a truncated (possibly negative) residual would be classified
+	 * as predictable and break the error bound. */
+	int64_t predAbsErr;
+	double interval = 2*realPrecision;
+
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		/* previous-value predictor */
+		pred = last3CmprsData[0];
+		predAbsErr = llabs(curData - pred);
+		if(predAbsErr<checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			/* feed the reconstructed value back, not the original one */
+			listAdd_int(last3CmprsData, pred);
+			continue;
+		}
+
+		//unpredictable data processing
+		type[i] = 0;
+		compressInt32Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, curData);
+	}//end of for
+
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT32);
+
+	//free memory
+	free(type);
+	/* Only the wrapper struct is freed here; per the original note,
+	 * exactDataByteArray->array is released together with tdps. */
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+/**
+ * Build a "lossless" output stream that carries the original int32_t values.
+ *
+ * Layout written to the freshly malloc()-ed *newByteData:
+ *   3 bytes of versionNumber, 1 flag byte (16 when SZ_SIZE_TYPE is 4, else 80),
+ *   MetaDataByteLength bytes from convertSZParamsToBytes(), SZ_SIZE_TYPE bytes
+ *   holding dataLength, then dataLength values in big-endian byte order.
+ *
+ * @param oriData      values to store
+ * @param dataLength   number of values in oriData
+ * @param tdps         storage object; its isLossless flag is set to 1
+ * @param newByteData  receives the allocated output buffer
+ * @param outSize      receives the total size computed for the buffer
+ */
+void SZ_compress_args_int32_StoreOriData(int32_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	int elemSize=sizeof(int32_t);
+	size_t totalByteLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + elemSize*dataLength;
+	unsigned char* out = (unsigned char*)malloc(totalByteLength);
+	unsigned char lengthBytes[8];
+	size_t pos = 0, n;
+
+	tdps->isLossless = 1;
+	*newByteData = out;
+
+	/* version triple */
+	for (n = 0; n < 3; n++)
+		out[pos++] = versionNumber[n];
+
+	/* flag byte: 00010000, plus 01000000 when SZ_SIZE_TYPE is not 4 */
+	out[pos++] = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	/* compression parameters */
+	convertSZParamsToBytes(confparams_cpr, &out[pos]);
+	pos += MetaDataByteLength;
+
+	/* element count, SZ_SIZE_TYPE (4 or 8) bytes */
+	sizeToBytes(lengthBytes,dataLength);
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		out[pos++] = lengthBytes[n];
+
+	/* payload, always big-endian */
+	unsigned char* payload = out+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+	if(sysEndianType==BIG_ENDIAN_SYSTEM)
+	{
+		memcpy(payload, oriData, dataLength*elemSize);
+	}
+	else
+	{
+		for(n=0;n<dataLength;n++)
+			int32ToBytes_bigEndian(payload+n*elemSize, oriData[n]);
+	}
+
+	*outSize = totalByteLength;
+}
+
+/**
+ * Compress a 1D int32_t series without range check and without the lossless
+ * (gzip) stage.
+ *
+ * The data is quantized by SZ_compress_int32_1D_MDQ() and flattened by
+ * convertTDPStoFlatBytes_int(). If the flattened stream is larger than the raw
+ * data (dataLength*sizeof(int32_t)), the raw values are stored instead via
+ * SZ_compress_args_int32_StoreOriData().
+ *
+ * @param newByteData     receives the output buffer
+ * @param oriData         input series
+ * @param dataLength      number of elements in oriData
+ * @param realPrecision   absolute error bound
+ * @param outSize         receives the output size in bytes
+ * @param valueRangeSize  value range of the data
+ * @param minValue        minimum value of the data
+ */
+void SZ_compress_args_int32_NoCkRngeNoGzip_1D(unsigned char** newByteData, int32_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int32_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_int32_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	if(*outSize > dataLength*sizeof(int32_t))
+	{
+		/* StoreOriData() overwrites *newByteData with a new allocation, so the
+		 * flattened buffer has to be released first.
+		 * NOTE(review): assumes convertTDPStoFlatBytes_int() malloc()s *newByteData - confirm. */
+		free(*newByteData);
+		/* Pass exactly dataLength: oriData holds no padding elements, so
+		 * dataLength+2 read two values past the end of the input and recorded
+		 * a wrong element count in the stream header. */
+		SZ_compress_args_int32_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Quantize one int32_t value against its prediction (2D helper).
+ *
+ * If |value - pred|/realPrecision + 1 is below exe_params->intvCapacity the
+ * value is encoded as a quantization code in *typeSlot and the reconstructed
+ * value is returned. Otherwise *typeSlot is set to 0, the value is appended to
+ * exactDataByteArray via compressInt32Value(), and the value itself is returned.
+ *
+ * The residual is computed in 64-bit: with 32-bit arithmetic it overflows when
+ * value and prediction are far apart, which could make an unpredictable value
+ * look predictable.
+ */
+static int32_t SZ_quantizeInt32Point_2D(int32_t value, int32_t pred, double realPrecision, int64_t minValue,
+		int byteSize, unsigned char* bytes, DynamicByteArray* exactDataByteArray, int* typeSlot)
+{
+	int64_t diff = (int64_t)value - pred;
+	double itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		*typeSlot = (int) (itvNum/2) + exe_params->intvRadius;
+		return pred + 2 * (*typeSlot - exe_params->intvRadius) * realPrecision;
+	}
+
+	*typeSlot = 0;
+	compressInt32Value(value, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	return value;
+}
+
+/**
+ * Quantize a 2D int32_t array (r1 rows of r2 values).
+ *
+ * Row 0 is predicted along the row (previous value for element 1, linear
+ * extrapolation from element 2 on). The first element of every later row is
+ * predicted from the element above it; all other elements use the
+ * left + upper - upper-left predictor. Predictions are built from reconstructed
+ * values kept in two row buffers (P0 = current row, P1 = previous row).
+ *
+ * The predictor arithmetic itself is left in int32_t as in the original code.
+ * NOTE(review): the decompressor (not visible here) presumably has to repeat
+ * it exactly - confirm before widening it.
+ * NOTE(review): element 1 of row 0 is accessed unconditionally, so r2 >= 2 is
+ * assumed - confirm against callers.
+ *
+ * @param oriData         input array, indexed as i*r2+j
+ * @param r1              number of rows (high dimension)
+ * @param r2              number of columns (low dimension)
+ * @param realPrecision   absolute error bound
+ * @param valueRangeSize  value range, used to pick the per-value byte size
+ * @param minValue        minimum value, forwarded to compressInt32Value()
+ * @return storage object built by new_TightDataPointStorageI()
+ */
+TightDataPointStorageI* SZ_compress_int32_2D_MDQ(int32_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int32_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j;
+	int32_t pred1D, pred2D;
+	int32_t *P0, *P1;
+
+	size_t dataLength = r1*r2;
+
+	P0 = (int32_t*)malloc(r2*sizeof(int32_t));
+	memset(P0, 0, r2*sizeof(int32_t));
+	P1 = (int32_t*)malloc(r2*sizeof(int32_t));
+	memset(P1, 0, r2*sizeof(int32_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int32_t* spaceFillingValue = oriData;
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	/* Element 0 is always stored exactly */
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressInt32Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1: previous-value predictor */
+	pred1D = P1[0];
+	P1[1] = SZ_quantizeInt32Point_2D(spaceFillingValue[1], pred1D, realPrecision, minValue,
+			byteSize, bytes, exactDataByteArray, &type[1]);
+
+	/* Process Row-0 data 2 --> data r2-1: linear extrapolation */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		P1[j] = SZ_quantizeInt32Point_2D(spaceFillingValue[j], pred1D, realPrecision, minValue,
+				byteSize, bytes, exactDataByteArray, &type[j]);
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{
+		/* Process row-i data 0: predicted from the element above */
+		index = i*r2;
+		pred1D = P1[0];
+		P0[0] = SZ_quantizeInt32Point_2D(spaceFillingValue[index], pred1D, realPrecision, minValue,
+				byteSize, bytes, exactDataByteArray, &type[index]);
+
+		/* Process row-i data 1 --> r2-1: left + upper - upper-left */
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			P0[j] = SZ_quantizeInt32Point_2D(spaceFillingValue[index], pred2D, realPrecision, minValue,
+					byteSize, bytes, exactDataByteArray, &type[index]);
+		}
+
+		/* the row just written becomes the "previous row" of the next iteration */
+		int32_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	if(r2!=1)
+		free(P0);
+	free(P1);
+
+	/* NOTE(review): this is a byte count, whereas the 1D variant divides by
+	 * byteSize; kept as-is because the value is handed to
+	 * new_TightDataPointStorageI() - confirm which one is intended. */
+	size_t exactDataNum = exactDataByteArray->size;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT32);
+
+	//free memory
+	free(type);
+	/* Only the wrapper struct is freed here; per the original note,
+	 * exactDataByteArray->array is released together with tdps. */
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+/**
+ * Compress a 2D int32_t array without range check and without the lossless
+ * (gzip) stage. Falls back to storing the raw values when the quantized
+ * stream is larger than r1*r2*sizeof(int32_t).
+ *
+ * Note: @r1 is high dimension
+ * 		 @r2 is low dimension 
+ *
+ * @param newByteData     receives the output buffer
+ * @param oriData         input array
+ * @param realPrecision   absolute error bound
+ * @param outSize         receives the output size in bytes
+ * @param valueRangeSize  value range of the data
+ * @param minValue        minimum value of the data
+ * */
+void SZ_compress_args_int32_NoCkRngeNoGzip_2D(unsigned char** newByteData, int32_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int32_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_int32_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2;
+	if(*outSize>dataLength*sizeof(int32_t))
+	{
+		/* StoreOriData() overwrites *newByteData with a new allocation, so the
+		 * flattened buffer has to be released first.
+		 * NOTE(review): assumes convertTDPStoFlatBytes_int() malloc()s *newByteData - confirm. */
+		free(*newByteData);
+		SZ_compress_args_int32_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Quantize one int32_t value against its prediction (3D helper).
+ *
+ * If |value - pred|/realPrecision + 1 is below exe_params->intvCapacity the
+ * value is encoded as a quantization code in *typeSlot and the reconstructed
+ * value is returned. Otherwise *typeSlot is set to 0, the value is appended to
+ * exactDataByteArray via compressInt32Value(), and the value itself is returned.
+ *
+ * The residual is computed in 64-bit: with 32-bit arithmetic it overflows when
+ * value and prediction are far apart, which could make an unpredictable value
+ * look predictable.
+ */
+static int32_t SZ_quantizeInt32Point_3D(int32_t value, int32_t pred, double realPrecision, int64_t minValue,
+		int byteSize, unsigned char* bytes, DynamicByteArray* exactDataByteArray, int* typeSlot)
+{
+	int64_t diff = (int64_t)value - pred;
+	double itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		*typeSlot = (int) (itvNum/2) + exe_params->intvRadius;
+		return pred + 2 * (*typeSlot - exe_params->intvRadius) * realPrecision;
+	}
+
+	*typeSlot = 0;
+	compressInt32Value(value, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	return value;
+}
+
+/**
+ * Quantize a 3D int32_t array (r1 layers of r2 rows of r3 values).
+ *
+ * Layer 0 is processed like a 2D array using the plane buffer P1. Every later
+ * layer is written to P0 and predicted from P0 (current layer) and P1
+ * (previous layer): a 1D predictor for its first element, 2D predictors along
+ * its first row and first column, and the seven-neighbour 3D predictor
+ * elsewhere. The buffers are swapped after each layer.
+ *
+ * The predictor arithmetic itself is left in int32_t as in the original code.
+ * NOTE(review): the decompressor (not visible here) presumably has to repeat
+ * it exactly - confirm before widening it.
+ * NOTE(review): element 1 of the first row is accessed unconditionally, so at
+ * least two elements per layer are assumed - confirm against callers.
+ *
+ * @param oriData         input array, indexed as k*r2*r3 + i*r3 + j
+ * @param r1              slowest dimension (layers)
+ * @param r2              middle dimension (rows)
+ * @param r3              fastest dimension
+ * @param realPrecision   absolute error bound
+ * @param valueRangeSize  value range, used to pick the per-value byte size
+ * @param minValue        minimum value, forwarded to compressInt32Value()
+ * @return storage object built by new_TightDataPointStorageI()
+ */
+TightDataPointStorageI* SZ_compress_int32_3D_MDQ(int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int32_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k;
+	int32_t pred1D, pred2D, pred3D;
+	int32_t *P0, *P1;
+
+	size_t dataLength = r1*r2*r3;
+
+	size_t r23 = r2*r3;
+	P0 = (int32_t*)malloc(r23*sizeof(int32_t));
+	P1 = (int32_t*)malloc(r23*sizeof(int32_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int32_t* spaceFillingValue = oriData;
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	/* Element 0 is always stored exactly */
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressInt32Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1: previous-value predictor */
+	pred1D = P1[0];
+	P1[1] = SZ_quantizeInt32Point_3D(spaceFillingValue[1], pred1D, realPrecision, minValue,
+			byteSize, bytes, exactDataByteArray, &type[1]);
+
+	/* Process Row-0 data 2 --> data r3-1: linear extrapolation */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		P1[j] = SZ_quantizeInt32Point_3D(spaceFillingValue[j], pred1D, realPrecision, minValue,
+				byteSize, bytes, exactDataByteArray, &type[j]);
+	}
+
+	/* Layer 0: Process Row-1 --> Row-r2-1 */
+	size_t index;
+	size_t index2D;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0: predicted from the element above */
+		index = i*r3;
+		pred1D = P1[index-r3];
+		P1[index] = SZ_quantizeInt32Point_3D(spaceFillingValue[index], pred1D, realPrecision, minValue,
+				byteSize, bytes, exactDataByteArray, &type[index]);
+
+		/* Process row-i data 1 --> data r3-1: left + upper - upper-left */
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+			P1[index] = SZ_quantizeInt32Point_3D(spaceFillingValue[index], pred2D, realPrecision, minValue,
+					byteSize, bytes, exactDataByteArray, &type[index]);
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0: predicted from the same position one layer back */
+		index = k*r23;
+		pred1D = P1[0];
+		P0[0] = SZ_quantizeInt32Point_3D(spaceFillingValue[index], pred1D, realPrecision, minValue,
+				byteSize, bytes, exactDataByteArray, &type[index]);
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			index = k*r23+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			P0[j] = SZ_quantizeInt32Point_3D(spaceFillingValue[index], pred2D, realPrecision, minValue,
+					byteSize, bytes, exactDataByteArray, &type[index]);
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			index2D = i*r3;
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			P0[index2D] = SZ_quantizeInt32Point_3D(spaceFillingValue[index], pred2D, realPrecision, minValue,
+					byteSize, bytes, exactDataByteArray, &type[index]);
+
+			/* Process Row-i data 1 --> data r3-1: seven-neighbour predictor */
+			for (j = 1; j < r3; j++)
+			{
+				index = k*r23 + i*r3 + j;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				P0[index2D] = SZ_quantizeInt32Point_3D(spaceFillingValue[index], pred3D, realPrecision, minValue,
+						byteSize, bytes, exactDataByteArray, &type[index]);
+			}
+		}
+
+		/* the layer just written becomes the "previous layer" of the next iteration */
+		int32_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1)
+		free(P0);
+	free(P1);
+
+	/* NOTE(review): this is a byte count, whereas the 1D variant divides by
+	 * byteSize; kept as-is because the value is handed to
+	 * new_TightDataPointStorageI() - confirm which one is intended. */
+	size_t exactDataNum = exactDataByteArray->size;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT32);
+
+	//free memory
+	free(type);
+	/* Only the wrapper struct is freed here; per the original note,
+	 * exactDataByteArray->array is released together with tdps. */
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+
+/**
+ * Compress a 3D int32_t array without range check and without the lossless
+ * (gzip) stage. Falls back to storing the raw values when the quantized
+ * stream is larger than r1*r2*r3*sizeof(int32_t).
+ *
+ * @param newByteData     receives the output buffer
+ * @param oriData         input array
+ * @param r1              slowest dimension
+ * @param r2              middle dimension
+ * @param r3              fastest dimension
+ * @param realPrecision   absolute error bound
+ * @param outSize         receives the output size in bytes
+ * @param valueRangeSize  value range of the data
+ * @param minValue        minimum value of the data
+ */
+void SZ_compress_args_int32_NoCkRngeNoGzip_3D(unsigned char** newByteData, int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{	
+	TightDataPointStorageI* tdps = SZ_compress_int32_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2*r3;
+	if(*outSize>dataLength*sizeof(int32_t))
+	{
+		/* StoreOriData() overwrites *newByteData with a new allocation, so the
+		 * flattened buffer has to be released first.
+		 * NOTE(review): assumes convertTDPStoFlatBytes_int() malloc()s *newByteData - confirm. */
+		free(*newByteData);
+		SZ_compress_args_int32_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(tdps);
+}
+
+
+/**
+ * Quantize one int32_t value against its prediction (4D helper).
+ *
+ * If |value - pred|/realPrecision + 1 is below exe_params->intvCapacity the
+ * value is encoded as a quantization code in *typeSlot and the reconstructed
+ * value is returned. Otherwise *typeSlot is set to 0, the value is appended to
+ * exactDataByteArray via compressInt32Value(), and the value itself is returned.
+ *
+ * The residual is computed in 64-bit: with 32-bit arithmetic it overflows when
+ * value and prediction are far apart, which could make an unpredictable value
+ * look predictable.
+ */
+static int32_t SZ_quantizeInt32Point_4D(int32_t value, int32_t pred, double realPrecision, int64_t minValue,
+		int byteSize, unsigned char* bytes, DynamicByteArray* exactDataByteArray, int* typeSlot)
+{
+	int64_t diff = (int64_t)value - pred;
+	double itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		*typeSlot = (int) (itvNum/2) + exe_params->intvRadius;
+		return pred + 2 * (*typeSlot - exe_params->intvRadius) * realPrecision;
+	}
+
+	*typeSlot = 0;
+	compressInt32Value(value, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	return value;
+}
+
+/**
+ * Quantize a 4D int32_t array (r1 blocks, each an r2 x r3 x r4 volume).
+ *
+ * Each of the r1 blocks is compressed independently with the 3D scheme: its
+ * first element is stored exactly, its first r3 x r4 plane is predicted
+ * in-plane using buffer P1, and every later plane is written to P0 and
+ * predicted from P0 (current plane) and P1 (previous plane). The buffers are
+ * swapped after each plane.
+ *
+ * Fixes relative to the previous revision:
+ *  - unpredictable values are stored as spaceFillingValue[index]; the old code
+ *    stored spaceFillingValue[0] for every one of them;
+ *  - "Row-0 data 1" is compared against its own value; the old code reused the
+ *    block's first value, so the residual was always 0;
+ *  - the residual is computed in 64-bit (see SZ_quantizeInt32Point_4D()).
+ *
+ * The predictor arithmetic itself is left in int32_t as in the original code.
+ * NOTE(review): the decompressor (not visible here) presumably has to repeat
+ * it exactly - confirm before widening it.
+ * NOTE(review): element 1 of each block is accessed unconditionally, so at
+ * least two elements per plane are assumed - confirm against callers.
+ *
+ * @param oriData         input array, indexed as l*r2*r3*r4 + k*r3*r4 + i*r4 + j
+ * @param r1              slowest dimension (independent blocks)
+ * @param r2              second dimension (planes per block)
+ * @param r3              third dimension (rows per plane)
+ * @param r4              fastest dimension
+ * @param realPrecision   absolute error bound
+ * @param valueRangeSize  value range, used to pick the per-value byte size
+ * @param minValue        minimum value, forwarded to compressInt32Value()
+ * @return storage object built by new_TightDataPointStorageI()
+ */
+TightDataPointStorageI* SZ_compress_int32_4D_MDQ(int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int32_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k;
+	int32_t pred1D, pred2D, pred3D;
+	int32_t *P0, *P1;
+
+	size_t dataLength = r1*r2*r3*r4;
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	P0 = (int32_t*)malloc(r34*sizeof(int32_t));
+	P1 = (int32_t*)malloc(r34*sizeof(int32_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int32_t* spaceFillingValue = oriData;
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: always stored exactly */
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		P1[index2D] = spaceFillingValue[index];
+		compressInt32Value(spaceFillingValue[index], minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Process Row-0 data 1: previous-value predictor */
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		P1[index2D] = SZ_quantizeInt32Point_4D(spaceFillingValue[index], pred1D, realPrecision, minValue,
+				byteSize, bytes, exactDataByteArray, &type[index]);
+
+		/* Process Row-0 data 2 --> data r4-1: linear extrapolation */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			P1[index2D] = SZ_quantizeInt32Point_4D(spaceFillingValue[index], pred1D, realPrecision, minValue,
+					byteSize, bytes, exactDataByteArray, &type[index]);
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0: predicted from the element above */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4];
+			P1[index2D] = SZ_quantizeInt32Point_4D(spaceFillingValue[index], pred1D, realPrecision, minValue,
+					byteSize, bytes, exactDataByteArray, &type[index]);
+
+			/* Process row-i data 1 --> data r4-1: left + upper - upper-left */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+				P1[index2D] = SZ_quantizeInt32Point_4D(spaceFillingValue[index], pred2D, realPrecision, minValue,
+						byteSize, bytes, exactDataByteArray, &type[index]);
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0: predicted from the same position one plane back */
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			P0[index2D] = SZ_quantizeInt32Point_4D(spaceFillingValue[index], pred1D, realPrecision, minValue,
+					byteSize, bytes, exactDataByteArray, &type[index]);
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				P0[index2D] = SZ_quantizeInt32Point_4D(spaceFillingValue[index], pred2D, realPrecision, minValue,
+						byteSize, bytes, exactDataByteArray, &type[index]);
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0 */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				P0[index2D] = SZ_quantizeInt32Point_4D(spaceFillingValue[index], pred2D, realPrecision, minValue,
+						byteSize, bytes, exactDataByteArray, &type[index]);
+
+				/* Process Row-i data 1 --> data r4-1: seven-neighbour predictor */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					P0[index2D] = SZ_quantizeInt32Point_4D(spaceFillingValue[index], pred3D, realPrecision, minValue,
+							byteSize, bytes, exactDataByteArray, &type[index]);
+				}
+			}
+
+			/* the plane just written becomes the "previous plane" of the next iteration */
+			int32_t *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+
+	/* NOTE(review): this is a byte count, whereas the 1D variant divides by
+	 * byteSize; kept as-is because the value is handed to
+	 * new_TightDataPointStorageI() - confirm which one is intended. */
+	size_t exactDataNum = exactDataByteArray->size;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT32);
+
+	//free memory
+	free(type);
+	/* Only the wrapper struct is freed here; per the original note,
+	 * exactDataByteArray->array is released together with tdps. */
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+/**
+ * Compress a 4D int32 array without range checking and without the
+ * lossless (gzip) post-pass.
+ *
+ * The data is quantized by SZ_compress_int32_4D_MDQ and flattened into
+ * *newByteData. If the flattened stream is larger than the raw array
+ * (r1*r2*r3*r4 int32 values), the original data is stored instead via
+ * SZ_compress_args_int32_StoreOriData.
+ *
+ * @param newByteData   receives the output buffer
+ * @param oriData       input values
+ * @param r1,r2,r3,r4   dimension sizes; their product is the element count
+ * @param realPrecision absolute error bound used for quantization
+ * @param outSize       receives the size in bytes of *newByteData
+ * @param valueRangeSize value range of the data, forwarded to the quantizer
+ * @param minValue      minimum value of the data, forwarded to the quantizer
+ */
+void SZ_compress_args_int32_NoCkRngeNoGzip_4D(unsigned char** newByteData, int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	const size_t elementCount = r1*r2*r3*r4;
+	const size_t rawByteCount = elementCount*sizeof(int32_t);
+
+	TightDataPointStorageI* storage = SZ_compress_int32_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+
+	/* Compression did not pay off: keep the original values instead. */
+	/* NOTE(review): the buffer produced just above is overwritten here; */
+	/* confirm whether it needs to be released first. */
+	if(rawByteCount < *outSize)
+	{
+		SZ_compress_args_int32_StoreOriData(oriData, elementCount, storage, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Encode a dataset that is treated as a single repeated value.
+ *
+ * Builds a minimal TightDataPointStorageI flagged as "all same data" whose
+ * only payload is oriData[0] serialized as 4 big-endian bytes, flattens it
+ * into *newByteData and reports the resulting size through *outSize.
+ *
+ * @param newByteData receives the output buffer
+ * @param oriData     input values; only element 0 is read
+ * @param dataLength  number of elements recorded in the stream
+ * @param outSize     receives the size in bytes of *newByteData
+ */
+void SZ_compress_args_int32_withinRange(unsigned char** newByteData, int32_t *oriData, size_t dataLength, size_t *outSize)
+{
+	const size_t exactByteCount = 4;
+	size_t flatSize;
+	TightDataPointStorageI* storage = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+
+	/* Header-like fields: no quantization codes, one shared value. */
+	storage->typeArray = NULL;
+	storage->allSameData = 1;
+	storage->isLossless = 0;
+	storage->dataSeriesLength = dataLength;
+	storage->dataTypeSize = convertDataTypeSize(sizeof(int32_t));
+
+	/* Payload: the first element, stored exactly. */
+	storage->exactDataNum = 1;
+	storage->exactDataBytes_size = exactByteCount;
+	storage->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char)*exactByteCount);
+	int32ToBytes_bigEndian(storage->exactDataBytes, oriData[0]);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, &flatSize);
+	*outSize = flatSize;
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Compress an int32 array with value-range checking but without the
+ * lossless (gzip) post-pass.
+ *
+ * The value range and minimum are computed first. If the whole range fits
+ * inside the error bound, the data is encoded as a single repeated value;
+ * otherwise the call is dispatched on the number of non-zero dimensions.
+ *
+ * Note: 4D input (r5 == 0, r4 != 0) is compressed with the 3D routine using
+ * r4*r3 as the highest dimension.
+ *
+ * @param newByteData   receives the output buffer
+ * @param oriData       input values
+ * @param r5..r1        dimension sizes, r1 being the lowest; unused high
+ *                      dimensions are 0
+ * @param outSize       receives the size in bytes of *newByteData
+ * @param errBoundMode  error-bound mode forwarded to getRealPrecision_int
+ * @param absErr_Bound  absolute error bound
+ * @param relBoundRatio relative error-bound ratio
+ * @return status code; SZ_DERR when 5 dimensions are requested (nothing is
+ *         written to the outputs in that case), otherwise the status left
+ *         by getRealPrecision_int (SZ_SCES by default)
+ */
+int SZ_compress_args_int32_wRngeNoGzip(unsigned char** newByteData, int32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+	
+	int32_t minValue = computeRangeSize_int(oriData, SZ_INT32, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		/* Every value is within the error bound of every other one. */
+		SZ_compress_args_int32_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		if(r5==0&&r4==0&&r3==0&&r2==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0&&r3==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			/* Previously this case fell through silently and returned success */
+			/* without producing any output. */
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			status = SZ_DERR; //dimension error
+		}
+	}
+	return status;
+}
+
+/**
+ * Top-level int32 compression entry point.
+ *
+ * Determines the effective absolute error bound (directly, or derived from
+ * the configured PSNR), runs the dimension-specific quantizer and then,
+ * depending on confparams_cpr->szMode, either returns that stream as is
+ * (SZ_BEST_SPEED) or passes it through the lossless compressor.
+ *
+ * @param newByteData   receives the output buffer
+ * @param oriData       input values
+ * @param r5..r1        dimension sizes, r1 being the lowest; unused high
+ *                      dimensions are 0
+ * @param outSize       receives the size in bytes of *newByteData
+ * @param errBoundMode  error-bound mode; point-wise relative modes
+ *                      (>= PW_REL) are rejected and terminate the process
+ * @param absErr_Bound  absolute error bound
+ * @param relBoundRatio relative error-bound ratio
+ * @return SZ_SCES on success, SZ_DERR for 5-dimensional input, SZ_MERR for
+ *         an unknown szMode. On SZ_DERR and SZ_MERR the outputs are left
+ *         untouched.
+ */
+int SZ_compress_args_int32(unsigned char** newByteData, int32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		/* exit() terminates the process; the return below is never reached. */
+		exit(0);
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	int32_t minValue = (int32_t)computeRangeSize_int(oriData, SZ_INT32, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		/* PSNR mode is converted into an equivalent absolute bound. */
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int32_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL;
+		if (r2==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r3==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r4==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r5==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			/* Nothing was compressed, so there is no buffer to post-process: */
+			/* stop here instead of handing an unset pointer to the next stage. */
+			return SZ_DERR; //dimension error
+		}
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			/* Ownership of the intermediate buffer passes to the caller. */
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the int32_t compression.\n");
+			/* The intermediate buffer is not returned to anyone: release it. */
+			free(tmpByteData);
+			status = SZ_MERR; //mode error			
+		}
+	}
+	
+	return status;
+}
diff --git a/deps/SZ/sz/src/sz_int64.c b/deps/SZ/sz/src/sz_int64.c
new file mode 100644
index 0000000000000000000000000000000000000000..065fb16e49dcd2e68a546610a85d9b3f17c44154
--- /dev/null
+++ b/deps/SZ/sz/src/sz_int64.c
@@ -0,0 +1,1269 @@
+/**
+ *  @file sz_int64.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_int64, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_int64.h"
+#include "utility.h"
+
+/**
+ * Estimate the number of quantization intervals for a 1D int64 series.
+ *
+ * Every element whose index is a multiple of confparams_cpr->sampleDistance
+ * (starting at index 2) is predicted from its predecessor; the prediction
+ * error is mapped to a quantization radius and counted in a histogram with
+ * confparams_cpr->maxRangeRadius buckets (larger radii go to the last one).
+ * The smallest radius whose cumulative count exceeds
+ * (dataLength/sampleDistance)*predThreshold is then selected.
+ *
+ * @param oriData       input values
+ * @param dataLength    number of elements in oriData
+ * @param realPrecision absolute error bound
+ * @return roundUpToPowerOf2(2*(radius+1)), raised to 32 if smaller
+ */
+unsigned int optimize_intervals_int64_1D(int64_t *oriData, size_t dataLength, double realPrecision)
+{
+	size_t pos, bucket;
+	int64_t predicted, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = dataLength/confparams_cpr->sampleDistance;
+
+	/* Pass 1: histogram of quantization radii over the sampled points. */
+	for(pos=2;pos<dataLength;pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+
+		predicted = oriData[pos-1];
+		absErr = llabs(predicted - oriData[pos]);
+		bucket = (uint64_t)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	/* Pass 2: first radius whose cumulative count exceeds the target. */
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	pos = 0;
+	while(pos<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[pos];
+		if(covered>threshold)
+			break;
+		pos++;
+	}
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(pos+1);
+	unsigned int result = roundUpToPowerOf2(accIntervals);
+
+	free(histogram);
+	return result<32 ? 32 : result;
+}
+
+/**
+ * Estimate the number of quantization intervals for a 2D int64 array.
+ *
+ * Interior points (row >= 1, column >= 1) with (row+column) a multiple of
+ * confparams_cpr->sampleDistance are predicted from their left, upper and
+ * upper-left neighbours; the prediction error is mapped to a quantization
+ * radius and counted in a histogram with confparams_cpr->maxRangeRadius
+ * buckets. The smallest radius whose cumulative count exceeds
+ * ((r1-1)*(r2-1)/sampleDistance)*predThreshold is then selected.
+ *
+ * @param oriData       input values, r1 rows of r2 elements
+ * @param r1            number of rows (high dimension)
+ * @param r2            number of columns (low dimension)
+ * @param realPrecision absolute error bound
+ * @return roundUpToPowerOf2(2*(radius+1)), raised to 32 if smaller
+ */
+unsigned int optimize_intervals_int64_2D(int64_t *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	size_t row, col, pos, level;
+	size_t bucket;
+	int64_t predicted, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+
+	/* Pass 1: histogram of quantization radii over the sampled points. */
+	for(row=1;row<r1;row++)
+	{
+		for(col=1;col<r2;col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue;
+
+			pos = row*r2+col;
+			predicted = oriData[pos-1] + oriData[pos-r2] - oriData[pos-r2-1];
+			absErr = llabs(predicted - oriData[pos]);
+			bucket = (uint64_t)((absErr/realPrecision+1)/2);
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1;
+			histogram[bucket]++;
+		}
+	}
+
+	/* Pass 2: first radius whose cumulative count exceeds the target. */
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	level = 0;
+	while(level<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[level];
+		if(covered>threshold)
+			break;
+		level++;
+	}
+	if(level>=confparams_cpr->maxRangeRadius)
+		level = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(level+1);
+	unsigned int result = roundUpToPowerOf2(accIntervals);
+
+	free(histogram);
+	return result<32 ? 32 : result;
+}
+
+/**
+ * Estimate the number of quantization intervals for a 3D int64 array.
+ *
+ * Interior points (all three coordinates >= 1) whose coordinate sum is a
+ * multiple of confparams_cpr->sampleDistance are predicted from their seven
+ * already-visited neighbours; the prediction error is mapped to a
+ * quantization radius and counted in a histogram with
+ * confparams_cpr->maxRangeRadius buckets. The smallest radius whose
+ * cumulative count exceeds
+ * ((r1-1)*(r2-1)*(r3-1)/sampleDistance)*predThreshold is then selected.
+ *
+ * @param oriData       input values laid out with strides r2*r3, r3, 1
+ * @param r1,r2,r3      dimension sizes, r3 being the lowest
+ * @param realPrecision absolute error bound
+ * @return roundUpToPowerOf2(2*(radius+1)), raised to 32 if smaller
+ */
+unsigned int optimize_intervals_int64_3D(int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	size_t x, y, z, pos, level;
+	size_t bucket;
+	size_t planeSize=r2*r3;
+	int64_t predicted, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	/* Pass 1: histogram of quantization radii over the sampled points. */
+	for(x=1;x<r1;x++)
+	{
+		for(y=1;y<r2;y++)
+		{
+			for(z=1;z<r3;z++)
+			{
+				if((x+y+z)%confparams_cpr->sampleDistance!=0)
+					continue;
+
+				pos = x*planeSize+y*r3+z;
+				predicted = oriData[pos-1] + oriData[pos-r3] + oriData[pos-planeSize] 
+				- oriData[pos-1-planeSize] - oriData[pos-r3-1] - oriData[pos-r3-planeSize] + oriData[pos-r3-planeSize-1];
+				absErr = llabs(predicted - oriData[pos]);
+				bucket = (absErr/realPrecision+1)/2;
+				if(bucket>=confparams_cpr->maxRangeRadius)
+					bucket = confparams_cpr->maxRangeRadius - 1;
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	/* Pass 2: first radius whose cumulative count exceeds the target. */
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	level = 0;
+	while(level<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[level];
+		if(covered>threshold)
+			break;
+		level++;
+	}
+	if(level>=confparams_cpr->maxRangeRadius)
+		level = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(level+1);
+	unsigned int result = roundUpToPowerOf2(accIntervals);
+
+	free(histogram);
+	return result<32 ? 32 : result;
+}
+
+
+/**
+ * Estimate the number of quantization intervals for a 4D int64 array.
+ *
+ * Interior points (all four coordinates >= 1) whose coordinate sum is a
+ * multiple of confparams_cpr->sampleDistance are predicted from their seven
+ * neighbours along the three lowest dimensions (strides r3*r4, r4 and 1);
+ * the prediction error is mapped to a quantization radius and counted in a
+ * histogram with confparams_cpr->maxRangeRadius buckets. The smallest
+ * radius whose cumulative count exceeds
+ * ((r1-1)*(r2-1)*(r3-1)*(r4-1)/sampleDistance)*predThreshold is selected.
+ *
+ * @param oriData       input values laid out with strides r2*r3*r4, r3*r4, r4, 1
+ * @param r1,r2,r3,r4   dimension sizes, r4 being the lowest
+ * @param realPrecision absolute error bound
+ * @return roundUpToPowerOf2(2*(radius+1)), raised to 32 if smaller
+ */
+unsigned int optimize_intervals_int64_4D(int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						/* Neighbours along l (stride 1), k (stride r4) and j (stride r34). */
+						/* The k-neighbour is index-r4: r3 is a dimension size, not a stride. */
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						/* Errors beyond the histogram range are counted in the last bucket. */
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	/* The target was never exceeded: fall back to the largest radius. */
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Quantize a 1D int64 series into a TightDataPointStorageI.
+ *
+ * The first two elements are always stored exactly. Every later element is
+ * predicted from last3CmprsData[0]; when the absolute prediction error is
+ * below (intvCapacity-1)*realPrecision the element is replaced by a
+ * quantization code (type[i] != 0), otherwise it is stored exactly
+ * (type[i] == 0) using byteSize bytes.
+ *
+ * @param oriData        input values; at least two elements are read
+ * @param dataLength     number of elements in oriData
+ * @param realPrecision  absolute error bound
+ * @param valueRangeSize value range of the data, used to size exact values
+ * @param minValue       minimum value of the data, passed to the exact encoder
+ * @return newly created storage object describing the quantized data
+ */
+TightDataPointStorageI* SZ_compress_int64_1D_MDQ(int64_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	/* Number of quantization intervals: estimated from the data or configured. */
+	unsigned int quantization_intervals = (exe_params->optQuantMode==1)
+		? optimize_intervals_int64_1D(oriData, dataLength, realPrecision)
+		: exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);
+
+	size_t idx;
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	int64_t last3CmprsData[3] = {0,0,0};
+
+	/* Seed: the first two values are kept exactly. */
+	for(idx=0;idx<2;idx++)
+	{
+		type[idx] = 0;
+		compressInt64Value(oriData[idx], minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, oriData[idx]);
+	}
+
+	int steps;
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double interval = 2*realPrecision;
+	int64_t current;
+	int64_t predicted;
+	int64_t absErr;
+
+	for(idx=2;idx<dataLength;idx++)
+	{
+		current = oriData[idx];
+		predicted = last3CmprsData[0];
+		absErr = llabs(current - predicted);
+
+		if(absErr<checkRadius)
+		{
+			/* Predictable: record how many intervals away the value is and */
+			/* remember the value the decompressor will reconstruct. */
+			steps = (absErr/realPrecision+1)/2;
+			if(current<predicted)
+			{
+				type[idx] = exe_params->intvRadius-steps;
+				predicted = predicted - steps*interval;
+			}
+			else
+			{
+				type[idx] = exe_params->intvRadius+steps;
+				predicted = predicted + steps*interval;
+			}
+			listAdd_int(last3CmprsData, predicted);
+		}
+		else
+		{
+			/* Unpredictable: keep the value exactly. */
+			type[idx] = 0;
+			compressInt64Value(current, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			listAdd_int(last3CmprsData, current);
+		}
+	}
+
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT64);
+
+	/* Only the wrapper is freed here; exactDataByteArray->array was handed to tdps. */
+	free(type);
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+/**
+ * Emit the original int64 data uncompressed ("lossless" fallback stream).
+ *
+ * Layout written to *newByteData:
+ *   3 bytes  versionNumber
+ *   1 byte   flags (16 when SZ_SIZE_TYPE is 4, otherwise 80)
+ *   MetaDataByteLength bytes produced by convertSZParamsToBytes
+ *   SZ_SIZE_TYPE bytes holding dataLength
+ *   dataLength values, 8 bytes each, in big-endian order
+ *
+ * @param oriData     input values
+ * @param dataLength  number of values to store
+ * @param tdps        storage object; its isLossless flag is set to 1
+ * @param newByteData receives a newly malloc'ed buffer
+ * @param outSize     receives the number of bytes allocated for the buffer
+ */
+void SZ_compress_args_int64_StoreOriData(int64_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	int intSize=sizeof(int64_t);
+	size_t n;
+	size_t totalByteLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + intSize*dataLength;
+	unsigned char dsLengthBytes[8];
+	unsigned char* cursor;
+
+	tdps->isLossless = 1;
+	*newByteData = (unsigned char*)malloc(totalByteLength);
+	cursor = *newByteData;
+
+	/* Version, 3 bytes. */
+	for (n = 0; n < 3; n++)
+		*cursor++ = versionNumber[n];
+
+	/* Flag byte: 00010000, plus 01000000 when SZ_SIZE_TYPE is 8. */
+	*cursor++ = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	/* Compression parameters. */
+	convertSZParamsToBytes(confparams_cpr, cursor);
+	cursor += MetaDataByteLength;
+
+	/* Element count, SZ_SIZE_TYPE (4 or 8) bytes. */
+	sizeToBytes(dsLengthBytes,dataLength);
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		*cursor++ = dsLengthBytes[n];
+
+	/* Payload; cursor is now 4+MetaDataByteLength+SZ_SIZE_TYPE bytes in. */
+	if(sysEndianType==BIG_ENDIAN_SYSTEM)
+	{
+		memcpy(cursor, oriData, dataLength*intSize);
+	}
+	else
+	{
+		for(n=0;n<dataLength;n++,cursor+=intSize)
+			int64ToBytes_bigEndian(cursor, oriData[n]);
+	}
+
+	*outSize = totalByteLength;
+}
+
+/**
+ * Compress a 1D int64 series without range checking and without the
+ * lossless (gzip) post-pass.
+ *
+ * The data is quantized by SZ_compress_int64_1D_MDQ and flattened into
+ * *newByteData. If the flattened stream is larger than the raw array, the
+ * original data is stored instead via SZ_compress_args_int64_StoreOriData.
+ *
+ * @param newByteData    receives the output buffer
+ * @param oriData        input values
+ * @param dataLength     number of elements in oriData
+ * @param realPrecision  absolute error bound used for quantization
+ * @param outSize        receives the size in bytes of *newByteData
+ * @param valueRangeSize value range of the data, forwarded to the quantizer
+ * @param minValue       minimum value of the data, forwarded to the quantizer
+ */
+void SZ_compress_args_int64_NoCkRngeNoGzip_1D(unsigned char** newByteData, int64_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_int64_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	/* Compression did not pay off: keep the original values instead. */
+	/* Exactly dataLength elements are stored; asking for more would read */
+	/* past the end of oriData and record a wrong element count. */
+	/* NOTE(review): the buffer produced just above is overwritten here; */
+	/* confirm whether it needs to be released first. */
+	if(*outSize > dataLength*sizeof(int64_t))
+		SZ_compress_args_int64_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Quantize a 2D int64 array (r1 rows of r2 elements) into a
+ * TightDataPointStorageI.
+ *
+ * Two row buffers hold reconstructed values: P1 is the previous row and P0
+ * the row being processed; they are swapped after every row. Each element
+ * is predicted from already-reconstructed neighbours. If the prediction
+ * error fits into the quantization range the element is replaced by a code
+ * (type[] != 0); otherwise it is stored exactly (type[] == 0) using
+ * byteSize bytes.
+ *
+ * @param oriData        input values
+ * @param r1             number of rows (high dimension)
+ * @param r2             number of elements per row (low dimension)
+ * @param realPrecision  absolute error bound
+ * @param valueRangeSize value range of the data, used to size exact values
+ * @param minValue       minimum value of the data, passed to the exact encoder
+ * @return newly created storage object describing the quantized data
+ *
+ * NOTE(review): elements 0 and 1 of the first row are accessed
+ * unconditionally, so this appears to assume r2 >= 2 -- confirm with callers.
+ */
+TightDataPointStorageI* SZ_compress_int64_2D_MDQ(int64_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	// Number of quantization intervals: estimated from the data or configured.
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int64_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j; 
+	int64_t pred1D, pred2D, curValue;
+	int64_t diff = 0.0;
+	double itvNum = 0;
+	int64_t *P0, *P1;
+		
+	size_t dataLength = r1*r2;	
+	
+	// Row buffers of reconstructed values, zero-initialised.
+	P0 = (int64_t*)malloc(r2*sizeof(int64_t));
+	memset(P0, 0, r2*sizeof(int64_t));
+	P1 = (int64_t*)malloc(r2*sizeof(int64_t));
+	memset(P1, 0, r2*sizeof(int64_t));
+		
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;
+		
+	int64_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	/* Row-0 data 0: always stored exactly */
+	type[0] = 0;
+	curValue = P1[0] = spaceFillingValue[0];
+	compressInt64Value(curValue, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	// itvNum measures the error in units of realPrecision (plus one).
+	itvNum =  llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		// Predictable: store a signed interval offset around intvRadius and
+		// keep the value that this code reconstructs to.
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// Unpredictable: store the value exactly.
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressInt64Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		// Linear extrapolation from the two previous reconstructed values.
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		// Predicted from the element directly above (previous row, column 0).
+		index = i*r2;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			// left + above - above-left
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		// The row just finished becomes the "previous row" for the next one.
+		int64_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	
+	// NOTE(review): P0 is not released when r2 == 1; the reason is not
+	// visible in this function -- confirm this is intended.
+	if(r2!=1)
+		free(P0);
+	free(P1);			
+	
+	// NOTE(review): this is the byte size of the exact data, whereas the 1D
+	// variant divides by byteSize -- confirm which one the consumer expects.
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT64);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //exactDataByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;	
+}
+
+/**
+ * Compress a 2D int64 array without range checking and without the
+ * lossless (gzip) post-pass.
+ *
+ * The data is quantized by SZ_compress_int64_2D_MDQ and flattened into
+ * *newByteData. If the flattened stream is larger than the raw array
+ * (r1*r2 int64 values), the original data is stored instead via
+ * SZ_compress_args_int64_StoreOriData.
+ *
+ * Note: @r1 is high dimension
+ * 		 @r2 is low dimension
+ *
+ * @param newByteData    receives the output buffer
+ * @param oriData        input values
+ * @param realPrecision  absolute error bound used for quantization
+ * @param outSize        receives the size in bytes of *newByteData
+ * @param valueRangeSize value range of the data, forwarded to the quantizer
+ * @param minValue       minimum value of the data, forwarded to the quantizer
+ * */
+void SZ_compress_args_int64_NoCkRngeNoGzip_2D(unsigned char** newByteData, int64_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{
+	const size_t elementCount = r1*r2;
+	const size_t rawByteCount = elementCount*sizeof(int64_t);
+
+	TightDataPointStorageI* storage = SZ_compress_int64_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+
+	/* Compression did not pay off: keep the original values instead. */
+	if(rawByteCount < *outSize)
+	{
+		SZ_compress_args_int64_StoreOriData(oriData, elementCount, storage, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Quantize a 3D int64 array (r1 layers of r2 rows of r3 elements) into a
+ * TightDataPointStorageI.
+ *
+ * Two layer buffers of r2*r3 reconstructed values are used: P1 is the
+ * previous layer and P0 the layer being processed (layer 0 is written
+ * straight into P1); they are swapped after every layer. Each element is
+ * predicted from already-reconstructed neighbours. If the prediction error
+ * fits into the quantization range the element is replaced by a code
+ * (type[] != 0); otherwise it is stored exactly (type[] == 0) using
+ * byteSize bytes.
+ *
+ * @param oriData        input values
+ * @param r1,r2,r3       dimension sizes, r1 being the highest and r3 the lowest
+ * @param realPrecision  absolute error bound
+ * @param valueRangeSize value range of the data, used to size exact values
+ * @param minValue       minimum value of the data, passed to the exact encoder
+ * @return newly created storage object describing the quantized data
+ *
+ * NOTE(review): elements 0 and 1 of the first row are accessed
+ * unconditionally, so this appears to assume r3 >= 2 -- confirm with callers.
+ */
+TightDataPointStorageI* SZ_compress_int64_3D_MDQ(int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	// Number of quantization intervals: estimated from the data or configured.
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int64_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue;
+	int64_t diff = 0.0;
+	double itvNum = 0;
+	int64_t *P0, *P1;
+		
+	size_t dataLength = r1*r2*r3;		
+
+	// Layer buffers of reconstructed values (not zero-initialised; every
+	// entry is written before it is read).
+	size_t r23 = r2*r3;
+	P0 = (int64_t*)malloc(r23*sizeof(int64_t));
+	P1 = (int64_t*)malloc(r23*sizeof(int64_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int64_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Row-0 data 0: always stored exactly */
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressInt64Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	// itvNum measures the error in units of realPrecision (plus one).
+	itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		// Predictable: store a signed interval offset around intvRadius and
+		// keep the value that this code reconstructs to.
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// Unpredictable: store the value exactly.
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressInt64Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		// Linear extrapolation from the two previous reconstructed values.
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		// Predicted from the element directly above (same layer, previous row).
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P1[index] = spaceFillingValue[index];
+			compressInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			// left + above - above-left, all within layer 0
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P1[index] = spaceFillingValue[index];
+				compressInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		// Predicted from the same position in the previous layer.
+		index = k*r23;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			// left (current layer) + same position (previous layer)
+			// - left (previous layer)
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+/*				if(type[index]==0)
+					printf("err:type[%d]=0, index4\n", index);					*/
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			// index addresses the full array, index2D the position inside a layer.
+			index = k*r23 + i*r3;
+			index2D = i*r3;		
+			// above (current layer) + same position (previous layer)
+			// - above (previous layer)
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+//				if(k==63&&i==43&&j==27)
+//					printf("i=%d\n", i);
+				//index = k*r2*r3 + i*r3 + j;			
+				index ++;
+				index2D = i*r3 + j;
+				// Seven-neighbour predictor over the current (P0) and previous (P1) layer.
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressInt64Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+		// The layer just finished becomes the "previous layer" for the next one.
+		int64_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	// NOTE(review): P0 is not released when r2*r3 == 1; the reason is not
+	// visible in this function -- confirm this is intended.
+	if(r23!=1)
+		free(P0);
+	free(P1);
+
+	// NOTE(review): this is the byte size of the exact data, whereas the 1D
+	// variant divides by byteSize -- confirm which one the consumer expects.
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT64);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //exactDataByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;	
+}
+
+
+/**
+ * Compress a 3D int64 array without range checking and without the
+ * lossless (gzip) post-pass.
+ *
+ * The data is quantized by SZ_compress_int64_3D_MDQ and flattened into
+ * *newByteData. If the flattened stream is larger than the raw array
+ * (r1*r2*r3 int64 values), the original data is stored instead via
+ * SZ_compress_args_int64_StoreOriData.
+ *
+ * @param newByteData    receives the output buffer
+ * @param oriData        input values
+ * @param r1,r2,r3       dimension sizes; their product is the element count
+ * @param realPrecision  absolute error bound used for quantization
+ * @param outSize        receives the size in bytes of *newByteData
+ * @param valueRangeSize value range of the data, forwarded to the quantizer
+ * @param minValue       minimum value of the data, forwarded to the quantizer
+ */
+void SZ_compress_args_int64_NoCkRngeNoGzip_3D(unsigned char** newByteData, int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{
+	const size_t elementCount = r1*r2*r3;
+	const size_t rawByteCount = elementCount*sizeof(int64_t);
+
+	TightDataPointStorageI* storage = SZ_compress_int64_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+
+	/* Compression did not pay off: keep the original values instead. */
+	if(rawByteCount < *outSize)
+	{
+		SZ_compress_args_int64_StoreOriData(oriData, elementCount, storage, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(storage);
+}
+/* Predictive quantization of a 4D int64 array (r1 slowest-varying, r4 fastest). Each of the r1 slabs of r2*r3*r4 values is coded
+ * on its own: a value is predicted from already reconstructed neighbours and either quantized (type != 0) or stored exactly (type == 0). */
+TightDataPointStorageI* SZ_compress_int64_4D_MDQ(int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int64_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue;
+	int64_t diff = 0;
+	double itvNum = 0;
+	int64_t *P0, *P1;
+
+	size_t dataLength = r1*r2*r3*r4;		
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+	/* P1 holds the reconstructed previous r3*r4 layer, P0 the layer being built; the two are swapped after every layer. */
+	P0 = (int64_t*)malloc(r34*sizeof(int64_t));
+	P1 = (int64_t*)malloc(r34*sizeof(int64_t));
+	
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int64_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		size_t index = l*r234;
+		size_t index2D = 0;
+		/* the first value of every slab is always stored exactly */
+		type[index] = 0;
+		curValue = P1[index2D] = spaceFillingValue[index];
+		compressInt64Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		/* NOTE(review): the next step touches element 1 of the row unconditionally, so r4 >= 2 appears to be assumed -- confirm. */
+		/* Process Row-0 data 1*/
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[index] - pred1D; /* compare the actual sample, not the previous one, with the prediction */
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			/* unpredictable: keep the sample at this position as the exact value */
+			curValue = P1[index2D] = spaceFillingValue[index];
+			compressInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process Row-0 data 2 --> data r4-1 */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+			/* linear extrapolation from the two previous reconstructed values */
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0 */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+			/* predicted from the value directly above (previous row, same column) */
+			pred1D = P1[index2D-r4];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process row-i data 1 --> data r4-1*/
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+				/* left + above - above-left, all within the current layer */
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P1[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P1[index2D] = spaceFillingValue[index];
+					compressInt64Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0*/
+			index = l*r234+k*r34;
+			index2D = 0;
+			/* predicted from the same position in the previous layer */
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressInt64Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0 */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressInt64Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+					/* seven-neighbour predictor over the current (P0) and previous (P1) layers */
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					diff = spaceFillingValue[index] - pred3D;
+
+
+					itvNum = llabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+						P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						type[index] = 0;
+
+						curValue = P0[index2D] = spaceFillingValue[index];
+						compressInt64Value(curValue, minValue, byteSize, bytes);
+						memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+					}
+				}
+			}
+			/* the layer just finished becomes the reference layer for the next one */
+			int64_t *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+	/* NOTE(review): this is a byte count, not a value count (size / byteSize) -- confirm what the storage format expects. */
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT64);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper is freed; ->array was handed to new_TightDataPointStorageI above (presumably owned by tdps now -- confirm)
+	
+	return tdps;	
+}
+/* 4D int64 entry point without range check or gzip: quantize with SZ_compress_int64_4D_MDQ, flatten, and fall back to StoreOriData when the stream is larger than the raw array. */
+void SZ_compress_args_int64_NoCkRngeNoGzip_4D(unsigned char** newByteData, int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	const size_t elemCount = r1*r2*r3*r4;
+	TightDataPointStorageI* storage = SZ_compress_int64_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+
+	/* The flattened size is read back from *outSize immediately below. */
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+	/* Keep the quantized stream only if it does not exceed the raw int64 payload. */
+	if(elemCount*sizeof(int64_t) < *outSize)
+		SZ_compress_args_int64_StoreOriData(oriData, elemCount, storage, newByteData, outSize);
+	free_TightDataPointStorageI(storage);
+}
+/* Degenerate case used when the value range does not exceed the error bound: only oriData[0] is stored and the series is flagged allSameData. */
+void SZ_compress_args_int64_withinRange(unsigned char** newByteData, int64_t *oriData, size_t dataLength, size_t *outSize)
+{
+	const size_t payloadBytes = 8; /* one int64 written by int64ToBytes_bigEndian */
+	size_t flatSize;
+	TightDataPointStorageI* single = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+
+	/* Payload: the first sample is the representative of the whole series. */
+	single->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char)*payloadBytes);
+	int64ToBytes_bigEndian(single->exactDataBytes, oriData[0]);
+	single->exactDataBytes_size = payloadBytes;
+	single->exactDataNum = 1;
+
+	/* Descriptor fields; no per-point type codes are attached. */
+	single->typeArray = NULL;
+	single->allSameData = 1;
+	single->isLossless = 0;
+	single->dataSeriesLength = dataLength;
+
+	convertTDPStoFlatBytes_int(single, newByteData, &flatSize);
+	*outSize = flatSize;
+	free_TightDataPointStorageI(single);
+}
+/* Compress int64 data of up to four dimensions (unused high dimensions passed as 0) with a range check and no lossless pass; returns an SZ status code. */
+int SZ_compress_args_int64_wRngeNoGzip(unsigned char** newByteData, int64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+	/* The range size comes back through the out-parameter; the return value is used as the minimum. */
+	int64_t minValue = computeRangeSize_int(oriData, SZ_INT64, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int64_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		/* Dispatch on the number of non-zero dimensions. */
+		if(r5==0&&r4==0&&r3==0&&r2==0)
+			SZ_compress_args_int64_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+		else if(r5==0&&r4==0&&r3==0)
+			SZ_compress_args_int64_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		else if(r5==0&&r4==0)
+			SZ_compress_args_int64_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		else if(r5==0)
+		{
+			/* NOTE(review): 4D input is flattened to 3D (r4*r3) here, whereas SZ_compress_args_int64 calls the 4D coder -- confirm the decompressor expects this. */
+			SZ_compress_args_int64_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			status = SZ_DERR; //dimension error: nothing was written to newByteData/outSize
+		}
+	}
+	return status;
+}
+/* Top-level int64 compressor: picks the 1D..4D coder from the non-zero dimensions (unused high dimensions are 0), then applies the stage selected by confparams_cpr->szMode. Returns an SZ status code. */
+int SZ_compress_args_int64(unsigned char** newByteData, int64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		exit(0); /* terminates the process here; the return below is never reached */
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	int64_t minValue = (int64_t)computeRangeSize_int(oriData, SZ_INT64, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		/* A PSNR target is converted once into an absolute bound and treated as ABS from here on. */
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int64_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL; /* stays NULL when no coder ran (dimension error) */
+		if (r2==0)
+			SZ_compress_args_int64_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		else if (r3==0)
+			SZ_compress_args_int64_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		else if (r4==0)
+			SZ_compress_args_int64_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		else if (r5==0)
+			SZ_compress_args_int64_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			status = SZ_DERR; //dimension error
+		}
+
+		if(tmpByteData==NULL)
+		{
+			/* Nothing was encoded: publish an empty result instead of reading an unset buffer. */
+			*outSize = 0;
+			*newByteData = NULL;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			//Call Gzip to do the further compression.
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the int64_t compression.\n");
+			status = SZ_MERR; //mode error			
+			/* The intermediate buffer is not handed to the caller on this path, so release it here. */
+			free(tmpByteData);
+			*outSize = 0;
+			*newByteData = NULL;
+		}
+	}
+
+	return status;
+}
diff --git a/deps/SZ/sz/src/sz_int8.c b/deps/SZ/sz/src/sz_int8.c
new file mode 100644
index 0000000000000000000000000000000000000000..83febd0de64b14be3915a9fa81e5fdb907345fe6
--- /dev/null
+++ b/deps/SZ/sz/src/sz_int8.c
@@ -0,0 +1,1385 @@
+/**
+ *  @file sz_int8.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_int8, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_int8.h"
+#include "utility.h"
+/*
+ * Estimate the number of quantization intervals for a 1D int8 series. Every sampleDistance-th point is
+ * predicted from its left neighbour and the error radii are histogrammed; the result is a power of two, at least 32.
+ */
+unsigned int optimize_intervals_int8_1D(int8_t *oriData, size_t dataLength, double realPrecision)
+{	
+	size_t pos, bucket, covered = 0;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = dataLength/confparams_cpr->sampleDistance;
+
+	/* Build the histogram of quantization radii over the sampled points. */
+	for(pos=2;pos<dataLength;pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+		predicted = oriData[pos-1];
+		absErr = llabs(predicted - oriData[pos]);
+		bucket = (uint64_t)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	/* Find the smallest radius whose cumulative count exceeds the targeted share of samples. */
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	for(pos=0;pos<confparams_cpr->maxRangeRadius;pos++)
+	{
+		covered += histogram[pos];
+		if(covered>threshold)
+			break;
+	}
+	/* The target was never exceeded: settle for the last bucket. */
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+	free(histogram);
+
+	/* Two intervals per radius step, rounded up to a power of two, never below 32. */
+	unsigned int wanted = 2*(pos+1);
+	unsigned int rounded = roundUpToPowerOf2(wanted);
+	return rounded<32 ? 32 : rounded;
+}
+/*
+ * Estimate the number of quantization intervals for a 2D int8 array (r1 rows of r2 values). Interior points with
+ * (row+col) divisible by sampleDistance are predicted as left + above - above-left; returns a power of two, at least 32.
+ */
+unsigned int optimize_intervals_int8_2D(int8_t *oriData, size_t r1, size_t r2, double realPrecision)
+{	
+	size_t row, col, at, bucket, covered = 0;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+
+	/* Build the histogram of quantization radii over the sampled interior points. */
+	for(row=1;row<r1;row++)
+	{
+		for(col=1;col<r2;col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue;
+			at = row*r2+col;
+			predicted = oriData[at-1] + oriData[at-r2] - oriData[at-r2-1];
+			absErr = llabs(predicted - oriData[at]);
+			bucket = (uint64_t)((absErr/realPrecision+1)/2);
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1;
+			histogram[bucket]++;
+		}
+	}
+
+	/* Find the smallest radius whose cumulative count exceeds the targeted share of samples. */
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	for(row=0;row<confparams_cpr->maxRangeRadius;row++)
+	{
+		covered += histogram[row];
+		if(covered>threshold)
+			break;
+	}
+	/* The target was never exceeded: settle for the last bucket. */
+	if(row>=confparams_cpr->maxRangeRadius)
+		row = confparams_cpr->maxRangeRadius-1;
+	free(histogram);
+	/* Two intervals per radius step, rounded up to a power of two, never below 32. */
+	unsigned int wanted = 2*(row+1);
+	unsigned int rounded = roundUpToPowerOf2(wanted);
+	return rounded<32 ? 32 : rounded;
+}
+/*
+ * Estimate the number of quantization intervals for a 3D int8 array (r1 x r2 x r3, r3 fastest-varying).
+ * Interior points whose index sum is divisible by sampleDistance are predicted from their seven
+ * preceding neighbours; the error radii are histogrammed and a power of two (at least 32) is returned.
+ */
+unsigned int optimize_intervals_int8_3D(int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{	
+	size_t a, b, c, at, bucket, covered = 0;
+	const size_t plane = r2*r3;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	/* Build the histogram of quantization radii over the sampled interior points. */
+	for(a=1;a<r1;a++)
+	{
+		for(b=1;b<r2;b++)
+		{
+			for(c=1;c<r3;c++)
+			{
+				if((a+b+c)%confparams_cpr->sampleDistance!=0)
+					continue;
+				at = a*plane+b*r3+c;
+				predicted = oriData[at-1] + oriData[at-r3] + oriData[at-plane] 
+				- oriData[at-1-plane] - oriData[at-r3-1] - oriData[at-r3-plane] + oriData[at-r3-plane-1];
+				absErr = llabs(predicted - oriData[at]);
+				bucket = (absErr/realPrecision+1)/2;
+				if(bucket>=confparams_cpr->maxRangeRadius)
+					bucket = confparams_cpr->maxRangeRadius - 1;
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	/* Find the smallest radius whose cumulative count exceeds the targeted share of samples. */
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	for(a=0;a<confparams_cpr->maxRangeRadius;a++)
+	{
+		covered += histogram[a];
+		if(covered>threshold)
+			break;
+	}
+	/* The target was never exceeded: settle for the last bucket. */
+	if(a>=confparams_cpr->maxRangeRadius)
+		a = confparams_cpr->maxRangeRadius-1;
+	free(histogram);
+
+	/* Two intervals per radius step, rounded up to a power of two, never below 32. */
+	unsigned int wanted = 2*(a+1);
+	unsigned int rounded = roundUpToPowerOf2(wanted);
+	return rounded<32 ? 32 : rounded;
+}
+/* Estimate the quantization interval count for 4D int8 data (r4 fastest-varying): sample interior points, histogram the radius of a
+ * seven-neighbour prediction error taken over the three fastest dimensions, and return a power of two (>= 32) covering predThreshold of the samples. */
+unsigned int optimize_intervals_int8_4D(int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34] /* strides: 1 (l), r4 (k), r34 (j) */
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	/* two intervals per radius step, rounded up to a power of two, never below 32 */
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+/* Predictive quantization of a 1D int8 series: each point gets a type code (0 = stored exactly, otherwise a quantization bin around the predictor); returns the resulting TightDataPointStorageI. */
+TightDataPointStorageI* SZ_compress_int8_1D_MDQ(int8_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_int8_1D(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+	size_t i;
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); // one quantization code per data point
+		
+	int8_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+	// history of recent reconstructed values, updated through listAdd_int; slot [0] is used as the predictor below
+	int64_t last3CmprsData[3] = {0,0,0};
+				
+	//the first two points are always stored exactly (NOTE(review): indexes 0 and 1 are touched unconditionally -- assumes dataLength >= 2)
+	type[0] = 0;
+	compressInt8Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[0]);
+		
+	type[1] = 0;
+	compressInt8Value(spaceFillingValue[1], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[1]);
+	//printf("%.30G\n",last3CmprsData[0]);	
+	
+	int state;
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision; // largest prediction error that can still be quantized
+	int64_t curData;
+	int64_t pred, predAbsErr;
+	double interval = 2*realPrecision; // width of one quantization bin
+	
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		pred = last3CmprsData[0];
+		predAbsErr = llabs(curData - pred);	
+		if(predAbsErr<checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2; // number of bins between predictor and value (truncated)
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			if(pred>SZ_INT8_MAX) pred = SZ_INT8_MAX; // keep the reconstructed value inside the int8 limits
+			if(pred<SZ_INT8_MIN) pred = SZ_INT8_MIN;			
+			listAdd_int(last3CmprsData, pred);					
+			continue;
+		}
+		
+		//unpredictable data processing		
+		type[i] = 0;
+		compressInt8Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, curData);
+	}//end of for
+		
+	size_t exactDataNum = exactDataByteArray->size / byteSize; // byteSize bytes are appended per exactly stored value
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT8);
+
+//sdi:Debug
+/*	int sum =0;
+	for(i=0;i<dataLength;i++)
+		if(type[i]==0) sum++;
+	printf("opt_quantizations=%d, exactDataNum=%d, sum=%d\n",quantization_intervals, exactDataNum, sum);*/
+	
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper is freed; ->array was handed to new_TightDataPointStorageI above (presumably owned by tdps now -- confirm)
+	
+	return tdps;
+}
+/*
+ * Emit the store-original stream for int8 data: 3 version bytes, one flag byte, the SZ parameter block,
+ * the element count (SZ_SIZE_TYPE bytes), then the raw samples. Marks tdps as lossless and returns a
+ * freshly malloc'ed buffer through newByteData together with its size in outSize.
+ */
+void SZ_compress_args_int8_StoreOriData(int8_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	const int elemBytes = sizeof(int8_t);
+	unsigned char lengthBytes[8];
+	size_t cursor = 0, n;
+	size_t streamBytes = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + elemBytes*dataLength;
+	tdps->isLossless = 1;
+	*newByteData = (unsigned char*)malloc(streamBytes);
+	unsigned char* out = *newByteData;
+	/* version triplet */
+	for (n = 0; n < 3; n++)
+		out[cursor++] = versionNumber[n];
+	/* flag byte: 00010000, with 01000000 added when SZ_SIZE_TYPE is 8 */
+	out[cursor++] = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+	convertSZParamsToBytes(confparams_cpr, &out[cursor]);
+	cursor += MetaDataByteLength;
+	sizeToBytes(lengthBytes,dataLength); //SZ_SIZE_TYPE: 4 or 8
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		out[cursor++] = lengthBytes[n];
+
+	/* the payload starts at the fixed header offset */
+	unsigned char* payload = out+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+	if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		memcpy(payload, oriData, dataLength*elemBytes);
+	else
+	{
+		for(n=0;n<dataLength;n++)
+			payload[n*elemBytes] = oriData[n];
+	}
+	*outSize = streamBytes;
+}
+/* 1D int8 entry point without range check or gzip: quantize, flatten, and fall back to StoreOriData when the encoded stream is larger than the raw input. */
+void SZ_compress_args_int8_NoCkRngeNoGzip_1D(unsigned char** newByteData, int8_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int8_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_int8_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	/* StoreOriData copies exactly the count it is given, so pass dataLength itself; a padded count would read past the end of oriData. */
+	if(*outSize > dataLength*sizeof(int8_t))
+		SZ_compress_args_int8_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	free_TightDataPointStorageI(tdps);
+}
+/* Predictive quantization of a 2D int8 array (r1 rows of r2 values): each point gets a type code (0 = stored exactly, otherwise a quantization bin around its prediction); returns the resulting TightDataPointStorageI. */
+TightDataPointStorageI* SZ_compress_int8_2D_MDQ(int8_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	// interval count: sampled estimate when optQuantMode==1, otherwise the current exe_params->intvCapacity
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int8_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j; 
+	int64_t pred1D, pred2D, curValue, tmp;
+	int diff = 0.0;
+	double itvNum = 0;
+	int8_t *P0, *P1;
+		
+	size_t dataLength = r1*r2;	
+	// P1: reconstructed previous row, P0: row under construction (swapped at the end of each row)
+	P0 = (int8_t*)malloc(r2*sizeof(int8_t));
+	memset(P0, 0, r2*sizeof(int8_t));
+	P1 = (int8_t*)malloc(r2*sizeof(int8_t));
+	memset(P1, 0, r2*sizeof(int8_t));
+		
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	// one quantization code per data point; 0 marks an exactly stored value
+		
+	int8_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+	/* Row-0 data 0: always stored exactly */
+	type[0] = 0;
+	curValue = P1[0] = spaceFillingValue[0];
+	compressInt8Value(curValue, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum =  llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision; // reconstructed value before clamping
+		if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+			P1[1] = tmp;
+		else if(tmp < SZ_INT8_MIN)
+			P1[1] = SZ_INT8_MIN;
+		else
+			P1[1] = SZ_INT8_MAX;
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressInt8Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2]; // linear extrapolation from the two previous reconstructed values
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				P1[j] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				P1[j] = SZ_INT8_MIN;
+			else
+				P1[j] = SZ_INT8_MAX;			
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		index = i*r2;
+		pred1D = P1[0]; // value directly above in the previous row
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				P0[0] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				P0[0] = SZ_INT8_MIN;
+			else
+				P0[0] = SZ_INT8_MAX;			
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1]; // left + above - above-left
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					P0[j] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					P0[j] = SZ_INT8_MIN;
+				else
+					P0[j] = SZ_INT8_MAX;						
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+		// the row just finished becomes the reference row for the next one
+		int8_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	// NOTE(review): P0 is not freed when r2==1 although both buffers were malloc'ed above -- looks like a leak; confirm intent
+	if(r2!=1)
+		free(P0);
+	free(P1);			
+	// NOTE(review): a byte count is used as exactDataNum here, while SZ_compress_int8_1D_MDQ divides by byteSize -- confirm which is expected
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT8);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper is freed; ->array was handed to new_TightDataPointStorageI above (presumably owned by tdps now -- confirm)
+	
+	return tdps;	
+}
+
+/**
+ * 2D int8 entry point without range check or gzip.
+ * Note: @r1 is the high dimension, @r2 is the low dimension.
+ * Falls back to SZ_compress_args_int8_StoreOriData when the encoded stream is larger than the raw data.
+ * */
+void SZ_compress_args_int8_NoCkRngeNoGzip_2D(unsigned char** newByteData, int8_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int8_t minValue)
+{
+	const size_t elemCount = r1*r2;
+	TightDataPointStorageI* storage = SZ_compress_int8_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+	/* Encoded stream larger than elemCount raw int8 values: delegate to the store-original path. */
+	if(elemCount*sizeof(int8_t) < *outSize)
+		SZ_compress_args_int8_StoreOriData(oriData, elemCount, storage, newByteData, outSize);
+
+	free_TightDataPointStorageI(storage);
+}
+
+/* Predictive quantization of an r1 x r2 x r3 int8 array (r1 = layers, r2 = rows, r3 = fastest index).
+ * Each element is predicted from already reconstructed neighbours; type[] receives its quantization
+ * code, or 0 when the element is stored through compressInt8Value in the exact-data byte array. */
+TightDataPointStorageI* SZ_compress_int8_3D_MDQ(int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int8_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue, tmp;
+	int diff = 0;
+	double itvNum = 0;
+	int8_t *P0, *P1;
+	size_t dataLength = r1*r2*r3;		
+	/* Layer 0 is reconstructed into P1; for later layers P0 is the layer being built and P1 the previous one. */
+	size_t r23 = r2*r3;
+	P0 = (int8_t*)malloc(r23*sizeof(int8_t));
+	P1 = (int8_t*)malloc(r23*sizeof(int8_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int8_t* spaceFillingValue = oriData; /* alias of the input; only read below */
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressInt8Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1 -- NOTE(review): done unconditionally, so this assumes r3 >= 2 */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+			P1[1] = tmp;
+		else if(tmp < SZ_INT8_MIN)
+			P1[1] = SZ_INT8_MIN;
+		else
+			P1[1] = SZ_INT8_MAX;		
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressInt8Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				P1[j] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				P1[j] = SZ_INT8_MIN;
+			else
+				P1[j] = SZ_INT8_MAX;			
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				P1[index] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				P1[index] = SZ_INT8_MIN;
+			else
+				P1[index] = SZ_INT8_MAX;			
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P1[index] = spaceFillingValue[index];
+			compressInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					P1[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					P1[index] = SZ_INT8_MIN;
+				else
+					P1[index] = SZ_INT8_MAX;				
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P1[index] = spaceFillingValue[index];
+				compressInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				P0[0] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				P0[0] = SZ_INT8_MIN;
+			else
+				P0[0] = SZ_INT8_MAX;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					P0[j] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					P0[j] = SZ_INT8_MIN;
+				else
+					P0[j] = SZ_INT8_MAX;				
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					P0[index2D] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					P0[index2D] = SZ_INT8_MIN;
+				else
+					P0[index2D] = SZ_INT8_MAX;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+//				if(k==63&&i==43&&j==27)
+//					printf("i=%d\n", i);
+				//index = k*r2*r3 + i*r3 + j;			
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_INT8_MIN)
+						P0[index2D] = SZ_INT8_MIN;
+					else
+						P0[index2D] = SZ_INT8_MAX;
+				}
+				else
+				{
+					type[index] = 0;
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressInt8Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+		int8_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	/* P0 and P1 are two separate allocations (only swapped above), so both are always released. */
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT8);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper is freed here; ->array is not (presumably owned by tdps now -- confirm)
+	
+	return tdps;	
+}
+
+
+/* Runs SZ_compress_int8_3D_MDQ on oriData (r1*r2*r3 elements) and flattens the
+ * result into *newByteData / *outSize via convertTDPStoFlatBytes_int. */
+void SZ_compress_args_int8_NoCkRngeNoGzip_3D(unsigned char** newByteData, int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{	
+	const size_t elemCount = r1*r2*r3;
+	TightDataPointStorageI* storage = SZ_compress_int8_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+	/* Flattened stream larger than the raw input: hand over to SZ_compress_args_int8_StoreOriData. */
+	if(elemCount*sizeof(int8_t) < *outSize)
+		SZ_compress_args_int8_StoreOriData(oriData, elemCount, storage, newByteData, outSize);
+	free_TightDataPointStorageI(storage);	
+}
+
+
+/* Predictive quantization of an r1 x r2 x r3 x r4 int8 array (r4 = fastest index). Each of the r1
+ * sub-blocks of r2*r3*r4 elements is encoded on its own with the 3D scheme: type[] receives the
+ * quantization code, or 0 when the element is stored through compressInt8Value as exact data. */
+TightDataPointStorageI* SZ_compress_int8_4D_MDQ(int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int8_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue, tmp;
+	int diff = 0;
+	double itvNum = 0;
+	int8_t *P0, *P1;
+	size_t dataLength = r1*r2*r3*r4;		
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	P0 = (int8_t*)malloc(r34*sizeof(int8_t));
+	P1 = (int8_t*)malloc(r34*sizeof(int8_t));
+	
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int8_t* spaceFillingValue = oriData; /* alias of the input; only read below */
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+		/* every l is a separate r2 x r3 x r4 block: its first element is always stored exactly */
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		curValue = P1[index2D] = spaceFillingValue[index];
+		compressInt8Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Process Row-0 data 1 -- NOTE(review): done unconditionally, so this assumes r4 >= 2 */
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[index] - pred1D; /* residual of THIS element, not of the previous curValue */
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				P1[index2D] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				P1[index2D] = SZ_INT8_MIN;
+			else
+				P1[index2D] = SZ_INT8_MAX;			
+		}
+		else
+		{
+			type[index] = 0;
+			/* unpredictable: keep the exact value of the element at index (same rule in every branch below) */
+			curValue = P1[index2D] = spaceFillingValue[index];
+			compressInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process Row-0 data 2 --> data r4-1 */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					P1[index2D] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					P1[index2D] = SZ_INT8_MIN;
+				else
+					P1[index2D] = SZ_INT8_MAX;					
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0 */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					P1[index2D] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					P1[index2D] = SZ_INT8_MIN;
+				else
+					P1[index2D] = SZ_INT8_MAX;					
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process row-i data 1 --> data r4-1*/
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+						P1[index2D] = tmp;
+					else if(tmp < SZ_INT8_MIN)
+						P1[index2D] = SZ_INT8_MIN;
+					else
+						P1[index2D] = SZ_INT8_MAX;						
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P1[index2D] = spaceFillingValue[index];
+					compressInt8Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0*/
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					P0[index2D] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					P0[index2D] = SZ_INT8_MIN;
+				else
+					P0[index2D] = SZ_INT8_MAX;					
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_INT8_MIN)
+						P0[index2D] = SZ_INT8_MIN;
+					else
+						P0[index2D] = SZ_INT8_MAX;						
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressInt8Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0 */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_INT8_MIN)
+						P0[index2D] = SZ_INT8_MIN;
+					else
+						P0[index2D] = SZ_INT8_MAX;						
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressInt8Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					diff = spaceFillingValue[index] - pred3D;
+
+
+					itvNum = llabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+						tmp = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+						if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+							P0[index2D] = tmp;
+						else if(tmp < SZ_INT8_MIN)
+							P0[index2D] = SZ_INT8_MIN;
+						else
+							P0[index2D] = SZ_INT8_MAX;							
+					}
+					else
+					{
+						type[index] = 0;
+
+						curValue = P0[index2D] = spaceFillingValue[index];
+						compressInt8Value(curValue, minValue, byteSize, bytes);
+						memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+					}
+				}
+			}
+
+			int8_t *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT8);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper is freed here; ->array is not (presumably owned by tdps now -- confirm)
+	
+	return tdps;	
+}
+
+/* Runs SZ_compress_int8_4D_MDQ on oriData (r1*r2*r3*r4 elements) and flattens the
+ * result into *newByteData / *outSize via convertTDPStoFlatBytes_int. */
+void SZ_compress_args_int8_NoCkRngeNoGzip_4D(unsigned char** newByteData, int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	const size_t elemCount = r1*r2*r3*r4;
+	TightDataPointStorageI* storage = SZ_compress_int8_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+	/* Flattened stream larger than the raw input: hand over to SZ_compress_args_int8_StoreOriData. */
+	if(elemCount*sizeof(int8_t) < *outSize)
+		SZ_compress_args_int8_StoreOriData(oriData, elemCount, storage, newByteData, outSize);
+	free_TightDataPointStorageI(storage);
+}
+
+/* Builds a minimal storage record that carries only oriData[0], flagged with allSameData = 1,
+ * and flattens it into *newByteData / *outSize via convertTDPStoFlatBytes_int. */
+void SZ_compress_args_int8_withinRange(unsigned char** newByteData, int8_t *oriData, size_t dataLength, size_t *outSize)
+{
+	size_t flatSize;
+	const int8_t representative = oriData[0];
+	TightDataPointStorageI* storage = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+
+	/* Only the fields below are initialised; the rest of the struct stays as malloc returned it. */
+	storage->typeArray = NULL;
+	storage->isLossless = 0;
+	storage->allSameData = 1;
+	storage->dataSeriesLength = dataLength;
+	storage->exactDataNum = 1;
+	storage->exactDataBytes_size = 1;
+	storage->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char));
+	memcpy(storage->exactDataBytes, &representative, 1);	/* one raw byte, copied as is */
+
+	convertTDPStoFlatBytes_int(storage, newByteData, &flatSize);
+	*outSize = flatSize;
+
+	free_TightDataPointStorageI(storage);	
+}
+
+int SZ_compress_args_int8_wRngeNoGzip(unsigned char** newByteData, int8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+	
+	int8_t minValue = computeRangeSize_int(oriData, SZ_INT8, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int8_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+//		SZ_compress_args_int8_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize);
+		if(r5==0&&r4==0&&r3==0&&r2==0)
+		{
+			SZ_compress_args_int8_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0&&r3==0)
+		{
+			SZ_compress_args_int8_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0)
+		{
+			SZ_compress_args_int8_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0)
+		{
+			SZ_compress_args_int8_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+	}
+	return status;
+}
+
+/* Top-level int8 compression entry point.
+ * Dimensions run from slowest (r5) to fastest (r1); unused dimensions are passed as 0.
+ * Returns SZ_SCES on success, SZ_DERR for 5-dimensional input and SZ_MERR for an unsupported
+ * confparams_cpr->szMode; in both error cases *newByteData and *outSize are not written. */
+int SZ_compress_args_int8(unsigned char** newByteData, int8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		exit(0); /* terminates the process, so the return below is never reached */
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	int8_t minValue = (int8_t)computeRangeSize_int(oriData, SZ_INT8, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		/* PSNR mode is turned into an absolute error bound and recorded as ABS */
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int8_withinRange(newByteData, oriData, dataLength, outSize);
+		return status;
+	}
+
+	size_t tmpOutSize = 0;
+	unsigned char* tmpByteData = NULL;
+
+	/* Pick the compressor from the number of dimensions in use. */
+	if (r2==0)
+		SZ_compress_args_int8_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+	else if (r3==0)
+		SZ_compress_args_int8_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+	else if (r4==0)
+		SZ_compress_args_int8_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+	else if (r5==0)
+		SZ_compress_args_int8_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+	else
+	{
+		printf("Error: doesn't support 5 dimensions for now.\n");
+		/* No compressor ran, so there is no intermediate stream: return before the
+		 * post-processing below would publish, compress or free a buffer that does not exist. */
+		return SZ_DERR; //dimension error
+	}
+
+	/* SZ_BEST_SPEED hands the intermediate stream to the caller as is; the two compression
+	 * modes pass it through sz_lossless_compress and then release it. */
+	if(confparams_cpr->szMode==SZ_BEST_SPEED)
+	{
+		*outSize = tmpOutSize;
+		*newByteData = tmpByteData;
+	}
+	else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+	{
+		*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
+		free(tmpByteData);
+	}
+	else
+	{
+		printf("Error: Wrong setting of confparams_cpr->szMode in the int8_t compression.\n");
+		/* the intermediate stream is never handed to the caller on this path, so release it */
+		free(tmpByteData);
+		status = SZ_MERR; //mode error			
+	}
+
+	return status;
+}
diff --git a/deps/SZ/sz/src/sz_interface.F90 b/deps/SZ/sz/src/sz_interface.F90
new file mode 100644
index 0000000000000000000000000000000000000000..4a3f2a04390b4bba2ed7f5b56f76fbe9e2e686f3
--- /dev/null
+++ b/deps/SZ/sz/src/sz_interface.F90
@@ -0,0 +1,1207 @@
+!  @file   sz_interface.F90
+!  @author Sheng Di (disheng222@gmail.com or sdi1@anl.gov)
+!  @date   June, 2016
+!  @ Mathematics and Computer Science (MCS)
+!  @ Argonne National Laboratory, Lemont, USA.
+!  @brief  The key Fortran binding file to connect C language and Fortran (Fortran part)
+
+
+MODULE SZ
+	use :: ISO_C_BINDING
+	INTERFACE SZ_Compress  ! generic name covering the specific module procedures listed below
+		MODULE PROCEDURE SZ_Compress_d1_Fortran_REAL_K4  ! REAL_K4 specifics, ranks d1..d5
+		MODULE PROCEDURE SZ_Compress_d2_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_Compress_d3_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_Compress_d4_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_Compress_d5_Fortran_REAL_K4		
+		MODULE PROCEDURE SZ_Compress_d1_Fortran_REAL_K4_ARGS  ! _ARGS: extra ErrBoundMode, AbsErrBound, RelBoundRatio arguments
+		MODULE PROCEDURE SZ_Compress_d2_Fortran_REAL_K4_ARGS
+		MODULE PROCEDURE SZ_Compress_d3_Fortran_REAL_K4_ARGS
+		MODULE PROCEDURE SZ_Compress_d4_Fortran_REAL_K4_ARGS
+		MODULE PROCEDURE SZ_Compress_d5_Fortran_REAL_K4_ARGS
+		MODULE PROCEDURE SZ_Compress_d1_Fortran_REAL_K8  ! REAL_K8 specifics, ranks d1..d5
+		MODULE PROCEDURE SZ_Compress_d2_Fortran_REAL_K8
+		MODULE PROCEDURE SZ_Compress_d3_Fortran_REAL_K8
+		MODULE PROCEDURE SZ_Compress_d4_Fortran_REAL_K8
+		MODULE PROCEDURE SZ_Compress_d5_Fortran_REAL_K8
+		MODULE PROCEDURE SZ_Compress_d1_Fortran_REAL_K8_ARGS
+		MODULE PROCEDURE SZ_Compress_d2_Fortran_REAL_K8_ARGS
+		MODULE PROCEDURE SZ_Compress_d3_Fortran_REAL_K8_ARGS
+		MODULE PROCEDURE SZ_Compress_d4_Fortran_REAL_K8_ARGS
+		MODULE PROCEDURE SZ_Compress_d5_Fortran_REAL_K8_ARGS
+		MODULE PROCEDURE SZ_Compress_d1_Fortran_REAL_K4_Rev  ! _Rev specifics (bodies are outside this part of the file)
+		MODULE PROCEDURE SZ_Compress_d2_Fortran_REAL_K4_Rev
+		MODULE PROCEDURE SZ_Compress_d3_Fortran_REAL_K4_Rev
+		MODULE PROCEDURE SZ_Compress_d4_Fortran_REAL_K4_Rev
+		MODULE PROCEDURE SZ_Compress_d5_Fortran_REAL_K4_Rev	
+		MODULE PROCEDURE SZ_Compress_d1_Fortran_REAL_K4_ARGS_Rev
+		MODULE PROCEDURE SZ_Compress_d2_Fortran_REAL_K4_ARGS_Rev
+		MODULE PROCEDURE SZ_Compress_d3_Fortran_REAL_K4_ARGS_Rev
+		MODULE PROCEDURE SZ_Compress_d4_Fortran_REAL_K4_ARGS_Rev
+		MODULE PROCEDURE SZ_Compress_d5_Fortran_REAL_K4_ARGS_Rev
+		MODULE PROCEDURE SZ_Compress_d1_Fortran_REAL_K8_Rev
+		MODULE PROCEDURE SZ_Compress_d2_Fortran_REAL_K8_Rev
+		MODULE PROCEDURE SZ_Compress_d3_Fortran_REAL_K8_Rev
+		MODULE PROCEDURE SZ_Compress_d4_Fortran_REAL_K8_Rev
+		MODULE PROCEDURE SZ_Compress_d5_Fortran_REAL_K8_Rev
+		MODULE PROCEDURE SZ_Compress_d1_Fortran_REAL_K8_ARGS_Rev
+		MODULE PROCEDURE SZ_Compress_d2_Fortran_REAL_K8_ARGS_Rev
+		MODULE PROCEDURE SZ_Compress_d3_Fortran_REAL_K8_ARGS_Rev
+		MODULE PROCEDURE SZ_Compress_d4_Fortran_REAL_K8_ARGS_Rev
+		MODULE PROCEDURE SZ_Compress_d5_Fortran_REAL_K8_ARGS_Rev
+	END INTERFACE SZ_Compress
+
+	INTERFACE SZ_Decompress  ! generic name covering the specific module procedures listed below
+		MODULE PROCEDURE SZ_Decompress_d1_Fortran_REAL_K4  ! REAL_K4 specifics, ranks d1..d5
+		MODULE PROCEDURE SZ_Decompress_d2_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_Decompress_d3_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_Decompress_d4_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_Decompress_d5_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_Decompress_d1_Fortran_REAL_K8  ! REAL_K8 specifics, ranks d1..d5
+		MODULE PROCEDURE SZ_Decompress_d2_Fortran_REAL_K8
+		MODULE PROCEDURE SZ_Decompress_d3_Fortran_REAL_K8
+		MODULE PROCEDURE SZ_Decompress_d4_Fortran_REAL_K8
+		MODULE PROCEDURE SZ_Decompress_d5_Fortran_REAL_K8
+	END INTERFACE SZ_Decompress
+
+	INTERFACE SZ_BatchAddVar  ! generic name covering the specific module procedures listed below
+		MODULE PROCEDURE SZ_BatchAddVar_d1_Fortran_REAL_K4  ! REAL_K4 specifics, ranks d1..d5
+		MODULE PROCEDURE SZ_BatchAddVar_d2_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_BatchAddVar_d3_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_BatchAddVar_d4_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_BatchAddVar_d5_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_BatchAddVar_d1_Fortran_REAL_K8  ! REAL_K8 specifics, ranks d1..d5
+		MODULE PROCEDURE SZ_BatchAddVar_d2_Fortran_REAL_K8
+		MODULE PROCEDURE SZ_BatchAddVar_d3_Fortran_REAL_K8
+		MODULE PROCEDURE SZ_BatchAddVar_d4_Fortran_REAL_K8
+		MODULE PROCEDURE SZ_BatchAddVar_d5_Fortran_REAL_K8
+	END INTERFACE SZ_BatchAddVar
+
+	INTERFACE SZ_GetVarData  ! generic name covering the specific module procedures listed below
+		MODULE PROCEDURE SZ_GetVarData_d1_Fortran_REAL_K4  ! REAL_K4 specifics, ranks d1..d5
+		MODULE PROCEDURE SZ_GetVarData_d2_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_GetVarData_d3_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_GetVarData_d4_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_GetVarData_d5_Fortran_REAL_K4
+		MODULE PROCEDURE SZ_GetVarData_d1_Fortran_REAL_K8  ! REAL_K8 specifics, ranks d1..d5
+		MODULE PROCEDURE SZ_GetVarData_d2_Fortran_REAL_K8
+		MODULE PROCEDURE SZ_GetVarData_d3_Fortran_REAL_K8
+		MODULE PROCEDURE SZ_GetVarData_d4_Fortran_REAL_K8
+		MODULE PROCEDURE SZ_GetVarData_d5_Fortran_REAL_K8								
+	END INTERFACE SZ_GetVarData
+
+	CONTAINS
+
+!Init and Finalize
+
+	SUBROUTINE SZ_Init(config_File,ierr)  ! forwards the name and its blank-trimmed length to SZ_Init_c
+		implicit none
+		INTEGER :: ierr
+		CHARACTER(len=32) :: config_File  ! fixed 32-character dummy argument
+		CALL SZ_Init_c(config_File,LEN_TRIM(config_File),ierr)
+	END SUBROUTINE SZ_Init
+	
+	SUBROUTINE SZ_Finalize()  ! thin wrapper: delegates to SZ_Finalize_c
+		CALL SZ_Finalize_c()
+	END SUBROUTINE SZ_Finalize
+
+	SUBROUTINE SZ_FREE_VARSET(mode)  ! thin wrapper: passes mode unchanged to SZ_Freevarset_c
+		implicit none
+		INTEGER :: mode !0,1,2, or 3
+		
+		CALL SZ_Freevarset_c(mode)
+	END SUBROUTINE SZ_FREE_VARSET
+
+!batch-mode functions
+
+	SUBROUTINE SZ_BatchDelVar(varName, ierr)  ! forwards the name and its blank-trimmed length to SZ_BatchDelVar_c
+		implicit none
+		INTEGER :: ierr
+		CHARACTER(len=*) :: varName
+		CALL SZ_BatchDelVar_c(varName, LEN_TRIM(varName), ierr)
+	END SUBROUTINE SZ_BatchDelVar
+
+	SUBROUTINE SZ_Batch_Compress(Bytes, OutSize)  ! sizes Bytes, then calls SZ_Batch_Compress_c
+		implicit none
+		INTEGER(kind=C_SIZE_T) :: OutSize
+		INTEGER(kind=C_SIZE_T) :: capacity
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+
+		! the buffer size is obtained from compute_total_batch_size_c before allocating
+		CALL compute_total_batch_size_c(capacity)
+		allocate(Bytes(capacity)) !allocate the largest possible memory
+		CALL SZ_Batch_Compress_c(Bytes, OutSize)
+		
+	END SUBROUTINE SZ_Batch_Compress
+
+	SUBROUTINE SZ_Batch_Decompress(Bytes, OutSize, ierr)  ! thin wrapper around SZ_Batch_Decompress_c
+		implicit none
+		INTEGER :: ierr
+		INTEGER(kind=C_SIZE_T) :: OutSize
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+
+		CALL SZ_Batch_Decompress_c(Bytes, OutSize, ierr)
+	END SUBROUTINE SZ_Batch_Decompress
+
+!Compress functions that extract the dimension sizes and call C translation interface (single-precision)
+
+	SUBROUTINE SZ_Compress_d1_Fortran_REAL_K4(VAR, Bytes, OutSize)  ! rank-1 REAL(4) front end for SZ_Compress_d1_Float
+		implicit none
+		REAL(KIND=4), DIMENSION(:) :: VAR
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1
+
+		R1 = SIZE(VAR,1)
+		! 8 entries of Bytes are reserved per input element
+		allocate(Bytes(8*R1)) !allocate the largest possible memory
+
+		CALL SZ_Compress_d1_Float(VAR, Bytes, OutSize, R1)
+	END SUBROUTINE SZ_Compress_d1_Fortran_REAL_K4
+
+	SUBROUTINE SZ_Compress_d1_Fortran_REAL_K4_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		implicit none
+		! rank-1 REAL(4) front end for SZ_Compress_d1_Float_Args; error-bound settings come from the caller
+		REAL(KIND=4), DIMENSION(:) :: VAR
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1
+		INTEGER(kind=4) :: ErrBoundMode
+		REAL(kind=4) :: AbsErrBound, RelBoundRatio
+
+		R1 = SIZE(VAR,1)
+		allocate(Bytes(8*R1)) !allocate the largest possible memory
+
+		CALL SZ_Compress_d1_Float_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1)
+	END SUBROUTINE SZ_Compress_d1_Fortran_REAL_K4_ARGS
+
+	SUBROUTINE SZ_Compress_d2_Fortran_REAL_K4(VAR, Bytes, OutSize)  ! rank-2 REAL(4) front end for SZ_Compress_d2_Float
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:) :: VAR
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, nElems
+
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		nElems = R1*R2
+
+		! 8 entries of Bytes are reserved per input element
+		allocate(Bytes(8*nElems)) !allocate the largest possible memory
+
+		CALL SZ_Compress_d2_Float(VAR, Bytes, OutSize, R1, R2)
+	END SUBROUTINE SZ_Compress_d2_Fortran_REAL_K4
+
+	SUBROUTINE SZ_Compress_d2_Fortran_REAL_K4_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		implicit none
+		! rank-2 REAL(4) front end for SZ_Compress_d2_Float_Args; error-bound settings come from the caller
+		REAL(KIND=4), DIMENSION(:,:) :: VAR
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, nElems
+		INTEGER(kind=4) :: ErrBoundMode
+		REAL(kind=4) :: AbsErrBound, RelBoundRatio
+
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		nElems = R1*R2
+		allocate(Bytes(8*nElems))  !allocate the largest possible memory
+
+		CALL SZ_Compress_d2_Float_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2)
+	END SUBROUTINE SZ_Compress_d2_Fortran_REAL_K4_ARGS
+
+	subroutine SZ_Compress_d3_Fortran_REAL_K4(VAR, Bytes, OutSize)
+		! Rank-3 single-precision compression: allocates Bytes, then calls SZ_Compress_d3_Float.
+		implicit none
+		real(kind=4) :: VAR(:,:,:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, R3, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		NElem = R1*R2*R3
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d3_Float(VAR, Bytes, OutSize, R1, R2, R3)
+	end subroutine SZ_Compress_d3_Fortran_REAL_K4
+
+	subroutine SZ_Compress_d3_Fortran_REAL_K4_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-3 single-precision compression with caller-supplied error-bound settings (SZ_Compress_d3_Float_Args).
+		implicit none
+		real(kind=4) :: VAR(:,:,:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		NElem = R1*R2*R3
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d3_Float_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3)
+	end subroutine SZ_Compress_d3_Fortran_REAL_K4_ARGS
+
+	subroutine SZ_Compress_d4_Fortran_REAL_K4(VAR, Bytes, OutSize)
+		! Rank-4 single-precision compression: allocates Bytes, then calls SZ_Compress_d4_Float.
+		implicit none
+		real(kind=4) :: VAR(:,:,:,:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		NElem = R1*R2*R3*R4
+		! 8 entries per input element; intended by the original author as the largest possible output
+		allocate(Bytes(8*NElem))
+		call SZ_Compress_d4_Float(VAR, Bytes, OutSize, R1, R2, R3, R4)
+	end subroutine SZ_Compress_d4_Fortran_REAL_K4
+
+	subroutine SZ_Compress_d4_Fortran_REAL_K4_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-4 single-precision compression with caller-supplied error-bound settings (SZ_Compress_d4_Float_Args).
+		implicit none
+		real(kind=4) :: VAR(:,:,:,:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		NElem = R1*R2*R3*R4
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d4_Float_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4)
+	end subroutine SZ_Compress_d4_Fortran_REAL_K4_ARGS
+
+	subroutine SZ_Compress_d5_Fortran_REAL_K4(VAR, Bytes, OutSize)
+		! Rank-5 single-precision compression: allocates Bytes, then calls SZ_Compress_d5_Float.
+		implicit none
+		real(kind=4) :: VAR(:,:,:,:,:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, R5, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		R5 = size(VAR, dim=5)
+		NElem = R1*R2*R3*R4*R5
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d5_Float(VAR, Bytes, OutSize, R1, R2, R3, R4, R5)
+	end subroutine SZ_Compress_d5_Fortran_REAL_K4
+
+	subroutine SZ_Compress_d5_Fortran_REAL_K4_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-5 single-precision compression with caller-supplied error-bound settings (SZ_Compress_d5_Float_Args).
+		implicit none
+		real(kind=4) :: VAR(:,:,:,:,:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, R5, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		R5 = size(VAR, dim=5)
+		NElem = R1*R2*R3*R4*R5
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d5_Float_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4, R5)
+	end subroutine SZ_Compress_d5_Fortran_REAL_K4_ARGS
+
+!Compress functions that extract the dimension sizes and call C translation interfaces (double-precision)
+
+	subroutine SZ_Compress_d1_Fortran_REAL_K8(VAR, Bytes, OutSize)
+		! Rank-1 double-precision compression: allocates Bytes, then calls SZ_Compress_d1_Double.
+		implicit none
+		real(kind=8) :: VAR(:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1
+		R1 = size(VAR, dim=1)
+		allocate(Bytes(8*R1))   ! 8 entries per input element
+		call SZ_Compress_d1_Double(VAR, Bytes, OutSize, R1)
+	end subroutine SZ_Compress_d1_Fortran_REAL_K8
+
+	subroutine SZ_Compress_d1_Fortran_REAL_K8_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-1 double-precision compression with caller-supplied error-bound settings (SZ_Compress_d1_Double_Args).
+		implicit none
+		real(kind=8) :: VAR(:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=4) :: ErrBoundMode
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: OutSize, R1
+		R1 = size(VAR, dim=1)
+		allocate(Bytes(8*R1))   ! 8 entries per input element
+		call SZ_Compress_d1_Double_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1)
+	end subroutine SZ_Compress_d1_Fortran_REAL_K8_ARGS
+
+	subroutine SZ_Compress_d2_Fortran_REAL_K8(VAR, Bytes, OutSize)
+		! Rank-2 double-precision compression: allocates Bytes, then calls SZ_Compress_d2_Double.
+		implicit none
+		real(kind=8) :: VAR(:,:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, NElem
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		NElem = R1*R2
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d2_Double(VAR, Bytes, OutSize, R1, R2)
+	end subroutine SZ_Compress_d2_Fortran_REAL_K8
+
+	subroutine SZ_Compress_d2_Fortran_REAL_K8_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-2 double-precision compression with caller-supplied error-bound settings (SZ_Compress_d2_Double_Args).
+		implicit none
+		real(kind=8) :: VAR(:,:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, NElem
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		NElem = R1*R2
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d2_Double_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2)
+	end subroutine SZ_Compress_d2_Fortran_REAL_K8_ARGS
+
+	subroutine SZ_Compress_d3_Fortran_REAL_K8(VAR, Bytes, OutSize)
+		! Rank-3 double-precision compression: allocates Bytes, then calls SZ_Compress_d3_Double.
+		implicit none
+		real(kind=8) :: VAR(:,:,:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, R3, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		NElem = R1*R2*R3
+		! 8 entries per input element; intended by the original author as the largest possible output
+		allocate(Bytes(8*NElem))
+		call SZ_Compress_d3_Double(VAR, Bytes, OutSize, R1, R2, R3)
+	end subroutine SZ_Compress_d3_Fortran_REAL_K8
+
+	subroutine SZ_Compress_d3_Fortran_REAL_K8_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-3 double-precision compression with caller-supplied error-bound settings (SZ_Compress_d3_Double_Args).
+		implicit none
+		real(kind=8) :: VAR(:,:,:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		NElem = R1*R2*R3
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d3_Double_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3)
+	end subroutine SZ_Compress_d3_Fortran_REAL_K8_ARGS
+
+	subroutine SZ_Compress_d4_Fortran_REAL_K8(VAR, Bytes, OutSize)
+		! Rank-4 double-precision compression: allocates Bytes, then calls SZ_Compress_d4_Double.
+		implicit none
+		real(kind=8) :: VAR(:,:,:,:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		NElem = R1*R2*R3*R4
+		! 8 entries per input element; intended by the original author as the largest possible output
+		allocate(Bytes(8*NElem))
+		call SZ_Compress_d4_Double(VAR, Bytes, OutSize, R1, R2, R3, R4)
+	end subroutine SZ_Compress_d4_Fortran_REAL_K8
+
+	subroutine SZ_Compress_d4_Fortran_REAL_K8_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-4 double-precision compression with caller-supplied error-bound settings (SZ_Compress_d4_Double_Args).
+		implicit none
+		real(kind=8) :: VAR(:,:,:,:)                ! data handed to the C translation routine
+		integer(kind=1), allocatable :: Bytes(:)    ! allocated here with 8 entries per element of VAR
+		integer(kind=C_SIZE_T) :: OutSize           ! forwarded as-is to SZ_Compress_d4_Double_Args
+		integer(kind=4) :: ErrBoundMode             ! kind=4 like every other *_ARGS wrapper (was C_SIZE_T)
+		real(kind=8) :: AbsErrBound, RelBoundRatio  ! forwarded as-is
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		NElem = R1*R2*R3*R4
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d4_Double_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4)
+	end subroutine SZ_Compress_d4_Fortran_REAL_K8_ARGS
+
+	subroutine SZ_Compress_d5_Fortran_REAL_K8(VAR, Bytes, OutSize)
+		! Rank-5 double-precision compression: allocates Bytes, then calls SZ_Compress_d5_Double.
+		implicit none
+		real(kind=8) :: VAR(:,:,:,:,:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, R5, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		R5 = size(VAR, dim=5)
+		NElem = R1*R2*R3*R4*R5
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d5_Double(VAR, Bytes, OutSize, R1, R2, R3, R4, R5)
+	end subroutine SZ_Compress_d5_Fortran_REAL_K8
+
+	subroutine SZ_Compress_d5_Fortran_REAL_K8_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-5 double-precision compression with caller-supplied error-bound settings (SZ_Compress_d5_Double_Args).
+		implicit none
+		real(kind=8) :: VAR(:,:,:,:,:)
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, R5, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		R5 = size(VAR, dim=5)
+		NElem = R1*R2*R3*R4*R5
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d5_Double_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4, R5)
+	end subroutine SZ_Compress_d5_Fortran_REAL_K8_ARGS
+
+!Compression functions with reserved value
+
+	subroutine SZ_Compress_d1_Fortran_REAL_K4_Rev(VAR, ReValue, Bytes, OutSize)
+		! Rank-1 single-precision compression with a reserved value (ReValue); calls SZ_Compress_d1_Float_Rev.
+		implicit none
+		real(kind=4) :: VAR(:)
+		real(kind=4) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1
+		R1 = size(VAR, dim=1)
+		allocate(Bytes(8*R1))   ! 8 entries per input element
+		call SZ_Compress_d1_Float_Rev(VAR, ReValue, Bytes, OutSize, R1)
+	end subroutine SZ_Compress_d1_Fortran_REAL_K4_Rev
+
+	subroutine SZ_Compress_d1_Fortran_REAL_K4_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-1 single-precision compression with ReValue and error-bound settings (SZ_Compress_d1_Float_Rev_Args).
+		implicit none
+		real(kind=4) :: VAR(:)
+		real(kind=4) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1
+		R1 = size(VAR, dim=1)
+		allocate(Bytes(8*R1))   ! 8 entries per input element
+		call SZ_Compress_d1_Float_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1)
+	end subroutine SZ_Compress_d1_Fortran_REAL_K4_ARGS_Rev
+
+	subroutine SZ_Compress_d2_Fortran_REAL_K4_Rev(VAR, ReValue, Bytes, OutSize)
+		! Rank-2 single-precision compression with a reserved value (ReValue); calls SZ_Compress_d2_Float_Rev.
+		implicit none
+		real(kind=4) :: VAR(:,:)
+		real(kind=4) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, NElem
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		NElem = R1*R2
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d2_Float_Rev(VAR, ReValue, Bytes, OutSize, R1, R2)
+	end subroutine SZ_Compress_d2_Fortran_REAL_K4_Rev
+
+	subroutine SZ_Compress_d2_Fortran_REAL_K4_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-2 single-precision compression with ReValue and error-bound settings (SZ_Compress_d2_Float_Rev_Args).
+		implicit none
+		real(kind=4) :: VAR(:,:)
+		real(kind=4) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, NElem
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		NElem = R1*R2
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d2_Float_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2)
+	end subroutine SZ_Compress_d2_Fortran_REAL_K4_ARGS_Rev
+
+	subroutine SZ_Compress_d3_Fortran_REAL_K4_Rev(VAR, ReValue, Bytes, OutSize)
+		! Rank-3 single-precision compression with a reserved value (ReValue).
+		! Allocates Bytes, then delegates to SZ_Compress_d3_Float_Rev.
+		implicit none
+		real(kind=4) :: VAR(:,:,:)                 ! data handed to the C translation routine
+		real(kind=4) :: ReValue                    ! reserved value, forwarded as-is
+		integer(kind=1), allocatable :: Bytes(:)   ! allocated here with 8 entries per element of VAR
+		integer(kind=C_SIZE_T) :: OutSize          ! forwarded as-is to SZ_Compress_d3_Float_Rev
+		integer(kind=C_SIZE_T) :: R1, R2, R3, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		NElem = R1*R2*R3
+		allocate(Bytes(8*NElem))   ! intended by the original author as the largest possible output
+		call SZ_Compress_d3_Float_Rev(VAR, ReValue, Bytes, OutSize, R1, R2, R3)
+	end subroutine SZ_Compress_d3_Fortran_REAL_K4_Rev
+
+	subroutine SZ_Compress_d3_Fortran_REAL_K4_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-3 single-precision compression with ReValue and error-bound settings (SZ_Compress_d3_Float_Rev_Args).
+		implicit none
+		real(kind=4) :: VAR(:,:,:)
+		real(kind=4) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		NElem = R1*R2*R3
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d3_Float_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3)
+	end subroutine SZ_Compress_d3_Fortran_REAL_K4_ARGS_Rev
+
+	subroutine SZ_Compress_d4_Fortran_REAL_K4_Rev(VAR, ReValue, Bytes, OutSize)
+		! Rank-4 single-precision compression with a reserved value (ReValue); calls SZ_Compress_d4_Float_Rev.
+		implicit none
+		real(kind=4) :: VAR(:,:,:,:)
+		real(kind=4) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		NElem = R1*R2*R3*R4
+		! 8 entries per input element; intended by the original author as the largest possible output
+		allocate(Bytes(8*NElem))
+		call SZ_Compress_d4_Float_Rev(VAR, ReValue, Bytes, OutSize, R1, R2, R3, R4)
+	end subroutine SZ_Compress_d4_Fortran_REAL_K4_Rev
+
+	subroutine SZ_Compress_d4_Fortran_REAL_K4_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-4 single-precision compression with ReValue and error-bound settings (SZ_Compress_d4_Float_Rev_Args).
+		implicit none
+		real(kind=4) :: VAR(:,:,:,:)
+		real(kind=4) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		NElem = R1*R2*R3*R4
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d4_Float_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4)
+	end subroutine SZ_Compress_d4_Fortran_REAL_K4_ARGS_Rev
+
+	subroutine SZ_Compress_d5_Fortran_REAL_K4_Rev(VAR, ReValue, Bytes, OutSize)
+		! Rank-5 single-precision compression with a reserved value (ReValue); calls SZ_Compress_d5_Float_Rev.
+		implicit none
+		real(kind=4) :: VAR(:,:,:,:,:)
+		real(kind=4) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, R5, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		R5 = size(VAR, dim=5)
+		NElem = R1*R2*R3*R4*R5
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d5_Float_Rev(VAR, ReValue, Bytes, OutSize, R1, R2, R3, R4, R5)
+	end subroutine SZ_Compress_d5_Fortran_REAL_K4_Rev
+
+	subroutine SZ_Compress_d5_Fortran_REAL_K4_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-5 single-precision compression with ReValue and error-bound settings (SZ_Compress_d5_Float_Rev_Args).
+		implicit none
+		real(kind=4) :: VAR(:,:,:,:,:)
+		real(kind=4) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, R5, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		R5 = size(VAR, dim=5)
+		NElem = R1*R2*R3*R4*R5
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d5_Float_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4, R5)
+	end subroutine SZ_Compress_d5_Fortran_REAL_K4_ARGS_Rev
+
+!Reserved-value compress functions that extract the dimension sizes and call C translation interfaces (double-precision)
+
+	subroutine SZ_Compress_d1_Fortran_REAL_K8_Rev(VAR, ReValue, Bytes, OutSize)
+		! Rank-1 double-precision compression with a reserved value (ReValue); calls SZ_Compress_d1_Double_Rev.
+		implicit none
+		real(kind=8) :: VAR(:)
+		real(kind=8) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1
+		R1 = size(VAR, dim=1)
+		allocate(Bytes(8*R1))   ! 8 entries per input element
+		call SZ_Compress_d1_Double_Rev(VAR, ReValue, Bytes, OutSize, R1)
+	end subroutine SZ_Compress_d1_Fortran_REAL_K8_Rev
+
+	subroutine SZ_Compress_d1_Fortran_REAL_K8_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-1 double-precision compression with ReValue and error-bound settings (SZ_Compress_d1_Double_Rev_Args).
+		implicit none
+		real(kind=8) :: VAR(:)
+		real(kind=8) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1
+		R1 = size(VAR, dim=1)
+		allocate(Bytes(8*R1))   ! 8 entries per input element
+		call SZ_Compress_d1_Double_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1)
+	end subroutine SZ_Compress_d1_Fortran_REAL_K8_ARGS_Rev
+
+	subroutine SZ_Compress_d2_Fortran_REAL_K8_Rev(VAR, ReValue, Bytes, OutSize)
+		! Rank-2 double-precision compression with a reserved value (ReValue); calls SZ_Compress_d2_Double_Rev.
+		implicit none
+		real(kind=8) :: VAR(:,:)
+		real(kind=8) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, NElem
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		NElem = R1*R2
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d2_Double_Rev(VAR, ReValue, Bytes, OutSize, R1, R2)
+	end subroutine SZ_Compress_d2_Fortran_REAL_K8_Rev
+
+	subroutine SZ_Compress_d2_Fortran_REAL_K8_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-2 double-precision compression with ReValue and error-bound settings (SZ_Compress_d2_Double_Rev_Args).
+		implicit none
+		real(kind=8) :: VAR(:,:)
+		real(kind=8) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, NElem
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		NElem = R1*R2
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d2_Double_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2)
+	end subroutine SZ_Compress_d2_Fortran_REAL_K8_ARGS_Rev
+
+	subroutine SZ_Compress_d3_Fortran_REAL_K8_Rev(VAR, ReValue, Bytes, OutSize)
+		! Rank-3 double-precision compression with a reserved value (ReValue); calls SZ_Compress_d3_Double_Rev.
+		implicit none
+		real(kind=8) :: VAR(:,:,:)
+		real(kind=8) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, R3, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		NElem = R1*R2*R3
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d3_Double_Rev(VAR, ReValue, Bytes, OutSize, R1, R2, R3)
+	end subroutine SZ_Compress_d3_Fortran_REAL_K8_Rev
+
+	subroutine SZ_Compress_d3_Fortran_REAL_K8_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-3 double-precision compression with ReValue and error-bound settings (SZ_Compress_d3_Double_Rev_Args).
+		implicit none
+		real(kind=8) :: VAR(:,:,:)
+		real(kind=8) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		NElem = R1*R2*R3
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d3_Double_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3)
+	end subroutine SZ_Compress_d3_Fortran_REAL_K8_ARGS_Rev
+
+	subroutine SZ_Compress_d4_Fortran_REAL_K8_Rev(VAR, ReValue, Bytes, OutSize)
+		! Rank-4 double-precision compression with a reserved value (ReValue); calls SZ_Compress_d4_Double_Rev.
+		implicit none
+		real(kind=8) :: VAR(:,:,:,:)
+		real(kind=8) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		NElem = R1*R2*R3*R4
+		! 8 entries per input element; intended by the original author as the largest possible output
+		allocate(Bytes(8*NElem))
+		call SZ_Compress_d4_Double_Rev(VAR, ReValue, Bytes, OutSize, R1, R2, R3, R4)
+	end subroutine SZ_Compress_d4_Fortran_REAL_K8_Rev
+
+	subroutine SZ_Compress_d4_Fortran_REAL_K8_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-4 double-precision compression with ReValue and error-bound settings (SZ_Compress_d4_Double_Rev_Args).
+		implicit none
+		real(kind=8) :: VAR(:,:,:,:)
+		real(kind=8) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		NElem = R1*R2*R3*R4
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d4_Double_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4)
+	end subroutine SZ_Compress_d4_Fortran_REAL_K8_ARGS_Rev
+
+	subroutine SZ_Compress_d5_Fortran_REAL_K8_Rev(VAR, ReValue, Bytes, OutSize)
+		! Rank-5 double-precision compression with a reserved value (ReValue); calls SZ_Compress_d5_Double_Rev.
+		implicit none
+		real(kind=8) :: VAR(:,:,:,:,:)
+		real(kind=8) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, R5, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		R5 = size(VAR, dim=5)
+		NElem = R1*R2*R3*R4*R5
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d5_Double_Rev(VAR, ReValue, Bytes, OutSize, R1, R2, R3, R4, R5)
+	end subroutine SZ_Compress_d5_Fortran_REAL_K8_Rev
+
+	subroutine SZ_Compress_d5_Fortran_REAL_K8_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Rank-5 double-precision compression with ReValue and error-bound settings (SZ_Compress_d5_Double_Rev_Args).
+		implicit none
+		real(kind=8) :: VAR(:,:,:,:,:)
+		real(kind=8) :: ReValue
+		integer(kind=1), allocatable :: Bytes(:)
+		integer(kind=C_SIZE_T) :: OutSize
+		integer(kind=4) :: ErrBoundMode
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, R5, NElem
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		R5 = size(VAR, dim=5)
+		NElem = R1*R2*R3*R4*R5
+		allocate(Bytes(8*NElem))   ! 8 entries per input element
+		call SZ_Compress_d5_Double_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4, R5)
+	end subroutine SZ_Compress_d5_Fortran_REAL_K8_ARGS_Rev
+
+!Decompress functions
+
+	subroutine SZ_Decompress_d1_Fortran_REAL_K4(Bytes, VAR, R1)
+		! Allocates VAR(R1), then hands Bytes and its length to SZ_Decompress_d1_Float.
+		implicit none
+		integer(kind=1) :: Bytes(:)
+		real(kind=4), allocatable :: VAR(:)
+		integer(kind=C_SIZE_T) :: R1, BLength
+		allocate(VAR(R1))
+		BLength = size(Bytes)   ! number of entries in Bytes
+		call SZ_Decompress_d1_Float(Bytes, BLength, VAR, R1)
+	end subroutine SZ_Decompress_d1_Fortran_REAL_K4
+
+	subroutine SZ_Decompress_d2_Fortran_REAL_K4(Bytes, VAR, R1, R2)
+		! Allocates VAR(R1,R2), then hands Bytes and its length to SZ_Decompress_d2_Float.
+		implicit none
+		integer(kind=1) :: Bytes(:)
+		real(kind=4), allocatable :: VAR(:,:)
+		integer(kind=C_SIZE_T) :: R1, R2, BLength
+		allocate(VAR(R1,R2))
+		BLength = size(Bytes)   ! number of entries in Bytes
+		call SZ_Decompress_d2_Float(Bytes, BLength, VAR, R1, R2)
+	end subroutine SZ_Decompress_d2_Fortran_REAL_K4
+
+	subroutine SZ_Decompress_d3_Fortran_REAL_K4(Bytes, VAR, R1, R2, R3)
+		! Allocates VAR(R1,R2,R3), then hands Bytes and its length to SZ_Decompress_d3_Float.
+		implicit none
+		integer(kind=1) :: Bytes(:)
+		real(kind=4), allocatable :: VAR(:,:,:)
+		integer(kind=C_SIZE_T) :: R1, R2, R3, BLength
+		allocate(VAR(R1,R2,R3))
+		BLength = size(Bytes)   ! number of entries in Bytes
+		call SZ_Decompress_d3_Float(Bytes, BLength, VAR, R1, R2, R3)
+	end subroutine SZ_Decompress_d3_Fortran_REAL_K4
+
+	subroutine SZ_Decompress_d4_Fortran_REAL_K4(Bytes, VAR, R1, R2, R3, R4)
+		! Allocates VAR(R1,R2,R3,R4), then hands Bytes and its length to SZ_Decompress_d4_Float.
+		implicit none
+		integer(kind=1) :: Bytes(:)
+		real(kind=4), allocatable :: VAR(:,:,:,:)
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, BLength
+		allocate(VAR(R1,R2,R3,R4))
+		BLength = size(Bytes)   ! number of entries in Bytes
+		call SZ_Decompress_d4_Float(Bytes, BLength, VAR, R1, R2, R3, R4)
+	end subroutine SZ_Decompress_d4_Fortran_REAL_K4
+
+	subroutine SZ_Decompress_d5_Fortran_REAL_K4(Bytes, VAR, R1, R2, R3, R4, R5)
+		! Allocates VAR(R1,R2,R3,R4,R5), then hands Bytes and its length to SZ_Decompress_d5_Float.
+		implicit none
+		integer(kind=1) :: Bytes(:)
+		real(kind=4), allocatable :: VAR(:,:,:,:,:)
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, R5, BLength
+		allocate(VAR(R1,R2,R3,R4,R5))
+		BLength = size(Bytes)   ! number of entries in Bytes
+		call SZ_Decompress_d5_Float(Bytes, BLength, VAR, R1, R2, R3, R4, R5)
+	end subroutine SZ_Decompress_d5_Fortran_REAL_K4
+
+	subroutine SZ_Decompress_d1_Fortran_REAL_K8(Bytes, VAR, R1)
+		! Allocates VAR(R1), then hands Bytes and its length to SZ_Decompress_d1_Double.
+		implicit none
+		integer(kind=1) :: Bytes(:)
+		real(kind=8), allocatable :: VAR(:)
+		integer(kind=C_SIZE_T) :: R1, BLength
+		allocate(VAR(R1))
+		BLength = size(Bytes)   ! number of entries in Bytes
+		call SZ_Decompress_d1_Double(Bytes, BLength, VAR, R1)
+	end subroutine SZ_Decompress_d1_Fortran_REAL_K8
+
+	subroutine SZ_Decompress_d2_Fortran_REAL_K8(Bytes, VAR, R1, R2)
+		! Allocates VAR(R1,R2), then hands Bytes and its length to SZ_Decompress_d2_Double.
+		implicit none
+		integer(kind=1) :: Bytes(:)
+		real(kind=8), allocatable :: VAR(:,:)
+		integer(kind=C_SIZE_T) :: R1, R2, BLength
+		allocate(VAR(R1,R2))
+		BLength = size(Bytes)   ! number of entries in Bytes
+		call SZ_Decompress_d2_Double(Bytes, BLength, VAR, R1, R2)
+	end subroutine SZ_Decompress_d2_Fortran_REAL_K8
+
+	subroutine SZ_Decompress_d3_Fortran_REAL_K8(Bytes, VAR, R1, R2, R3)
+		! Allocates VAR(R1,R2,R3), then hands Bytes and its length to SZ_Decompress_d3_Double.
+		implicit none
+		integer(kind=1) :: Bytes(:)
+		real(kind=8), allocatable :: VAR(:,:,:)
+		integer(kind=C_SIZE_T) :: R1, R2, R3, BLength
+		allocate(VAR(R1,R2,R3))
+		BLength = size(Bytes)   ! number of entries in Bytes
+		call SZ_Decompress_d3_Double(Bytes, BLength, VAR, R1, R2, R3)
+	end subroutine SZ_Decompress_d3_Fortran_REAL_K8
+
+	subroutine SZ_Decompress_d4_Fortran_REAL_K8(Bytes, VAR, R1, R2, R3, R4)
+		! Allocates VAR(R1,R2,R3,R4), then hands Bytes and its length to SZ_Decompress_d4_Double.
+		implicit none
+		integer(kind=1) :: Bytes(:)
+		real(kind=8), allocatable :: VAR(:,:,:,:)
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, BLength
+		allocate(VAR(R1,R2,R3,R4))
+		BLength = size(Bytes)   ! number of entries in Bytes
+		call SZ_Decompress_d4_Double(Bytes, BLength, VAR, R1, R2, R3, R4)
+	end subroutine SZ_Decompress_d4_Fortran_REAL_K8
+
+	subroutine SZ_Decompress_d5_Fortran_REAL_K8(Bytes, VAR, R1, R2, R3, R4, R5)
+		! Allocates VAR(R1,R2,R3,R4,R5), then hands Bytes and its length to SZ_Decompress_d5_Double.
+		implicit none
+		integer(kind=1) :: Bytes(:)
+		real(kind=8), allocatable :: VAR(:,:,:,:,:)
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, R5, BLength
+		allocate(VAR(R1,R2,R3,R4,R5))
+		BLength = size(Bytes)   ! Bytes is rank 1, so this equals its extent along dimension 1
+		call SZ_Decompress_d5_Double(Bytes, BLength, VAR, R1, R2, R3, R4, R5)
+	end subroutine SZ_Decompress_d5_Fortran_REAL_K8
+
+!--------batch add float
+
+	subroutine SZ_BatchAddVar_d1_Fortran_REAL_K4(varID, varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Forwards a rank-1 single-precision variable, its trimmed name length and extent to SZ_batchAddVar_d1_Float.
+		implicit none
+		character(len=*) :: varName
+		integer(kind=4) :: varID, ErrBoundMode
+		real(kind=4) :: VAR(:)
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1
+
+		R1 = size(VAR, dim=1)
+		call SZ_batchAddVar_d1_Float(varID, varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, R1)
+	end subroutine SZ_BatchAddVar_d1_Fortran_REAL_K4
+
+	subroutine SZ_BatchAddVar_d2_Fortran_REAL_K4(varID, varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Forwards a rank-2 single-precision variable, its trimmed name length and extents to SZ_batchAddVar_d2_Float.
+		implicit none
+		character(len=*) :: varName
+		integer(kind=4) :: varID, ErrBoundMode
+		real(kind=4) :: VAR(:,:)
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		call SZ_batchAddVar_d2_Float(varID, varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2)
+	end subroutine SZ_BatchAddVar_d2_Fortran_REAL_K4
+
+	subroutine SZ_BatchAddVar_d3_Fortran_REAL_K4(varID, varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Forwards a rank-3 single-precision variable, its trimmed name length and extents to SZ_batchAddVar_d3_Float.
+		implicit none
+		character(len=*) :: varName
+		integer(kind=4) :: varID, ErrBoundMode
+		real(kind=4) :: VAR(:,:,:)
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		call SZ_batchAddVar_d3_Float(varID, varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3)
+	end subroutine SZ_BatchAddVar_d3_Fortran_REAL_K4
+
+	subroutine SZ_BatchAddVar_d4_Fortran_REAL_K4(varID, varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Forwards a rank-4 single-precision variable, its trimmed name length and extents to SZ_batchAddVar_d4_Float.
+		implicit none
+		character(len=*) :: varName
+		integer(kind=4) :: varID, ErrBoundMode
+		real(kind=4) :: VAR(:,:,:,:)
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		call SZ_batchAddVar_d4_Float(varID, varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4)
+	end subroutine SZ_BatchAddVar_d4_Fortran_REAL_K4
+
+	subroutine SZ_BatchAddVar_d5_Fortran_REAL_K4(varID, varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Forwards a rank-5 single-precision variable, its trimmed name length and extents to SZ_batchAddVar_d5_Float.
+		implicit none
+		character(len=*) :: varName
+		integer(kind=4) :: varID, ErrBoundMode
+		real(kind=4) :: VAR(:,:,:,:,:)
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, R5
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		R5 = size(VAR, dim=5)
+		call SZ_batchAddVar_d5_Float(varID, varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, &
+			R1, R2, R3, R4, R5)
+	end subroutine SZ_BatchAddVar_d5_Fortran_REAL_K4
+
+!------batch add double
+	subroutine SZ_BatchAddVar_d1_Fortran_REAL_K8(varID, varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Forwards a rank-1 double-precision variable, its trimmed name length and extent to SZ_batchAddVar_d1_Double.
+		implicit none
+		character(len=*) :: varName
+		integer(kind=4) :: varID, ErrBoundMode
+		real(kind=8) :: VAR(:)
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1
+
+		R1 = size(VAR, dim=1)
+		call SZ_batchAddVar_d1_Double(varID, varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, R1)
+	end subroutine SZ_BatchAddVar_d1_Fortran_REAL_K8
+
+	subroutine SZ_BatchAddVar_d2_Fortran_REAL_K8(varID, varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Forwards a rank-2 double-precision variable, its trimmed name length and extents to SZ_batchAddVar_d2_Double.
+		implicit none
+		character(len=*) :: varName
+		integer(kind=4) :: varID, ErrBoundMode
+		real(kind=8) :: VAR(:,:)
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		call SZ_batchAddVar_d2_Double(varID, varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2)
+	end subroutine SZ_BatchAddVar_d2_Fortran_REAL_K8
+
+	subroutine SZ_BatchAddVar_d3_Fortran_REAL_K8(varID, varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Forwards a rank-3 double-precision variable, its trimmed name length and extents to SZ_batchAddVar_d3_Double.
+		implicit none
+		character(len=*) :: varName
+		integer(kind=4) :: varID, ErrBoundMode
+		real(kind=8) :: VAR(:,:,:)
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		call SZ_batchAddVar_d3_Double(varID, varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3)
+	end subroutine SZ_BatchAddVar_d3_Fortran_REAL_K8
+
+	subroutine SZ_BatchAddVar_d4_Fortran_REAL_K8(varID, varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Forwards a rank-4 double-precision variable, its trimmed name length and extents to SZ_batchAddVar_d4_Double.
+		implicit none
+		character(len=*) :: varName
+		integer(kind=4) :: varID, ErrBoundMode
+		real(kind=8) :: VAR(:,:,:,:)
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		call SZ_batchAddVar_d4_Double(varID, varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4)
+	end subroutine SZ_BatchAddVar_d4_Fortran_REAL_K8
+
+	subroutine SZ_BatchAddVar_d5_Fortran_REAL_K8(varID, varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Forwards a rank-5 double-precision variable, its trimmed name length and extents to SZ_batchAddVar_d5_Double.
+		implicit none
+		character(len=*) :: varName
+		integer(kind=4) :: varID, ErrBoundMode
+		real(kind=8) :: VAR(:,:,:,:,:)
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=C_SIZE_T) :: R1, R2, R3, R4, R5
+
+		R1 = size(VAR, dim=1)
+		R2 = size(VAR, dim=2)
+		R3 = size(VAR, dim=3)
+		R4 = size(VAR, dim=4)
+		R5 = size(VAR, dim=5)
+		call SZ_batchAddVar_d5_Double(varID, varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, &
+			R1, R2, R3, R4, R5)
+	end subroutine SZ_BatchAddVar_d5_Fortran_REAL_K8
+
+	SUBROUTINE SZ_GetVarDim(varName, DIMEN, R1, R2, R3, R4, R5)
+		! Forwards varName with its trimmed length to SZ_GetVarDim_c; DIMEN and R1..R5 are output arguments.
+		implicit none
+		integer(kind=C_SIZE_T), intent(out) :: R1, R2, R3, R4, R5
+		integer(kind=4), intent(out) :: DIMEN
+		character(len=*), intent(in) :: varName
+
+		call SZ_GetVarDim_c(varName, len_trim(varName), DIMEN, R1, R2, R3, R4, R5)
+	end subroutine SZ_GetVarDim
+
+	SUBROUTINE SZ_GetVarData_d1_Fortran_REAL_K4(varName, VAR)
+		! Rank-1 REAL(4): hands varName, its trimmed length and the allocatable VAR to SZ_getVarData_Float.
+		implicit none
+		real(kind=4), dimension(:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+
+		call SZ_getVarData_Float(varName, len_trim(varName), VAR)
+	end subroutine SZ_GetVarData_d1_Fortran_REAL_K4
+	
+	SUBROUTINE SZ_GetVarData_d2_Fortran_REAL_K4(varName, VAR)
+		! Rank-2 REAL(4): hands varName, its trimmed length and the allocatable VAR to SZ_getVarData_Float.
+		implicit none
+		real(kind=4), dimension(:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+
+		call SZ_getVarData_Float(varName, len_trim(varName), VAR)
+	end subroutine SZ_GetVarData_d2_Fortran_REAL_K4
+
+	SUBROUTINE SZ_GetVarData_d3_Fortran_REAL_K4(varName, VAR)
+		! Rank-3 REAL(4): hands varName, its trimmed length and the allocatable VAR to SZ_getVarData_Float.
+		implicit none
+		real(kind=4), dimension(:,:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+
+		call SZ_getVarData_Float(varName, len_trim(varName), VAR)
+	end subroutine SZ_GetVarData_d3_Fortran_REAL_K4
+
+	SUBROUTINE SZ_GetVarData_d4_Fortran_REAL_K4(varName, VAR)
+		! Rank-4 REAL(4): hands varName, its trimmed length and the allocatable VAR to SZ_getVarData_Float.
+		implicit none
+		real(kind=4), dimension(:,:,:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+
+		call SZ_getVarData_Float(varName, len_trim(varName), VAR)
+	end subroutine SZ_GetVarData_d4_Fortran_REAL_K4
+
+	SUBROUTINE SZ_GetVarData_d5_Fortran_REAL_K4(varName, VAR)
+		! Rank-5 REAL(4): hands varName, its trimmed length and the allocatable VAR to SZ_getVarData_Float.
+		implicit none
+		real(kind=4), dimension(:,:,:,:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+
+		call SZ_getVarData_Float(varName, len_trim(varName), VAR)
+	end subroutine SZ_GetVarData_d5_Fortran_REAL_K4
+
+	SUBROUTINE SZ_GetVarData_d1_Fortran_REAL_K8(varName, VAR)
+		! Rank-1 REAL(8): hands varName, its trimmed length and the allocatable VAR to SZ_getVarData_Double.
+		implicit none
+		real(kind=8), dimension(:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+
+		call SZ_getVarData_Double(varName, len_trim(varName), VAR)
+	end subroutine SZ_GetVarData_d1_Fortran_REAL_K8
+	
+	SUBROUTINE SZ_GetVarData_d2_Fortran_REAL_K8(varName, VAR)
+		! Rank-2 REAL(8): hands varName, its trimmed length and the allocatable VAR to SZ_getVarData_Double.
+		implicit none
+		real(kind=8), dimension(:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+
+		call SZ_getVarData_Double(varName, len_trim(varName), VAR)
+	end subroutine SZ_GetVarData_d2_Fortran_REAL_K8
+
+	SUBROUTINE SZ_GetVarData_d3_Fortran_REAL_K8(varName, VAR)
+		! Rank-3 REAL(8): hands varName, its trimmed length and the allocatable VAR to SZ_getVarData_Double.
+		implicit none
+		real(kind=8), dimension(:,:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+
+		call SZ_getVarData_Double(varName, len_trim(varName), VAR)
+	end subroutine SZ_GetVarData_d3_Fortran_REAL_K8
+
+	SUBROUTINE SZ_GetVarData_d4_Fortran_REAL_K8(varName, VAR)
+		! Rank-4 REAL(8): hands varName, its trimmed length and the allocatable VAR to SZ_getVarData_Double.
+		implicit none
+		real(kind=8), dimension(:,:,:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+
+		call SZ_getVarData_Double(varName, len_trim(varName), VAR)
+	end subroutine SZ_GetVarData_d4_Fortran_REAL_K8
+
+	SUBROUTINE SZ_GetVarData_d5_Fortran_REAL_K8(varName, VAR)
+		! Rank-5 REAL(8): hands varName, its trimmed length and the allocatable VAR to SZ_getVarData_Double.
+		implicit none
+		real(kind=8), dimension(:,:,:,:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+
+		call SZ_getVarData_Double(varName, len_trim(varName), VAR)
+	end subroutine SZ_GetVarData_d5_Fortran_REAL_K8
+
+END MODULE SZ
diff --git a/deps/SZ/sz/src/sz_omp.c b/deps/SZ/sz/src/sz_omp.c
new file mode 100644
index 0000000000000000000000000000000000000000..362b6bf6955103d6f422012982ebd3b5f1512800
--- /dev/null
+++ b/deps/SZ/sz/src/sz_omp.c
@@ -0,0 +1,986 @@
+/**
+ *  @file sz_omp.c
+ *  @author Xin Liang
+ *  @date July, 2017
+ *  @brief the implementation of openMP version
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "sz_omp.h"
+#include <math.h>
+#include <time.h>
+
+double sz_wtime(){
+    /* Wall-clock time in seconds: the OpenMP timer when available, otherwise CLOCK_MONOTONIC. */
+#if defined(_OPENMP)
+    return omp_get_wtime();
+#else
+    struct timespec now;
+    clock_gettime(CLOCK_MONOTONIC, &now);
+    return (double)now.tv_sec + (double)now.tv_nsec / 1e9;
+#endif
+}
+
+int sz_get_max_threads(){ /* omp_get_max_threads() in an OpenMP build, otherwise 1. */
+#ifndef _OPENMP
+    return 1;
+#else
+    return omp_get_max_threads();
+#endif
+}
+
+int sz_get_thread_num(){ /* omp_get_thread_num() in an OpenMP build, otherwise always 0. */
+#ifndef _OPENMP
+    return 0;
+#else
+    return omp_get_thread_num();
+#endif
+}
+
+void sz_set_num_threads(int nthreads){ /* Forwards to omp_set_num_threads(); does nothing without OpenMP. */
+#if defined(_OPENMP)
+    omp_set_num_threads(nthreads);
+#endif
+}
+
+unsigned char * SZ_compress_float_1D_MDQ_openmp(float *oriData, size_t r1, double realPrecision, size_t * comp_size){
+	return NULL; /* placeholder: no 1D OpenMP path here; always NULL, and *comp_size is not written */
+}
+unsigned char * SZ_compress_float_2D_MDQ_openmp(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size){
+	return NULL; /* placeholder: no 2D OpenMP path here; always NULL, and *comp_size is not written */
+}
+
+unsigned char * SZ_compress_float_3D_MDQ_openmp(float *oriData, size_t r1, size_t r2, size_t r3, float realPrecision, size_t * comp_size){
+	/*
+	 * OpenMP compressor for a 3D float array of r1 x r2 x r3 elements (r3 varies fastest).
+	 * The thread count is rounded down to a power of two and the array is cut into that many
+	 * blocks (num_x * num_y * num_z). Every block is processed on its own by
+	 * SZ_compress_float_3D_MDQ_RA_block, one Huffman tree is built over all blocks, and every block is then encoded on its own.
+	 *
+	 * realPrecision: error bound; passed to the block compressor and stored in the stream.
+	 * comp_size:     receives the number of bytes in the returned stream.
+	 * Returns a malloc'ed buffer owned by the caller. Phase timings are printed to stdout.
+	 */
+	double elapsed_time = -sz_wtime(); /* double: a float cannot resolve short intervals on a large clock value */
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		/* optQuantMode 1: derive the number of quantization intervals from the data,
+		 * then report it and publish it through updateQuantizationInfo(). */
+		quantization_intervals = optimize_intervals_float_3D_opt(oriData, r1, r2, r3, realPrecision);
+		printf("3D number of bins: %d\nerror bound %.20f\n", quantization_intervals, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	elapsed_time += sz_wtime();
+	printf("opt interval time: %.4f\n", elapsed_time);
+
+	elapsed_time = -sz_wtime();
+	int thread_num = sz_get_max_threads();
+	int thread_order = (int)log2(thread_num);
+	size_t num_x = 0, num_y = 0, num_z = 0;
+	{ /* 2^thread_order blocks; leftover factors of two go to x first, then y */
+		int block_thread_order = thread_order / 3;
+		switch(thread_order % 3){
+			case 0:{
+				num_x = 1 << block_thread_order;
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 1:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 2:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << (block_thread_order + 1);
+				num_z = 1 << block_thread_order;
+				break;
+			}
+		}
+		thread_num = num_x * num_y * num_z; /* power of two <= the initial thread count */
+	}
+	sz_set_num_threads(thread_num);
+	printf("number of blocks: %zu %zu %zu\n", num_x, num_y, num_z);
+
+	/* calculate block dims */
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t max_num_block_elements = early_blockcount_x * early_blockcount_y * early_blockcount_z;
+	size_t num_blocks = num_x * num_y * num_z;
+	size_t num_elements = r1 * r2 * r3;
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;
+
+	/*
+	 * Work buffers. Whatever is written per block is sized with max_num_block_elements (or the
+	 * early_blockcount_* values it is built from) and addressed by block id, so each block has
+	 * its own slice. result is the output stream and the only buffer that is not freed below.
+	 */
+	size_t buffer_size = early_blockcount_y * early_blockcount_z * sizeof(float);
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+	unsigned int * unpredictable_count = (unsigned int *) malloc(num_blocks * sizeof(unsigned int));
+	float * mean = malloc(num_blocks * sizeof(float));
+	float * buffer0, * buffer1;
+	buffer0 = (float *) malloc(buffer_size * thread_num);
+	buffer1 = (float *) malloc(buffer_size * thread_num);
+	unsigned char * result = (unsigned char *) malloc(num_elements * (sizeof(int) + sizeof(float)));
+	size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	unsigned char * encoding_buffer = (unsigned char *) malloc(max_num_block_elements * sizeof(int) * num_blocks);
+	size_t * block_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	size_t *freq = (size_t *)malloc(thread_num*quantization_intervals*4*sizeof(size_t));
+	memset(freq, 0, thread_num*quantization_intervals*4*sizeof(size_t));
+
+	size_t stateNum = quantization_intervals*2;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	int num_yz = num_y * num_z;
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		/* The loop index is the block id. The executing thread's number is only equal to it when the
+		 * team has exactly thread_num threads and each one runs the iteration matching its number. */
+		int id = t;
+		int i = id/(num_yz);
+		int j = (id % num_yz) / num_z;
+		int k = id % num_z;
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		float * data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+
+		float * unpredictable_data = result_unpredictable_data + id * unpred_data_max_size;
+		/* P0/P1: this block's slices of buffer0/buffer1, so nothing is allocated per block */
+		float * P0 = buffer0 + id * early_blockcount_y * early_blockcount_z;
+		float * P1 = buffer1 + id * early_blockcount_y * early_blockcount_z;
+
+		unpredictable_count[id] = SZ_compress_float_3D_MDQ_RA_block(data_pos, mean + id, r1, r2, r3,
+			current_blockcount_x, current_blockcount_y, current_blockcount_z,
+			realPrecision, P0, P1, type, unpredictable_data);
+	}
+	elapsed_time += sz_wtime();
+	printf("compression and quantization time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+	/*
+	 * Huffman stage: one tree is built from the type codes of all blocks; nodeCount is
+	 * 2 * (number of states that received a code) - 1.
+	 *
+	 * Stream layout written below: metadata | thread_num | realPrecision | quantization_intervals |
+	 * treeByteSize | nodeCount | tree bytes | unpredictable_count[] | mean[] | unpredictable values |
+	 * encoded size of each block (size_t) | encoded blocks, in block order.
+	 */
+
+	size_t nodeCount = 0;
+	Huffman_init_openmp(huffmanTree, result_type, num_elements, thread_num, freq);
+	elapsed_time += sz_wtime();
+	printf("Build Huffman: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+	for (size_t i = 0; i < stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++;
+	nodeCount = nodeCount*2-1;
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	/* total number of unpredictable values over all blocks */
+	size_t total_unpred = 0;
+	for(size_t b=0; b<num_blocks; b++){
+		total_unpred += unpredictable_count[b];
+	}
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	result_pos += meta_data_offset;
+
+	/* Fixed-size header fields: thread count, error bound, interval count, tree size and
+	 * node count (the integers as 4-byte big-endian values), followed by the tree itself. */
+	intToBytes_bigEndian(result_pos, thread_num);
+	result_pos += 4;
+	floatToBytes(result_pos, realPrecision);
+	result_pos += sizeof(float);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += 4;
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += 4;
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += 4;
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+
+	memcpy(result_pos, unpredictable_count, num_blocks * sizeof(unsigned int));
+	result_pos += num_blocks * sizeof(unsigned int);
+	memcpy(result_pos, mean, num_blocks * sizeof(float));
+	result_pos += num_blocks * sizeof(float);
+	/*
+	 * Unpredictable values: block t's values start unpred_offset[t] floats into this section
+	 * (exclusive prefix sum of the per-block counts), so every block can be copied independently.
+	 */
+	unpred_offset[0] = 0;
+	for(int t=1; t<thread_num; t++){
+		unpred_offset[t] = unpredictable_count[t-1] + unpred_offset[t-1];
+	}
+
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		/* index by t, not by the executing thread's number: correct for any team size or schedule */
+		const float * unpredictable_data = result_unpredictable_data + t * unpred_data_max_size;
+		unsigned char * dst = result_pos + unpred_offset[t] * sizeof(float);
+		size_t nbytes = unpredictable_count[t] * sizeof(float);
+		memcpy(dst, unpredictable_data, nbytes);
+	}
+
+	result_pos += total_unpred * sizeof(float);
+
+	elapsed_time += sz_wtime();
+	printf("write misc time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+
+	size_t * block_pos = (size_t *) result_pos; /* encoded size of each block, filled in by the loop below */
+	result_pos += num_blocks * sizeof(size_t);
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = t; /* block id = loop index, as in the compression loop */
+		int i = id/(num_yz);
+		int j = (id % num_yz) / num_z;
+		int k = id % num_z;
+		unsigned char * encoding_buffer_pos = encoding_buffer + id * max_num_block_elements * sizeof(int);
+		size_t enCodeSize = 0;
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+		encode(huffmanTree, type, current_block_elements, encoding_buffer_pos, &enCodeSize);
+		block_pos[id] = enCodeSize;
+	}
+	elapsed_time += sz_wtime();
+	printf("Parallel Huffman encoding elapsed time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+	/*
+	 * Pack the per-block Huffman output back to back: block_offset[] is the exclusive prefix
+	 * sum of the encoded sizes stored in block_pos[].
+	 */
+	block_offset[0] = 0;
+	for(int t=1; t<thread_num; t++){
+		block_offset[t] = block_pos[t-1] + block_offset[t-1];
+	}
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		const unsigned char * src = encoding_buffer + t * max_num_block_elements * sizeof(int);
+		memcpy(result_pos + block_offset[t], src, block_pos[t]);
+	}
+	result_pos += block_offset[thread_num - 1] + block_pos[thread_num - 1];
+
+	elapsed_time += sz_wtime();
+	printf("Final copy elapsed time: %.4f\n", elapsed_time);
+
+	/* result_pos now points one past the last byte written */
+	size_t totalEncodeSize = result_pos - result;
+
+	/* release all scratch buffers; result itself is handed to the caller */
+	free(freq);
+	free(buffer0);
+	free(buffer1);
+	free(treeBytes);
+	free(unpred_offset);
+	free(block_offset);
+	free(encoding_buffer);
+	free(mean);
+	free(result_unpredictable_data);
+	free(unpredictable_count);
+	free(result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	*comp_size = totalEncodeSize;
+	return result;
+}
+
+void decompressDataSeries_float_1D_openmp(float** data, size_t r1, unsigned char* comp_data){
+} /* empty placeholder: nothing is decompressed and *data is left untouched */
+
+void decompressDataSeries_float_2D_openmp(float** data, size_t r1, size_t r2, unsigned char* comp_data){
+} /* empty placeholder: nothing is decompressed and *data is left untouched */
+
+void decompressDataSeries_float_3D_openmp(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data){
+	/*
+	 * OpenMP decompressor for a stream laid out as SZ_compress_float_3D_MDQ_openmp writes it.
+	 *
+	 * data:      receives a malloc'ed array of r1 * r2 * r3 floats owned by the caller, or NULL
+	 *            when the block count stored in the stream is not positive.
+	 * comp_data: must point at the thread-count field; the metadata bytes that the compressor
+	 *            writes in front of it are not skipped here.
+	 *
+	 * Stream layout read here: thread_num | realPrecision | intervals | tree size | node count |
+	 * tree bytes | unpredictable count per block | mean per block | unpredictable values |
+	 * encoded size per block (size_t) | encoded blocks.
+	 *
+	 * Blocks are addressed by loop index, so the output does not depend on how many threads
+	 * actually execute the parallel loops.
+	 */
+	if(confparams_dec==NULL)
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+	memset(confparams_dec, 0, sizeof(sz_params));
+	if(exe_params==NULL)
+		exe_params = (sz_exedata*)malloc(sizeof(sz_exedata));
+	memset(exe_params, 0, sizeof(sz_exedata));
+
+	double elapsed_time = -sz_wtime();
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;
+	size_t num_elements = r1 * r2 * r3;
+
+	unsigned char * comp_data_pos = comp_data;
+
+	int thread_num = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += 4;
+	if(thread_num < 1){
+		/* corrupt or foreign stream: log2() of a non-positive count is meaningless */
+		*data = NULL;
+		return;
+	}
+
+	/* Rebuild the block grid: 2^thread_order blocks, leftover factors of two go to x first,
+	 * then y (the same factorisation the compressor uses). */
+	int thread_order = (int)log2(thread_num);
+	size_t num_x = 0, num_y = 0, num_z = 0;
+	{
+		int block_thread_order = thread_order / 3;
+		switch(thread_order % 3){
+			case 0:{
+				num_x = 1 << block_thread_order;
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 1:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 2:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << (block_thread_order + 1);
+				num_z = 1 << block_thread_order;
+				break;
+			}
+		}
+		thread_num = num_x * num_y * num_z; /* keep the loop bounds equal to num_blocks, as the compressor does */
+	}
+
+	printf("number of blocks: %zu %zu %zu, thread_num %d\n", num_x, num_y, num_z, thread_num);
+	sz_set_num_threads(thread_num);
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t num_blocks = num_x * num_y * num_z;
+	/* *data is the output; the other three arrays are scratch space released before returning */
+	size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	*data = (float*)malloc(sizeof(float)*num_elements);
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t * block_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+
+	/* header fields, in the order the compressor wrote them */
+	float realPrecision = bytesToFloat(comp_data_pos);
+	comp_data_pos += sizeof(float);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += 4; /* a 4-byte big-endian int was read, not a float */
+
+	size_t stateNum = intervals*2;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	updateQuantizationInfo(intervals);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(unsigned int);
+	size_t huffman_nodes = bytesToInt_bigEndian(comp_data_pos);
+	huffmanTree->allNodes = huffman_nodes;
+	/* the serialised tree starts right after the 4-byte node count */
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+4, huffmanTree->allNodes);
+
+	comp_data_pos += 4 + tree_size;
+	/* per-block tables: unpredictable counts, means, then the unpredictable values themselves */
+	unsigned int * unpred_count = (unsigned int *) comp_data_pos;
+	comp_data_pos += num_blocks * sizeof(unsigned int);
+	float * mean_pos = (float *) comp_data_pos;
+	comp_data_pos += num_blocks * sizeof(float);
+	float * result_unpredictable_data = (float *) comp_data_pos;
+	size_t total_unpred = 0;
+	for(size_t b=0; b<num_blocks; b++){
+		unpred_offset[b] = total_unpred; /* exclusive prefix sum: index of block b's first value */
+		total_unpred += unpred_count[b];
+	}
+	comp_data_pos += total_unpred * sizeof(float);
+
+	/* encoded size of every block, followed by the encoded blocks back to back */
+	size_t * block_pos = (size_t *) comp_data_pos;
+	comp_data_pos += num_blocks * sizeof(size_t);
+	/* block_offset[] is the exclusive prefix sum of those encoded sizes */
+	block_offset[0] = 0;
+	for(int t=1; t<thread_num; t++){
+		block_offset[t] = block_pos[t-1] + block_offset[t-1];
+	}
+	int num_yz = num_y * num_z;
+	elapsed_time += sz_wtime();
+	printf("Read data info elapsed time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		/* Block id = loop index. The executing thread's number must not be used here: in a build
+		 * without OpenMP it is always 0, so block 0 would be decoded thread_num times and every
+		 * other block never. */
+		int id = t;
+		int i = id/(num_yz);
+		int j = (id % num_yz) / num_z;
+		int k = id % num_z;
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+		decode(comp_data_pos + block_offset[id], current_blockcount_x*current_blockcount_y*current_blockcount_z, root, type);
+	}
+	elapsed_time += sz_wtime();
+	printf("Parallel Huffman decoding elapsed time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = t; /* block id = loop index, as in the decoding loop */
+		int i = id/(num_yz);
+		int j = (id % num_yz) / num_z;
+		int k = id % num_z;
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		float * data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+
+		/* this block's slice of the unpredictable values, and its stored mean */
+		float * unpredictable_data = result_unpredictable_data + unpred_offset[id];
+		float mean = mean_pos[id];
+		decompressDataSeries_float_3D_RA_block(data_pos, mean, r1, r2, r3,
+			current_blockcount_x, current_blockcount_y, current_blockcount_z,
+			realPrecision, type, unpredictable_data);
+	}
+	elapsed_time += sz_wtime();
+	printf("Parallel decompress elapsed time: %.4f\n", elapsed_time);
+
+	/* unpred_count, mean_pos and block_pos point into comp_data and are therefore not freed */
+	free(block_offset);
+	free(result_type);
+	free(unpred_offset);
+	SZ_ReleaseHuffman(huffmanTree);
+}
+
+//Double Precision
+
+unsigned char * SZ_compress_double_1D_MDQ_openmp(double *oriData, size_t r1, double realPrecision, size_t * comp_size){
+	return NULL; /* placeholder: no 1D OpenMP path here; always NULL, and *comp_size is not written */
+}
+
+unsigned char * SZ_compress_double_2D_MDQ_openmp(double *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size){
+	return NULL; /* placeholder: no 2D OpenMP path here; always NULL, and *comp_size is not written */
+}
+
+unsigned char * SZ_compress_double_3D_MDQ_openmp(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size){
+
+	float elapsed_time = 0.0;
+
+	elapsed_time = -sz_wtime();
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		// quantization_intervals = optimize_intervals_float_3D(oriData, r1, realPrecision);
+		quantization_intervals = optimize_intervals_double_3D_opt(oriData, r1, r2, r3, realPrecision);
+		//quantization_intervals = 32768;
+		printf("3D number of bins: %d\nerror bound %.20f\n", quantization_intervals, realPrecision);
+		// exit(0);		
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+	elapsed_time += sz_wtime();
+	printf("opt interval time: %.4f\n", elapsed_time);
+
+	elapsed_time = -sz_wtime();
+	int thread_num = sz_get_max_threads();
+	int thread_order = (int)log2(thread_num);
+	size_t num_x = 0, num_y = 0, num_z = 0;
+	{
+		int block_thread_order = thread_order / 3;
+		switch(thread_order % 3){
+			case 0:{
+				num_x = 1 << block_thread_order;
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 1:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 2:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << (block_thread_order + 1);
+				num_z = 1 << block_thread_order;
+				break;
+			}
+		}
+		thread_num = num_x * num_y * num_z;
+	}
+	sz_set_num_threads(thread_num);
+	// calculate block dims
+	printf("number of blocks: %zu %zu %zu\n", num_x, num_y, num_z);
+
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t max_num_block_elements = early_blockcount_x * early_blockcount_y * early_blockcount_z;
+	size_t num_blocks = num_x * num_y * num_z;
+	size_t num_elements = r1 * r2 * r3;
+	// printf("max_num_block_elements %d num_blocks %d\n", max_num_block_elements, num_blocks);
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;
+	
+	// printf("malloc blockinfo array start\n");
+	// fflush(stdout);
+
+	size_t buffer_size = early_blockcount_y * early_blockcount_z * sizeof(double);
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	double * result_unpredictable_data = (double *) malloc(unpred_data_max_size * sizeof(double) * num_blocks);
+	unsigned int * unpredictable_count = (unsigned int *) malloc(num_blocks * sizeof(unsigned int));
+	double * mean = malloc(num_blocks * sizeof(double));
+	double * buffer0, * buffer1;
+	buffer0 = (double *) malloc(buffer_size * thread_num);
+	buffer1 = (double *) malloc(buffer_size * thread_num);
+	unsigned char * result = (unsigned char *) malloc(num_elements * (sizeof(int) + sizeof(double)));
+	size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	unsigned char * encoding_buffer = (unsigned char *) malloc(max_num_block_elements * sizeof(int) * num_blocks);
+	size_t * block_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	size_t *freq = (size_t *)malloc(thread_num*quantization_intervals*4*sizeof(size_t));
+	memset(freq, 0, thread_num*quantization_intervals*4*sizeof(size_t));
+	
+	size_t stateNum = quantization_intervals*2;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+	
+	int num_yz = num_y * num_z;
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = sz_get_thread_num();
+		int i = id/(num_yz);
+		int j = (id % num_yz) / num_z;
+		int k = id % num_z;
+		// printf("%d: %d %d %d\n", sz_get_thread_num(), i, j, k);
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		double * data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+
+		double * unpredictable_data = result_unpredictable_data + id * unpred_data_max_size;
+		double *P0, *P1; // buffer
+
+		P0 = buffer0 + id * early_blockcount_y * early_blockcount_z;
+		P1 = buffer1 + id * early_blockcount_y * early_blockcount_z;
+		unpredictable_count[id] = SZ_compress_double_3D_MDQ_RA_block(data_pos, mean + id, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, realPrecision, P0, P1, type, unpredictable_data);
+	}
+	elapsed_time += sz_wtime();
+	printf("compression and quantization time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+	// printf("unpred count:\n");
+	// for(int i=0; i<num_blocks; i++){
+	// 	printf("%d ", unpredictable_count[i]);
+	// }
+	// printf("\n");
+	// printf("total_unpred num: %d\n", total_unpred);
+	// printf("Block wise compression end, num_elements %ld\n", num_elements);
+	// huffman encode
+
+	size_t nodeCount = 0;
+	Huffman_init_openmp(huffmanTree, result_type, num_elements, thread_num, freq);
+	elapsed_time += sz_wtime();
+	printf("Build Huffman: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+	for (size_t i = 0; i < stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++;
+	nodeCount = nodeCount*2-1;
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	size_t total_unpred = 0;
+	for(int i=0; i<num_blocks; i++){
+		total_unpred += unpredictable_count[i];
+		// printf("%d: %d mean %.2f\n", i, unpredictable_count[i], mean[i]);
+	}
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	result_pos += meta_data_offset;
+
+	size_t enCodeSize = 0;
+
+	intToBytes_bigEndian(result_pos, thread_num);
+	result_pos += sizeof(int);
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += sizeof(double);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += 4;
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += 4;
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += 4;
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+
+	memcpy(result_pos, unpredictable_count, num_blocks * sizeof(unsigned int));
+	result_pos += num_blocks * sizeof(unsigned int);
+	memcpy(result_pos, mean, num_blocks * sizeof(double));
+	result_pos += num_blocks * sizeof(double);	
+
+	unpred_offset[0] = 0;
+	for(int t=1; t<thread_num; t++){
+		unpred_offset[t] = unpredictable_count[t-1] + unpred_offset[t-1];
+	}
+	
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = sz_get_thread_num();
+		double * unpredictable_data = result_unpredictable_data + id * unpred_data_max_size;
+		memcpy(result_pos + unpred_offset[id] * sizeof(double), unpredictable_data, unpredictable_count[id] * sizeof(double));		
+	}
+	result_pos += total_unpred * sizeof(double);
+
+	elapsed_time += sz_wtime();
+	printf("write misc time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+
+	size_t * block_pos = (size_t *) result_pos;
+	result_pos += num_blocks * sizeof(size_t);
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = sz_get_thread_num();
+		int i = id/(num_yz);
+		int j = (id % num_yz) / num_z;
+		int k = id % num_z;
+		unsigned char * encoding_buffer_pos = encoding_buffer + id * max_num_block_elements * sizeof(int);
+		size_t enCodeSize = 0;
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+		encode(huffmanTree, type, current_block_elements, encoding_buffer_pos, &enCodeSize);
+		block_pos[id] = enCodeSize;
+	}
+	elapsed_time += sz_wtime();
+	printf("Parallel Huffman encoding elapsed time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+	// for(int t=0; t<thread_num; t++){
+	// 	memcpy(result_pos, encoding_buffer + t * max_num_block_elements * sizeof(int), block_pos[t]);
+	// 	result_pos += block_pos[t];
+	// }
+	block_offset[0] = 0;
+	for(int t=1; t<thread_num; t++){
+		block_offset[t] = block_pos[t-1] + block_offset[t-1];
+	}
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = sz_get_thread_num();
+		memcpy(result_pos + block_offset[id], encoding_buffer + t * max_num_block_elements * sizeof(int), block_pos[t]);		
+	}
+	result_pos += block_offset[thread_num - 1] + block_pos[thread_num - 1];
+
+	elapsed_time += sz_wtime();
+	printf("Final copy elapsed time: %.4f\n", elapsed_time);
+	// {
+	// 	int status;
+	// 	writeIntData_inBytes(result_type, num_elements, "/Users/LiangXin/github/SZ-develop/example/openmp/comp001_type.dat", &status);
+	// }
+
+	// int status;
+	// writeIntData_inBytes(result_type, num_elements, "/Users/LiangXin/github/SZ-develop/example/openmp/omp_type.dat", &status);
+	// printf("type array size: %ld\n", enCodeSize);
+	result_pos += enCodeSize;
+	size_t totalEncodeSize = 0;
+	totalEncodeSize = result_pos - result;
+	// printf("Total size %ld\n", totalEncodeSize);
+	free(freq);
+	free(buffer0);
+	free(buffer1);
+	free(treeBytes);
+	free(unpred_offset);
+	free(block_offset);
+	free(encoding_buffer);
+	free(mean);
+	free(result_unpredictable_data);
+	free(unpredictable_count);
+	free(result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	*comp_size = totalEncodeSize;
+	return result;
+}
+
+void decompressDataSeries_double_1D_openmp(double** data, size_t r1, unsigned char* comp_data){
+} /* empty stub: the body has no statements, so none of the arguments is read or written */
+
+void decompressDataSeries_double_2D_openmp(double** data, size_t r1, size_t r2, unsigned char* comp_data){
+} /* empty stub: the body has no statements, so none of the arguments is read or written */
+
+/**
+ * Decompress a 3D double array from the OpenMP block-wise stream format.
+ * Fields are consumed in this order: thread_num (int), error bound (double),
+ * interval count (int), Huffman tree size (int), node count (int), tree bytes,
+ * per-block unpredictable counts, per-block means, unpredictable values,
+ * per-block encoded sizes, then the Huffman-encoded type codes of every block.
+ * @param data      out: *data is set to a malloc'ed array of r1*r2*r3 doubles
+ * @param r1,r2,r3  array dimensions; r3 is the contiguous one (stride 1)
+ * @param comp_data compressed stream, starting at the thread_num field
+ */
+void decompressDataSeries_double_3D_openmp(double** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data)
+{
+	if(confparams_dec==NULL)
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+	memset(confparams_dec, 0, sizeof(sz_params));
+	if(exe_params==NULL)
+		exe_params = (sz_exedata*)malloc(sizeof(sz_exedata));
+	memset(exe_params, 0, sizeof(sz_exedata));	
+	double elapsed_time = 0.0;
+	elapsed_time = -sz_wtime();
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;
+	size_t num_elements = r1 * r2 * r3;
+	unsigned char * comp_data_pos = comp_data;
+	// No header offset is applied: the thread count is read from the very start of comp_data.
+
+	int thread_num = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	int thread_order = (int)log2(thread_num);
+	size_t num_x = 0, num_y = 0, num_z = 0;
+	{
+		int block_thread_order = thread_order / 3;
+		switch(thread_order % 3){
+			case 0:{
+				num_x = 1 << block_thread_order;
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 1:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 2:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << (block_thread_order + 1);
+				num_z = 1 << block_thread_order;
+				break;
+			}
+		}
+	}
+	printf("number of blocks: %zu %zu %zu, thread_num %d\n", num_x, num_y, num_z, thread_num);
+	sz_set_num_threads(thread_num);
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t num_blocks = num_x * num_y * num_z;
+	size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	*data = (double*)malloc(sizeof(double)*num_elements);
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t * block_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+
+	double realPrecision = bytesToDouble(comp_data_pos);
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(unsigned int); // the interval count occupies 4 bytes, not sizeof(double)
+
+	size_t stateNum = intervals*2;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	updateQuantizationInfo(intervals);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(unsigned int);
+	size_t huffman_nodes = bytesToInt_bigEndian(comp_data_pos);
+	huffmanTree->allNodes = huffman_nodes;
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+4, huffmanTree->allNodes);
+
+	comp_data_pos += 4 + tree_size;
+	unsigned int * unpred_count = (unsigned int *) comp_data_pos;
+	comp_data_pos += num_blocks * sizeof(unsigned int);
+	double * mean_pos = (double *) comp_data_pos;
+	comp_data_pos += num_blocks * sizeof(double);
+	double * result_unpredictable_data = (double *) comp_data_pos;
+	size_t total_unpred = 0;
+	for(int i=0; i<num_blocks; i++){
+		unpred_offset[i] = total_unpred;
+		total_unpred += unpred_count[i];
+	}
+	comp_data_pos += total_unpred * sizeof(double);
+
+	size_t * block_pos = (size_t *) comp_data_pos;
+	comp_data_pos += num_blocks * sizeof(size_t);
+	block_offset[0] = 0;
+	for(int t=1; t<thread_num; t++){
+		block_offset[t] = block_pos[t-1] + block_offset[t-1];
+	}
+	int num_yz = num_y * num_z;
+	elapsed_time += sz_wtime();
+	printf("Read data info elapsed time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = t; // block index = loop index, so each block is decoded once whatever the thread count
+		int i = id/(num_yz);
+		int j = (id % num_yz) / num_z;
+		int k = id % num_z;
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+		decode(comp_data_pos + block_offset[id], current_blockcount_x*current_blockcount_y*current_blockcount_z, root, type);
+	}
+	elapsed_time += sz_wtime();
+	printf("Parallel Huffman decoding elapsed time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = t; // block index = loop index (see the decoding loop above)
+		int i = id/(num_yz);
+		int j = (id % num_yz) / num_z;
+		int k = id % num_z;
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		double * data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+
+		double * unpredictable_data = result_unpredictable_data + unpred_offset[id];
+		double mean = mean_pos[id];
+
+		decompressDataSeries_double_3D_RA_block(data_pos, mean, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, realPrecision, type, unpredictable_data);
+	}	
+	elapsed_time += sz_wtime();
+	printf("Parallel decompress elapsed time: %.4f\n", elapsed_time);
+
+	free(block_offset);
+	free(result_type);
+	free(unpred_offset);
+	SZ_ReleaseHuffman(huffmanTree);
+}
+
+/**
+ * Build the Huffman tree for symbol array s from per-block histograms.
+ * @param huffmanTree tree to fill; allNodes is the width of one histogram
+ * @param s           `length` symbols, each used as an index into a histogram
+ * @param thread_num  number of blocks s is split into, one histogram per block
+ * @param freq        thread_num*allNodes counters supplied by the caller (not cleared
+ *                    here); on return the first allNodes entries hold the merged counts
+ */
+void Huffman_init_openmp(HuffmanTree* huffmanTree, int *s, size_t length, int thread_num, size_t * freq){
+	size_t i;
+	// Ceiling division; an empty input gives empty blocks instead of wrapping length-1.
+	size_t block_size = (length == 0) ? 0 : (length - 1)/ thread_num + 1;
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		// Index by iteration t (not the thread id) so each block is counted exactly once.
+		size_t begin = (size_t)t * block_size;
+		if(begin > length) begin = length;
+		size_t end = begin + block_size;
+		if(end > length) end = length; // last block(s) may be short or empty
+		size_t * freq_pos = freq + (size_t)t * huffmanTree->allNodes;
+		for(size_t j=begin; j<end; j++){
+			freq_pos[s[j]] ++;
+		}
+	}
+	// Fold histograms 1..thread_num-1 into histogram 0.
+	size_t * freq_pos = freq + huffmanTree->allNodes;
+	for(int t=1; t<thread_num; t++){
+		for(i = 0; i<huffmanTree->allNodes; i++){
+			freq[i] += freq_pos[i];
+		}
+		freq_pos += huffmanTree->allNodes;
+	}
+	// One leaf per symbol that occurs, then join removed node pairs until qend reaches 2.
+	for (i = 0; i < huffmanTree->allNodes; i++)
+		if (freq[i]) 
+			qinsert(huffmanTree, new_node(huffmanTree, freq[i], i, 0, 0));
+	while (huffmanTree->qend > 2) 
+		qinsert(huffmanTree, new_node(huffmanTree, 0, 0, qremove(huffmanTree), qremove(huffmanTree)));
+	// qq[1] is passed as the root for code generation.
+	build_code(huffmanTree, huffmanTree->qq[1], 0, 0, 0);
+}
+
+
+
diff --git a/deps/SZ/sz/src/sz_stats.c b/deps/SZ/sz/src/sz_stats.c
new file mode 100644
index 0000000000000000000000000000000000000000..dbd91c04eccf2e67ccabb8097d892c8c8a1c6880
--- /dev/null
+++ b/deps/SZ/sz/src/sz_stats.c
@@ -0,0 +1,60 @@
+#include <sz_stats.h>
+
+sz_stats sz_stat;
+
+void writeBlockInfo(int use_mean, size_t blockSize, size_t regressionBlocks, size_t totalBlocks)
+{	/* Store the block statistics in the global sz_stat; Lorenzo blocks = total - regression. */
+	sz_stat.totalBlocks = totalBlocks;
+	sz_stat.regressionBlocks = regressionBlocks;
+	sz_stat.lorenzoBlocks = totalBlocks - regressionBlocks;
+	sz_stat.blockSize = blockSize;
+	sz_stat.use_mean = use_mean;
+	sz_stat.regressionPercent = 1.0f*regressionBlocks/(float)totalBlocks;	/* plain ratio, not multiplied by 100 */
+	sz_stat.lorenzoPercent = 1.0f*sz_stat.lorenzoBlocks/(float)totalBlocks;
+}
+
+void writeHuffmanInfo(size_t huffmanTreeSize, size_t huffmanCodingSize, size_t totalDataSize, int huffmanNodeCount)
+{	/* Record the Huffman sizes and the ratio totalDataSize / (tree bytes + coded bytes). */
+	sz_stat.huffmanNodeCount = huffmanNodeCount;
+	sz_stat.huffmanCodingSize = huffmanCodingSize;
+	sz_stat.huffmanTreeSize = huffmanTreeSize;
+	sz_stat.huffmanCompressionRatio = 1.0f*totalDataSize/(huffmanTreeSize+huffmanCodingSize);
+}
+
+void writeZstdCompressionRatio(float zstdCompressionRatio)
+{	/* Store the given ratio unchanged in the global sz_stat. */
+	sz_stat.zstdCompressionRatio = zstdCompressionRatio;
+}	
+
+
+void writeUnpredictDataCounts(size_t unpredictCount, size_t totalNumElements)
+{	/* Keep the raw count and its share of totalNumElements (plain ratio, not multiplied by 100). */
+	sz_stat.unpredictPercent = 1.0f*unpredictCount/totalNumElements;
+	sz_stat.unpredictCount = unpredictCount;
+}
+
+void printSZStats()
+{	/* Print the statistics collected in the global sz_stat to stdout. */
+	printf("===============stats about sz================\n");
+	if(!sz_stat.use_mean)
+		printf("use_mean:                  NO\n");
+	else
+		printf("use_mean:                  YES\n");
+
+	/* block statistics */
+	printf("blockSize                  %zu\n", sz_stat.blockSize);
+	printf("lorenzoPercent             %f\n", sz_stat.lorenzoPercent);
+	printf("regressionPercent          %f\n", sz_stat.regressionPercent);
+	printf("lorenzoBlocks              %zu\n", sz_stat.lorenzoBlocks);
+	printf("regressionBlocks           %zu\n", sz_stat.regressionBlocks);
+	printf("totalBlocks                %zu\n", sz_stat.totalBlocks);
+	/* Huffman statistics */
+	printf("huffmanTreeSize            %zu\n", sz_stat.huffmanTreeSize);
+	printf("huffmanCodingSize          %zu\n", sz_stat.huffmanCodingSize);
+	printf("huffmanCompressionRatio    %f\n", sz_stat.huffmanCompressionRatio);
+	printf("huffmanNodeCount           %d\n", sz_stat.huffmanNodeCount);
+	/* sz_stat.zstdCompressionRatio is not printed here */
+	/* unpredictable-data statistics */
+	printf("unpredictCount             %zu\n", sz_stat.unpredictCount);
+	printf("unpredictPercent           %f\n", sz_stat.unpredictPercent);
+}
diff --git a/deps/SZ/sz/src/sz_uint16.c b/deps/SZ/sz/src/sz_uint16.c
new file mode 100644
index 0000000000000000000000000000000000000000..ae557da356f70f274299dd036a0051bba0346cb9
--- /dev/null
+++ b/deps/SZ/sz/src/sz_uint16.c
@@ -0,0 +1,1385 @@
+/**
+ *  @file sz_uint16.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_uint16, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_uint16.h"
+#include "utility.h"
+
+/* Estimate the quantization interval count for 1D uint16 data: sample the series,
+ * histogram the error radius of the previous-value predictor, and return the interval
+ * count covering the predThreshold share, rounded up to a power of two (min 32). */
+unsigned int optimize_intervals_uint16_1D(uint16_t *oriData, size_t dataLength, double realPrecision)
+{
+	size_t pos = 0, bin;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleTotal = dataLength/confparams_cpr->sampleDistance;
+	// Histogram the error radius of every sampleDistance-th element, starting at index 2.
+	for(pos=2;pos<dataLength;pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+		predicted = oriData[pos-1];
+		absErr = llabs(predicted - oriData[pos]);
+		bin = (uint64_t)((absErr/realPrecision+1)/2);
+		if(bin>=confparams_cpr->maxRangeRadius)
+			bin = confparams_cpr->maxRangeRadius - 1; // too-large radii go into the last bin
+		histogram[bin]++;
+	}
+	// Find the first radius whose cumulative count exceeds the target sample count.
+	size_t wanted = sampleTotal*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	pos = 0;
+	while(pos<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[pos];
+		if(covered>wanted)
+			break;
+		pos++;
+	}
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int rawIntervals = 2*(pos+1);
+	unsigned int rounded = roundUpToPowerOf2(rawIntervals);
+	free(histogram);
+	// Never report fewer than 32 intervals.
+	return (rounded<32) ? 32 : rounded;
+}
+
+/* Estimate the quantization interval count for 2D uint16 data: sample interior points,
+ * histogram the error radius of the left + upper - upper-left predictor, and return the
+ * interval count covering the predThreshold share, rounded up to a power of two (min 32). */
+unsigned int optimize_intervals_uint16_2D(uint16_t *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	size_t row, col, pos, bin, radius;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleTotal = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+	for(row=1;row<r1;row++)
+	{
+		for(col=1;col<r2;col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue; // only points whose index sum is a multiple of sampleDistance
+			pos = row*r2+col;
+			predicted = oriData[pos-1] + oriData[pos-r2] - oriData[pos-r2-1];
+			absErr = llabs(predicted - oriData[pos]);
+			bin = (uint64_t)((absErr/realPrecision+1)/2);
+			if(bin>=confparams_cpr->maxRangeRadius)
+				bin = confparams_cpr->maxRangeRadius - 1;
+			histogram[bin]++;
+		}
+	}
+	// Find the first radius whose cumulative count exceeds the target sample count.
+	size_t wanted = sampleTotal*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int rawIntervals = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(rawIntervals);
+	free(histogram);
+	// Never report fewer than 32 intervals.
+	return (rounded<32) ? 32 : rounded;
+}
+
+/* Estimate the quantization interval count for 3D uint16 data: sample interior points,
+ * histogram the error radius of the seven-neighbour predictor, and return the interval
+ * count covering the predThreshold share, rounded up to a power of two (min 32). */
+unsigned int optimize_intervals_uint16_3D(uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	size_t hi, mid, lo, pos, bin, radius;
+	size_t plane=r2*r3;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleTotal = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	// All three indices start at 1, so every neighbour read below exists.
+	for(hi=1;hi<r1;hi++)
+	{
+		for(mid=1;mid<r2;mid++)
+		{
+			for(lo=1;lo<r3;lo++)
+			{
+				if((hi+mid+lo)%confparams_cpr->sampleDistance!=0)
+					continue;
+				pos = hi*plane+mid*r3+lo;
+				// three face neighbours, minus three edge neighbours, plus the corner
+				predicted = oriData[pos-1] + oriData[pos-r3] + oriData[pos-plane] 
+				- oriData[pos-1-plane] - oriData[pos-r3-1] - oriData[pos-r3-plane] + oriData[pos-r3-plane-1];
+				absErr = llabs(predicted - oriData[pos]);
+				bin = (uint64_t)((absErr/realPrecision+1)/2);
+				if(bin>=confparams_cpr->maxRangeRadius)
+					bin = confparams_cpr->maxRangeRadius - 1;
+				histogram[bin]++;
+			}
+		}
+	}
+	// Find the first radius whose cumulative count exceeds the target sample count.
+	size_t wanted = sampleTotal*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int rawIntervals = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(rawIntervals);
+	free(histogram);
+	// Never report fewer than 32 intervals.
+	return (rounded<32) ? 32 : rounded;
+}
+
+
+/* Estimate the quantization interval count for 4D uint16 data. Interior points are
+ * sampled and predicted from their neighbours in the three fastest dimensions
+ * (strides 1, r4 and r3*r4); the error radii are histogrammed and the covering
+ * interval count is returned, rounded up to a power of two (min 32). */
+unsigned int optimize_intervals_uint16_4D(uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index, radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						// The k-1 neighbour is index-r4 (one row back); index-r3 addressed an unrelated element.
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	// Find the first radius whose cumulative count exceeds the target sample count.
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32) // never report fewer than 32 intervals
+		powerOf2 = 32;
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Quantize a 1D uint16 series into per-element type codes plus exactly stored values.
+ * type[i] == 0 marks a value stored exactly in exactDataByteArray; any other code is
+ * intvRadius +/- state, where state counts steps of 2*realPrecision from the prediction.
+ * NOTE(review): elements 0 and 1 are accessed unconditionally, so dataLength < 2 looks
+ * out of bounds for both oriData and type -- confirm callers guarantee dataLength >= 2.
+ */
+TightDataPointStorageI* SZ_compress_uint16_1D_MDQ(uint16_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_uint16_1D(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+	size_t i;
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+		
+	uint16_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+		
+	int64_t last3CmprsData[3] = {0,0,0};
+	//the first two values are always stored exactly (type 0)
+	type[0] = 0;
+	compressUInt16Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[0]);
+		
+	type[1] = 0;
+	compressUInt16Value(spaceFillingValue[1], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[1]);
+	//values whose prediction error reaches checkRadius are stored exactly instead of quantized
+	
+	int state;
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	int64_t curData;
+	int64_t pred, predAbsErr;
+	double interval = 2*realPrecision;
+	
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		//prediction = last3CmprsData[0]; presumably the most recently added value (listAdd_int is defined elsewhere)
+		pred = last3CmprsData[0];
+		predAbsErr = llabs(curData - pred);	
+		if(predAbsErr<checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			//clamp the reconstructed value to the uint16 limits before it feeds the next prediction
+			if(pred>SZ_UINT16_MAX) pred = SZ_UINT16_MAX;
+			if(pred<SZ_UINT16_MIN) pred = SZ_UINT16_MIN;			
+			listAdd_int(last3CmprsData, pred);					
+			continue;
+		}
+		
+		//unpredictable data processing		
+		type[i] = 0;
+		compressUInt16Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, curData);
+	}//end of for
+		
+	size_t exactDataNum = exactDataByteArray->size / byteSize; //every exact value occupies byteSize bytes
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT16);
+	
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //exactDataByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;
+}
+
+void SZ_compress_args_uint16_StoreOriData(uint16_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{	// Lossless fallback: write a header followed by the dataLength raw values into a new buffer.
+	const int elemBytes = sizeof(uint16_t);
+	size_t pos = 0, n;
+	unsigned char lengthBytes[8];
+	tdps->isLossless = 1;
+	size_t totalByteLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + elemBytes*dataLength;
+	unsigned char* out = (unsigned char*)malloc(totalByteLength);
+	*newByteData = out;
+	// 3 version bytes
+	for (n = 0; n < 3; n++)
+		out[pos++] = versionNumber[n];
+	// 1 flag byte: 00010000, with 01000000 added when SZ_SIZE_TYPE is 8
+	out[pos++] = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+	// compression parameters, MetaDataByteLength bytes
+	convertSZParamsToBytes(confparams_cpr, &out[pos]);
+	pos += MetaDataByteLength;
+
+	// element count, SZ_SIZE_TYPE (4 or 8) bytes
+	sizeToBytes(lengthBytes,dataLength);
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		out[pos++] = lengthBytes[n];
+
+	// the payload starts after the 4 + MetaDataByteLength + SZ_SIZE_TYPE header bytes
+	unsigned char* payload = out+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+	if(sysEndianType!=BIG_ENDIAN_SYSTEM)
+	{
+		for(n=0;n<dataLength;n++,payload+=elemBytes)
+			int16ToBytes_bigEndian(payload, oriData[n]);
+	}
+	else
+		memcpy(payload, oriData, dataLength*elemBytes);
+	*outSize = totalByteLength;
+}
+
+void SZ_compress_args_uint16_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint16_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint16_t minValue)
+{	// Compress 1D uint16 data into newByteData/outSize; store the raw values if that is smaller.
+	TightDataPointStorageI* tdps = SZ_compress_uint16_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	// Pass exactly dataLength (as the 2D variant does): dataLength+2 read two elements past oriData.
+	if(*outSize > dataLength*sizeof(uint16_t))
+		SZ_compress_args_uint16_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Quantize a 2D uint16 array (r1 rows of r2 values) into type codes plus exact values.
+ * P1 holds the reconstructed previous row and P0 the row being built; they are swapped
+ * after every row. Row 0 uses 1D prediction, later rows use P0[j-1] + P1[j] - P1[j-1].
+ * NOTE(review): element 1 of row 0 is handled unconditionally, so r2 == 1 writes P1[1]
+ * past the r2-element buffer; P0 is also not freed when r2 == 1 -- confirm intent.
+ * NOTE(review): exactDataNum is the byte count here, while the 1D variant divides by
+ * byteSize -- confirm which one new_TightDataPointStorageI expects.
+ */
+TightDataPointStorageI* SZ_compress_uint16_2D_MDQ(uint16_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint16_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j; 
+	int64_t pred1D, pred2D, curValue, tmp;
+	int diff = 0.0;
+	double itvNum = 0;
+	uint16_t *P0, *P1;
+	size_t dataLength = r1*r2;	
+	P0 = (uint16_t*)malloc(r2*sizeof(uint16_t));
+	memset(P0, 0, r2*sizeof(uint16_t));
+	P1 = (uint16_t*)malloc(r2*sizeof(uint16_t));
+	memset(P1, 0, r2*sizeof(uint16_t));
+		
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	uint16_t* spaceFillingValue = oriData; //
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+	type[0] = 0; /* the very first value is always stored exactly */
+	curValue = P1[0] = spaceFillingValue[0];
+	compressUInt16Value(curValue, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+	itvNum =  llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+			P1[1] = tmp;
+		else if(tmp < SZ_UINT16_MIN)
+			P1[1] = SZ_UINT16_MIN;
+		else
+			P1[1] = SZ_UINT16_MAX;
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt16Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				P1[j] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				P1[j] = SZ_UINT16_MIN;
+			else
+				P1[j] = SZ_UINT16_MAX;			
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt16Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		index = i*r2;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				P0[0] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				P0[0] = SZ_UINT16_MIN;
+			else
+				P0[0] = SZ_UINT16_MAX;			
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt16Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P0[j] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P0[j] = SZ_UINT16_MIN;
+				else
+					P0[j] = SZ_UINT16_MAX;						
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* swap the row buffers: the row just built becomes the previous row */
+		uint16_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	
+	if(r2!=1)
+		free(P0);
+	free(P1);			
+	
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT16);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //exactDataByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;	
+}
+
+/**
+ * 2D uint16 compression step: quantize via SZ_compress_uint16_2D_MDQ, flatten the
+ * result, and call SZ_compress_args_uint16_StoreOriData when the flat stream is
+ * larger than the raw input. Note: @r1 is the high dimension, @r2 is the low dimension.
+ */
+void SZ_compress_args_uint16_NoCkRngeNoGzip_2D(unsigned char** newByteData, uint16_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, uint16_t minValue)
+{
+	const size_t nbEle = r1*r2;
+	const size_t rawBytes = nbEle*sizeof(uint16_t);
+	TightDataPointStorageI* storage = SZ_compress_uint16_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+	if(rawBytes < *outSize)
+	{	/* the flat stream outgrew the input */
+		SZ_compress_args_uint16_StoreOriData(oriData, nbEle, storage, newByteData, outSize);
+	}
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Quantize a 3D uint16 array for SZ compression (r1 = slowest dimension, r3 = fastest).
+ * Every point is predicted from already-reconstructed neighbours kept in two scratch
+ * planes of r2*r3 values (P1 = previous layer, P0 = layer being built). Predictable
+ * points get a non-zero code in type[]; the others get code 0 and are passed to
+ * compressUInt16Value and appended to the exact-data byte array.
+ * Returns the storage object filled in by new_TightDataPointStorageI.
+ * NOTE(review): "Row-0 data 1" touches element 1 unconditionally, so callers
+ * presumably guarantee at least two elements - confirm.
+ */
+TightDataPointStorageI* SZ_compress_uint16_3D_MDQ(uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint16_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k;
+	int64_t pred1D, pred2D, pred3D, curValue, tmp;
+	int diff = 0;
+	double itvNum = 0;
+	uint16_t *P0, *P1;
+	size_t dataLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	/* "Row-0 data 1" always writes P1[1], so a plane needs at least two slots even when r2*r3 == 1 */
+	size_t planeLen = r23 < 2 ? 2 : r23;
+	P0 = (uint16_t*)malloc(planeLen*sizeof(uint16_t));
+	P1 = (uint16_t*)malloc(planeLen*sizeof(uint16_t));
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	uint16_t* spaceFillingValue = oriData; //
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressUInt16Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+			P1[1] = tmp;
+		else if(tmp < SZ_UINT16_MIN)
+			P1[1] = SZ_UINT16_MIN;
+		else
+			P1[1] = SZ_UINT16_MAX;
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt16Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				P1[j] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				P1[j] = SZ_UINT16_MIN;
+			else
+				P1[j] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt16Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				P1[index] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				P1[index] = SZ_UINT16_MIN;
+			else
+				P1[index] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P1[index] = spaceFillingValue[index];
+			compressUInt16Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P1[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P1[index] = SZ_UINT16_MIN;
+				else
+					P1[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P1[index] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				P0[0] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				P0[0] = SZ_UINT16_MIN;
+			else
+				P0[0] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt16Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			/* index walks k*r2*r3 + j */
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P0[j] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P0[j] = SZ_UINT16_MIN;
+				else
+					P0[j] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			index2D = i*r3;
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P0[index2D] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P0[index2D] = SZ_UINT16_MIN;
+				else
+					P0[index2D] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						P0[index2D] = SZ_UINT16_MIN;
+					else
+						P0[index2D] = SZ_UINT16_MAX;
+				}
+				else
+				{
+					type[index] = 0;
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt16Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+		uint16_t *Pt = P1; /* the layer just built becomes the reference for the next one */
+		P1 = P0;
+		P0 = Pt;
+	}
+	/* both planes are always allocated above, so both are always released */
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT16);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper; exactDataByteArray->array is presumably owned by tdps from here on (released with it) - confirm
+	
+	return tdps;	
+}
+
+
+void SZ_compress_args_uint16_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{
+	/* Quantize, flatten, then call StoreOriData if the flat stream is larger than the raw input. */
+	const size_t nbEle = r1*r2*r3;
+	const size_t rawBytes = nbEle*sizeof(uint16_t);
+	TightDataPointStorageI* storage = SZ_compress_uint16_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+	if(rawBytes < *outSize)
+		SZ_compress_args_uint16_StoreOriData(oriData, nbEle, storage, newByteData, outSize);
+	free_TightDataPointStorageI(storage);
+}
+
+
+/**
+ * Quantize a 4D uint16 array for SZ compression (r1 = slowest dimension, r4 = fastest).
+ * Each of the r1 blocks of r2*r3*r4 values is handled on its own: points are predicted
+ * from already-reconstructed neighbours kept in two scratch planes of r3*r4 values
+ * (P1 = previous layer, P0 = layer being built). Predictable points get a non-zero
+ * code in type[]; the others get code 0 and their own value is passed to
+ * compressUInt16Value and appended to the exact-data byte array.
+ * Returns the storage object filled in by new_TightDataPointStorageI.
+ * NOTE(review): "Row-0 data 1" touches element 1 of every block unconditionally,
+ * so callers presumably guarantee r3*r4 >= 2 - confirm.
+ */
+TightDataPointStorageI* SZ_compress_uint16_4D_MDQ(uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint16_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k;
+	int64_t pred1D, pred2D, pred3D, curValue, tmp;
+	int diff = 0;
+	double itvNum = 0;
+	uint16_t *P0, *P1;
+	size_t dataLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+	P0 = (uint16_t*)malloc(r34*sizeof(uint16_t));
+	P1 = (uint16_t*)malloc(r34*sizeof(uint16_t));
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	uint16_t* spaceFillingValue = oriData; //
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		curValue = P1[index2D] = spaceFillingValue[index];
+		compressUInt16Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Process Row-0 data 1*/
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[index] - pred1D; /* error of THIS point, not of element 0 (curValue) */
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				P1[index2D] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				P1[index2D] = SZ_UINT16_MIN;
+			else
+				P1[index2D] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			type[index] = 0;
+
+			curValue = P1[index2D] = spaceFillingValue[index]; /* store this point's own value */
+			compressUInt16Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process Row-0 data 2 --> data r4-1 */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P1[index2D] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P1[index2D] = SZ_UINT16_MIN;
+				else
+					P1[index2D] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0 */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P1[index2D] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P1[index2D] = SZ_UINT16_MIN;
+				else
+					P1[index2D] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process row-i data 1 --> data r4-1*/
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						P1[index2D] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						P1[index2D] = SZ_UINT16_MIN;
+					else
+						P1[index2D] = SZ_UINT16_MAX;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P1[index2D] = spaceFillingValue[index];
+					compressUInt16Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0*/
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P0[index2D] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P0[index2D] = SZ_UINT16_MIN;
+				else
+					P0[index2D] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						P0[index2D] = SZ_UINT16_MIN;
+					else
+						P0[index2D] = SZ_UINT16_MAX;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt16Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0 */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						P0[index2D] = SZ_UINT16_MIN;
+					else
+						P0[index2D] = SZ_UINT16_MAX;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt16Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					diff = spaceFillingValue[index] - pred3D;
+
+					itvNum = llabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+						tmp = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+						if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+							P0[index2D] = tmp;
+						else if(tmp < SZ_UINT16_MIN)
+							P0[index2D] = SZ_UINT16_MIN;
+						else
+							P0[index2D] = SZ_UINT16_MAX;
+					}
+					else
+					{
+						type[index] = 0;
+
+						curValue = P0[index2D] = spaceFillingValue[index];
+						compressUInt16Value(curValue, minValue, byteSize, bytes);
+						memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+					}
+				}
+			}
+
+			uint16_t *Pt = P1; /* the layer just built becomes the reference for the next one */
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT16);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper; exactDataByteArray->array is presumably owned by tdps from here on (released with it) - confirm
+	
+	return tdps;	
+}
+
+void SZ_compress_args_uint16_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	/* Quantize, flatten, then call StoreOriData if the flat stream is larger than the raw input. */
+	const size_t nbEle = r1*r2*r3*r4;
+	const size_t rawBytes = nbEle*sizeof(uint16_t);
+	TightDataPointStorageI* storage = SZ_compress_uint16_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+	if(rawBytes < *outSize)
+		SZ_compress_args_uint16_StoreOriData(oriData, nbEle, storage, newByteData, outSize);
+	free_TightDataPointStorageI(storage);
+}
+
+void SZ_compress_args_uint16_withinRange(unsigned char** newByteData, uint16_t *oriData, size_t dataLength, size_t *outSize)
+{
+	/* Called in this file when valueRangeSize <= realPrecision: only oriData[0] is stored. */
+	size_t flatSize;
+	TightDataPointStorageI* storage = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+
+	/* descriptor of an "all same data" series carrying one exact value of two bytes */
+	storage->allSameData = 1;
+	storage->isLossless = 0;
+	storage->typeArray = NULL;
+	storage->dataSeriesLength = dataLength;
+	storage->dataTypeSize = convertDataTypeSize(sizeof(uint16_t));
+	storage->exactDataNum = 1;
+	storage->exactDataBytes_size = 2;
+
+	/* the single stored value, written by int16ToBytes_bigEndian */
+	storage->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char)*2);
+	int16ToBytes_bigEndian(storage->exactDataBytes, oriData[0]);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, &flatSize);
+	*outSize = flatSize;
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Compress uint16 data of one to four dimensions (r1 = fastest, unused dimensions = 0)
+ * without the sz_lossless_compress pass. Four dimensions are folded into a 3D call whose
+ * slowest dimension is r4*r3. Returns the status left by getRealPrecision_int, or
+ * SZ_DERR when a fifth dimension is given (nothing is written to the outputs then).
+ */
+int SZ_compress_args_uint16_wRngeNoGzip(unsigned char** newByteData, uint16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	uint16_t minValue = computeRangeSize_int(oriData, SZ_UINT16, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+		SZ_compress_args_uint16_withinRange(newByteData, oriData, dataLength, outSize);
+	else if(r5==0&&r4==0&&r3==0&&r2==0)
+		SZ_compress_args_uint16_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+	else if(r5==0&&r4==0&&r3==0)
+		SZ_compress_args_uint16_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	else if(r5==0&&r4==0)
+		SZ_compress_args_uint16_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	else if(r5==0)
+		/* 4D input: handled as 3D with r4*r3 as the slowest dimension */
+		SZ_compress_args_uint16_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	else
+	{
+		/* unsupported: report it instead of returning SZ_SCES with untouched outputs */
+		printf("Error: doesn't support 5 dimensions for now.\n");
+		status = SZ_DERR; //dimension error
+	}
+
+	return status;
+}
+
+/**
+ * Top-level uint16 compression: picks the 1D-4D quantizer from the dimension arguments
+ * (r1 = fastest, unused dimensions = 0), then post-processes according to szMode.
+ * On success *newByteData / *outSize receive the compressed stream.
+ * Returns SZ_SCES, SZ_DERR (five dimensions), SZ_MERR (unknown szMode) or the status
+ * left by getRealPrecision_int; on SZ_DERR / SZ_MERR the outputs are not written.
+ * NOTE(review): errBoundMode >= PW_REL still ends the process through exit(0), which
+ * makes the SZ_NSCS return below unreachable - confirm the intent before changing it.
+ */
+int SZ_compress_args_uint16(unsigned char** newByteData, uint16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		exit(0);
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	uint16_t minValue = (uint16_t)computeRangeSize_int(oriData, SZ_UINT16, dataLength, &valueRangeSize);
+	double realPrecision = 0;
+
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_uint16_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL;
+		if (r2==0)
+			SZ_compress_args_uint16_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		else if (r3==0)
+			SZ_compress_args_uint16_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		else if (r4==0)
+			SZ_compress_args_uint16_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		else if (r5==0)
+			SZ_compress_args_uint16_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			/* No stream was produced: stop here instead of handing an unset buffer to the code below. */
+			return SZ_DERR; //dimension error
+		}
+
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the uint16_t compression.\n");
+			free(tmpByteData); /* the intermediate stream is not handed to the caller in this case */
+			status = SZ_MERR; //mode error
+		}
+	}
+
+	return status;
+}
diff --git a/deps/SZ/sz/src/sz_uint32.c b/deps/SZ/sz/src/sz_uint32.c
new file mode 100644
index 0000000000000000000000000000000000000000..6f27510f258fc43388b310808e82ad8c50d4b772
--- /dev/null
+++ b/deps/SZ/sz/src/sz_uint32.c
@@ -0,0 +1,1268 @@
+/**
+ *  @file sz_uint32.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_uint32, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_uint32.h"
+#include "utility.h"
+
+/* Sampling-based choice of the quantization interval count for 1D uint32 data. */
+unsigned int optimize_intervals_uint32_1D(uint32_t *oriData, size_t dataLength, double realPrecision)
+{	
+	size_t pos, bin, covered = 0;
+	int64_t predicted, absErr;
+
+	size_t sampleCount = dataLength/confparams_cpr->sampleDistance;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+
+	/* Histogram of |previous - current| / (2*realPrecision), rounded, over every sampleDistance-th index >= 2. */
+	for(pos=2;pos<dataLength;pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+		predicted = oriData[pos-1];
+		absErr = llabs(predicted - oriData[pos]);
+		bin = (uint64_t)((absErr/realPrecision+1)/2);
+		if(bin>=confparams_cpr->maxRangeRadius)
+			bin = confparams_cpr->maxRangeRadius - 1; /* clamp into the last bin */
+		histogram[bin]++;
+	}
+
+	/* First radius at which the cumulative count exceeds predThreshold * sampleCount. */
+	size_t goal = sampleCount*confparams_cpr->predThreshold;
+	for(pos=0;pos<confparams_cpr->maxRangeRadius;pos++)
+	{
+		covered += histogram[pos];
+		if(covered>goal)
+			break;
+	}
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+	free(histogram);
+
+	/* Interval count = 2*(radius+1) passed through roundUpToPowerOf2, floored at 32. */
+	unsigned int accIntervals = 2*(pos+1);
+	unsigned int rounded = roundUpToPowerOf2(accIntervals);
+	if(rounded<32)
+		return 32;
+	return rounded;
+}
+
+/* Sampling-based choice of the quantization interval count for 2D uint32 data
+ * (r2 = fastest dimension); the result is roundUpToPowerOf2's output, floored at 32. */
+unsigned int optimize_intervals_uint32_2D(uint32_t *oriData, size_t r1, size_t r2, double realPrecision)
+{	
+	size_t i,j, index, radiusIndex;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = r1*r2/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			if((i+j)%confparams_cpr->sampleDistance==0)
+			{
+				index = i*r2+j;
+				/* widen first: plain uint32 arithmetic would wrap modulo 2^32 before the assignment */
+				pred_value = (int64_t)oriData[index-1] + oriData[index-r2] - oriData[index-r2-1];
+				pred_err = llabs(pred_value - oriData[index]);
+				radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+				if(radiusIndex>=confparams_cpr->maxRangeRadius)
+					radiusIndex = confparams_cpr->maxRangeRadius - 1;
+				intervals[radiusIndex]++;
+			}
+		}
+	}
+
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+	free(intervals);
+	return powerOf2;
+}
+
+/* Sampling-based choice of the quantization interval count for 3D uint32 data
+ * (r1 = slowest dimension, r3 = fastest); the result is roundUpToPowerOf2's output,
+ * floored at 32. */
+unsigned int optimize_intervals_uint32_3D(uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{	
+	size_t i,j,k, index, radiusIndex;
+	size_t r23=r2*r3;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				if((i+j+k)%confparams_cpr->sampleDistance==0)
+				{
+					index = i*r23+j*r3+k;
+					/* widen first: plain uint32 arithmetic would wrap modulo 2^32 before the assignment */
+					pred_value = (int64_t)oriData[index-1] + oriData[index-r3] + oriData[index-r23]
+					- oriData[index-1-r23] - oriData[index-r3-1] - oriData[index-r3-r23] + oriData[index-r3-r23-1];
+					pred_err = llabs(pred_value - oriData[index]);
+					radiusIndex = (pred_err/realPrecision+1)/2;
+					if(radiusIndex>=confparams_cpr->maxRangeRadius)
+						radiusIndex = confparams_cpr->maxRangeRadius - 1;
+					intervals[radiusIndex]++;
+				}
+			}
+		}
+	}
+
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+
+/* Sampling-based choice of the quantization interval count for 4D uint32 data
+ * (r1 = slowest dimension, r4 = fastest); the result is roundUpToPowerOf2's output,
+ * floored at 32. */
+unsigned int optimize_intervals_uint32_4D(uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index, radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						/* neighbours at strides 1, r4, r34; widened first so uint32 arithmetic cannot wrap modulo 2^32 */
+						pred_value = (int64_t)oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+	free(intervals);
+	return powerOf2;
+}
+/**
+ * Quantizes a 1D uint32_t series and packs the result into a newly created
+ * TightDataPointStorageI.
+ *
+ * Elements 0 and 1 are always stored exactly. Every later element is predicted
+ * by last3CmprsData[0] (presumably the value most recently passed to
+ * listAdd_int(), which is defined elsewhere). When the absolute prediction
+ * error is below (intvCapacity-1)*realPrecision the element is recorded as
+ * intvRadius +/- state in type[]; otherwise type[] is 0 and the value is
+ * appended to the exact-data byte array through compressUInt32Value().
+ *
+ * NOTE(review): indices 0 and 1 are accessed unconditionally, so the routine
+ * appears to require dataLength >= 2 -- confirm with callers.
+ *
+ * @param oriData        input values
+ * @param dataLength     number of elements in oriData
+ * @param realPrecision  absolute error bound; one interval spans 2*realPrecision
+ * @param valueRangeSize passed to computeByteSizePerIntValue() to pick byteSize
+ * @param minValue       forwarded to compressUInt32Value() and to the storage object
+ * @return the storage object filled in by new_TightDataPointStorageI()
+ */
+TightDataPointStorageI* SZ_compress_uint32_1D_MDQ(uint32_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_uint32_1D(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+	size_t i;
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); /* one code per element; 0 marks an exactly stored value */
+		
+	uint32_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+		
+	int64_t last3CmprsData[3] = {0,0,0};
+				
+	//add the first data	
+	type[0] = 0;
+	compressUInt32Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[0]);
+		
+	type[1] = 0;
+	compressUInt32Value(spaceFillingValue[1], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[1]);
+	//printf("%.30G\n",last3CmprsData[0]);	
+	
+	int state;
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision; /* largest error that is still quantized */
+	int64_t curData;
+	uint32_t pred, predAbsErr;
+	double interval = 2*realPrecision;
+	
+	for(i=2;i<dataLength;i++)
+	{
+//		if(i==2869438)
+//			printf("i=%d\n", i);
+		curData = spaceFillingValue[i];
+		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		pred = last3CmprsData[0];
+		predAbsErr = llabs(curData - pred);	
+		if(predAbsErr<checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2; /* number of whole intervals between pred and curData */
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+/*			if(type[i]==0)
+				printf("err:type[%d]=0\n", i);*/
+			listAdd_int(last3CmprsData, pred); /* history keeps the quantized value, not the input */
+			continue;
+		}
+		
+		//unpredictable data processing		
+		type[i] = 0;
+		compressUInt32Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, curData);
+	}//end of for
+		
+	size_t exactDataNum = exactDataByteArray->size / byteSize; /* byteSize bytes were appended per exact value */
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT32);
+
+//sdi:Debug
+/*	int sum =0;
+	for(i=0;i<dataLength;i++)
+		if(type[i]==0) sum++;
+	printf("opt_quantizations=%d, exactDataNum=%d, sum=%d\n",quantization_intervals, exactDataNum, sum);*/
+	
+	//free memory
+	free(type);	
+	free(exactDataByteArray); /* frees only the wrapper; ->array was handed to new_TightDataPointStorageI() above and is presumably released with tdps (confirm in free_TightDataPointStorageI) */
+	
+	return tdps;
+}
+
+/**
+ * Serialises the input without quantization into a newly allocated buffer.
+ *
+ * Layout written: 3 version bytes, 1 flag byte (16 when SZ_SIZE_TYPE is 4,
+ * otherwise 80), MetaDataByteLength bytes from convertSZParamsToBytes(),
+ * SZ_SIZE_TYPE bytes holding dataLength, then every value as 4 big-endian bytes.
+ *
+ * @param oriData      values to store
+ * @param dataLength   number of values in oriData
+ * @param tdps         storage object; only its isLossless flag is set (to 1)
+ * @param newByteData  receives the malloc'ed output buffer
+ * @param outSize      receives the buffer length in bytes
+ */
+void SZ_compress_args_uint32_StoreOriData(uint32_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	const int elemBytes = sizeof(uint32_t);
+	const size_t outLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + elemBytes*dataLength;
+	unsigned char lengthField[8];
+	size_t pos = 0, n;
+
+	tdps->isLossless = 1;
+	unsigned char* out = (unsigned char*)malloc(outLength);
+	*newByteData = out;
+
+	/* 3 version bytes */
+	for (n = 0; n < 3; n++)
+		out[pos++] = versionNumber[n];
+
+	/* flag byte: 00010000, plus 01000000 when SZ_SIZE_TYPE is 8 */
+	out[pos++] = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	convertSZParamsToBytes(confparams_cpr, &out[pos]);
+	pos += MetaDataByteLength;
+
+	/* element count, SZ_SIZE_TYPE (4 or 8) bytes */
+	sizeToBytes(lengthField, dataLength);
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		out[pos++] = lengthField[n];
+
+	/* payload is always big-endian, so a plain copy only works on big-endian hosts */
+	unsigned char* payload = out+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+	if(sysEndianType!=BIG_ENDIAN_SYSTEM)
+	{
+		for(n=0;n<dataLength;n++,payload+=elemBytes)
+			int32ToBytes_bigEndian(payload, oriData[n]);
+	}
+	else
+		memcpy(payload, oriData, dataLength*elemBytes);
+
+	*outSize = outLength;
+}
+
+/**
+ * Compresses a 1D uint32_t series without a range check or lossless post-pass.
+ *
+ * The data is quantized with SZ_compress_uint32_1D_MDQ() and flattened with
+ * convertTDPStoFlatBytes_int(). If the flattened stream is larger than the raw
+ * input, the values are stored verbatim with
+ * SZ_compress_args_uint32_StoreOriData() instead.
+ *
+ * @param newByteData    receives the heap-allocated output buffer
+ * @param oriData        input values
+ * @param dataLength     number of elements in oriData
+ * @param realPrecision  absolute error bound
+ * @param outSize        receives the output length in bytes
+ * @param valueRangeSize value range of the input, forwarded to the quantizer
+ * @param minValue       minimum input value, forwarded to the quantizer
+ */
+void SZ_compress_args_uint32_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint32_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint32_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint32_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	if(*outSize > dataLength*sizeof(uint32_t))
+	{
+		/* StoreOriData allocates a fresh buffer, so release the flattened one first */
+		free(*newByteData);
+		/* pass exactly dataLength: oriData holds no more elements than that */
+		SZ_compress_args_uint32_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	free_TightDataPointStorageI(tdps);
+}
+/**
+ * Quantizes an r1 x r2 uint32_t grid (element (i,j) lives at i*r2+j) and packs
+ * the result into a newly created TightDataPointStorageI.
+ *
+ * Row 0 uses 1D predictors: element 1 is predicted by element 0 and later
+ * elements by 2*P1[j-1] - P1[j-2]. In every later row the first element is
+ * predicted by the element above it and the rest by P0[j-1] + P1[j] - P1[j-1].
+ * P1 holds the previous row as the quantizer left it (prediction plus quantized
+ * offset, or the exact input) and P0 the row being built; the two buffers are
+ * swapped after each row. A point whose interval number reaches intvCapacity
+ * gets type 0 and is appended to the exact-data byte array.
+ *
+ * NOTE(review): indices 0 and 1 are accessed unconditionally, so r2 >= 2
+ * appears to be assumed -- confirm with callers.
+ * NOTE(review): exactDataNum is set to the byte count of the exact-data array,
+ * whereas SZ_compress_uint32_1D_MDQ divides by byteSize -- confirm which one
+ * new_TightDataPointStorageI() expects.
+ *
+ * @param oriData        input values
+ * @param r1             number of rows (slow dimension)
+ * @param r2             row length (fast dimension)
+ * @param realPrecision  absolute error bound; one interval spans 2*realPrecision
+ * @param valueRangeSize passed to computeByteSizePerIntValue() to pick byteSize
+ * @param minValue       forwarded to compressUInt32Value() and to the storage object
+ * @return the storage object filled in by new_TightDataPointStorageI()
+ */
+TightDataPointStorageI* SZ_compress_uint32_2D_MDQ(uint32_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint32_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j; 
+	int64_t pred1D, pred2D, curValue;
+	int64_t diff = 0.0;
+	double itvNum = 0;
+	uint32_t *P0, *P1;
+		
+	size_t dataLength = r1*r2;	
+	
+	P0 = (uint32_t*)malloc(r2*sizeof(uint32_t));
+	memset(P0, 0, r2*sizeof(uint32_t));
+	P1 = (uint32_t*)malloc(r2*sizeof(uint32_t));
+	memset(P1, 0, r2*sizeof(uint32_t));
+		
+	int* type = (int*) malloc(dataLength*sizeof(int)); /* one code per element; 0 marks an exactly stored value */
+	//type[dataLength]=0;
+		
+	uint32_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	type[0] = 0;
+	curValue = P1[0] = spaceFillingValue[0];
+	compressUInt32Value(curValue, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum =  llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius; /* signed interval offset, biased by intvRadius */
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt32Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		index = i*r2;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		uint32_t *Pt; /* swap buffers: the row just built becomes the reference row */
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	
+	if(r2!=1) /* NOTE(review): P0 is allocated unconditionally above, so it is not freed when r2 == 1 -- confirm intent */
+		free(P0);
+	free(P1);			
+	
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT32);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); /* frees only the wrapper; ->array was handed to new_TightDataPointStorageI() above and is presumably released with tdps (confirm in free_TightDataPointStorageI) */
+	
+	return tdps;	
+}
+
+/**
+ * Compresses a 2D uint32_t grid without a range check or lossless post-pass.
+ *
+ * The data is quantized with SZ_compress_uint32_2D_MDQ() and flattened with
+ * convertTDPStoFlatBytes_int(). If the flattened stream is larger than the raw
+ * input, the values are stored verbatim with
+ * SZ_compress_args_uint32_StoreOriData() instead.
+ *
+ * Note: @r1 is high dimension
+ * 		 @r2 is low dimension 
+ *
+ * @param newByteData    receives the heap-allocated output buffer
+ * @param oriData        input values, r1*r2 elements
+ * @param realPrecision  absolute error bound
+ * @param outSize        receives the output length in bytes
+ * @param valueRangeSize value range of the input, forwarded to the quantizer
+ * @param minValue       minimum input value, forwarded to the quantizer
+ * */
+void SZ_compress_args_uint32_NoCkRngeNoGzip_2D(unsigned char** newByteData, uint32_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, uint32_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint32_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2;
+	if(*outSize>dataLength*sizeof(uint32_t))
+	{
+		/* StoreOriData allocates a fresh buffer, so release the flattened one first */
+		free(*newByteData);
+		SZ_compress_args_uint32_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	
+	free_TightDataPointStorageI(tdps);	
+}
+/**
+ * Quantizes an r1 x r2 x r3 uint32_t volume (element (k,i,j) lives at
+ * k*r2*r3 + i*r3 + j) and packs the result into a newly created
+ * TightDataPointStorageI.
+ *
+ * Layer 0 is built in P1 with 1D predictors on its first row and first column
+ * and the 2D predictor P1[-1] + P1[-r3] - P1[-r3-1] elsewhere. Every later layer
+ * is built in P0 while P1 holds the previous layer: the corner is predicted by
+ * P1[0], the first row and first column by 2D predictors across the two layers,
+ * and interior points by the seven-term 3D predictor. P0 and P1 are swapped
+ * after each layer. A point whose interval number reaches intvCapacity gets
+ * type 0 and is appended to the exact-data byte array.
+ *
+ * NOTE(review): indices 0 and 1 are accessed unconditionally, so r3 >= 2
+ * appears to be assumed -- confirm with callers.
+ * NOTE(review): exactDataNum is set to the byte count of the exact-data array,
+ * whereas SZ_compress_uint32_1D_MDQ divides by byteSize -- confirm which one
+ * new_TightDataPointStorageI() expects.
+ *
+ * @param oriData        input values
+ * @param r1,r2,r3       dimension sizes, slowest to fastest
+ * @param realPrecision  absolute error bound; one interval spans 2*realPrecision
+ * @param valueRangeSize passed to computeByteSizePerIntValue() to pick byteSize
+ * @param minValue       forwarded to compressUInt32Value() and to the storage object
+ * @return the storage object filled in by new_TightDataPointStorageI()
+ */
+TightDataPointStorageI* SZ_compress_uint32_3D_MDQ(uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint32_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue;
+	int64_t diff = 0.0;
+	double itvNum = 0;
+	uint32_t *P0, *P1;
+		
+	size_t dataLength = r1*r2*r3;		
+
+	size_t r23 = r2*r3;
+	P0 = (uint32_t*)malloc(r23*sizeof(uint32_t));
+	P1 = (uint32_t*)malloc(r23*sizeof(uint32_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); /* one code per element; 0 marks an exactly stored value */
+
+	uint32_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressUInt32Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius; /* signed interval offset, biased by intvRadius */
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt32Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P1[index] = spaceFillingValue[index];
+			compressUInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P1[index] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+/*				if(type[index]==0)
+					printf("err:type[%d]=0, index4\n", index);					*/
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+//				if(k==63&&i==43&&j==27)
+//					printf("i=%d\n", i);
+				//index = k*r2*r3 + i*r3 + j;			
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt32Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+		uint32_t *Pt; /* swap buffers: the layer just built becomes the reference layer */
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1) /* NOTE(review): P0 is allocated unconditionally above, so it is not freed when r23 == 1 -- confirm intent */
+		free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT32);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); /* frees only the wrapper; ->array was handed to new_TightDataPointStorageI() above and is presumably released with tdps (confirm in free_TightDataPointStorageI) */
+	
+	return tdps;	
+}
+
+
+/**
+ * Compresses a 3D uint32_t volume without a range check or lossless post-pass.
+ *
+ * The data is quantized with SZ_compress_uint32_3D_MDQ() and flattened with
+ * convertTDPStoFlatBytes_int(). If the flattened stream is larger than the raw
+ * input, the values are stored verbatim with
+ * SZ_compress_args_uint32_StoreOriData() instead.
+ *
+ * @param newByteData    receives the heap-allocated output buffer
+ * @param oriData        input values, r1*r2*r3 elements
+ * @param r1,r2,r3       dimension sizes, slowest to fastest
+ * @param realPrecision  absolute error bound
+ * @param outSize        receives the output length in bytes
+ * @param valueRangeSize value range of the input, forwarded to the quantizer
+ * @param minValue       minimum input value, forwarded to the quantizer
+ */
+void SZ_compress_args_uint32_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint32_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2*r3;
+	if(*outSize>dataLength*sizeof(uint32_t))
+	{
+		/* StoreOriData allocates a fresh buffer, so release the flattened one first */
+		free(*newByteData);
+		SZ_compress_args_uint32_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	
+	free_TightDataPointStorageI(tdps);	
+}
+
+
+/**
+ * Quantizes an r1 x r2 x r3 x r4 uint32_t array (element (l,k,i,j) lives at
+ * l*r2*r3*r4 + k*r3*r4 + i*r4 + j) and packs the result into a newly created
+ * TightDataPointStorageI.
+ *
+ * Each of the r1 outermost slices is processed independently as a 3D volume:
+ * its first element is always stored exactly, layer 0 is built in P1 with 1D
+ * and 2D predictors, and later layers are built in P0 from the previous layer
+ * in P1 with 2D and seven-term 3D predictors. P0 and P1 are swapped after each
+ * layer. A point whose interval number reaches intvCapacity gets type 0 and
+ * its own input value, spaceFillingValue[index], is appended to the exact-data
+ * byte array.
+ *
+ * NOTE(review): index2D 1 is accessed unconditionally, so r4 >= 2 appears to be
+ * assumed -- confirm with callers.
+ * NOTE(review): exactDataNum is set to the byte count of the exact-data array,
+ * whereas SZ_compress_uint32_1D_MDQ divides by byteSize -- confirm which one
+ * new_TightDataPointStorageI() expects.
+ *
+ * @param oriData        input values
+ * @param r1,r2,r3,r4    dimension sizes, slowest to fastest
+ * @param realPrecision  absolute error bound; one interval spans 2*realPrecision
+ * @param valueRangeSize passed to computeByteSizePerIntValue() to pick byteSize
+ * @param minValue       forwarded to compressUInt32Value() and to the storage object
+ * @return the storage object filled in by new_TightDataPointStorageI()
+ */
+TightDataPointStorageI* SZ_compress_uint32_4D_MDQ(uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint32_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue;
+	int64_t diff = 0.0;
+	double itvNum = 0;
+	uint32_t *P0, *P1;
+		
+	size_t dataLength = r1*r2*r3*r4;		
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	P0 = (uint32_t*)malloc(r34*sizeof(uint32_t));
+	P1 = (uint32_t*)malloc(r34*sizeof(uint32_t));
+	
+	int* type = (int*) malloc(dataLength*sizeof(int)); /* one code per element; 0 marks an exactly stored value */
+
+	uint32_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		curValue = P1[index2D] = spaceFillingValue[index];
+		compressUInt32Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Process Row-0 data 1*/
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		/* compare the element being encoded, not the previous one, with its prediction */
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+
+			/* an unpredictable point stores its own value */
+			curValue = P1[index2D] = spaceFillingValue[index];
+			compressUInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process Row-0 data 2 --> data r4-1 */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0 */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process row-i data 1 --> data r4-1*/
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P1[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P1[index2D] = spaceFillingValue[index];
+					compressUInt32Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0*/
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt32Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0 */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt32Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					diff = spaceFillingValue[index] - pred3D;
+
+
+					itvNum = llabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+						P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						type[index] = 0;
+
+						curValue = P0[index2D] = spaceFillingValue[index];
+						compressUInt32Value(curValue, minValue, byteSize, bytes);
+						memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+					}
+				}
+			}
+
+			/* swap buffers: the layer just built becomes the reference layer */
+			uint32_t *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT32);
+			
+	//free memory
+	free(type);	
+	/* frees only the wrapper; ->array was handed to new_TightDataPointStorageI() above and is presumably released with tdps (confirm in free_TightDataPointStorageI) */
+	free(exactDataByteArray);
+	
+	return tdps;	
+}
+
+/**
+ * Compresses a 4D uint32_t array without a range check or lossless post-pass.
+ *
+ * The data is quantized with SZ_compress_uint32_4D_MDQ() and flattened with
+ * convertTDPStoFlatBytes_int(). If the flattened stream is larger than the raw
+ * input, the values are stored verbatim with
+ * SZ_compress_args_uint32_StoreOriData() instead.
+ *
+ * @param newByteData    receives the heap-allocated output buffer
+ * @param oriData        input values, r1*r2*r3*r4 elements
+ * @param r1,r2,r3,r4    dimension sizes, slowest to fastest
+ * @param realPrecision  absolute error bound
+ * @param outSize        receives the output length in bytes
+ * @param valueRangeSize value range of the input, forwarded to the quantizer
+ * @param minValue       minimum input value, forwarded to the quantizer
+ */
+void SZ_compress_args_uint32_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint32_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2*r3*r4;
+	if(*outSize>dataLength*sizeof(uint32_t))
+	{
+		/* StoreOriData allocates a fresh buffer, so release the flattened one first */
+		free(*newByteData);
+		SZ_compress_args_uint32_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Encodes a series whose whole value range fits inside the error bound: only
+ * oriData[0] is kept, as 4 big-endian bytes, and the storage object is flagged
+ * allSameData before being flattened with convertTDPStoFlatBytes_int().
+ *
+ * @param newByteData  receives the output buffer produced by the flattener
+ * @param oriData      input values; only element 0 is read
+ * @param dataLength   number of elements, recorded as dataSeriesLength
+ * @param outSize      receives the output length in bytes
+ */
+void SZ_compress_args_uint32_withinRange(unsigned char** newByteData, uint32_t *oriData, size_t dataLength, size_t *outSize)
+{
+	size_t flatSize;
+	TightDataPointStorageI* storage = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+
+	/* header fields: one exact value, no quantization codes */
+	storage->typeArray = NULL;
+	storage->allSameData = 1;
+	storage->isLossless = 0;
+	storage->dataSeriesLength = dataLength;
+	storage->exactDataNum = 1;
+	storage->exactDataBytes_size = 4;
+
+	/* the single representative value */
+	storage->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char)*4);
+	int32ToBytes_bigEndian(storage->exactDataBytes, oriData[0]);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, &flatSize);
+	*outSize = flatSize;
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Compresses uint32_t data of up to four dimensions with a value-range check
+ * but without the lossless post-pass.
+ *
+ * When the value range does not exceed the error bound the whole series is
+ * encoded as a single value; otherwise the call is dispatched on the number of
+ * non-zero dimensions. Unused dimensions are passed as 0 and r1 is the
+ * fastest-varying one. 4D input is handled by the 3D routine with r4 and r3
+ * merged. A non-zero r5 is rejected with SZ_DERR and no output is produced.
+ *
+ * @param newByteData   receives the output buffer
+ * @param oriData       input values
+ * @param r5..r1        dimension sizes, slowest to fastest
+ * @param outSize       receives the output length in bytes
+ * @param errBoundMode  error-bound mode, forwarded to getRealPrecision_int()
+ * @param absErr_Bound  absolute error bound
+ * @param relBoundRatio value-range-relative error bound ratio
+ * @return the status left by getRealPrecision_int() (initially SZ_SCES), or
+ *         SZ_DERR when r5 is non-zero
+ */
+int SZ_compress_args_uint32_wRngeNoGzip(unsigned char** newByteData, uint32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+	
+	uint32_t minValue = computeRangeSize_int(oriData, SZ_UINT32, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_uint32_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		if(r5==0&&r4==0&&r3==0&&r2==0)
+		{
+			SZ_compress_args_uint32_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0&&r3==0)
+		{
+			SZ_compress_args_uint32_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0)
+		{
+			SZ_compress_args_uint32_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0)
+		{
+			/* 4D input is compressed as 3D with the two slowest dimensions merged */
+			SZ_compress_args_uint32_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			/* report the unsupported case instead of returning success with no output */
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			status = SZ_DERR; //dimension error
+		}
+	}
+	return status;
+}
+
+/**
+ * Top-level uint32_t compression entry point.
+ *
+ * Derives the absolute error bound (from PSNR when errorBoundMode is PSNR,
+ * otherwise through getRealPrecision_int()), encodes the data with the routine
+ * matching its dimensionality, and depending on confparams_cpr->szMode either
+ * returns that stream as is (SZ_BEST_SPEED) or passes it through
+ * sz_lossless_compress(). Unused dimensions are passed as 0 and r1 is the
+ * fastest-varying one.
+ *
+ * Point-wise relative modes (errBoundMode >= PW_REL) are not supported: the
+ * function prints an error and terminates the process with exit(0).
+ *
+ * @param newByteData   receives the heap-allocated output buffer; left untouched
+ *                      when SZ_DERR or SZ_MERR is returned
+ * @param oriData       input values
+ * @param r5..r1        dimension sizes, slowest to fastest
+ * @param outSize       receives the output length in bytes
+ * @param errBoundMode  error-bound mode; also stored in confparams_cpr
+ * @param absErr_Bound  absolute error bound
+ * @param relBoundRatio value-range-relative error bound ratio
+ * @return SZ_SCES or the status set by getRealPrecision_int(); SZ_DERR for
+ *         5D input; SZ_MERR for an unknown szMode
+ */
+int SZ_compress_args_uint32(unsigned char** newByteData, uint32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		exit(0);
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	uint32_t minValue = (uint32_t)computeRangeSize_int(oriData, SZ_UINT32, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+		//printf("realPrecision=%lf\n", realPrecision);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_uint32_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL;
+		int encoded = 1; /* cleared when no encoder ran, so tmpByteData is never used unset */
+		if (r2==0)
+		{
+			SZ_compress_args_uint32_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r3==0)
+		{
+			SZ_compress_args_uint32_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r4==0)
+		{
+			SZ_compress_args_uint32_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r5==0)
+		{
+			SZ_compress_args_uint32_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			status = SZ_DERR; //dimension error
+			encoded = 0;
+		}
+
+		if(!encoded)
+		{
+			/* nothing to post-process; the dimension error is returned below */
+		}
+		//Call Gzip to do the further compression.
+		else if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the uint32_t compression.\n");
+			status = SZ_MERR; //mode error			
+			/* the intermediate stream is not handed to the caller, so release it */
+			free(tmpByteData);
+		}
+	}
+	
+	return status;
+}
diff --git a/deps/SZ/sz/src/sz_uint64.c b/deps/SZ/sz/src/sz_uint64.c
new file mode 100644
index 0000000000000000000000000000000000000000..7d2eca843f9205a5b3704d49e5da64a67f315fb9
--- /dev/null
+++ b/deps/SZ/sz/src/sz_uint64.c
@@ -0,0 +1,1268 @@
+/**
+ *  @file sz_uint64.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_uint64, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_uint64.h"
+#include "utility.h"
+
+/**
+ * Estimate how many quantization intervals to use for a 1D uint64 series.
+ *
+ * Every position from index 2 onward that is a multiple of
+ * confparams_cpr->sampleDistance is predicted by its predecessor; the absolute
+ * prediction error is turned into an interval radius and counted in a
+ * histogram of confparams_cpr->maxRangeRadius buckets (larger radii are
+ * clamped into the last bucket). The smallest radius whose cumulative count
+ * exceeds totalSamples*predThreshold determines the result.
+ *
+ * @param oriData        input values
+ * @param dataLength     number of values in oriData
+ * @param realPrecision  absolute error bound used to size one interval
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), never less than 32
+ */
+unsigned int optimize_intervals_uint64_1D(uint64_t *oriData, size_t dataLength, double realPrecision)
+{
+	size_t pos, bucket;
+	int64_t predicted, absErr;
+	size_t histBytes = confparams_cpr->maxRangeRadius*sizeof(size_t);
+	size_t *histogram = (size_t*)malloc(histBytes);
+	memset(histogram, 0, histBytes);
+	size_t sampleCount = dataLength/confparams_cpr->sampleDistance;
+
+	//build the histogram of interval radii over the sampled positions
+	for(pos=2;pos<dataLength;pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue; //not a sampled position
+
+		predicted = oriData[pos-1]; //previous-value predictor
+		absErr = llabs(predicted - (int64_t)(oriData[pos]));
+		bucket = (uint64_t)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	//find the first radius whose cumulative count exceeds the target
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	pos = 0;
+	while(pos<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[pos];
+		if(covered>threshold)
+			break;
+		pos++;
+	}
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(pos+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	free(histogram);
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/**
+ * Estimate how many quantization intervals to use for a 2D uint64 array.
+ *
+ * Interior points (row>=1, column>=1) whose row+column is a multiple of
+ * confparams_cpr->sampleDistance are predicted from their left, upper and
+ * upper-left neighbours (left + up - upLeft). The prediction errors are
+ * histogrammed by interval radius, and the smallest radius whose cumulative
+ * count exceeds totalSamples*predThreshold determines the result.
+ *
+ * @param oriData        input values, indexed as row*r2+column
+ * @param r1             number of rows
+ * @param r2             number of values per row
+ * @param realPrecision  absolute error bound used to size one interval
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), never less than 32
+ */
+unsigned int optimize_intervals_uint64_2D(uint64_t *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	size_t row, col, pos;
+	size_t bucket, radius;
+	int64_t predicted, absErr;
+	size_t histBytes = confparams_cpr->maxRangeRadius*sizeof(size_t);
+	size_t *histogram = (size_t*)malloc(histBytes);
+	memset(histogram, 0, histBytes);
+	size_t sampleCount = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+
+	for(row=1;row<r1;row++)
+	{
+		for(col=1;col<r2;col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue; //not a sampled position
+
+			pos = row*r2+col;
+			predicted = oriData[pos-1] + oriData[pos-r2] - oriData[pos-r2-1];
+			absErr = llabs(predicted - (int64_t)(oriData[pos]));
+			bucket = (uint64_t)((absErr/realPrecision+1)/2);
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1;
+			histogram[bucket]++;
+		}
+	}
+
+	//find the first radius whose cumulative count exceeds the target
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>threshold)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	free(histogram);
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/**
+ * Estimate how many quantization intervals to use for a 3D uint64 array.
+ *
+ * Interior points (all three indices >= 1) whose index sum is a multiple of
+ * confparams_cpr->sampleDistance are predicted from their seven preceding
+ * neighbours (strides 1, r3 and r2*r3). The prediction errors are
+ * histogrammed by interval radius, and the smallest radius whose cumulative
+ * count exceeds totalSamples*predThreshold determines the result.
+ *
+ * @param oriData        input values, indexed as i*r2*r3 + j*r3 + k
+ * @param r1,r2,r3       extents, r3 being the fastest-varying one
+ * @param realPrecision  absolute error bound used to size one interval
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), never less than 32
+ */
+unsigned int optimize_intervals_uint64_3D(uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	size_t a, b, c, pos;
+	size_t bucket, radius;
+	size_t plane = r2*r3;
+	int64_t predicted, absErr;
+	size_t histBytes = confparams_cpr->maxRangeRadius*sizeof(size_t);
+	size_t *histogram = (size_t*)malloc(histBytes);
+	memset(histogram, 0, histBytes);
+	size_t sampleCount = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	for(a=1;a<r1;a++)
+	{
+		for(b=1;b<r2;b++)
+		{
+			for(c=1;c<r3;c++)
+			{
+				if((a+b+c)%confparams_cpr->sampleDistance!=0)
+					continue; //not a sampled position
+
+				pos = a*plane+b*r3+c;
+				predicted = oriData[pos-1] + oriData[pos-r3] + oriData[pos-plane]
+				- oriData[pos-1-plane] - oriData[pos-r3-1] - oriData[pos-r3-plane] + oriData[pos-r3-plane-1];
+				absErr = llabs(predicted - (int64_t)(oriData[pos]));
+				bucket = (absErr/realPrecision+1)/2;
+				if(bucket>=confparams_cpr->maxRangeRadius)
+					bucket = confparams_cpr->maxRangeRadius - 1;
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	//find the first radius whose cumulative count exceeds the target
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>threshold)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	free(histogram);
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+
+/**
+ * Estimate how many quantization intervals to use for a 4D uint64 array.
+ *
+ * Interior points (all four indices >= 1) whose index sum is a multiple of
+ * confparams_cpr->sampleDistance are predicted from the seven preceding
+ * neighbours along the three fastest-varying dimensions (strides 1, r4 and
+ * r3*r4). The prediction errors are histogrammed by interval radius, and the
+ * smallest radius whose cumulative count exceeds totalSamples*predThreshold
+ * determines the result.
+ *
+ * @param oriData        input values, indexed as i*r2*r3*r4 + j*r3*r4 + k*r4 + l
+ * @param r1,r2,r3,r4    extents, r4 being the fastest-varying one
+ * @param realPrecision  absolute error bound used to size one interval
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), never less than 32
+ */
+unsigned int optimize_intervals_uint64_4D(uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						//The neighbour one step back along k is r4 elements away
+						//(index = ... + k*r4 + l), so every "row" term uses stride r4.
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - (int64_t)(oriData[index]));
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//find the first radius whose cumulative count exceeds the target
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Quantize a 1D uint64 series against a previous-value predictor.
+ *
+ * The first two values are always stored exactly. For every later value the
+ * predictor is last3CmprsData[0] as maintained by listAdd_int() (presumably
+ * the most recently added reconstructed value - confirm against listAdd_int).
+ * If the absolute prediction error is below (intvCapacity-1)*realPrecision the
+ * value is replaced by an interval code (intvRadius +/- number of intervals);
+ * otherwise code 0 is emitted and the value is serialized with
+ * compressUInt64Value() into the exact-data byte array.
+ *
+ * @param oriData         input values (at least two are accessed)
+ * @param dataLength      number of values
+ * @param realPrecision   absolute error bound
+ * @param valueRangeSize  value range, used to pick the exact-value byte width
+ * @param minValue        passed to compressUInt64Value() and stored in the result
+ * @return newly created TightDataPointStorageI describing the quantized data
+ */
+TightDataPointStorageI* SZ_compress_uint64_1D_MDQ(uint64_t *oriData, size_t dataLength, double realPrecision, uint64_t valueRangeSize, uint64_t minValue)
+{
+	unsigned char encoded[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals = (exe_params->optQuantMode==1)
+		? optimize_intervals_uint64_1D(oriData, dataLength, realPrecision)
+		: exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);
+
+	int* codes = (int*) malloc(dataLength*sizeof(int));
+
+	DynamicByteArray *exactBytes;
+	new_DBA(&exactBytes, DynArrayInitLen);
+
+	int64_t history[3] = {0,0,0};
+	size_t pos;
+
+	//the first two values seed the predictor and are stored exactly
+	for(pos=0;pos<2;pos++)
+	{
+		codes[pos] = 0;
+		compressUInt64Value(oriData[pos], minValue, byteSize, encoded);
+		memcpyDBA_Data(exactBytes, encoded, byteSize);
+		listAdd_int(history, oriData[pos]);
+	}
+
+	//read after updateQuantizationInfo() so the current capacity is used
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double interval = 2*realPrecision;
+
+	for(pos=2;pos<dataLength;pos++)
+	{
+		int64_t curData = oriData[pos];
+		int64_t pred = history[0];
+		int64_t predAbsErr = llabs(curData - pred);
+
+		if(predAbsErr<checkRadius)
+		{
+			//predictable: emit an interval code and remember the reconstructed value
+			int state = (predAbsErr/realPrecision+1)/2;
+			if(curData<pred)
+			{
+				codes[pos] = exe_params->intvRadius-state;
+				pred -= state*interval;
+			}
+			else
+			{
+				codes[pos] = exe_params->intvRadius+state;
+				pred += state*interval;
+			}
+			listAdd_int(history, pred);
+		}
+		else
+		{
+			//unpredictable: code 0 plus the exactly stored value
+			codes[pos] = 0;
+			compressUInt64Value(curData, minValue, byteSize, encoded);
+			memcpyDBA_Data(exactBytes, encoded, byteSize);
+			listAdd_int(history, curData);
+		}
+	}
+
+	size_t exactDataNum = exactBytes->size / byteSize;
+
+	TightDataPointStorageI* tdps;
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize,
+			codes, exactBytes->array, exactBytes->size,
+			realPrecision, minValue, quantization_intervals, SZ_UINT64);
+
+	//Only the wrapper struct is freed here; exactBytes->array was handed to
+	//new_TightDataPointStorageI above (presumably owned by tdps from now on).
+	free(codes);
+	free(exactBytes);
+
+	return tdps;
+}
+
+/**
+ * Emit the data uncompressed: a header followed by every value as 8 bytes.
+ *
+ * Layout written into a freshly malloc'ed buffer:
+ *   3 bytes  versionNumber[0..2]
+ *   1 byte   16 when exe_params->SZ_SIZE_TYPE is 4, otherwise 80
+ *   MetaDataByteLength bytes produced by convertSZParamsToBytes()
+ *   SZ_SIZE_TYPE bytes of dataLength produced by sizeToBytes()
+ *   the values: copied verbatim on big-endian systems, otherwise converted
+ *   one by one with int64ToBytes_bigEndian()
+ * The buffer is sized with one byte more than the fields listed above.
+ *
+ * @param oriData      values to store
+ * @param dataLength   number of values
+ * @param tdps         storage object; its isLossless flag is set to 1
+ * @param newByteData  receives the new buffer (any previous pointer is overwritten)
+ * @param outSize      receives the allocated buffer length
+ */
+void SZ_compress_args_uint64_StoreOriData(uint64_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	int elemBytes=sizeof(uint64_t);
+	size_t n;
+	unsigned char lengthField[8];
+
+	tdps->isLossless = 1;
+
+	size_t totalByteLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + elemBytes*dataLength;
+	unsigned char* out = (unsigned char*)malloc(totalByteLength);
+	*newByteData = out;
+
+	//header: version, flag byte, parameters, element count
+	unsigned char* cursor = out;
+	for (n = 0; n < 3; n++)
+		*cursor++ = versionNumber[n];
+
+	//00010000 for 4-byte sizes; 01010000 when SZ_SIZE_TYPE is 8
+	*cursor++ = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	convertSZParamsToBytes(confparams_cpr, cursor);
+	cursor += MetaDataByteLength;
+
+	sizeToBytes(lengthField,dataLength); //SZ_SIZE_TYPE: 4 or 8
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		*cursor++ = lengthField[n];
+
+	//payload
+	unsigned char* payload = out+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+	if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		memcpy(payload, oriData, dataLength*elemBytes);
+	else
+	{
+		for(n=0;n<dataLength;n++)
+			int64ToBytes_bigEndian(payload+n*elemBytes, oriData[n]);
+	}
+	*outSize = totalByteLength;
+}
+
+/**
+ * Compress a 1D uint64 series without range checking or lossless post-pass.
+ *
+ * Runs the 1D quantizer and flattens the result with
+ * convertTDPStoFlatBytes_int(). If the flattened stream is larger than the raw
+ * data (dataLength*sizeof(uint64_t)), the original values are stored instead.
+ *
+ * @param newByteData     receives the output buffer (caller frees it)
+ * @param oriData         input values
+ * @param dataLength      number of values
+ * @param realPrecision   absolute error bound
+ * @param outSize         receives the output length in bytes
+ * @param valueRangeSize  value range of the input
+ * @param minValue        minimum value of the input
+ */
+void SZ_compress_args_uint64_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint64_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, uint64_t valueRangeSize, uint64_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint64_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	if(*outSize > dataLength*sizeof(uint64_t))
+	{
+		//StoreOriData overwrites *newByteData with a new allocation, so release
+		//the flattened buffer first (assumed heap-allocated: callers free() the
+		//buffer returned through newByteData).
+		free(*newByteData);
+		//Pass exactly dataLength: oriData holds no more elements than that, and
+		//the 2D/3D variants pass the element count unchanged as well.
+		SZ_compress_args_uint64_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Quantize a 2D uint64 array (r1 rows of r2 values, row-major).
+ *
+ * Values are visited in storage order. Each value is predicted from already
+ * reconstructed neighbours: the first value is stored exactly, the rest of
+ * row 0 uses 1D prediction, the first value of later rows uses the value
+ * above, and all other values use left + up - upLeft. A value whose interval
+ * count stays below exe_params->intvCapacity receives an interval code
+ * (intvRadius + signed interval index); otherwise it receives code 0 and is
+ * serialized with compressUInt64Value() into the exact-data byte array.
+ *
+ * P1 holds the reconstructed previous row and P0 the row being built; the two
+ * buffers are swapped after every row.
+ *
+ * @param oriData         input values (elements 0 and 1 are always accessed)
+ * @param r1              number of rows
+ * @param r2              number of values per row
+ * @param realPrecision   absolute error bound
+ * @param valueRangeSize  value range, used to pick the exact-value byte width
+ * @param minValue        passed to compressUInt64Value() and stored in the result
+ * @return newly created TightDataPointStorageI describing the quantized data
+ */
+TightDataPointStorageI* SZ_compress_uint64_2D_MDQ(uint64_t *oriData, size_t r1, size_t r2, double realPrecision, uint64_t valueRangeSize, uint64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint64_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j; 
+	int64_t pred1D, pred2D, curValue;
+	int64_t diff = 0;
+	double itvNum = 0;
+	uint64_t *P0, *P1;
+		
+	size_t dataLength = r1*r2;	
+	
+	P0 = (uint64_t*)malloc(r2*sizeof(uint64_t));
+	memset(P0, 0, r2*sizeof(uint64_t));
+	P1 = (uint64_t*)malloc(r2*sizeof(uint64_t));
+	memset(P1, 0, r2*sizeof(uint64_t));
+		
+	int* type = (int*) malloc(dataLength*sizeof(int));
+		
+	uint64_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	/* Row-0 data 0 is always stored exactly */
+	type[0] = 0;
+	curValue = P1[0] = spaceFillingValue[0];
+	compressUInt64Value(curValue, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = (int64_t)(spaceFillingValue[1]) - (int64_t)(pred1D);
+
+	itvNum =  llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt64Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = (int64_t)(spaceFillingValue[j]) - (int64_t)(pred1D);
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		index = i*r2;
+		pred1D = P1[0];
+		diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred1D);
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred2D);
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* the row just built becomes the "previous row" of the next iteration */
+		uint64_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	
+	/* Both buffers are separate allocations, so both are always released
+	 * (skipping P0 when r2==1 leaked it). */
+	free(P0);
+	free(P1);			
+	
+	/* Number of exactly stored values, computed the same way as in the 1D routine. */
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT64);
+			
+	//Only the wrapper struct is freed here; exactDataByteArray->array was handed
+	//to new_TightDataPointStorageI above (presumably owned by tdps from now on).
+	free(type);	
+	free(exactDataByteArray);
+	
+	return tdps;	
+}
+
+/**
+ * Compress a 2D uint64 array without range checking or lossless post-pass.
+ *
+ * Runs the 2D quantizer and flattens the result with
+ * convertTDPStoFlatBytes_int(). If the flattened stream is larger than the raw
+ * data (r1*r2*sizeof(uint64_t)), the original values are stored instead.
+ *
+ * Note: @r1 is high dimension
+ * 		 @r2 is low dimension 
+ *
+ * @param newByteData     receives the output buffer (caller frees it)
+ * @param oriData         input values
+ * @param realPrecision   absolute error bound
+ * @param outSize         receives the output length in bytes
+ * @param valueRangeSize  value range of the input
+ * @param minValue        minimum value of the input
+ * */
+void SZ_compress_args_uint64_NoCkRngeNoGzip_2D(unsigned char** newByteData, uint64_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, uint64_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint64_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2;
+	if(*outSize>dataLength*sizeof(uint64_t))
+	{
+		//StoreOriData overwrites *newByteData with a new allocation, so release
+		//the flattened buffer first (assumed heap-allocated: callers free() the
+		//buffer returned through newByteData).
+		free(*newByteData);
+		SZ_compress_args_uint64_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	
+	free_TightDataPointStorageI(tdps);	
+}
+
+/**
+ * Quantize a 3D uint64 array (r1 layers of r2 rows of r3 values).
+ *
+ * Values are visited in storage order and predicted from already
+ * reconstructed neighbours: 1D prediction along the first row, 2D prediction
+ * (left + up - upLeft) inside layer 0 and on the first row/column of later
+ * layers, and the 7-neighbour 3D stencil everywhere else. A value whose
+ * interval count stays below exe_params->intvCapacity receives an interval
+ * code (intvRadius + signed interval index); otherwise it receives code 0 and
+ * is serialized with compressUInt64Value() into the exact-data byte array.
+ *
+ * P1 holds the reconstructed previous layer (r2*r3 values) and P0 the layer
+ * being built; the two buffers are swapped after every layer.
+ *
+ * @param oriData         input values (elements 0 and 1 are always accessed)
+ * @param r1,r2,r3        extents, r3 being the fastest-varying one
+ * @param realPrecision   absolute error bound
+ * @param valueRangeSize  value range, used to pick the exact-value byte width
+ * @param minValue        passed to compressUInt64Value() and stored in the result
+ * @return newly created TightDataPointStorageI describing the quantized data
+ */
+TightDataPointStorageI* SZ_compress_uint64_3D_MDQ(uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, uint64_t valueRangeSize, uint64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint64_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue;
+	int64_t diff = 0;
+	double itvNum = 0;
+	uint64_t *P0, *P1;
+		
+	size_t dataLength = r1*r2*r3;		
+
+	size_t r23 = r2*r3;
+	P0 = (uint64_t*)malloc(r23*sizeof(uint64_t));
+	P1 = (uint64_t*)malloc(r23*sizeof(uint64_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	uint64_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	/* Layer-0 row-0 data 0 is always stored exactly */
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressUInt64Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = (int64_t)(spaceFillingValue[1]) - (int64_t)(pred1D);
+
+	itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt64Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = (int64_t)(spaceFillingValue[j]) - (int64_t)(pred1D);
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred1D);
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P1[index] = spaceFillingValue[index];
+			compressUInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred2D);
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P1[index] = spaceFillingValue[index];
+				compressUInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		pred1D = P1[0];
+		diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred1D);
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred2D);
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred2D);
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+				//index = k*r2*r3 + i*r3 + j;			
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred3D);
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt64Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+		/* the layer just built becomes the "previous layer" of the next iteration */
+		uint64_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	/* Both buffers are separate allocations, so both are always released
+	 * (skipping P0 when r23==1 leaked it). */
+	free(P0);
+	free(P1);
+
+	/* Number of exactly stored values, computed the same way as in the 1D routine. */
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT64);
+			
+	//Only the wrapper struct is freed here; exactDataByteArray->array was handed
+	//to new_TightDataPointStorageI above (presumably owned by tdps from now on).
+	free(type);	
+	free(exactDataByteArray);
+	
+	return tdps;	
+}
+
+
+/**
+ * Compress a 3D uint64 array without range checking or lossless post-pass.
+ *
+ * Runs the 3D quantizer and flattens the result with
+ * convertTDPStoFlatBytes_int(). If the flattened stream is larger than the raw
+ * data (r1*r2*r3*sizeof(uint64_t)), the original values are stored instead.
+ *
+ * @param newByteData     receives the output buffer (caller frees it)
+ * @param oriData         input values
+ * @param r1,r2,r3        extents of the array
+ * @param realPrecision   absolute error bound
+ * @param outSize         receives the output length in bytes
+ * @param valueRangeSize  value range of the input
+ * @param minValue        minimum value of the input
+ */
+void SZ_compress_args_uint64_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+uint64_t valueRangeSize, uint64_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint64_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2*r3;
+	if(*outSize>dataLength*sizeof(uint64_t))
+	{
+		//StoreOriData overwrites *newByteData with a new allocation, so release
+		//the flattened buffer first (assumed heap-allocated: callers free() the
+		//buffer returned through newByteData).
+		free(*newByteData);
+		SZ_compress_args_uint64_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	
+	free_TightDataPointStorageI(tdps);	
+}
+
+
+/**
+ * Quantize a 4D uint64 array (r1 blocks of r2 layers of r3 rows of r4 values).
+ *
+ * Each of the r1 blocks is processed independently as a 3D array: the first
+ * value of a block is stored exactly, the rest of its first row uses 1D
+ * prediction, the rest of its first layer uses 2D prediction
+ * (left + up - upLeft), and later layers use 1D/2D prediction on their first
+ * row/column and the 7-neighbour 3D stencil elsewhere. A value whose interval
+ * count stays below exe_params->intvCapacity receives an interval code
+ * (intvRadius + signed interval index); otherwise it receives code 0 and the
+ * value itself is serialized with compressUInt64Value() into the exact-data
+ * byte array.
+ *
+ * P1 holds the reconstructed previous layer (r3*r4 values) and P0 the layer
+ * being built; the two buffers are swapped after every layer.
+ *
+ * @param oriData         input values, indexed as l*r2*r3*r4 + k*r3*r4 + i*r4 + j
+ * @param r1,r2,r3,r4     extents, r4 being the fastest-varying one
+ * @param realPrecision   absolute error bound
+ * @param valueRangeSize  value range, used to pick the exact-value byte width
+ * @param minValue        passed to compressUInt64Value() and stored in the result
+ * @return newly created TightDataPointStorageI describing the quantized data
+ */
+TightDataPointStorageI* SZ_compress_uint64_4D_MDQ(uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, uint64_t valueRangeSize, uint64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint64_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue;
+	int64_t diff = 0;
+	double itvNum = 0;
+	uint64_t *P0, *P1;
+		
+	size_t dataLength = r1*r2*r3*r4;		
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	P0 = (uint64_t*)malloc(r34*sizeof(uint64_t));
+	P1 = (uint64_t*)malloc(r34*sizeof(uint64_t));
+	
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	uint64_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: always stored exactly */
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		curValue = P1[index2D] = spaceFillingValue[index];
+		compressUInt64Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Process Row-0 data 1*/
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		/* Compare the actual value at this position with the prediction
+		 * (curValue still holds the previous element, i.e. the predictor). */
+		diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred1D);
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+
+			/* store the value at this position, not element 0 */
+			curValue = P1[index2D] = spaceFillingValue[index];
+			compressUInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process Row-0 data 2 --> data r4-1 */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred1D);
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0 */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4];
+			diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred1D);
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process row-i data 1 --> data r4-1*/
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+
+				diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred2D);
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P1[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P1[index2D] = spaceFillingValue[index];
+					compressUInt64Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0*/
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred1D);
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred2D);
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt64Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0 */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred2D);
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt64Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred3D);
+
+
+					itvNum = llabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+						P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						type[index] = 0;
+
+						curValue = P0[index2D] = spaceFillingValue[index];
+						compressUInt64Value(curValue, minValue, byteSize, bytes);
+						memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+					}
+				}
+			}
+
+			/* the layer just built becomes the "previous layer" of the next iteration */
+			uint64_t *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+
+	/* Number of exactly stored values, computed the same way as in the 1D routine. */
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT64);
+			
+	//Only the wrapper struct is freed here; exactDataByteArray->array was handed
+	//to new_TightDataPointStorageI above (presumably owned by tdps from now on).
+	free(type);	
+	free(exactDataByteArray);
+	
+	return tdps;	
+}
+
+/**
+ * Compress a 4D uint64 array into a flat byte stream (no range check, no lossless post-pass).
+ *
+ * Runs SZ_compress_uint64_4D_MDQ, flattens the result with convertTDPStoFlatBytes_int and,
+ * when the flattened stream is larger than the raw input (r1*r2*r3*r4*sizeof(uint64_t) bytes),
+ * falls back to SZ_compress_args_uint64_StoreOriData.
+ *
+ * @param newByteData    receives the output buffer
+ * @param oriData        input values, r1*r2*r3*r4 elements
+ * @param r1,r2,r3,r4    dimension sizes, forwarded unchanged to the 4D quantizer
+ * @param realPrecision  error bound forwarded to the quantizer
+ * @param outSize        receives the output size in bytes
+ * @param valueRangeSize value range forwarded to the quantizer
+ * @param minValue       minimum value forwarded to the quantizer
+ *
+ * NOTE(review): the buffer produced by convertTDPStoFlatBytes_int is not released here before
+ * the fallback call -- confirm SZ_compress_args_uint64_StoreOriData does not simply overwrite
+ * *newByteData (which would leak it).
+ */
+void SZ_compress_args_uint64_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, uint64_t valueRangeSize, uint64_t minValue)
+{
+	const size_t elementCount = r1*r2*r3*r4;
+	TightDataPointStorageI* storage = SZ_compress_uint64_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+
+	/* The compressed form is bigger than the raw data: store the raw data instead. */
+	if(elementCount*sizeof(uint64_t) < *outSize)
+	{
+		SZ_compress_args_uint64_StoreOriData(oriData, elementCount, storage, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Emit the "all values are the same" encoding for a uint64 array.
+ *
+ * Builds a minimal TightDataPointStorageI flagged allSameData=1 whose only payload is
+ * oriData[0] serialised as 8 big-endian bytes, flattens it with convertTDPStoFlatBytes_int
+ * and releases the temporary storage.
+ *
+ * @param newByteData receives the output buffer
+ * @param oriData     input values; only element 0 is read
+ * @param dataLength  element count recorded as dataSeriesLength
+ * @param outSize     receives the output size in bytes
+ */
+void SZ_compress_args_uint64_withinRange(unsigned char** newByteData, uint64_t *oriData, size_t dataLength, size_t *outSize)
+{
+	size_t flatSize;
+	TightDataPointStorageI* storage = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+
+	/* Only the fields set below are initialised; the rest of the struct is left as allocated. */
+	storage->allSameData = 1;
+	storage->isLossless = 0;
+	storage->typeArray = NULL;
+	storage->dataSeriesLength = dataLength;
+
+	/* A single exact value, 8 bytes wide. */
+	storage->exactDataNum = 1;
+	storage->exactDataBytes_size = 8;
+	storage->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char)*8);
+	int64ToBytes_bigEndian(storage->exactDataBytes, oriData[0]);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, &flatSize);
+	*outSize = flatSize;
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Compress a uint64 array with a value-range check but without the lossless post-pass.
+ *
+ * The value range and minimum are obtained from computeRangeSize_int and the effective error
+ * bound from getRealPrecision_int. If the whole range fits inside the error bound the
+ * "all same" encoding is emitted; otherwise the call is dispatched on dimensionality
+ * (a dimension of 0 means "unused"; r1 is always used).
+ *
+ * @param newByteData   receives the output buffer
+ * @param oriData       input values
+ * @param r5,r4,r3,r2,r1 dimension sizes
+ * @param outSize       receives the output size in bytes
+ * @param errBoundMode  error-bound mode forwarded to getRealPrecision_int
+ * @param absErr_Bound  absolute error bound
+ * @param relBoundRatio relative error bound ratio
+ * @return SZ_SCES on success (or whatever status getRealPrecision_int stored),
+ *         SZ_DERR when five dimensions are requested.
+ */
+int SZ_compress_args_uint64_wRngeNoGzip(unsigned char** newByteData, uint64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+	
+	uint64_t minValue = computeRangeSize_int(oriData, SZ_UINT64, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_uint64_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		if(r5==0&&r4==0&&r3==0&&r2==0)
+		{
+			SZ_compress_args_uint64_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0&&r3==0)
+		{
+			SZ_compress_args_uint64_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0)
+		{
+			SZ_compress_args_uint64_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0)
+		{
+			/* 4D input is folded into 3D by merging the two highest dimensions.
+			 * NOTE(review): SZ_compress_args_uint64 uses the dedicated 4D routine instead --
+			 * confirm which layout the matching decompressor expects before unifying. */
+			SZ_compress_args_uint64_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			/* Previously 5D input fell through silently: SZ_SCES was returned while
+			 * *newByteData and *outSize were never written. Report it explicitly. */
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			*newByteData = NULL;
+			*outSize = 0;
+			status = SZ_DERR; //dimension error
+		}
+	}
+	return status;
+}
+
+/**
+ * Top-level uint64 compression entry point.
+ *
+ * Records the error-bound mode in confparams_cpr, derives the effective error bound
+ * (from PSNR when that mode is selected, otherwise via getRealPrecision_int), then either
+ * emits the "all same" encoding or dispatches on dimensionality (a dimension of 0 means
+ * "unused") and finally applies the lossless post-pass selected by confparams_cpr->szMode.
+ *
+ * @param newByteData   receives the output buffer (NULL on dimension/mode error)
+ * @param oriData       input values
+ * @param r5,r4,r3,r2,r1 dimension sizes
+ * @param outSize       receives the output size in bytes (0 on dimension/mode error)
+ * @param errBoundMode  error-bound mode; modes >= PW_REL are rejected
+ * @param absErr_Bound  absolute error bound
+ * @param relBoundRatio relative error bound ratio
+ * @return SZ_SCES on success (or whatever status getRealPrecision_int stored),
+ *         SZ_DERR for 5D input, SZ_MERR for an unknown szMode.
+ */
+int SZ_compress_args_uint64(unsigned char** newByteData, uint64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		/* NOTE(review): terminates the whole process with a success exit code; the return
+		 * below is unreachable. Kept as-is because callers may rely on it -- confirm. */
+		exit(0);
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	uint64_t minValue = (uint64_t)computeRangeSize_int(oriData, SZ_UINT64, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		/* PSNR mode is converted into an absolute bound and stored back into the config. */
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_uint64_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL;
+		if (r2==0)
+		{
+			SZ_compress_args_uint64_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r3==0)
+		{
+			SZ_compress_args_uint64_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r4==0)
+		{
+			SZ_compress_args_uint64_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r5==0)
+		{
+			SZ_compress_args_uint64_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			/* Nothing was compressed, so there is no temporary buffer. Stop here instead of
+			 * passing an uninitialised pointer to the caller or to the lossless pass/free(). */
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			*newByteData = NULL;
+			*outSize = 0;
+			return SZ_DERR; //dimension error
+		}
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			/* Ownership of the temporary buffer moves to the caller. */
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the uint64_t compression.\n");
+			/* The temporary buffer is neither returned nor compressed: release it. */
+			free(tmpByteData);
+			*newByteData = NULL;
+			*outSize = 0;
+			status = SZ_MERR; //mode error			
+		}
+	}
+	
+	return status;
+}
diff --git a/deps/SZ/sz/src/sz_uint8.c b/deps/SZ/sz/src/sz_uint8.c
new file mode 100644
index 0000000000000000000000000000000000000000..6865564dd9e8304de3bf973227541775e13b80ea
--- /dev/null
+++ b/deps/SZ/sz/src/sz_uint8.c
@@ -0,0 +1,1385 @@
+/**
+ *  @file sz_uint8.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_uint8, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_uint8.h"
+#include "utility.h"
+
+/**
+ * Estimate the number of quantization intervals for a 1D uint8 series.
+ *
+ * Every sampleDistance-th element (starting from index 2) is predicted by its predecessor;
+ * the prediction errors are histogrammed by interval radius, and the smallest radius whose
+ * cumulative count exceeds predThreshold * (sample count) is selected.
+ *
+ * @param oriData       input values
+ * @param dataLength    number of elements
+ * @param realPrecision error bound used to scale the prediction error
+ * @return 2*(radius+1) rounded up by roundUpToPowerOf2, never less than 32
+ */
+unsigned int optimize_intervals_uint8_1D(uint8_t *oriData, size_t dataLength, double realPrecision)
+{	
+	const size_t totalSampleSize = dataLength/confparams_cpr->sampleDistance;
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t pos, bucket;
+	int64_t predicted, absErr;
+
+	for(pos=2;pos<dataLength;pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+
+		predicted = oriData[pos-1];
+		absErr = llabs(predicted - oriData[pos]);
+		bucket = (uint64_t)((absErr/realPrecision+1)/2);
+		/* Errors beyond the last bucket are all counted in the last bucket. */
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	/* Walk the histogram until enough samples are covered. */
+	const size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>targetCount)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	free(histogram);
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/**
+ * Estimate the number of quantization intervals for a 2D uint8 array (r1 rows of r2 values).
+ *
+ * Interior points with (row+col) divisible by sampleDistance are predicted from their left,
+ * upper and upper-left neighbours; the prediction errors are histogrammed by interval radius
+ * and the smallest radius whose cumulative count exceeds predThreshold * (sample count) wins.
+ *
+ * @param oriData       input values, row-major
+ * @param r1            number of rows
+ * @param r2            number of values per row
+ * @param realPrecision error bound used to scale the prediction error
+ * @return 2*(radius+1) rounded up by roundUpToPowerOf2, never less than 32
+ */
+unsigned int optimize_intervals_uint8_2D(uint8_t *oriData, size_t r1, size_t r2, double realPrecision)
+{	
+	const size_t totalSampleSize = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t row, col, pos, bucket;
+	int64_t predicted, absErr;
+
+	for(row=1;row<r1;row++)
+	{
+		for(col=1;col<r2;col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue;
+
+			pos = row*r2+col;
+			predicted = oriData[pos-1] + oriData[pos-r2] - oriData[pos-r2-1];
+			absErr = llabs(predicted - oriData[pos]);
+			bucket = (uint64_t)((absErr/realPrecision+1)/2);
+			/* Errors beyond the last bucket are all counted in the last bucket. */
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1;
+			histogram[bucket]++;
+		}
+	}
+
+	/* Walk the histogram until enough samples are covered. */
+	const size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>targetCount)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	free(histogram);
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/**
+ * Estimate the number of quantization intervals for a 3D uint8 array (r1 x r2 x r3).
+ *
+ * Interior points with (i+j+k) divisible by sampleDistance are predicted from the seven
+ * neighbours at offsets -1, -r3 and -r2*r3 (and their combinations); the prediction errors
+ * are histogrammed by interval radius and the smallest radius whose cumulative count exceeds
+ * predThreshold * (sample count) is selected.
+ *
+ * @param oriData       input values, r3 varying fastest
+ * @param r1,r2,r3      dimension sizes
+ * @param realPrecision error bound used to scale the prediction error
+ * @return 2*(radius+1) rounded up by roundUpToPowerOf2, never less than 32
+ */
+unsigned int optimize_intervals_uint8_3D(uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{	
+	const size_t plane = r2*r3;
+	const size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t layer, row, col, pos, bucket;
+	int64_t predicted, absErr;
+
+	for(layer=1;layer<r1;layer++)
+	{
+		for(row=1;row<r2;row++)
+		{
+			for(col=1;col<r3;col++)
+			{			
+				if((layer+row+col)%confparams_cpr->sampleDistance!=0)
+					continue;
+
+				pos = layer*plane+row*r3+col;
+				predicted = oriData[pos-1] + oriData[pos-r3] + oriData[pos-plane] 
+				- oriData[pos-1-plane] - oriData[pos-r3-1] - oriData[pos-r3-plane] + oriData[pos-r3-plane-1];
+				absErr = llabs(predicted - oriData[pos]);
+				bucket = (absErr/realPrecision+1)/2;
+				/* Errors beyond the last bucket are all counted in the last bucket. */
+				if(bucket>=confparams_cpr->maxRangeRadius)
+					bucket = confparams_cpr->maxRangeRadius - 1;
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	/* Walk the histogram until enough samples are covered. */
+	const size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>targetCount)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	free(histogram);
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+
+/**
+ * Estimate the number of quantization intervals for a 4D uint8 array (r1 x r2 x r3 x r4).
+ *
+ * Interior points with (i+j+k+l) divisible by sampleDistance are predicted from the seven
+ * neighbours at offsets -1, -r4 and -r3*r4 (and their combinations), i.e. the same stencil
+ * as the 3D variant applied to the three fastest-varying dimensions. The prediction errors
+ * are histogrammed by interval radius and the smallest radius whose cumulative count exceeds
+ * predThreshold * (sample count) is selected.
+ *
+ * @param oriData       input values, r4 varying fastest
+ * @param r1,r2,r3,r4   dimension sizes
+ * @param realPrecision error bound used to scale the prediction error
+ * @return 2*(radius+1) rounded up by roundUpToPowerOf2, never less than 32
+ */
+unsigned int optimize_intervals_uint8_4D(uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						/* The row stride is r4: the second term previously used index-r3,
+						 * which picked an unrelated element whenever r3 != r4. */
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						/* Errors beyond the last bucket are all counted in the last bucket. */
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Quantize a 1D uint8 series into a TightDataPointStorageI.
+ *
+ * The first two elements are always stored exactly. Every later element is predicted by the
+ * most recently reconstructed value: if the prediction error is below
+ * (intvCapacity-1)*realPrecision the element is encoded as a quantization code
+ * (intvRadius +/- state), otherwise it gets code 0 and its value is stored exactly.
+ *
+ * @param oriData        input values
+ * @param dataLength     number of elements (inputs shorter than 2 are handled)
+ * @param realPrecision  error bound
+ * @param valueRangeSize value range, used to size each exactly stored value
+ * @param minValue       minimum value, forwarded to compressUInt8Value
+ * @return newly created storage; the caller releases it with free_TightDataPointStorageI
+ */
+TightDataPointStorageI* SZ_compress_uint8_1D_MDQ(uint8_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_uint8_1D(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+	size_t i;
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+		
+	uint8_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+		
+	int64_t last3CmprsData[3] = {0,0,0};
+				
+	/* Store the leading (up to two) elements exactly. The bound on dataLength keeps inputs
+	 * with fewer than two elements from reading past oriData and writing past type[]. */
+	for(i=0;i<dataLength && i<2;i++)
+	{
+		type[i] = 0;
+		compressUInt8Value(spaceFillingValue[i], minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, spaceFillingValue[i]);
+	}
+	
+	int state;
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	int64_t curData;
+	int64_t pred, predAbsErr;
+	double interval = 2*realPrecision;
+	
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		/* Predict with the latest reconstructed value. */
+		pred = last3CmprsData[0];
+		predAbsErr = llabs(curData - pred);	
+		if(predAbsErr<checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			/* Keep the reconstructed value inside the uint8 range. */
+			if(pred>SZ_UINT8_MAX) pred = SZ_UINT8_MAX;
+			if(pred<SZ_UINT8_MIN) pred = SZ_UINT8_MIN;			
+			listAdd_int(last3CmprsData, pred);					
+			continue;
+		}
+		
+		//unpredictable data processing		
+		type[i] = 0;
+		compressUInt8Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, curData);
+	}//end of for
+		
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT8);
+	
+	//free memory
+	free(type);	
+	/* Only the wrapper struct is freed here; exactDataByteArray->array was handed to
+	 * new_TightDataPointStorageI above. NOTE(review): presumably it is released later by
+	 * free_TightDataPointStorageI(tdps) -- confirm. */
+	free(exactDataByteArray);
+	
+	return tdps;
+}
+
+/**
+ * Build a "lossless" output stream that carries the original uint8 values unmodified.
+ *
+ * Layout written: 3 version bytes, 1 flag byte (16 when SZ_SIZE_TYPE is 4, otherwise 80),
+ * MetaDataByteLength bytes from convertSZParamsToBytes, SZ_SIZE_TYPE bytes holding
+ * dataLength, then the dataLength raw values. tdps->isLossless is set to 1.
+ *
+ * @param oriData     values to store
+ * @param dataLength  number of values
+ * @param tdps        storage object; only its isLossless flag is updated
+ * @param newByteData receives a freshly malloc'ed buffer (any previous pointer value is
+ *                    overwritten without being freed)
+ * @param outSize     receives the total stream size in bytes
+ */
+void SZ_compress_args_uint8_StoreOriData(uint8_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	const int elemSize = sizeof(uint8_t);
+	const size_t headerLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE;
+	const size_t totalByteLength = headerLength + elemSize*dataLength;
+	unsigned char lengthBytes[8];
+	unsigned char* out;
+	unsigned char* cursor;
+	size_t b;
+
+	tdps->isLossless = 1;
+	out = (unsigned char*)malloc(totalByteLength);
+	*newByteData = out;
+	cursor = out;
+
+	/* Version bytes. */
+	for (b = 0; b < 3; b++)
+		*cursor++ = versionNumber[b];
+
+	/* Flag byte: 00010000, plus 01000000 when SZ_SIZE_TYPE is 8. */
+	*cursor++ = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	/* Compression parameters. */
+	convertSZParamsToBytes(confparams_cpr, cursor);
+	cursor += MetaDataByteLength;
+
+	/* Element count, SZ_SIZE_TYPE (4 or 8) bytes. */
+	sizeToBytes(lengthBytes, dataLength);
+	for (b = 0; b < exe_params->SZ_SIZE_TYPE; b++)
+		*cursor++ = lengthBytes[b];
+
+	/* Payload: elements are one byte wide, so byte order is irrelevant and a plain copy
+	 * is equivalent on every platform. */
+	memcpy(out + headerLength, oriData, dataLength*elemSize);
+
+	*outSize = totalByteLength;
+}
+
+/**
+ * Compress a 1D uint8 series into a flat byte stream (no range check, no lossless post-pass).
+ *
+ * Runs SZ_compress_uint8_1D_MDQ and flattens the result with convertTDPStoFlatBytes_int.
+ * When the flattened stream is larger than the raw input, it is discarded and replaced by
+ * the raw-data stream built by SZ_compress_args_uint8_StoreOriData.
+ *
+ * @param newByteData    receives the output buffer
+ * @param oriData        input values, dataLength elements
+ * @param dataLength     number of elements
+ * @param realPrecision  error bound
+ * @param outSize        receives the output size in bytes
+ * @param valueRangeSize value range forwarded to the quantizer
+ * @param minValue       minimum value forwarded to the quantizer
+ */
+void SZ_compress_args_uint8_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint8_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint8_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint8_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	if(*outSize > dataLength*sizeof(uint8_t))
+	{
+		/* StoreOriData allocates a new buffer and overwrites *newByteData, so release the
+		 * flattened stream first to avoid leaking it. */
+		free(*newByteData);
+		*newByteData = NULL;
+		/* Pass the real element count: oriData holds dataLength values, and the previous
+		 * dataLength+2 made StoreOriData read two bytes past the end of the input. */
+		SZ_compress_args_uint8_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Quantize a 2D uint8 array (r1 rows of r2 values) into a TightDataPointStorageI.
+ *
+ * Rows are processed in order while two row buffers hold reconstructed values: P1 is the
+ * previous row, P0 the row being built; they are swapped after every row. Each element is
+ * predicted from already reconstructed neighbours. If the scaled prediction error is below
+ * exe_params->intvCapacity the element gets a quantization code
+ * ((int)(itvNum/2) + intvRadius) and its reconstructed value is clamped to the uint8 range;
+ * otherwise it gets code 0 and its value is appended to the exact-data byte array.
+ *
+ * @param oriData        input values, row-major
+ * @param r1             number of rows
+ * @param r2             number of values per row
+ * @param realPrecision  error bound
+ * @param valueRangeSize value range, used to size each exactly stored value
+ * @param minValue       minimum value, forwarded to compressUInt8Value
+ * @return newly created storage object
+ *
+ * NOTE(review): element 1 of row 0 is processed unconditionally, so r2==1 writes P1[1]
+ * outside the r2-sized buffer (and type[1] when r1*r2==1) -- confirm callers never pass
+ * such shapes.
+ */
+TightDataPointStorageI* SZ_compress_uint8_2D_MDQ(uint8_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint8_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j; 
+	int64_t pred1D, pred2D, curValue, tmp;
+	int diff = 0.0;
+	double itvNum = 0;
+	uint8_t *P0, *P1;
+		
+	size_t dataLength = r1*r2;	
+	
+	/* Two zero-initialised row buffers of reconstructed values. */
+	P0 = (uint8_t*)malloc(r2*sizeof(uint8_t));
+	memset(P0, 0, r2*sizeof(uint8_t));
+	P1 = (uint8_t*)malloc(r2*sizeof(uint8_t));
+	memset(P1, 0, r2*sizeof(uint8_t));
+		
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;
+		
+	uint8_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	/* Element (0,0) is always stored exactly. */
+	type[0] = 0;
+	curValue = P1[0] = spaceFillingValue[0];
+	compressUInt8Value(curValue, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	/* Predicted by its left neighbour only. */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum =  llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+		/* Clamp the reconstructed value to the uint8 range. */
+		if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+			P1[1] = tmp;
+		else if(tmp < SZ_UINT8_MIN)
+			P1[1] = SZ_UINT8_MIN;
+		else
+			P1[1] = SZ_UINT8_MAX;
+	}
+	else
+	{
+		/* Unpredictable: code 0, value stored exactly. */
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt8Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		/* Linear extrapolation from the two previous reconstructed values. */
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				P1[j] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				P1[j] = SZ_UINT8_MIN;
+			else
+				P1[j] = SZ_UINT8_MAX;			
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		/* Predicted by the element directly above (first value of the previous row). */
+		index = i*r2;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				P0[0] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				P0[0] = SZ_UINT8_MIN;
+			else
+				P0[0] = SZ_UINT8_MAX;			
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			/* left + above - above-left, all taken from reconstructed values. */
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					P0[j] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					P0[j] = SZ_UINT8_MIN;
+				else
+					P0[j] = SZ_UINT8_MAX;						
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* The row just built becomes the "previous row" for the next iteration. */
+		uint8_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	
+	/* NOTE(review): P0 is deliberately not freed when r2==1, so it leaks in that case;
+	 * presumably a workaround for the out-of-bounds P1[1] write above -- confirm. */
+	if(r2!=1)
+		free(P0);
+	free(P1);			
+	
+	/* NOTE(review): this is a byte count, whereas the 1D variant divides by byteSize to get
+	 * an element count; the two agree only when byteSize is 1 -- confirm intent. */
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT8);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper struct is freed here; exactDataByteArray->array was handed to new_TightDataPointStorageI (presumably released with tdps -- confirm)
+	
+	return tdps;	
+}
+
+/**
+ * Compress a 2D uint8 array into a flat byte stream (no range check, no lossless post-pass).
+ *
+ * Runs SZ_compress_uint8_2D_MDQ and flattens the result with convertTDPStoFlatBytes_int.
+ * When the flattened stream is larger than the raw input (r1*r2 bytes), it is discarded and
+ * replaced by the raw-data stream built by SZ_compress_args_uint8_StoreOriData.
+ *
+ * Note: @r1 is high dimension
+ * 		 @r2 is low dimension 
+ *
+ * @param newByteData    receives the output buffer
+ * @param oriData        input values, r1*r2 elements
+ * @param realPrecision  error bound
+ * @param outSize        receives the output size in bytes
+ * @param valueRangeSize value range forwarded to the quantizer
+ * @param minValue       minimum value forwarded to the quantizer
+ * */
+void SZ_compress_args_uint8_NoCkRngeNoGzip_2D(unsigned char** newByteData, uint8_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, uint8_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint8_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2;
+	if(*outSize>dataLength*sizeof(uint8_t))
+	{
+		/* StoreOriData allocates a new buffer and overwrites *newByteData, so release the
+		 * flattened stream first to avoid leaking it. */
+		free(*newByteData);
+		*newByteData = NULL;
+		SZ_compress_args_uint8_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	
+	free_TightDataPointStorageI(tdps);	
+}
+
+TightDataPointStorageI* SZ_compress_uint8_3D_MDQ(uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint8_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue, tmp;
+	int diff = 0.0;
+	double itvNum = 0;
+	uint8_t *P0, *P1;
+		
+	size_t dataLength = r1*r2*r3;		
+
+	size_t r23 = r2*r3;
+	P0 = (uint8_t*)malloc(r23*sizeof(uint8_t));
+	P1 = (uint8_t*)malloc(r23*sizeof(uint8_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	uint8_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressUInt8Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+			P1[1] = tmp;
+		else if(tmp < SZ_UINT8_MIN)
+			P1[1] = SZ_UINT8_MIN;
+		else
+			P1[1] = SZ_UINT8_MAX;		
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt8Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				P1[j] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				P1[j] = SZ_UINT8_MIN;
+			else
+				P1[j] = SZ_UINT8_MAX;			
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				P1[index] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				P1[index] = SZ_UINT8_MIN;
+			else
+				P1[index] = SZ_UINT8_MAX;			
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P1[index] = spaceFillingValue[index];
+			compressUInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					P1[index] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					P1[index] = SZ_UINT8_MIN;
+				else
+					P1[index] = SZ_UINT8_MAX;				
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P1[index] = spaceFillingValue[index];
+				compressUInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				P0[0] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				P0[0] = SZ_UINT8_MIN;
+			else
+				P0[0] = SZ_UINT8_MAX;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					P0[j] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					P0[j] = SZ_UINT8_MIN;
+				else
+					P0[j] = SZ_UINT8_MAX;				
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					P0[index2D] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					P0[index2D] = SZ_UINT8_MIN;
+				else
+					P0[index2D] = SZ_UINT8_MAX;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+//				if(k==63&&i==43&&j==27)
+//					printf("i=%d\n", i);
+				//index = k*r2*r3 + i*r3 + j;			
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_UINT8_MIN)
+						P0[index2D] = SZ_UINT8_MIN;
+					else
+						P0[index2D] = SZ_UINT8_MAX;
+				}
+				else
+				{
+					type[index] = 0;
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt8Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+		uint8_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1)
+		free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT8);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //exactDataByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;	
+}
+
+
+/* 3D uint8 path: quantize, flatten, and call SZ_compress_args_uint8_StoreOriData when the flattened stream exceeds the input size. */
+void SZ_compress_args_uint8_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{
+	const size_t elemCount = r1*r2*r3;
+	TightDataPointStorageI* storage = SZ_compress_uint8_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+	/* One byte per input element: a larger stream means quantization did not pay off. */
+	if(elemCount*sizeof(uint8_t) < *outSize)
+		SZ_compress_args_uint8_StoreOriData(oriData, elemCount, storage, newByteData, outSize);
+	free_TightDataPointStorageI(storage);
+}
+
+
+/*
+ * Quantize a 4D uint8 array of r1 x r2 x r3 x r4 elements, addressed as l*r2*r3*r4 + k*r3*r4 + i*r4 + j.
+ * Each of the r1 outer slices is coded on its own: its first r3 x r4 layer uses 1D/2D predictors,
+ * the following layers also use the previously reconstructed layer (P1).
+ * A point whose interval number stays below exe_params->intvCapacity gets a non-zero type code;
+ * otherwise its type is 0 and its exact value is appended to the exact-data byte array (byteSize bytes).
+ * Returns the storage object filled in by new_TightDataPointStorageI.
+ */
+TightDataPointStorageI* SZ_compress_uint8_4D_MDQ(uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint8_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue, tmp;
+	int64_t diff = 0; // same width as the predictors and as llabs() expects
+	double itvNum = 0;
+	uint8_t *P0, *P1; // reconstructed layers: P1 = previous one, P0 = the one being built (layer 0 is built in P1)
+	size_t dataLength = r1*r2*r3*r4;		
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+	P0 = (uint8_t*)malloc(r34*sizeof(uint8_t));
+	P1 = (uint8_t*)malloc(r34*sizeof(uint8_t));
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	uint8_t* spaceFillingValue = oriData; //
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		curValue = P1[index2D] = spaceFillingValue[index];
+		compressUInt8Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Process Row-0 data 1*/
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[index] - pred1D; // compare this point with its prediction (curValue still holds data 0 here)
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				P1[index2D] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				P1[index2D] = SZ_UINT8_MIN;
+			else
+				P1[index2D] = SZ_UINT8_MAX;			
+		}
+		else
+		{
+			type[index] = 0;
+
+			curValue = P1[index2D] = spaceFillingValue[index]; // store this point's own value, not element 0 of the array
+			compressUInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process Row-0 data 2 --> data r4-1 */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					P1[index2D] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					P1[index2D] = SZ_UINT8_MIN;
+				else
+					P1[index2D] = SZ_UINT8_MAX;					
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0 */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					P1[index2D] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					P1[index2D] = SZ_UINT8_MIN;
+				else
+					P1[index2D] = SZ_UINT8_MAX;					
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process row-i data 1 --> data r4-1*/
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+						P1[index2D] = tmp;
+					else if(tmp < SZ_UINT8_MIN)
+						P1[index2D] = SZ_UINT8_MIN;
+					else
+						P1[index2D] = SZ_UINT8_MAX;						
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P1[index2D] = spaceFillingValue[index];
+					compressUInt8Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0*/
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					P0[index2D] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					P0[index2D] = SZ_UINT8_MIN;
+				else
+					P0[index2D] = SZ_UINT8_MAX;					
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_UINT8_MIN)
+						P0[index2D] = SZ_UINT8_MIN;
+					else
+						P0[index2D] = SZ_UINT8_MAX;						
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt8Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0 */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_UINT8_MIN)
+						P0[index2D] = SZ_UINT8_MIN;
+					else
+						P0[index2D] = SZ_UINT8_MAX;						
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt8Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					diff = spaceFillingValue[index] - pred3D;
+
+
+					itvNum = llabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+						tmp = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+						if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+							P0[index2D] = tmp;
+						else if(tmp < SZ_UINT8_MIN)
+							P0[index2D] = SZ_UINT8_MIN;
+						else
+							P0[index2D] = SZ_UINT8_MAX;							
+					}
+					else
+					{
+						type[index] = 0;
+
+						curValue = P0[index2D] = spaceFillingValue[index];
+						compressUInt8Value(curValue, minValue, byteSize, bytes);
+						memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+					}
+				}
+			}
+
+			uint8_t *Pt; // swap: the layer just built becomes the "previous" layer
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT8);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //exactDataByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;	
+}
+
+/* 4D uint8 path: quantize with SZ_compress_uint8_4D_MDQ, flatten, and call StoreOriData when the flattened stream is larger than the input. */
+void SZ_compress_args_uint8_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	TightDataPointStorageI* storage;
+	const size_t elemCount = r1*r2*r3*r4;
+
+	storage = SZ_compress_uint8_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+	if(elemCount*sizeof(uint8_t) < *outSize)	/* stream larger than the one-byte-per-element input */
+		SZ_compress_args_uint8_StoreOriData(oriData, elemCount, storage, newByteData, outSize);
+	free_TightDataPointStorageI(storage);
+}
+
+/*
+ * Emit the stream used when the whole value range fits within the error bound:
+ * the storage is flagged allSameData and carries oriData[0] as its single exact byte.
+ */
+void SZ_compress_args_uint8_withinRange(unsigned char** newByteData, uint8_t *oriData, size_t dataLength, size_t *outSize)
+{
+	size_t flatSize;
+	TightDataPointStorageI* same = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+
+	/* Only the fields assigned below are initialised; the rest of the struct is left as malloc returned it. */
+	same->typeArray = NULL;
+	same->allSameData = 1;
+	same->isLossless = 0;
+	same->dataSeriesLength = dataLength;
+	same->exactDataNum = 1;
+	same->exactDataBytes_size = 1;
+	same->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char));
+	same->exactDataBytes[0] = oriData[0];	/* the one value that stands for the whole series */
+
+	convertTDPStoFlatBytes_int(same, newByteData, &flatSize);
+	*outSize = flatSize;
+	free_TightDataPointStorageI(same);
+}
+
+/*
+ * Compress a uint8 array after checking its value range, without the final
+ * lossless pass that SZ_compress_args_uint8 applies.
+ * The dimensionality is taken from which of r5..r2 are zero. A 4D input is
+ * handed to the 3D routine with r4 and r3 merged into one dimension.
+ * Returns status (initially SZ_SCES, also passed to getRealPrecision_int),
+ * or SZ_DERR when r5 is non-zero.
+ */
+int SZ_compress_args_uint8_wRngeNoGzip(unsigned char** newByteData, uint8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+	uint8_t minValue = computeRangeSize_int(oriData, SZ_UINT8, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+		SZ_compress_args_uint8_withinRange(newByteData, oriData, dataLength, outSize);
+	else if(r5==0&&r4==0&&r3==0&&r2==0)
+		SZ_compress_args_uint8_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+	else if(r5==0&&r4==0&&r3==0)
+		SZ_compress_args_uint8_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	else if(r5==0&&r4==0)
+		SZ_compress_args_uint8_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	else if(r5==0)
+		SZ_compress_args_uint8_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	else
+	{
+		// A non-zero r5 used to fall through silently: no output was written, yet SZ_SCES was returned.
+		printf("Error: doesn't support 5 dimensions for now.\n");
+		status = SZ_DERR; //dimension error
+	}
+
+	return status;
+}
+
+/*
+ * Compress a uint8 array of up to four dimensions: quantize it with the NoCkRngeNoGzip routine for
+ * its dimensionality, then, unless szMode is SZ_BEST_SPEED, run sz_lossless_compress on the result.
+ * Returns status (SZ_SCES unless getRealPrecision_int changes it), SZ_DERR for 5D input, or SZ_MERR for an unknown szMode.
+ */
+int SZ_compress_args_uint8(unsigned char** newByteData, uint8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		exit(0); // NOTE(review): ends the whole process with exit status 0 even though this is an error path
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+	uint8_t minValue = (uint8_t)computeRangeSize_int(oriData, SZ_UINT8, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		confparams_cpr->errorBoundMode = ABS; // from here on the PSNR target is handled as an absolute bound
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+		SZ_compress_args_uint8_withinRange(newByteData, oriData, dataLength, outSize);
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL;
+		if (r2==0)
+		{
+			SZ_compress_args_uint8_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else if (r3==0)
+		{
+			SZ_compress_args_uint8_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else if (r4==0)
+		{
+			SZ_compress_args_uint8_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else if (r5==0)
+		{
+			SZ_compress_args_uint8_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			// Nothing was compressed: report an empty result instead of passing the unset tmpByteData on.
+			*newByteData = NULL;
+			*outSize = 0;
+			return SZ_DERR; //dimension error
+		}
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the uint8_t compression.\n");
+			free(tmpByteData); // not handed to the caller on this path, so release it here
+			status = SZ_MERR; //mode error			
+		}
+	}
+
+	return status;
+}
diff --git a/deps/SZ/sz/src/szd_double.c b/deps/SZ/sz/src/szd_double.c
new file mode 100644
index 0000000000000000000000000000000000000000..cd3cd1c0d96cb5fea76857bfecf621d86dfbe896
--- /dev/null
+++ b/deps/SZ/sz/src/szd_double.c
@@ -0,0 +1,5699 @@
+/**
+ *  @file szd_double.c
+ *  @author Sheng Di, Dingwen Tao, Xin Liang, Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang
+ *  @date Aug, 2016
+ *  @brief Decompression routines for double-precision floating-point data.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "szd_double.h"
+#include "TightDataPointStorageD.h"
+#include "sz.h"
+#include "Huffman.h"
+#include "szd_double_pwr.h"
+#include "szd_double_ts.h"
+#include "utility.h"
+
+/*
+ * Decompress a double-precision SZ stream into *newData.
+ * A stream that went through the lossless compressor is expanded first; the values are then rebuilt
+ * according to the stream's isLossless flag, its sol_ID byte and the dimensionality given by r5..r1.
+ * Returns SZ_SCES, SZ_MERR for an unexpected szMode, or SZ_DERR for more than four dimensions.
+ */
+int SZ_decompress_args_double(double** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, 
+size_t cmpSize, int compressionType, double* hist_data)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	size_t targetUncompressSize = dataLength <<3; //i.e., *8
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 12+MetaDataByteLength_double+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;
+	if(cmpSize!=12+4+MetaDataByteLength_double && cmpSize!=12+8+MetaDataByteLength_double)
+	{
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->szMode!=SZ_TEMPORAL_COMPRESSION)
+		{
+			if(confparams_dec->losslessCompressor!=-1)
+				confparams_dec->szMode = SZ_BEST_COMPRESSION;
+			else
+				confparams_dec->szMode = SZ_BEST_SPEED;			
+		}
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;	// no lossless layer: parse the caller's buffer in place
+		}	
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION || confparams_dec->szMode==SZ_TEMPORAL_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 			
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength_double+exe_params->SZ_SIZE_TYPE);			
+		}
+		else
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes;
+
+	confparams_dec->sol_ID = szTmpBytes[4+14]; //szTmpBytes: version(3bytes), samebyte(1byte), [14]:sol_ID=SZ or SZ_Transpose		
+	TightDataPointStorageD* tdps;
+	int errBoundMode = new_TightDataPointStorageD_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+
+	int dim = computeDimension(r5,r4,r3,r2,r1);
+	int doubleSize = sizeof(double);
+	if(tdps->isLossless)
+	{
+		*newData = (double*)malloc(doubleSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength_double+exe_params->SZ_SIZE_TYPE, dataLength*doubleSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength_double+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=doubleSize)
+				(*newData)[i] = bytesToDouble(p);
+		}		
+	}
+	else if(confparams_dec->sol_ID==SZ_Transpose)
+	{
+		getSnapshotData_double_1D(newData,dataLength,tdps, errBoundMode, 0, hist_data);		
+	}
+	else //confparams_dec->sol_ID==SZ
+	{
+		if(tdps->raBytes_size > 0) //v2.0
+		{
+			if (dim == 1)
+				getSnapshotData_double_1D(newData,r1,tdps, errBoundMode, 0, hist_data);
+			else if(dim == 2)
+				decompressDataSeries_double_2D_nonblocked_with_blocked_regression(newData, r2, r1, tdps->raBytes, hist_data);
+			else if(dim == 3)
+				decompressDataSeries_double_3D_nonblocked_with_blocked_regression(newData, r3, r2, r1, tdps->raBytes, hist_data);
+			else if(dim == 4) // 4D is decoded by the 3D routine with r4 and r3 merged
+				decompressDataSeries_double_3D_nonblocked_with_blocked_regression(newData, r4*r3, r2, r1, tdps->raBytes, hist_data);
+			else
+			{
+				printf("Error: currently support only at most 4 dimensions!\n");
+				status = SZ_DERR;
+			}	
+		}
+		else //1.4.13 or time-based compression
+		{
+			if (dim == 1)
+				getSnapshotData_double_1D(newData,r1,tdps, errBoundMode, compressionType, hist_data);
+			else
+			if (dim == 2)
+				getSnapshotData_double_2D(newData,r2,r1,tdps, errBoundMode, compressionType, hist_data);
+			else
+			if (dim == 3)
+				getSnapshotData_double_3D(newData,r3,r2,r1,tdps, errBoundMode, compressionType, hist_data);
+			else
+			if (dim == 4)
+				getSnapshotData_double_4D(newData,r4,r3,r2,r1,tdps, errBoundMode, compressionType, hist_data);			
+			else
+			{
+				printf("Error: currently support only at most 4 dimensions!\n");
+				status = SZ_DERR;
+			}			
+		}
+	}	
+
+	if(status==SZ_SCES && confparams_dec->protectValueRange) // after a dimension error *newData was never produced
+	{
+		double* nd = *newData;
+		double min = confparams_dec->dmin;
+		double max = confparams_dec->dmax;		
+		for(i=0;i<dataLength;i++)
+		{
+			double v = nd[i];
+			if(v <= max && v >= min)
+				continue;
+			if(v < min)
+				nd[i] = min;
+			else if(v > max)
+				nd[i] = max;
+		}
+	}
+
+	free_TightDataPointStorageD2(tdps);
+	if(szTmpBytes!=cmpBytes) // release only a buffer produced by sz_lossless_decompress, never the caller's input
+		free(szTmpBytes);	
+	return status;
+}
+
+/*
+ * Rebuild a 1D double series from tdps into an array malloc'ed here and returned through *data.
+ * A point whose type code is 0 is restored from its stored bytes (leading bytes copied from the
+ * previous exact value, bytes from exactMidBytes, bits from residualMidBits) plus medianValue.
+ * Any other point is the previous output value plus (code - intvRadius) * 2 * realPrecision.
+ * With HAVE_TIMECMPR and SZ_TEMPORAL_COMPRESSION the result is also copied into hist_data.
+ */
+void decompressDataSeries_double_1D(double** data, size_t dataSeriesLength, double* hist_data, TightDataPointStorageD* tdps) 
+{
+	const int intvRadius = tdps->intervals/2;
+	const double interval = tdps->realPrecision*2;
+	size_t bitPos = 0;	// running bit count in residualMidBits; only bitPos % 8 is consumed
+	size_t midPos = 0;	// byte cursor into residualMidBits
+	size_t leadPos = 0;	// cursor into leadNum
+	size_t exactPos = 0;	// cursor into exactMidBytes
+	size_t n, b;
+	unsigned char* leadNum;
+
+	// Expand leadNumArray into leadNum (presumably one leading-byte count per exact value).
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+	double* out = (double*)malloc(sizeof(double)*dataSeriesLength);
+	*data = out;
+
+	// Huffman-decode one type code per data point.
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	unsigned char preBytes[8];	// bytes of the most recent exact value
+	unsigned char curBytes[8];	// bytes of the exact value being assembled
+	memset(preBytes, 0, 8);
+
+	// reqLength is split into whole bytes and leftover bits.
+	const int reqBytesLength = tdps->reqLength/8;
+	const int resiBitsLength = tdps->reqLength%8;
+	const double medianValue = tdps->medianValue;
+
+	for (n = 0; n < dataSeriesLength; n++) {
+		const int code = type[n];
+		if (code != 0) {
+			// Predicted point: previous output plus the decoded interval offset.
+			double predValue = out[n-1];
+			out[n] = predValue + (code-intvRadius)*interval;
+			continue;
+		}
+
+		// Exact point, step 1: fetch its residual bits, if any.
+		int resiBits = 0;
+		if (resiBitsLength != 0) {
+			int bitOffset = bitPos % 8;
+			int shift = getRightMovingSteps(bitOffset, resiBitsLength);
+			if (shift == 0) {
+				// no shift needed; the current byte is finished afterwards
+				int mask = getRightMovingCode(bitOffset, resiBitsLength);
+				resiBits = (tdps->residualMidBits[midPos] & mask);
+				midPos++;
+			} else if (shift > 0) {
+				// the bits sit inside the current byte, which stays current
+				int mask = getRightMovingCode(bitOffset, resiBitsLength);
+				resiBits = (tdps->residualMidBits[midPos] & mask) >> shift;
+			} else {
+				// the bits continue into the next byte
+				int headMask = getLeftMovingCode(bitOffset);
+				int tailMask = getRightMovingCode(bitOffset, resiBitsLength);
+				int up = -shift;
+				resiBits = (tdps->residualMidBits[midPos] & headMask) << up;
+				midPos++;
+				resiBits = resiBits | ((tdps->residualMidBits[midPos] & tailMask) >> (8 - up));
+			}
+			bitPos += resiBitsLength;
+		}
+
+		// Step 2: reassemble the byte image of the value.
+		memset(curBytes, 0, 8);
+		unsigned char leadingNum = leadNum[leadPos++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (b = leadingNum; b < reqBytesLength; b++)
+			curBytes[b] = tdps->exactMidBytes[exactPos++];
+		if (resiBitsLength != 0)
+			curBytes[reqBytesLength] = (unsigned char) (resiBits << (8 - resiBitsLength));
+
+		// Step 3: convert, add the median back, and remember the bytes for the next exact point.
+		double exactData = bytesToDouble(curBytes);
+		out[n] = exactData + medianValue;
+		memcpy(preBytes,curBytes,8);
+	}
+
+#ifdef HAVE_TIMECMPR
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(hist_data, out, dataSeriesLength*sizeof(double));
+#endif
+
+	// The decoded type codes and leading-byte counts are scratch data.
+	free(leadNum);
+	free(type);
+	return;
+}
+
+void decompressDataSeries_double_2D(double** data, size_t r1, size_t r2, double* hist_data, TightDataPointStorageD* tdps) 
+{
+	//updateQuantizationInfo(tdps->intervals);
+	int intvRadius = tdps->intervals/2;
+	//printf("tdps->intervals=%d, exe_params->intvRadius=%d\n", tdps->intervals, exe_params->intvRadius);
+	
+	size_t j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+	// in resiMidBits, p is to track the
+	// byte_index of resiMidBits, l is for
+	// leadNum
+	size_t dataSeriesLength = r1*r2;
+	//	printf ("%d %d\n", r1, r2);
+
+	unsigned char* leadNum;
+	double realPrecision = tdps->realPrecision;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[8];
+	unsigned char curBytes[8];
+
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0;
+	int reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	double medianValue, exactData;
+	int type_;
+
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+	medianValue = tdps->medianValue;
+
+	double pred1D, pred2D;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0 */
+
+	// compute resiBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data
+	memset(curBytes, 0, 8);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToDouble(curBytes);
+	(*data)[0] = exactData + medianValue;
+	memcpy(preBytes,curBytes,8);
+
+	/* Process Row-0, data 1 */
+	type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		(*data)[1] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+		
+		exactData = bytesToDouble(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,8);
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				
+			(*data)[jj] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];
+			(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+	}
+
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(hist_data, (*data), dataSeriesLength*sizeof(double));
+#endif	
+
+	free(leadNum);
+	free(type);
+	return;
+}
+
+void decompressDataSeries_double_3D(double** data, size_t r1, size_t r2, size_t r3, double* hist_data, TightDataPointStorageD* tdps) 
+{
+	// Decodes r1*r2*r3 doubles from tdps into a newly malloc'ed *data (index = kk*r2*r3 + ii*r3 + jj). Disabled call: updateQuantizationInfo(tdps->intervals);
+	int intvRadius = tdps->intervals/2;
+	size_t j, k = 0, p = 0, l = 0; // k: running bit position inside residualMidBits,
+	// p: index of the residualMidBits byte currently being read,
+	// l: index of the next leadNum entry (one entry is consumed
+	// per exactly stored value)
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3; // number of elements in one layer
+//	printf ("%d %d %d\n", r1, r2, r3);
+	// leadNum is handed to convertByteArray2IntArray_fast_2b by address and freed at the end of this function (presumably allocated by that helper -- confirm).
+	unsigned char* leadNum;
+	double realPrecision = tdps->realPrecision;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+	// Output buffer: it is not freed here, so the caller must release *data. NOTE(review): neither malloc below is NULL-checked.
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// Huffman-decode one int code per data point: 0 selects the exact-value path below, non-zero is a quantization code.
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+	// preBytes keeps the bytes of the most recently decoded exact value (all zeros initially); curBytes is the value being rebuilt.
+	unsigned char preBytes[8];
+	unsigned char curBytes[8];
+
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0; // read position in tdps->exactMidBytes
+	int reqBytesLength, resiBitsLength, resiBits;
+	unsigned char leadingNum;
+	double medianValue, exactData;
+	int type_;
+	// reqLength (in bits) is split into whole bytes plus leftover residual bits.
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+	medianValue = tdps->medianValue;
+
+	double pred1D, pred2D, pred3D;
+	size_t ii, jj, kk;
+
+	/////////////////////////// Process layer-0 (no previous layer to predict from) ///////////////////////////
+	/* Process Row-0 data 0: always decoded as an exact value; type[0] is not consulted */
+	// compute resiBits: pull the next resiBitsLength bits out of residualMidBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8; // bit offset of this residual within byte p
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) { // only byte p is read and p is not advanced
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) { // bits are combined from byte p and byte p+1
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0: masked bits are used unshifted and p moves to the next byte
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data: shared leading bytes + stored bytes + residual bits
+	memset(curBytes, 0, 8);
+	leadingNum = leadNum[l++]; // how many leading bytes are copied from the previous exact value
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++) // remaining whole bytes come from exactMidBytes
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength)); // left-align the residual bits in their byte
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToDouble(curBytes);
+	(*data)[0] = exactData + medianValue; // decoded bytes are an offset that is added to medianValue
+	memcpy(preBytes,curBytes,8);
+
+	/* Process Row-0, data 1: predicted from data 0. NOTE(review): index 1 is accessed unconditionally; presumably r3 >= 2 is expected -- confirm */
+	pred1D = (*data)[0];
+
+	type_ = type[1];
+	if (type_ != 0)
+	{
+		(*data)[1] = pred1D + 2 * (type_ - intvRadius) * realPrecision; // prediction plus a signed multiple of 2*realPrecision
+	}
+	else
+	{
+		// compute resiBits: pull the next resiBitsLength bits out of residualMidBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data: shared leading bytes + stored bytes + residual bits
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToDouble(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,8);
+	}
+
+	/* Process Row-0, data 2 --> data r3-1: predicted by linear extrapolation of the two previous values */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			(*data)[jj] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits: pull the next resiBitsLength bits out of residualMidBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data: shared leading bytes + stored bytes + residual bits
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 of layer 0 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0: predicted from the element one row up (index-r3) */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits: pull the next resiBitsLength bits out of residualMidBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data: shared leading bytes + stored bytes + residual bits
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process row-ii data 1 --> r3-1: predicted as left + up - up-left */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits: pull the next resiBitsLength bits out of residualMidBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data: shared leading bytes + stored bytes + residual bits
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+	}
+
+	/////////////////////////// Process layer-1 --> layer-r1-1 (the previous layer, index-r23, is now available) ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0: predicted from the same position in the previous layer (index-r23) */
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits: pull the next resiBitsLength bits out of residualMidBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data: shared leading bytes + stored bytes + residual bits
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process Row-0 data 1 --> data r3-1: predicted as left + previous-layer - previous-layer-left */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits: pull the next resiBitsLength bits out of residualMidBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data: shared leading bytes + stored bytes + residual bits
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 of layer kk */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0: predicted as up + previous-layer - previous-layer-up */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits: pull the next resiBitsLength bits out of residualMidBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data: shared leading bytes + stored bytes + residual bits
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+
+			/* Process Row-i data 1 --> data r3-1: predicted from the seven already-decoded neighbours at offsets 1, r3 and r23 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred3D + 2 * (type_ - intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits: pull the next resiBitsLength bits out of residualMidBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data: shared leading bytes + stored bytes + residual bits
+					memset(curBytes, 0, 8);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToDouble(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,8);
+				}
+			}
+		}
+	}
+
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(hist_data, (*data), dataSeriesLength*sizeof(double)); // copy the decoded block into hist_data; hist_data is not used anywhere else in this function
+#endif	
+
+	free(leadNum);
+	free(type);
+	return;
+}
+
+void decompressDataSeries_double_4D(double** data, size_t r1, size_t r2, size_t r3, size_t r4, double* hist_data, TightDataPointStorageD* tdps)
+{
+	//updateQuantizationInfo(tdps->intervals);
+	int intvRadius = tdps->intervals/2;
+	size_t j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+	// in resiMidBits, p is to track the
+	// byte_index of resiMidBits, l is for
+	// leadNum
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+//	printf ("%d %d %d\n", r1, r2, r3, r4);
+
+	unsigned char* leadNum;
+	double realPrecision = tdps->realPrecision;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[8];
+	unsigned char curBytes[8];
+
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0;
+	int reqBytesLength, resiBitsLength, resiBits;
+	unsigned char leadingNum;
+	double medianValue, exactData;
+	int type_;
+
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+	medianValue = tdps->medianValue;
+
+	double pred1D, pred2D, pred3D;
+	size_t ii, jj, kk, ll;
+	size_t index;
+
+	for (ll = 0; ll < r1; ll++)
+	{
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		index = ll*r234;
+
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToDouble(curBytes);
+		(*data)[index] = exactData + medianValue;
+		memcpy(preBytes,curBytes,8);
+
+		/* Process Row-0, data 1 */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process Row-0, data 2 --> data r4-1 */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0 */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+
+			/* Process row-ii data 1 --> r4-1*/
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 8);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToDouble(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,8);
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0*/
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 8);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToDouble(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,8);
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0 */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 8);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToDouble(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,8);
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						(*data)[index] = pred3D + 2 * (type_ - intvRadius) * realPrecision;
+					}
+					else
+					{
+						// compute resiBits
+						resiBits = 0;
+						if (resiBitsLength != 0) {
+							int kMod8 = k % 8;
+							int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+							if (rightMovSteps > 0) {
+								int code = getRightMovingCode(kMod8, resiBitsLength);
+								resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+							} else if (rightMovSteps < 0) {
+								int code1 = getLeftMovingCode(kMod8);
+								int code2 = getRightMovingCode(kMod8, resiBitsLength);
+								int leftMovSteps = -rightMovSteps;
+								rightMovSteps = 8 - leftMovSteps;
+								resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+								p++;
+								resiBits = resiBits
+										| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+							} else // rightMovSteps == 0
+							{
+								int code = getRightMovingCode(kMod8, resiBitsLength);
+								resiBits = (tdps->residualMidBits[p] & code);
+								p++;
+							}
+							k += resiBitsLength;
+						}
+
+						// recover the exact data
+						memset(curBytes, 0, 8);
+						leadingNum = leadNum[l++];
+						memcpy(curBytes, preBytes, leadingNum);
+						for (j = leadingNum; j < reqBytesLength; j++)
+							curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+						if (resiBitsLength != 0) {
+							unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+							curBytes[reqBytesLength] = resiByte;
+						}
+
+						exactData = bytesToDouble(curBytes);
+						(*data)[index] = exactData + medianValue;
+						memcpy(preBytes,curBytes,8);
+					}
+				}
+			}
+		}
+	}
+
+//I didn't implement time-based compression for 4D actually. 
+//#ifdef HAVE_TIMECMPR	
+//	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+//		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(double));
+//#endif	
+
+	free(leadNum);
+	free(type);
+	return;
+}
+
+/*MSST19*/
+/**
+ * Decompress a 1D series of doubles stored in the MSST19 layout.
+ *
+ * Each point has a Huffman-decoded type code:
+ *   - code 0: the value was stored explicitly. It is rebuilt from the leading
+ *     bytes it shares with the previously rebuilt explicit value, the next
+ *     bytes of tdps->exactMidBytes and, when reqLength is not a multiple of 8,
+ *     the next resiBitsLength bits of tdps->residualMidBits;
+ *   - any other code: the value is fabs(previous value) * precisionTable[code],
+ *     where "previous value" is the last value written to the output (0 before
+ *     the first point).
+ *
+ * @param data             out: *data is malloc'ed here with dataSeriesLength
+ *                         doubles; it is not freed by this function.
+ * @param dataSeriesLength number of values to reconstruct.
+ * @param tdps             parsed compressed stream.
+ *
+ * NOTE(review): malloc results are not checked, matching the rest of the file.
+ */
+void decompressDataSeries_double_1D_MSST19(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps) 
+{
+	int intvRadius = tdps->intervals/2;
+	int intvCapacity = tdps->intervals;
+	size_t i, j;
+	size_t k = 0; // running bit offset of the next residual field in residualMidBits
+	size_t p = 0; // byte index into residualMidBits
+	size_t l = 0; // index of the next entry in leadNum
+	unsigned char* leadNum;
+	
+	// Unpack the per-exact-value "number of shared leading bytes" array.
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+
+	// Decode one type code per data point.
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree_MSST19(huffmanTree, tdps->typeArray, dataSeriesLength, type, tdps->max_bits);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[8]; // bytes of the previously rebuilt explicit value
+	unsigned char curBytes[8]; // bytes of the explicit value being rebuilt
+	
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0;   // read cursor in tdps->exactMidBytes
+	int reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	double exactData, predValue = 0;
+	reqBytesLength = tdps->reqLength/8; // whole bytes kept per explicit value
+	resiBitsLength = tdps->reqLength%8; // extra bits kept beyond those bytes
+
+	// precisionTable[c] = (1 + realPrecision)^(inv * (c - intvRadius)): the
+	// multiplicative factor applied to the prediction for type code c.
+	// The index is named tableIdx so it does not shadow the outer size_t i.
+	double* precisionTable = (double*)malloc(sizeof(double) * intvCapacity);
+	double inv = 2.0-pow(2, -(tdps->plus_bits));
+	for (int tableIdx = 0; tableIdx < intvCapacity; tableIdx++) {
+		precisionTable[tableIdx] = pow((1+tdps->realPrecision), inv*(tableIdx - intvRadius));
+	}
+
+	int type_;
+	for (i = 0; i < dataSeriesLength; i++) {
+		type_ = type[i];
+		if (type_ != 0) {
+			// Predicted point: scale the magnitude of the previous output.
+			predValue = fabs(predValue) * precisionTable[type_];
+			(*data)[i] = predValue;
+			continue;
+		}
+
+		// Explicit point: first fetch its residual bits, if any.
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				// Field is read from byte p only; p is not advanced.
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				// Field is assembled from byte p and byte p+1.
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else { // rightMovSteps == 0
+				// Field is masked out of byte p without shifting; move on to the next byte.
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// Rebuild the byte image: shared leading bytes, then stored mid bytes,
+		// then the residual bits left-aligned in the following byte.
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < (size_t)reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+		
+		exactData = bytesToDouble(curBytes);
+		(*data)[i] = exactData;
+		memcpy(preBytes,curBytes,8);
+		predValue = (*data)[i];
+	}
+	
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(double));
+#endif	
+	free(precisionTable);
+	free(leadNum);
+	free(type);
+	return;
+}
+
+/*
+ * Rebuild the next explicitly stored double of a 2D MSST19 stream.
+ *
+ * Reads (when resiBitsLength != 0) the next resiBitsLength residual bits from
+ * tdps->residualMidBits, copies leadNum[*l] leading bytes from preBytes, takes
+ * the remaining bytes up to reqBytesLength from tdps->exactMidBytes, and
+ * converts the resulting 8-byte image with bytesToDouble().
+ *
+ * All cursors are advanced in place:
+ *   k            running bit offset of the next residual field
+ *   p            byte index into tdps->residualMidBits
+ *   l            index of the next leadNum entry
+ *   curByteIndex read cursor in tdps->exactMidBytes
+ *   preBytes     overwritten with the byte image of the value just rebuilt
+ *
+ * Returns the rebuilt value.
+ */
+static double readExactDouble_2D_MSST19(TightDataPointStorageD* tdps, const unsigned char* leadNum,
+		int reqBytesLength, int resiBitsLength,
+		size_t* k, size_t* p, size_t* l, size_t* curByteIndex, unsigned char* preBytes)
+{
+	unsigned char curBytes[8];
+	unsigned char leadingNum;
+	int resiBits = 0;
+	size_t j;
+	double exactData;
+
+	// compute resiBits
+	if (resiBitsLength != 0) {
+		int kMod8 = (*k) % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			// Field is read from byte *p only; *p is not advanced.
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[*p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			// Field is assembled from byte *p and the byte after it.
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[*p] & code1) << leftMovSteps;
+			(*p)++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[*p] & code2) >> rightMovSteps);
+		} else { // rightMovSteps == 0
+			// Field is masked out of byte *p without shifting; move on to the next byte.
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[*p] & code);
+			(*p)++;
+		}
+		*k += resiBitsLength;
+	}
+
+	// recover the exact data
+	memset(curBytes, 0, 8);
+	leadingNum = leadNum[(*l)++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < (size_t)reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[(*curByteIndex)++];
+	if (resiBitsLength != 0) {
+		// Residual bits are stored left-aligned in the byte after the whole bytes.
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToDouble(curBytes);
+	memcpy(preBytes, curBytes, 8);
+	return exactData;
+}
+
+/**
+ * Decompress an r1 x r2 (row-major, r2 contiguous) array of doubles stored in
+ * the MSST19 layout.
+ *
+ * Each point has a Huffman-decoded type code. Code 0 means the value was
+ * stored explicitly and is rebuilt by readExactDouble_2D_MSST19(). Any other
+ * code means the value is fabs(prediction) * precisionTable[code], where the
+ * prediction is built from already reconstructed neighbours:
+ *   - row 0, column 1:     left neighbour
+ *   - row 0, column >= 2:  left * left / left-left
+ *   - row >= 1, column 0:  upper neighbour
+ *   - elsewhere:           left * upper / upper-left
+ * The very first point is always read as an explicit value; its type code is
+ * not consulted.
+ *
+ * @param data out: *data is malloc'ed here with r1*r2 doubles; it is not freed
+ *             by this function.
+ * @param r1   number of rows.
+ * @param r2   number of columns.
+ * @param tdps parsed compressed stream.
+ *
+ * NOTE(review): type[1] and (*data)[1] are accessed unconditionally, so this
+ * assumes r1*r2 >= 2. Left as is because the matching compressor is not
+ * visible here; confirm before adding a guard.
+ */
+void decompressDataSeries_double_2D_MSST19(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps) 
+{
+	int intvRadius = tdps->intervals/2;
+	int intvCapacity = tdps->intervals;
+	
+	size_t k = 0, p = 0, l = 0; // residual bit offset, residual byte index, leadNum index
+	size_t dataSeriesLength = r1*r2;
+
+	unsigned char* leadNum;
+
+	// Unpack the per-exact-value "number of shared leading bytes" array.
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+
+	// Decode one type code per data point.
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree_MSST19(huffmanTree, tdps->typeArray, dataSeriesLength, type, tdps->max_bits);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[8]; // bytes of the previously rebuilt explicit value
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0;   // read cursor in tdps->exactMidBytes
+	int type_;
+
+	// precisionTable[c] = (1 + realPrecision)^(inv * (c - intvRadius)): the
+	// multiplicative factor applied to the prediction for type code c.
+	double* precisionTable = (double*)malloc(sizeof(double) * intvCapacity);
+	double inv = 2.0-pow(2, -(tdps->plus_bits));
+	for (int tableIdx = 0; tableIdx < intvCapacity; tableIdx++) {
+		precisionTable[tableIdx] = pow((1+tdps->realPrecision), inv*(tableIdx - intvRadius));
+	}
+
+	int reqBytesLength = tdps->reqLength/8; // whole bytes kept per explicit value
+	int resiBitsLength = tdps->reqLength%8; // extra bits kept beyond those bytes
+	
+	double pred1D, pred2D;
+	size_t ii, jj, index;
+
+	/* Process Row-0, data 0: always an explicit value */
+	(*data)[0] = readExactDouble_2D_MSST19(tdps, leadNum, reqBytesLength, resiBitsLength,
+			&k, &p, &l, &curByteIndex, preBytes);
+
+	/* Process Row-0, data 1 */
+	type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		(*data)[1] = fabs(pred1D) * precisionTable[type_];
+	}
+	else
+	{
+		(*data)[1] = readExactDouble_2D_MSST19(tdps, leadNum, reqBytesLength, resiBitsLength,
+				&k, &p, &l, &curByteIndex, preBytes);
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[jj-1] * (*data)[jj-1] / (*data)[jj-2];
+			(*data)[jj] = fabs(pred1D) * precisionTable[type_];
+		}
+		else
+		{
+			(*data)[jj] = readExactDouble_2D_MSST19(tdps, leadNum, reqBytesLength, resiBitsLength,
+					&k, &p, &l, &curByteIndex, preBytes);
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			(*data)[index] = fabs(pred1D) * precisionTable[type_];
+		}
+		else
+		{
+			(*data)[index] = readExactDouble_2D_MSST19(tdps, leadNum, reqBytesLength, resiBitsLength,
+					&k, &p, &l, &curByteIndex, preBytes);
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] * (*data)[index-r2] / (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = fabs(pred2D) * precisionTable[type_];
+			}
+			else
+			{
+				(*data)[index] = readExactDouble_2D_MSST19(tdps, leadNum, reqBytesLength, resiBitsLength,
+						&k, &p, &l, &curByteIndex, preBytes);
+			}
+		}
+	}
+
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(double));
+#endif	
+
+	// precisionTable was previously leaked on every call.
+	free(precisionTable);
+	free(leadNum);
+	free(type);
+	return;
+}
+
+void decompressDataSeries_double_3D_MSST19(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps) 
+{
+	//updateQuantizationInfo(tdps->intervals);
+	int intvRadius = tdps->intervals/2;
+	int intvCapacity = tdps->intervals;
+	size_t j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+	// in resiMidBits, p is to track the
+	// byte_index of resiMidBits, l is for
+	// leadNum
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	unsigned char* leadNum;
+	//double realPrecision = tdps->realPrecision;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	double* precisionTable = (double*)malloc(sizeof(double) * intvCapacity);
+	double inv = 2.0-pow(2, -(tdps->plus_bits));
+	for(int i=0; i<intvCapacity; i++){
+		double test = pow((1+tdps->realPrecision), inv*(i - intvRadius));
+		precisionTable[i] = test;
+	}
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree_MSST19(huffmanTree, tdps->typeArray, dataSeriesLength, type, tdps->max_bits);
+	//decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	unsigned char preBytes[8];
+	unsigned char curBytes[8];
+
+	memset(preBytes, 0, 8);
+	size_t curByteIndex = 0;
+	int reqBytesLength, resiBitsLength, resiBits;
+	unsigned char leadingNum;
+	double exactData;
+	int type_;
+
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+	
+	double pred1D, pred2D, pred3D;
+	double temp;
+	double temp2;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+	// compute resiBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data
+	memset(curBytes, 0, 8);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+	exactData = bytesToDouble(curBytes);
+	(*data)[0] = exactData;
+	memcpy(preBytes,curBytes,8);
+
+	/* Process Row-0, data 1 */
+	pred1D = (*data)[0];
+
+	type_ = type[1];
+	if (type_ != 0)
+	{
+		(*data)[1] = fabs(pred1D) * precisionTable[type_];
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToDouble(curBytes);
+		(*data)[1] = exactData;
+		memcpy(preBytes,curBytes,8);
+	}
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		temp = (*data)[jj-1];
+		pred1D = temp * ( *data)[jj-1] / (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			(*data)[jj] = fabs(pred1D) * precisionTable[type_];
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[jj] = exactData;
+			memcpy(preBytes,curBytes,8);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = fabs(pred1D) * precisionTable[type_];
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			temp = (*data)[index-1];
+			pred2D = temp * (*data)[index-r3] / (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = fabs(pred2D) * precisionTable[type_];
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = fabs(pred1D) * precisionTable[type_];
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			temp = (*data)[index-1];
+			pred2D = temp * (*data)[index-r23] / (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = fabs(pred2D) * precisionTable[type_];
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			temp = (*data)[index-r3];
+			pred2D = temp * (*data)[index-r23] / (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = fabs(pred2D) * precisionTable[type_];
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData;
+				memcpy(preBytes,curBytes,8);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				//pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+				//	- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+				temp = (*data)[index-1];
+				temp2 = (*data)[index-r3-1];
+				pred3D = temp * (*data)[index-r3] * (*data)[index-r23] * (*data)[index-r23-r3-1] / (temp2 * (*data)[index-r23-r3] * (*data)[index-r23-1]);
+
+				type_ = type[index];				
+				if (type_ != 0)
+				{
+					(*data)[index] = fabs(pred3D) * precisionTable[type_];	
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 8);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToDouble(curBytes);
+					(*data)[index] = exactData;
+					memcpy(preBytes,curBytes,8);
+				}
+			}
+		}
+	}
+	
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(double));
+#endif		
+
+	free(leadNum);
+	free(type);
+	return;
+}
+
+/* Reconstruct a 1-D series of dataSeriesLength doubles from tdps; the result buffer is handed back through *data.
+ * compressionType is never read here: under HAVE_TIMECMPR the global multisteps->compressionType is consulted instead. */
+void getSnapshotData_double_1D(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps, int errBoundMode, int compressionType, double* hist_data)
+{
+	if (tdps->allSameData) {
+		/* Constant data set: the single stored value is replicated into a freshly malloc'ed buffer. */
+		size_t pos = 0;
+		const double fill = bytesToDouble(tdps->exactMidBytes);
+		double* out = (double*)malloc(sizeof(double)*dataSeriesLength);
+		*data = out;
+		while (pos < dataSeriesLength)
+			out[pos++] = fill;
+		return;
+	}
+
+	if (tdps->rtypeArray != NULL) {
+		//TODO: this case is not implemented; *data is left untouched
+		return;
+	}
+
+	if (errBoundMode >= PW_REL) {
+		/* Modes at or above PW_REL are routed to one of the two "pre_log" decoders. */
+		if (confparams_dec->accelerate_pw_rel_compression)
+			decompressDataSeries_double_1D_pwr_pre_log_MSST19(data, dataSeriesLength, tdps);
+		else
+			decompressDataSeries_double_1D_pwr_pre_log(data, dataSeriesLength, tdps);
+		return;
+	}
+
+	/* errBoundMode < PW_REL from here on; a compressionType of 0 means "snapshot". */
+#ifdef HAVE_TIMECMPR
+	if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && multisteps->compressionType != 0) {
+		decompressDataSeries_double_1D_ts(data, dataSeriesLength, hist_data, tdps);
+		return;
+	}
+#endif
+	decompressDataSeries_double_1D(data, dataSeriesLength, hist_data, tdps);
+}
+
+/* Reconstruct an r1 x r2 grid of doubles from tdps; the result buffer is handed back through *data. */
+void getSnapshotData_double_2D(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps, int errBoundMode, int compressionType, double* hist_data)
+{
+	const size_t dataSeriesLength = r1*r2;
+	if (tdps->allSameData) {
+		/* Constant data set: the single stored value is replicated into a freshly malloc'ed buffer. */
+		size_t pos = 0;
+		const double fill = bytesToDouble(tdps->exactMidBytes);
+		double* out = (double*)malloc(sizeof(double)*dataSeriesLength);
+		*data = out;
+		while (pos < dataSeriesLength)
+			out[pos++] = fill;
+		return;
+	}
+
+	if (tdps->rtypeArray != NULL) {
+		//TODO: this case is not implemented; *data is left untouched
+		return;
+	}
+
+	if (errBoundMode >= PW_REL) {
+		if (confparams_dec->accelerate_pw_rel_compression)
+			decompressDataSeries_double_2D_pwr_pre_log_MSST19(data, r1, r2, tdps);
+		else
+			decompressDataSeries_double_2D_pwr_pre_log(data, r1, r2, tdps);
+		return;
+	}
+
+#ifdef HAVE_TIMECMPR
+	if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && compressionType != 0) {
+		/* compressionType 0 is a snapshot; any other value goes to the 1-D time-step decoder as a flat series of r1*r2 values. */
+		decompressDataSeries_double_1D_ts(data, dataSeriesLength, hist_data, tdps);
+		return;
+	}
+#endif
+	decompressDataSeries_double_2D(data, r1, r2, hist_data, tdps);
+}
+
+/* Reconstruct an r1 x r2 x r3 volume of doubles from tdps; the result buffer is handed back through *data. */
+void getSnapshotData_double_3D(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps, int errBoundMode, int compressionType, double* hist_data)
+{
+	const size_t dataSeriesLength = r1*r2*r3;
+
+	if (tdps->allSameData) {
+		/* Constant data set: the single stored value is replicated into a freshly malloc'ed buffer. */
+		size_t pos = 0;
+		const double fill = bytesToDouble(tdps->exactMidBytes);
+		double* out = (double*)malloc(sizeof(double)*dataSeriesLength);
+		*data = out;
+		while (pos < dataSeriesLength)
+			out[pos++] = fill;
+		return;
+	}
+
+	if (tdps->rtypeArray != NULL) {
+		//TODO: this case is not implemented; *data is left untouched
+		return;
+	}
+
+	if (errBoundMode >= PW_REL) {
+		/* Modes at or above PW_REL are routed to one of the two "pre_log" decoders. */
+		if (confparams_dec->accelerate_pw_rel_compression)
+			decompressDataSeries_double_3D_pwr_pre_log_MSST19(data, r1, r2, r3, tdps);
+		else
+			decompressDataSeries_double_3D_pwr_pre_log(data, r1, r2, r3, tdps);
+		return;
+	}
+
+#ifdef HAVE_TIMECMPR
+	if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && compressionType != 0) {
+		/* compressionType 0 is a snapshot; any other value goes to the 1-D time-step decoder as a flat series of r1*r2*r3 values. */
+		decompressDataSeries_double_1D_ts(data, dataSeriesLength, hist_data, tdps);
+		return;
+	}
+#endif
+	decompressDataSeries_double_3D(data, r1, r2, r3, hist_data, tdps);
+}
+
+/* Reconstruct an r1 x r2 x r3 x r4 array of doubles from tdps; the result buffer is handed back through *data.
+ * compressionType is never read here: under HAVE_TIMECMPR the global multisteps->compressionType is consulted instead. */
+void getSnapshotData_double_4D(double** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageD* tdps, int errBoundMode, int compressionType, double* hist_data)
+{
+	const size_t dataSeriesLength = r1*r2*r3*r4;
+	if (tdps->allSameData) {
+		/* Constant data set: the single stored value is replicated into a freshly malloc'ed buffer. */
+		size_t pos = 0;
+		const double fill = bytesToDouble(tdps->exactMidBytes);
+		double* out = (double*)malloc(sizeof(double)*dataSeriesLength);
+		*data = out;
+		while (pos < dataSeriesLength)
+			out[pos++] = fill;
+		return;
+	}
+
+	if (tdps->rtypeArray != NULL) {
+		//TODO: this case is not implemented; *data is left untouched
+		return;
+	}
+
+	if (errBoundMode >= PW_REL) {
+		/* r1 and r2 are folded into a single leading dimension so the 3-D "pre_log" decoders can be called. */
+		if (confparams_dec->accelerate_pw_rel_compression)
+			decompressDataSeries_double_3D_pwr_pre_log_MSST19(data, r1*r2, r3, r4, tdps);
+		else
+			decompressDataSeries_double_3D_pwr_pre_log(data, r1*r2, r3, r4, tdps);
+		return;
+	}
+
+#ifdef HAVE_TIMECMPR
+	if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && multisteps->compressionType != 0) {
+		/* Any non-zero step type goes to the 1-D time-step decoder as a flat series of r1*r2*r3*r4 values. */
+		decompressDataSeries_double_1D_ts(data, dataSeriesLength, hist_data, tdps);
+		return;
+	}
+#endif
+	decompressDataSeries_double_4D(data, r1, r2, r3, r4, hist_data, tdps);
+}
+
+/* Decode one block_dim_0 x block_dim_1 x block_dim_2 block into `data`, which is addressed with the strides
+ * of an enclosing dim_1 x dim_2 layout (dim_0 is not read). `type` holds one code per block element in
+ * block-local row-major order. Returns how many values were taken from unpredictable_data. */
+size_t decompressDataSeries_double_3D_RA_block(double * data, double mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, int * type, double * unpredictable_data)
+{
+	const int radius = exe_params->intvRadius;
+	const size_t plane_stride = dim_1 * dim_2;
+	const size_t row_stride = dim_2;
+	const size_t r1 = block_dim_0, r2 = block_dim_1, r3 = block_dim_2;
+	const size_t r23 = r2*r3;
+
+	size_t taken = 0;
+	size_t layer, line, col;
+	size_t t_idx;
+	int code;
+	double pred;
+	double * row = data;
+	double * prev_row;
+
+	/* Element (0,0,0): predicted from the supplied mean. */
+	pred = mean;
+	code = type[0];
+	/* code 0 => take the next verbatim value; otherwise shift the prediction by 2*(code-radius)*realPrecision */
+	if (code == 0){
+		row[0] = unpredictable_data[taken ++];
+	}
+	else{
+		row[0] = pred + 2 * (code - radius) * realPrecision;
+	}
+
+	/* Element (0,0,1): predicted from element (0,0,0); decoded unconditionally, whatever r3 is. */
+	pred = row[0];
+	code = type[1];
+	if (code == 0){
+		row[1] = unpredictable_data[taken ++];
+	}
+	else{
+		row[1] = pred + 2 * (code - radius) * realPrecision;
+	}
+	/* Rest of row 0 in layer 0: linear extrapolation from the two previous values. */
+	for (col = 2; col < r3; col++){
+		pred = 2*row[col-1] - row[col-2];
+		code = type[col];
+		if (code == 0){
+			row[col] = unpredictable_data[taken ++];
+		}
+		else{
+			row[col] = pred + 2 * (code - radius) * realPrecision;
+		}
+	}
+
+	prev_row = row;
+	row += row_stride;
+
+	/* Remaining rows of layer 0: the first element is predicted from the value above it, the others
+	 * from three already-decoded neighbours (left + above - above-left). */
+	for (line = 1; line < r2; line++)
+	{
+		/* first element of the row */
+		t_idx = line*r3;
+		pred = prev_row[0];
+		code = type[t_idx];
+		if (code == 0){
+			row[0] = unpredictable_data[taken ++];
+		}
+		else{
+			row[0] = pred + 2 * (code - radius) * realPrecision;
+		}
+		/* elements 1 .. r3-1 of the row */
+		for (col = 1; col < r3; col++)
+		{
+			t_idx = line*r3+col;
+			pred = row[col-1] + prev_row[col] - prev_row[col-1];
+			code = type[t_idx];
+			if (code == 0){
+				row[col] = unpredictable_data[taken ++];
+			}
+			else{
+				row[col] = pred + 2 * (code - radius) * realPrecision;
+			}
+		}
+
+		/* the row just decoded becomes the "above" row for the next one */
+		prev_row = row;
+		row += row_stride;
+	}
+	/* net effect since the start of the layer: row has advanced by exactly one plane_stride */
+	row += plane_stride - r2 * row_stride;
+
+	for (layer = 1; layer < r1; layer++)
+	{
+		/* first element of the layer: predicted from the same position one plane back */
+		t_idx = layer*r23;
+		pred = row[- plane_stride];
+		code = type[t_idx];
+		if (code == 0){
+			row[0] = unpredictable_data[taken ++];
+		}
+		else{
+			row[0] = pred + 2 * (code - radius) * realPrecision;
+		}
+		/* rest of row 0: left + one-plane-back - (left, one plane back) */
+		for (col = 1; col < r3; col++)
+		{
+			/* after the increment t_idx equals layer*r23 + col */
+			t_idx ++;
+			pred = row[col-1] + row[col - plane_stride] - row[col - 1 - plane_stride];
+			code = type[t_idx];
+			if (code == 0){
+				row[col] = unpredictable_data[taken ++];
+			}
+			else{
+				row[col] = pred + 2 * (code - radius) * realPrecision;
+			}
+		}
+		prev_row = row;
+		row += row_stride;
+
+		/* remaining rows of this layer */
+		for (line = 1; line < r2; line++)
+		{
+			/* first element of the row: above + one-plane-back - (above, one plane back) */
+			t_idx = layer*r23 + line*r3;
+			pred = prev_row[0] + row[- plane_stride] - prev_row[- plane_stride];
+			code = type[t_idx];
+			if (code == 0){
+				row[0] = unpredictable_data[taken ++];
+			}
+			else{
+				row[0] = pred + 2 * (code - radius) * realPrecision;
+			}
+
+			/* interior elements: prediction from the seven already-decoded neighbours */
+			for (col = 1; col < r3; col++)
+			{
+				/* after the increment t_idx equals layer*r23 + line*r3 + col */
+				t_idx ++;
+				pred = row[col-1] + prev_row[col]+ row[col - plane_stride] - prev_row[col-1] - prev_row[col - plane_stride] - row[col-1 - plane_stride] + prev_row[col-1 - plane_stride];
+				code = type[t_idx];
+				if (code == 0){
+					row[col] = unpredictable_data[taken ++];
+				}
+				else{
+					row[col] = pred + 2 * (code - radius) * realPrecision;
+				}
+			}
+			prev_row = row;
+			row += row_stride;
+		}
+		row += plane_stride - r2 * row_stride;
+	}
+
+	return taken;
+}
+
+void decompressDataSeries_double_2D_nonblocked_with_blocked_regression(double** data, size_t r1, size_t r2, unsigned char* comp_data, double* hist_data){
+	/* Decode an r1 x r2 array of doubles from comp_data into a newly malloc'ed *data. The stream is consumed front to back through comp_data_pos. hist_data is only written (a copy of the result) under HAVE_TIMECMPR when szMode is SZ_TEMPORAL_COMPRESSION. */
+	size_t dim0_offset = r2; // row stride of the output: r2 elements per row
+	size_t num_elements = r1 * r2;
+
+	*data = (double*)malloc(sizeof(double)*num_elements); // result buffer; the return value is not checked for NULL
+
+	unsigned char * comp_data_pos = comp_data; // read cursor into the compressed stream
+
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos); // first stream field; fed to the block-count macros below
+	comp_data_pos += sizeof(int);
+	// calculate block dims: num_x / num_y are filled in by macros defined elsewhere
+	size_t num_x, num_y;
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+
+	size_t split_index_x, split_index_y; // blocks with index < split_index use the "early" size, the others the "late" size (see the loops below)
+	size_t early_blockcount_x, early_blockcount_y;
+	size_t late_blockcount_x, late_blockcount_y;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+
+	size_t num_blocks = num_x * num_y;
+
+	double realPrecision = bytesToDouble(comp_data_pos); // reconstruction below moves in steps of 2*realPrecision
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos); // intvRadius is derived from this as intervals/2
+	comp_data_pos += sizeof(int);
+
+	//updateQuantizationInfo(intervals);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos); // byte length of the serialized Huffman tree that follows the node count
+	comp_data_pos += sizeof(int);
+
+	int stateNum = 2*intervals; // state count handed to createHuffmanTree
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+sizeof(int), nodeCount); // this root decodes the per-element codes further down
+	comp_data_pos += sizeof(int) + tree_size;
+
+	double mean;
+	unsigned char use_mean; // non-zero selects the decoding loop that has the extra "code == intvRadius => mean" case
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(double)); // the mean field is consumed from the stream even when use_mean is 0
+	comp_data_pos += sizeof(double);
+	size_t reg_count = 0; // number of blocks whose indicator is 0, i.e. regression-coded blocks
+
+	unsigned char * indicator; // one entry per block; filled in by the callee and released with free() at the end
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1; // despite the name this is a byte count: ceil(num_blocks/8) for num_blocks >= 1
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator); // presumably unpacks 1-bit flags to one byte each -- TODO confirm
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+	//printf("reg_count: %ld\n", reg_count);
+
+	int coeff_intvRadius[3]; // three regression coefficients per block: value = c[0]*ii + c[1]*jj + c[2]
+	int * coeff_result_type = (int *) malloc(num_blocks*3*sizeof(int)); // shared buffer, sliced into coeff_type[0..2]
+	int * coeff_type[3];
+	double precision[3];
+	double * coeff_unpred_data[3];
+	if(reg_count > 0){ // the coefficient sections are only parsed when at least one block is regression-coded
+		for(int i=0; i<3; i++){
+			precision[i] = bytesToDouble(comp_data_pos);
+			comp_data_pos += sizeof(double);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos); // NOTE: shadows the outer tree_size (stateNum, huffmanTree, nodeCount and root below shadow their outer namesakes too)
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+sizeof(int), nodeCount);
+			comp_data_pos += sizeof(int) + tree_size;
+
+			coeff_type[i] = coeff_result_type + i * num_blocks; // slice i of the shared buffer
+			size_t typeArray_size = bytesToSize(comp_data_pos);
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]); // reg_count codes for coefficient i
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (double *) comp_data_pos; // points into comp_data, no copy; NOTE(review): nothing here guarantees double alignment
+			comp_data_pos += coeff_unpred_count * sizeof(double);
+			SZ_ReleaseHuffman(huffmanTree); // releases the per-coefficient tree, not the outer one
+		}
+	}
+	double last_coefficients[3] = {0.0}; // each coefficient is predicted from the value left by the previous regression block
+	int coeff_unpred_data_count[3] = {0};
+	int coeff_index = 0; // index of the current regression block within coeff_type[e]
+	//updateQuantizationInfo(intervals);
+
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t)); // NOTE(review): raw host-order size_t, unlike the bytesToSize() read used for typeArray_size
+	comp_data_pos += sizeof(size_t);
+	double * unpred_data = (double *) comp_data_pos; // verbatim values, read in place from comp_data
+	comp_data_pos += total_unpred * sizeof(double);
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int)); // one code per output element, stored block after block
+	decode(comp_data_pos, num_elements, root, result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+	
+	int intvRadius = intervals/2;
+	
+	int * type; // cursor into result_type, advanced by one block's element count per block
+
+	double * data_pos = *data;
+	size_t offset_x, offset_y;
+	size_t current_blockcount_x, current_blockcount_y;
+	size_t cur_unpred_count;
+
+	unsigned char * indicator_pos = indicator;
+	if(use_mean){
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){ // i: block row index
+			for(size_t j=0; j<num_y; j++){ // j: block column index
+				offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x; // first output row covered by this block
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y; // first output column covered by this block
+				data_pos = *data + offset_x * dim0_offset + offset_y;
+
+				current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+
+				size_t current_block_elements = current_blockcount_x * current_blockcount_y;
+				if(*indicator_pos){
+					// decompress by SZ: each value is predicted as left + above - above-left
+
+					double * block_data_pos = data_pos;
+					double pred;
+					size_t index = 0; // position within this block's slice of type[]
+					int type_;
+					// d11 is current data; d10 = left, d01 = above, d00 = above-left neighbour
+					size_t unpredictable_count = 0;
+					double d00, d01, d10;
+					for(size_t ii=0; ii<current_blockcount_x; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							type_ = type[index];
+							if(type_ == intvRadius){ // in this branch the code intvRadius stands for "value equals mean"
+								*block_data_pos = mean;
+							}
+							else if(type_ == 0){ // code 0: value stored verbatim
+								*block_data_pos = unpred_data[unpredictable_count ++];
+							}
+							else{
+								d00 = d01 = d10 = 1; // used first as "neighbour available" flags, then overwritten with the neighbour values
+								if(i == 0 && ii == 0){ // first row of the whole array: nothing above
+									d00 = d01 = 0;
+								}
+								if(j == 0 && jj == 0){ // first column of the whole array: nothing to the left
+									d00 = d10 = 0;
+								}
+								if(d00){
+									d00 = block_data_pos[- dim0_offset - 1]; // index is formed in unsigned arithmetic and relies on wrap-around to step backwards
+								}
+								if(d01){
+									d01 = block_data_pos[- dim0_offset];
+								}
+								if(d10){
+									d10 = block_data_pos[- 1];
+								}
+								if(type_ < intvRadius) type_ += 1; // presumably undoes a shift the compressor applied to free the intvRadius code for the mean case -- TODO confirm
+								pred = d10 + d01 - d00;
+								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+							}
+							index ++;
+							block_data_pos ++;
+						}
+						block_data_pos += dim0_offset - current_blockcount_y; // jump to the start of the block's next row
+					}
+					cur_unpred_count = unpredictable_count;
+				}
+				else{
+					// decompress by regression: value = c0*ii + c1*jj + c2 plus the quantized correction
+					{
+						//restore regression coefficients (each is predicted from the value left by the previous regression block)
+						double pred;
+						int type_;
+						for(int e=0; e<3; e++){
+							type_ = coeff_type[e][coeff_index];
+							if (type_ != 0){
+								pred = last_coefficients[e];
+								last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+							}
+							else{
+								last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]]; // code 0: coefficient stored verbatim
+								coeff_unpred_data_count[e] ++;
+							}
+						}
+						coeff_index ++;
+					}
+					{
+						double * block_data_pos = data_pos;
+						double pred;
+						int type_;
+						size_t index = 0;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								type_ = type[index];
+								if (type_ != 0){
+									pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2]; // ii / jj are block-local coordinates
+									*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+								}
+								else{
+									*block_data_pos = unpred_data[unpredictable_count ++];
+								}
+
+								index ++;	
+								block_data_pos ++;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+				}
+
+				type += current_block_elements; // codes are stored block after block
+				indicator_pos ++;
+				unpred_data += cur_unpred_count; // verbatim values are consumed in block order
+			}
+		}
+	}
+	else{ // same block walk without the mean code: every non-zero code is a quantized correction
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				data_pos = *data + offset_x * dim0_offset + offset_y;
+
+				current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+
+				size_t current_block_elements = current_blockcount_x * current_blockcount_y;
+				if(*indicator_pos){
+					// decompress by SZ: each value is predicted as left + above - above-left
+					
+					double * block_data_pos = data_pos;
+					double pred;
+					size_t index = 0;
+					int type_;
+					// d11 is current data; d10 = left, d01 = above, d00 = above-left neighbour
+					size_t unpredictable_count = 0;
+					double d00, d01, d10;
+					for(size_t ii=0; ii<current_blockcount_x; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							type_ = type[index];
+							if(type_ == 0){ // code 0: value stored verbatim
+								*block_data_pos = unpred_data[unpredictable_count ++];
+							}
+							else{
+								d00 = d01 = d10 = 1; // "neighbour available" flags, overwritten with the neighbour values below
+								if(i == 0 && ii == 0){
+									d00 = d01 = 0;
+								}
+								if(j == 0 && jj == 0){
+									d00 = d10 = 0;
+								}
+								if(d00){
+									d00 = block_data_pos[- dim0_offset - 1]; // unsigned index, relies on wrap-around to step backwards
+								}
+								if(d01){
+									d01 = block_data_pos[- dim0_offset];
+								}
+								if(d10){
+									d10 = block_data_pos[- 1];
+								}
+								pred = d10 + d01 - d00;
+								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+							}
+							index ++;
+							block_data_pos ++;
+						}
+						block_data_pos += dim0_offset - current_blockcount_y; // jump to the start of the block's next row
+					}
+					cur_unpred_count = unpredictable_count;
+				}
+				else{
+					// decompress by regression: value = c0*ii + c1*jj + c2 plus the quantized correction
+					{
+						//restore regression coefficients (each is predicted from the value left by the previous regression block)
+						double pred;
+						int type_;
+						for(int e=0; e<3; e++){
+							type_ = coeff_type[e][coeff_index];
+							if (type_ != 0){
+								pred = last_coefficients[e];
+								last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+							}
+							else{
+								last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+								coeff_unpred_data_count[e] ++;
+							}
+						}
+						coeff_index ++;
+					}
+					{
+						double * block_data_pos = data_pos;
+						double pred;
+						int type_;
+						size_t index = 0;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								type_ = type[index];
+								if (type_ != 0){
+									pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2];
+									*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+								}
+								else{
+									*block_data_pos = unpred_data[unpredictable_count ++];
+								}
+								index ++;	
+								block_data_pos ++;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+				}
+
+				type += current_block_elements;
+				indicator_pos ++;
+				unpred_data += cur_unpred_count;
+			}
+		}
+	}
+	
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(hist_data, (*data), num_elements*sizeof(double)); // keep a copy of the decoded array in the caller's history buffer
+#endif	
+	
+	free(coeff_result_type);
+
+	free(indicator);
+	free(result_type);
+}
+
+
+void decompressDataSeries_double_3D_nonblocked_with_blocked_regression(double** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data, double* hist_data){
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;
+	size_t num_elements = r1 * r2 * r3;
+
+	*data = (double*)malloc(sizeof(double)*num_elements);
+
+	unsigned char * comp_data_pos = comp_data;
+
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	// calculate block dims
+	size_t num_x, num_y, num_z;
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r3, num_z, block_size);
+
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t num_blocks = num_x * num_y * num_z;
+
+	double realPrecision = bytesToDouble(comp_data_pos);
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	//updateQuantizationInfo(intervals);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+4, nodeCount);
+	comp_data_pos += sizeof(int) + tree_size;
+
+	double mean;
+	unsigned char use_mean;
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(double));
+	comp_data_pos += sizeof(double);
+	size_t reg_count = 0;
+
+	unsigned char * indicator;
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1;
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator);
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+
+	int coeff_intvRadius[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+	int * coeff_type[4];
+	double precision[4];
+	double * coeff_unpred_data[4];
+	if(reg_count > 0){
+		for(int i=0; i<4; i++){
+			precision[i] = bytesToDouble(comp_data_pos);
+			comp_data_pos += sizeof(double);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+4, nodeCount);
+			comp_data_pos += sizeof(int) + tree_size;
+
+			coeff_type[i] = coeff_result_type + i * num_blocks;
+			size_t typeArray_size = bytesToSize(comp_data_pos);
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]);
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (double *) comp_data_pos;
+			comp_data_pos += coeff_unpred_count * sizeof(double);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	double last_coefficients[4] = {0.0};
+	int coeff_unpred_data_count[4] = {0};
+	int coeff_index = 0;
+	//updateQuantizationInfo(intervals);
+
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	double * unpred_data = (double *) comp_data_pos;
+	comp_data_pos += total_unpred * sizeof(double);
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	decode(comp_data_pos, num_elements, root, result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+	
+	int intvRadius = intervals/2;
+	
+	int * type;
+	double * data_pos = *data;
+	size_t offset_x, offset_y, offset_z;
+	size_t current_blockcount_x, current_blockcount_y, current_blockcount_z;
+	size_t cur_unpred_count;
+	unsigned char * indicator_pos = indicator;
+	if(use_mean){
+		// type = result_type;
+
+		// for(size_t i=0; i<num_x; i++){
+		// 	for(size_t j=0; j<num_y; j++){
+		// 		for(size_t k=0; k<num_z; k++){
+		// 			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		// 			offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		// 			offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		// 			data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+		// 			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		// 			current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		// 			current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+		// 			// type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		// 			// type = result_type + type_offset;
+		// 			size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+		// 			// index = i * num_y * num_z + j * num_z + k;
+
+		// 			// printf("i j k: %ld %ld %ld\toffset: %ld %ld %ld\tindicator: %ld\n", i, j, k, offset_x, offset_y, offset_z, indicator[index]);
+		// 			if(*indicator_pos){
+		// 				// decompress by SZ
+		// 				// cur_unpred_count = decompressDataSeries_double_3D_blocked_nonblock_pred(data_pos, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, i, j, k, realPrecision, type, unpred_data);
+		// 				double * block_data_pos = data_pos;
+		// 				double pred;
+		// 				size_t index = 0;
+		// 				int type_;
+		// 				// d111 is current data
+		// 				size_t unpredictable_count = 0;
+		// 				double d000, d001, d010, d011, d100, d101, d110;
+		// 				for(size_t ii=0; ii<current_blockcount_x; ii++){
+		// 					for(size_t jj=0; jj<current_blockcount_y; jj++){
+		// 						for(size_t kk=0; kk<current_blockcount_z; kk++){
+		// 							type_ = type[index];
+		// 							if(type_ == intvRadius){
+		// 								*block_data_pos = mean;
+		// 							}
+		// 							else if(type_ == 0){
+		// 								*block_data_pos = unpred_data[unpredictable_count ++];
+		// 							}
+		// 							else{
+		// 								d000 = d001 = d010 = d011 = d100 = d101 = d110 = 1;
+		// 								if(i == 0 && ii == 0){
+		// 									d000 = d001 = d010 = d011 = 0;
+		// 								}
+		// 								if(j == 0 && jj == 0){
+		// 									d000 = d001 = d100 = d101 = 0;
+		// 								}
+		// 								if(k == 0 && kk == 0){
+		// 									d000 = d010 = d100 = d110 = 0;
+		// 								}
+		// 								if(d000){
+		// 									d000 = block_data_pos[- dim0_offset - dim1_offset - 1];
+		// 								}
+		// 								if(d001){
+		// 									d001 = block_data_pos[- dim0_offset - dim1_offset];
+		// 								}
+		// 								if(d010){
+		// 									d010 = block_data_pos[- dim0_offset - 1];
+		// 								}
+		// 								if(d011){
+		// 									d011 = block_data_pos[- dim0_offset];
+		// 								}
+		// 								if(d100){
+		// 									d100 = block_data_pos[- dim1_offset - 1];
+		// 								}
+		// 								if(d101){
+		// 									d101 = block_data_pos[- dim1_offset];
+		// 								}
+		// 								if(d110){
+		// 									d110 = block_data_pos[- 1];
+		// 								}
+		// 								if(type_ < intvRadius) type_ += 1;
+		// 								pred = d110 + d101 + d011 - d100 - d010 - d001 + d000;
+		// 								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+		// 							}
+		// 							index ++;
+		// 							block_data_pos ++;
+		// 						}
+		// 						block_data_pos += dim1_offset - current_blockcount_z;
+		// 					}
+		// 					block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+		// 				}
+		// 				cur_unpred_count = unpredictable_count;
+		// 			}
+		// 			else{
+		// 				// decompress by regression
+		// 				{
+		// 					//restore regression coefficients
+		// 					double pred;
+		// 					int type_;
+		// 					for(int e=0; e<4; e++){
+		// 						// if(i == 0 && j == 0 && k == 19){
+		// 						// 	printf("~\n");
+		// 						// }
+		// 						type_ = coeff_type[e][coeff_index];
+		// 						if (type_ != 0){
+		// 							pred = last_coefficients[e];
+		// 							last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+		// 						}
+		// 						else{
+		// 							last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+		// 							coeff_unpred_data_count[e] ++;
+		// 						}
+		// 						if(fabs(last_coefficients[e]) > 10000){
+		// 							printf("%d %d %d-%d: pred %.4f type %d precision %.4g last_coefficients %.4g\n", i, j, k, e, pred, type_, precision[e], last_coefficients[e]);
+		// 							exit(0);
+		// 						}
+		// 					}
+		// 					coeff_index ++;
+		// 				}
+		// 				{
+		// 					double * block_data_pos = data_pos;
+		// 					double pred;
+		// 					int type_;
+		// 					size_t index = 0;
+		// 					size_t unpredictable_count = 0;
+		// 					for(size_t ii=0; ii<current_blockcount_x; ii++){
+		// 						for(size_t jj=0; jj<current_blockcount_y; jj++){
+		// 							for(size_t kk=0; kk<current_blockcount_z; kk++){
+		// 								if(block_data_pos - (*data) == 19470788){
+		// 									printf("dec stop\n");
+		// 								}
+
+		// 								type_ = type[index];
+		// 								if (type_ != 0){
+		// 									pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+		// 									*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+		// 								}
+		// 								else{
+		// 									*block_data_pos = unpred_data[unpredictable_count ++];
+		// 								}
+		// 								index ++;	
+		// 								block_data_pos ++;
+		// 							}
+		// 							block_data_pos += dim1_offset - current_blockcount_z;
+		// 						}
+		// 						block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+		// 					}
+		// 					cur_unpred_count = unpredictable_count;
+		// 				}
+		// 			}
+
+		// 			type += current_block_elements;
+		// 			indicator_pos ++;
+		// 			unpred_data += cur_unpred_count;
+		// 			// decomp_unpred += cur_unpred_count;
+		// 			// printf("block comp done, data_offset from %ld to %ld: diff %ld\n", *data, data_pos, data_pos - *data);
+		// 			// fflush(stdout);
+		// 		}
+		// 	}
+		// }
+
+		type = result_type;
+		// i == 0
+		{
+			// j == 0
+			{
+				// k == 0
+				{
+					data_pos = *data;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = 0;
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;						
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				// i == 0 j == 0 k != 0
+				for(size_t k=1; k<num_z; k++){
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j==0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_y * dim1_offset;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		} // end i==0
+		for(size_t i=1; i<num_x; i++){
+			// j == 0
+			{
+				// k == 0: first block along y and z for block row i (i >= 1); decodes it in place at data_pos, then advances the indicator/type/unpred_data cursors
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x; // x start of block i: early-sized blocks come first, late-sized ones from split_index_x on
+					data_pos = *data + offset_x * dim0_offset; // block origin; no y or z offset is added for this block
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ: each point's code in type[] selects mean, a verbatim value, or neighbour prediction plus a quantized correction
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){ // code intvRadius: the value is mean itself
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){ // code 0: take the next verbatim value from unpred_data
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1; // codes below intvRadius are shifted up by one before use (presumably the encoder skips the slot reserved for mean; confirm against the compressor)
+										pred = block_data_pos[- dim0_offset]; // jj == 0, kk == 0: only the previous-ii neighbour is used
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus a correction in steps of 2 * realPrecision
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1]; // jj == 0: two-axis prediction from the previous-kk and previous-ii neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z; // rewind kk and step one jj row (stride dim1_offset)
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset]; // kk == 0: two-axis prediction from the previous-jj and previous-ii neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1]; // three-axis prediction over the seven previously written neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset; // rewind jj and step one ii plane (stride dim0_offset)
+						}
+						cur_unpred_count = unpredictable_count; // number of unpred_data entries this block consumed
+					}
+					else{
+						// decompress by regression: every point is predicted by a linear function of its in-block coordinates (ii, jj, kk)
+						{
+							// restore the four regression coefficients: code 0 reads a verbatim value, any other code adds a quantized step to the previous coefficient
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++; // one code per coefficient has been consumed for this block
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){ // no mean code and no code shift on the regression path
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++; // move to the next block's indicator
+					type += current_block_elements; // skip the codes this block used
+					unpred_data += cur_unpred_count; // skip the verbatim values this block used
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){ // i >= 1, j == 0, k >= 1: blocks that have a previous block along x and along z; each is decoded in place, then the indicator/type/unpred_data cursors advance
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x; // x start of block i: early-sized blocks come first, late-sized ones from split_index_x on
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z; // same rule along z
+					data_pos = *data + offset_x * dim0_offset + offset_z; // block origin; no y offset is added for this block
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ: each point's code in type[] selects mean, a verbatim value, or neighbour prediction plus a quantized correction
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){ // kk starts at 0: the -1 neighbour is read for every kk in this block
+									type_ = type[index];
+									if(type_ == intvRadius){ // code intvRadius: the value is mean itself
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){ // code 0: take the next verbatim value from unpred_data
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1; // codes below intvRadius are shifted up by one before use (presumably the encoder skips the slot reserved for mean; confirm against the compressor)
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1]; // jj == 0: two-axis prediction from the previous-kk and previous-ii neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus a correction in steps of 2 * realPrecision
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z; // rewind kk and step one jj row (stride dim1_offset)
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1]; // three-axis prediction over the seven previously written neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset; // rewind jj and step one ii plane (stride dim0_offset)
+						}
+						cur_unpred_count = unpredictable_count; // number of unpred_data entries this block consumed
+					}
+					else{
+						// decompress by regression: every point is predicted by a linear function of its in-block coordinates (ii, jj, kk)
+						{
+							// restore the four regression coefficients: code 0 reads a verbatim value, any other code adds a quantized step to the previous coefficient
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++; // one code per coefficient has been consumed for this block
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){ // no mean code and no code shift on the regression path
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++; // move to the next block's indicator
+					type += current_block_elements; // skip the codes this block used
+					unpred_data += cur_unpred_count; // skip the verbatim values this block used
+				}
+			}// end j = 0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0: first block along z for block indices i >= 1, j >= 1; decodes it in place at data_pos, then advances the indicator/type/unpred_data cursors
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x; // x start of block i: early-sized blocks come first, late-sized ones from split_index_x on
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y; // same rule along y
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset; // block origin; no z offset is added for this block
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ: each point's code in type[] selects mean, a verbatim value, or neighbour prediction plus a quantized correction
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){ // jj starts at 0: the -dim1_offset neighbour is read for every jj in this block
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){ // code intvRadius: the value is mean itself
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){ // code 0: take the next verbatim value from unpred_data
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1; // codes below intvRadius are shifted up by one before use (presumably the encoder skips the slot reserved for mean; confirm against the compressor)
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset]; // kk == 0: two-axis prediction from the previous-jj and previous-ii neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus a correction in steps of 2 * realPrecision
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1]; // three-axis prediction over the seven previously written neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z; // rewind kk and step one jj row (stride dim1_offset)
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset; // rewind jj and step one ii plane (stride dim0_offset)
+						}
+						cur_unpred_count = unpredictable_count; // number of unpred_data entries this block consumed
+					}
+					else{
+						// decompress by regression: every point is predicted by a linear function of its in-block coordinates (ii, jj, kk)
+						{
+							// restore the four regression coefficients: code 0 reads a verbatim value, any other code adds a quantized step to the previous coefficient
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++; // one code per coefficient has been consumed for this block
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){ // no mean code and no code shift on the regression path
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++; // move to the next block's indicator
+					type += current_block_elements; // skip the codes this block used
+					unpred_data += cur_unpred_count; // skip the verbatim values this block used
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){ // i >= 1, j >= 1, k >= 1: blocks that have a previous block on all three axes, so one prediction formula serves every point
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x; // x start of block i: early-sized blocks come first, late-sized ones from split_index_x on
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y; // same rule along y
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z; // same rule along z
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z; // block origin: x stride dim0_offset, y stride dim1_offset, z stride 1
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ: each point's code in type[] selects mean, a verbatim value, or neighbour prediction plus a quantized correction
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){ // code intvRadius: the value is mean itself
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){ // code 0: take the next verbatim value from unpred_data
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1; // codes below intvRadius are shifted up by one before use (presumably the encoder skips the slot reserved for mean; confirm against the compressor)
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1]; // three-axis prediction over the seven previously written neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus a correction in steps of 2 * realPrecision
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z; // rewind kk and step one jj row (stride dim1_offset)
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset; // rewind jj and step one ii plane (stride dim0_offset)
+						}
+						cur_unpred_count = unpredictable_count; // number of unpred_data entries this block consumed
+					}
+					else{
+						// decompress by regression: every point is predicted by a linear function of its in-block coordinates (ii, jj, kk)
+						{
+							// restore the four regression coefficients: code 0 reads a verbatim value, any other code adds a quantized step to the previous coefficient
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++; // one code per coefficient has been consumed for this block
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){ // no mean code and no code shift on the regression path
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++; // move to the next block's indicator
+					type += current_block_elements; // skip the codes this block used
+					unpred_data += cur_unpred_count; // skip the verbatim values this block used
+				}
+			}
+		}
+	}
+	else{
+		type = result_type;
+		// i == 0
+		{
+			// j == 0
+			{
+				// k == 0: the very first block (i == 0, j == 0, k == 0) on the path without a mean code; decodes it in place at *data, then advances the indicator/type/unpred_data cursors
+				{
+					data_pos = *data; // block origin is the start of the output buffer
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ: code 0 means a verbatim value, any other code means neighbour prediction plus a quantized correction; loops are peeled so no neighbour before the block origin is read
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){ // code 0: take the next verbatim value from unpred_data
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = 0; // first point of the buffer: no neighbour exists, so the prediction is 0
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus a correction in steps of 2 * realPrecision
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1]; // ii == 0, jj == 0: only the previous-kk neighbour is used
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z; // rewind kk and step one jj row (stride dim1_offset)
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset]; // ii == 0, kk == 0: only the previous-jj neighbour is used
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1]; // ii == 0: two-axis prediction from the previous-kk and previous-jj neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;						
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim0_offset]; // jj == 0, kk == 0: only the previous-ii neighbour is used
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1]; // jj == 0: two-axis prediction from the previous-kk and previous-ii neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset]; // kk == 0: two-axis prediction from the previous-jj and previous-ii neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1]; // three-axis prediction over the seven previously written neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset; // rewind jj and step one ii plane (stride dim0_offset)
+						}
+						cur_unpred_count = unpredictable_count; // number of unpred_data entries this block consumed
+					}
+					else{
+						// decompress by regression: every point is predicted by a linear function of its in-block coordinates (ii, jj, kk)
+						{
+							// restore the four regression coefficients: code 0 reads a verbatim value, any other code adds a quantized step to the previous coefficient
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++; // one code per coefficient has been consumed for this block
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++; // move to the next block's indicator
+					type += current_block_elements; // skip the codes this block used
+					unpred_data += cur_unpred_count; // skip the verbatim values this block used
+				} // end k == 0
+				// i == 0 j == 0 k != 0: blocks in the first x and y block row that have a previous block along z; path without a mean code
+				for(size_t k=1; k<num_z; k++){
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z; // z start of block k: early-sized blocks come first, late-sized ones from split_index_z on
+					data_pos = *data + offset_z; // block origin; no x or y offset is added for this block
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ: code 0 means a verbatim value, any other code means neighbour prediction plus a quantized correction
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){ // kk starts at 0: the -1 neighbour is read for every kk in this block
+									type_ = type[index];
+									if(type_ == 0){ // code 0: take the next verbatim value from unpred_data
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1]; // ii == 0, jj == 0: only the previous-kk neighbour is used
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus a correction in steps of 2 * realPrecision
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z; // rewind kk and step one jj row (stride dim1_offset)
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1]; // ii == 0: two-axis prediction from the previous-kk and previous-jj neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset; // rewind jj and step one ii plane (stride dim0_offset)
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1]; // jj == 0: two-axis prediction from the previous-kk and previous-ii neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1]; // three-axis prediction over the seven previously written neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count; // number of unpred_data entries this block consumed
+					}
+					else{
+						// decompress by regression: every point is predicted by a linear function of its in-block coordinates (ii, jj, kk)
+						{
+							// restore the four regression coefficients: code 0 reads a verbatim value, any other code adds a quantized step to the previous coefficient
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++; // one code per coefficient has been consumed for this block
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++; // move to the next block's indicator
+					type += current_block_elements; // skip the codes this block used
+					unpred_data += cur_unpred_count; // skip the verbatim values this block used
+				}
+			}// end j==0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0: first block along z for i == 0, j >= 1 on the path without a mean code; decodes it in place at data_pos, then advances the indicator/type/unpred_data cursors
+				{
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y; // y start of block j: early-sized blocks come first, late-sized ones from split_index_y on
+					data_pos = *data + offset_y * dim1_offset; // block origin; no x or z offset is added for this block
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ: code 0 means a verbatim value, any other code means neighbour prediction plus a quantized correction
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){ // jj starts at 0: the -dim1_offset neighbour is read for every jj in this block
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){ // code 0: take the next verbatim value from unpred_data
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset]; // ii == 0, kk == 0: only the previous-jj neighbour is used
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus a correction in steps of 2 * realPrecision
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1]; // ii == 0: two-axis prediction from the previous-kk and previous-jj neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z; // rewind kk and step one jj row (stride dim1_offset)
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset; // rewind jj and step one ii plane (stride dim0_offset)
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset]; // kk == 0: two-axis prediction from the previous-jj and previous-ii neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1]; // three-axis prediction over the seven previously written neighbours
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count; // number of unpred_data entries this block consumed
+					}
+					else{
+						// decompress by regression: every point is predicted by a linear function of its in-block coordinates (ii, jj, kk)
+						{
+							// restore the four regression coefficients: code 0 reads a verbatim value, any other code adds a quantized step to the previous coefficient
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++; // one code per coefficient has been consumed for this block
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++; // move to the next block's indicator
+					type += current_block_elements; // skip the codes this block used
+					unpred_data += cur_unpred_count; // skip the verbatim values this block used
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		} // end i==0
+		for(size_t i=1; i<num_x; i++){
+			// j == 0
+			{
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					data_pos = *data + offset_x * dim0_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j = 0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		}
+	}
+
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(hist_data, (*data), num_elements*sizeof(double));
+#endif	
+
+	free(coeff_result_type);
+
+	free(indicator);
+	free(result_type);
+}
diff --git a/deps/SZ/sz/src/szd_double_pwr.c b/deps/SZ/sz/src/szd_double_pwr.c
new file mode 100644
index 0000000000000000000000000000000000000000..aa9bb96ab8196a03c5b3c02b134b0e448306f5ba
--- /dev/null
+++ b/deps/SZ/sz/src/szd_double_pwr.c
@@ -0,0 +1,1530 @@
+/**
+ *  @file szd_double_pwr.c
+ *  @author Sheng Di, Dingwen Tao, Xin Liang, Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang
+ *  @date Feb, 2019
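+ *  @brief Decompression routines for double-precision data series whose error bounds are stored per segment (1D) or per block (2D) in pwrErrBoundBytes (the "_pwr" variants).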
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageD.h"
+#include "CompressElement.h"
+#include "sz.h"
+#include "Huffman.h"
+#include "sz_double_pwr.h"
+#include "utility.h"
+//#include "rw.h"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wchar-subscripts"
+
+/**
+ * Reconstruct a 1-D series of doubles from a TightDataPointStorageD whose
+ * error bound changes every tdps->segment_size elements.
+ *
+ * @param data              out: receives a freshly malloc'd array of
+ *                          dataSeriesLength doubles (not freed here).
+ * @param dataSeriesLength  number of elements to reconstruct.
+ * @param tdps              compressed representation; the fields read are
+ *                          intervals, pwrErrBoundBytes, exactDataNum,
+ *                          leadNumArray(_size), stateNum, typeArray,
+ *                          medianValue, segment_size, radExpo,
+ *                          residualMidBits and exactMidBytes.
+ *
+ * Each element carries a Huffman-decoded code:
+ *   - code != 0: the value is the previous output plus
+ *     (code - exe_params->intvRadius) * 2 * realPrecision;
+ *   - code == 0: the value was stored "exactly" and is rebuilt from the
+ *     leading bytes it shares with the previous exact value, further whole
+ *     bytes from exactMidBytes and a few residual bits from residualMidBits.
+ */
+void decompressDataSeries_double_1D_pwr(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+
+	/* Scratch for bytesToDouble(): two stored bytes followed by six zero bytes. */
+	unsigned char boundBytes[8] = {0};
+	unsigned char* boundCursor = tdps->pwrErrBoundBytes;
+	unsigned char* leadCounts;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadCounts);
+
+	double* out = (double*)malloc(sizeof(double)*dataSeriesLength);
+	*data = out;
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	unsigned char prevBytes[8] = {0};	/* byte image of the last exact value */
+	unsigned char curBytes[8];
+
+	size_t bitCursor = 0;		/* bit position inside residualMidBits */
+	size_t midByteIdx = 0;		/* byte position inside residualMidBits */
+	size_t leadIdx = 0;		/* next entry of leadCounts */
+	size_t exactByteIdx = 0;	/* next byte of exactMidBytes */
+	size_t pos, b;
+
+	int reqLength = 0, reqBytes = 0, tailBits = 0;
+	int reqLengthIsCurrent = 0;	/* cleared whenever a new segment starts */
+	double medianValue = tdps->medianValue;
+	double realPrecision = 0, step = 0;
+
+	for (pos = 0; pos < dataSeriesLength; pos++)
+	{
+		/* A new segment begins: fetch its two-byte error bound. */
+		if (pos % tdps->segment_size == 0)
+		{
+			boundBytes[0] = boundCursor[0];
+			boundBytes[1] = boundCursor[1];
+			boundCursor += 2;
+			memset(boundBytes + 2, 0, 6);
+
+			realPrecision = bytesToDouble(boundBytes);
+			step = realPrecision*2;
+			reqLengthIsCurrent = 0;
+		}
+
+		int code = codes[pos];
+		if (code != 0)
+		{
+			/* Predicted from the preceding reconstructed value. */
+			double predicted = out[pos-1];
+			out[pos] = predicted + (code-exe_params->intvRadius)*step;
+			continue;
+		}
+
+		/* Exact value: refresh the bit budget lazily, once per segment.
+		 * computeReqLength_double() receives pointers to reqLength and
+		 * medianValue, so it may update both. */
+		if (!reqLengthIsCurrent)
+		{
+			computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+			reqBytes = reqLength/8;
+			tailBits = reqLength%8;
+			reqLengthIsCurrent = 1;
+		}
+
+		/* Pull tailBits residual bits out of the packed bit stream. */
+		int tail = 0;
+		if (tailBits != 0)
+		{
+			int bitOff = bitCursor % 8;
+			int shift = getRightMovingSteps(bitOff, tailBits);
+			if (shift == 0)
+			{
+				/* The field ends exactly on the byte boundary. */
+				int mask = getRightMovingCode(bitOff, tailBits);
+				tail = (tdps->residualMidBits[midByteIdx] & mask);
+				midByteIdx++;
+			}
+			else if (shift > 0)
+			{
+				/* The field lies inside the current byte. */
+				int mask = getRightMovingCode(bitOff, tailBits);
+				tail = (tdps->residualMidBits[midByteIdx] & mask) >> shift;
+			}
+			else
+			{
+				/* The field straddles two bytes. */
+				int headMask = getLeftMovingCode(bitOff);
+				int tailMask = getRightMovingCode(bitOff, tailBits);
+				int spill = -shift;
+				tail = (tdps->residualMidBits[midByteIdx] & headMask) << spill;
+				midByteIdx++;
+				tail = tail | ((tdps->residualMidBits[midByteIdx] & tailMask) >> (8 - spill));
+			}
+			bitCursor += tailBits;
+		}
+
+		/* Assemble the byte image: shared leading bytes, then fresh whole
+		 * bytes, then the residual bits left-aligned in the next byte. */
+		memset(curBytes, 0, 8);
+		unsigned char shared = leadCounts[leadIdx++];
+		memcpy(curBytes, prevBytes, shared);
+		for (b = shared; b < reqBytes; b++)
+			curBytes[b] = tdps->exactMidBytes[exactByteIdx++];
+		if (tailBits != 0)
+			curBytes[reqBytes] = (unsigned char) (tail << (8 - tailBits));
+
+		double exact = bytesToDouble(curBytes);
+		out[pos] = exact + medianValue;
+		memcpy(prevBytes, curBytes, 8);
+	}
+
+	free(leadCounts);
+	free(codes);
+}
+
+/**
+ * Expand the packed error bounds of a 2-D data set into an R1 x R2 array.
+ *
+ * Bounds are read consecutively from tdps->pwrErrBoundBytes, two bytes each.
+ * Those two bytes become the first two bytes of an otherwise zeroed 8-byte
+ * buffer that is handed to bytesToDouble(). Results are written in row-major
+ * order (index = row * R2 + column).
+ *
+ * @param R1         number of rows in the result.
+ * @param R2         number of columns in the result.
+ * @param blockSize  not used by this function.
+ * @param tdps       source of the packed bytes (pwrErrBoundBytes).
+ * @return malloc'd array of R1*R2 doubles; it is not freed here.
+ */
+double* extractRealPrecision_2D_double(size_t R1, size_t R2, int blockSize, TightDataPointStorageD* tdps)
+{
+	unsigned char scratch[8];
+	unsigned char* src = tdps->pwrErrBoundBytes;
+	double* bounds = (double*)malloc(sizeof(double)*R1*R2);
+	double* dst = bounds;
+	size_t row, col;
+
+	/* Only the first two bytes are ever rewritten; the rest stay zero. */
+	memset(scratch, 0, sizeof(scratch));
+
+	for (row = 0; row < R1; row++)
+	{
+		for (col = 0; col < R2; col++)
+		{
+			scratch[0] = src[0];
+			scratch[1] = src[1];
+			src += 2;
+			*dst++ = bytesToDouble(scratch);
+		}
+	}
+	return bounds;
+}
+
+void decompressDataSeries_double_2D_pwr(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	//printf("tdps->intervals=%d, exe_params->intvRadius=%d\n", tdps->intervals, exe_params->intvRadius);
+	
+	size_t j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+	// in resiMidBits, p is to track the
+	// byte_index of resiMidBits, l is for
+	// leadNum
+	size_t dataSeriesLength = r1*r2;
+	//	printf ("%d %d\n", r1, r2);
+
+	unsigned char* leadNum;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[8];
+	unsigned char curBytes[8];
+
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0;
+	int reqLength, reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	double medianValue, exactData, realPrecision;
+	int type_;
+	double pred1D, pred2D;
+	size_t ii, jj, II = 0, JJ = 0, updateReqLength = 1;
+
+	int blockSize = computeBlockEdgeSize_2D(tdps->segment_size);
+	size_t R1 = 1+(r1-1)/blockSize;
+	size_t R2 = 1+(r2-1)/blockSize;		
+	double* pwrErrBound = extractRealPrecision_2D_double(R1, R2, blockSize, tdps);
+
+	realPrecision = pwrErrBound[0];	
+	computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+	reqBytesLength = reqLength/8;
+	resiBitsLength = reqLength%8;
+
+	/* Process Row-0, data 0 */
+
+	// compute resiBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data
+	memset(curBytes, 0, 8);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToDouble(curBytes);
+	(*data)[0] = exactData + medianValue;
+	memcpy(preBytes,curBytes,8);
+
+	/* Process Row-0, data 1 */
+	type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];		
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+		
+		exactData = bytesToDouble(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,8);
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{
+		if(jj%blockSize==0)
+		{
+			II = 0;
+			JJ++;
+			realPrecision = pwrErrBound[JJ];
+			updateReqLength = 0;			
+		}		
+		
+		type_ = type[jj];
+		if (type_ != 0)
+		{			
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}			
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */
+		if(ii%blockSize==0)
+			II++;
+		JJ = 0;
+		realPrecision = pwrErrBound[II*R2+JJ];				
+		updateReqLength = 0;
+
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+			
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+
+			if(jj%blockSize==0)
+				JJ++;
+			realPrecision = pwrErrBound[II*R2+JJ];			
+			updateReqLength = 0;
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				if(updateReqLength==0)
+				{
+					computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;				
+					updateReqLength = 1;
+				}						
+				
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+	}
+
+	free(pwrErrBound);
+	free(leadNum);
+	free(type);
+	return;
+}
+
+/*
+ * Expand the per-block error bounds stored in tdps->pwrErrBoundBytes into a
+ * freshly allocated array of R1*R2*R3 doubles (row-major: index
+ * i*R2*R3 + j*R3 + k).
+ *
+ * Each bound occupies two consecutive bytes of the stream; they are placed in
+ * the first two bytes of an otherwise zero 8-byte image that is handed to
+ * bytesToDouble().
+ *
+ * R1, R2, R3 : number of blocks along each dimension.
+ * blockSize  : unused here; kept so existing callers keep compiling.
+ * tdps       : storage whose pwrErrBoundBytes holds 2*R1*R2*R3 bytes
+ *              (assumed -- the length is not checked here).
+ *
+ * Returns the malloc'ed array; the caller owns it and must free() it.
+ */
+double* extractRealPrecision_3D_double(size_t R1, size_t R2, size_t R3, int blockSize, TightDataPointStorageD* tdps)
+{
+	size_t i, j, k, IR, JR, p = 0;
+	size_t R23 = R2*R3;
+	unsigned char* bytes = tdps->pwrErrBoundBytes;
+	/* Must be a full 8-byte double image: the other callers in this file pass
+	 * bytesToDouble() 8-byte buffers, so a 4-byte scratch array would be
+	 * over-read and leave the low bytes of the result indeterminate. */
+	unsigned char tmpBytes[8] = {0};
+	double* result = (double*)malloc(sizeof(double)*R1*R2*R3);
+	(void)blockSize;
+	for(i=0;i<R1;i++)
+	{
+		IR = i*R23;
+		for(j=0;j<R2;j++)
+		{
+			JR = j*R3;
+			for(k=0;k<R3;k++)
+			{
+				/* only the two leading bytes are stored; the rest stay zero */
+				tmpBytes[0] = bytes[p++];
+				tmpBytes[1] = bytes[p++];
+				result[IR+JR+k]=bytesToDouble(tmpBytes);
+			}
+		}
+	}
+	return result;
+}
+
+/*
+ * Read cursor over the exactly-stored ("unpredictable") values consumed by
+ * decompressDataSeries_double_3D_pwr(). It bundles the positions inside the
+ * three byte streams together with the currently active bit-length split, so
+ * that the recovery code exists in one place only.
+ */
+typedef struct ExactCursor_double_3D_pwr
+{
+	TightDataPointStorageD* tdps; /* source of residualMidBits / exactMidBytes / radExpo */
+	unsigned char* leadNum;       /* per exact value: number of leading bytes copied from the previous one */
+	unsigned char preBytes[8];    /* byte image of the previously recovered exact value */
+	size_t k;                     /* bit position inside residualMidBits */
+	size_t p;                     /* byte index inside residualMidBits */
+	size_t l;                     /* next entry of leadNum */
+	size_t curByteIndex;          /* next byte of exactMidBytes */
+	int reqLength;                /* bit length reported by computeReqLength_double() */
+	int reqBytesLength;           /* reqLength / 8 : whole bytes per exact value */
+	int resiBitsLength;           /* reqLength % 8 : trailing residual bits per exact value */
+	double medianValue;           /* offset added to every recovered exact value */
+} ExactCursor_double_3D_pwr;
+
+/* Recompute the byte/bit split of exact values for the given error bound. */
+static void exactCursor_setPrecision_double_3D_pwr(ExactCursor_double_3D_pwr* cur, double realPrecision)
+{
+	/* computeReqLength_double() receives medianValue by pointer and may update it */
+	computeReqLength_double(realPrecision, cur->tdps->radExpo, &cur->reqLength, &cur->medianValue);
+	cur->reqBytesLength = cur->reqLength/8;
+	cur->resiBitsLength = cur->reqLength%8;
+}
+
+/* Recover the next exactly-stored value and advance all stream positions. */
+static double exactCursor_next_double_3D_pwr(ExactCursor_double_3D_pwr* cur)
+{
+	TightDataPointStorageD* tdps = cur->tdps;
+	int reqBytesLength = cur->reqBytesLength;
+	int resiBitsLength = cur->resiBitsLength;
+	unsigned char curBytes[8];
+	unsigned char leadingNum;
+	int resiBits = 0;
+	size_t j;
+	double exactData;
+
+	/* compute resiBits: pull resiBitsLength bits starting at bit position k */
+	if (resiBitsLength != 0) {
+		int kMod8 = cur->k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			/* bits lie inside the current byte and do not reach its end */
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[cur->p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			/* bits straddle the current byte and the next one */
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[cur->p] & code1) << leftMovSteps;
+			cur->p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[cur->p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0
+		{
+			/* bits end exactly at the byte boundary */
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[cur->p] & code);
+			cur->p++;
+		}
+		cur->k += resiBitsLength;
+	}
+
+	// recover the exact data
+	memset(curBytes, 0, 8);
+	leadingNum = cur->leadNum[cur->l++];
+	memcpy(curBytes, cur->preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[cur->curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToDouble(curBytes);
+	memcpy(cur->preBytes, curBytes, 8);
+	return exactData + cur->medianValue;
+}
+
+/*
+ * Decompress a 3D double array (r1 layers x r2 rows x r3 columns, row-major)
+ * that was stored with per-block error bounds.
+ *
+ * data : out; *data receives a malloc'ed array of r1*r2*r3 doubles owned by
+ *        the caller.
+ * tdps : compressed representation (type codes, exact-value streams, per-block
+ *        bounds).
+ *
+ * Every point carries a type code. A non-zero code means the value is a
+ * prediction built from already decoded neighbours plus
+ * 2 * (code - intvRadius) * realPrecision; a zero code means the value is read
+ * from the exact-value streams. realPrecision is looked up per block of
+ * blockSize points along each dimension (KK/II/JJ are the block coordinates).
+ *
+ * Points are visited strictly in memory order, because the stream cursors and
+ * the predictors both depend on everything decoded before.
+ *
+ * NOTE(review): element 1 is always handled as "row 0, column 1", so the
+ * routine appears to assume r3 >= 2 -- confirm with callers.
+ */
+void decompressDataSeries_double_3D_pwr(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+
+	ExactCursor_double_3D_pwr cur;
+	memset(&cur, 0, sizeof(cur)); /* all positions start at 0, preBytes all zero */
+	cur.tdps = tdps;
+	/* Seed the offset from the stored median; previously this was left
+	 * uninitialised unless computeReqLength_double() happened to write it. */
+	cur.medianValue = tdps->medianValue;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &cur.leadNum);
+
+	double* out = (double*)malloc(sizeof(double)*dataSeriesLength);
+	*data = out;
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	double realPrecision, pred;
+	int type_;
+	int reqLengthStale = 0; /* set when realPrecision changed but the bit split was not refreshed yet */
+	size_t ii, jj, kk, II = 0, JJ = 0, KK = 0;
+	size_t index;
+
+	int blockSize = computeBlockEdgeSize_3D(tdps->segment_size);
+	size_t R1 = 1+(r1-1)/blockSize;
+	size_t R2 = 1+(r2-1)/blockSize;
+	size_t R3 = 1+(r3-1)/blockSize;
+	size_t R23 = R2*R3;
+	double* pwrErrBound = extractRealPrecision_3D_double(R1, R2, R3, blockSize, tdps);
+
+	realPrecision = pwrErrBound[0];
+	exactCursor_setPrecision_double_3D_pwr(&cur, realPrecision);
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Row-0 data 0: always read as an exact value (its type code is not consulted) */
+	out[0] = exactCursor_next_double_3D_pwr(&cur);
+
+	/* Row-0 data 1: predicted from data 0 only */
+	type_ = type[1];
+	if (type_ != 0)
+	{
+		pred = out[0];
+		out[1] = pred + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		out[1] = exactCursor_next_double_3D_pwr(&cur);
+	}
+
+	/* Row-0, data 2 --> data r3-1: linear extrapolation from the two previous points */
+	for (jj = 2; jj < r3; jj++)
+	{
+		if(jj%blockSize==0)
+		{
+			/* entering the next block along the row; KK and II are still 0 here */
+			JJ++;
+			realPrecision = pwrErrBound[JJ];
+			reqLengthStale = 1;
+		}
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred = 2*out[jj-1] - out[jj-2];
+			out[jj] = pred + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			/* in this row the bit split is refreshed lazily, once per block */
+			if (reqLengthStale)
+			{
+				exactCursor_setPrecision_double_3D_pwr(&cur, realPrecision);
+				reqLengthStale = 0;
+			}
+			out[jj] = exactCursor_next_double_3D_pwr(&cur);
+		}
+	}
+
+	/* Row-1 --> Row-r2-1 of layer 0. From here on the bit split is recomputed
+	 * before every exact read. */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* row-ii data 0: predicted from the point directly above */
+		if(ii%blockSize==0)
+			II++;
+		JJ = 0;
+		realPrecision = pwrErrBound[II*R3+JJ];
+
+		index = ii*r3;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred = out[index-r3];
+			out[index] = pred + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			exactCursor_setPrecision_double_3D_pwr(&cur, realPrecision);
+			out[index] = exactCursor_next_double_3D_pwr(&cur);
+		}
+
+		/* row-ii data 1 --> r3-1: 2D prediction inside the layer */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+
+			if(jj%blockSize==0)
+				JJ++;
+			realPrecision = pwrErrBound[II*R3+JJ];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred = out[index-1] + out[index-r3] - out[index-r3-1];
+				out[index] = pred + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				exactCursor_setPrecision_double_3D_pwr(&cur, realPrecision);
+				out[index] = exactCursor_next_double_3D_pwr(&cur);
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Row-0 data 0: predicted from the same position in the previous layer */
+		index = kk*r23;
+		if(kk%blockSize==0)
+			KK++;
+		II = 0;
+		JJ = 0;
+
+		realPrecision = pwrErrBound[KK*R23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred = out[index-r23];
+			out[index] = pred + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			exactCursor_setPrecision_double_3D_pwr(&cur, realPrecision);
+			out[index] = exactCursor_next_double_3D_pwr(&cur);
+		}
+
+		/* Row-0 data 1 --> data r3-1: 2D prediction across (column, layer) */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+
+			if(jj%blockSize==0)
+				JJ++;
+
+			realPrecision = pwrErrBound[KK*R23+JJ];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred = out[index-1] + out[index-r23] - out[index-r23-1];
+				out[index] = pred + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				exactCursor_setPrecision_double_3D_pwr(&cur, realPrecision);
+				out[index] = exactCursor_next_double_3D_pwr(&cur);
+			}
+		}
+
+		/* Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Row-i data 0: 2D prediction across (row, layer) */
+			index = kk*r23 + ii*r3;
+
+			if(ii%blockSize==0)
+				II++;
+			JJ = 0;
+
+			realPrecision = pwrErrBound[KK*R23+II*R3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred = out[index-r3] + out[index-r23] - out[index-r23-r3];
+				out[index] = pred + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				exactCursor_setPrecision_double_3D_pwr(&cur, realPrecision);
+				out[index] = exactCursor_next_double_3D_pwr(&cur);
+			}
+
+			/* Row-i data 1 --> data r3-1: full 3D prediction from seven neighbours */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				if(jj%blockSize==0)
+					JJ++;
+
+				realPrecision = pwrErrBound[KK*R23+II*R3+JJ];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					pred = out[index-1] + out[index-r3] + out[index-r23]
+					- out[index-r3-1] - out[index-r23-r3] - out[index-r23-1] + out[index-r23-r3-1];
+					out[index] = pred + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					exactCursor_setPrecision_double_3D_pwr(&cur, realPrecision);
+					out[index] = exactCursor_next_double_3D_pwr(&cur);
+				}
+			}
+		}
+	}
+
+	free(pwrErrBound);
+	free(cur.leadNum);
+	free(type);
+	return;
+}
+
+/*
+ * Decompress a 1D double series that was stored in "group" mode: every point
+ * carries a signed group id, and predictions are made from the last value
+ * decoded for the same group.
+ *
+ * data             : out; *data receives a malloc'ed array of
+ *                    dataSeriesLength doubles owned by the caller.
+ * dataSeriesLength : number of points to decode.
+ * tdps             : compressed representation (type codes, group ids in
+ *                    pwrErrBoundBytes, exact-value streams).
+ *
+ * Side effect: overwrites exe_params->intvRadius.
+ */
+void decompressDataSeries_double_1D_pwrgroup(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps) 
+{
+	double *posGroups, *negGroups, *groups;
+	/* Last value seen for group id +1 / -1. Zero-initialised so that a
+	 * predicted point arriving before any exact point of that group reads a
+	 * defined value instead of an indeterminate one. */
+	double pos_01_group = 0, neg_01_group = 0;
+	int *posFlags, *negFlags;
+	
+	updateQuantizationInfo(tdps->intervals);
+	
+	unsigned char* leadNum;
+	double interval;// = (double)tdps->realPrecision*2;
+	
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	createRangeGroups_double(&posGroups, &negGroups, &posFlags, &negFlags);
+	
+	double realGroupPrecision;
+	double realPrecision = tdps->realPrecision;
+	char* groupID = decompressGroupIDArray(tdps->pwrErrBoundBytes, tdps->dataSeriesLength);
+	
+	//note that the groupID values here are [1,2,3,....,18] or [-1,-2,...,-18]
+	
+	double* groupErrorBounds = generateGroupErrBounds(confparams_dec->errorBoundMode, realPrecision, confparams_dec->pw_relBoundRatio);
+	/* NOTE(review): this first assignment is overwritten a few lines below;
+	 * the call is kept in case generateGroupMaxIntervalCount() has side effects. */
+	exe_params->intvRadius = generateGroupMaxIntervalCount(groupErrorBounds);
+		
+	size_t nbBins = (size_t)(1/confparams_dec->pw_relBoundRatio + 0.5);
+	if(nbBins%2==1)
+		nbBins++;
+	exe_params->intvRadius = nbBins;
+
+	unsigned char preBytes[8];
+	unsigned char curBytes[8];
+	
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0;
+	int reqLength, reqBytesLength = 0, resiBitsLength = 0, resiBits; 
+	unsigned char leadingNum;	
+	double medianValue, exactData, curValue, predValue;
+	
+	medianValue = tdps->medianValue;
+	
+	size_t i, j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+							// in resiMidBits, p is to track the
+							// byte_index of resiMidBits, l is for
+							// leadNum
+							
+	int type_, updateReqLength = 0;
+	/* Group ids are signed. Plain char is unsigned on some targets, which
+	 * would make every "negative group" test below false, so the id is read
+	 * through signed char and the slot index is kept in an int. */
+	signed char rawGrpID = 0;
+	int indexGrpID = 0;
+	for (i = 0; i < dataSeriesLength; i++) 
+	{
+		rawGrpID = (signed char)groupID[i];
+		
+		/* pick the slot that remembers the last value of this point's group */
+		if(rawGrpID >= 2)
+		{
+			groups = posGroups;
+			indexGrpID = rawGrpID - 2;
+		}
+		else if(rawGrpID <= -2)
+		{
+			groups = negGroups;
+			indexGrpID = -rawGrpID - 2;
+		}
+		else if(rawGrpID == 1)
+		{
+			groups = &pos_01_group;
+			indexGrpID = 0;
+		}
+		else //rawGrpID == -1
+		{
+			groups = &neg_01_group;
+			indexGrpID = 0;			
+		}
+		
+		type_ = type[i];
+		switch (type_) {
+		case 0:
+			/* exactly stored value: the bit split is computed once, on first use */
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;	
+				updateReqLength = 1;	
+			}
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					/* residual bits straddle two bytes of residualMidBits */
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data	
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			/* the first leadingNum bytes are shared with the previous exact value */
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+			
+			exactData = bytesToDouble(curBytes);
+			exactData = exactData + medianValue;
+			(*data)[i] = exactData;
+			memcpy(preBytes,curBytes,8);
+			
+			groups[indexGrpID] = exactData;
+			
+			break;
+		default:
+			predValue = groups[indexGrpID]; //Here, groups[indexGrpID] is the previous value.
+			realGroupPrecision = groupErrorBounds[indexGrpID];
+			interval = realGroupPrecision*2;		
+			
+			curValue = predValue + (type_-exe_params->intvRadius)*interval;
+			
+			/* a reconstructed value whose sign contradicts its group id is set to 0 */
+			if((curValue>0&&rawGrpID<0)||(curValue<0&&rawGrpID>0))
+				curValue = 0;
+				
+			(*data)[i] = curValue;
+			groups[indexGrpID] = curValue;
+			break;		
+		}
+	}	
+	
+	free(leadNum);
+	free(type);
+	
+	free(posGroups);
+	free(negGroups);
+	free(posFlags);
+	free(negFlags);
+	free(groupErrorBounds);
+	free(groupID);
+}
+
+/*
+ * Decompress a 1D series whose values were stored as base-2 logarithms.
+ *
+ * The series is first decoded with decompressDataSeries_double_1D(). Each
+ * decoded entry is then mapped back: entries below tdps->minLogValue become 0,
+ * all others become exp2(entry). When tdps->pwrErrBoundBytes_size is positive,
+ * pwrErrBoundBytes is losslessly decompressed into one byte per point and a
+ * non-zero byte negates the corresponding result.
+ *
+ * data             : out; *data is allocated by the underlying 1D decoder.
+ * dataSeriesLength : number of points.
+ * tdps             : compressed representation.
+ */
+void decompressDataSeries_double_1D_pwr_pre_log(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps) {
+
+	decompressDataSeries_double_1D(data, dataSeriesLength, NULL, tdps);
+
+	double minLog = tdps->minLogValue;
+	double* values = *data;
+	size_t idx;
+
+	if(tdps->pwrErrBoundBytes_size > 0){
+		/* a sign byte is available for every point */
+		unsigned char * signMask;
+		sz_lossless_decompress(ZSTD_COMPRESSOR, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signMask, dataSeriesLength);
+
+		for(idx = 0; idx < dataSeriesLength; idx++){
+			double magnitude = (values[idx] < minLog) ? 0 : exp2(values[idx]);
+			values[idx] = signMask[idx] ? -magnitude : magnitude;
+		}
+		free(signMask);
+	}
+	else{
+		/* no sign information stored: results keep the sign exp2 gives them */
+		for(idx = 0; idx < dataSeriesLength; idx++){
+			double magnitude = (values[idx] < minLog) ? 0 : exp2(values[idx]);
+			values[idx] = magnitude;
+		}
+	}
+
+}
+
+void decompressDataSeries_double_2D_pwr_pre_log(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps) {
+	/* Decode the r1*r2 series with the plain 2D decompressor, then undo the base-2 log
+	 * transform: entries below tdps->minLogValue become 0, the rest become exp2(entry), and
+	 * entries whose byte in the decompressed pwrErrBoundBytes stream is non-zero are negated. */
+	size_t dataSeriesLength = r1 * r2;
+	unsigned char* signBytes = NULL;
+	int hasSigns;
+	double minLog, restored;
+	size_t idx;
+	decompressDataSeries_double_2D(data, r1, r2, NULL, tdps);
+	minLog = tdps->minLogValue;
+	hasSigns = (tdps->pwrErrBoundBytes_size > 0);
+	if(hasSigns)
+		sz_lossless_decompress(ZSTD_COMPRESSOR, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signBytes, dataSeriesLength);
+
+	for(idx = 0; idx < dataSeriesLength; idx++){
+		restored = ((*data)[idx] < minLog) ? 0 : exp2((*data)[idx]);
+		if(hasSigns && signBytes[idx])
+			restored = -restored;
+		(*data)[idx] = restored;
+	}
+	free(signBytes);
+}
+
+void decompressDataSeries_double_3D_pwr_pre_log(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps) {
+	/* Decode the r1*r2*r3 series with the plain 3D decompressor, then undo the base-2 log
+	 * transform: entries below tdps->minLogValue become 0, the rest become exp2(entry), and
+	 * entries whose byte in the decompressed pwrErrBoundBytes stream is non-zero are negated. */
+	size_t dataSeriesLength = r1 * r2 * r3;
+	unsigned char* signBytes = NULL;
+	int hasSigns;
+	double minLog, restored;
+	size_t idx;
+	decompressDataSeries_double_3D(data, r1, r2, r3, NULL, tdps);
+	minLog = tdps->minLogValue;
+	hasSigns = (tdps->pwrErrBoundBytes_size > 0);
+	if(hasSigns)
+		sz_lossless_decompress(ZSTD_COMPRESSOR, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signBytes, dataSeriesLength);
+
+	for(idx = 0; idx < dataSeriesLength; idx++){
+		restored = ((*data)[idx] < minLog) ? 0 : exp2((*data)[idx]);
+		if(hasSigns && signBytes[idx])
+			restored = -restored;
+		(*data)[idx] = restored;
+	}
+	free(signBytes);
+}
+
+void decompressDataSeries_double_1D_pwr_pre_log_MSST19(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps) 
+{
+	/* Decode the series with the MSST19 1D decompressor, zero the entries that fall below
+	 * tdps->minLogValue, and force the sign bit on entries flagged in the sign stream. */
+	decompressDataSeries_double_1D_MSST19(data, dataSeriesLength, tdps);
+	double threshold = tdps->minLogValue;
+
+	if(tdps->pwrErrBoundBytes_size > 0){
+		unsigned char * signs = NULL;
+		/* pwrErrBoundBytes_size > 0 in this branch, so the sign stream is always present
+		 * and is decompressed into one byte per element. */
+		sz_lossless_decompress(ZSTD_COMPRESSOR, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signs, dataSeriesLength);
+		for(size_t i=0; i<dataSeriesLength; i++){
+			if((*data)[i] < threshold && (*data)[i] >= 0){
+				(*data)[i] = 0;
+				continue;
+			}
+			if(signs[i]){
+				/* Set the top (sign) bit via memcpy rather than a uint64_t* cast of the double, which would break the aliasing rules. */
+				uint64_t bits;
+				memcpy(&bits, &(*data)[i], sizeof(bits));
+				bits |= 0x8000000000000000ULL;
+				memcpy(&(*data)[i], &bits, sizeof(bits));
+			}
+		}
+		free(signs);
+	}
+	else{
+		for(size_t i=0; i<dataSeriesLength; i++){
+			if((*data)[i] < threshold) (*data)[i] = 0;
+		}
+	}
+}
+
+void decompressDataSeries_double_2D_pwr_pre_log_MSST19(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps) {
+	/* Decode the r1*r2 series with the MSST19 2D decompressor, zero the entries that
+	 * fall below tdps->minLogValue, and force the sign bit on entries flagged in the
+	 * sign stream. */
+	size_t dataSeriesLength = r1 * r2;
+	decompressDataSeries_double_2D_MSST19(data, r1, r2, tdps);
+	double threshold = tdps->minLogValue;
+
+	if(tdps->pwrErrBoundBytes_size > 0){
+		unsigned char * signs = NULL;
+		/* pwrErrBoundBytes_size > 0 in this branch, so the sign stream is always present
+		 * and is decompressed into one byte per element. */
+		sz_lossless_decompress(ZSTD_COMPRESSOR, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signs, dataSeriesLength);
+		for(size_t i=0; i<dataSeriesLength; i++){
+			if((*data)[i] < threshold && (*data)[i] >= 0){
+				(*data)[i] = 0;
+				continue;
+			}
+			if(signs[i]){
+				/* Set the top (sign) bit via memcpy rather than a uint64_t* cast of the double, which would break the aliasing rules. */
+				uint64_t bits;
+				memcpy(&bits, &(*data)[i], sizeof(bits));
+				bits |= 0x8000000000000000ULL;
+				memcpy(&(*data)[i], &bits, sizeof(bits));
+			}
+		}
+		free(signs);
+	}
+	else{
+		for(size_t i=0; i<dataSeriesLength; i++){
+			if((*data)[i] < threshold) (*data)[i] = 0;
+		}
+	}
+}
+
+void decompressDataSeries_double_3D_pwr_pre_log_MSST19(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps) {
+	/* Decode the r1*r2*r3 series with the MSST19 3D decompressor, zero the entries that fall
+	 * below tdps->minLogValue, and force the sign bit on entries flagged in the sign stream. */
+	size_t dataSeriesLength = r1 * r2 * r3;
+	decompressDataSeries_double_3D_MSST19(data, r1, r2, r3, tdps);
+	double threshold = tdps->minLogValue;
+
+	if(tdps->pwrErrBoundBytes_size > 0){
+		unsigned char * signs = NULL;
+		/* pwrErrBoundBytes_size > 0 in this branch, so the sign stream is always present
+		 * and is decompressed into one byte per element. */
+		sz_lossless_decompress(ZSTD_COMPRESSOR, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signs, dataSeriesLength);
+		for(size_t i=0; i<dataSeriesLength; i++){
+			if((*data)[i] < threshold && (*data)[i] >= 0) {
+				(*data)[i] = 0;
+				continue;
+			}
+			if(signs[i]) {
+				/* Set the top (sign) bit via memcpy rather than a uint64_t* cast of the double, which would break the aliasing rules. */
+				uint64_t bits;
+				memcpy(&bits, &(*data)[i], sizeof(bits));
+				bits |= 0x8000000000000000ULL;
+				memcpy(&(*data)[i], &bits, sizeof(bits));
+			}
+		}
+		free(signs);
+	}
+	else{
+		for(size_t i=0; i<dataSeriesLength; i++){
+			if((*data)[i] < threshold) (*data)[i] = 0;
+		}
+	}
+}
+
+#pragma GCC diagnostic pop
diff --git a/deps/SZ/sz/src/szd_double_ts.c b/deps/SZ/sz/src/szd_double_ts.c
new file mode 100644
index 0000000000000000000000000000000000000000..2a438a10df5e57ffdfccefddbf517c5569c9d655
--- /dev/null
+++ b/deps/SZ/sz/src/szd_double_ts.c
@@ -0,0 +1,114 @@
+/**
+ *  @file szd_double_ts.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief 
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "szd_double.h"
+#include "TightDataPointStorageD.h"
+#include "sz.h"
+#include "Huffman.h"
+#include "szd_double_ts.h"
+
+void decompressDataSeries_double_1D_ts(double** data, size_t dataSeriesLength, double* hist_data, TightDataPointStorageD* tdps) 
+{	/* Rebuilds dataSeriesLength doubles from tdps into a freshly malloc'ed *data, then copies the result into hist_data. */
+	double* lastSnapshotData = hist_data;
+	updateQuantizationInfo(tdps->intervals); // presumably refreshes exe_params->intvRadius read below -- TODO confirm
+	size_t i, j, k = 0, p = 0, l = 0; // k is the running bit offset of the residual bits
+								// in residualMidBits, p is the byte index
+								// into residualMidBits, l is the index of
+								// the next entry of leadNum
+	unsigned char* leadNum;
+	double interval = tdps->realPrecision*2;
+	// Expand the packed leadNumArray into leadNum (returned through the pointer, freed at the end).
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// Huffman-decode one quantization code per element into type[].
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[8];
+	unsigned char curBytes[8];
+	// preBytes holds the byte image of the previously decoded exact value (all zero at first).
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0;
+	int reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	double medianValue, exactData, predValue = 0;
+	// reqLength (in bits) splits into whole stored bytes plus a sub-byte residual.
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+	medianValue = tdps->medianValue;
+	
+	int type_;
+	for (i = 0; i < dataSeriesLength; i++) {
+		type_ = type[i];
+		switch (type_) {
+		case 0: // code 0: the value is rebuilt from its stored bytes instead of being predicted
+			// compute resiBits: pull the next resiBitsLength bits out of residualMidBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) { // the residual straddles two bytes of residualMidBits
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data: reuse leadingNum bytes of the previous exact value, read the remaining whole bytes from exactMidBytes, then append the residual bits
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+			
+			exactData = bytesToDouble(curBytes);
+			(*data)[i] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+			break;
+		default: // non-zero code: prediction plus (code - intvRadius) quantization steps of width interval
+			//predValue = (*data)[i-1];
+			if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+				predValue = lastSnapshotData[i]; // otherwise predValue keeps its initial 0
+			(*data)[i] = predValue + (type_-exe_params->intvRadius)*interval;
+			break;
+		}
+		//printf("%.30G\n",(*data)[i]);
+	}
+	// NOTE(review): hist_data is written unconditionally, so it must be non-NULL and hold dataSeriesLength doubles.
+	memcpy(hist_data, (*data), dataSeriesLength*sizeof(double));
+	
+	free(leadNum);
+	free(type);
+	return;
+}
diff --git a/deps/SZ/sz/src/szd_float.c b/deps/SZ/sz/src/szd_float.c
new file mode 100644
index 0000000000000000000000000000000000000000..2c4a1f12aef18eb6a34497eca34118f291c94759
--- /dev/null
+++ b/deps/SZ/sz/src/szd_float.c
@@ -0,0 +1,7720 @@
+/**
+ *  @file szd_float.c
+ *  @author Sheng Di, Dingwen Tao, Xin Liang, Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang
+ *  @date Aug, 2018
+ *  @brief 
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "szd_float.h"
+#include "TightDataPointStorageF.h"
+#include "sz.h"
+#include "Huffman.h"
+#include "szd_float_pwr.h"
+#include "szd_float_ts.h"
+#include "utility.h"
+
+
+//struct timeval startTime_;
+//struct timeval endTime_;  /* Start and end times */
+//struct timeval costStart_; /*only used for recording the cost*/
+//double totalCost_ = 0;
+
+/*void cost_start_()
+{
+	totalCost_ = 0;
+	gettimeofday(&costStart_, NULL);
+}
+
+void cost_end_()
+{
+	double elapsed;
+	struct timeval costEnd;
+	gettimeofday(&costEnd, NULL);
+	elapsed = ((costEnd.tv_sec*1000000+costEnd.tv_usec)-(costStart_.tv_sec*1000000+costStart_.tv_usec))/1000000.0;
+	totalCost_ += elapsed;
+}*/
+
+
+/**
+ * Decompress the SZ stream cmpBytes (cmpSize bytes) into *newData; r5..r1 are the dimension sizes.
+ * int compressionType: 1 (time-based compression) ; 0 (space-based compression)
+ * hist_data: only valid when compressionType==1, hist_data is the historical dataset such as the data in previous time step
+ * Value-range protection (confparams_dec->protectValueRange) is applied only when decompression succeeded.
+ * @return status SUCCESSFUL (SZ_SCES) or not (SZ_MERR for an unexpected szMode, SZ_DERR for an unsupported dimension count)
+ * */
+int SZ_decompress_args_float(float** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, 
+size_t cmpSize, int compressionType, float* hist_data)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	
+	//unsigned char* tmpBytes;
+	size_t targetUncompressSize = dataLength <<2; //i.e., *4
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 8+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;	
+	// A stream of exactly header size is used as-is; any other size is probed for an outer lossless-compression layer.
+	if(cmpSize!=8+4+MetaDataByteLength && cmpSize!=8+8+MetaDataByteLength) //4,8 means two posibilities of SZ_SIZE_TYPE
+	{
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->szMode!=SZ_TEMPORAL_COMPRESSION)
+		{
+			if(confparams_dec->losslessCompressor!=-1)
+				confparams_dec->szMode = SZ_BEST_COMPRESSION;
+			else
+				confparams_dec->szMode = SZ_BEST_SPEED;			
+		}
+		
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;	
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION || confparams_dec->szMode==SZ_TEMPORAL_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
+			//memcpy(szTmpBytes, tmpBytes, tmpSize);
+			//free(tmpBytes); //release useless memory		
+		}
+		else
+		{
+			printf("Wrong value of confparams_dec->szMode in the float compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes;	
+		
+	confparams_dec->sol_ID = szTmpBytes[4+14]; //szTmpBytes: version(3bytes), samebyte(1byte), [14]:sol_ID=SZ or SZ_Transpose
+		
+	//TODO: convert szTmpBytes to data array.
+	TightDataPointStorageF* tdps;
+	int errBoundMode = new_TightDataPointStorageF_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	
+	//writeByteData(tdps->typeArray, tdps->typeArray_size, "decompress-typebytes.tbt");
+	int dim = computeDimension(r5,r4,r3,r2,r1);	
+	int floatSize = sizeof(float);
+	if(tdps->isLossless) // payload holds the raw floats right after the header
+	{
+		*newData = (float*)malloc(floatSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*floatSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=floatSize)
+				(*newData)[i] = bytesToFloat(p);
+		}		
+	}
+	else if(confparams_dec->sol_ID==SZ_Transpose)
+	{
+		getSnapshotData_float_1D(newData,dataLength,tdps, errBoundMode, 0, hist_data);		
+	}
+	else //confparams_dec->sol_ID==SZ
+	{
+		if(tdps->raBytes_size > 0) //v2.0
+		{
+			if (dim == 1)
+				getSnapshotData_float_1D(newData,r1,tdps, errBoundMode, 0, hist_data);
+			else if(dim == 2)
+				decompressDataSeries_float_2D_nonblocked_with_blocked_regression(newData, r2, r1, tdps->raBytes, hist_data);
+			else if(dim == 3)
+				decompressDataSeries_float_3D_nonblocked_with_blocked_regression(newData, r3, r2, r1, tdps->raBytes, hist_data);
+			else if(dim == 4)
+				decompressDataSeries_float_3D_nonblocked_with_blocked_regression(newData, r4*r3, r2, r1, tdps->raBytes, hist_data);
+			else
+			{
+				printf("Error: currently support only at most 4 dimensions!\n");
+				status = SZ_DERR;
+			}	
+		}
+		else //1.4.13 or time-based compression
+		{
+			if (dim == 1)
+				getSnapshotData_float_1D(newData,r1,tdps, errBoundMode, compressionType, hist_data);
+			else if (dim == 2)
+				getSnapshotData_float_2D(newData,r2,r1,tdps, errBoundMode, compressionType, hist_data);
+			else if (dim == 3)
+				getSnapshotData_float_3D(newData,r3,r2,r1,tdps, errBoundMode, compressionType, hist_data);
+			else if (dim == 4)
+				getSnapshotData_float_4D(newData,r4,r3,r2,r1,tdps, errBoundMode, compressionType, hist_data);
+			else
+			{
+				printf("Error: currently support only at most 4 dimensions!\n");
+				status = SZ_DERR;
+			}			
+		}
+	}
+	// Clamp only after a successful decode: on SZ_DERR nothing above wrote *newData, so it must not be read.
+	//cost_start_();	
+	if(status == SZ_SCES && confparams_dec->protectValueRange)
+	{
+		float* nd = *newData;
+		float min = confparams_dec->fmin;
+		float max = confparams_dec->fmax;		
+		for(i=0;i<dataLength;i++)
+		{
+			float v = nd[i];
+			if(v <= max && v >= min)
+				continue;
+			if(v < min)
+				nd[i] = min;
+			else if(v > max)
+				nd[i] = max;
+		}
+	}
+	//cost_end_();
+	//printf("totalCost_=%f\n", totalCost_);
+	free_TightDataPointStorageF2(tdps);
+	if(confparams_dec->szMode!=SZ_BEST_SPEED && cmpSize!=8+MetaDataByteLength+exe_params->SZ_SIZE_TYPE)
+		free(szTmpBytes); // szTmpBytes aliases cmpBytes in the other cases and is not owned here
+	return status;
+}
+
+void decompressDataSeries_float_1D(float** data, size_t dataSeriesLength, float* hist_data, TightDataPointStorageF* tdps) 
+{
+	/* Rebuilds dataSeriesLength floats from tdps into a freshly malloc'ed *data; the global updateQuantizationInfo(tdps->intervals) call is disabled and the radius is derived locally. */
+	int intvRadius = tdps->intervals/2;
+	size_t i, j, k = 0, p = 0, l = 0; // k is the running bit offset of the residual bits
+								// in residualMidBits, p is the byte index
+								// into residualMidBits, l is the index of
+								// the next entry of leadNum
+	unsigned char* leadNum;
+	float interval = tdps->realPrecision*2;
+	// Expand the packed leadNumArray into leadNum (returned through the pointer, freed at the end).
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// Huffman-decode one quantization code per element into type[].
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+	// preBytes holds the byte image of the previously decoded exact value (all zero at first).
+	memset(preBytes, 0, 4);
+
+	size_t curByteIndex = 0;
+	int reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	float medianValue, exactData, predValue;
+	// reqLength (in bits) splits into whole stored bytes plus a sub-byte residual.
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+	medianValue = tdps->medianValue;
+	
+	int type_;
+	for (i = 0; i < dataSeriesLength; i++) {	
+		type_ = type[i];
+		switch (type_) {
+		case 0: // code 0: the value is rebuilt from its stored bytes instead of being predicted
+			// compute resiBits: pull the next resiBitsLength bits out of residualMidBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) { // the residual straddles two bytes of residualMidBits
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data: reuse leadingNum bytes of the previous exact value, read the remaining whole bytes from exactMidBytes, then append the residual bits
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+			
+			exactData = bytesToFloat(curBytes);
+			(*data)[i] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+			break;
+		default: // non-zero code: previous element plus (code - intvRadius) quantization steps of width interval
+			//predValue = 2 * (*data)[i-1] - (*data)[i-2];
+			predValue = (*data)[i-1]; // NOTE(review): i is size_t, so this assumes type[0] == 0; otherwise i-1 wraps at i == 0
+			(*data)[i] = predValue + (float)(type_-intvRadius)*interval;
+			break;
+		}
+		//printf("%.30G\n",(*data)[i]);
+	}
+	// With time-based compression compiled in and active, the decoded snapshot is also copied into hist_data.
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(hist_data, (*data), dataSeriesLength*sizeof(float));
+#endif	
+	
+	free(leadNum);
+	free(type);
+	return;
+}
+
+void decompressDataSeries_float_2D(float** data, size_t r1, size_t r2, float* hist_data, TightDataPointStorageF* tdps) 
+{
+	/* Rebuilds an r1 x r2 float grid from tdps into a freshly malloc'ed *data; the global updateQuantizationInfo(tdps->intervals) call is disabled and the radius is derived locally. */
+	int intvRadius = tdps->intervals/2;
+	//printf("tdps->intervals=%d, exe_params->intvRadius=%d\n", tdps->intervals, exe_params->intvRadius);
+	
+	size_t j, k = 0, p = 0, l = 0; // k is the running bit offset of the residual bits
+	// in residualMidBits, p is the byte index
+	// into residualMidBits, l is the index of
+	// the next entry of leadNum
+	size_t dataSeriesLength = r1*r2;
+	//	printf ("%d %d\n", r1, r2);
+
+	unsigned char* leadNum;
+	float realPrecision = tdps->realPrecision;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// Huffman-decode one quantization code per element into type[].
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+	// preBytes holds the byte image of the previously decoded exact value (all zero at first).
+	memset(preBytes, 0, 4);
+
+	size_t curByteIndex = 0;
+	int reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	float medianValue, exactData;
+	int type_;
+	// reqLength (in bits) splits into whole stored bytes plus a sub-byte residual.
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+	medianValue = tdps->medianValue;
+	// Elements are addressed row by row: element (ii, jj) lives at index ii*r2 + jj.
+	float pred1D, pred2D;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0: always rebuilt from stored bytes; type[0] is not consulted */
+
+	// compute resiBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data
+	memset(curBytes, 0, 4);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToFloat(curBytes);
+	(*data)[0] = exactData + medianValue;
+	memcpy(preBytes,curBytes,4);
+
+	/* Process Row-0, data 1: predicted from data 0. NOTE(review): runs unconditionally, so at least two elements are assumed */
+	type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		(*data)[1] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToFloat(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,4);
+	}
+
+	/* Process Row-0, data 2 --> data r2-1: predicted by extrapolating the two previous elements */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				
+			(*data)[jj] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0: predicted from the element directly above (index-r2) */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process row-ii data 1 --> r2-1: predicted as left + above - upper-left */
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+	}
+	// With time-based compression compiled in and active, the decoded snapshot is also copied into hist_data.
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(hist_data, (*data), dataSeriesLength*sizeof(float));
+#endif	
+
+	free(leadNum);
+	free(type);
+	return;
+}
+
+void decompressDataSeries_float_3D(float** data, size_t r1, size_t r2, size_t r3, float* hist_data, TightDataPointStorageF* tdps) 
+{
+	//updateQuantizationInfo(tdps->intervals);
+	int intvRadius = tdps->intervals/2;
+	size_t j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+	// in resiMidBits, p is to track the
+	// byte_index of resiMidBits, l is for
+	// leadNum
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	unsigned char* leadNum;
+	float realPrecision = tdps->realPrecision;
+
+	//TODO
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+
+	memset(preBytes, 0, 4);
+	size_t curByteIndex = 0;
+	int reqBytesLength, resiBitsLength, resiBits;
+	unsigned char leadingNum;
+	float medianValue, exactData;
+	int type_;
+
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+	medianValue = tdps->medianValue;
+	
+	float pred1D, pred2D, pred3D;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+	// compute resiBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data
+	memset(curBytes, 0, 4);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+	exactData = bytesToFloat(curBytes);
+	(*data)[0] = exactData + medianValue;
+	memcpy(preBytes,curBytes,4);
+
+	/* Process Row-0, data 1 */
+	pred1D = (*data)[0];
+
+	type_ = type[1];
+	if (type_ != 0)
+	{
+		(*data)[1] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToFloat(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,4);
+	}
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			(*data)[jj] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred3D + 2 * (type_ - intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 4);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToFloat(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,4);
+				}
+			}
+		}
+	}
+	
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(hist_data, (*data), dataSeriesLength*sizeof(float));
+#endif		
+
+	free(leadNum);
+	free(type);
+	return;
+}
+
+void decompressDataSeries_float_4D(float** data, size_t r1, size_t r2, size_t r3, size_t r4, float* hist_data, TightDataPointStorageF* tdps)
+{
+	// Rebuilds a 4D float array of r1*r2*r3*r4 elements (r4 is the fastest-varying index) from tdps into a freshly malloc'd *data.
+	int intvRadius = tdps->intervals; // NOTE(review): not halved here, whereas decompressDataSeries_float_1D_MSST19 below uses tdps->intervals/2 - confirm against the compressor
+	size_t j, k = 0, p = 0, l = 0; // k: running bit offset into residualMidBits (only k % 8 is ever used),
+	// p: byte index into residualMidBits,
+	// l: index of the next entry to consume from leadNum,
+	// j: byte loop index used while rebuilding one exactly-stored value.
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+	// hist_data is not referenced anywhere in this function; *data is not freed here (presumably owned by the caller - confirm).
+	unsigned char* leadNum;
+	double realPrecision = tdps->realPrecision;
+	// Expand tdps->leadNumArray into leadNum (one entry is consumed per exactly-stored value; freed at the end of this function).
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// Decode one type code per element from tdps->typeArray with the tree returned by createHuffmanTree(tdps->stateNum).
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+	// preBytes: bytes of the most recent exactly-stored value; curBytes: bytes of the value currently being rebuilt.
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+
+	memset(preBytes, 0, 4);
+	size_t curByteIndex = 0;
+	int reqBytesLength, resiBitsLength, resiBits;
+	unsigned char leadingNum;
+	float medianValue, exactData;
+	int type_;
+	// Each exactly-stored value is rebuilt from reqLength bits: reqBytesLength whole bytes plus resiBitsLength leftover bits.
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+	medianValue = tdps->medianValue;
+
+	float pred1D, pred2D, pred3D;
+	size_t ii, jj, kk, ll;
+	size_t index;
+
+	for (ll = 0; ll < r1; ll++)
+	{
+		// Each ll selects one block of r2*r3*r4 elements; the stream cursors (k, p, l, curByteIndex) and preBytes carry over between blocks.
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: always rebuilt from the exact-value streams, type[index] is not consulted */
+		index = ll*r234;
+
+		// compute resiBits: pull resiBitsLength bits from residualMidBits at bit offset k % 8; they may sit inside byte p, end exactly at its boundary, or straddle bytes p and p+1
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data: reuse the first leadingNum bytes of the previous exact value, read the remaining whole bytes from exactMidBytes, then store resiBits in the top bits of the following byte
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+		exactData = bytesToFloat(curBytes);
+		(*data)[index] = exactData + medianValue;
+		memcpy(preBytes,curBytes,4);
+
+		/* Process Row-0, data 1: predicted by the previous element. NOTE(review): there is no guard for r4 < 2 - confirm callers never pass that */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+		// type_ != 0: value = prediction + 2*(type_ - intvRadius)*realPrecision; type_ == 0: the value was stored exactly and is rebuilt below.
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process Row-0, data 2 --> data r4-1: linear extrapolation from the two previous elements of the row */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0: predicted by the element directly above it (index-r4) */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+
+			/* Process row-ii data 1 --> r4-1: 2D predictor = left + above - above-left */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 4);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToFloat(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,4);
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0: predicted by the same position in the previous layer (index-r34) */
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+
+			/* Process Row-0 data 1 --> data r4-1: 2D predictor = left + previous-layer - previous-layer-left */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 4);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToFloat(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,4);
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0: 2D predictor = above + previous-layer - previous-layer-above */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 4);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToFloat(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,4);
+				}
+
+				/* Process Row-i data 1 --> data r4-1: 3D predictor built from the 7 already-decoded neighbours */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						(*data)[index] = pred3D + 2 * (type_ - intvRadius) * realPrecision;
+					}
+					else
+					{
+						// compute resiBits
+						resiBits = 0;
+						if (resiBitsLength != 0) {
+							int kMod8 = k % 8;
+							int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+							if (rightMovSteps > 0) {
+								int code = getRightMovingCode(kMod8, resiBitsLength);
+								resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+							} else if (rightMovSteps < 0) {
+								int code1 = getLeftMovingCode(kMod8);
+								int code2 = getRightMovingCode(kMod8, resiBitsLength);
+								int leftMovSteps = -rightMovSteps;
+								rightMovSteps = 8 - leftMovSteps;
+								resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+								p++;
+								resiBits = resiBits
+										| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+							} else // rightMovSteps == 0
+							{
+								int code = getRightMovingCode(kMod8, resiBitsLength);
+								resiBits = (tdps->residualMidBits[p] & code);
+								p++;
+							}
+							k += resiBitsLength;
+						}
+
+						// recover the exact data
+						memset(curBytes, 0, 4);
+						leadingNum = leadNum[l++];
+						memcpy(curBytes, preBytes, leadingNum);
+						for (j = leadingNum; j < reqBytesLength; j++)
+							curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+						if (resiBitsLength != 0) {
+							unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+							curBytes[reqBytesLength] = resiByte;
+						}
+
+						exactData = bytesToFloat(curBytes);
+						(*data)[index] = exactData + medianValue;
+						memcpy(preBytes,curBytes,4);
+					}
+				}
+			}
+
+		}
+	}
+
+// Time-based compression was not implemented for 4D (original author's note), so the hist_data copy below stays disabled.
+//#ifdef HAVE_TIMECMPR	
+//	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+//		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(float));
+//#endif	
+
+	free(leadNum);
+	free(type);
+	return;
+}
+
+/*MSST19*/
+void decompressDataSeries_float_1D_MSST19(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps)
+{
+	/* Rebuilds a 1D float series into a newly malloc'ed *data: each point is a verbatim value or a scaled |running value|. */
+	const int radius = tdps->intervals/2;
+	const int capacity = tdps->intervals;
+	size_t pos, byteIdx;     /* output position; byte cursor while rebuilding one value */
+	size_t bitCursor = 0;    /* running bit offset into tdps->residualMidBits */
+	size_t resiIdx = 0;      /* byte index into tdps->residualMidBits */
+	size_t leadIdx = 0;      /* next entry of leadCounts to consume */
+	unsigned char* leadCounts;
+	/* Unpack the leading-number array first, then allocate the output and the per-point code buffer. */
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadCounts);
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree_MSST19(tree, tdps->typeArray, dataSeriesLength, codes, tdps->max_bits);
+	SZ_ReleaseHuffman(tree);
+	unsigned char prevRaw[4];
+	unsigned char curRaw[4];
+
+	memset(prevRaw, 0, 4);
+
+	size_t exactCursor = 0;
+	const int wholeBytes = tdps->reqLength/8;
+	const int tailBits = tdps->reqLength%8;
+	int tail;
+	unsigned char shared;
+	float verbatim, running = 0;
+	/* a verbatim value occupies wholeBytes full bytes plus tailBits extra bits (tdps->reqLength bits in total) */
+	double* factors = (double*)malloc(sizeof(double) * capacity);
+	const double exponentScale = 2.0-pow(2, -(tdps->plus_bits));
+	for(int c=0; c<capacity; c++){
+		/* factors[c] = (1 + realPrecision) ^ (exponentScale * (c - radius)) */
+		factors[c] = pow((1+tdps->realPrecision), exponentScale*(c - radius));
+	}
+
+	int sym;
+	for (pos = 0; pos < dataSeriesLength; pos++) {
+		/* code 0 marks a verbatim value; any other code is used as an index into factors */
+		sym = codes[pos];
+		if (sym != 0) {
+			/* predicted point: magnitude of the running value times the factor for this code */
+			running = fabs(running) * factors[sym];
+			(*data)[pos] = running;
+			continue;
+		}
+
+		/* verbatim point, step 1: read tailBits residual bits from tdps->residualMidBits */
+		tail = 0;
+		if (tailBits != 0) {
+			const int bitInByte = bitCursor % 8;
+			int shiftRight = getRightMovingSteps(bitInByte, tailBits);
+			if (shiftRight == 0) {
+				/* no shift needed: mask the bits and step to the next residual byte */
+				const int mask = getRightMovingCode(bitInByte, tailBits);
+				tail = (tdps->residualMidBits[resiIdx] & mask);
+				resiIdx++;
+			} else if (shiftRight > 0) {
+				/* bits are taken from the current byte only; resiIdx stays on it */
+				const int mask = getRightMovingCode(bitInByte, tailBits);
+				tail = (tdps->residualMidBits[resiIdx] & mask) >> shiftRight;
+			} else {
+				/* bits are split: high part from this byte, low part from the next one */
+				const int hiMask = getLeftMovingCode(bitInByte);
+				const int loMask = getRightMovingCode(bitInByte, tailBits);
+				const int shiftLeft = -shiftRight;
+				shiftRight = 8 - shiftLeft;
+				tail = (tdps->residualMidBits[resiIdx] & hiMask) << shiftLeft;
+				resiIdx++;
+				tail |= (tdps->residualMidBits[resiIdx] & loMask) >> shiftRight;
+			}
+			bitCursor += tailBits;
+		}
+
+		/* step 2: rebuild the 4 raw bytes, reusing the first `shared` bytes of the previous verbatim value */
+		memset(curRaw, 0, 4);
+		shared = leadCounts[leadIdx++];
+		memcpy(curRaw, prevRaw, shared);
+		/* bytes beyond the shared prefix come from tdps->exactMidBytes */
+		for (byteIdx = shared; byteIdx < wholeBytes; byteIdx++)
+			curRaw[byteIdx] = tdps->exactMidBytes[exactCursor++];
+		if (tailBits != 0) {
+			/* left-align the residual bits in the byte that follows the whole bytes */
+			curRaw[wholeBytes] = (unsigned char) (tail << (8 - tailBits));
+		}
+
+		/* step 3: publish the value and remember its bytes for the next verbatim point */
+		verbatim = bytesToFloat(curRaw);
+		(*data)[pos] = verbatim;
+		memcpy(prevRaw,curRaw,4);
+		running = verbatim;
+	}
+
+#ifdef HAVE_TIMECMPR
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(float));
+#endif
+	/* release every scratch buffer; *data stays with the caller */
+	free(factors);
+	free(leadCounts);
+	free(codes);
+}
+
+void decompressDataSeries_float_2D_MSST19(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps)
+{
+	/* Decode an r1 x r2 float grid (row-major, index = ii*r2 + jj) from tdps into a newly malloc'ed *data; the caller frees it. */
+	/* NOTE: type[1] and (*data)[1] are accessed unconditionally below, so r1*r2 must be at least 2. */
+	int intvRadius = tdps->intervals/2;
+	int intvCapacity = tdps->intervals;
+	size_t j, k = 0, p = 0, l = 0; // k tracks the bit position of the next residual
+	// in residualMidBits, p tracks the
+	// byte index in residualMidBits, l is the
+	// next index in leadNum
+	size_t dataSeriesLength = r1*r2;
+
+	unsigned char* leadNum;
+	// leadNum[l]: how many leading bytes an exact value copies from the previous exact value (preBytes)
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// decode one type code per point: 0 = an exact value follows, otherwise the code indexes precisionTable
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree_MSST19(huffmanTree, tdps->typeArray, dataSeriesLength, type, tdps->max_bits);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+
+	memset(preBytes, 0, 4);
+
+	size_t curByteIndex = 0;
+	int reqBytesLength, resiBitsLength, resiBits;
+	unsigned char leadingNum;
+	float exactData;
+	int type_;
+	// precisionTable[c] = (1 + realPrecision)^(inv * (c - intvRadius)): multiplicative factor for type code c
+	double* precisionTable = (double*)malloc(sizeof(double) * intvCapacity);
+	double inv = 2.0-pow(2, -(tdps->plus_bits));
+	for(int i=0; i<intvCapacity; i++){
+		double test = pow((1+tdps->realPrecision), inv*(i - intvRadius));
+		precisionTable[i] = test;
+	}
+
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+
+	float pred1D, pred2D;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0: always read as an exact value (its type code is not consulted) */
+
+	// compute resiBits: take the next resiBitsLength bits out of residualMidBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) { // bits come from the current byte only; p stays on it
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) { // bits are split across residualMidBits[p] and the following byte
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0: no shift needed, then advance to the next byte
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data: shared leading bytes + bytes from exactMidBytes + left-aligned residual bits
+	memset(curBytes, 0, 4);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToFloat(curBytes);
+	(*data)[0] = exactData;
+	memcpy(preBytes,curBytes,4);
+
+	/* Process Row-0, data 1: predicted from data 0 alone */
+	type_ = type[1];
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		(*data)[1] = fabs(pred1D) * precisionTable[type_];
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToFloat(curBytes);
+		(*data)[1] = exactData;
+		memcpy(preBytes,curBytes,4);
+	}
+
+	/* Process Row-0, data 2 --> data r2-1: prediction is d[jj-1]^2 / d[jj-2] */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[jj-1] * (*data)[jj-1] / (*data)[jj-2];
+			(*data)[jj] = fabs(pred1D) * precisionTable[type_];
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[jj] = exactData;
+			memcpy(preBytes,curBytes,4);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0: predicted from the value directly above (index-r2) */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];
+			(*data)[index] = fabs(pred1D) * precisionTable[type_];
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process row-ii data 1 --> r2-1: prediction is left * above / above-left */
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] * (*data)[index-r2] / (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = fabs(pred2D) * precisionTable[type_];
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+	}
+
+#ifdef HAVE_TIMECMPR
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(float));
+#endif
+	free(precisionTable); // was never released before: one table leaked per call
+	free(leadNum);
+	free(type);
+	return;
+}
+
+void decompressDataSeries_float_3D_MSST19(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps)
+{
+	/* Decode an r1 x r2 x r3 float volume (index = kk*r2*r3 + ii*r3 + jj) from tdps into a newly malloc'ed *data; the caller frees it. */
+	int intvRadius = tdps->intervals/2;
+	int intvCapacity = tdps->intervals;
+	size_t j, k = 0, p = 0, l = 0; // k tracks the bit position of the next residual
+	// in residualMidBits, p tracks the
+	// byte index in residualMidBits, l is the
+	// next index in leadNum
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	unsigned char* leadNum;
+	// NOTE: type[1] and (*data)[1] are accessed unconditionally below, so r1*r2*r3 must be at least 2.
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// precisionTable[c] = (1 + realPrecision)^(inv * (c - intvRadius)): multiplicative factor for type code c
+	double* precisionTable = (double*)malloc(sizeof(double) * intvCapacity);
+	double inv = 2.0-pow(2, -(tdps->plus_bits));
+	for(int i=0; i<intvCapacity; i++){
+		double test = pow((1+tdps->realPrecision), inv*(i - intvRadius));
+		precisionTable[i] = test;
+	}
+	// decode one type code per point: 0 = an exact value follows, otherwise the code indexes precisionTable
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree_MSST19(huffmanTree, tdps->typeArray, dataSeriesLength, type, tdps->max_bits);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+
+	memset(preBytes, 0, 4);
+	size_t curByteIndex = 0;
+	int reqBytesLength, resiBitsLength, resiBits;
+	unsigned char leadingNum;
+	float exactData;
+	int type_;
+
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+
+	float pred1D, pred2D, pred3D;
+	double temp;
+	double temp2;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0: always read as an exact value (its type code is not consulted) */
+	// compute resiBits: take the next resiBitsLength bits out of residualMidBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) { // bits come from the current byte only; p stays on it
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) { // bits are split across residualMidBits[p] and the following byte
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0: no shift needed, then advance to the next byte
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data: shared leading bytes + bytes from exactMidBytes + left-aligned residual bits
+	memset(curBytes, 0, 4);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+	exactData = bytesToFloat(curBytes);
+	(*data)[0] = exactData;
+	memcpy(preBytes,curBytes,4);
+
+	/* Process Row-0, data 1: predicted from data 0 alone */
+	pred1D = (*data)[0];
+
+	type_ = type[1];
+	if (type_ != 0)
+	{
+		(*data)[1] = fabsf(pred1D) * precisionTable[type_];
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToFloat(curBytes);
+		(*data)[1] = exactData;
+		memcpy(preBytes,curBytes,4);
+	}
+	/* Process Row-0, data 2 --> data r3-1: prediction is d[jj-1]^2 / d[jj-2], evaluated through the double temp */
+	for (jj = 2; jj < r3; jj++)
+	{
+		temp = (*data)[jj-1];
+		pred1D = temp * ( *data)[jj-1] / (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			(*data)[jj] = fabsf(pred1D) * precisionTable[type_];
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[jj] = exactData;
+			memcpy(preBytes,curBytes,4);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0: predicted from the value one row back (index-r3) */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = fabsf(pred1D) * precisionTable[type_];
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process row-ii data 1 --> r3-1: prediction is d[index-1] * d[index-r3] / d[index-r3-1] */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			temp = (*data)[index-1];
+			pred2D = temp * (*data)[index-r3] / (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				// predicted point: |pred2D| scaled by the factor stored for this type code
+				// (type_ indexes precisionTable directly; it is not range-checked here)
+				(*data)[index] = fabsf(pred2D) * precisionTable[type_];
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0: predicted from the same position in the previous layer (index-r23) */
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = fabsf(pred1D) * precisionTable[type_];
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process Row-0 data 1 --> data r3-1: prediction is d[index-1] * d[index-r23] / d[index-r23-1] */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			temp = (*data)[index-1];
+			pred2D = temp * (*data)[index-r23] / (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = fabsf(pred2D) * precisionTable[type_];
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0: prediction is d[index-r3] * d[index-r23] / d[index-r23-r3] */
+			index = kk*r23 + ii*r3;
+			temp = (*data)[index-r3];
+			pred2D = temp * (*data)[index-r23] / (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = fabsf(pred2D) * precisionTable[type_];
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData;
+				memcpy(preBytes,curBytes,4);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				// numerator: the neighbours one step back along a single axis (three of them) and along all three axes;
+				// denominator: the three neighbours one step back along exactly two axes
+				temp = (*data)[index-1];
+				temp2 = (*data)[index-r3-1];
+				pred3D = temp * (*data)[index-r3] * (*data)[index-r23] * (*data)[index-r23-r3-1] / (temp2 * (*data)[index-r23-r3] * (*data)[index-r23-1]);
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = fabsf(pred3D) * precisionTable[type_];
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 4);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToFloat(curBytes);
+					(*data)[index] = exactData;
+					memcpy(preBytes,curBytes,4);
+				}
+			}
+		}
+	}
+
+#ifdef HAVE_TIMECMPR
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(float));
+#endif
+	free(precisionTable); // was never released before: one table leaked per call
+	free(leadNum);
+	free(type);
+	return;
+}
+
+void getSnapshotData_float_1D(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps, int errBoundMode, int compressionType, float* hist_data)
+{
+	/* Picks the 1D float decompression routine that matches the flags in tdps and the requested error-bound mode. */
+	if (tdps->allSameData) {
+		/* constant series: replicate the single value stored in exactMidBytes */
+		const float fill = bytesToFloat(tdps->exactMidBytes);
+		float* out = (float*)malloc(sizeof(float)*dataSeriesLength);
+		size_t n = 0;
+		*data = out;
+		while (n < dataSeriesLength)
+			out[n++] = fill;
+		return;
+	}
+
+	if (tdps->rtypeArray != NULL) {
+		//the special version supporting one value to reserve
+		//TODO (nothing is decoded on this path; *data is left untouched)
+		return;
+	}
+
+	if (errBoundMode >= PW_REL) {
+		/* errBoundMode at or above PW_REL: use one of the two pre-log decompressors */
+		if (confparams_dec->accelerate_pw_rel_compression)
+			decompressDataSeries_float_1D_pwr_pre_log_MSST19(data, dataSeriesLength, tdps);
+		else
+			decompressDataSeries_float_1D_pwr_pre_log(data, dataSeriesLength, tdps);
+		//decompressDataSeries_float_1D_pwrgroup(data, dataSeriesLength, tdps);
+		return;
+	}
+
+#ifdef HAVE_TIMECMPR
+	/* temporal mode: compressionType 0 is a snapshot (plain 1D path below); any other value takes the _ts variant */
+	if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && compressionType != 0) {
+		decompressDataSeries_float_1D_ts(data, dataSeriesLength, hist_data, tdps);
+		return;
+	}
+#endif
+	decompressDataSeries_float_1D(data, dataSeriesLength, hist_data, tdps);
+}
+
+/*
+ * Rebuilds an r1 x r2 float field into *data from the decoded storage descriptor tdps.
+ * Dispatch order: all-same flag, rtypeArray presence, error-bound mode, then (HAVE_TIMECMPR builds only) temporal mode.
+ */
+void getSnapshotData_float_2D(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps, int errBoundMode, int compressionType, float* hist_data) 
+{
+	size_t totalCount = r1*r2;
+	/* Constant field: the single stored value is replicated into a freshly malloc'ed buffer. */
+	if (tdps->allSameData) {
+		size_t pos = 0;
+		float constant = bytesToFloat(tdps->exactMidBytes);
+		float* filled = (float*)malloc(sizeof(float)*totalCount);
+		*data = filled;
+		while (pos < totalCount)
+			filled[pos++] = constant;
+		return;
+	}
+
+	/* The rtypeArray variant is still a TODO: *data is left untouched. */
+	if (tdps->rtypeArray != NULL)
+		return;
+
+	if (errBoundMode >= PW_REL) {
+		/* PW_REL and above: pick one of the two pre-log decoders based on the decoder configuration. */
+		if (confparams_dec->accelerate_pw_rel_compression)
+			decompressDataSeries_float_2D_pwr_pre_log_MSST19(data, r1, r2, tdps);
+		else
+			decompressDataSeries_float_2D_pwr_pre_log(data, r1, r2, tdps);
+		return;
+	}
+
+#ifdef HAVE_TIMECMPR
+	/* Temporal mode with a non-zero compressionType: the field is decoded as a flat series by the _ts decoder. */
+	if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && compressionType != 0) {
+		decompressDataSeries_float_1D_ts(data, totalCount, hist_data, tdps);
+		return;
+	}
+#endif
+	decompressDataSeries_float_2D(data, r1, r2, hist_data, tdps);
+}
+
+/*
+ * Rebuilds an r1 x r2 x r3 float volume into *data from the decoded storage descriptor tdps.
+ * Dispatch order: all-same flag, rtypeArray presence, error-bound mode, then (HAVE_TIMECMPR builds only) temporal mode.
+ */
+void getSnapshotData_float_3D(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps, int errBoundMode, int compressionType, float* hist_data)
+{
+	size_t totalCount = r1*r2*r3;
+	/* Constant volume: the single stored value is replicated into a freshly malloc'ed buffer. */
+	if (tdps->allSameData) {
+		size_t pos = 0;
+		float constant = bytesToFloat(tdps->exactMidBytes);
+		float* filled = (float*)malloc(sizeof(float)*totalCount);
+		*data = filled;
+		while (pos < totalCount)
+			filled[pos++] = constant;
+		return;
+	}
+
+	/* The rtypeArray variant is still a TODO: *data is left untouched. */
+	if (tdps->rtypeArray != NULL)
+		return;
+
+	if (errBoundMode >= PW_REL) {
+		/* PW_REL and above: pick one of the two pre-log decoders based on the decoder configuration. */
+		if (confparams_dec->accelerate_pw_rel_compression)
+			decompressDataSeries_float_3D_pwr_pre_log_MSST19(data, r1, r2, r3, tdps);
+		else
+			decompressDataSeries_float_3D_pwr_pre_log(data, r1, r2, r3, tdps);
+		return;
+	}
+
+#ifdef HAVE_TIMECMPR
+	/* Temporal mode with a non-zero compressionType: the volume is decoded as a flat series by the _ts decoder. */
+	if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && compressionType != 0) {
+		decompressDataSeries_float_1D_ts(data, totalCount, hist_data, tdps);
+		return;
+	}
+#endif
+	decompressDataSeries_float_3D(data, r1, r2, r3, hist_data, tdps);
+}
+
+/*
+ * Rebuilds an r1 x r2 x r3 x r4 float array into *data from the decoded storage descriptor tdps.
+ * Dispatch order: all-same flag, rtypeArray presence, error-bound mode, then (HAVE_TIMECMPR builds only) temporal mode.
+ */
+void getSnapshotData_float_4D(float** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageF* tdps, int errBoundMode, int compressionType, float* hist_data)
+{
+	size_t totalCount = r1*r2*r3*r4;
+	/* Constant array: the single stored value is replicated into a freshly malloc'ed buffer. */
+	if (tdps->allSameData) {
+		size_t pos = 0;
+		float constant = bytesToFloat(tdps->exactMidBytes);
+		float* filled = (float*)malloc(sizeof(float)*totalCount);
+		*data = filled;
+		while (pos < totalCount)
+			filled[pos++] = constant;
+		return;
+	}
+
+	/* The rtypeArray variant is still a TODO: *data is left untouched. */
+	if (tdps->rtypeArray != NULL)
+		return;
+
+	if (errBoundMode >= PW_REL) {
+		/* PW_REL and above: the 3D pre-log decoders are reused with the first two dimensions merged (r1*r2). */
+		if (confparams_dec->accelerate_pw_rel_compression)
+			decompressDataSeries_float_3D_pwr_pre_log_MSST19(data, r1*r2, r3, r4, tdps);
+		else
+			decompressDataSeries_float_3D_pwr_pre_log(data, r1*r2, r3, r4, tdps);
+		return;
+	}
+#ifdef HAVE_TIMECMPR
+	/* Temporal mode with a non-zero compressionType: the array is decoded as a flat series by the _ts decoder. */
+	if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && compressionType != 0) {
+		decompressDataSeries_float_1D_ts(data, totalCount, hist_data, tdps);
+		return;
+	}
+#endif
+	decompressDataSeries_float_4D(data, r1, r2, r3, r4, hist_data, tdps);
+}
+
+size_t decompressDataSeries_float_3D_RA_block(float * data, float mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, int * type, float * unpredictable_data){
+	int intvRadius = exe_params->intvRadius;
+	size_t dim0_offset = dim_1 * dim_2;
+	size_t dim1_offset = dim_2;
+	// Decodes one block_dim_0 x block_dim_1 x block_dim_2 block in place inside an array whose rows are dim_2 floats
+	// apart and whose layers are dim_1*dim_2 floats apart. Returns the number of unpredictable_data entries used. dim_0 is unused.
+
+	size_t unpredictable_count = 0;
+	size_t r1, r2, r3;
+	r1 = block_dim_0;
+	r2 = block_dim_1;
+	r3 = block_dim_2;
+
+	float * cur_data_pos = data;
+	float * last_row_pos;
+	float pred1D, pred2D, pred3D;
+	size_t i, j, k;
+	size_t r23 = r2*r3;
+	int type_;
+	// Process Row-0 data 0: the block mean is the prediction for the first element
+	pred1D = mean;
+	type_ = type[0];
+	// type 0 = take the next unpredictable value; otherwise value = prediction + 2 * (type - intvRadius) * realPrecision
+	if (type_ != 0){
+		cur_data_pos[0] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+	}
+	else{
+		cur_data_pos[0] = unpredictable_data[unpredictable_count ++];
+	}
+
+	/* Process Row-0 data 1: predicted from data 0. NOTE(review): runs even when block_dim_2 < 2 -- confirm callers guarantee >= 2 */
+	pred1D = cur_data_pos[0];
+	type_ = type[1];
+	if (type_ != 0){
+		cur_data_pos[1] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+	}
+	else{
+		cur_data_pos[1] = unpredictable_data[unpredictable_count ++];
+	}
+    /* Process Row-0 data 2 --> data r3-1: linear extrapolation from the two previous elements */
+	for (j = 2; j < r3; j++){
+		pred1D = 2*cur_data_pos[j-1] - cur_data_pos[j-2];
+		type_ = type[j];
+		if (type_ != 0){
+			cur_data_pos[j] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else{
+			cur_data_pos[j] = unpredictable_data[unpredictable_count ++];
+		}
+	}
+
+	last_row_pos = cur_data_pos;
+	cur_data_pos += dim1_offset;
+	// last_row_pos trails cur_data_pos by one row so the row above is available to the predictors below
+	// NOTE(review): negative offsets such as [- dim0_offset] below rely on size_t wraparound in the index -- confirm this is intended
+
+	/* Process Row-1 --> Row-r2-1 (still layer 0) */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0: predicted from the element directly above */
+		index = i*r3;	
+		pred1D = last_row_pos[0];
+		type_ = type[index];
+		if (type_ != 0){
+			cur_data_pos[0] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else{
+			cur_data_pos[0] = unpredictable_data[unpredictable_count ++];
+		}
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = cur_data_pos[j-1] + last_row_pos[j] - last_row_pos[j-1];
+			type_ = type[index];
+			if (type_ != 0){
+				cur_data_pos[j] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else{
+				cur_data_pos[j] = unpredictable_data[unpredictable_count ++];
+			}
+			// pred2D above = left neighbor + element above - element above-left
+			// (type[] is indexed densely by block coordinates, data by array strides)
+		}
+		last_row_pos = cur_data_pos;
+		cur_data_pos += dim1_offset;
+	}
+	cur_data_pos += dim0_offset - r2 * dim1_offset;
+
+	// After layer 0 the cursor has moved r2 rows; the adjustment above places it at the start of the next layer
+	// (data + dim0_offset).
+	// From here on every predictor also uses the previous layer, reached through the - dim0_offset offsets.
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		// Each layer is decoded in three steps: element 0 of row 0 (1D, from the previous layer),
+		// the rest of row 0 (2D, within-row and previous-layer neighbors), then rows 1..r2-1
+		// (2D for element 0, full 3D for the rest).
+		// index tracks the dense position k*r23 + i*r3 + j in type[].
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		pred1D = cur_data_pos[- dim0_offset];
+		type_ = type[index];
+		if (type_ != 0){
+			cur_data_pos[0] = pred1D + 2 * (type_ - intvRadius) * realPrecision;
+		}
+		else{
+			cur_data_pos[0] = unpredictable_data[unpredictable_count ++];
+		}
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			// index == k*r2*r3 + j, advanced incrementally
+			index ++;
+			pred2D = cur_data_pos[j-1] + cur_data_pos[j - dim0_offset] - cur_data_pos[j - 1 - dim0_offset];
+			type_ = type[index];
+			if (type_ != 0){
+				cur_data_pos[j] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else{
+				cur_data_pos[j] = unpredictable_data[unpredictable_count ++];
+			}
+			// pred2D above = left neighbor + same position in previous layer - left neighbor in previous layer
+			// (no row above exists for row 0 of the block)
+		}
+		last_row_pos = cur_data_pos;
+		cur_data_pos += dim1_offset;
+
+		// Row 0 of layer k is complete.
+		// last_row_pos now refers to that row, so last_row_pos[x - dim0_offset] addresses the
+		// matching element of row 0 in layer k-1 when the loop below starts (and row i-1 of both
+		// layers on later iterations).
+
+	    /* Process Row-1 --> Row-r2-1 */
+		for (i = 1; i < r2; i++)
+		{
+			// Element 0 has no left neighbor, so its prediction is 2D:
+			// element above + same position in previous layer - element above in previous layer.
+			// index is recomputed from k and i here and then
+			// incremented once per element in the inner loop.
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			pred2D = last_row_pos[0] + cur_data_pos[- dim0_offset] - last_row_pos[- dim0_offset];
+			type_ = type[index];
+			if (type_ != 0){
+				cur_data_pos[0] = pred2D + 2 * (type_ - intvRadius) * realPrecision;
+			}
+			else{
+				cur_data_pos[0] = unpredictable_data[unpredictable_count ++];
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+				// pred3D: add the three nearest neighbors (left, above, previous layer),
+				// subtract the three neighbors that differ in two coordinates, add back the one that differs in all three
+				// index == k*r2*r3 + i*r3 + j, advanced incrementally
+				index ++;
+				pred3D = cur_data_pos[j-1] + last_row_pos[j]+ cur_data_pos[j - dim0_offset] - last_row_pos[j-1] - last_row_pos[j - dim0_offset] - cur_data_pos[j-1 - dim0_offset] + last_row_pos[j-1 - dim0_offset];
+				type_ = type[index];
+				if (type_ != 0){
+					cur_data_pos[j] = pred3D + 2 * (type_ - intvRadius) * realPrecision;
+				}
+				else{
+					cur_data_pos[j] = unpredictable_data[unpredictable_count ++];
+				}
+			}
+			last_row_pos = cur_data_pos;
+			cur_data_pos += dim1_offset;
+		}
+		cur_data_pos += dim0_offset - r2 * dim1_offset;
+	}
+
+	return unpredictable_count;
+}
+
+/*
+ * Decodes block_dim_0 consecutive floats into data. Each element's code in type[] is either 0 (copy the next
+ * entry of unpredictable_data) or a quantization bin applied to the previously decoded value (mean for the first).
+ * Returns the number of unpredictable_data entries consumed; dim_0 is not used.
+ */
+size_t decompressDataSeries_float_1D_RA_block(float * data, float mean, size_t dim_0, size_t block_dim_0, double realPrecision, int * type, float * unpredictable_data){
+	const int radius = exe_params->intvRadius;
+	size_t consumed = 0;
+	float previous = mean;
+
+	for (size_t n = 0; n < block_dim_0; n++) {
+		const int code = type[n];
+		if (code != 0) {
+			/* Predicted element: offset from the previous value by (code - radius) steps of 2*realPrecision. */
+			data[n] = previous + 2 * (code - radius) * realPrecision;
+		}
+		else {
+			/* Unpredictable element: taken verbatim from the side list. */
+			data[n] = unpredictable_data[consumed++];
+		}
+		previous = data[n];
+	}
+
+	return consumed;
+}
+
+size_t decompressDataSeries_float_2D_RA_block(float * data, float mean, size_t dim_0, size_t dim_1, size_t block_dim_0, size_t block_dim_1, double realPrecision, int * type, float * unpredictable_data){
+	/*
+	 * Decodes one block_dim_0 x block_dim_1 block in place inside a row-major array whose rows are dim_1 floats apart.
+	 * type[] carries one code per block element: 0 pulls the next entry of unpredictable_data, any other code is a
+	 * quantization bin applied as prediction + 2 * (code - intvRadius) * realPrecision.
+	 * Returns how many entries of unpredictable_data were consumed. dim_0 is not used.
+	 */
+	const int radius = exe_params->intvRadius;
+	const size_t rowStride = dim_1;
+	const size_t rows = block_dim_0;
+	const size_t cols = block_dim_1;
+	size_t consumed = 0;
+	size_t row, col;
+	float * cur = data;
+	float * prev;
+	float guess;
+	int code;
+
+	/* Row 0, element 0: the block mean is the prediction. */
+	code = type[0];
+	if (code == 0)
+	{
+		cur[0] = unpredictable_data[consumed++];
+	}
+	else
+	{
+		guess = mean;
+		cur[0] = guess + 2 * (code - radius) * realPrecision;
+	}
+
+	/* Row 0, element 1: predicted from element 0 (processed unconditionally, even when cols < 2). */
+	code = type[1];
+	if (code == 0)
+	{
+		cur[1] = unpredictable_data[consumed++];
+	}
+	else
+	{
+		guess = cur[0];
+		cur[1] = guess + 2 * (code - radius) * realPrecision;
+	}
+
+	/* Row 0, elements 2..cols-1: extrapolate linearly from the two previous elements. */
+	for (col = 2; col < cols; col++)
+	{
+		code = type[col];
+		if (code == 0)
+		{
+			cur[col] = unpredictable_data[consumed++];
+		}
+		else
+		{
+			guess = 2*cur[col-1] - cur[col-2];
+			cur[col] = guess + 2 * (code - radius) * realPrecision;
+		}
+	}
+
+	/* Rows 1..rows-1: prev trails cur by one row of the enclosing array. */
+	prev = cur;
+	cur += rowStride;
+	for (row = 1; row < rows; row++)
+	{
+		const int * rowCodes = type + row*cols;
+		/* Element 0 of the row: predicted from the element directly above. */
+		code = rowCodes[0];
+		if (code == 0)
+			cur[0] = unpredictable_data[consumed++];
+		else
+			cur[0] = prev[0] + 2 * (code - radius) * realPrecision;
+
+		/* Remaining elements: left + above - above-left. */
+		for (col = 1; col < cols; col++)
+		{
+			code = rowCodes[col];
+			if (code == 0)
+				cur[col] = unpredictable_data[consumed++];
+			else
+			{
+				guess = cur[col-1] + prev[col] - prev[col-1];
+				cur[col] = guess + 2 * (code - radius) * realPrecision;
+			}
+		}
+		prev = cur;
+		cur += rowStride;
+	}
+	return consumed;
+}
+
+void decompressDataSeries_float_2D_nonblocked_with_blocked_regression(float** data, size_t r1, size_t r2, unsigned char* comp_data, float* hist_data){
+	// Decodes an r1 x r2 float array from comp_data into a newly malloc'ed *data, one block at a time.
+	size_t dim0_offset = r2;
+	size_t num_elements = r1 * r2;
+
+	*data = (float*)malloc(sizeof(float)*num_elements);
+
+	unsigned char * comp_data_pos = comp_data;
+	// comp_data_pos is the read cursor; every field below is consumed in stream order
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	// calculate block dims (the macros fill in num_*, split_index_* and early_/late_blockcount_*)
+	size_t num_x, num_y;
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+
+	size_t split_index_x, split_index_y;
+	size_t early_blockcount_x, early_blockcount_y;
+	size_t late_blockcount_x, late_blockcount_y;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+
+	size_t num_blocks = num_x * num_y;
+
+	float realPrecision = bytesToFloat(comp_data_pos);
+	comp_data_pos += sizeof(float);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	// Huffman tree for the per-element type codes: byte size, node count, then the serialized nodes
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+sizeof(int), nodeCount);
+	comp_data_pos += sizeof(int) + tree_size;
+	// use_mean (1 byte) followed by mean (float), both copied out with memcpy
+	float mean;
+	unsigned char use_mean;
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(float));
+	comp_data_pos += sizeof(float);
+	size_t reg_count = 0;
+	// One indicator entry per block: non-zero selects the neighbor-based (SZ) path, zero the regression path
+	unsigned char * indicator;
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1;
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator);
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+	// reg_count = number of blocks decoded by regression
+	// Regression coefficients (3 per block) are quantized per coefficient, each with its own tree and unpredictable list
+	int coeff_intvRadius[3];
+	int * coeff_result_type = (int *) malloc(num_blocks*3*sizeof(int));
+	int * coeff_type[3];
+	float precision[3];
+	float * coeff_unpred_data[3];
+	if(reg_count > 0){
+		for(int i=0; i<3; i++){
+			precision[i] = bytesToFloat(comp_data_pos);
+			comp_data_pos += sizeof(float);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos); // shadows the outer tree_size/stateNum/huffmanTree/nodeCount/root on purpose: per-coefficient tree
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+sizeof(int), nodeCount);
+			comp_data_pos += sizeof(int) + tree_size;
+
+			coeff_type[i] = coeff_result_type + i * num_blocks; // each coefficient gets its own num_blocks-sized slice
+			size_t typeArray_size = bytesToSize(comp_data_pos);
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]);
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (float *) comp_data_pos; // NOTE(review): points into comp_data at an arbitrary offset; assumes unaligned float reads are acceptable -- confirm
+			comp_data_pos += coeff_unpred_count * sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	float last_coefficients[3] = {0.0};
+	int coeff_unpred_data_count[3] = {0};
+	int coeff_index = 0;
+	// Unpredictable element values: count (size_t) followed by the raw floats, referenced in place
+
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	float * unpred_data = (float *) comp_data_pos;
+	comp_data_pos += total_unpred * sizeof(float);
+	// The Huffman-coded type array (one code per element) starts here
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	decode(comp_data_pos, num_elements, root, result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+	
+	int intvRadius = intervals/2;
+	
+	int * type;
+
+	float * data_pos = *data;
+	size_t offset_x, offset_y;
+	size_t current_blockcount_x, current_blockcount_y;
+	size_t cur_unpred_count;
+	// type, unpred_data and indicator_pos all advance block by block in the same i-major, j-minor order
+	unsigned char * indicator_pos = indicator;
+	if(use_mean){
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				data_pos = *data + offset_x * dim0_offset + offset_y;
+
+				current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+
+				size_t current_block_elements = current_blockcount_x * current_blockcount_y;
+				if(*indicator_pos){
+					// decompress by SZ: predict each element from already-decoded left/upper neighbors (may lie in adjacent blocks)
+
+					float * block_data_pos = data_pos;
+					float pred;
+					size_t index = 0;
+					int type_;
+					// d11 is current data; d10 = left, d01 = above, d00 = above-left
+					size_t unpredictable_count = 0;
+					float d00, d01, d10;
+					for(size_t ii=0; ii<current_blockcount_x; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							type_ = type[index];
+							if(type_ == intvRadius){ // in this branch the code intvRadius means "value equals mean"
+								*block_data_pos = mean;
+							}
+							else if(type_ == 0){
+								*block_data_pos = unpred_data[unpredictable_count ++];
+							}
+							else{
+								d00 = d01 = d10 = 1; // 1 = neighbor exists; cleared below on the first row/column of the whole array
+								if(i == 0 && ii == 0){
+									d00 = d01 = 0;
+								}
+								if(j == 0 && jj == 0){
+									d00 = d10 = 0;
+								}
+								if(d00){
+									d00 = block_data_pos[- dim0_offset - 1]; // NOTE(review): negative index via size_t wraparound -- confirm intended
+								}
+								if(d01){
+									d01 = block_data_pos[- dim0_offset];
+								}
+								if(d10){
+									d10 = block_data_pos[- 1];
+								}
+								if(type_ < intvRadius) type_ += 1; // codes below intvRadius are shifted up by one (presumably because intvRadius is reserved for the mean)
+								pred = d10 + d01 - d00;
+								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+							}
+							index ++;
+							block_data_pos ++;
+						}
+						block_data_pos += dim0_offset - current_blockcount_y;
+					}
+					cur_unpred_count = unpredictable_count;
+				}
+				else{
+					// decompress by regression
+					{
+						//restore regression coefficients: each is delta-coded against the same coefficient of the previous regression block
+						float pred;
+						int type_;
+						for(int e=0; e<3; e++){
+							type_ = coeff_type[e][coeff_index];
+							if (type_ != 0){
+								pred = last_coefficients[e];
+								last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+							}
+							else{
+								last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+								coeff_unpred_data_count[e] ++;
+							}
+						}
+						coeff_index ++;
+					}
+					{
+						float * block_data_pos = data_pos;
+						float pred;
+						int type_;
+						size_t index = 0;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								type_ = type[index];
+								if (type_ != 0){
+									pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2]; // linear model in the in-block coordinates (ii, jj)
+									*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+								}
+								else{
+									*block_data_pos = unpred_data[unpredictable_count ++];
+								}
+
+								index ++;	
+								block_data_pos ++;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+				}
+				// advance the per-block cursors past what this block consumed
+				type += current_block_elements;
+				indicator_pos ++;
+				unpred_data += cur_unpred_count;
+			}
+		}
+	}
+	else{
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				data_pos = *data + offset_x * dim0_offset + offset_y;
+
+				current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+
+				size_t current_block_elements = current_blockcount_x * current_blockcount_y;
+				if(*indicator_pos){
+					// decompress by SZ: same neighbor prediction as the use_mean branch, but without the mean code and code shift
+					
+					float * block_data_pos = data_pos;
+					float pred;
+					size_t index = 0;
+					int type_;
+					// d11 is current data; d10 = left, d01 = above, d00 = above-left
+					size_t unpredictable_count = 0;
+					float d00, d01, d10;
+					for(size_t ii=0; ii<current_blockcount_x; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							type_ = type[index];
+							if(type_ == 0){
+								*block_data_pos = unpred_data[unpredictable_count ++];
+							}
+							else{
+								d00 = d01 = d10 = 1; // 1 = neighbor exists; cleared below on the first row/column of the whole array
+								if(i == 0 && ii == 0){
+									d00 = d01 = 0;
+								}
+								if(j == 0 && jj == 0){
+									d00 = d10 = 0;
+								}
+								if(d00){
+									d00 = block_data_pos[- dim0_offset - 1]; // NOTE(review): negative index via size_t wraparound -- confirm intended
+								}
+								if(d01){
+									d01 = block_data_pos[- dim0_offset];
+								}
+								if(d10){
+									d10 = block_data_pos[- 1];
+								}
+								pred = d10 + d01 - d00;
+								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+							}
+							index ++;
+							block_data_pos ++;
+						}
+						block_data_pos += dim0_offset - current_blockcount_y;
+					}
+					cur_unpred_count = unpredictable_count;
+				}
+				else{
+					// decompress by regression
+					{
+						//restore regression coefficients: each is delta-coded against the same coefficient of the previous regression block
+						float pred;
+						int type_;
+						for(int e=0; e<3; e++){
+							type_ = coeff_type[e][coeff_index];
+							if (type_ != 0){
+								pred = last_coefficients[e];
+								last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+							}
+							else{
+								last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+								coeff_unpred_data_count[e] ++;
+							}
+						}
+						coeff_index ++;
+					}
+					{
+						float * block_data_pos = data_pos;
+						float pred;
+						int type_;
+						size_t index = 0;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								type_ = type[index];
+								if (type_ != 0){
+									pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2]; // linear model in the in-block coordinates (ii, jj)
+									*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+								}
+								else{
+									*block_data_pos = unpred_data[unpredictable_count ++];
+								}
+								index ++;	
+								block_data_pos ++;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+				}
+				// advance the per-block cursors past what this block consumed
+				type += current_block_elements;
+				indicator_pos ++;
+				unpred_data += cur_unpred_count;
+			}
+		}
+	}
+	// In temporal-compression builds the decoded array is also copied into hist_data
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(hist_data, (*data), num_elements*sizeof(float));
+#endif	
+	
+	free(coeff_result_type);
+
+	free(indicator);
+	free(result_type);
+}
+
+
+void decompressDataSeries_float_3D_nonblocked_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data, float* hist_data){
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;
+	size_t num_elements = r1 * r2 * r3;
+
+	*data = (float*)malloc(sizeof(float)*num_elements);
+
+	unsigned char * comp_data_pos = comp_data;
+
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	// calculate block dims
+	size_t num_x, num_y, num_z;
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r3, num_z, block_size);
+
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t num_blocks = num_x * num_y * num_z;
+
+	float realPrecision = bytesToFloat(comp_data_pos);
+	comp_data_pos += sizeof(float);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	//updateQuantizationInfo(intervals);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+sizeof(int), nodeCount);
+	comp_data_pos += sizeof(int) + tree_size;
+
+	float mean;
+	unsigned char use_mean;
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(float));
+	comp_data_pos += sizeof(float);
+	size_t reg_count = 0;
+
+	unsigned char * indicator;
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1;
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator);
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+
+	int coeff_intvRadius[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+	int * coeff_type[4];
+	float precision[4];
+	float * coeff_unpred_data[4];
+	if(reg_count > 0){
+		for(int i=0; i<4; i++){
+			precision[i] = bytesToFloat(comp_data_pos);
+			comp_data_pos += sizeof(float);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+sizeof(int), nodeCount);
+			comp_data_pos += sizeof(int) + tree_size;
+
+			coeff_type[i] = coeff_result_type + i * num_blocks;
+			size_t typeArray_size = bytesToSize(comp_data_pos);
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]);
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (float *) comp_data_pos;
+			comp_data_pos += coeff_unpred_count * sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	float last_coefficients[4] = {0.0};
+	int coeff_unpred_data_count[4] = {0};
+	int coeff_index = 0;
+	//updateQuantizationInfo(intervals);
+
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	float * unpred_data = (float *) comp_data_pos;
+	comp_data_pos += total_unpred * sizeof(float);
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	decode(comp_data_pos, num_elements, root, result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+	
+	int intvRadius = intervals/2;
+	
+	int * type;
+	float * data_pos = *data;
+	size_t offset_x, offset_y, offset_z;
+	size_t current_blockcount_x, current_blockcount_y, current_blockcount_z;
+	size_t cur_unpred_count;
+	unsigned char * indicator_pos = indicator;
+	if(use_mean){
+		// type = result_type;
+
+		// for(size_t i=0; i<num_x; i++){
+		// 	for(size_t j=0; j<num_y; j++){
+		// 		for(size_t k=0; k<num_z; k++){
+		// 			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		// 			offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		// 			offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		// 			data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+		// 			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		// 			current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		// 			current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+		// 			// type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		// 			// type = result_type + type_offset;
+		// 			size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+		// 			// index = i * num_y * num_z + j * num_z + k;
+
+		// 			// printf("i j k: %ld %ld %ld\toffset: %ld %ld %ld\tindicator: %ld\n", i, j, k, offset_x, offset_y, offset_z, indicator[index]);
+		// 			if(*indicator_pos){
+		// 				// decompress by SZ
+		// 				// cur_unpred_count = decompressDataSeries_float_3D_blocked_nonblock_pred(data_pos, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, i, j, k, realPrecision, type, unpred_data);
+		// 				float * block_data_pos = data_pos;
+		// 				float pred;
+		// 				size_t index = 0;
+		// 				int type_;
+		// 				// d111 is current data
+		// 				size_t unpredictable_count = 0;
+		// 				float d000, d001, d010, d011, d100, d101, d110;
+		// 				for(size_t ii=0; ii<current_blockcount_x; ii++){
+		// 					for(size_t jj=0; jj<current_blockcount_y; jj++){
+		// 						for(size_t kk=0; kk<current_blockcount_z; kk++){
+		// 							type_ = type[index];
+		// 							if(type_ == intvRadius){
+		// 								*block_data_pos = mean;
+		// 							}
+		// 							else if(type_ == 0){
+		// 								*block_data_pos = unpred_data[unpredictable_count ++];
+		// 							}
+		// 							else{
+		// 								d000 = d001 = d010 = d011 = d100 = d101 = d110 = 1;
+		// 								if(i == 0 && ii == 0){
+		// 									d000 = d001 = d010 = d011 = 0;
+		// 								}
+		// 								if(j == 0 && jj == 0){
+		// 									d000 = d001 = d100 = d101 = 0;
+		// 								}
+		// 								if(k == 0 && kk == 0){
+		// 									d000 = d010 = d100 = d110 = 0;
+		// 								}
+		// 								if(d000){
+		// 									d000 = block_data_pos[- dim0_offset - dim1_offset - 1];
+		// 								}
+		// 								if(d001){
+		// 									d001 = block_data_pos[- dim0_offset - dim1_offset];
+		// 								}
+		// 								if(d010){
+		// 									d010 = block_data_pos[- dim0_offset - 1];
+		// 								}
+		// 								if(d011){
+		// 									d011 = block_data_pos[- dim0_offset];
+		// 								}
+		// 								if(d100){
+		// 									d100 = block_data_pos[- dim1_offset - 1];
+		// 								}
+		// 								if(d101){
+		// 									d101 = block_data_pos[- dim1_offset];
+		// 								}
+		// 								if(d110){
+		// 									d110 = block_data_pos[- 1];
+		// 								}
+		// 								if(type_ < intvRadius) type_ += 1;
+		// 								pred = d110 + d101 + d011 - d100 - d010 - d001 + d000;
+		// 								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+		// 							}
+		// 							index ++;
+		// 							block_data_pos ++;
+		// 						}
+		// 						block_data_pos += dim1_offset - current_blockcount_z;
+		// 					}
+		// 					block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+		// 				}
+		// 				cur_unpred_count = unpredictable_count;
+		// 			}
+		// 			else{
+		// 				// decompress by regression
+		// 				{
+		// 					//restore regression coefficients
+		// 					float pred;
+		// 					int type_;
+		// 					for(int e=0; e<4; e++){
+		// 						// if(i == 0 && j == 0 && k == 19){
+		// 						// 	printf("~\n");
+		// 						// }
+		// 						type_ = coeff_type[e][coeff_index];
+		// 						if (type_ != 0){
+		// 							pred = last_coefficients[e];
+		// 							last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+		// 						}
+		// 						else{
+		// 							last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+		// 							coeff_unpred_data_count[e] ++;
+		// 						}
+		// 						if(fabs(last_coefficients[e]) > 10000){
+		// 							printf("%d %d %d-%d: pred %.4f type %d precision %.4g last_coefficients %.4g\n", i, j, k, e, pred, type_, precision[e], last_coefficients[e]);
+		// 							exit(0);
+		// 						}
+		// 					}
+		// 					coeff_index ++;
+		// 				}
+		// 				{
+		// 					float * block_data_pos = data_pos;
+		// 					float pred;
+		// 					int type_;
+		// 					size_t index = 0;
+		// 					size_t unpredictable_count = 0;
+		// 					for(size_t ii=0; ii<current_blockcount_x; ii++){
+		// 						for(size_t jj=0; jj<current_blockcount_y; jj++){
+		// 							for(size_t kk=0; kk<current_blockcount_z; kk++){
+		// 								if(block_data_pos - (*data) == 19470788){
+		// 									printf("dec stop\n");
+		// 								}
+
+		// 								type_ = type[index];
+		// 								if (type_ != 0){
+		// 									pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+		// 									*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+		// 								}
+		// 								else{
+		// 									*block_data_pos = unpred_data[unpredictable_count ++];
+		// 								}
+		// 								index ++;	
+		// 								block_data_pos ++;
+		// 							}
+		// 							block_data_pos += dim1_offset - current_blockcount_z;
+		// 						}
+		// 						block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+		// 					}
+		// 					cur_unpred_count = unpredictable_count;
+		// 				}
+		// 			}
+
+		// 			type += current_block_elements;
+		// 			indicator_pos ++;
+		// 			unpred_data += cur_unpred_count;
+		// 			// decomp_unpred += cur_unpred_count;
+		// 			// printf("block comp done, data_offset from %ld to %ld: diff %ld\n", *data, data_pos, data_pos - *data);
+		// 			// fflush(stdout);
+		// 		}
+		// 	}
+		// }
+
+		type = result_type;
+		// i == 0
+		{
+			// j == 0
+			{
+				// k == 0
+				{
+					data_pos = *data;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0: first plane of the very first block; predictions here only read elements at offsets -1 and -dim1_offset
+						{
+							// jj == 0: first row of that plane; only the element at offset -1 is read
+							{
+								{
+									// kk == 0: first element of the block, its prediction is fixed to 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean; // code intvRadius: write `mean` directly
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++]; // code 0: take the next value from unpred_data
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1; // codes below intvRadius are shifted up by one; presumably because intvRadius itself marks the mean case - confirm against the compressor
+										pred = 0;
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus dequantized correction
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1]; // predict from the element decoded just before this one
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z; // net advance of dim1_offset from the start of this row
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0: first element of the row, predicted from the element at -dim1_offset only
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1]; // three-neighbour prediction within the plane
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;						
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){ // remaining planes of the first block: elements at -dim0_offset are now read as well
+							// jj == 0: first row of the plane; elements at -dim1_offset are not read
+							{
+								{
+									// kk == 0: predicted from the element at -dim0_offset only
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean; // code intvRadius: write `mean` directly
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++]; // code 0: take the next value from unpred_data
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1; // codes below intvRadius are shifted up by one before dequantizing
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus dequantized correction
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1]; // three-neighbour prediction using offsets -1 and -dim0_offset
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z; // net advance of dim1_offset from the start of this row
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0: first element of the row; the element at -1 is not read
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset]; // three-neighbour prediction using offsets -dim1_offset and -dim0_offset
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1]; // full seven-neighbour prediction (the form of a 3D Lorenzo predictor)
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset; // net advance of dim0_offset from the start of this plane
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							// Restore the 4 regression coefficients for this block; each one is updated in place in last_coefficients[].
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index]; // code for coefficient e at the current coeff_index
+								if (type_ != 0){
+									pred = last_coefficients[e]; // predict from the previously restored value of the same coefficient
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e]; // add the dequantized step
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]]; // code 0: take the next value from coeff_unpred_data[e]
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++; // advanced once per block that takes this regression branch
+						}
+						{ // rebuild every element of the block from the 4 coefficients in last_coefficients[]
+							float * block_data_pos = data_pos; // write cursor, starts at the block's first element
+							float pred;
+							int type_;
+							size_t index = 0; // position inside this block's portion of type[]
+							size_t unpredictable_count = 0; // values consumed from unpred_data by this block
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3]; // linear function of the block-local indices (ii, jj, kk)
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus dequantized correction
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++]; // code 0: take the next value from unpred_data
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z; // net advance of dim1_offset per jj iteration
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset; // net advance of dim0_offset per ii iteration
+							}
+							cur_unpred_count = unpredictable_count; // read after this block to advance unpred_data
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				// i == 0 j == 0 k != 0
+				for(size_t k=1; k<num_z; k++){
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							// Restore the 4 regression coefficients for this block; each one is updated in place in last_coefficients[].
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index]; // code for coefficient e at the current coeff_index
+								if (type_ != 0){
+									pred = last_coefficients[e]; // predict from the previously restored value of the same coefficient
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e]; // add the dequantized step
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]]; // code 0: take the next value from coeff_unpred_data[e]
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++; // advanced once per block that takes this regression branch
+						}
+						{ // rebuild every element of the block from the 4 coefficients in last_coefficients[]
+							float * block_data_pos = data_pos; // write cursor, starts at the block's first element
+							float pred;
+							int type_;
+							size_t index = 0; // position inside this block's portion of type[]
+							size_t unpredictable_count = 0; // values consumed from unpred_data by this block
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3]; // linear function of the block-local indices (ii, jj, kk)
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus dequantized correction
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++]; // code 0: take the next value from unpred_data
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z; // net advance of dim1_offset per jj iteration
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset; // net advance of dim0_offset per ii iteration
+							}
+							cur_unpred_count = unpredictable_count; // read after this block to advance unpred_data
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j==0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_y * dim1_offset;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							// Restore the 4 regression coefficients for this block; each one is updated in place in last_coefficients[].
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index]; // code for coefficient e at the current coeff_index
+								if (type_ != 0){
+									pred = last_coefficients[e]; // predict from the previously restored value of the same coefficient
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e]; // add the dequantized step
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]]; // code 0: take the next value from coeff_unpred_data[e]
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++; // advanced once per block that takes this regression branch
+						}
+						{ // rebuild every element of the block from the 4 coefficients in last_coefficients[]
+							float * block_data_pos = data_pos; // write cursor, starts at the block's first element
+							float pred;
+							int type_;
+							size_t index = 0; // position inside this block's portion of type[]
+							size_t unpredictable_count = 0; // values consumed from unpred_data by this block
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3]; // linear function of the block-local indices (ii, jj, kk)
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus dequantized correction
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++]; // code 0: take the next value from unpred_data
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z; // net advance of dim1_offset per jj iteration
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset; // net advance of dim0_offset per ii iteration
+							}
+							cur_unpred_count = unpredictable_count; // read after this block to advance unpred_data
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							// Restore the 4 regression coefficients for this block; each one is updated in place in last_coefficients[].
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index]; // code for coefficient e at the current coeff_index
+								if (type_ != 0){
+									pred = last_coefficients[e]; // predict from the previously restored value of the same coefficient
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e]; // add the dequantized step
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]]; // code 0: take the next value from coeff_unpred_data[e]
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++; // advanced once per block that takes this regression branch
+						}
+						{ // rebuild every element of the block from the 4 coefficients in last_coefficients[]
+							float * block_data_pos = data_pos; // write cursor, starts at the block's first element
+							float pred;
+							int type_;
+							size_t index = 0; // position inside this block's portion of type[]
+							size_t unpredictable_count = 0; // values consumed from unpred_data by this block
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3]; // linear function of the block-local indices (ii, jj, kk)
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus dequantized correction
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++]; // code 0: take the next value from unpred_data
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z; // net advance of dim1_offset per jj iteration
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset; // net advance of dim0_offset per ii iteration
+							}
+							cur_unpred_count = unpredictable_count; // read after this block to advance unpred_data
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		} // end i==0
+		for(size_t i=1; i<num_x; i++){
+			// j == 0
+			{
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					data_pos = *data + offset_x * dim0_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							// Restore the 4 regression coefficients for this block; each one is updated in place in last_coefficients[].
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index]; // code for coefficient e at the current coeff_index
+								if (type_ != 0){
+									pred = last_coefficients[e]; // predict from the previously restored value of the same coefficient
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e]; // add the dequantized step
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]]; // code 0: take the next value from coeff_unpred_data[e]
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++; // advanced once per block that takes this regression branch
+						}
+						{ // rebuild every element of the block from the 4 coefficients in last_coefficients[]
+							float * block_data_pos = data_pos; // write cursor, starts at the block's first element
+							float pred;
+							int type_;
+							size_t index = 0; // position inside this block's portion of type[]
+							size_t unpredictable_count = 0; // values consumed from unpred_data by this block
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3]; // linear function of the block-local indices (ii, jj, kk)
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision; // prediction plus dequantized correction
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++]; // code 0: take the next value from unpred_data
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z; // net advance of dim1_offset per jj iteration
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset; // net advance of dim0_offset per ii iteration
+							}
+							cur_unpred_count = unpredictable_count; // read after this block to advance unpred_data
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j = 0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		}
+	}
+	else{
+		type = result_type;
+		// i == 0
+		{
+			// j == 0
+			{
+				// k == 0
+				{
+					data_pos = *data;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = 0;
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;						
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				// i == 0 j == 0 k != 0
+				for(size_t k=1; k<num_z; k++){
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j==0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_y * dim1_offset;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		} // end i==0
+		for(size_t i=1; i<num_x; i++){
+			// j == 0
+			{
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					data_pos = *data + offset_x * dim0_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j = 0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		}
+	}
+	
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(hist_data, (*data), num_elements*sizeof(float));
+#endif	
+
+	free(coeff_result_type);
+
+	free(indicator);
+	free(result_type);
+}
+
+void decompressDataSeries_float_3D_random_access_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data){
+	/*
+	 * Reconstruct a 3-D float array of r1 x r2 x r3 elements (r3 is the
+	 * fastest-varying dimension) from the compressed stream at comp_data.
+	 *
+	 * data      - out: *data is set to a buffer malloc'd here that holds
+	 *             r1*r2*r3 floats. Nothing in this function frees it;
+	 *             presumably the caller owns it - TODO confirm.
+	 * r1,r2,r3  - dimensions of the array to reconstruct.
+	 * comp_data - compressed stream. No length is passed in, so reads from it
+	 *             are not bounds-checked here.
+	 *
+	 * Fields are consumed from comp_data in this order:
+	 *   block_size (int), realPrecision (double), intervals (int),
+	 *   tree_size (int), node count (int) followed by tree_size bytes of
+	 *   serialized Huffman tree, use_mean (1 byte), mean (float),
+	 *   per-block indicator bits, four regression-coefficient sections (only
+	 *   when at least one block has a zero indicator), total_unpred (size_t,
+	 *   copied with memcpy), total_unpred floats of unpredictable samples, and
+	 *   finally the encoded per-sample type codes.
+	 *
+	 * Every block is decoded into a zero-padded scratch cube and then copied
+	 * into *data, so the prediction stencil never reads samples that belong to
+	 * another block. Regression coefficients are the exception: they are
+	 * rebuilt relative to the previous regression block (last_coefficients).
+	 * The results of malloc are not checked for NULL.
+	 */
+	size_t dim0_offset = r2 * r3; // elements between consecutive indices of the first (r1) dimension
+	size_t dim1_offset = r3; // elements between consecutive indices of the second (r2) dimension
+	size_t num_elements = r1 * r2 * r3;
+
+	*data = (float*)malloc(sizeof(float)*num_elements); // output array; result is not NULL-checked
+
+	unsigned char * comp_data_pos = comp_data; // read cursor into the compressed stream
+
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos); // edge length of one cubic block
+	comp_data_pos += sizeof(int);
+	// number of blocks along each dimension: ceil(r / block_size) for r >= 1
+	size_t num_x, num_y, num_z;
+	num_x = (r1 - 1) / block_size + 1;
+	num_y = (r2 - 1) / block_size + 1;
+	num_z = (r3 - 1) / block_size + 1;
+
+	size_t max_num_block_elements = block_size * block_size * block_size; // every block carries this many type codes, edge blocks included
+	size_t num_blocks = num_x * num_y * num_z;
+
+	double realPrecision = bytesToDouble(comp_data_pos); // quantization bins are 2*realPrecision wide (see the reconstruction formula below)
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	//updateQuantizationInfo(intervals);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos); // byte size of the serialized tree that follows the node count
+	comp_data_pos += sizeof(int);
+	// Huffman tree used for the per-sample type codes; released after the final decode() below
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+sizeof(int), nodeCount);
+	comp_data_pos += sizeof(int) + tree_size; // skip the node count and the tree bytes
+
+	float mean;
+	unsigned char use_mean; // selects which of the two decode loops below runs
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(float)); // read unconditionally; only used when use_mean is non-zero
+	comp_data_pos += sizeof(float);
+	size_t reg_count = 0; // number of blocks whose indicator is zero (regression blocks)
+
+	unsigned char * indicator;
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1; // despite the name this is a byte count: one bit per block, rounded up
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator); // presumably allocates indicator with one byte per block (it is freed at the end) - TODO confirm
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+
+	int coeff_intvRadius[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int)); // four slices of num_blocks codes; only reg_count codes per slice are requested from decode()
+	int * coeff_type[4];
+	double precision[4];
+	float * coeff_unpred_data[4];
+	if(reg_count > 0){ // when there is no regression block the arrays above stay uninitialized; they are only read for zero-indicator blocks
+		for(int i=0; i<4; i++){ // one section per regression coefficient
+			precision[i] = bytesToDouble(comp_data_pos);
+			comp_data_pos += sizeof(double);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos); // shadows the outer tree_size
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2; // shadows the outer stateNum
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+sizeof(int), nodeCount); // shadows the outer root, which stays valid for the final decode()
+			comp_data_pos += sizeof(int) + tree_size;
+
+			coeff_type[i] = coeff_result_type + i * num_blocks;
+			size_t typeArray_size = bytesToSize(comp_data_pos); // byte length of the encoded codes that follow
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]); // presumably Huffman-decodes reg_count codes into coeff_type[i] - TODO confirm
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (float *) comp_data_pos; // points into comp_data, no copy. NOTE(review): alignment of this address is not established here
+			comp_data_pos += coeff_unpred_count * sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree); // releases the per-coefficient tree declared in this loop body
+		}
+	}
+	float last_coefficients[4] = {0.0}; // coefficients of the most recent regression block; start at zero
+	int coeff_unpred_data_count[4] = {0}; // how many unpredictable coefficients have been consumed, per coefficient
+	int coeff_index = 0; // index of the next regression block inside coeff_type[e]
+	//updateQuantizationInfo(intervals);
+
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t)); // raw copy in host byte order, unlike the bytesTo* helpers used above
+	comp_data_pos += sizeof(size_t);
+	float * unpred_data = (float *) comp_data_pos; // points into comp_data, no copy. NOTE(review): alignment is not established here
+	comp_data_pos += total_unpred * sizeof(float);
+
+	int * result_type = (int *) malloc(num_blocks*max_num_block_elements * sizeof(int)); // block_size^3 type codes for every block
+	decode(comp_data_pos, num_blocks*max_num_block_elements, root, result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+	// a code c other than 0 (and other than 1 when use_mean is set) reconstructs as pred + 2*(c - intvRadius)*realPrecision
+	int intvRadius = intervals/2;
+	// per-block cursors and the scratch cube of (block_size+1)^3 floats used to decode one block at a time
+	int * type;
+	float * data_pos = *data;
+	size_t cur_unpred_count;
+	unsigned char * indicator_pos = indicator;
+	int dec_buffer_size = block_size + 1; // one extra layer per axis serves as padding for the stencil
+	float * dec_buffer = (float *) malloc(dec_buffer_size*dec_buffer_size*dec_buffer_size*sizeof(float));
+	memset(dec_buffer, 0, dec_buffer_size*dec_buffer_size*dec_buffer_size*sizeof(float)); // zeroed once; the padding layer is never written afterwards
+	float * block_data_pos_x = NULL;
+	float * block_data_pos_y = NULL;
+	float * block_data_pos_z = NULL;
+	int block_dim0_offset = dec_buffer_size*dec_buffer_size; // strides inside the scratch cube
+	int block_dim1_offset = dec_buffer_size;
+	if(use_mean){ // this branch differs from the else branch only in treating type code 1 as "sample equals mean"
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = dec_buffer + dec_buffer_size*dec_buffer_size + dec_buffer_size + 1; // scratch coordinate (1,1,1); index 0 on each axis is the zero padding
+					if(*indicator_pos){
+						// non-zero indicator: each sample is predicted from its already-decoded neighbours inside the scratch cube
+						// cur_unpred_count = decompressDataSeries_float_3D_blocked_nonblock_pred(data_pos, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, i, j, k, realPrecision, type, unpred_data);
+						float * block_data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									block_data_pos = data_pos + ii*block_dim0_offset + jj*block_dim1_offset + kk;
+									type_ = type[index];
+									if(type_ == 1){ // code 1: the sample is the stored mean
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){ // code 0: the sample is stored verbatim in unpred_data
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{ // 7-neighbour stencil over the previous index along each axis; at ii/jj/kk == 0 it reads the zero padding
+										pred = block_data_pos[-1] + block_data_pos[-block_dim1_offset]+ block_data_pos[-block_dim0_offset] - block_data_pos[-block_dim1_offset - 1]
+												 - block_data_pos[-block_dim0_offset - 1] - block_data_pos[-block_dim0_offset - block_dim1_offset] + block_data_pos[-block_dim0_offset - block_dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+								}
+							}
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// zero indicator: samples are predicted by a linear function of the in-block coordinates
+						{
+							// rebuild the four coefficients: code 0 takes a stored value, any other code is an offset from the previous regression block's coefficient
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								// if(i == 0 && j == 0 && k == 19){
+								// 	printf("~\n");
+								// }
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<block_size; ii++){
+								for(size_t jj=0; jj<block_size; jj++){
+									for(size_t kk=0; kk<block_size; kk++){
+										type_ = type[index];
+										if (type_ != 0){ // note: code 1 gets no special "mean" treatment in regression blocks
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+									}
+								}
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					unpred_data += cur_unpred_count; // advance past the unpredictable samples this block consumed
+					// decomp_unpred += cur_unpred_count;
+					// printf("block comp done, data_offset from %ld to %ld: diff %ld\n", *data, data_pos, data_pos - *data);
+					// fflush(stdout);
+					type += block_size * block_size * block_size; // always a full block of codes, even for edge blocks
+
+					// copy the decoded block from the scratch cube into *data, dropping samples that fall outside r1/r2/r3
+					block_data_pos_x = *data + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					for(int ii=0; ii<block_size; ii++){
+						if(i*block_size + ii >= r1) break;
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							if(j*block_size + jj >= r2) break;
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								if(k*block_size + kk >= r3) break;
+								*block_data_pos_z = data_pos[ii*dec_buffer_size*dec_buffer_size + jj*dec_buffer_size + kk];
+								block_data_pos_z ++;
+							}
+							block_data_pos_y += dim1_offset;
+						}
+						block_data_pos_x += dim0_offset;
+					}
+
+				}
+			}
+		}
+
+	}
+	else{ // same per-block procedure as the use_mean branch, without the special case for type code 1
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = dec_buffer + dec_buffer_size*dec_buffer_size + dec_buffer_size + 1; // scratch coordinate (1,1,1); index 0 on each axis is the zero padding
+					if(*indicator_pos){
+						// non-zero indicator: each sample is predicted from its already-decoded neighbours inside the scratch cube
+						// cur_unpred_count = decompressDataSeries_float_3D_blocked_nonblock_pred(data_pos, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, i, j, k, realPrecision, type, unpred_data);
+						float * block_data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									block_data_pos = data_pos + ii*block_dim0_offset + jj*block_dim1_offset + kk;
+									type_ = type[index];
+									if(type_ == 0){ // code 0: the sample is stored verbatim in unpred_data
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{ // 7-neighbour stencil over the previous index along each axis; at ii/jj/kk == 0 it reads the zero padding
+										pred = block_data_pos[-1] + block_data_pos[-block_dim1_offset]+ block_data_pos[-block_dim0_offset] - block_data_pos[-block_dim1_offset - 1]
+												 - block_data_pos[-block_dim0_offset - 1] - block_data_pos[-block_dim0_offset - block_dim1_offset] + block_data_pos[-block_dim0_offset - block_dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+								}
+							}
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// zero indicator: samples are predicted by a linear function of the in-block coordinates
+						{
+							// rebuild the four coefficients: code 0 takes a stored value, any other code is an offset from the previous regression block's coefficient
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								// if(i == 0 && j == 0 && k == 19){
+								// 	printf("~\n");
+								// }
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<block_size; ii++){
+								for(size_t jj=0; jj<block_size; jj++){
+									for(size_t kk=0; kk<block_size; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+									}
+								}
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					unpred_data += cur_unpred_count; // advance past the unpredictable samples this block consumed
+					// decomp_unpred += cur_unpred_count;
+					// printf("block comp done, data_offset from %ld to %ld: diff %ld\n", *data, data_pos, data_pos - *data);
+					// fflush(stdout);
+					type += block_size * block_size * block_size; // always a full block of codes, even for edge blocks
+					// copy the decoded block from the scratch cube into *data, dropping samples that fall outside r1/r2/r3
+					block_data_pos_x = *data + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					for(int ii=0; ii<block_size; ii++){
+						if(i*block_size + ii >= r1) break;
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							if(j*block_size + jj >= r2) break;
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								if(k*block_size + kk >= r3) break;
+								*block_data_pos_z = data_pos[ii*dec_buffer_size*dec_buffer_size + jj*dec_buffer_size + kk];
+								block_data_pos_z ++;
+							}
+							block_data_pos_y += dim1_offset;
+						}
+						block_data_pos_x += dim0_offset;
+					}
+				}
+			}
+		}
+	}
+	free(dec_buffer);
+	free(coeff_result_type);
+
+	free(indicator);
+	free(result_type);
+}
+
+void decompressDataSeries_float_3D_decompression_random_access_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data){
+	// Decodes a whole r1 x r2 x r3 float volume from comp_data, one block_size^3 block at a time, into a buffer malloc'ed here and returned through *data.
+	size_t dim0_offset = r2 * r3; // stride between consecutive first-dimension slices of the output
+	size_t dim1_offset = r3; // stride between consecutive second-dimension rows of the output
+	size_t num_elements = r1 * r2 * r3;
+	// The output buffer is not freed in this function; presumably the caller owns it -- confirm against callers.
+	*data = (float*)malloc(sizeof(float)*num_elements);
+	// comp_data_pos is the read cursor: every field below is consumed in stream order, so statement order matters.
+	unsigned char * comp_data_pos = comp_data;
+	// Field 1: block edge length, read with bytesToInt_bigEndian.
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	// number of blocks per dimension, rounded up so partial edge blocks are counted
+	size_t num_x, num_y, num_z;
+	num_x = (r1 - 1) / block_size + 1;
+	num_y = (r2 - 1) / block_size + 1;
+	num_z = (r3 - 1) / block_size + 1;
+
+	size_t max_num_block_elements = block_size * block_size * block_size;
+	size_t num_blocks = num_x * num_y * num_z;
+	// Field 2: error bound used to scale quantization codes; field 3: quantization interval count.
+	double realPrecision = bytesToDouble(comp_data_pos);
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	//updateQuantizationInfo(intervals);
+	// Field 4: byte length of the serialized Huffman tree that follows its node count.
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+sizeof(int), nodeCount);
+	comp_data_pos += sizeof(int) + tree_size; // skip the node count plus the tree bytes
+	// Next: a one-byte use_mean flag followed by the float mean (both are always present in the stream).
+	float mean;
+	unsigned char use_mean;
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(float));
+	comp_data_pos += sizeof(float);
+	size_t reg_count = 0; // number of blocks whose indicator is 0, i.e. blocks decoded with the regression predictor
+	// Per-block indicator: packed one bit per block in the stream, expanded to one byte per block (the expanded array is freed at the end).
+	unsigned char * indicator;
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1; // despite the name this is a byte count
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator);
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+	// Regression coefficient streams (4 per block: three slopes and one intercept); present only when reg_count > 0.
+	int coeff_intvRadius[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int)); // backing store for the four coeff_type[] arrays
+	int * coeff_type[4];
+	double precision[4];
+	float * coeff_unpred_data[4];
+	if(reg_count > 0){
+		for(int i=0; i<4; i++){
+			precision[i] = bytesToDouble(comp_data_pos);
+			comp_data_pos += sizeof(double);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos); // shadows the outer tree_size on purpose: this coefficient has its own tree
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	// shadows the outer tree; released at the end of this iteration
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+sizeof(int), nodeCount);
+			comp_data_pos += sizeof(int) + tree_size;
+			// one quantization code per regression block for coefficient i
+			coeff_type[i] = coeff_result_type + i * num_blocks;
+			size_t typeArray_size = bytesToSize(comp_data_pos);
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]);
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (float *) comp_data_pos; // points into comp_data, no copy; NOTE(review): assumes this offset is acceptable for float access
+			comp_data_pos += coeff_unpred_count * sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	float last_coefficients[4] = {0.0}; // running coefficient values; each regression block is decoded as a delta from the previous one
+	int coeff_unpred_data_count[4] = {0};
+	int coeff_index = 0; // index of the next regression block within coeff_type[e]
+	//updateQuantizationInfo(intervals);
+	int intvRadius = intervals/2;
+	// Unpredictable-value section: total count, then a compressed per-block count array, then the raw floats.
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	size_t compressed_blockwise_unpred_count_size;
+	memcpy(&compressed_blockwise_unpred_count_size, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	int * blockwise_unpred_count = (int *)SZ_decompress(SZ_INT32, comp_data_pos, compressed_blockwise_unpred_count_size, 0, 0, 0, 0, num_blocks); // not read again here; only freed at the end
+	comp_data_pos += compressed_blockwise_unpred_count_size;
+	// unpred_data walks the raw floats in place; it is advanced after every block by the number of values that block consumed
+	float * unpred_data = (float *) comp_data_pos;
+	comp_data_pos += total_unpred * sizeof(float);
+	// Per-block byte length of each Huffman-coded type array, itself stored compressed.
+	size_t compressed_type_array_block_size;
+	memcpy(&compressed_type_array_block_size, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	unsigned short * type_array_block_size = (unsigned short *)SZ_decompress(SZ_INT16, comp_data_pos, compressed_type_array_block_size, 0, 0, 0, 0, num_blocks);
+	comp_data_pos += compressed_type_array_block_size;
+	// Huffman-decode every block's quantization codes up front; each block occupies max_num_block_elements slots in result_type.
+	int * result_type = (int *) malloc(num_blocks*max_num_block_elements * sizeof(int));
+	// decode(comp_data_pos, num_blocks*max_num_block_elements, root, result_type);
+	int * block_type = result_type;
+	unsigned short * type_array_block_size_pos = type_array_block_size;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			for(size_t k=0; k<num_z; k++){	
+				decode(comp_data_pos, max_num_block_elements, root, block_type);
+				comp_data_pos += *type_array_block_size_pos; // each block's code stream starts at its own recorded byte offset
+				type_array_block_size_pos ++;
+				block_type += max_num_block_elements;
+			}
+		}
+	}
+	free(type_array_block_size);
+	// the main tree is no longer needed once all type arrays are decoded
+	SZ_ReleaseHuffman(huffmanTree);	
+	int * type;
+	float * data_pos = *data;
+	size_t cur_unpred_count;
+	unsigned char * indicator_pos = indicator;
+	int dec_buffer_size = block_size + 1; // scratch cube has one extra layer per dimension in front of the block
+	float * dec_buffer = (float *) malloc(dec_buffer_size*dec_buffer_size*dec_buffer_size*sizeof(float));
+	memset(dec_buffer, 0, dec_buffer_size*dec_buffer_size*dec_buffer_size*sizeof(float)); // the extra layer is zeroed once here and never written below
+	float * block_data_pos_x = NULL;
+	float * block_data_pos_y = NULL;
+	float * block_data_pos_z = NULL;
+	int block_dim0_offset = dec_buffer_size*dec_buffer_size;
+	int block_dim1_offset = dec_buffer_size;
+	if(use_mean){
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = dec_buffer + dec_buffer_size*dec_buffer_size + dec_buffer_size + 1; // element (1,1,1) of the scratch cube, so the -1 neighbours read the zero layer
+					if(*indicator_pos){
+						// decompress by SZ
+						// cur_unpred_count = decompressDataSeries_float_3D_blocked_nonblock_pred(data_pos, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, i, j, k, realPrecision, type, unpred_data);
+						float * block_data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									block_data_pos = data_pos + ii*block_dim0_offset + jj*block_dim1_offset + kk;
+									type_ = type[index];
+									if(type_ == 1){ // code 1: the value is the stored mean
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){ // code 0: value stored verbatim in the unpredictable stream
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{ // any other code: predict from the seven already-decoded neighbours, then add the quantized residual
+										pred = block_data_pos[-1] + block_data_pos[-block_dim1_offset]+ block_data_pos[-block_dim0_offset] - block_data_pos[-block_dim1_offset - 1]
+												 - block_data_pos[-block_dim0_offset - 1] - block_data_pos[-block_dim0_offset - block_dim1_offset] + block_data_pos[-block_dim0_offset - block_dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+								}
+							}
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								// if(i == 0 && j == 0 && k == 19){
+								// 	printf("~\n");
+								// }
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){ // quantized delta against the previous regression block's coefficient
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{ // coefficient stored verbatim
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<block_size; ii++){
+								for(size_t jj=0; jj<block_size; jj++){
+									for(size_t kk=0; kk<block_size; kk++){
+										type_ = type[index];
+										if (type_ != 0){ // linear prediction c0*ii + c1*jj + c2*kk + c3 plus quantized residual (code 1 is not special-cased here)
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+									}
+								}
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					unpred_data += cur_unpred_count; // skip the unpredictable values this block consumed
+					// decomp_unpred += cur_unpred_count;
+					// printf("block comp done, data_offset from %ld to %ld: diff %ld\n", *data, data_pos, data_pos - *data);
+					// fflush(stdout);
+					type += block_size * block_size * block_size;
+					// Copy the decoded block from the scratch cube into the output, clipping at the volume edges.
+					// mv data back
+					block_data_pos_x = *data + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					for(int ii=0; ii<block_size; ii++){
+						if(i*block_size + ii >= r1) break;
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							if(j*block_size + jj >= r2) break;
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								if(k*block_size + kk >= r3) break;
+								*block_data_pos_z = data_pos[ii*dec_buffer_size*dec_buffer_size + jj*dec_buffer_size + kk];
+								block_data_pos_z ++;
+							}
+							block_data_pos_y += dim1_offset;
+						}
+						block_data_pos_x += dim0_offset;
+					}
+
+				}
+			}
+		}
+
+	}
+	else{ // same per-block decoding as above, minus the "code 1 means mean" case
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = dec_buffer + dec_buffer_size*dec_buffer_size + dec_buffer_size + 1;
+					if(*indicator_pos){
+						// decompress by SZ
+						// cur_unpred_count = decompressDataSeries_float_3D_blocked_nonblock_pred(data_pos, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, i, j, k, realPrecision, type, unpred_data);
+						float * block_data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									block_data_pos = data_pos + ii*block_dim0_offset + jj*block_dim1_offset + kk;
+									type_ = type[index];
+									if(type_ == 0){ // code 0: value stored verbatim in the unpredictable stream
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{ // predict from the seven already-decoded neighbours, then add the quantized residual
+										pred = block_data_pos[-1] + block_data_pos[-block_dim1_offset]+ block_data_pos[-block_dim0_offset] - block_data_pos[-block_dim1_offset - 1]
+												 - block_data_pos[-block_dim0_offset - 1] - block_data_pos[-block_dim0_offset - block_dim1_offset] + block_data_pos[-block_dim0_offset - block_dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+								}
+							}
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								// if(i == 0 && j == 0 && k == 19){
+								// 	printf("~\n");
+								// }
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){ // quantized delta against the previous regression block's coefficient
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{ // coefficient stored verbatim
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<block_size; ii++){
+								for(size_t jj=0; jj<block_size; jj++){
+									for(size_t kk=0; kk<block_size; kk++){
+										type_ = type[index];
+										if (type_ != 0){ // linear prediction c0*ii + c1*jj + c2*kk + c3 plus quantized residual
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+									}
+								}
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					unpred_data += cur_unpred_count; // skip the unpredictable values this block consumed
+					// decomp_unpred += cur_unpred_count;
+					// printf("block comp done, data_offset from %ld to %ld: diff %ld\n", *data, data_pos, data_pos - *data);
+					// fflush(stdout);
+					type += block_size * block_size * block_size;
+					// mv data back: copy the decoded block into the output, clipping at the volume edges
+					block_data_pos_x = *data + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					for(int ii=0; ii<block_size; ii++){
+						if(i*block_size + ii >= r1) break;
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							if(j*block_size + jj >= r2) break;
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								if(k*block_size + kk >= r3) break;
+								*block_data_pos_z = data_pos[ii*dec_buffer_size*dec_buffer_size + jj*dec_buffer_size + kk];
+								block_data_pos_z ++;
+							}
+							block_data_pos_y += dim1_offset;
+						}
+						block_data_pos_x += dim0_offset;
+					}
+				}
+			}
+		}
+	}
+	free(blockwise_unpred_count);
+	free(dec_buffer);
+	free(coeff_result_type);
+	// *data is intentionally not freed: it is the function's result
+	free(indicator);
+	free(result_type);
+}
+
+
+#ifdef HAVE_RANDOMACCESS
+void decompressDataSeries_float_1D_decompression_given_areas_with_blocked_regression(float** data, size_t r1, size_t s1, size_t e1, unsigned char* comp_data){
+	if(s1 >= e1){ *data = NULL; return; } // decodes elements [s1, e1) of an r1-long series; an empty or inverted range would make (e1 - 1) / (e1 - s1) wrap around as size_t
+	unsigned char * comp_data_pos = comp_data; // read cursor: every field below is consumed in stream order
+
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	// number of blocks, rounded up so a partial last block is counted
+	size_t num_x;
+	num_x = (r1 - 1) / block_size + 1;
+
+	size_t max_num_block_elements = block_size;
+	size_t num_blocks = num_x;
+	// error bound used to scale quantization codes, then the quantization interval count
+	double realPrecision = bytesToDouble(comp_data_pos);
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	//updateQuantizationInfo(intervals);
+	// byte length of the serialized Huffman tree that follows its node count
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+sizeof(int), nodeCount);
+	comp_data_pos += sizeof(int) + tree_size; // skip the node count plus the tree bytes
+	// one-byte use_mean flag followed by the float mean (both are always present in the stream)
+	float mean;
+	unsigned char use_mean;
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(float));
+	comp_data_pos += sizeof(float);
+	size_t reg_count = 0; // number of blocks whose indicator is 0, i.e. blocks decoded with the regression predictor
+	// per-block indicator: packed one bit per block in the stream, expanded to one byte per block (freed below)
+	unsigned char * indicator;
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1; // despite the name this is a byte count
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator);
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+	// regression coefficient streams (2 per block: slope and intercept); present only when reg_count > 0
+	int coeff_intvRadius[2];
+	int * coeff_result_type = (int *) malloc(num_blocks*2*sizeof(int)); // backing store for both coeff_type[] arrays
+	int * coeff_type[2];
+	double precision[2];
+	float * coeff_unpred_data[2];
+	if(reg_count > 0){
+		for(int i=0; i<2; i++){
+			precision[i] = bytesToDouble(comp_data_pos);
+			comp_data_pos += sizeof(double);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos); // shadows the outer tree_size on purpose: this coefficient has its own tree
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	// shadows the outer tree; released at the end of this iteration
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+sizeof(int), nodeCount);
+			comp_data_pos += sizeof(int) + tree_size;
+			// one quantization code per regression block for coefficient i
+			coeff_type[i] = coeff_result_type + i * num_blocks;
+			size_t typeArray_size = bytesToSize(comp_data_pos);
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]);
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (float *) comp_data_pos; // points into comp_data, no copy
+			comp_data_pos += coeff_unpred_count * sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	float last_coefficients[2] = {0.0};
+	int coeff_unpred_data_count[2] = {0};
+	// Coefficients are delta-coded block to block, so all regression blocks are restored from the start, not just the requested ones.
+	float * reg_params = (float *) malloc(2*num_blocks*sizeof(float)); // 2 floats per block; stays zero for non-regression blocks
+	memset(reg_params, 0, 2*num_blocks*sizeof(float));
+	float * reg_params_pos = reg_params;
+	size_t coeff_index = 0; // here: index among regression blocks only
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]){
+			float pred;
+			int type_;
+			for(int e=0; e<2; e++){
+				type_ = coeff_type[e][coeff_index];
+				if (type_ != 0){ // quantized delta against the previous regression block's coefficient
+					pred = last_coefficients[e];
+					last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+				}
+				else{ // coefficient stored verbatim
+					last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+					coeff_unpred_data_count[e] ++;
+				}
+				reg_params_pos[e] = last_coefficients[e];
+			}
+			coeff_index ++;
+		}
+		reg_params_pos += 2;
+	}
+
+	//updateQuantizationInfo(intervals);
+	int intvRadius = intervals/2;
+	// unpredictable-value section: total count, compressed per-block counts, then the raw floats
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	size_t compressed_blockwise_unpred_count_size;
+	memcpy(&compressed_blockwise_unpred_count_size, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	int * blockwise_unpred_count = NULL;
+	SZ_decompress_args_int32(&blockwise_unpred_count, 0, 0, 0, 0, num_blocks, comp_data_pos, compressed_blockwise_unpred_count_size);
+	comp_data_pos += compressed_blockwise_unpred_count_size;
+	size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t)); // prefix sums: where each block's unpredictable values start
+	size_t cur_offset = 0;
+	for(size_t i=0; i<num_blocks; i++){
+		unpred_offset[i] = cur_offset;
+		cur_offset += blockwise_unpred_count[i];
+	}
+
+	float * unpred_data = (float *) comp_data_pos; // read in place from comp_data
+	comp_data_pos += total_unpred * sizeof(float);
+	// per-block byte length of each Huffman-coded type array, itself stored compressed
+	size_t compressed_type_array_block_size;
+	memcpy(&compressed_type_array_block_size, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	unsigned short * type_array_block_size = NULL;
+	SZ_decompress_args_uint16(&type_array_block_size, 0, 0, 0, 0, num_blocks, comp_data_pos, compressed_type_array_block_size);
+
+	comp_data_pos += compressed_type_array_block_size;
+
+	// compute given area
+	size_t sx = s1 / block_size; // first block touched by the range
+	size_t ex = (e1 - 1) / block_size + 1; // one past the last block touched (e1 is exclusive)
+
+	unsigned short * type_array_block_size_pos = type_array_block_size;
+	size_t * type_array_offset = (size_t *) malloc(num_blocks * sizeof(size_t)); // prefix sums: byte offset of each block's code stream
+	size_t * type_array_offset_pos = type_array_offset;
+	size_t cur_type_array_offset = 0;
+	for(size_t i=0; i<num_x; i++){
+		*(type_array_offset_pos++) = cur_type_array_offset;
+		cur_type_array_offset += *(type_array_block_size_pos++);
+	}
+	free(type_array_block_size);
+	int * result_type = (int *) malloc((ex - sx)*block_size*sizeof(int));
+	int * block_type = result_type;
+	for(size_t i=sx; i<ex; i++){ // Huffman-decode only the blocks in [sx, ex)
+		size_t index = i;
+		decode(comp_data_pos + type_array_offset[index], max_num_block_elements, root, block_type);
+		block_type += max_num_block_elements;
+	}
+	SZ_ReleaseHuffman(huffmanTree);
+	free(type_array_offset);
+
+	int * type = NULL;
+	float * data_pos = NULL; // was `*data`: that read the caller's not-yet-assigned output slot; the value is always overwritten before use
+	int dec_buffer_size = block_size + 1; // scratch row has one extra leading element
+	float * dec_buffer = (float *) malloc(dec_buffer_size*sizeof(float));
+	memset(dec_buffer, 0, dec_buffer_size*sizeof(float)); // the leading element stays 0 and serves as the [-1] neighbour of each block's first value
+	float * block_data_pos_x = NULL;
+	// printf("decompression start, %d %d %d, %d %d %d, total unpred %ld\n", sx, sy, sz, ex, ey, ez, total_unpred);
+	// fflush(stdout);
+	float * dec_block_data = (float *) malloc((ex - sx)*block_size*sizeof(float)); // whole decoded blocks [sx, ex); trimmed to [s1, e1) at the end
+	memset(dec_block_data, 0, (ex - sx)*block_size*sizeof(float));
+	if(use_mean){
+		for(size_t i=sx; i<ex; i++){
+			data_pos = dec_buffer + 1;
+			type = result_type + (i-sx) * block_size;
+			coeff_index = i; // from here on coeff_index is the absolute block index
+			float * block_unpred = unpred_data + unpred_offset[coeff_index];
+			if(indicator[coeff_index]){
+				// decompress by SZ
+				float * block_data_pos;
+				float pred;
+				size_t index = 0;
+				int type_;
+				size_t unpredictable_count = 0;
+				for(size_t ii=0; ii<block_size; ii++){
+					block_data_pos = data_pos + ii;
+					type_ = type[index];
+					if(type_ == 1){ // code 1: the value is the stored mean
+						*block_data_pos = mean;
+					}
+					else if(type_ == 0){ // code 0: value stored verbatim
+						*block_data_pos = block_unpred[unpredictable_count ++];
+					}
+					else{ // previous decoded value plus the quantized residual
+						pred = block_data_pos[-1];
+						*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+					}
+					index ++;
+				}
+			}
+			else{
+				// decompress by regression
+				reg_params_pos = reg_params + 2*coeff_index;
+				{
+					float pred;
+					int type_;
+					size_t index = 0;
+					size_t unpredictable_count = 0;
+					for(size_t ii=0; ii<block_size; ii++){
+						type_ = type[index];
+						if (type_ != 0){ // linear prediction slope*ii + intercept plus the quantized residual
+							pred = reg_params_pos[0] * ii + reg_params_pos[1];
+							data_pos[ii] = pred + 2 * (type_ - intvRadius) * realPrecision;
+						}
+						else{
+							data_pos[ii] = block_unpred[unpredictable_count ++];
+						}
+						index ++;	
+					}
+				}
+			}
+
+			// mv data back
+			block_data_pos_x = dec_block_data + (i-sx)*block_size;
+			for(size_t ii=0; ii<block_size; ii++){ // size_t counter: matches block_size and the r1 comparison below
+				if(i*block_size + ii >= r1) break;
+				*block_data_pos_x = data_pos[ii];
+				block_data_pos_x ++;
+			}
+		}
+
+	}
+	else{ // same per-block decoding as above, minus the "code 1 means mean" case
+		for(size_t i=sx; i<ex; i++){
+			data_pos = dec_buffer + 1;
+			type = result_type + (i-sx) * block_size;
+			coeff_index = i;
+			float * block_unpred = unpred_data + unpred_offset[coeff_index];
+			if(indicator[coeff_index]){
+				// decompress by SZ
+				float * block_data_pos;
+				float pred;
+				size_t index = 0;
+				int type_;
+				size_t unpredictable_count = 0;
+				for(size_t ii=0; ii<block_size; ii++){
+					block_data_pos = data_pos + ii;
+					type_ = type[index];
+					if(type_ == 0){ // code 0: value stored verbatim
+						*block_data_pos = block_unpred[unpredictable_count ++];
+					}
+					else{ // previous decoded value plus the quantized residual
+						pred = block_data_pos[-1];
+						*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+					}
+					index ++;
+				}
+			}
+			else{
+				// decompress by regression
+				reg_params_pos = reg_params + 2*coeff_index;
+				{
+					float pred;
+					int type_;
+					size_t index = 0;
+					size_t unpredictable_count = 0;
+					for(size_t ii=0; ii<block_size; ii++){
+						type_ = type[index];
+						if (type_ != 0){ // linear prediction slope*ii + intercept plus the quantized residual
+							pred = reg_params_pos[0] * ii + reg_params_pos[1];
+							data_pos[ii] = pred + 2 * (type_ - intvRadius) * realPrecision;
+						}
+						else{
+							data_pos[ii] = block_unpred[unpredictable_count ++];
+						}
+						index ++;	
+					}
+				}
+			}
+
+			// mv data back
+			block_data_pos_x = dec_block_data + (i-sx)*block_size;
+			for(size_t ii=0; ii<block_size; ii++){ // size_t counter: matches block_size and the r1 comparison below
+				if(i*block_size + ii >= r1) break;
+				*block_data_pos_x = data_pos[ii];
+				block_data_pos_x ++;
+			}
+		}
+	}
+	free(unpred_offset);
+	free(reg_params);
+	free(blockwise_unpred_count);
+	free(dec_buffer);
+	free(coeff_result_type);
+
+	free(indicator);
+	free(result_type);
+
+	// extract data
+	size_t resi_x = s1 % block_size; // offset of s1 inside the first decoded block
+	*data = (float*) malloc(sizeof(float)*(e1 - s1)); // result buffer; not freed here, presumably owned by the caller -- confirm against callers
+	float * final_data_pos = *data;
+	float * block_data_pos = dec_block_data + resi_x;
+	for(size_t i=0; i<(e1 - s1); i++){ // size_t counter: an int would overflow for ranges above INT_MAX
+		*(final_data_pos++) = *(block_data_pos++);
+	}
+	free(dec_block_data);
+}
+
+void decompressDataSeries_float_2D_decompression_given_areas_with_blocked_regression(float** data, size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2, unsigned char* comp_data){
+
+	unsigned char * comp_data_pos = comp_data;
+
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	// calculate block dims
+	size_t num_x, num_y;
+	num_x = (r1 - 1) / block_size + 1;
+	num_y = (r2 - 1) / block_size + 1;
+
+	size_t max_num_block_elements = block_size * block_size;
+	size_t num_blocks = num_x * num_y;
+
+	double realPrecision = bytesToDouble(comp_data_pos);
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	//updateQuantizationInfo(intervals);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+sizeof(int), nodeCount);
+	comp_data_pos += sizeof(int) + tree_size;
+
+	float mean;
+	unsigned char use_mean;
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(float));
+	comp_data_pos += sizeof(float);
+	size_t reg_count = 0;
+
+	unsigned char * indicator;
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1;
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator);
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+
+	int coeff_intvRadius[3];
+	int * coeff_result_type = (int *) malloc(num_blocks*3*sizeof(int));
+	int * coeff_type[3];
+	double precision[3];
+	float * coeff_unpred_data[3];
+	if(reg_count > 0){
+		for(int i=0; i<3; i++){
+			precision[i] = bytesToDouble(comp_data_pos);
+			comp_data_pos += sizeof(double);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+sizeof(int), nodeCount);
+			comp_data_pos += sizeof(int) + tree_size;
+
+			coeff_type[i] = coeff_result_type + i * num_blocks;
+			size_t typeArray_size = bytesToSize(comp_data_pos);
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]);
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (float *) comp_data_pos;
+			comp_data_pos += coeff_unpred_count * sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	float last_coefficients[3] = {0.0};
+	int coeff_unpred_data_count[3] = {0};
+	// decompress coeffcients
+	float * reg_params = (float *) malloc(3*num_blocks*sizeof(float));
+	memset(reg_params, 0, 3*num_blocks*sizeof(float));
+	float * reg_params_pos = reg_params;
+	size_t coeff_index = 0;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]){
+			float pred;
+			int type_;
+			for(int e=0; e<3; e++){
+				type_ = coeff_type[e][coeff_index];
+				if (type_ != 0){
+					pred = last_coefficients[e];
+					last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+				}
+				else{
+					last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+					coeff_unpred_data_count[e] ++;
+				}
+				reg_params_pos[e] = last_coefficients[e];
+			}
+			coeff_index ++;
+		}
+		reg_params_pos += 3;
+	}
+
+	//updateQuantizationInfo(intervals);
+	int intvRadius = intervals/2;
+
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	size_t compressed_blockwise_unpred_count_size;
+	memcpy(&compressed_blockwise_unpred_count_size, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	int * blockwise_unpred_count = NULL;
+	SZ_decompress_args_int32(&blockwise_unpred_count, 0, 0, 0, 0, num_blocks, comp_data_pos, compressed_blockwise_unpred_count_size);
+	comp_data_pos += compressed_blockwise_unpred_count_size;
+	size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	size_t cur_offset = 0;
+	for(size_t i=0; i<num_blocks; i++){
+		unpred_offset[i] = cur_offset;
+		cur_offset += blockwise_unpred_count[i];
+	}
+
+	float * unpred_data = (float *) comp_data_pos;
+	comp_data_pos += total_unpred * sizeof(float);
+
+	size_t compressed_type_array_block_size;
+	memcpy(&compressed_type_array_block_size, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	unsigned short * type_array_block_size = NULL;
+	SZ_decompress_args_uint16(&type_array_block_size, 0, 0, 0, 0, num_blocks, comp_data_pos, compressed_type_array_block_size);
+
+	comp_data_pos += compressed_type_array_block_size;
+
+	// compute given area
+	size_t sx = s1 / block_size;
+	size_t sy = s2 / block_size;
+	size_t ex = (e1 - 1) / block_size + 1;
+	size_t ey = (e2 - 1) / block_size + 1;
+
+	size_t dec_block_dim0_offset = (ey - sy)*block_size;
+	unsigned short * type_array_block_size_pos = type_array_block_size;
+	size_t * type_array_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	size_t * type_array_offset_pos = type_array_offset;
+	size_t cur_type_array_offset = 0;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			*(type_array_offset_pos++) = cur_type_array_offset;
+			cur_type_array_offset += *(type_array_block_size_pos++);
+		}
+	}
+	free(type_array_block_size);
+	int * result_type = (int *) malloc((ex - sx)*block_size * dec_block_dim0_offset* sizeof(int));
+	int * block_type = result_type;
+	for(size_t i=sx; i<ex; i++){
+		for(size_t j=sy; j<ey; j++){
+			size_t index = i*num_y + j;
+			decode(comp_data_pos + type_array_offset[index], max_num_block_elements, root, block_type);
+			block_type += max_num_block_elements;
+		}
+	}
+	SZ_ReleaseHuffman(huffmanTree);
+	free(type_array_offset);
+
+	int * type = NULL;
+	float * data_pos = *data;
+	int dec_buffer_size = block_size + 1;
+	float * dec_buffer = (float *) malloc(dec_buffer_size*dec_buffer_size*sizeof(float));
+	memset(dec_buffer, 0, dec_buffer_size*dec_buffer_size*sizeof(float));
+	float * block_data_pos_x = NULL;
+	float * block_data_pos_y = NULL;
+	int block_dim0_offset = dec_buffer_size;
+	// printf("decompression start, %d %d %d, %d %d %d, total unpred %ld\n", sx, sy, sz, ex, ey, ez, total_unpred);
+	// fflush(stdout);
+	float * dec_block_data = (float *) malloc((ex - sx)*block_size * dec_block_dim0_offset*sizeof(float));
+	memset(dec_block_data, 0, (ex - sx)*block_size * dec_block_dim0_offset*sizeof(float));
+	if(use_mean){
+		for(size_t i=sx; i<ex; i++){
+			for(size_t j=sy; j<ey; j++){
+				data_pos = dec_buffer + dec_buffer_size + 1;
+				type = result_type + (i-sx) * block_size * block_size * (ey - sy) +  (j-sy) * block_size * block_size;
+				coeff_index = i*num_y + j;
+				float * block_unpred = unpred_data + unpred_offset[coeff_index];
+				if(indicator[coeff_index]){
+					// decompress by SZ
+					float * block_data_pos;
+					float pred;
+					size_t index = 0;
+					int type_;
+					size_t unpredictable_count = 0;
+					for(size_t ii=0; ii<block_size; ii++){
+						for(size_t jj=0; jj<block_size; jj++){
+							block_data_pos = data_pos + ii*block_dim0_offset + jj;
+							type_ = type[index];
+							if(type_ == 1){
+								*block_data_pos = mean;
+							}
+							else if(type_ == 0){
+								*block_data_pos = block_unpred[unpredictable_count ++];
+							}
+							else{
+								pred = block_data_pos[-1] + block_data_pos[-block_dim0_offset] - block_data_pos[-block_dim0_offset - 1];
+								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+							}
+							index ++;
+						}
+					}
+				}
+				else{
+					// decompress by regression
+					reg_params_pos = reg_params + 3*coeff_index;
+					{
+						float pred;
+						int type_;
+						size_t index = 0;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								type_ = type[index];
+								if (type_ != 0){
+									pred = reg_params_pos[0] * ii + reg_params_pos[1] * jj + reg_params_pos[2];
+									data_pos[ii*block_dim0_offset + jj] = pred + 2 * (type_ - intvRadius) * realPrecision;
+								}
+								else{
+									data_pos[ii*block_dim0_offset + jj] = block_unpred[unpredictable_count ++];
+								}
+								index ++;	
+							}
+						}
+					}
+				}
+
+				// mv data back
+				block_data_pos_x = dec_block_data + (i-sx)*block_size * dec_block_dim0_offset + (j-sy)*block_size;
+				for(int ii=0; ii<block_size; ii++){
+					if(i*block_size + ii >= r1) break;
+					block_data_pos_y = block_data_pos_x;
+					for(int jj=0; jj<block_size; jj++){
+						if(j*block_size + jj >= r2) break;
+						*block_data_pos_y = data_pos[ii*dec_buffer_size + jj];
+						block_data_pos_y ++;
+					}
+					block_data_pos_x += dec_block_dim0_offset;
+				}
+
+			}
+		}
+
+	}
+	else{
+		for(size_t i=sx; i<ex; i++){
+			for(size_t j=sy; j<ey; j++){
+				data_pos = dec_buffer + dec_buffer_size + 1;
+				type = result_type + (i-sx) * block_size * block_size * (ey - sy) +  (j-sy) * block_size * block_size;
+				coeff_index = i*num_y + j;
+				float * block_unpred = unpred_data + unpred_offset[coeff_index];
+				if(indicator[coeff_index]){
+					// decompress by SZ
+					float * block_data_pos;
+					float pred;
+					size_t index = 0;
+					int type_;
+					size_t unpredictable_count = 0;
+					for(size_t ii=0; ii<block_size; ii++){
+						for(size_t jj=0; jj<block_size; jj++){
+							block_data_pos = data_pos + ii*block_dim0_offset + jj;
+							type_ = type[index];
+							if(type_ == 0){
+								*block_data_pos = block_unpred[unpredictable_count ++];
+							}
+							else{
+								pred = block_data_pos[-1] + block_data_pos[-block_dim0_offset] - block_data_pos[-block_dim0_offset - 1];
+								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+							}
+							index ++;
+						}
+					}
+				}
+				else{
+					// decompress by regression
+					reg_params_pos = reg_params + 3*coeff_index;
+					{
+						float pred;
+						int type_;
+						size_t index = 0;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								type_ = type[index];
+								if (type_ != 0){
+									pred = reg_params_pos[0] * ii + reg_params_pos[1] * jj + reg_params_pos[2];
+									data_pos[ii*block_dim0_offset + jj] = pred + 2 * (type_ - intvRadius) * realPrecision;
+								}
+								else{
+									data_pos[ii*block_dim0_offset + jj] = block_unpred[unpredictable_count ++];
+								}
+								index ++;	
+							}
+						}
+					}
+				}
+
+				// mv data back
+				block_data_pos_x = dec_block_data + (i-sx)*block_size * dec_block_dim0_offset + (j-sy)*block_size;
+				for(int ii=0; ii<block_size; ii++){
+					if(i*block_size + ii >= r1) break;
+					block_data_pos_y = block_data_pos_x;
+					for(int jj=0; jj<block_size; jj++){
+						if(j*block_size + jj >= r2) break;
+						*block_data_pos_y = data_pos[ii*dec_buffer_size + jj];
+						block_data_pos_y ++;
+					}
+					block_data_pos_x += dec_block_dim0_offset;
+				}
+			}
+		}
+	}
+	free(unpred_offset);
+	free(reg_params);
+	free(blockwise_unpred_count);
+	free(dec_buffer);
+	free(coeff_result_type);
+
+	free(indicator);
+	free(result_type);
+
+	// extract data
+	int resi_x = s1 % block_size;
+	int resi_y = s2 % block_size;
+	*data = (float*) malloc(sizeof(float)*(e1 - s1) * (e2 - s2));
+	float * final_data_pos = *data;
+	for(int i=0; i<(e1 - s1); i++){
+		float * block_data_pos = dec_block_data + (i+resi_x)*dec_block_dim0_offset + resi_y;
+		for(int j=0; j<(e2 - s2); j++){
+			*(final_data_pos++) = *(block_data_pos++);
+		}
+	}
+	free(dec_block_data);
+}
+// Random-access decompression of a 3D float array: element data is decoded only for the blocks overlapping [s1,e1) x [s2,e2) x [s3,e3) of an r1 x r2 x r3 array, and *data receives a newly malloc'ed (e1-s1)*(e2-s2)*(e3-s3) buffer.
+void decompressDataSeries_float_3D_decompression_given_areas_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3, unsigned char* comp_data){
+	// comp_data is parsed sequentially below; the layout of each section is noted where it is read.
+	// NOTE(review): nothing here validates comp_data or the s/e bounds against r1..r3 -- presumably the caller guarantees both; confirm.
+	// NOTE(review): malloc results are used unchecked throughout this function.
+
+	unsigned char * comp_data_pos = comp_data;
+
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	// number of blocks along each dimension (ceiling division by block_size)
+	size_t num_x, num_y, num_z;
+	num_x = (r1 - 1) / block_size + 1;
+	num_y = (r2 - 1) / block_size + 1;
+	num_z = (r3 - 1) / block_size + 1;
+
+	size_t max_num_block_elements = block_size * block_size * block_size;
+	size_t num_blocks = num_x * num_y * num_z;
+
+	double realPrecision = bytesToDouble(comp_data_pos);
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	//updateQuantizationInfo(intervals);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	// Huffman tree for the per-element quantization codes (2*intervals states); released after the type arrays are decoded.
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+sizeof(int), nodeCount);
+	comp_data_pos += sizeof(int) + tree_size;
+	// use_mean flag (1 byte) followed by the mean value (float)
+	float mean;
+	unsigned char use_mean;
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(float));
+	comp_data_pos += sizeof(float);
+	size_t reg_count = 0;
+	// One flag per block: nonzero = "SZ" neighbour prediction, zero = regression; reg_count counts the zero entries.
+	unsigned char * indicator;
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1;
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator);
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+	// For each of the 4 regression coefficients: precision, radius, Huffman tree, reg_count codes and raw unpredictable values.
+	int coeff_intvRadius[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+	int * coeff_type[4];
+	double precision[4];
+	float * coeff_unpred_data[4];
+	if(reg_count > 0){
+		for(int i=0; i<4; i++){
+			precision[i] = bytesToDouble(comp_data_pos);
+			comp_data_pos += sizeof(double);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+sizeof(int), nodeCount);
+			comp_data_pos += sizeof(int) + tree_size;
+
+			coeff_type[i] = coeff_result_type + i * num_blocks;
+			size_t typeArray_size = bytesToSize(comp_data_pos);
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]);
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (float *) comp_data_pos; // points into comp_data, no copy
+			comp_data_pos += coeff_unpred_count * sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	float last_coefficients[4] = {0.0};
+	int coeff_unpred_data_count[4] = {0};
+	// decompress coefficients: each one is predicted from the previous regression block's value of the same coefficient
+	float * reg_params = (float *) malloc(4*num_blocks*sizeof(float));
+	memset(reg_params, 0, 4*num_blocks*sizeof(float));
+	float * reg_params_pos = reg_params;
+	size_t coeff_index = 0;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]){
+			float pred;
+			int type_;
+			for(int e=0; e<4; e++){
+				type_ = coeff_type[e][coeff_index];
+				if (type_ != 0){
+					pred = last_coefficients[e];
+					last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+				}
+				else{
+					last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+					coeff_unpred_data_count[e] ++;
+				}
+				reg_params_pos[e] = last_coefficients[e];
+			}
+			coeff_index ++;
+		}
+		reg_params_pos += 4; // advanced for every block, so reg_params is indexed by block number
+	}
+
+	//updateQuantizationInfo(intervals);
+	int intvRadius = intervals/2;
+	// total_unpred (size_t), then the compressed per-block unpredictable counts; their prefix sums give each block's start in unpred_data.
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	size_t compressed_blockwise_unpred_count_size;
+	memcpy(&compressed_blockwise_unpred_count_size, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	int * blockwise_unpred_count = NULL;
+	SZ_decompress_args_int32(&blockwise_unpred_count, 0, 0, 0, 0, num_blocks, comp_data_pos, compressed_blockwise_unpred_count_size);
+	comp_data_pos += compressed_blockwise_unpred_count_size;
+	size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	size_t cur_offset = 0;
+	for(size_t i=0; i<num_blocks; i++){
+		unpred_offset[i] = cur_offset;
+		cur_offset += blockwise_unpred_count[i];
+	}
+
+	float * unpred_data = (float *) comp_data_pos; // points into comp_data, no copy
+	comp_data_pos += total_unpred * sizeof(float);
+	// Compressed per-block byte sizes of the encoded type arrays; turned into byte offsets (prefix sums) further below.
+	size_t compressed_type_array_block_size;
+	memcpy(&compressed_type_array_block_size, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	unsigned short * type_array_block_size = NULL;
+	SZ_decompress_args_uint16(&type_array_block_size, 0, 0, 0, 0, num_blocks, comp_data_pos, compressed_type_array_block_size);
+
+	comp_data_pos += compressed_type_array_block_size;
+
+	// block index range [sx,ex) x [sy,ey) x [sz,ez) covering the requested region
+	size_t sx = s1 / block_size;
+	size_t sy = s2 / block_size;
+	size_t sz = s3 / block_size;
+	size_t ex = (e1 - 1) / block_size + 1;
+	size_t ey = (e2 - 1) / block_size + 1;
+	size_t ez = (e3 - 1) / block_size + 1;
+	// Strides of dec_block_data, which holds every covering block in full.
+	size_t dec_block_dim1_offset = (ez - sz)*block_size;
+	size_t dec_block_dim0_offset = dec_block_dim1_offset * (ey - sy)*block_size;
+	unsigned short * type_array_block_size_pos = type_array_block_size;
+	size_t * type_array_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	size_t * type_array_offset_pos = type_array_offset;
+	size_t cur_type_array_offset = 0;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			for(size_t k=0; k<num_z; k++){	
+				*(type_array_offset_pos++) = cur_type_array_offset;
+				cur_type_array_offset += *(type_array_block_size_pos++);
+			}
+		}
+	}
+	free(type_array_block_size);
+	int * result_type = (int *) malloc((ex - sx)*block_size * dec_block_dim0_offset* sizeof(int)); // codes of all covering blocks, block after block
+	int * block_type = result_type;
+	for(size_t i=sx; i<ex; i++){
+		for(size_t j=sy; j<ey; j++){
+			for(size_t k=sz; k<ez; k++){
+				size_t index = i*num_y*num_z + j*num_z + k;
+				decode(comp_data_pos + type_array_offset[index], max_num_block_elements, root, block_type);
+				block_type += max_num_block_elements;
+			}
+		}
+	}
+	SZ_ReleaseHuffman(huffmanTree);
+	free(type_array_offset);
+	// dec_buffer is a (block_size+1)^3 scratch cube; index 0 along each axis is a border that is never written and stays zero from the memset.
+	int * type = NULL;
+	float * data_pos = *data; // NOTE(review): reads the caller's *data before it is assigned; data_pos is reassigned before any dereference
+	int dec_buffer_size = block_size + 1;
+	float * dec_buffer = (float *) malloc(dec_buffer_size*dec_buffer_size*dec_buffer_size*sizeof(float));
+	memset(dec_buffer, 0, dec_buffer_size*dec_buffer_size*dec_buffer_size*sizeof(float));
+	float * block_data_pos_x = NULL;
+	float * block_data_pos_y = NULL;
+	float * block_data_pos_z = NULL;
+	int block_dim0_offset = dec_buffer_size*dec_buffer_size;
+	int block_dim1_offset = dec_buffer_size;
+
+	// printf("decompression start, %d %d %d, %d %d %d, total unpred %ld\n", sx, sy, sz, ex, ey, ez, total_unpred);
+	// fflush(stdout);
+	float * dec_block_data = (float *) malloc((ex - sx)*block_size * dec_block_dim0_offset*sizeof(float));
+	memset(dec_block_data, 0, (ex - sx)*block_size * dec_block_dim0_offset*sizeof(float));
+	if(use_mean){ // variant in which code 1 of an "SZ" block means "value equals mean"
+		for(size_t i=sx; i<ex; i++){
+			for(size_t j=sy; j<ey; j++){
+				for(size_t k=sz; k<ez; k++){
+					data_pos = dec_buffer + dec_buffer_size*dec_buffer_size + dec_buffer_size + 1; // first interior cell of the scratch cube
+					type = result_type + (i-sx) * block_size * block_size * (ey - sy) * block_size * (ez - sz) +  (j-sy) * block_size * block_size * block_size * (ez - sz) + (k-sz) * block_size * block_size * block_size;
+					coeff_index = i*num_y*num_z + j*num_z + k; // linear index of block (i,j,k)
+					float * block_unpred = unpred_data + unpred_offset[coeff_index];
+					if(indicator[coeff_index]){
+						// decompress by SZ: code 1 = mean, code 0 = stored value, otherwise predict from the 7 already-decoded neighbours
+						float * block_data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									block_data_pos = data_pos + ii*block_dim0_offset + jj*block_dim1_offset + kk;
+									type_ = type[index];
+									if(type_ == 1){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = block_unpred[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[-1] + block_data_pos[-block_dim1_offset]+ block_data_pos[-block_dim0_offset] - block_data_pos[-block_dim1_offset - 1]
+												 - block_data_pos[-block_dim0_offset - 1] - block_data_pos[-block_dim0_offset - block_dim1_offset] + block_data_pos[-block_dim0_offset - block_dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+								}
+							}
+						}
+					}
+					else{
+						// decompress by regression: pred = c0*ii + c1*jj + c2*kk + c3, code 0 = stored value
+						reg_params_pos = reg_params + 4*coeff_index;
+						{
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<block_size; ii++){
+								for(size_t jj=0; jj<block_size; jj++){
+									for(size_t kk=0; kk<block_size; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = reg_params_pos[0] * ii + reg_params_pos[1] * jj + reg_params_pos[2] * kk + reg_params_pos[3];
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = block_unpred[unpredictable_count ++];
+										}
+										index ++;	
+									}
+								}
+							}
+						}
+					}
+
+					// copy the block from the scratch cube into dec_block_data, stopping at the r1 x r2 x r3 array boundary
+					block_data_pos_x = dec_block_data + (i-sx)*block_size * dec_block_dim0_offset + (j-sy)*block_size * dec_block_dim1_offset + (k-sz)*block_size;
+					for(int ii=0; ii<block_size; ii++){
+						if(i*block_size + ii >= r1) break;
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							if(j*block_size + jj >= r2) break;
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								if(k*block_size + kk >= r3) break;
+								*block_data_pos_z = data_pos[ii*dec_buffer_size*dec_buffer_size + jj*dec_buffer_size + kk];
+								block_data_pos_z ++;
+							}
+							block_data_pos_y += dec_block_dim1_offset;
+						}
+						block_data_pos_x += dec_block_dim0_offset;
+					}
+
+				}
+			}
+		}
+
+	}
+	else{ // same procedure without the special "mean" code
+		for(size_t i=sx; i<ex; i++){
+			for(size_t j=sy; j<ey; j++){
+				for(size_t k=sz; k<ez; k++){
+					data_pos = dec_buffer + dec_buffer_size*dec_buffer_size + dec_buffer_size + 1; // first interior cell of the scratch cube
+					type = result_type + (i-sx) * block_size * block_size * (ey - sy) * block_size * (ez - sz) +  (j-sy) * block_size * block_size * block_size * (ez - sz) + (k-sz) * block_size * block_size * block_size;
+					coeff_index = i*num_y*num_z + j*num_z + k; // linear index of block (i,j,k)
+					float * block_unpred = unpred_data + unpred_offset[coeff_index];
+					if(indicator[coeff_index]){
+						// decompress by SZ: code 0 = stored value, otherwise predict from the 7 already-decoded neighbours
+						// cur_unpred_count = decompressDataSeries_float_3D_blocked_nonblock_pred(data_pos, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, i, j, k, realPrecision, type, unpred_data);
+						float * block_data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									block_data_pos = data_pos + ii*block_dim0_offset + jj*block_dim1_offset + kk;
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = block_unpred[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[-1] + block_data_pos[-block_dim1_offset]+ block_data_pos[-block_dim0_offset] - block_data_pos[-block_dim1_offset - 1]
+												 - block_data_pos[-block_dim0_offset - 1] - block_data_pos[-block_dim0_offset - block_dim1_offset] + block_data_pos[-block_dim0_offset - block_dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+								}
+							}
+						}
+					}
+					else{
+						// decompress by regression: pred = c0*ii + c1*jj + c2*kk + c3, code 0 = stored value
+						reg_params_pos = reg_params + 4*coeff_index;
+						{
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<block_size; ii++){
+								for(size_t jj=0; jj<block_size; jj++){
+									for(size_t kk=0; kk<block_size; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = reg_params_pos[0] * ii + reg_params_pos[1] * jj + reg_params_pos[2] * kk + reg_params_pos[3];
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = block_unpred[unpredictable_count ++];
+										}
+										index ++;	
+									}
+								}
+							}
+						}
+					}
+					// copy the block from the scratch cube into dec_block_data, stopping at the r1 x r2 x r3 array boundary
+					block_data_pos_x = dec_block_data + (i-sx)*block_size * dec_block_dim0_offset + (j-sy)*block_size * dec_block_dim1_offset + (k-sz)*block_size;
+					for(int ii=0; ii<block_size; ii++){
+						if(i*block_size + ii >= r1) break;
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							if(j*block_size + jj >= r2) break;
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								if(k*block_size + kk >= r3) break;
+								*block_data_pos_z = data_pos[ii*dec_buffer_size*dec_buffer_size + jj*dec_buffer_size + kk];
+								block_data_pos_z ++;
+							}
+							block_data_pos_y += dec_block_dim1_offset;
+						}
+						block_data_pos_x += dec_block_dim0_offset;
+					}
+
+				}
+			}
+		}
+	}
+	free(unpred_offset);
+	free(reg_params);
+	free(blockwise_unpred_count);
+	free(dec_buffer);
+	free(coeff_result_type);
+
+	free(indicator);
+	free(result_type);
+
+	// extract the requested sub-box; resi_* is the offset of the start point inside its first covering block
+	int resi_x = s1 % block_size;
+	int resi_y = s2 % block_size;
+	int resi_z = s3 % block_size;
+	*data = (float*) malloc(sizeof(float)*(e1 - s1) * (e2 - s2) * (e3 - s3));
+	float * final_data_pos = *data;
+	for(int i=0; i<(e1 - s1); i++){
+		for(int j=0; j<(e2 - s2); j++){
+			float * block_data_pos = dec_block_data + (i+resi_x)*dec_block_dim0_offset + (j+resi_y)*dec_block_dim1_offset + resi_z;
+			for(int k=0; k<(e3 - s3); k++){
+				*(final_data_pos++) = *(block_data_pos++);
+			}
+		}
+	}
+	free(dec_block_data);
+
+}
+// Decompresses float data that was compressed in SZ random-access mode.
+// r5..r1 are the full array dimensions, s5..s1 / e5..e1 the start / end point of the wanted region;
+// only the 1D, 2D and 3D cases are decoded. *newData receives a malloc'ed buffer when data is produced.
+// Returns SZ_SCES, or SZ_MERR / SZ_DERR after printing an error message.
+int SZ_decompress_args_randomaccess_float(float** newData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, 
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1, // start point
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1, // end point
+unsigned char* cmpBytes, size_t cmpSize)
+{
+	// Both global state objects are allocated on first use and zeroed on every call.
+	if(confparams_dec==NULL)
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+	memset(confparams_dec, 0, sizeof(sz_params));
+	if(exe_params==NULL)
+		exe_params = (sz_exedata*)malloc(sizeof(sz_exedata));
+	memset(exe_params, 0, sizeof(sz_exedata));
+
+	// Host byte order probe: the first byte of the int value 1 is 1 only on little-endian hosts.
+	int endianProbe = 1;
+	sysEndianType = (*(char*)&endianProbe == 1) ? LITTLE_ENDIAN_SYSTEM : BIG_ENDIAN_SYSTEM;
+
+	confparams_dec->randomAccess = 1;
+
+	int status = SZ_SCES;
+	const size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	size_t inflateHint = dataLength << 2; // 4 bytes per value
+	size_t flatSize = 8+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+	unsigned char* flatBytes;
+
+	// Inputs of exactly these two lengths (4 and 8 being the two possible SZ_SIZE_TYPE values) are parsed as-is.
+	if(cmpSize==8+4+MetaDataByteLength || cmpSize==8+8+MetaDataByteLength)
+	{
+		flatBytes = cmpBytes;
+	}
+	else
+	{
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->szMode!=SZ_TEMPORAL_COMPRESSION)
+			confparams_dec->szMode = (confparams_dec->losslessCompressor!=-1) ? SZ_BEST_COMPRESSION : SZ_BEST_SPEED;
+
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			// SZ_BEST_SPEED: the caller's buffer is parsed directly.
+			flatSize = cmpSize;
+			flatBytes = cmpBytes;
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION || confparams_dec->szMode==SZ_TEMPORAL_COMPRESSION)
+		{
+			// Undo the lossless stage into a new buffer; the size hint is never below MIN_ZLIB_DEC_ALLOMEM_BYTES.
+			if(inflateHint<MIN_ZLIB_DEC_ALLOMEM_BYTES)
+				inflateHint = MIN_ZLIB_DEC_ALLOMEM_BYTES;
+			flatSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &flatBytes, (unsigned long)inflateHint+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);
+		}
+		else
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			return SZ_MERR;
+		}
+	}
+
+	TightDataPointStorageF* tdps;
+	new_TightDataPointStorageF_fromFlatBytes(&tdps, flatBytes, flatSize);
+
+	const int dim = computeDimension(r5,r4,r3,r2,r1);
+	if(tdps->isLossless)
+	{
+		// Lossless payload: dataLength floats start at offset 4+MetaDataByteLength+SZ_SIZE_TYPE of the flat bytes.
+		const int floatSize = sizeof(float);
+		*newData = (float*)malloc(floatSize*dataLength);
+		unsigned char* src = flatBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, src, dataLength*floatSize);
+		}
+		else
+		{
+			size_t idx = 0;
+			while(idx<dataLength)
+			{
+				(*newData)[idx] = bytesToFloat(src);
+				src += floatSize;
+				idx++;
+			}
+		}
+	}
+	else if(confparams_dec->randomAccess == 0 && (s1+s2+s3+s4+s5>0 || (r5-e5+r4-e4+r3-e3+r2-e2+r1-e1 > 0)))
+	{
+		// randomAccess was set to 1 above, so it can only be 0 here if the stream parser cleared it (presumably from the stream's metadata).
+		printf("Error: you specified the random access mode for decompression, but the compressed data were generate in the non-random-access way.!\n");
+		status = SZ_DERR;
+	}
+	else
+	{
+		// Dispatch on the number of dimensions reported by computeDimension.
+		switch(dim)
+		{
+		case 1:
+			decompressDataSeries_float_1D_decompression_given_areas_with_blocked_regression(newData, r1, s1, e1, tdps->raBytes);
+			break;
+		case 2:
+			decompressDataSeries_float_2D_decompression_given_areas_with_blocked_regression(newData, r2, r1, s2, s1, e2, e1, tdps->raBytes);
+			break;
+		case 3:
+			decompressDataSeries_float_3D_decompression_given_areas_with_blocked_regression(newData, r3, r2, r1, s3, s2, s1, e3, e2, e1, tdps->raBytes);
+			status = SZ_SCES;
+			break;
+		case 4:
+			printf("Error: random access mode doesn't support 4D yet, but only 3D.\n");
+			status = SZ_DERR;
+			break;
+		default:
+			printf("Error: currently support only at most 4 dimensions!\n");
+			status = SZ_DERR;
+			break;
+		}
+	}
+
+	free_TightDataPointStorageF2(tdps);
+	// flatBytes is released except in SZ_BEST_SPEED mode or when cmpSize equals 8+MetaDataByteLength+SZ_SIZE_TYPE.
+	if(confparams_dec->szMode!=SZ_BEST_SPEED && cmpSize!=8+MetaDataByteLength+exe_params->SZ_SIZE_TYPE)
+		free(flatBytes);
+	return status;
+}
+#endif
diff --git a/deps/SZ/sz/src/szd_float_pwr.c b/deps/SZ/sz/src/szd_float_pwr.c
new file mode 100644
index 0000000000000000000000000000000000000000..46b8f1d7e719a73b7f412921a54cd59241fad41c
--- /dev/null
+++ b/deps/SZ/sz/src/szd_float_pwr.c
@@ -0,0 +1,1528 @@
+/**
+ *  @file szd_float_pwr.c
+ *  @author Sheng Di, Dingwen Tao, Xin Liang, Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang
+ *  @date Feb., 2019
+ *  @brief Decompression routines for float data compressed with point-wise relative (pwr) error bounds.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageF.h"
+#include "CompressElement.h"
+#include "sz.h"
+#include "Huffman.h"
+#include "sz_float_pwr.h"
+#include "utility.h"
+//#include "rw.h"
+//
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wchar-subscripts"
+// Reconstructs dataSeriesLength floats from tdps into a newly malloc'ed *data; the precision is re-read from tdps->pwrErrBoundBytes every tdps->segment_size elements.
+// Elements whose type code is 0 are rebuilt from stored bytes/bits plus medianValue; any other code yields the previous value plus (code - intvRadius) * interval.
+void decompressDataSeries_float_1D_pwr(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	unsigned char tmpPrecBytes[4] = {0}; //used when needing to convert bytes to float values
+	unsigned char* bp = tdps->pwrErrBoundBytes; // cursor over the 2-byte precision entries
+	size_t i, j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+								// in resiMidBits, p is to track the
+								// byte_index of resiMidBits, l is for
+								// leadNum
+	unsigned char* leadNum;
+	float interval = 0;// = (float)tdps->realPrecision*2;
+	
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// Huffman-decode one quantization code per element into type[].
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	//sdi:Debug
+	//writeUShortData(type, dataSeriesLength, "decompressStateBytes.sb");
+
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+	
+	memset(preBytes, 0, 4);
+
+	size_t curByteIndex = 0;
+	int reqLength = 0, reqBytesLength = 0, resiBitsLength = 0, resiBits = 0; 
+	unsigned char leadingNum;	
+	float medianValue, exactData, predValue = 0, realPrecision = 0;
+	
+	medianValue = tdps->medianValue;
+	
+	int type_, updateReqLength = 0;
+	for (i = 0; i < dataSeriesLength; i++) 
+	{
+		if(i%tdps->segment_size==0) // start of a new segment: load its precision
+		{
+			tmpPrecBytes[0] = *(bp++);
+			tmpPrecBytes[1] = *(bp++);
+			tmpPrecBytes[2] = 0; // only two bytes are stored per segment; the other two are zero
+			tmpPrecBytes[3] = 0;
+			realPrecision = bytesToFloat(tmpPrecBytes);
+			interval = realPrecision*2;
+			updateReqLength = 0; // reqLength is recomputed lazily on the segment's first exact value
+		}
+		
+		type_ = type[i];
+		switch (type_) {
+		case 0:
+			// compute resiBits: read resiBitsLength bits from residualMidBits at bit position k (byte index p)
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;	
+				updateReqLength = 1;	
+			}
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) { // the bits are combined from byte p and byte p+1
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data: leadingNum bytes are copied from the previous exact value, the rest come from exactMidBytes plus the residual bits
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+			
+			exactData = bytesToFloat(curBytes);
+			(*data)[i] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+			break;
+		default:
+			//predValue = 2 * (*data)[i-1] - (*data)[i-2];
+			predValue = (*data)[i-1]; // NOTE(review): reads index i-1, so element 0 is presumably always type 0 -- confirm
+			(*data)[i] = predValue + (type_-exe_params->intvRadius)*interval;
+			break;
+		}
+		//printf("%.30G\n",(*data)[i]);
+	}
+	free(leadNum);
+	free(type);
+	return;
+}
+/* Expands the 2-byte entries of tdps->pwrErrBoundBytes into a malloc'ed R1*R2 float table (row-major); blockSize is not used by this function. */
+float* extractRealPrecision_2D_float(size_t R1, size_t R2, int blockSize, TightDataPointStorageF* tdps)
+{
+	const unsigned char* src = tdps->pwrErrBoundBytes;
+	unsigned char precBytes[4] = {0}; /* only bytes 0 and 1 are ever overwritten; bytes 2 and 3 stay zero */
+	float* table = (float*)malloc(sizeof(float)*R1*R2);
+	float* out = table;
+	size_t row, col;
+	for(row = 0; row < R1; row++)
+	{
+		for(col = 0; col < R2; col++, src += 2)
+		{
+			precBytes[0] = src[0];
+			precBytes[1] = src[1];
+			*out++ = bytesToFloat(precBytes);
+		}
+	}
+	return table;
+}
+
+void decompressDataSeries_float_2D_pwr(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	//printf("tdps->intervals=%d, exe_params->intvRadius=%d\n", tdps->intervals, exe_params->intvRadius);
+	
+	size_t j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+	// in resiMidBits, p is to track the
+	// byte_index of resiMidBits, l is for
+	// leadNum
+	size_t dataSeriesLength = r1*r2;
+	//	printf ("%d %d\n", r1, r2);
+
+	unsigned char* leadNum;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+
+	memset(preBytes, 0, 4);
+
+	size_t curByteIndex = 0;
+	int reqLength, reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	float medianValue, exactData, realPrecision;
+	int type_;	
+	float pred1D, pred2D;
+	size_t ii, jj, II = 0, JJ = 0, updateReqLength = 1;
+
+	int blockSize = computeBlockEdgeSize_2D(tdps->segment_size);
+	size_t R1 = 1+(r1-1)/blockSize;
+	size_t R2 = 1+(r2-1)/blockSize;		
+	float* pwrErrBound = extractRealPrecision_2D_float(R1, R2, blockSize, tdps);
+
+	realPrecision = pwrErrBound[0];	
+	computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+	reqBytesLength = reqLength/8;
+	resiBitsLength = reqLength%8;
+
+	/* Process Row-0, data 0 */
+
+	// compute resiBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data
+	memset(curBytes, 0, 4);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToFloat(curBytes);
+	(*data)[0] = exactData + medianValue;
+	memcpy(preBytes,curBytes,4);
+
+	/* Process Row-0, data 1 */
+	type_ = type[1];
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// compute resiBits		
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToFloat(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,4);
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{	
+		if(jj%blockSize==0)
+		{
+			II = 0;
+			JJ++;
+			realPrecision = pwrErrBound[JJ];
+			updateReqLength = 0;			
+		}
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];						
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+			
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */		
+		if(ii%blockSize==0)
+			II++;
+		JJ = 0;
+		realPrecision = pwrErrBound[II*R2+JJ];				
+		updateReqLength = 0;
+		
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+			
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			
+			if(jj%blockSize==0)
+				JJ++;
+			realPrecision = pwrErrBound[II*R2+JJ];			
+			updateReqLength = 0;			
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;				
+					updateReqLength = 1;
+				}				
+				
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+	}
+
+	free(pwrErrBound);
+	free(leadNum);
+	free(type);
+	return;
+}
+
+float* extractRealPrecision_3D_float(size_t R1, size_t R2, size_t R3, int blockSize, TightDataPointStorageF* tdps)
+{
+	/* Expand the packed per-block bounds (2 stored bytes each) into an R1*R2*R3 float array; callers free() it. */
+	const unsigned char* src = tdps->pwrErrBoundBytes; /* read cursor over the packed bytes */
+	unsigned char buf[4] = {0}; /* buf[2] and buf[3] are never written, so they stay zero */
+	float* bounds = (float*)malloc(sizeof(float)*R1*R2*R3);
+	float* dst = bounds; /* write cursor: entries are produced in the same order as the flat index grows */
+	size_t a, b, c;
+	(void)blockSize; /* accepted for interface compatibility; not needed here */
+	for(a=0;a<R1;a++)
+	{
+		for(b=0;b<R2;b++)
+		{
+			for(c=0;c<R3;c++)
+			{
+				buf[0] = *src++;
+				buf[1] = *src++;
+				*dst++ = bytesToFloat(buf);
+			}
+		}
+	}
+	return bounds;
+}
+
+void decompressDataSeries_float_3D_pwr(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t j, k = 0, p = 0, l = 0; // k: running bit offset into tdps->residualMidBits,
+	// p: index of the residualMidBits byte currently being read, l: next leadNum entry.
+	// Purpose: rebuild an r1 x r2 x r3 float series (r3 varies fastest) into a newly
+	// malloc'ed *data; the error bound (realPrecision) is looked up per block in pwrErrBound.
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	// r23 is the index distance between the same position in two consecutive layers (kk).
+	unsigned char* leadNum;
+	// leadNum is produced by the call below and released with free() at the end of this function.
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+	// *data is handed back to the caller. NOTE(review): neither malloc result below is checked.
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// Decode one code per element into type[]: 0 selects the "exact value" path, non-zero the prediction path.
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+	// preBytes: bytes of the previously decoded exact value; curBytes: bytes of the value being rebuilt.
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+
+	memset(preBytes, 0, 4);
+	size_t curByteIndex = 0; // read position in tdps->exactMidBytes
+	int reqLength, reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;
+	float medianValue, exactData, realPrecision;
+	int type_;
+	float pred1D, pred2D, pred3D;
+	size_t ii, jj, kk, II = 0, JJ = 0, KK = 0, updateReqLength = 1; // II/JJ/KK: block coordinates used to index pwrErrBound
+
+	int blockSize = computeBlockEdgeSize_3D(tdps->segment_size);
+	size_t R1 = 1+(r1-1)/blockSize; // number of blocks along each dimension (rounded up)
+	size_t R2 = 1+(r2-1)/blockSize;		
+	size_t R3 = 1+(r3-1)/blockSize;
+	size_t R23 = R2*R3;
+	float* pwrErrBound = extractRealPrecision_3D_float(R1, R2, R3, blockSize, tdps);
+
+	realPrecision = pwrErrBound[0];	
+	computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+	reqBytesLength = reqLength/8; // whole bytes per exact value
+	resiBitsLength = reqLength%8; // leftover bits per exact value
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0 (always decoded as an exact value; type[0] is not consulted) */
+	// compute resiBits: extract resiBitsLength bits from residualMidBits starting at bit offset k
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8; // bit position inside the current byte
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) { // bits are taken from byte p only; p is not advanced
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) { // bits are taken from byte p and from the following byte
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0: bits are taken from byte p, then p moves to the next byte
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength; // advance the bit offset past the bits just consumed
+	}
+
+	// recover the exact data: leading bytes are reused from preBytes, the rest come from exactMidBytes
+	memset(curBytes, 0, 4);
+	leadingNum = leadNum[l++]; // how many leading bytes are copied from the previous exact value
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength)); // left-align the residual bits in their byte
+		curBytes[reqBytesLength] = resiByte;
+	}
+	exactData = bytesToFloat(curBytes);
+	(*data)[0] = exactData + medianValue; // medianValue was passed by pointer to computeReqLength_float above
+	memcpy(preBytes,curBytes,4); // remember these bytes for the next exact value
+
+	/* Process Row-0, data 1 (predicted from data 0 alone) */
+	pred1D = (*data)[0];
+
+	type_ = type[1];
+	if (type_ != 0)
+	{
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision; // prediction plus quantized offset
+	}
+	else
+	{
+		// compute resiBits (same extraction scheme as for data 0)
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data (same byte reconstruction as for data 0)
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToFloat(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,4);
+	}
+	/* Process Row-0, data 2 --> data r3-1 (predicted from the two previous values in the row) */
+	for (jj = 2; jj < r3; jj++)
+	{
+		if(jj%blockSize==0) // entering the next block along the row: switch to its error bound
+		{
+			KK = 0;//dimension 1 (top)
+			II = 0;//dimension 2 (mid)
+			JJ++;
+			realPrecision = pwrErrBound[JJ];
+			updateReqLength = 0;			
+		}		
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2]; // linear extrapolation from the two previous values
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits; the required length is recomputed first if the error bound changed
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+	}
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 (still layer 0) */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */		
+		if(ii%blockSize==0)
+			II++;		
+		JJ = 0;
+		realPrecision = pwrErrBound[II*R3+JJ];
+		updateReqLength = 0;		
+
+		index = ii*r3;
+		
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r3];			
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+			
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process row-ii data 1 --> r3-1 (2D prediction: left + above - above-left) */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+
+			if(jj%blockSize==0)
+				JJ++;
+			realPrecision = pwrErrBound[II*R3+JJ];			
+			updateReqLength = 0;			
+			
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];				
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits; NOTE(review): updateReqLength is reset on every iteration above, so this always reruns
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;				
+					updateReqLength = 1;
+				}
+
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0 (predicted from the same position in the previous layer) */
+		index = kk*r23;		
+		if(kk%blockSize==0)
+			KK++;
+		II = 0;
+		JJ = 0;
+
+		realPrecision = pwrErrBound[KK*R23];			
+		updateReqLength = 0;			
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r23];			
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 (2D prediction: left + previous layer - previous layer's left) */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+
+			if(jj%blockSize==0)
+				JJ++;
+
+			realPrecision = pwrErrBound[KK*R23+JJ];			
+			updateReqLength = 0;			
+			
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];			
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;				
+					updateReqLength = 1;
+				}
+			
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 (2D prediction: above + previous layer - previous layer's above) */
+			index = kk*r23 + ii*r3;
+			
+			if(ii%blockSize==0)
+				II++;
+			JJ = 0;
+			
+			realPrecision = pwrErrBound[KK*R23+II*R3];			
+			updateReqLength = 0;						
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];				
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;				
+					updateReqLength = 1;
+				}
+
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 (3D prediction from the seven already-decoded neighbours) */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				if(jj%blockSize==0)
+					JJ++;
+
+				realPrecision = pwrErrBound[KK*R23+II*R3+JJ];			
+				updateReqLength = 0;				
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];					
+					(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					if(updateReqLength==0)
+					{
+						computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+						reqBytesLength = reqLength/8;
+						resiBitsLength = reqLength%8;				
+						updateReqLength = 1;
+					}
+				
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 4);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+					
+					exactData = bytesToFloat(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,4);
+				}
+			}
+		}
+
+	}
+
+	free(pwrErrBound); // scratch buffers only; *data stays allocated for the caller
+	free(leadNum);
+	free(type);
+	return;
+}
+
+void decompressDataSeries_float_1D_pwrgroup(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps) 
+{
+	float *posGroups, *negGroups, *groups;
+	float pos_01_group = 0, neg_01_group = 0; // zero-initialized: the default case below may read them before an exact value was stored
+	int *posFlags, *negFlags;
+	// Purpose: rebuild a 1D float series into a newly malloc'ed *data; each value is predicted from the last value decoded in its group.
+	updateQuantizationInfo(tdps->intervals);
+	
+	unsigned char* leadNum;
+	double interval; // quantization step of the current group (twice its error bound)
+	
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// Decode one code per element into type[]: 0 selects the "exact value" path, non-zero the prediction path.
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+	
+	createRangeGroups_float(&posGroups, &negGroups, &posFlags, &negFlags);
+	
+	float realGroupPrecision;
+	float realPrecision = tdps->realPrecision;
+	char* groupID = decompressGroupIDArray(tdps->pwrErrBoundBytes, tdps->dataSeriesLength);
+	
+	//note that the groupID values here are [1,2,3,....,18] or [-1,-2,...,-18]
+	
+	double* groupErrorBounds = generateGroupErrBounds(confparams_dec->errorBoundMode, realPrecision, confparams_dec->pw_relBoundRatio);
+	exe_params->intvRadius = generateGroupMaxIntervalCount(groupErrorBounds); // NOTE(review): overwritten below; call kept in case it has side effects
+		
+	size_t nbBins = (size_t)(1/confparams_dec->pw_relBoundRatio + 0.5);
+	if(nbBins%2==1) // keep the bin count even
+		nbBins++;
+	exe_params->intvRadius = nbBins;
+
+	unsigned char preBytes[4]; // bytes of the previously decoded exact value
+	unsigned char curBytes[4]; // bytes of the exact value being rebuilt
+	
+	memset(preBytes, 0, 4);
+
+	size_t curByteIndex = 0; // read position in tdps->exactMidBytes
+	int reqLength, reqBytesLength = 0, resiBitsLength = 0, resiBits; 
+	unsigned char leadingNum;	
+	float medianValue, exactData, curValue, predValue;
+	
+	medianValue = tdps->medianValue;
+	
+	size_t i, j, k = 0, p = 0, l = 0; // k: running bit offset into tdps->residualMidBits,
+							// p: index of the residualMidBits byte currently
+							// being read, l: index of the next leadNum entry
+							// to consume
+							
+	int type_, updateReqLength = 0;
+	signed char rawGrpID = 0, indexGrpID = 0; // explicitly signed: plain char may be unsigned, which would disable the negative-group branches
+	for (i = 0; i < dataSeriesLength; i++) 
+	{
+		rawGrpID = (signed char)groupID[i];
+		
+		if(rawGrpID >= 2)
+		{
+			groups = posGroups;
+			indexGrpID = rawGrpID - 2;
+		}
+		else if(rawGrpID <= -2)
+		{
+			groups = negGroups;
+			indexGrpID = -rawGrpID - 2;		}
+		else if(rawGrpID == 1)
+		{
+			groups = &pos_01_group;
+			indexGrpID = 0;
+		}
+		else //rawGrpID == -1
+		{
+			groups = &neg_01_group;
+			indexGrpID = 0;			
+		}
+		
+		type_ = type[i];
+		switch (type_) {
+		case 0:
+			// compute resiBits; the required length is computed once, on the first exact value
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;	
+				updateReqLength = 1;	
+			}
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8; // bit position inside the current byte
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) { // bits are taken from byte p only; p is not advanced
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) { // bits are taken from byte p and from the following byte
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0: bits are taken from byte p, then p moves to the next byte
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength; // advance the bit offset past the bits just consumed
+			}
+
+			// recover the exact data: leading bytes are reused from preBytes, the rest come from exactMidBytes
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength)); // left-align the residual bits
+				curBytes[reqBytesLength] = resiByte;
+			}
+			
+			exactData = bytesToFloat(curBytes);
+			exactData = exactData + medianValue;
+			(*data)[i] = exactData;
+			memcpy(preBytes,curBytes,4);
+			
+			groups[indexGrpID] = exactData; // becomes the predictor for the next value of this group
+			
+			break;
+		default:
+			predValue = groups[indexGrpID]; //Here, groups[indexGrpID] is the previous value.
+			realGroupPrecision = groupErrorBounds[indexGrpID];
+			interval = realGroupPrecision*2;		
+			
+			curValue = predValue + (type_-exe_params->intvRadius)*interval;
+			
+			//groupNum = computeGroupNum_float(curValue);
+			// A reconstructed value whose sign contradicts its group ID is clamped to zero.
+			if((curValue>0&&rawGrpID<0)||(curValue<0&&rawGrpID>0))
+				curValue = 0;
+			//else
+			//{
+			//	realGrpID = fabs(rawGrpID)-2;
+			//	if(groupNum<realGrpID)
+			//		curValue = rawGrpID>0?pow(2,realGrpID):-pow(2,realGrpID);
+			//	else if(groupNum>realGrpID)
+			//		curValue = rawGrpID>0?pow(2,groupNum):-pow(2,groupNum);				
+			//}	
+				
+			(*data)[i] = curValue;
+			groups[indexGrpID] = curValue;
+			break;		
+		}
+	}	
+	// Release every scratch buffer; *data stays allocated for the caller.
+	free(leadNum);
+	free(type);
+	
+	free(posGroups);
+	free(negGroups);
+	free(posFlags);
+	free(negFlags);
+	free(groupErrorBounds);
+	free(groupID);
+}
+
+void decompressDataSeries_float_1D_pwr_pre_log(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps) {
+	/* Decompress the 1D series, which the loop below handles as base-2 logarithms, and
+	 * map every value back with exp2(). Values below tdps->minLogValue become 0; when a
+	 * sign stream is present (pwrErrBoundBytes_size > 0) the flagged values are negated. */
+	decompressDataSeries_float_1D(data, dataSeriesLength, NULL, tdps);
+	const float cutoff = tdps->minLogValue;
+	const int hasSigns = tdps->pwrErrBoundBytes_size > 0;
+	unsigned char* signBytes = NULL;
+	// One sign byte per element, stored ZSTD-compressed in pwrErrBoundBytes.
+	if(hasSigns)
+		sz_lossless_decompress(ZSTD_COMPRESSOR, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signBytes, dataSeriesLength);
+
+	float* values = *data;
+	for(size_t n=0; n<dataSeriesLength; n++){
+		float v = values[n];
+		if(v < cutoff) v = 0;
+		else v = exp2(v);
+		if(hasSigns && signBytes[n]) v = -v;
+		values[n] = v;
+	}
+	if(hasSigns) free(signBytes);
+}
+
+void decompressDataSeries_float_2D_pwr_pre_log(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps) {
+	/* Decompress the r1 x r2 grid, which the loop below handles as base-2 logarithms, and
+	 * map every value back with exp2(). Values below tdps->minLogValue become 0; when a
+	 * sign stream is present (pwrErrBoundBytes_size > 0) the flagged values are negated. */
+	const size_t total = r1 * r2;
+	decompressDataSeries_float_2D(data, r1, r2, NULL, tdps);
+	const float cutoff = tdps->minLogValue;
+	const int hasSigns = tdps->pwrErrBoundBytes_size > 0;
+	unsigned char* signBytes = NULL;
+	// One sign byte per element, stored ZSTD-compressed in pwrErrBoundBytes.
+	if(hasSigns)
+		sz_lossless_decompress(ZSTD_COMPRESSOR, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signBytes, total);
+
+	float* values = *data;
+	for(size_t n=0; n<total; n++){
+		float v = values[n];
+		if(v < cutoff) v = 0;
+		else v = exp2(v);
+		if(hasSigns && signBytes[n]) v = -v;
+		values[n] = v;
+	}
+	if(hasSigns) free(signBytes);
+}
+
+void decompressDataSeries_float_3D_pwr_pre_log(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps) {
+	/* Decompress the r1 x r2 x r3 volume, which the loop below handles as base-2 logarithms,
+	 * and map every value back with exp2(). Values below tdps->minLogValue become 0; when a
+	 * sign stream is present (pwrErrBoundBytes_size > 0) the flagged values are negated. */
+	const size_t total = r1 * r2 * r3;
+	decompressDataSeries_float_3D(data, r1, r2, r3, NULL, tdps);
+	const float cutoff = tdps->minLogValue;
+	const int hasSigns = tdps->pwrErrBoundBytes_size > 0;
+	unsigned char* signBytes = NULL;
+	// One sign byte per element, stored ZSTD-compressed in pwrErrBoundBytes.
+	if(hasSigns)
+		sz_lossless_decompress(ZSTD_COMPRESSOR, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signBytes, total);
+	float* values = *data;
+	for(size_t n=0; n<total; n++){
+		float v = values[n];
+		if(v < cutoff) v = 0;
+		else v = exp2(v);
+		if(hasSigns && signBytes[n]) v = -v;
+		values[n] = v;
+	}
+	if(hasSigns) free(signBytes);
+}
+
+
+void decompressDataSeries_float_1D_pwr_pre_log_MSST19(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps) 
+{
+	/* Post-process the MSST19 1D result: non-negative values below tdps->minLogValue are
+	 * flushed to 0 and, when a sign stream is present (pwrErrBoundBytes_size > 0), the sign
+	 * bit of every flagged value is set. The bit is set via memcpy because writing through
+	 * a uint32_t* that aliases a float violates strict aliasing. */
+	decompressDataSeries_float_1D_MSST19(data, dataSeriesLength, tdps);
+	float threshold = tdps->minLogValue;
+
+	if(tdps->pwrErrBoundBytes_size > 0){
+		unsigned char * signs = NULL;
+		// One sign byte per element, stored ZSTD-compressed in pwrErrBoundBytes.
+		sz_lossless_decompress(ZSTD_COMPRESSOR, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signs, dataSeriesLength);
+		for(size_t i=0; i<dataSeriesLength; i++){
+			if((*data)[i] < threshold && (*data)[i] >= 0){
+				(*data)[i] = 0;
+				continue;
+			}
+			if(signs[i]){
+				uint32_t bits;
+				memcpy(&bits, &(*data)[i], sizeof(bits));
+				bits |= 0x80000000u;
+				memcpy(&(*data)[i], &bits, sizeof(bits));
+			}
+		}
+		free(signs);
+	}
+	else{
+		for(size_t i=0; i<dataSeriesLength; i++){
+			if((*data)[i] < threshold) (*data)[i] = 0;
+		}
+	}
+}
+
+void decompressDataSeries_float_2D_pwr_pre_log_MSST19(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps) {
+	/* Post-process the MSST19 result for an r1 x r2 grid: non-negative values below
+	 * tdps->minLogValue are flushed to 0 and, when a sign stream is present
+	 * (pwrErrBoundBytes_size > 0), the sign bit of every flagged value is set.
+	 * The bit is set via memcpy because writing through a uint32_t* that aliases
+	 * a float violates strict aliasing. */
+	size_t dataSeriesLength = r1 * r2;
+	decompressDataSeries_float_2D_MSST19(data, r1, r2, tdps);
+	float threshold = tdps->minLogValue;
+
+	if(tdps->pwrErrBoundBytes_size > 0){
+		unsigned char * signs = NULL;
+		// One sign byte per element, stored ZSTD-compressed in pwrErrBoundBytes.
+		sz_lossless_decompress(ZSTD_COMPRESSOR, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signs, dataSeriesLength);
+		for(size_t i=0; i<dataSeriesLength; i++){
+			if((*data)[i] < threshold && (*data)[i] >= 0){
+				(*data)[i] = 0;
+				continue;
+			}
+			if(signs[i]){
+				uint32_t bits;
+				memcpy(&bits, &(*data)[i], sizeof(bits));
+				bits |= 0x80000000u;
+				memcpy(&(*data)[i], &bits, sizeof(bits));
+			}
+		}
+		free(signs);
+	}
+	else{
+		for(size_t i=0; i<dataSeriesLength; i++){
+			if((*data)[i] < threshold) (*data)[i] = 0;
+		}
+	}
+}
+
+void decompressDataSeries_float_3D_pwr_pre_log_MSST19(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps) {
+	/* Post-process the MSST19 result for an r1 x r2 x r3 volume: non-negative values below
+	 * tdps->minLogValue are flushed to 0 and, when a sign stream is present
+	 * (pwrErrBoundBytes_size > 0), the sign bit of every flagged value is set. The bit is set
+	 * via memcpy because writing through a uint32_t* that aliases a float violates strict aliasing. */
+	size_t dataSeriesLength = r1 * r2 * r3;
+	decompressDataSeries_float_3D_MSST19(data, r1, r2, r3, tdps);
+	float threshold = tdps->minLogValue;
+
+	if(tdps->pwrErrBoundBytes_size > 0){
+		unsigned char * signs = NULL;
+		// One sign byte per element, stored ZSTD-compressed in pwrErrBoundBytes.
+		sz_lossless_decompress(ZSTD_COMPRESSOR, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signs, dataSeriesLength);
+		for(size_t i=0; i<dataSeriesLength; i++){
+			if((*data)[i] < threshold && (*data)[i] >= 0) {
+				(*data)[i] = 0;
+				continue;
+			}
+			if(signs[i]) {
+				uint32_t bits;
+				memcpy(&bits, &(*data)[i], sizeof(bits));
+				bits |= 0x80000000u;
+				memcpy(&(*data)[i], &bits, sizeof(bits));
+			}
+		}
+		free(signs);
+	}
+	else{
+		for(size_t i=0; i<dataSeriesLength; i++){
+			if((*data)[i] < threshold) (*data)[i] = 0;
+		}
+	}
+}
+
+#pragma GCC diagnostic pop
diff --git a/deps/SZ/sz/src/szd_float_ts.c b/deps/SZ/sz/src/szd_float_ts.c
new file mode 100644
index 0000000000000000000000000000000000000000..b096a933ba3bfc0e10ebb30b36bd5aaf0ed08363
--- /dev/null
+++ b/deps/SZ/sz/src/szd_float_ts.c
@@ -0,0 +1,115 @@
+/**
+ *  @file szd_float_ts.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief 
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "szd_float.h"
+#include "TightDataPointStorageF.h"
+#include "sz.h"
+#include "Huffman.h"
+#include "szd_float_ts.h"
+
+void decompressDataSeries_float_1D_ts(float** data, size_t dataSeriesLength, float* hist_data, TightDataPointStorageF* tdps) 
+{ // Rebuilds dataSeriesLength floats into a freshly malloc'ed *data, then copies the result into hist_data.
+	float* lastSnapshotData = hist_data; // read as the predictor when szMode is SZ_TEMPORAL_COMPRESSION
+	updateQuantizationInfo(tdps->intervals);
+	size_t i, j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+								// in resiMidBits, p is to track the
+								// byte_index of resiMidBits, l is for
+								// leadNum
+	unsigned char* leadNum; // per exact value: how many leading bytes are copied from the previous exact value
+	double interval = tdps->realPrecision*2; // step between adjacent quantization codes
+	
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); // one code per element; 0 marks a value stored exactly
+	
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[4]; // bytes of the previous exact value
+	unsigned char curBytes[4]; // bytes of the exact value being rebuilt
+	
+	memset(preBytes, 0, 4);
+
+	size_t curByteIndex = 0; // read position in tdps->exactMidBytes
+	int reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	float medianValue, exactData, predValue = 0;
+	
+	reqBytesLength = tdps->reqLength/8; // whole bytes kept per exact value
+	resiBitsLength = tdps->reqLength%8; // leftover bits kept per exact value
+	medianValue = tdps->medianValue;
+	
+	int type_;
+	for (i = 0; i < dataSeriesLength; i++) {
+		type_ = type[i];
+		switch (type_) {
+		case 0:
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) { // all bits come from byte p; p is not advanced
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) { // bits are split across byte p and byte p+1
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{ // bits come from byte p without shifting; p then moves to the next byte
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data	
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) { // residual bits go left-aligned into the byte after the whole bytes
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+			
+			exactData = bytesToFloat(curBytes);
+			(*data)[i] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+			break;
+		default:
+			// Predict from the previous snapshot; outside temporal mode predValue keeps its initial 0.
+			if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+				predValue = lastSnapshotData[i];
+			(*data)[i] = predValue + (type_-exe_params->intvRadius)*interval;
+			break;
+		}
+		//printf("%.30G\n",(*data)[i]);
+	}
+	
+	memcpy(hist_data, (*data), dataSeriesLength*sizeof(float)); // NOTE(review): written unconditionally, so hist_data must be non-NULL - confirm callers
+	
+	free(leadNum);
+	free(type);
+	return;
+}
diff --git a/deps/SZ/sz/src/szd_int16.c b/deps/SZ/sz/src/szd_int16.c
new file mode 100644
index 0000000000000000000000000000000000000000..b751c3f3dc968686ee3ffded667b1532914a0f31
--- /dev/null
+++ b/deps/SZ/sz/src/szd_int16.c
@@ -0,0 +1,924 @@
+/**
+ *  @file szd_int16.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief 
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_int16.h"
+#include "Huffman.h"
+#include "utility.h"
+
+/**
+ * Decompress an SZ-compressed byte stream into an int16_t array returned through newData.
+ * r5..r1 are the dimension sizes handed to computeDataLength() and computeDimension().
+ * @param newData output; set to the decompressed values (malloc'ed here for lossless data)
+ * @param cmpBytes compressed input of cmpSize bytes; this function never frees it
+ * @return SZ_SCES on success, SZ_MERR for an unexpected szMode, SZ_DERR for more than 4 dimensions
+ * */
+int SZ_decompress_args_int16(int16_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	
+	size_t targetUncompressSize = dataLength <<2; //i.e., *4
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(int16_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;	
+	// Set only when szTmpBytes comes from sz_lossless_decompress(); on every other path it
+	// aliases the caller's cmpBytes and must not be freed here.
+	int ownsTmpBytes = 0;
+		
+	if(cmpSize!=4+2+4+MetaDataByteLength && cmpSize!=4+2+8+MetaDataByteLength)
+	{
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+		else
+			confparams_dec->szMode = SZ_BEST_SPEED;		
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;	
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			ownsTmpBytes = 1;
+		}
+		else
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes;
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	int dim = computeDimension(r5,r4,r3,r2,r1);	
+	int intSize = sizeof(int16_t);
+	if(tdps->isLossless)
+	{
+		*newData = (int16_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*intSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = bytesToInt16_bigEndian(p);
+		}		
+	}
+	else if(confparams_dec->sol_ID==SZ_Transpose)
+	{
+		getSnapshotData_int16_1D(newData,dataLength,tdps, errBoundMode);		
+	}
+	else //confparams_dec->sol_ID==SZ
+	{
+		if (dim == 1)
+			getSnapshotData_int16_1D(newData,r1,tdps, errBoundMode);
+		else
+		if (dim == 2)
+			getSnapshotData_int16_2D(newData,r2,r1,tdps, errBoundMode);
+		else
+		if (dim == 3)
+			getSnapshotData_int16_3D(newData,r3,r2,r1,tdps, errBoundMode);
+		else
+		if (dim == 4)
+			getSnapshotData_int16_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+		else
+		{
+			printf("Error: currently support only at most 4 dimensions!\n");
+			status = SZ_DERR;
+		}		
+	}	
+
+	free_TightDataPointStorageI2(tdps);
+	if(ownsTmpBytes)
+		free(szTmpBytes);
+	return status;
+}
+
+
+void decompressDataSeries_int16_1D(int16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps) 
+{
+	/* Rebuild a 1D int16 series from tdps into a freshly malloc'ed *data.
+	 *
+	 * data             - output; *data receives dataSeriesLength values
+	 * dataSeriesLength - number of elements to reconstruct
+	 * tdps             - parsed compressed stream (codes, exact bytes, precision)
+	 *
+	 * Every element carries a Huffman-decoded code: 0 means the value is read from
+	 * tdps->exactDataBytes, any other code is an offset, in quantization bins, from
+	 * the previously reconstructed element. */
+	updateQuantizationInfo(tdps->intervals);
+	const double binWidth = tdps->realPrecision*2;
+
+	*data = (int16_t*)malloc(sizeof(int16_t)*dataSeriesLength);
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	// Decode one quantization code per element.
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	const int16_t base = tdps->minValue;
+	const int nBytes = tdps->exactByteSize;
+	unsigned char* cursor = tdps->exactDataBytes;	// next unread exact value
+	unsigned char scratch[8] = {0,0,0,0,0,0,0,0};
+
+	// A negative shift is treated as fatal.
+	const int shift = computeRightShiftBits(nBytes, SZ_INT16);
+	if(shift<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(0);
+	}
+
+	int16_t* out = *data;
+	for (size_t pos = 0; pos < dataSeriesLength; pos++) {
+		const int code = codes[pos];
+		if (code == 0) {
+			// Exact value: big-endian bytes, shifted down, then rebased on minValue.
+			memcpy(scratch, cursor, nBytes);
+			cursor += nBytes;
+			int16_t raw = bytesToInt16_bigEndian(scratch);
+			raw = (uint16_t)raw >> shift;
+			out[pos] = raw + base;
+			continue;
+		}
+		// Predicted value: previous element plus the quantized offset, clamped to
+		// the [SZ_INT16_MIN, SZ_INT16_MAX] range.
+		// NOTE(review): relies on element 0 having code 0; otherwise pos-1 wraps.
+		long prior = out[pos-1];
+		long guess = prior + (code-exe_params->intvRadius)*binWidth;
+		if(guess < SZ_INT16_MIN)
+			out[pos] = SZ_INT16_MIN;
+		else if(guess < SZ_INT16_MAX)
+			out[pos] = guess;
+		else
+			out[pos] = SZ_INT16_MAX;
+	}
+
+	free(codes);
+}
+
+void decompressDataSeries_int16_2D(int16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	// Rebuilds an r1 x r2 int16 grid from tdps into a freshly malloc'ed *data.
+	
+	size_t dataSeriesLength = r1*r2;
+	// Row-major layout: element (ii,jj) lives at index ii*r2+jj.
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int16_t*)malloc(sizeof(int16_t)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); // one code per element; 0 marks a value stored exactly
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int16_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; // next unread exact value
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT16);	// NOTE(review): unlike the 1D variant, a negative result is not checked here
+	
+	long pred1D, pred2D, tmp;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0 */
+
+	// recover the exact data (element 0 is always read as exact; type[0] is not consulted)
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToInt16_bigEndian(curBytes);
+	exactData = (uint16_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 */
+	int type_ = type[1]; // NOTE(review): index 1 is handled unconditionally, which presumes r2 >= 2 - confirm
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+			(*data)[1] = tmp;
+		else if(tmp < SZ_INT16_MIN)
+			(*data)[1] = SZ_INT16_MIN;
+		else
+			(*data)[1] = SZ_INT16_MAX;
+			
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt16_bigEndian(curBytes);
+		exactData = (uint16_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				// linear extrapolation from the two previous elements
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_INT16_MIN)
+				(*data)[jj] = SZ_INT16_MIN;
+			else
+				(*data)[jj] = SZ_INT16_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		// element directly above in the previous row
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT16_MIN)
+				(*data)[index] = SZ_INT16_MIN;
+			else
+				(*data)[index] = SZ_INT16_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1]; // left + above - above-left
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void decompressDataSeries_int16_3D(int16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3; // elements per layer
+// Rebuilds an r1 x r2 x r3 int16 volume into a freshly malloc'ed *data; element (kk,ii,jj) lives at index kk*r23+ii*r3+jj.
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int16_t*)malloc(sizeof(int16_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); // one code per element; 0 marks a value stored exactly
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int16_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; // next unread exact value
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT16);	// NOTE(review): unlike the 1D variant, a negative result is not checked here
+	
+	long pred1D, pred2D, pred3D, tmp;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+
+	// recover the exact data (element 0 is always read as exact; type[0] is not consulted)
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToInt16_bigEndian(curBytes);
+	exactData = (uint16_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 */
+	pred1D = (*data)[0];
+
+	int type_ = type[1]; // NOTE(review): index 1 is handled unconditionally, which presumes r3 >= 2 - confirm
+	if (type_ != 0)
+	{
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+			(*data)[1] = tmp;
+		else if(tmp < SZ_INT16_MIN)
+			(*data)[1] = SZ_INT16_MIN;
+		else
+			(*data)[1] = SZ_INT16_MAX;
+	}
+	else
+	{
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt16_bigEndian(curBytes);
+		exactData = (uint16_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2]; // linear extrapolation from the two previous elements
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_INT16_MIN)
+				(*data)[jj] = SZ_INT16_MIN;
+			else
+				(*data)[jj] = SZ_INT16_MAX;		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r3;
+		pred1D = (*data)[index-r3]; // element directly above in the previous row
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT16_MIN)
+				(*data)[index] = SZ_INT16_MIN;
+			else
+				(*data)[index] = SZ_INT16_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1]; // left + above - above-left
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;
+		pred1D = (*data)[index-r23]; // same position in the previous layer
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT16_MIN)
+				(*data)[index] = SZ_INT16_MIN;
+			else
+				(*data)[index] = SZ_INT16_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1]; // left + previous layer - previous layer's left
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3]; // above + previous layer - previous layer's above
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1]; // inclusion-exclusion over the 7 already-decoded neighbours
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT16_MIN)
+						(*data)[index] = SZ_INT16_MIN;
+					else
+						(*data)[index] = SZ_INT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+
+void decompressDataSeries_int16_4D(int16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int16_t*)malloc(sizeof(int16_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int16_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT16);	
+	
+	int type_;
+
+	long pred1D, pred2D, pred3D, tmp;
+	size_t ii, jj, kk, ll;
+	size_t index;
+
+	for (ll = 0; ll < r1; ll++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		index = ll*r234;
+
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt16_bigEndian(curBytes);
+		exactData = (uint16_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1 */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT16_MIN)
+				(*data)[index] = SZ_INT16_MIN;
+			else
+				(*data)[index] = SZ_INT16_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1 */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0 */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1*/
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT16_MIN)
+						(*data)[index] = SZ_INT16_MIN;
+					else
+						(*data)[index] = SZ_INT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0*/
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT16_MIN)
+						(*data)[index] = SZ_INT16_MIN;
+					else
+						(*data)[index] = SZ_INT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0 */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT16_MIN)
+						(*data)[index] = SZ_INT16_MIN;
+					else
+						(*data)[index] = SZ_INT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+						if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+							(*data)[index] = tmp;
+						else if(tmp < SZ_INT16_MIN)
+							(*data)[index] = SZ_INT16_MIN;
+						else
+							(*data)[index] = SZ_INT16_MAX;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = bytesToInt16_bigEndian(curBytes);
+						exactData = (uint16_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+/* 1D int16 snapshot: fill with the single stored value when tdps->allSameData is set, otherwise run the full decoder. errBoundMode is not read. */
+void getSnapshotData_int16_1D(int16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t pos = 0;
+	if (!tdps->allSameData) {
+		decompressDataSeries_int16_1D(data, dataSeriesLength, tdps);
+		return;
+	}
+	const int16_t fill = bytesToInt16_bigEndian(tdps->exactDataBytes); /* the one value shared by every element */
+	*data = (int16_t*)malloc(sizeof(int16_t)*dataSeriesLength);
+	while (pos < dataSeriesLength)
+		(*data)[pos++] = fill;
+}
+
+/* 2D (r1 x r2) int16 snapshot: fill with the single stored value when tdps->allSameData is set, otherwise run the full decoder. errBoundMode is not read. */
+void getSnapshotData_int16_2D(int16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode) 
+{
+	size_t pos = 0, total = r1*r2;
+	if (!tdps->allSameData) {
+		decompressDataSeries_int16_2D(data, r1, r2, tdps);
+		return;
+	}
+	const int16_t fill = bytesToInt16_bigEndian(tdps->exactDataBytes); /* the one value shared by every element */
+	*data = (int16_t*)malloc(sizeof(int16_t)*total);
+	while (pos < total)
+		(*data)[pos++] = fill;
+}
+
+/* 3D (r1 x r2 x r3) int16 snapshot: fill with the single stored value when tdps->allSameData is set, otherwise run the full decoder. errBoundMode is not read. */
+void getSnapshotData_int16_3D(int16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t pos = 0, total = r1*r2*r3;
+	if (!tdps->allSameData) {
+		decompressDataSeries_int16_3D(data, r1, r2, r3, tdps);
+		return;
+	}
+	const int16_t fill = bytesToInt16_bigEndian(tdps->exactDataBytes); /* the one value shared by every element */
+	*data = (int16_t*)malloc(sizeof(int16_t)*total);
+	while (pos < total)
+		(*data)[pos++] = fill;
+}
+
+/* 4D (r1 x r2 x r3 x r4) int16 snapshot: fill with the single stored value when tdps->allSameData is set, otherwise run the full decoder. errBoundMode is not read. */
+void getSnapshotData_int16_4D(int16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t pos = 0, total = r1*r2*r3*r4;
+	if (!tdps->allSameData) {
+		decompressDataSeries_int16_4D(data, r1, r2, r3, r4, tdps);
+		return;
+	}
+	const int16_t fill = bytesToInt16_bigEndian(tdps->exactDataBytes); /* the one value shared by every element */
+	*data = (int16_t*)malloc(sizeof(int16_t)*total);
+	while (pos < total)
+		(*data)[pos++] = fill;
+}
diff --git a/deps/SZ/sz/src/szd_int32.c b/deps/SZ/sz/src/szd_int32.c
new file mode 100644
index 0000000000000000000000000000000000000000..b5f31b09aba44de0a1cc1687cb07bd405f2136b1
--- /dev/null
+++ b/deps/SZ/sz/src/szd_int32.c
@@ -0,0 +1,789 @@
+/**
+ *  @file szd_int32.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief Decompression of SZ-compressed int32_t data (1D to 4D).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_int32.h"
+#include "Huffman.h"
+#include "utility.h"
+
+/**
+ * Decompress an SZ-compressed int32_t array of up to four dimensions.
+ *
+ * @param newData  output; on success *newData points to a malloc'ed array holding the decoded values
+ * @param r5,r4,r3,r2,r1  dimension sizes handed to computeDataLength()/computeDimension(); r1 varies fastest
+ * @param cmpBytes compressed stream; only read here and never freed by this function
+ * @param cmpSize  length of cmpBytes in bytes
+ * @return status SUCCESSFUL (SZ_SCES) or not (other error codes)
+ * */
+int SZ_decompress_args_int32(int32_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	size_t targetUncompressSize = dataLength <<2; //i.e., *4
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(int32_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;
+	int ownsTmpBytes = 0; //1 only when szTmpBytes was produced by sz_lossless_decompress() and must be freed here
+	//Streams of exactly these two sizes skip the lossless-decompression step and are parsed directly.
+	if(cmpSize!=4+4+4+MetaDataByteLength && cmpSize!=4+4+8+MetaDataByteLength)
+	{
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+		else
+			confparams_dec->szMode = SZ_BEST_SPEED;		
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;	
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			ownsTmpBytes = 1;
+		}
+		else
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes;
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	int dim = computeDimension(r5,r4,r3,r2,r1);	
+	int intSize = sizeof(int32_t);
+	if(tdps->isLossless)
+	{
+		*newData = (int32_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*intSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = bytesToInt32_bigEndian(p);
+		}		
+	}
+	else if (dim == 1)
+		getSnapshotData_int32_1D(newData,r1,tdps, errBoundMode);
+	else
+	if (dim == 2)
+		getSnapshotData_int32_2D(newData,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 3)
+		getSnapshotData_int32_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 4)
+		getSnapshotData_int32_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageI2(tdps);
+	//Free only what was allocated above; testing the global szMode here could free the caller's cmpBytes when szMode was stale.
+	if(ownsTmpBytes)
+		free(szTmpBytes);
+	return status;
+}
+
+
+/*
+ * Decode a 1D int32_t series described by tdps into *data, which is allocated here with malloc.
+ * Each element carries a Huffman-decoded code: 0 means the value was stored verbatim,
+ * any other code means "previous element plus (code - intvRadius) quantization bins".
+ */
+void decompressDataSeries_int32_1D(int32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	const double binWidth = tdps->realPrecision*2;
+
+	*data = (int32_t*)malloc(sizeof(int32_t)*dataSeriesLength);
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	/* Recover one quantization code per element from the Huffman-coded type array. */
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	const int32_t base = tdps->minValue;
+	const int bytesPerExact = tdps->exactByteSize;
+	unsigned char* exactCursor = tdps->exactDataBytes;
+	unsigned char scratch[8] = {0,0,0,0,0,0,0,0};
+
+	const int shift = computeRightShiftBits(bytesPerExact, SZ_INT32);
+	if(shift<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(0);
+	}
+
+	size_t k;
+	for (k = 0; k < dataSeriesLength; k++)
+	{
+		const int code = codes[k];
+		if (code != 0)
+		{
+			/* Predicted element: previous value shifted by a whole number of bins. */
+			const int32_t prev = (*data)[k-1];
+			(*data)[k] = prev + (code-exe_params->intvRadius)*binWidth;
+			continue;
+		}
+
+		/* Exact element: stored as bytesPerExact big-endian bytes, relative to base. */
+		memcpy(scratch, exactCursor, bytesPerExact);
+		exactCursor += bytesPerExact;
+
+		int32_t stored = bytesToInt32_bigEndian(scratch);
+		stored = (uint32_t)stored >> shift;
+		(*data)[k] = stored + base;
+	}
+
+	/* The code array is only needed while decoding. */
+	free(codes);
+	return;
+}
+/* Rebuild an r1 x r2 int32_t array from tdps into *data (malloc'ed here); elements are addressed as index = ii*r2 + jj. */
+void decompressDataSeries_int32_2D(int32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	//printf("tdps->intervals=%d, exe_params->intvRadius=%d\n", tdps->intervals, exe_params->intvRadius);
+	
+	size_t dataSeriesLength = r1*r2;
+	//	printf ("%d %d\n", r1, r2);
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int32_t*)malloc(sizeof(int32_t)*dataSeriesLength);
+	// type[k] is the Huffman-decoded code of element k: 0 = value stored exactly, non-zero = quantized offset from a prediction.
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int32_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	// Scratch buffer: each exact value is copied into its first exactByteSize bytes before being converted.
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	// NOTE(review): unlike decompressDataSeries_int32_1D, a negative rightShiftBits is not rejected here.
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT32);	
+	
+	int32_t pred1D, pred2D;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0 */
+	// The first element is always read from the exact-data stream; type[0] is not consulted.
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToInt32_bigEndian(curBytes);
+	exactData = (uint32_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 */
+	int type_ = type[1]; // NOTE(review): element 1 is accessed unconditionally; assumes dataSeriesLength >= 2 - confirm
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0]; // prediction: the previous element
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt32_bigEndian(curBytes);
+		exactData = (uint32_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2]; // prediction: linear extrapolation from the two previous elements
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2]; // prediction: the element directly above (previous row)
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1]; // prediction: left + above - above-left
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+/* Rebuild an r1 x r2 x r3 int32_t array from tdps into *data (malloc'ed here); elements are addressed as index = kk*r2*r3 + ii*r3 + jj. */
+void decompressDataSeries_int32_3D(int32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+//	printf ("%d %d %d\n", r1, r2, r3);
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int32_t*)malloc(sizeof(int32_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// type[k] is the Huffman-decoded code of element k: 0 = value stored exactly, non-zero = quantized offset from a prediction.
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int32_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	// Scratch buffer: each exact value is copied into its first exactByteSize bytes before being converted.
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	// NOTE(review): unlike decompressDataSeries_int32_1D, a negative rightShiftBits is not rejected here.
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT32);	
+	
+	int32_t pred1D, pred2D, pred3D;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+	// The first element is always read from the exact-data stream; type[0] is not consulted.
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToInt32_bigEndian(curBytes);
+	exactData = (uint32_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 */
+	pred1D = (*data)[0]; // prediction: the previous element
+
+	int type_ = type[1]; // NOTE(review): element 1 is accessed unconditionally; assumes dataSeriesLength >= 2 - confirm
+	if (type_ != 0)
+	{
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt32_bigEndian(curBytes);
+		exactData = (uint32_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2]; // prediction: linear extrapolation from the two previous elements
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r3;
+		pred1D = (*data)[index-r3]; // prediction: the element directly above (previous row)
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1]; // prediction: left + above - above-left
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;
+		pred1D = (*data)[index-r23]; // prediction: the same position in the previous layer
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1]; // prediction: left + previous layer - previous layer's left
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3]; // prediction: above + previous layer - previous layer's above
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+				// pred3D = (left + above + previous layer) - (the three neighbours one step back in two dimensions) + (the neighbour one step back in all three)
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+/* Rebuild an r1 x r2 x r3 x r4 int32_t array from tdps into *data (malloc'ed here); each of the r1 blocks of r2*r3*r4 elements is decoded on its own, starting from an exact value. */
+void decompressDataSeries_int32_4D(int32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int32_t*)malloc(sizeof(int32_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// type[k] is the Huffman-decoded code of element k: 0 = value stored exactly, non-zero = quantized offset from a prediction.
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int32_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	// Scratch buffer: each exact value is copied into its first exactByteSize bytes before being converted.
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	// NOTE(review): unlike decompressDataSeries_int32_1D, a negative rightShiftBits is not rejected here.
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT32);	
+	
+	int type_;
+
+	int32_t pred1D, pred2D, pred3D;
+	size_t ii, jj, kk, ll;
+	size_t index;
+	// Element address: index = ll*r234 + kk*r34 + ii*r4 + jj. No prediction below reaches back into block ll-1.
+	for (ll = 0; ll < r1; ll++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		index = ll*r234;
+		// The first element of every block is always read from the exact-data stream; its type code is not consulted.
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt32_bigEndian(curBytes);
+		exactData = (uint32_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1 */
+		index = ll*r234+1; // NOTE(review): accessed unconditionally; assumes each block holds at least 2 elements - confirm
+
+		pred1D = (*data)[index-1]; // prediction: the previous element
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1 */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2]; // prediction: linear extrapolation from the two previous elements
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0 */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4]; // prediction: the element directly above (previous row)
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1*/
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1]; // prediction: left + above - above-left
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0*/
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34]; // prediction: the same position in the previous layer
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1]; // prediction: left + previous layer - previous layer's left
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0 */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4]; // prediction: above + previous layer - previous layer's above
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+					// pred3D = (left + above + previous layer) - (the three neighbours one step back in two dimensions)
+					//          + (the neighbour one step back in all three)
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = bytesToInt32_bigEndian(curBytes);
+						exactData = (uint32_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+/* 1D int32 snapshot: fill with the single stored value when tdps->allSameData is set, otherwise run the full decoder. errBoundMode is not read. */
+void getSnapshotData_int32_1D(int32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t pos = 0;
+	if (!tdps->allSameData) {
+		decompressDataSeries_int32_1D(data, dataSeriesLength, tdps);
+		return;
+	}
+	const int32_t fill = bytesToInt32_bigEndian(tdps->exactDataBytes); /* the one value shared by every element */
+	*data = (int32_t*)malloc(sizeof(int32_t)*dataSeriesLength);
+	while (pos < dataSeriesLength)
+		(*data)[pos++] = fill;
+}
+
+/* 2D (r1 x r2) int32 snapshot: fill with the single stored value when tdps->allSameData is set, otherwise run the full decoder. errBoundMode is not read. */
+void getSnapshotData_int32_2D(int32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode) 
+{
+	size_t pos = 0, total = r1*r2;
+	if (!tdps->allSameData) {
+		decompressDataSeries_int32_2D(data, r1, r2, tdps);
+		return;
+	}
+	const int32_t fill = bytesToInt32_bigEndian(tdps->exactDataBytes); /* the one value shared by every element */
+	*data = (int32_t*)malloc(sizeof(int32_t)*total);
+	while (pos < total)
+		(*data)[pos++] = fill;
+}
+
+/* 3D (r1 x r2 x r3) int32 snapshot: fill with the single stored value when tdps->allSameData is set, otherwise run the full decoder. errBoundMode is not read. */
+void getSnapshotData_int32_3D(int32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t pos = 0, total = r1*r2*r3;
+	if (!tdps->allSameData) {
+		decompressDataSeries_int32_3D(data, r1, r2, r3, tdps);
+		return;
+	}
+	const int32_t fill = bytesToInt32_bigEndian(tdps->exactDataBytes); /* the one value shared by every element */
+	*data = (int32_t*)malloc(sizeof(int32_t)*total);
+	while (pos < total)
+		(*data)[pos++] = fill;
+}
+
+/* 4D (r1 x r2 x r3 x r4) int32 snapshot: fill with the single stored value when tdps->allSameData is set, otherwise run the full decoder. errBoundMode is not read. */
+void getSnapshotData_int32_4D(int32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t pos = 0, total = r1*r2*r3*r4;
+	if (!tdps->allSameData) {
+		decompressDataSeries_int32_4D(data, r1, r2, r3, r4, tdps);
+		return;
+	}
+	const int32_t fill = bytesToInt32_bigEndian(tdps->exactDataBytes); /* the one value shared by every element */
+	*data = (int32_t*)malloc(sizeof(int32_t)*total);
+	while (pos < total)
+		(*data)[pos++] = fill;
+}
diff --git a/deps/SZ/sz/src/szd_int64.c b/deps/SZ/sz/src/szd_int64.c
new file mode 100644
index 0000000000000000000000000000000000000000..07a054f54a196f31fe6e9b3ab1eafc532cdad4cf
--- /dev/null
+++ b/deps/SZ/sz/src/szd_int64.c
@@ -0,0 +1,789 @@
+/**
+ *  @file szd_int64.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief Decompression routines for SZ-compressed int64_t data (1D to 4D).
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_int64.h"
+#include "Huffman.h"
+#include "utility.h"
+
+/**
+ * Decompress an SZ-encoded int64 array.
+ *
+ * Unless cmpSize matches one of the fixed sizes tested below, the input is
+ * probed with is_lossless_compressed_data() and, if a lossless layer is
+ * detected, unwrapped through sz_lossless_decompress(). The resulting bytes
+ * are parsed into a TightDataPointStorageI and expanded according to the
+ * dimension count reported by computeDimension(r5,r4,r3,r2,r1).
+ *
+ * @param newData  out: receives the decompressed values (not assigned when
+ *                 an error status is returned)
+ * @param r5       dimension size, forwarded to computeDataLength/computeDimension
+ * @param r4       dimension size (see r5)
+ * @param r3       dimension size (see r5)
+ * @param r2       dimension size (see r5)
+ * @param r1       dimension size (see r5)
+ * @param cmpBytes compressed input; this function never frees it
+ * @param cmpSize  number of bytes in cmpBytes
+ * @return SZ_SCES on success, SZ_MERR for an unexpected szMode, SZ_DERR when
+ *         the data has more than 4 dimensions
+ */
+int SZ_decompress_args_int64(int64_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+
+	/* Size hint for the lossless decompressor. The isLossless branch below
+	 * reads dataLength*sizeof(int64_t) payload bytes out of the unwrapped
+	 * buffer, so the hint has to cover 8 bytes per element (it used to
+	 * cover only 4). */
+	size_t targetUncompressSize = dataLength * sizeof(int64_t);
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(int64_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;
+	/* Non-zero only when szTmpBytes came from sz_lossless_decompress() and
+	 * therefore must be released here. Tracking this explicitly avoids
+	 * consulting the global szMode, which is not updated on the
+	 * fixed-size path and could lead to free()ing the caller's cmpBytes. */
+	int ownsTmpBytes = 0;
+
+	if(cmpSize!=4+8+4+MetaDataByteLength && cmpSize!=4+8+8+MetaDataByteLength)
+	{
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+		else
+			confparams_dec->szMode = SZ_BEST_SPEED;
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			/* no lossless layer: parse the caller's bytes in place */
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES;
+			/* the extra 4+MetaDataByteLength+SZ_SIZE_TYPE accounts for the header that precedes the payload */
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);
+			ownsTmpBytes = 1;
+		}
+		else
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}
+	}
+	else
+		szTmpBytes = cmpBytes;
+
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	int dim = computeDimension(r5,r4,r3,r2,r1);
+	int intSize = sizeof(int64_t);
+	if(tdps->isLossless)
+	{
+		/* values are stored verbatim (big-endian) right after the header */
+		*newData = (int64_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*intSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = bytesToInt64_bigEndian(p);
+		}
+	}
+	else if (dim == 1)
+		getSnapshotData_int64_1D(newData,r1,tdps, errBoundMode);
+	else
+	if (dim == 2)
+		getSnapshotData_int64_2D(newData,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 3)
+		getSnapshotData_int64_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 4)
+		getSnapshotData_int64_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageI2(tdps);
+	if(ownsTmpBytes)
+		free(szTmpBytes);
+	return status;
+}
+
+
+/**
+ * Rebuild a 1D int64 series from its Huffman-coded quantization codes and
+ * its stream of verbatim ("exact") values.
+ *
+ * A code of 0 means the element is taken from tdps->exactDataBytes; any
+ * other code reconstructs the element from the previous output element plus
+ * (code - exe_params->intvRadius) * 2 * tdps->realPrecision.
+ *
+ * @param data             out: receives the malloc'ed result array
+ * @param dataSeriesLength number of elements to decode
+ * @param tdps             parsed compressed stream
+ */
+void decompressDataSeries_int64_1D(int64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	const double step = tdps->realPrecision*2;
+
+	int64_t* out = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+	*data = out;
+
+	/* one quantization code per element */
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	const int64_t base = tdps->minValue;
+	const int width = tdps->exactByteSize;
+	unsigned char* cursor = tdps->exactDataBytes;
+	unsigned char scratch[8] = {0,0,0,0,0,0,0,0};
+
+	const int shift = computeRightShiftBits(width, SZ_INT64);
+	if(shift<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(0);
+	}
+
+	size_t pos;
+	for (pos = 0; pos < dataSeriesLength; pos++) {
+		const int code = codes[pos];
+		if (code == 0) {
+			/* verbatim value: width bytes, decoded big-endian, then shifted down */
+			memcpy(scratch, cursor, width);
+			cursor += width;
+			int64_t raw = bytesToInt64_bigEndian(scratch);
+			raw = (uint64_t)raw >> shift;
+			out[pos] = raw + base;
+		} else {
+			/* predicted from the previous element; the sum is formed in double */
+			int64_t previous = out[pos-1];
+			out[pos] = previous + (code-exe_params->intvRadius)*step;
+		}
+	}
+	free(codes);
+	return;
+}
+
+void decompressDataSeries_int64_2D(int64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	//printf("tdps->intervals=%d, exe_params->intvRadius=%d\n", tdps->intervals, exe_params->intvRadius);
+	
+	size_t dataSeriesLength = r1*r2;
+	//	printf ("%d %d\n", r1, r2);
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int64_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT64);	
+	
+	int64_t pred1D, pred2D;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0 */
+
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToInt64_bigEndian(curBytes);
+	exactData = (uint64_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 */
+	int type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt64_bigEndian(curBytes);
+		exactData = (uint64_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void decompressDataSeries_int64_3D(int64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+//	printf ("%d %d %d\n", r1, r2, r3);
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int64_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT64);	
+	
+	int64_t pred1D, pred2D, pred3D;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToInt64_bigEndian(curBytes);
+	exactData = (uint64_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 */
+	pred1D = (*data)[0];
+
+	int type_ = type[1];
+	if (type_ != 0)
+	{
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt64_bigEndian(curBytes);
+		exactData = (uint64_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt64_bigEndian(curBytes);
+					exactData = (uint64_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+
+void decompressDataSeries_int64_4D(int64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int64_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT64);	
+	
+	int type_;
+
+	int64_t pred1D, pred2D, pred3D;
+	size_t ii, jj, kk, ll;
+	size_t index;
+
+	for (ll = 0; ll < r1; ll++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		index = ll*r234;
+
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt64_bigEndian(curBytes);
+		exactData = (uint64_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1 */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1 */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0 */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1*/
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt64_bigEndian(curBytes);
+					exactData = (uint64_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0*/
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt64_bigEndian(curBytes);
+					exactData = (uint64_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0 */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt64_bigEndian(curBytes);
+					exactData = (uint64_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = bytesToInt64_bigEndian(curBytes);
+						exactData = (uint64_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+/**
+ * Materialize the decompressed 1D int64 series described by tdps.
+ *
+ * When tdps->allSameData is set, a buffer of dataSeriesLength elements is
+ * allocated and every slot is set to the single value decoded (big-endian)
+ * from tdps->exactDataBytes. Otherwise the call is forwarded to
+ * decompressDataSeries_int64_1D().
+ *
+ * @param data             out: receives the output buffer; on the all-same
+ *                         path it is left NULL when malloc() fails
+ * @param dataSeriesLength number of elements to produce
+ * @param tdps             parsed compressed stream
+ * @param errBoundMode     not read by this function
+ */
+void getSnapshotData_int64_1D(int64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{	
+	size_t i;
+
+	if (tdps->allSameData) {
+		int64_t value = bytesToInt64_bigEndian(tdps->exactDataBytes);
+		*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+		if (*data == NULL)	/* never write through a failed allocation */
+			return;
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_int64_1D(data, dataSeriesLength, tdps);
+	}
+}
+
+/**
+ * Materialize the decompressed 2D (r1 x r2) int64 array described by tdps.
+ *
+ * When tdps->allSameData is set, r1*r2 elements are allocated and filled with
+ * the single value decoded (big-endian) from tdps->exactDataBytes. Otherwise
+ * the call is forwarded to decompressDataSeries_int64_2D().
+ *
+ * @param data         out: receives the output buffer; on the all-same path
+ *                     it is left NULL when malloc() fails
+ * @param r1           first dimension size
+ * @param r2           second dimension size
+ * @param tdps         parsed compressed stream
+ * @param errBoundMode not read by this function
+ */
+void getSnapshotData_int64_2D(int64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode) 
+{
+	size_t i;
+	size_t dataSeriesLength = r1*r2;
+	if (tdps->allSameData) {
+		int64_t value = bytesToInt64_bigEndian(tdps->exactDataBytes);
+		*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+		if (*data == NULL)	/* never write through a failed allocation */
+			return;
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_int64_2D(data, r1, r2, tdps);
+	}
+}
+
+/**
+ * Materialize the decompressed 3D (r1 x r2 x r3) int64 array described by tdps.
+ *
+ * When tdps->allSameData is set, r1*r2*r3 elements are allocated and filled
+ * with the single value decoded (big-endian) from tdps->exactDataBytes.
+ * Otherwise the call is forwarded to decompressDataSeries_int64_3D().
+ *
+ * @param data         out: receives the output buffer; on the all-same path
+ *                     it is left NULL when malloc() fails
+ * @param r1,r2,r3     dimension sizes
+ * @param tdps         parsed compressed stream
+ * @param errBoundMode not read by this function
+ */
+void getSnapshotData_int64_3D(int64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t i;
+	size_t dataSeriesLength = r1*r2*r3;
+	if (tdps->allSameData) {
+		int64_t value = bytesToInt64_bigEndian(tdps->exactDataBytes);
+		*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+		if (*data == NULL)	/* never write through a failed allocation */
+			return;
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_int64_3D(data, r1, r2, r3, tdps);
+	}
+}
+
+/**
+ * Materialize the decompressed 4D (r1 x r2 x r3 x r4) int64 array described
+ * by tdps.
+ *
+ * When tdps->allSameData is set, r1*r2*r3*r4 elements are allocated and
+ * filled with the single value decoded (big-endian) from
+ * tdps->exactDataBytes. Otherwise the call is forwarded to
+ * decompressDataSeries_int64_4D().
+ *
+ * @param data         out: receives the output buffer; on the all-same path
+ *                     it is left NULL when malloc() fails
+ * @param r1,r2,r3,r4  dimension sizes
+ * @param tdps         parsed compressed stream
+ * @param errBoundMode not read by this function
+ */
+void getSnapshotData_int64_4D(int64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t i;
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	if (tdps->allSameData) {
+		int64_t value = bytesToInt64_bigEndian(tdps->exactDataBytes);
+		*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+		if (*data == NULL)	/* never write through a failed allocation */
+			return;
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_int64_4D(data, r1, r2, r3, r4, tdps);
+	}
+}
diff --git a/deps/SZ/sz/src/szd_int8.c b/deps/SZ/sz/src/szd_int8.c
new file mode 100644
index 0000000000000000000000000000000000000000..850b4595b7501e8651c43efa3b48d77b5f4f12eb
--- /dev/null
+++ b/deps/SZ/sz/src/szd_int8.c
@@ -0,0 +1,913 @@
+/**
+ *  @file szd_int8.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief Decompression routines for SZ-compressed int8_t data (1D to 4D).
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_int8.h"
+#include "Huffman.h"
+#include "utility.h"
+
+/**
+ * Decompress an SZ-encoded int8 array.
+ *
+ * Unless cmpSize matches one of the fixed sizes tested below, the input is
+ * probed with is_lossless_compressed_data() and, if a lossless layer is
+ * detected, unwrapped through sz_lossless_decompress(). The resulting bytes
+ * are parsed into a TightDataPointStorageI and expanded according to the
+ * dimension count reported by computeDimension(r5,r4,r3,r2,r1).
+ *
+ * @param newData  out: receives the decompressed values (not assigned when
+ *                 an error status is returned)
+ * @param r5       dimension size, forwarded to computeDataLength/computeDimension
+ * @param r4       dimension size (see r5)
+ * @param r3       dimension size (see r5)
+ * @param r2       dimension size (see r5)
+ * @param r1       dimension size (see r5)
+ * @param cmpBytes compressed input; this function never frees it
+ * @param cmpSize  number of bytes in cmpBytes
+ * @return SZ_SCES on success, SZ_MERR for an unexpected szMode, SZ_DERR when
+ *         the data has more than 4 dimensions
+ */
+int SZ_decompress_args_int8(int8_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+
+	/* Size hint for the lossless decompressor: 4 bytes per element, which
+	 * is more than the one byte per element the isLossless branch reads. */
+	size_t targetUncompressSize = dataLength <<2; //i.e., *4
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(int8_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;
+	/* Non-zero only when szTmpBytes came from sz_lossless_decompress() and
+	 * therefore must be released here. Tracking this explicitly avoids
+	 * consulting the global szMode, which is not updated on the
+	 * fixed-size path and could lead to free()ing the caller's cmpBytes. */
+	int ownsTmpBytes = 0;
+
+	if(cmpSize!=4+1+4+MetaDataByteLength && cmpSize!=4+1+8+MetaDataByteLength)
+	{
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+		else
+			confparams_dec->szMode = SZ_BEST_SPEED;
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			/* no lossless layer: parse the caller's bytes in place */
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES;
+			/* the extra 4+MetaDataByteLength+SZ_SIZE_TYPE accounts for the header that precedes the payload */
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);
+			ownsTmpBytes = 1;
+		}
+		else
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}
+	}
+	else
+		szTmpBytes = cmpBytes;
+
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	int dim = computeDimension(r5,r4,r3,r2,r1);
+	int intSize = sizeof(int8_t);
+	if(tdps->isLossless)
+	{
+		/* values are stored verbatim right after the header */
+		*newData = (int8_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*intSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = *p;
+		}
+	}
+	else if (dim == 1)
+		getSnapshotData_int8_1D(newData,r1,tdps, errBoundMode);
+	else
+	if (dim == 2)
+		getSnapshotData_int8_2D(newData,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 3)
+		getSnapshotData_int8_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 4)
+		getSnapshotData_int8_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageI2(tdps);
+	if(ownsTmpBytes)
+		free(szTmpBytes);
+	return status;
+}
+
+
+/**
+ * Rebuild a 1D int8 series from its Huffman-coded quantization codes and its
+ * stream of verbatim ("exact") values.
+ *
+ * A code of 0 means the element is taken from tdps->exactDataBytes; any
+ * other code reconstructs the element from the previous output element plus
+ * (code - exe_params->intvRadius) * 2 * tdps->realPrecision, limited to the
+ * range SZ_INT8_MIN .. SZ_INT8_MAX.
+ *
+ * @param data             out: receives the malloc'ed result array
+ * @param dataSeriesLength number of elements to decode
+ * @param tdps             parsed compressed stream
+ */
+void decompressDataSeries_int8_1D(int8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	const double step = tdps->realPrecision*2;
+
+	int8_t* out = (int8_t*)malloc(sizeof(int8_t)*dataSeriesLength);
+	*data = out;
+
+	/* one quantization code per element */
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	const int8_t base = tdps->minValue;
+	const int width = tdps->exactByteSize;
+	unsigned char* cursor = tdps->exactDataBytes;
+	unsigned char scratch[8] = {0,0,0,0,0,0,0,0};
+
+	const int shift = computeRightShiftBits(width, SZ_INT8);
+	if(shift<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(0);
+	}
+
+	for (size_t pos = 0; pos < dataSeriesLength; pos++) {
+		const int code = codes[pos];
+		if (code == 0) {
+			/* verbatim value: first byte of the next record, shifted down */
+			memcpy(scratch, cursor, width);
+			int8_t raw = scratch[0];
+			raw = (uint8_t)raw >> shift;
+			cursor += width;
+			out[pos] = raw + base;
+			continue;
+		}
+
+		/* predicted from the previous element; the sum is formed in double */
+		long previous = out[pos-1];
+		long candidate = previous + (code-exe_params->intvRadius)*step;
+		if (candidate < SZ_INT8_MIN)
+			out[pos] = SZ_INT8_MIN;
+		else if (candidate >= SZ_INT8_MAX)
+			out[pos] = SZ_INT8_MAX;
+		else
+			out[pos] = candidate;
+	}
+	free(codes);
+	return;
+}
+
+void decompressDataSeries_int8_2D(int8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	//printf("tdps->intervals=%d, exe_params->intvRadius=%d\n", tdps->intervals, exe_params->intvRadius);
+	
+	size_t dataSeriesLength = r1*r2;
+	//	printf ("%d %d\n", r1, r2);
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int8_t*)malloc(sizeof(int8_t)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int8_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT8);	
+	
+	long pred1D, pred2D, tmp;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0 */
+
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = curBytes[0];
+	exactData = (uint8_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 */
+	int type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+			(*data)[1] = tmp;
+		else if(tmp < SZ_INT8_MIN)
+			(*data)[1] = SZ_INT8_MIN;
+		else
+			(*data)[1] = SZ_INT8_MAX;
+			
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = curBytes[0];
+		exactData = (uint8_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				(*data)[jj] = SZ_INT8_MIN;
+			else
+				(*data)[jj] = SZ_INT8_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				(*data)[index] = SZ_INT8_MIN;
+			else
+				(*data)[index] = SZ_INT8_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void decompressDataSeries_int8_3D(int8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{	/* Rebuilds r1*r2*r3 int8 values into a malloc'ed *data (not freed here); element (kk,ii,jj) sits at kk*r2*r3 + ii*r3 + jj. */
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3; // elements per layer, i.e. the index distance between consecutive kk
+//	NOTE(review): neither malloc result below is checked for NULL.
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int8_t*)malloc(sizeof(int8_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// type[] holds one int per element and is filled by decode_withTree from tdps->typeArray.
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int8_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	// exactDataBytePointer walks tdps->exactDataBytes, advancing exactByteSize after each exact sample.
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	// NOTE(review): the memcpy calls below assume exactByteSize <= sizeof(curBytes).
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT8);	
+	
+	long pred1D, pred2D, pred3D, tmp;
+	size_t ii, jj, kk;
+	// Per element: type 0 -> exact sample (stored byte >> rightShiftBits, plus minValue); otherwise prediction + 2*(type - intvRadius)*realPrecision, clamped to [SZ_INT8_MIN, SZ_INT8_MAX].
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0: always read as an exact sample; type[0] is not consulted */
+
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = curBytes[0];
+	exactData = (uint8_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 (type[1] is read unconditionally, so at least two elements are assumed) */
+	pred1D = (*data)[0];
+
+	int type_ = type[1];
+	if (type_ != 0)
+	{
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+			(*data)[1] = tmp;
+		else if(tmp < SZ_INT8_MIN)
+			(*data)[1] = SZ_INT8_MIN;
+		else
+			(*data)[1] = SZ_INT8_MAX;
+	}
+	else
+	{
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = curBytes[0];
+		exactData = (uint8_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2]; // straight-line extrapolation from the two previous values
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				(*data)[jj] = SZ_INT8_MIN;
+			else
+				(*data)[jj] = SZ_INT8_MAX;		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r3;
+		pred1D = (*data)[index-r3]; // same column, previous row
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				(*data)[index] = SZ_INT8_MIN;
+			else
+				(*data)[index] = SZ_INT8_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1]; // previous in row + previous row - their shared diagonal neighbour
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;
+		pred1D = (*data)[index-r23]; // same position, previous layer
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				(*data)[index] = SZ_INT8_MIN;
+			else
+				(*data)[index] = SZ_INT8_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1]; // 2D predictor over (column, layer)
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3]; // 2D predictor over (row, layer)
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1]; // 3 face neighbours - 3 edge neighbours + corner neighbour
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT8_MIN)
+						(*data)[index] = SZ_INT8_MIN;
+					else
+						(*data)[index] = SZ_INT8_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = curBytes[0];
+					exactData = (uint8_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+
+void decompressDataSeries_int8_4D(int8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{	/* Rebuilds r1*r2*r3*r4 int8 values into a malloc'ed *data (not freed here); element (ll,kk,ii,jj) sits at ll*r234 + kk*r34 + ii*r4 + jj. */
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4; // elements in one ll block
+	size_t r34 = r3*r4; // elements in one layer (kk) of a block
+
+	double realPrecision = tdps->realPrecision;
+	// NOTE(review): neither malloc result below is checked for NULL.
+	*data = (int8_t*)malloc(sizeof(int8_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// type[] holds one int per element and is filled by decode_withTree from tdps->typeArray.
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int8_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	// exactDataBytePointer walks tdps->exactDataBytes, advancing exactByteSize after each exact sample.
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	// NOTE(review): the memcpy calls below assume exactByteSize <= sizeof(curBytes).
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT8);	
+	
+	int type_;
+
+	long pred1D, pred2D, pred3D, tmp;
+	size_t ii, jj, kk, ll;
+	size_t index;
+	// Each ll block is decoded on its own as an r2 x r3 x r4 volume: its first element is read exactly and no predictor reaches into another block.
+	for (ll = 0; ll < r1; ll++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: always read as an exact sample; its type[] entry is not consulted */
+		index = ll*r234;
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = curBytes[0];
+		exactData = (uint8_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1 (read unconditionally, so each block is assumed to hold at least two elements) */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision; // prediction plus a whole number of 2*realPrecision steps
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				(*data)[index] = SZ_INT8_MIN;
+			else
+				(*data)[index] = SZ_INT8_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1 */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2]; // straight-line extrapolation from the two previous values
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0 */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4]; // same column, previous row
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1*/
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1]; // previous in row + previous row - their shared diagonal neighbour
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT8_MIN)
+						(*data)[index] = SZ_INT8_MIN;
+					else
+						(*data)[index] = SZ_INT8_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = curBytes[0];
+					exactData = (uint8_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0*/
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34]; // same position, previous layer of this block
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1]; // 2D predictor over (column, layer)
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT8_MIN)
+						(*data)[index] = SZ_INT8_MIN;
+					else
+						(*data)[index] = SZ_INT8_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = curBytes[0];
+					exactData = (uint8_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0 */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4]; // 2D predictor over (row, layer)
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT8_MIN)
+						(*data)[index] = SZ_INT8_MIN;
+					else
+						(*data)[index] = SZ_INT8_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = curBytes[0];
+					exactData = (uint8_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1]; // 3 face neighbours - 3 edge neighbours + corner neighbour
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+						if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+							(*data)[index] = tmp;
+						else if(tmp < SZ_INT8_MIN)
+							(*data)[index] = SZ_INT8_MIN;
+						else
+							(*data)[index] = SZ_INT8_MAX;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = curBytes[0];
+						exactData = (uint8_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+/* Fills *data with a 1-D int8 series: one repeated byte when tdps->allSameData is set, else a full decode. errBoundMode is not read. */
+void getSnapshotData_int8_1D(int8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	if (!tdps->allSameData) {
+		decompressDataSeries_int8_1D(data, dataSeriesLength, tdps);
+		return;
+	}
+	int8_t fill = tdps->exactDataBytes[0]; /* the single stored sample */
+	int8_t* out = (int8_t*)malloc(sizeof(int8_t)*dataSeriesLength);
+	size_t remaining = dataSeriesLength;
+	*data = out;
+	while (remaining-- > 0) *out++ = fill;
+}
+
+/* Fills *data with an r1*r2 int8 series: one repeated byte when tdps->allSameData is set, else a full 2-D decode. errBoundMode is not read. */
+void getSnapshotData_int8_2D(int8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	if (!tdps->allSameData) {
+		decompressDataSeries_int8_2D(data, r1, r2, tdps);
+		return;
+	}
+	int8_t fill = tdps->exactDataBytes[0]; /* the single stored sample */
+	size_t remaining = r1*r2;
+	int8_t* out = (int8_t*)malloc(sizeof(int8_t)*remaining);
+	*data = out;
+	while (remaining-- > 0) *out++ = fill;
+}
+
+/* Fills *data with an r1*r2*r3 int8 series: one repeated byte when tdps->allSameData is set, else a full 3-D decode. errBoundMode is not read. */
+void getSnapshotData_int8_3D(int8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	if (!tdps->allSameData) {
+		decompressDataSeries_int8_3D(data, r1, r2, r3, tdps);
+		return;
+	}
+	int8_t fill = tdps->exactDataBytes[0]; /* the single stored sample */
+	size_t remaining = r1*r2*r3;
+	int8_t* out = (int8_t*)malloc(sizeof(int8_t)*remaining);
+	*data = out;
+	while (remaining-- > 0) *out++ = fill;
+}
+
+/* Fills *data with an r1*r2*r3*r4 int8 series: one repeated byte when tdps->allSameData is set, else a full 4-D decode. errBoundMode is not read. */
+void getSnapshotData_int8_4D(int8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	if (!tdps->allSameData) {
+		decompressDataSeries_int8_4D(data, r1, r2, r3, r4, tdps);
+		return;
+	}
+	int8_t fill = tdps->exactDataBytes[0]; /* the single stored sample */
+	size_t remaining = r1*r2*r3*r4;
+	int8_t* out = (int8_t*)malloc(sizeof(int8_t)*remaining);
+	*data = out;
+	while (remaining-- > 0) *out++ = fill;
+}
diff --git a/deps/SZ/sz/src/szd_uint16.c b/deps/SZ/sz/src/szd_uint16.c
new file mode 100644
index 0000000000000000000000000000000000000000..ecf42302ab38dc85edb0cb4f2ff14fd4dbc9eb48
--- /dev/null
+++ b/deps/SZ/sz/src/szd_uint16.c
@@ -0,0 +1,922 @@
+/**
+ *  @file szd_uint16.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief Decompression routines for SZ-compressed uint16_t data.
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_uint16.h"
+#include "Huffman.h"
+#include "utility.h"
+
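+/**
+ * Decompress the byte stream cmpBytes (cmpSize bytes) into *newData.
+ * r5..r1 are the dimension sizes of the data set.
+ * @return status SUCCESSFUL (SZ_SCES) or not (other error codes)
+ * */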
+int SZ_decompress_args_uint16(uint16_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{	/* Decodes cmpBytes (cmpSize bytes) into *newData; r5..r1 are the dimension sizes handed to computeDataLength/computeDimension. */
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	
+	//NOTE(review): dataLength<<2 budgets 4 bytes per element although elements here are uint16_t - presumably a deliberate over-estimate; confirm.
+	size_t targetUncompressSize = dataLength <<2; //i.e., *4
+	//This initial tmpSize is only used when cmpSize is one of the two fixed sizes tested below; otherwise it is overwritten.
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(uint16_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;	
+	//Inputs of the two fixed sizes are parsed directly; anything else is first probed for a lossless-compression wrapper.
+	if(cmpSize!=4+2+4+MetaDataByteLength && cmpSize!=4+2+8+MetaDataByteLength)
+	{
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1) //-1 selects SZ_BEST_SPEED, i.e. the input is used as-is
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+		else
+			confparams_dec->szMode = SZ_BEST_SPEED;		
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;	
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
+			//memcpy(szTmpBytes, tmpBytes, tmpSize);
+			//free(tmpBytes); //release useless memory		
+		}
+		else //NOTE(review): looks unreachable - szMode was just set to one of the two values tested above
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes; //confparams_dec->szMode is left at whatever value it already held
+	//TODO: convert szTmpBytes to data array.
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	//writeByteData(tdps->typeArray, tdps->typeArray_size, "decompress-typebytes.tbt");
+	int dim = computeDimension(r5,r4,r3,r2,r1);	
+	int intSize = sizeof(uint16_t);
+	if(tdps->isLossless) //values are taken straight from the bytes after a 4+MetaDataByteLength+SZ_SIZE_TYPE byte prefix
+	{
+		*newData = (uint16_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM) //stored values are read as big-endian, so a big-endian host copies them unchanged
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*intSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = bytesToUInt16_bigEndian(p);
+		}		
+	}
+	else if(confparams_dec->sol_ID==SZ_Transpose) //decoded as one flat series of dataLength elements
+	{
+		getSnapshotData_uint16_1D(newData,dataLength,tdps, errBoundMode);		
+	}
+	else //confparams_dec->sol_ID==SZ
+	{
+		if (dim == 1)
+			getSnapshotData_uint16_1D(newData,r1,tdps, errBoundMode);
+		else
+		if (dim == 2)
+			getSnapshotData_uint16_2D(newData,r2,r1,tdps, errBoundMode);
+		else
+		if (dim == 3)
+			getSnapshotData_uint16_3D(newData,r3,r2,r1,tdps, errBoundMode);
+		else
+		if (dim == 4)
+			getSnapshotData_uint16_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+		else
+		{
+			printf("Error: currently support only at most 4 dimensions!\n");
+			status = SZ_DERR; //*newData is not assigned on this path
+		}		
+	}	
+	free_TightDataPointStorageI2(tdps);
+	if(confparams_dec->szMode!=SZ_BEST_SPEED && cmpSize!=4+sizeof(uint16_t)+exe_params->SZ_SIZE_TYPE+MetaDataByteLength)
+		free(szTmpBytes); //NOTE(review): for fixed-size inputs szMode may be stale and szTmpBytes is cmpBytes - confirm this can never free the caller's buffer
+	return status;
+}
+
+
+/*
+ * Decodes a 1-D uint16 series described by tdps into *data, which is malloc'ed
+ * here with dataSeriesLength elements. Exits the process if the computed shift is negative.
+ */
+void decompressDataSeries_uint16_1D(uint16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	double step = tdps->realPrecision*2; /* distance between adjacent quantization codes */
+
+	uint16_t* out = (uint16_t*)malloc(sizeof(uint16_t)*dataSeriesLength);
+	*data = out;
+
+	/* One code per element, produced by decode_withTree from tdps->typeArray. */
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	uint16_t base = tdps->minValue;
+	int sampleBytes = tdps->exactByteSize;
+	unsigned char* cursor = tdps->exactDataBytes; /* next unread exact sample */
+	unsigned char scratch[8] = {0,0,0,0,0,0,0,0};
+
+	int shift = computeRightShiftBits(sampleBytes, SZ_UINT16);
+	if(shift<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(0);
+	}
+
+	size_t pos = 0;
+	while (pos < dataSeriesLength)
+	{
+		int code = codes[pos];
+		if (code == 0)
+		{
+			/* Exact sample: big-endian bytes, shifted down, then offset by base. */
+			uint16_t raw;
+			memcpy(scratch, cursor, sampleBytes);
+			cursor += sampleBytes;
+			raw = bytesToUInt16_bigEndian(scratch);
+			raw = (uint16_t)raw >> shift;
+			out[pos] = raw + base;
+		}
+		else
+		{
+			/* Predicted sample: previous value plus a whole number of steps, clamped. */
+			long previous = out[pos-1];
+			long candidate = previous + (code-exe_params->intvRadius)*step;
+			if (candidate < SZ_UINT16_MIN)
+				out[pos] = SZ_UINT16_MIN;
+			else if (candidate >= SZ_UINT16_MAX)
+				out[pos] = SZ_UINT16_MAX;
+			else
+				out[pos] = candidate;
+		}
+		pos++;
+	}
+
+	free(codes);
+}
+
+void decompressDataSeries_uint16_2D(uint16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{	/* Rebuilds r1*r2 uint16 values into a malloc'ed *data (not freed here); element (ii,jj) sits at ii*r2 + jj. */
+	updateQuantizationInfo(tdps->intervals);
+	//printf("tdps->intervals=%d, exe_params->intvRadius=%d\n", tdps->intervals, exe_params->intvRadius);
+	
+	size_t dataSeriesLength = r1*r2;
+	//	printf ("%d %d\n", r1, r2);
+
+	double realPrecision = tdps->realPrecision;
+	// NOTE(review): neither malloc result below is checked for NULL.
+	*data = (uint16_t*)malloc(sizeof(uint16_t)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// type[] holds one int per element and is filled by decode_withTree from tdps->typeArray.
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint16_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	// exactDataBytePointer walks tdps->exactDataBytes, advancing exactByteSize after each exact sample.
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	// NOTE(review): the memcpy calls below assume exactByteSize <= sizeof(curBytes).
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT16);	
+	
+	long pred1D, pred2D, tmp;
+	size_t ii, jj;
+	// Per element: type 0 -> exact sample (big-endian uint16 >> rightShiftBits, plus minValue); otherwise prediction + 2*(type - intvRadius)*realPrecision, clamped to [SZ_UINT16_MIN, SZ_UINT16_MAX].
+	/* Process Row-0, data 0: always read as an exact sample; type[0] is not consulted */
+
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToUInt16_bigEndian(curBytes);
+	exactData = (uint16_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 (type[1] is read unconditionally, so at least two elements are assumed) */
+	int type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+			(*data)[1] = tmp;
+		else if(tmp < SZ_UINT16_MIN)
+			(*data)[1] = SZ_UINT16_MIN;
+		else
+			(*data)[1] = SZ_UINT16_MAX;
+			
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt16_bigEndian(curBytes);
+		exactData = (uint16_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2]; // straight-line extrapolation from the two previous values
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				(*data)[jj] = SZ_UINT16_MIN;
+			else
+				(*data)[jj] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2]; // same column, previous row
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				(*data)[index] = SZ_UINT16_MIN;
+			else
+				(*data)[index] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1]; // previous in row + previous row - their shared diagonal neighbour
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void decompressDataSeries_uint16_3D(uint16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+//	printf ("%d %d %d\n", r1, r2, r3);
+	double realPrecision = tdps->realPrecision;
+
+	*data = (uint16_t*)malloc(sizeof(uint16_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint16_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT16);	
+	
+	long pred1D, pred2D, pred3D, tmp;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToUInt16_bigEndian(curBytes);
+	exactData = (uint16_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 */
+	pred1D = (*data)[0];
+
+	int type_ = type[1];
+	if (type_ != 0)
+	{
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+			(*data)[1] = tmp;
+		else if(tmp < SZ_UINT16_MIN)
+			(*data)[1] = SZ_UINT16_MIN;
+		else
+			(*data)[1] = SZ_UINT16_MAX;
+	}
+	else
+	{
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt16_bigEndian(curBytes);
+		exactData = (uint16_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				(*data)[jj] = SZ_UINT16_MIN;
+			else
+				(*data)[jj] = SZ_UINT16_MAX;		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				(*data)[index] = SZ_UINT16_MIN;
+			else
+				(*data)[index] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				(*data)[index] = SZ_UINT16_MIN;
+			else
+				(*data)[index] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						(*data)[index] = SZ_UINT16_MIN;
+					else
+						(*data)[index] = SZ_UINT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+
+void decompressDataSeries_uint16_4D(uint16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{ /* Decodes r1*r2*r3*r4 uint16 values from tdps into a newly malloc'd *data; this function never frees *data. */
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4; /* element stride between consecutive r1 slabs */
+	size_t r34 = r3*r4; /* element stride between consecutive layers inside one slab */
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (uint16_t*)malloc(sizeof(uint16_t)*dataSeriesLength); /* NOTE(review): malloc result is not checked for NULL */
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); /* one decoded code per element; freed before returning */
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint16_t minValue, exactData;
+
+	minValue = tdps->minValue; /* added back to every explicitly stored value */
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; /* read cursor: advances by exactByteSize per explicit value */
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT16);	
+	
+	int type_;
+
+	long pred1D, pred2D, pred3D, tmp; /* long, so a prediction may leave the uint16 range before it is clamped */
+	size_t ii, jj, kk, ll;
+	size_t index;
+
+	for (ll = 0; ll < r1; ll++)
+	{ /* each ll slab is decoded on its own: no prediction below reaches back by r234 */
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		index = ll*r234;
+
+		// recover the exact data (type[index] is not consulted for the first element of a slab)
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt16_bigEndian(curBytes);
+		exactData = (uint16_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1 */
+		index = ll*r234+1; /* NOTE(review): written even if r4 == 1 - presumably callers guarantee r4 >= 2; confirm */
+
+		pred1D = (*data)[index-1]; /* predictor: previous element */
+
+		type_ = type[index];
+		if (type_ != 0)
+		{ /* non-zero code: value = prediction + 2*(code - intvRadius)*realPrecision, clamped to the uint16 range */
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision; /* double expression converted to long on assignment */
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_UINT16_MIN) /* clamp below */
+				(*data)[index] = SZ_UINT16_MIN;
+			else /* clamp above (also taken when tmp == SZ_UINT16_MAX) */
+				(*data)[index] = SZ_UINT16_MAX;
+		}
+		else
+		{ /* code 0: the value is stored explicitly; read exactByteSize bytes, shift, and add minValue */
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1 */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2]; /* predictor: linear extrapolation from the two previous elements */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0 */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4]; /* predictor: same column in the previous row */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1*/
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1]; /* predictor: left + up - upper-left */
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						(*data)[index] = SZ_UINT16_MIN;
+					else
+						(*data)[index] = SZ_UINT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0*/
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34]; /* predictor: same position in the previous layer */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1]; /* 2D predictor across the column and layer axes */
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						(*data)[index] = SZ_UINT16_MIN;
+					else
+						(*data)[index] = SZ_UINT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0 */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4]; /* 2D predictor across the row and layer axes */
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						(*data)[index] = SZ_UINT16_MIN;
+					else
+						(*data)[index] = SZ_UINT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+					/* pred3D combines the seven already-decoded neighbours: three added, three subtracted, one added */
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+						if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+							(*data)[index] = tmp;
+						else if(tmp < SZ_UINT16_MIN)
+							(*data)[index] = SZ_UINT16_MIN;
+						else
+							(*data)[index] = SZ_UINT16_MAX;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = bytesToUInt16_bigEndian(curBytes);
+						exactData = (uint16_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void getSnapshotData_uint16_1D(uint16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{ /* Fills a newly allocated *data with the decompressed 1D uint16 series; errBoundMode is not read in this body. */
+	size_t i;
+
+	if (tdps->allSameData) { /* constant series: one big-endian value at the start of exactDataBytes is replicated */
+		uint16_t value = bytesToUInt16_bigEndian(tdps->exactDataBytes);
+		*data = (uint16_t*)malloc(sizeof(uint16_t)*dataSeriesLength); /* NOTE(review): malloc result is not checked for NULL */
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_uint16_1D(data, dataSeriesLength, tdps); /* general case: the decoder allocates *data itself */
+	}
+}
+
+void getSnapshotData_uint16_2D(uint16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode) 
+{ /* Fills a newly allocated *data with the decompressed r1 x r2 uint16 array; errBoundMode is not read in this body. */
+	size_t i;
+	size_t dataSeriesLength = r1*r2;
+	if (tdps->allSameData) { /* constant array: one big-endian value at the start of exactDataBytes is replicated */
+		uint16_t value = bytesToUInt16_bigEndian(tdps->exactDataBytes);
+		*data = (uint16_t*)malloc(sizeof(uint16_t)*dataSeriesLength); /* NOTE(review): malloc result is not checked for NULL */
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_uint16_2D(data, r1, r2, tdps); /* general case: full decode */
+	}
+}
+
+void getSnapshotData_uint16_3D(uint16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{ /* Fills a newly allocated *data with the decompressed r1 x r2 x r3 uint16 array; errBoundMode is not read in this body. */
+	size_t i;
+	size_t dataSeriesLength = r1*r2*r3;
+	if (tdps->allSameData) { /* constant array: one big-endian value at the start of exactDataBytes is replicated */
+		uint16_t value = bytesToUInt16_bigEndian(tdps->exactDataBytes);
+		*data = (uint16_t*)malloc(sizeof(uint16_t)*dataSeriesLength); /* NOTE(review): malloc result is not checked for NULL */
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_uint16_3D(data, r1, r2, r3, tdps); /* general case: full decode */
+	}
+}
+
+void getSnapshotData_uint16_4D(uint16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{ /* Fills a newly allocated *data with the decompressed r1 x r2 x r3 x r4 uint16 array; errBoundMode is not read in this body. */
+	size_t i;
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	if (tdps->allSameData) { /* constant array: one big-endian value at the start of exactDataBytes is replicated */
+		uint16_t value = bytesToUInt16_bigEndian(tdps->exactDataBytes);
+		*data = (uint16_t*)malloc(sizeof(uint16_t)*dataSeriesLength); /* NOTE(review): malloc result is not checked for NULL */
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_uint16_4D(data, r1, r2, r3, r4, tdps); /* general case: full decode */
+	}
+}
diff --git a/deps/SZ/sz/src/szd_uint32.c b/deps/SZ/sz/src/szd_uint32.c
new file mode 100644
index 0000000000000000000000000000000000000000..04e8049f9dc9f3a8cf6ba01aa0fc4bb691b4d735
--- /dev/null
+++ b/deps/SZ/sz/src/szd_uint32.c
@@ -0,0 +1,789 @@
+/**
+ *  @file szd_uint32.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief Decompression routines for uint32 data (1D to 4D) in the SZ library.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_uint32.h"
+#include "Huffman.h"
+#include "utility.h"
+
+/**
+ * Decompress the byte stream cmpBytes (cmpSize bytes) into a newly allocated uint32 array returned through *newData.
+ * r5..r1 are the dimension sizes; the dimensionality is derived from them with computeDimension().
+ * @return SZ_SCES on success; SZ_MERR on an unexpected szMode; SZ_DERR when more than 4 dimensions are requested
+ * */
+int SZ_decompress_args_uint32(uint32_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	
+	//unsigned char* tmpBytes;
+	size_t targetUncompressSize = dataLength <<2; //i.e., *4
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(uint32_t)+exe_params->SZ_SIZE_TYPE; // default size, kept only when the branch below is skipped
+	unsigned char* szTmpBytes;	
+		
+	if(cmpSize!=4+4+4+MetaDataByteLength && cmpSize!=4+4+8+MetaDataByteLength) // NOTE(review): presumably the two sizes of a constant-data stream (4- or 8-byte size field) - confirm
+	{
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1) // a lossless compressor was detected on the stream
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+		else
+			confparams_dec->szMode = SZ_BEST_SPEED;		
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{ // no lossless stage: parse the caller's buffer in place
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;	
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION)
+		{ // undo the lossless stage first; szTmpBytes is then produced by sz_lossless_decompress
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
+			//memcpy(szTmpBytes, tmpBytes, tmpSize);
+			//free(tmpBytes); //release useless memory		
+		}
+		else
+		{ // NOTE(review): szMode was just set to one of the two values tested above, so this branch looks unreachable
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes; // szMode is not updated on this path
+	//TODO: convert szTmpBytes to data array.
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	//writeByteData(tdps->typeArray, tdps->typeArray_size, "decompress-typebytes.tbt");
+	int dim = computeDimension(r5,r4,r3,r2,r1);	
+	int intSize = sizeof(uint32_t);
+	if(tdps->isLossless)
+	{ // raw values follow a header of 4+MetaDataByteLength+SZ_SIZE_TYPE bytes
+		*newData = (uint32_t*)malloc(intSize*dataLength); // NOTE(review): malloc result is not checked for NULL
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*intSize);
+		}
+		else
+		{ // otherwise convert each big-endian word individually
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = bytesToUInt32_bigEndian(p);
+		}		
+	}
+	else if (dim == 1)
+		getSnapshotData_uint32_1D(newData,r1,tdps, errBoundMode);
+	else
+	if (dim == 2)
+		getSnapshotData_uint32_2D(newData,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 3)
+		getSnapshotData_uint32_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 4)
+		getSnapshotData_uint32_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{ // *newData is left unassigned on this path
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageI2(tdps);
+	// NOTE(review): this size test is spelled differently from the entry check, and szMode may be left over from an earlier call when that check was skipped - confirm szTmpBytes can never alias cmpBytes here
+	if(confparams_dec->szMode!=SZ_BEST_SPEED && cmpSize!=4+sizeof(uint32_t)+exe_params->SZ_SIZE_TYPE+MetaDataByteLength)
+		free(szTmpBytes);	
+	return status;
+}
+
+
+void decompressDataSeries_uint32_1D(uint32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps) 
+{ /* Decodes dataSeriesLength uint32 values from tdps into a newly malloc'd *data; this function never frees *data. */
+	updateQuantizationInfo(tdps->intervals);
+	size_t i;
+	double interval = tdps->realPrecision*2; /* amount added to the prediction per unit of (code - intvRadius) */
+	
+	*data = (uint32_t*)malloc(sizeof(uint32_t)*dataSeriesLength); /* NOTE(review): malloc result is not checked for NULL */
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); /* one decoded code per element; freed before returning */
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	//sdi:Debug
+	//writeUShortData(type, dataSeriesLength, "decompressStateBytes.sb");
+	
+	uint32_t minValue, exactData, predValue;
+	
+	minValue = tdps->minValue; /* added back to every explicitly stored value */
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; /* read cursor: advances by exactByteSize per explicit value */
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT32);
+	if(rightShiftBits<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(0); /* NOTE(review): terminates the process with exit status 0 on this error path */
+	}
+	int type_;
+	for (i = 0; i < dataSeriesLength; i++) {
+		type_ = type[i];
+		switch (type_) {
+		case 0:
+			// code 0: the value is stored explicitly; read exactByteSize bytes, shift, and add minValue
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[i] = exactData + minValue;
+			break;
+		default:
+			//predValue = 2 * (*data)[i-1] - (*data)[i-2];
+			predValue = (*data)[i-1]; /* NOTE(review): at i == 0 this reads before the buffer - presumably type[0] is always 0; confirm */
+			(*data)[i] = predValue + (type_-exe_params->intvRadius)*interval; /* double result converted to uint32_t with no range clamping */
+			break;
+		}
+		//printf("%.30G\n",(*data)[i]);
+	}
+	free(type);
+	return;
+}
+
+void decompressDataSeries_uint32_2D(uint32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{ /* Decodes r1*r2 uint32 values (row length r2) from tdps into a newly malloc'd *data; this function never frees *data. */
+	updateQuantizationInfo(tdps->intervals);
+	//printf("tdps->intervals=%d, exe_params->intvRadius=%d\n", tdps->intervals, exe_params->intvRadius);
+	
+	size_t dataSeriesLength = r1*r2;
+	//	printf ("%d %d\n", r1, r2);
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (uint32_t*)malloc(sizeof(uint32_t)*dataSeriesLength); /* NOTE(review): malloc result is not checked for NULL */
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); /* one decoded code per element; freed before returning */
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint32_t minValue, exactData;
+
+	minValue = tdps->minValue; /* added back to every explicitly stored value */
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; /* read cursor: advances by exactByteSize per explicit value */
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT32);	
+	
+	uint32_t pred1D, pred2D; /* predictions are held in uint32_t, so they are reduced modulo 2^32 */
+	size_t ii, jj;
+
+	/* Process Row-0, data 0 */
+
+	// recover the exact data (type[0] is not consulted for the first element)
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToUInt32_bigEndian(curBytes);
+	exactData = (uint32_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 */
+	int type_ = type[1]; /* NOTE(review): read even if r2 == 1 - presumably callers guarantee r2 >= 2; confirm */
+	if (type_ != 0)
+	{ /* non-zero code: value = prediction + 2*(code - intvRadius)*realPrecision, converted to uint32_t without clamping */
+		pred1D = (*data)[0];
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt32_bigEndian(curBytes);
+		exactData = (uint32_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2]; /* predictor: linear extrapolation from the two previous elements */
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2]; /* predictor: same column in the previous row */
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1]; /* predictor: left + up - upper-left */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void decompressDataSeries_uint32_3D(uint32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{ /* Decodes r1*r2*r3 uint32 values (row length r3) from tdps into a newly malloc'd *data; this function never frees *data. */
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3; /* element stride between consecutive layers */
+//	printf ("%d %d %d\n", r1, r2, r3);
+	double realPrecision = tdps->realPrecision;
+
+	*data = (uint32_t*)malloc(sizeof(uint32_t)*dataSeriesLength); /* NOTE(review): malloc result is not checked for NULL */
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); /* one decoded code per element; freed before returning */
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint32_t minValue, exactData;
+
+	minValue = tdps->minValue; /* added back to every explicitly stored value */
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; /* read cursor: advances by exactByteSize per explicit value */
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT32);	
+	
+	uint32_t pred1D, pred2D, pred3D; /* predictions are held in uint32_t, so they are reduced modulo 2^32 */
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+
+	// recover the exact data (type[0] is not consulted for the first element)
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToUInt32_bigEndian(curBytes);
+	exactData = (uint32_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 */
+	pred1D = (*data)[0];
+
+	int type_ = type[1]; /* NOTE(review): read even if r3 == 1 - presumably callers guarantee r3 >= 2; confirm */
+	if (type_ != 0)
+	{ /* non-zero code: value = prediction + 2*(code - intvRadius)*realPrecision, converted to uint32_t without clamping */
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{ /* code 0: the value is stored explicitly; read exactByteSize bytes, shift, and add minValue */
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt32_bigEndian(curBytes);
+		exactData = (uint32_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2]; /* predictor: linear extrapolation from the two previous elements */
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r3;
+		pred1D = (*data)[index-r3]; /* predictor: same column in the previous row */
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1]; /* predictor: left + up - upper-left */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;
+		pred1D = (*data)[index-r23]; /* predictor: same position in the previous layer */
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1]; /* 2D predictor across the column and layer axes */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3]; /* 2D predictor across the row and layer axes */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index]; /* pred3D above combines the seven already-decoded neighbours */
+				if (type_ != 0)
+				{
+					(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+
+void decompressDataSeries_uint32_4D(uint32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (uint32_t*)malloc(sizeof(uint32_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint32_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT32);	
+	
+	int type_;
+
+	uint32_t pred1D, pred2D, pred3D;
+	size_t ii, jj, kk, ll;
+	size_t index;
+
+	for (ll = 0; ll < r1; ll++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		index = ll*r234;
+
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt32_bigEndian(curBytes);
+		exactData = (uint32_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1 */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1 */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0 */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1*/
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0*/
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0 */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = bytesToUInt32_bigEndian(curBytes);
+						exactData = (uint32_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void getSnapshotData_uint32_1D(uint32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	/* Produce the decompressed 1D uint32 series in *data; errBoundMode is not used here. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint32_1D(data, dataSeriesLength, tdps);
+		return;
+	}
+	/* Constant series: decode the single big-endian value and replicate it into a new buffer. */
+	const uint32_t constant = bytesToUInt32_bigEndian(tdps->exactDataBytes);
+	uint32_t* filled = (uint32_t*)malloc(sizeof(uint32_t)*dataSeriesLength);
+	*data = filled;
+	for (size_t k = 0; k < dataSeriesLength; k++) filled[k] = constant;
+}
+
+void getSnapshotData_uint32_2D(uint32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode) 
+{
+	/* Produce the decompressed r1 x r2 uint32 grid in *data; errBoundMode is not used here. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint32_2D(data, r1, r2, tdps);
+		return;
+	}
+	/* Constant grid: decode the single big-endian value and replicate it r1*r2 times. */
+	const size_t total = r1*r2;
+	const uint32_t constant = bytesToUInt32_bigEndian(tdps->exactDataBytes);
+	uint32_t* filled = *data = (uint32_t*)malloc(sizeof(uint32_t)*total);
+	for (size_t k = 0; k < total; k++) filled[k] = constant;
+}
+
+void getSnapshotData_uint32_3D(uint32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	/* Produce the decompressed r1 x r2 x r3 uint32 volume in *data; errBoundMode is not used here. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint32_3D(data, r1, r2, r3, tdps);
+		return;
+	}
+	/* Constant volume: decode the single big-endian value and replicate it r1*r2*r3 times. */
+	const size_t total = r1*r2*r3;
+	const uint32_t constant = bytesToUInt32_bigEndian(tdps->exactDataBytes);
+	uint32_t* filled = *data = (uint32_t*)malloc(sizeof(uint32_t)*total);
+	for (size_t k = 0; k < total; k++) filled[k] = constant;
+}
+
+void getSnapshotData_uint32_4D(uint32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	/* Produce the decompressed r1 x r2 x r3 x r4 uint32 array in *data; errBoundMode is not used here. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint32_4D(data, r1, r2, r3, r4, tdps);
+		return;
+	}
+	/* Constant array: decode the single big-endian value and replicate it r1*r2*r3*r4 times. */
+	const size_t total = r1*r2*r3*r4;
+	const uint32_t constant = bytesToUInt32_bigEndian(tdps->exactDataBytes);
+	uint32_t* filled = *data = (uint32_t*)malloc(sizeof(uint32_t)*total);
+	for (size_t k = 0; k < total; k++) filled[k] = constant;
+}
diff --git a/deps/SZ/sz/src/szd_uint64.c b/deps/SZ/sz/src/szd_uint64.c
new file mode 100644
index 0000000000000000000000000000000000000000..84d57168c4f7eed0bd49bf60cf8b8a3d19271b27
--- /dev/null
+++ b/deps/SZ/sz/src/szd_uint64.c
@@ -0,0 +1,789 @@
+/**
+ *  @file szd_uint64.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief Decompression routines for SZ-compressed uint64_t data (1D to 4D).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_uint64.h"
+#include "Huffman.h"
+#include "utility.h"
+
+/**
+ * Decompress an SZ-compressed uint64_t array held in cmpBytes (cmpSize bytes) into *newData.
+ * r5..r1 are the dimension sizes; they are fed to computeDataLength() and computeDimension().
+ * @return SZ_SCES on success, SZ_MERR for an unexpected szMode, SZ_DERR for an unsupported dimension count.
+ * */
+int SZ_decompress_args_uint64(uint64_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	// Capacity hint for the lossless stage. The isLossless branch below reads dataLength*sizeof(uint64_t)
+	// payload bytes from the decompressed buffer, so the hint must count 8-byte elements, not 4-byte ones.
+	size_t targetUncompressSize = dataLength * sizeof(uint64_t);
+	// Default byte count for the fixed-size streams that skip the mode detection below.
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(uint64_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;	
+	// Streams of exactly these two sizes are parsed directly; everything else goes through mode detection.
+	if(cmpSize!=4+8+4+MetaDataByteLength && cmpSize!=4+8+8+MetaDataByteLength)
+	{
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+		else
+			confparams_dec->szMode = SZ_BEST_SPEED;		
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;	
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE); // capacity = payload + header bytes
+			// szTmpBytes now refers to the buffer filled in by sz_lossless_decompress();
+			// it is released by the guarded free() at the end of this function.
+			// NOTE(review): tmpSize is not checked for a failure value here - confirm the helper's error convention.
+		}
+		else
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes;
+	// Parse the flat byte stream into a TightDataPointStorageI descriptor.
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	// dim is derived from the r5..r1 sizes and selects the decoder used below.
+	int dim = computeDimension(r5,r4,r3,r2,r1);	
+	int intSize = sizeof(uint64_t);
+	if(tdps->isLossless)
+	{
+		*newData = (uint64_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*intSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = bytesToUInt64_bigEndian(p);
+		}		
+	}
+	else if (dim == 1)
+		getSnapshotData_uint64_1D(newData,r1,tdps, errBoundMode);
+	else
+	if (dim == 2)
+		getSnapshotData_uint64_2D(newData,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 3)
+		getSnapshotData_uint64_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 4)
+		getSnapshotData_uint64_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageI2(tdps);
+	if(confparams_dec->szMode!=SZ_BEST_SPEED && cmpSize!=4+sizeof(uint64_t)+exe_params->SZ_SIZE_TYPE+MetaDataByteLength)
+		free(szTmpBytes);
+	return status;
+}
+
+
+void decompressDataSeries_uint64_1D(uint64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps) 
+{
+	/*
+	 * Decode a 1D uint64 series of dataSeriesLength values into a newly malloc'ed *data.
+	 *   data             - receives the output buffer pointer
+	 *   dataSeriesLength - number of values to produce
+	 *   tdps             - parsed stream: Huffman-coded codes, exact-value bytes, precision, minValue
+	 * Code 0 marks a value stored in tdps->exactDataBytes; any other code is an offset,
+	 * in steps of 2*realPrecision, from the previously decoded value.
+	 */
+	updateQuantizationInfo(tdps->intervals);
+	const double step = tdps->realPrecision*2;
+
+	uint64_t* out = (uint64_t*)malloc(sizeof(uint64_t)*dataSeriesLength);
+	*data = out;
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	/* Expand tdps->typeArray through the Huffman tree into one code per value. */
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	const uint64_t base = tdps->minValue;
+	const int bytesPerExact = tdps->exactByteSize;
+	const unsigned char* cursor = tdps->exactDataBytes;
+	unsigned char scratch[8] = {0,0,0,0,0,0,0,0};
+
+	/* A negative shift cannot be applied; report it and terminate the process. */
+	const int shift = computeRightShiftBits(bytesPerExact, SZ_UINT64);
+	if (shift < 0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(0);
+	}
+
+	for (size_t pos = 0; pos < dataSeriesLength; pos++)
+	{
+		const int code = codes[pos];
+		if (code != 0)
+		{
+			/* Predicted value: previous output plus the signed quantization offset. */
+			const uint64_t previous = out[pos-1];
+			out[pos] = previous + (code-exe_params->intvRadius)*step;
+			continue;
+		}
+		/* Exact value: memcpy refreshes only the leading bytesPerExact bytes of scratch. */
+		memcpy(scratch, cursor, bytesPerExact);
+		cursor += bytesPerExact;
+		const uint64_t raw = bytesToUInt64_bigEndian(scratch);
+		out[pos] = (raw >> shift) + base;
+	}
+
+	free(codes);
+	return;
+}
+
+void decompressDataSeries_uint64_2D(uint64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	/* Decode an r1 x r2 uint64 grid (row-major, r2 values per row) into a newly malloc'ed *data. */
+	/* Per element: type[] == 0 -> exact value read from tdps->exactDataBytes; otherwise prediction + quantized offset. */
+	size_t dataSeriesLength = r1*r2;
+	/* NOTE(review): elements 0 and 1 are handled unconditionally below; assumes dataSeriesLength >= 2 - TODO confirm. */
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (uint64_t*)malloc(sizeof(uint64_t)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	/* Decode tdps->typeArray through the Huffman tree into type[] (dataSeriesLength entries requested). */
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint64_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT64);	
+	/* NOTE(review): unlike decompressDataSeries_uint64_1D, a negative rightShiftBits is not rejected here. */
+	uint64_t pred1D, pred2D;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0: always read as an exact value (type[0] is not consulted) */
+
+	// recover the exact data: copy exactByteSize bytes, decode big-endian, shift right, add minValue
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToUInt64_bigEndian(curBytes);
+	exactData = (uint64_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1: predicted by data 0 */
+	int type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt64_bigEndian(curBytes);
+		exactData = (uint64_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1: linear extrapolation from the two previous values */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0: predicted by the value one row up */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1: 2D predictor left + up - up-left */
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void decompressDataSeries_uint64_3D(uint64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	/* Decode an r1 x r2 x r3 uint64 volume (r3 fastest-varying, r23 = values per layer) into a newly malloc'ed *data. */
+	double realPrecision = tdps->realPrecision;
+	/* NOTE(review): elements 0 and 1 are handled unconditionally below; assumes dataSeriesLength >= 2 - TODO confirm. */
+	*data = (uint64_t*)malloc(sizeof(uint64_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	/* Decode tdps->typeArray through the Huffman tree into type[] (dataSeriesLength entries requested). */
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint64_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT64);	
+	/* NOTE(review): unlike decompressDataSeries_uint64_1D, a negative rightShiftBits is not rejected here. */
+	uint64_t pred1D, pred2D, pred3D;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0: always read as an exact value (type[0] is not consulted) */
+
+	// recover the exact data: copy exactByteSize bytes, decode big-endian, shift right, add minValue
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToUInt64_bigEndian(curBytes);
+	exactData = (uint64_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1: predicted by data 0 */
+	pred1D = (*data)[0];
+
+	int type_ = type[1];
+	if (type_ != 0)
+	{
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt64_bigEndian(curBytes);
+		exactData = (uint64_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1: linear extrapolation from the two previous values */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 (still layer 0) */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0: predicted by the value one row up */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1: 2D predictor left + up - up-left */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0: predicted by the same position in the previous layer */
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1: 2D predictor left + previous-layer - previous-layer-left */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0: 2D predictor up + previous-layer - previous-layer-up */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1: 3D predictor over the 7 already-decoded neighbors */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt64_bigEndian(curBytes);
+					exactData = (uint64_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+
+void decompressDataSeries_uint64_4D(uint64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+	/* Decode an r1 x r2 x r3 x r4 uint64 array (r4 fastest-varying) into a newly malloc'ed *data, one r234-sized block per ll. */
+	double realPrecision = tdps->realPrecision;
+
+	*data = (uint64_t*)malloc(sizeof(uint64_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	/* Decode tdps->typeArray through the Huffman tree into type[] (dataSeriesLength entries requested). */
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint64_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT64);	
+	/* NOTE(review): unlike decompressDataSeries_uint64_1D, a negative rightShiftBits is not rejected here. */
+	int type_;
+
+	uint64_t pred1D, pred2D, pred3D;
+	size_t ii, jj, kk, ll;
+	size_t index;
+
+	for (ll = 0; ll < r1; ll++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: each ll block starts from an exact value (type[] is not consulted, no cross-block prediction) */
+		index = ll*r234;
+
+		// recover the exact data: copy exactByteSize bytes, decode big-endian, shift right, add minValue
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt64_bigEndian(curBytes);
+		exactData = (uint64_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1: predicted by data 0; NOTE(review): done unconditionally, assumes r234 >= 2 - TODO confirm */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1: linear extrapolation from the two previous values */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 (still layer 0 of this block) */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0: predicted by the value one row up */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1: 2D predictor left + up - up-left */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt64_bigEndian(curBytes);
+					exactData = (uint64_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0: predicted by the same position in the previous layer */
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1: 2D predictor left + previous-layer - previous-layer-left */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt64_bigEndian(curBytes);
+					exactData = (uint64_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0: 2D predictor up + previous-layer - previous-layer-up */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt64_bigEndian(curBytes);
+					exactData = (uint64_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1: 3D predictor over the 7 already-decoded neighbors */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = bytesToUInt64_bigEndian(curBytes);
+						exactData = (uint64_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void getSnapshotData_uint64_1D(uint64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	/* Produce the decompressed 1D uint64 series in *data; errBoundMode is not used here. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint64_1D(data, dataSeriesLength, tdps);
+		return;
+	}
+	/* Constant series: decode the single big-endian value and replicate it into a new buffer. */
+	const uint64_t constant = bytesToUInt64_bigEndian(tdps->exactDataBytes);
+	uint64_t* filled = (uint64_t*)malloc(sizeof(uint64_t)*dataSeriesLength);
+	*data = filled;
+	for (size_t k = 0; k < dataSeriesLength; k++) filled[k] = constant;
+}
+
+void getSnapshotData_uint64_2D(uint64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode) 
+{
+	/* Produce the decompressed r1 x r2 uint64 grid in *data; errBoundMode is not used here. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint64_2D(data, r1, r2, tdps);
+		return;
+	}
+	/* Constant grid: decode the single big-endian value and replicate it r1*r2 times. */
+	const size_t total = r1*r2;
+	const uint64_t constant = bytesToUInt64_bigEndian(tdps->exactDataBytes);
+	uint64_t* filled = *data = (uint64_t*)malloc(sizeof(uint64_t)*total);
+	for (size_t k = 0; k < total; k++) filled[k] = constant;
+}
+
+void getSnapshotData_uint64_3D(uint64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	/* Produce the decompressed r1 x r2 x r3 uint64 volume in *data; errBoundMode is not used here. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint64_3D(data, r1, r2, r3, tdps);
+		return;
+	}
+	/* Constant volume: decode the single big-endian value and replicate it r1*r2*r3 times. */
+	const size_t total = r1*r2*r3;
+	const uint64_t constant = bytesToUInt64_bigEndian(tdps->exactDataBytes);
+	uint64_t* filled = *data = (uint64_t*)malloc(sizeof(uint64_t)*total);
+	for (size_t k = 0; k < total; k++) filled[k] = constant;
+}
+
+void getSnapshotData_uint64_4D(uint64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	/* Produce the decompressed r1 x r2 x r3 x r4 uint64 array in *data; errBoundMode is not used here. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint64_4D(data, r1, r2, r3, r4, tdps);
+		return;
+	}
+	/* Constant array: decode the single big-endian value and replicate it r1*r2*r3*r4 times. */
+	const size_t total = r1*r2*r3*r4;
+	const uint64_t constant = bytesToUInt64_bigEndian(tdps->exactDataBytes);
+	uint64_t* filled = *data = (uint64_t*)malloc(sizeof(uint64_t)*total);
+	for (size_t k = 0; k < total; k++) filled[k] = constant;
+}
diff --git a/deps/SZ/sz/src/szd_uint8.c b/deps/SZ/sz/src/szd_uint8.c
new file mode 100644
index 0000000000000000000000000000000000000000..8b992bc2d4d9400325936648479e2cf31151c5f4
--- /dev/null
+++ b/deps/SZ/sz/src/szd_uint8.c
@@ -0,0 +1,914 @@
+/**
+ *  @file szd_uint8.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief Decompression routines for uint8_t data (1D to 4D).
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_uint8.h"
+#include "Huffman.h"
+#include "utility.h"
+
+/**
+ * Decompress an SZ-compressed byte stream that holds uint8_t values.
+ *
+ * The stream is unwrapped from its optional lossless layer, parsed into a
+ * TightDataPointStorageI, and expanded by the 1D-4D routine matching the
+ * dimension count that computeDimension() derives from r5..r1.
+ *
+ * Side effect: for streams larger than the minimal stream size, this updates
+ * confparams_dec->losslessCompressor and confparams_dec->szMode.
+ *
+ * @param newData  receives a malloc'ed array with the decompressed values;
+ *                 it is not written when SZ_DERR is returned
+ * @param r5,r4,r3,r2,r1  dimension sizes, passed to computeDataLength()/computeDimension()
+ * @param cmpBytes compressed input; this function never frees it
+ * @param cmpSize  number of bytes in cmpBytes
+ * @return status SUCCESSFUL (SZ_SCES) or not (other error codes)
+ * */
+int SZ_decompress_args_uint8(uint8_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+
+	/* Output budget offered to the lossless decompressor: 4 x the element count, */
+	/* raised to MIN_ZLIB_DEC_ALLOMEM_BYTES below when that is larger. */
+	size_t targetUncompressSize = dataLength <<2; //i.e., *4
+	/* Default: the length of a minimal stream, used when cmpBytes is parsed as-is. */
+	//tmpSize must be "much" smaller than dataLength
+	size_t tmpSize = 3+MetaDataByteLength+1+sizeof(uint8_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes = cmpBytes;
+	/* Non-zero only when szTmpBytes is a buffer produced by sz_lossless_decompress(). */
+	int ownsTmpBytes = 0;
+
+	if(cmpSize!=4+1+4+MetaDataByteLength && cmpSize!=4+1+8+MetaDataByteLength)
+	{
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
+		{
+			/* A lossless layer was detected: undo it into a separate buffer. */
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES;
+			//(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);
+			/* Stays 0 if the call left the pointer aimed at the caller's buffer. */
+			ownsTmpBytes = (szTmpBytes != cmpBytes);
+		}
+		else
+		{
+			/* No lossless layer: parse the caller's buffer in place. */
+			confparams_dec->szMode = SZ_BEST_SPEED;
+			tmpSize = cmpSize;
+		}
+	}
+	/* Otherwise cmpSize equals a minimal stream length and cmpBytes is parsed directly. */
+
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	int dim = computeDimension(r5,r4,r3,r2,r1);
+
+	if(tdps->isLossless)
+	{
+		/* Values follow the header verbatim. Elements are one byte wide, so the copy */
+		/* is the same on big- and little-endian hosts. */
+		*newData = (uint8_t*)malloc(sizeof(uint8_t)*dataLength);
+		memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*sizeof(uint8_t));
+	}
+	else if (dim == 1)
+		getSnapshotData_uint8_1D(newData,r1,tdps, errBoundMode);
+	else if (dim == 2)
+		getSnapshotData_uint8_2D(newData,r2,r1,tdps, errBoundMode);
+	else if (dim == 3)
+		getSnapshotData_uint8_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else if (dim == 4)
+		getSnapshotData_uint8_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+
+	free_TightDataPointStorageI2(tdps);
+	/* Release only memory allocated here. The previous test combined confparams_dec->szMode, */
+	/* which can be left over from an earlier call, with cmpSize, and could free cmpBytes. */
+	if(ownsTmpBytes)
+		free(szTmpBytes);
+	return status;
+}
+
+
+void decompressDataSeries_uint8_1D(uint8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps)
+{
+	/*
+	 * Rebuild a 1D uint8 series from tdps into a newly malloc'ed *data.
+	 * Each element has a Huffman-decoded code: 0 means the value is stored exactly
+	 * (rebased on tdps->minValue); any other code is an offset from the previous
+	 * reconstructed element, in steps of 2*realPrecision relative to intvRadius.
+	 */
+	updateQuantizationInfo(tdps->intervals);
+	const double step = tdps->realPrecision*2;
+
+	*data = (uint8_t*)malloc(sizeof(uint8_t)*dataSeriesLength);
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	/* Decode one code per element; the tree is only needed for this step. */
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	/* State for reading the exactly stored values. */
+	const uint8_t base = tdps->minValue;
+	const int exactWidth = tdps->exactByteSize;
+	unsigned char* exactCursor = tdps->exactDataBytes;
+	unsigned char scratch[8] = {0,0,0,0,0,0,0,0};
+
+	const int shift = computeRightShiftBits(exactWidth, SZ_UINT8);
+	if(shift<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(0);
+	}
+
+	size_t pos;
+	for (pos = 0; pos < dataSeriesLength; pos++)
+	{
+		const int code = codes[pos];
+		if (code == 0)
+		{
+			/* Exact value: read exactWidth bytes, keep the first one and shift it right. */
+			memcpy(scratch, exactCursor, exactWidth);
+			exactCursor += exactWidth;
+			const uint8_t stored = (uint8_t)(scratch[0] >> shift);
+			(*data)[pos] = stored + base;
+			continue;
+		}
+
+		/* Predicted value: the previous element plus the quantized offset, */
+		/* clamped to the range [SZ_UINT8_MIN, SZ_UINT8_MAX]. */
+		const long previous = (*data)[pos-1];
+		const long candidate = previous + (code-exe_params->intvRadius)*step;
+		if (candidate < SZ_UINT8_MIN)
+			(*data)[pos] = SZ_UINT8_MIN;
+		else if (candidate < SZ_UINT8_MAX)
+			(*data)[pos] = candidate;
+		else
+			(*data)[pos] = SZ_UINT8_MAX;
+	}
+
+	/* The code array is scratch space; only *data outlives this call. */
+	free(codes);
+	return;
+}
+
+void decompressDataSeries_uint8_2D(uint8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	/* Rebuilds an r1 x r2 uint8 array (row-major, r2 elements per row) from tdps into a newly malloc'ed *data. */
+	/* Element 0 is always read as an exact value; every other element is exact (code 0) or prediction + offset. */
+	size_t dataSeriesLength = r1*r2;
+	/* NOTE(review): type[1] is read unconditionally below, so at least two elements are assumed. */
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (uint8_t*)malloc(sizeof(uint8_t)*dataSeriesLength); /* NOTE(review): malloc results are not checked */
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); /* per-element codes; freed at the end */
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type); /* presumably fills type[] from typeArray */
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint8_t minValue, exactData;
+
+	minValue = tdps->minValue; /* added back to every exactly stored value */
+	
+	int exactByteSize = tdps->exactByteSize; /* bytes consumed per exactly stored value */
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; /* read cursor, advanced after each exact value */
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT8);	/* NOTE(review): not checked for < 0, unlike the 1D routine */
+	
+	long pred1D, pred2D, tmp;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0 (type[0] is not consulted) */
+
+	// recover the exact data: first stored byte, shifted right, rebased on minValue
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = curBytes[0];
+	exactData = (uint8_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1: predicted from data 0 alone */
+	int type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision; /* prediction + quantized offset; the double result is converted to long */
+		if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX) /* clamp to [SZ_UINT8_MIN, SZ_UINT8_MAX] */
+			(*data)[1] = tmp;
+		else if(tmp < SZ_UINT8_MIN)
+			(*data)[1] = SZ_UINT8_MIN;
+		else
+			(*data)[1] = SZ_UINT8_MAX;
+			
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = curBytes[0];
+		exactData = (uint8_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				/* linear extrapolation from the two previous elements */
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				(*data)[jj] = SZ_UINT8_MIN;
+			else
+				(*data)[jj] = SZ_UINT8_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		/* element directly above (previous row, same column) */
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				(*data)[index] = SZ_UINT8_MIN;
+			else
+				(*data)[index] = SZ_UINT8_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1]; /* left + above - above-left */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					(*data)[index] = SZ_UINT8_MIN;
+				else
+					(*data)[index] = SZ_UINT8_MAX;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void decompressDataSeries_uint8_3D(uint8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3; /* elements per layer; index = kk*r23 + ii*r3 + jj */
+/* Rebuilds an r1 x r2 x r3 uint8 array from tdps into a newly malloc'ed *data; element 0 is always read as an exact value. */
+	double realPrecision = tdps->realPrecision;
+	/* Every other element is either stored exactly (code 0) or is a prediction from already rebuilt neighbours plus a quantized offset. */
+	*data = (uint8_t*)malloc(sizeof(uint8_t)*dataSeriesLength); /* NOTE(review): malloc results are not checked */
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); /* per-element codes; freed at the end */
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type); /* presumably fills type[] from typeArray */
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint8_t minValue, exactData;
+
+	minValue = tdps->minValue; /* added back to every exactly stored value */
+	
+	int exactByteSize = tdps->exactByteSize; /* bytes consumed per exactly stored value */
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; /* read cursor, advanced after each exact value */
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT8);	/* NOTE(review): not checked for < 0, unlike the 1D routine */
+	
+	long pred1D, pred2D, pred3D, tmp;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0 (type[0] is not consulted) */
+
+	// recover the exact data: first stored byte, shifted right, rebased on minValue
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = curBytes[0];
+	exactData = (uint8_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1: predicted from data 0 alone */
+	pred1D = (*data)[0];
+	/* NOTE(review): type[1] is read unconditionally, so at least two elements are assumed. */
+	int type_ = type[1];
+	if (type_ != 0)
+	{
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision; /* prediction + quantized offset; the double result is converted to long */
+		if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX) /* clamp to [SZ_UINT8_MIN, SZ_UINT8_MAX] */
+			(*data)[1] = tmp;
+		else if(tmp < SZ_UINT8_MIN)
+			(*data)[1] = SZ_UINT8_MIN;
+		else
+			(*data)[1] = SZ_UINT8_MAX;
+	}
+	else
+	{
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = curBytes[0];
+		exactData = (uint8_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2]; /* linear extrapolation from the two previous elements */
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				(*data)[jj] = SZ_UINT8_MIN;
+			else
+				(*data)[jj] = SZ_UINT8_MAX;		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r3;
+		pred1D = (*data)[index-r3]; /* element directly above (previous row, same column) */
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				(*data)[index] = SZ_UINT8_MIN;
+			else
+				(*data)[index] = SZ_UINT8_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1]; /* left + above - above-left */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					(*data)[index] = SZ_UINT8_MIN;
+				else
+					(*data)[index] = SZ_UINT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;
+		pred1D = (*data)[index-r23]; /* same position in the previous layer */
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				(*data)[index] = SZ_UINT8_MIN;
+			else
+				(*data)[index] = SZ_UINT8_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1]; /* left + previous layer - previous layer's left */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					(*data)[index] = SZ_UINT8_MIN;
+				else
+					(*data)[index] = SZ_UINT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3]; /* above + previous layer - previous layer's above */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					(*data)[index] = SZ_UINT8_MIN;
+				else
+					(*data)[index] = SZ_UINT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1: seven-neighbour prediction across row, column and layer */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_UINT8_MIN)
+						(*data)[index] = SZ_UINT8_MIN;
+					else
+						(*data)[index] = SZ_UINT8_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = curBytes[0];
+					exactData = (uint8_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+
+void decompressDataSeries_uint8_4D(uint8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4; /* elements per outermost slice; index = ll*r234 + kk*r34 + ii*r4 + jj */
+	size_t r34 = r3*r4; /* elements per layer inside a slice */
+	/* Rebuilds an r1 x r2 x r3 x r4 uint8 array from tdps into a newly malloc'ed *data. */
+	double realPrecision = tdps->realPrecision;
+	/* Each of the r1 slices starts with an exact value and is predicted only from elements of the same slice. */
+	*data = (uint8_t*)malloc(sizeof(uint8_t)*dataSeriesLength); /* NOTE(review): malloc results are not checked */
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); /* per-element codes; freed at the end */
+	
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type); /* presumably fills type[] from typeArray */
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint8_t minValue, exactData;
+
+	minValue = tdps->minValue; /* added back to every exactly stored value */
+	
+	int exactByteSize = tdps->exactByteSize; /* bytes consumed per exactly stored value */
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; /* read cursor shared by all slices, advanced after each exact value */
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT8);	/* NOTE(review): not checked for < 0, unlike the 1D routine */
+	
+	int type_;
+
+	long pred1D, pred2D, pred3D, tmp;
+	size_t ii, jj, kk, ll;
+	size_t index;
+
+	for (ll = 0; ll < r1; ll++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0 (type[index] is not consulted here) */
+		index = ll*r234;
+		// recover the exact data: first stored byte, shifted right, rebased on minValue
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = curBytes[0];
+		exactData = (uint8_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1: predicted from data 0 alone */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision; /* prediction + quantized offset; the double result is converted to long */
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX) /* clamp to [SZ_UINT8_MIN, SZ_UINT8_MAX] */
+				(*data)[index] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				(*data)[index] = SZ_UINT8_MIN;
+			else
+				(*data)[index] = SZ_UINT8_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1 */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2]; /* linear extrapolation from the two previous elements */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					(*data)[index] = SZ_UINT8_MIN;
+				else
+					(*data)[index] = SZ_UINT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0 */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4]; /* element directly above (previous row, same column) */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					(*data)[index] = SZ_UINT8_MIN;
+				else
+					(*data)[index] = SZ_UINT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1*/
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1]; /* left + above - above-left */
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_UINT8_MIN)
+						(*data)[index] = SZ_UINT8_MIN;
+					else
+						(*data)[index] = SZ_UINT8_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = curBytes[0];
+					exactData = (uint8_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0*/
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34]; /* same position in the previous layer of this slice */
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					(*data)[index] = SZ_UINT8_MIN;
+				else
+					(*data)[index] = SZ_UINT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1]; /* left + previous layer - previous layer's left */
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_UINT8_MIN)
+						(*data)[index] = SZ_UINT8_MIN;
+					else
+						(*data)[index] = SZ_UINT8_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = curBytes[0];
+					exactData = (uint8_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0 */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4]; /* above + previous layer - previous layer's above */
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_UINT8_MIN)
+						(*data)[index] = SZ_UINT8_MIN;
+					else
+						(*data)[index] = SZ_UINT8_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = curBytes[0];
+					exactData = (uint8_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1: seven-neighbour prediction across row, column and layer */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+						if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+							(*data)[index] = tmp;
+						else if(tmp < SZ_UINT8_MIN)
+							(*data)[index] = SZ_UINT8_MIN;
+						else
+							(*data)[index] = SZ_UINT8_MAX;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = curBytes[0];
+						exactData = (uint8_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void getSnapshotData_uint8_1D(uint8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{	/* Materialize the dataSeriesLength uint8 values described by tdps in a newly malloc'ed *data; errBoundMode is not used. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint8_1D(data, dataSeriesLength, tdps);	/* general case: full reconstruction */
+		return;
+	}
+	/* Constant series: the single value is the first exact-data byte; replicate it. */
+	const uint8_t fill = tdps->exactDataBytes[0];
+	size_t pos = 0;
+	*data = (uint8_t*)malloc(sizeof(uint8_t)*dataSeriesLength);
+	while (pos < dataSeriesLength)
+		(*data)[pos++] = fill;
+}
+
+void getSnapshotData_uint8_2D(uint8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode)
+{	/* Materialize the r1*r2 uint8 values described by tdps in a newly malloc'ed *data; errBoundMode is not used. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint8_2D(data, r1, r2, tdps);	/* general case: full reconstruction */
+		return;
+	}
+	/* Constant series: the single value is the first exact-data byte; replicate it. */
+	const uint8_t fill = tdps->exactDataBytes[0];
+	size_t pos = 0, total = r1*r2;
+	*data = (uint8_t*)malloc(sizeof(uint8_t)*total);
+	while (pos < total)
+		(*data)[pos++] = fill;
+}
+
+void getSnapshotData_uint8_3D(uint8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{	/* Materialize the r1*r2*r3 uint8 values described by tdps in a newly malloc'ed *data; errBoundMode is not used. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint8_3D(data, r1, r2, r3, tdps);	/* general case: full reconstruction */
+		return;
+	}
+	/* Constant series: the single value is the first exact-data byte; replicate it. */
+	const uint8_t fill = tdps->exactDataBytes[0];
+	size_t pos = 0, total = r1*r2*r3;
+	*data = (uint8_t*)malloc(sizeof(uint8_t)*total);
+	while (pos < total)
+		(*data)[pos++] = fill;
+}
+
+void getSnapshotData_uint8_4D(uint8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{	/* Materialize the r1*r2*r3*r4 uint8 values described by tdps in a newly malloc'ed *data; errBoundMode is not used. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint8_4D(data, r1, r2, r3, r4, tdps);	/* general case: full reconstruction */
+		return;
+	}
+	/* Constant series: the single value is the first exact-data byte; replicate it. */
+	const uint8_t fill = tdps->exactDataBytes[0];
+	size_t pos = 0, total = r1*r2*r3*r4;
+	*data = (uint8_t*)malloc(sizeof(uint8_t)*total);
+	while (pos < total)
+		(*data)[pos++] = fill;
+}
diff --git a/deps/SZ/sz/src/szf.c b/deps/SZ/sz/src/szf.c
new file mode 100644
index 0000000000000000000000000000000000000000..a40dc38f8f6611a36e5908186ab6f6f04bcbe6f1
--- /dev/null
+++ b/deps/SZ/sz/src/szf.c
@@ -0,0 +1,570 @@
+/**
+ *  @file szf.c
+ *  @author Sheng Di
+ *  @date April, 2015
+ *  @brief the key C binding file to connect Fortran and C
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "sz.h"
+#include "szf.h"
+
+//special notice: all the function names in this file must be lowercase!!
+/* Fortran binding for SZ_Init(): the first *len characters of configFile are copied
+ * into a NUL-terminated local buffer, and the SZ_Init() result is stored in *ierr. */
+void sz_init_c_(char *configFile,int *len,int *ierr)
+{
+    const int pathLen = *len;
+    char path[pathLen+1];
+    int k = 0;
+    while(k < pathLen)
+    {
+        path[k] = configFile[k];
+        k++;
+    }
+    path[pathLen] = '\0';
+    *ierr = SZ_Init(path);
+}
+
+/* Fortran binding: forwards to SZ_Finalize(). */
+void sz_finalize_c_()
+{
+    SZ_Finalize();
+}
+
+//compress with config (without args in function)
+/* Fortran binding: SZ_compress() on float data with dims (0,0,0,0,*r1).
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d1_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress(SZ_FLOAT, data, outSize, 0, 0, 0, 0, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev() on float data with dims (0,0,0,0,*r1) and the given reservedValue.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d1_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev(SZ_FLOAT, data, reservedValue, outSize, 0, 0, 0, 0, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress() on float data with dims (0,0,0,*r2,*r1).
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d2_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress(SZ_FLOAT, data, outSize, 0, 0, 0, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev() on float data with dims (0,0,0,*r2,*r1) and the given reservedValue.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d2_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev(SZ_FLOAT, data, reservedValue, outSize, 0, 0, 0, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress() on float data with dims (0,0,*r3,*r2,*r1).
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d3_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress(SZ_FLOAT, data, outSize, 0, 0, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev() on float data with dims (0,0,*r3,*r2,*r1) and the given reservedValue.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d3_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev(SZ_FLOAT, data, reservedValue, outSize, 0, 0, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress() on float data with dims (0,*r4,*r3,*r2,*r1).
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d4_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress(SZ_FLOAT, data, outSize, 0, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev() on float data with dims (0,*r4,*r3,*r2,*r1) and the given reservedValue.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d4_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev(SZ_FLOAT, data, reservedValue, outSize, 0, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress() on float data with dims (*r5,*r4,*r3,*r2,*r1).
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d5_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress(SZ_FLOAT, data, outSize, *r5, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev() on float data with dims (*r5,*r4,*r3,*r2,*r1) and the given reservedValue.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d5_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev(SZ_FLOAT, data, reservedValue, outSize, *r5, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress() on double data with dims (0,0,0,0,*r1).
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d1_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress(SZ_DOUBLE, data, outSize, 0, 0, 0, 0, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev() on double data with dims (0,0,0,0,*r1) and the given reservedValue.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d1_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev(SZ_DOUBLE, data, reservedValue, outSize, 0, 0, 0, 0, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress() on double data with dims (0,0,0,*r2,*r1).
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d2_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress(SZ_DOUBLE, data, outSize, 0, 0, 0, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev() on double data with dims (0,0,0,*r2,*r1) and the given reservedValue.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d2_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev(SZ_DOUBLE, data, reservedValue, outSize, 0, 0, 0, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress() on double data with dims (0,0,*r3,*r2,*r1).
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d3_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress(SZ_DOUBLE, data, outSize, 0, 0, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev() on double data with dims (0,0,*r3,*r2,*r1) and the given reservedValue.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d3_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev(SZ_DOUBLE, data, reservedValue, outSize, 0, 0, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress() on double data with dims (0,*r4,*r3,*r2,*r1).
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d4_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress(SZ_DOUBLE, data, outSize, 0, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev() on double data with dims (0,*r4,*r3,*r2,*r1) and the given reservedValue.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d4_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev(SZ_DOUBLE, data, reservedValue, outSize, 0, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress() on double data with dims (*r5,*r4,*r3,*r2,*r1).
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d5_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress(SZ_DOUBLE, data, outSize, *r5, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev() on double data with dims (*r5,*r4,*r3,*r2,*r1) and the given reservedValue.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d5_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev(SZ_DOUBLE, data, reservedValue, outSize, *r5, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+//compress with args
+
+/* Fortran binding: SZ_compress_args() on float data with dims (0,0,0,0,*r1), using the caller's
+ * error-bound mode and bounds; the literal 0.1 is passed as the argument after relBoundRatio.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d1_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, 0, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_args() on float data with dims (0,0,0,*r2,*r1), using the caller's
+ * error-bound mode and bounds; the literal 0.1 is passed as the argument after relBoundRatio.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d2_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_args() on float data with dims (0,0,*r3,*r2,*r1), using the caller's
+ * error-bound mode and bounds; the literal 0.1 is passed as the argument after relBoundRatio.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d3_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_args() on float data with dims (0,*r4,*r3,*r2,*r1), using the caller's
+ * error-bound mode and bounds; the literal 0.1 is passed as the argument after relBoundRatio.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d4_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_args() on float data with dims (*r5,*r4,*r3,*r2,*r1), using the caller's
+ * error-bound mode and bounds; the literal 0.1 is passed as the argument after relBoundRatio.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d5_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, *r5, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_args() on double data with dims (0,0,0,0,*r1), using the caller's
+ * error-bound mode and bounds; the literal 0.1 is passed as the argument after relBoundRatio.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d1_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, 0, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_args() on double data with dims (0,0,0,*r2,*r1), using the caller's
+ * error-bound mode and bounds; the literal 0.1 is passed as the argument after relBoundRatio.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d2_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_args() on double data with dims (0,0,*r3,*r2,*r1), using the caller's
+ * error-bound mode and bounds; the literal 0.1 is passed as the argument after relBoundRatio.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d3_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_args() on double data with dims (0,*r4,*r3,*r2,*r1), using the caller's
+ * error-bound mode and bounds; the literal 0.1 is passed as the argument after relBoundRatio.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d4_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_args() on double data with dims (*r5,*r4,*r3,*r2,*r1), using the caller's
+ * error-bound mode and bounds; the literal 0.1 is passed as the argument after relBoundRatio.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d5_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, *r5, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+//--------------
+
+/* Fortran binding: SZ_compress_rev_args() on float data with dims (0,0,0,0,*r1), the given
+ * reservedValue and the caller's error-bound settings.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d1_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev_args(SZ_FLOAT, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, 0, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev_args() on float data with dims (0,0,0,*r2,*r1), the given
+ * reservedValue and the caller's error-bound settings.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d2_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev_args(SZ_FLOAT, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev_args() on float data with dims (0,0,*r3,*r2,*r1), the given
+ * reservedValue and the caller's error-bound settings.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d3_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev_args(SZ_FLOAT, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev_args() on float data with dims (0,*r4,*r3,*r2,*r1), the given
+ * reservedValue and the caller's error-bound settings.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d4_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev_args(SZ_FLOAT, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev_args() on float data with dims (*r5,*r4,*r3,*r2,*r1), the given
+ * reservedValue and the caller's error-bound settings.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d5_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev_args(SZ_FLOAT, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, *r5, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev_args() on double data with dims (0,0,0,0,*r1), the given
+ * reservedValue and the caller's error-bound settings.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed.
+ * NOTE(review): reservedValue is float* here although the data is double and the
+ * d4/d5 double variants take double* - confirm which type callers really pass. */
+void sz_compress_d1_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev_args(SZ_DOUBLE, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, 0, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev_args() on double data with dims (0,0,0,*r2,*r1), the given
+ * reservedValue and the caller's error-bound settings.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed.
+ * NOTE(review): reservedValue is float* here although the data is double and the
+ * d4/d5 double variants take double* - confirm which type callers really pass. */
+void sz_compress_d2_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev_args(SZ_DOUBLE, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev_args() on double data with dims (0,0,*r3,*r2,*r1), the given
+ * reservedValue and the caller's error-bound settings.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed.
+ * NOTE(review): reservedValue is float* here although the data is double and the
+ * d4/d5 double variants take double* - confirm which type callers really pass. */
+void sz_compress_d3_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *tmp_bytes = SZ_compress_rev_args(SZ_DOUBLE, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, *r3, *r2, *r1);
+	memcpy(bytes, tmp_bytes, *outSize);
+	/* release the compressor's buffer, as every other wrapper in this file does;
+	 * it was previously leaked on each call */
+	free(tmp_bytes);
+}
+
+/* Fortran binding: SZ_compress_rev_args() on double data with dims (0,*r4,*r3,*r2,*r1), the given
+ * reservedValue and the caller's error-bound settings.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d4_double_rev_args_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev_args(SZ_DOUBLE, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+/* Fortran binding: SZ_compress_rev_args() on double data with dims (*r5,*r4,*r3,*r2,*r1), the given
+ * reservedValue and the caller's error-bound settings.
+ * The *outSize compressed bytes are copied into bytes and the temporary is freed. */
+void sz_compress_d5_double_rev_args_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed;
+
+	packed = SZ_compress_rev_args(SZ_DOUBLE, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, *r5, *r4, *r3, *r2, *r1);
+	memcpy(bytes, packed, *outSize);
+	free(packed);
+}
+
+//decompress
+
+/* Fortran binding: SZ_decompress() of *byteLength bytes as float with dims (0,0,0,0,*r1).
+ * (*r1) floats are copied into data and the temporary array is freed. */
+void sz_decompress_d1_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1)
+{
+	float *unpacked;
+
+	unpacked = SZ_decompress(SZ_FLOAT, bytes, *byteLength, 0, 0, 0, 0, *r1);
+	memcpy(data, unpacked, (*r1)*sizeof(float));
+	free(unpacked);
+}
+
+/* Fortran binding: SZ_decompress() of *byteLength bytes as float with dims (0,0,0,*r2,*r1).
+ * (*r1)*(*r2) floats are copied into data and the temporary array is freed. */
+void sz_decompress_d2_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2)
+{
+	float *unpacked = SZ_decompress(SZ_FLOAT, bytes, *byteLength, 0, 0, 0, *r2, *r1);
+	const size_t nElems = (*r1)*(*r2);
+
+	memcpy(data, unpacked, nElems*sizeof(float));
+	free(unpacked);
+}
+
+/* Fortran binding: SZ_decompress() of *byteLength bytes as float with dims (0,0,*r3,*r2,*r1).
+ * (*r1)*(*r2)*(*r3) floats are copied into data and the temporary array is freed. */
+void sz_decompress_d3_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3)
+{
+	float *unpacked = SZ_decompress(SZ_FLOAT, bytes, *byteLength, 0, 0, *r3, *r2, *r1);
+	const size_t nElems = (*r1)*(*r2)*(*r3);
+
+	memcpy(data, unpacked, nElems*sizeof(float));
+	free(unpacked);
+}
+
+/* Fortran binding: SZ_decompress() of *byteLength bytes as float with dims (0,*r4,*r3,*r2,*r1).
+ * (*r1)*(*r2)*(*r3)*(*r4) floats are copied into data and the temporary array is freed. */
+void sz_decompress_d4_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	float *unpacked = SZ_decompress(SZ_FLOAT, bytes, *byteLength, 0, *r4, *r3, *r2, *r1);
+	const size_t nElems = (*r1)*(*r2)*(*r3)*(*r4);
+
+	memcpy(data, unpacked, nElems*sizeof(float));
+	free(unpacked);
+}
+
+/* Fortran binding: SZ_decompress() of *byteLength bytes as float with dims (*r5,*r4,*r3,*r2,*r1).
+ * (*r1)*(*r2)*(*r3)*(*r4)*(*r5) floats are copied into data and the temporary array is freed. */
+void sz_decompress_d5_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	float *unpacked = SZ_decompress(SZ_FLOAT, bytes, *byteLength, *r5, *r4, *r3, *r2, *r1);
+	const size_t nElems = (*r1)*(*r2)*(*r3)*(*r4)*(*r5);
+
+	memcpy(data, unpacked, nElems*sizeof(float));
+	free(unpacked);
+}
+
+/* Fortran binding: SZ_decompress() of *byteLength bytes as double with dims (0,0,0,0,*r1).
+ * (*r1) doubles are copied into data and the temporary array is freed. */
+void sz_decompress_d1_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1)
+{
+	double *unpacked;
+
+	unpacked = SZ_decompress(SZ_DOUBLE, bytes, *byteLength, 0, 0, 0, 0, *r1);
+	memcpy(data, unpacked, (*r1)*sizeof(double));
+	free(unpacked);
+}
+
+/* Fortran binding: SZ_decompress() of *byteLength bytes as double with dims (0,0,0,*r2,*r1).
+ * (*r1)*(*r2) doubles are copied into data and the temporary array is freed. */
+void sz_decompress_d2_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2)
+{
+	double *unpacked = SZ_decompress(SZ_DOUBLE, bytes, *byteLength, 0, 0, 0, *r2, *r1);
+	const size_t nElems = (*r1)*(*r2);
+
+	memcpy(data, unpacked, nElems*sizeof(double));
+	free(unpacked);
+}
+
+/* Fortran binding: SZ_decompress() of *byteLength bytes as double with dims (0,0,*r3,*r2,*r1).
+ * (*r1)*(*r2)*(*r3) doubles are copied into data and the temporary array is freed. */
+void sz_decompress_d3_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3)
+{
+	double *unpacked = SZ_decompress(SZ_DOUBLE, bytes, *byteLength, 0, 0, *r3, *r2, *r1);
+	const size_t nElems = (*r1)*(*r2)*(*r3);
+
+	memcpy(data, unpacked, nElems*sizeof(double));
+	free(unpacked);
+}
+
+/* Fortran binding: SZ_decompress() of *byteLength bytes as double with dims (0,*r4,*r3,*r2,*r1).
+ * (*r1)*(*r2)*(*r3)*(*r4) doubles are copied into data and the temporary array is freed. */
+void sz_decompress_d4_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	double *unpacked = SZ_decompress(SZ_DOUBLE, bytes, *byteLength, 0, *r4, *r3, *r2, *r1);
+	const size_t nElems = (*r1)*(*r2)*(*r3)*(*r4);
+
+	memcpy(data, unpacked, nElems*sizeof(double));
+	free(unpacked);
+}
+
+/* Fortran binding: SZ_decompress() of *byteLength bytes as double with dims (*r5,*r4,*r3,*r2,*r1).
+ * (*r1)*(*r2)*(*r3)*(*r4)*(*r5) doubles are copied into data and the temporary array is freed. */
+void sz_decompress_d5_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	double *unpacked = SZ_decompress(SZ_DOUBLE, bytes, *byteLength, *r5, *r4, *r3, *r2, *r1);
+	const size_t nElems = (*r1)*(*r2)*(*r3)*(*r4)*(*r5);
+
+	memcpy(data, unpacked, nElems*sizeof(double));
+	free(unpacked);
+}
+
+//-----------------TODO: batch mode-----------
+/* Fortran binding for SZ_batchAddVar() with type SZ_FLOAT and dims (0,0,0,0,*r1).
+ * The first *len characters of varName are copied into a NUL-terminated local name;
+ * the literal 0.1 is passed as the argument after relBoundRatio. */
+void sz_batchaddvar_d1_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1)
+{
+	const int nameLen = *len;
+	char name[nameLen+1];
+	int k;
+	for(k = 0; k < nameLen; k++)
+		name[k] = varName[k];
+	name[nameLen] = '\0';
+	SZ_batchAddVar(var_id, name, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, 0, *r1);
+}
+/* Fortran binding for SZ_batchAddVar() with type SZ_FLOAT and dims (0,0,0,*r2,*r1).
+ * The first *len characters of varName are copied into a NUL-terminated local name;
+ * the literal 0.1 is passed as the argument after relBoundRatio. */
+void sz_batchaddvar_d2_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2)
+{
+	const int nameLen = *len;
+	char name[nameLen+1];
+	int k;
+	for(k = 0; k < nameLen; k++)
+		name[k] = varName[k];
+	name[nameLen] = '\0';
+	SZ_batchAddVar(var_id, name, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, *r2, *r1);
+}
+/* Fortran binding for SZ_batchAddVar() with type SZ_FLOAT and dims (0,0,*r3,*r2,*r1).
+ * The first *len characters of varName are copied into a NUL-terminated local name;
+ * the literal 0.1 is passed as the argument after relBoundRatio. */
+void sz_batchaddvar_d3_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
+{
+	const int nameLen = *len;
+	char name[nameLen+1];
+	int k;
+	for(k = 0; k < nameLen; k++)
+		name[k] = varName[k];
+	name[nameLen] = '\0';
+	SZ_batchAddVar(var_id, name, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, *r3, *r2, *r1);
+}
+/* Fortran binding for SZ_batchAddVar() with type SZ_FLOAT and dims (0,*r4,*r3,*r2,*r1).
+ * The first *len characters of varName are copied into a NUL-terminated local name;
+ * the literal 0.1 is passed as the argument after relBoundRatio. */
+void sz_batchaddvar_d4_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	const int nameLen = *len;
+	char name[nameLen+1];
+	int k;
+	for(k = 0; k < nameLen; k++)
+		name[k] = varName[k];
+	name[nameLen] = '\0';
+	SZ_batchAddVar(var_id, name, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, *r4, *r3, *r2, *r1);
+}
+/* Fortran binding for SZ_batchAddVar() with type SZ_FLOAT and dims (*r5,*r4,*r3,*r2,*r1).
+ * The first *len characters of varName are copied into a NUL-terminated local name;
+ * the literal 0.1 is passed as the argument after relBoundRatio. */
+void sz_batchaddvar_d5_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	const int nameLen = *len;
+	char name[nameLen+1];
+	int k;
+	for(k = 0; k < nameLen; k++)
+		name[k] = varName[k];
+	name[nameLen] = '\0';
+	SZ_batchAddVar(var_id, name, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, *r5, *r4, *r3, *r2, *r1);
+}
+/* Fortran binding for SZ_batchAddVar() with type SZ_DOUBLE and dims (0,0,0,0,*r1).
+ * The first *len characters of varName are copied into a NUL-terminated local name;
+ * the literal 0.1 is passed as the argument after relBoundRatio. */
+void sz_batchaddvar_d1_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1)
+{
+	const int nameLen = *len;
+	char name[nameLen+1];
+	int k;
+	for(k = 0; k < nameLen; k++)
+		name[k] = varName[k];
+	name[nameLen] = '\0';
+	SZ_batchAddVar(var_id, name, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, 0, *r1);
+}
+/* Fortran binding for SZ_batchAddVar() with type SZ_DOUBLE and dims (0,0,0,*r2,*r1).
+ * The first *len characters of varName are copied into a NUL-terminated local name;
+ * the literal 0.1 is passed as the argument after relBoundRatio. */
+void sz_batchaddvar_d2_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2)
+{
+	const int nameLen = *len;
+	char name[nameLen+1];
+	int k;
+	for(k = 0; k < nameLen; k++)
+		name[k] = varName[k];
+	name[nameLen] = '\0';
+	SZ_batchAddVar(var_id, name, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, *r2, *r1);
+}
+/* Fortran binding for SZ_batchAddVar() with type SZ_DOUBLE and dims (0,0,*r3,*r2,*r1).
+ * The first *len characters of varName are copied into a NUL-terminated local name;
+ * the literal 0.1 is passed as the argument after relBoundRatio. */
+void sz_batchaddvar_d3_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
+{
+	const int nameLen = *len;
+	char name[nameLen+1];
+	int k;
+	for(k = 0; k < nameLen; k++)
+		name[k] = varName[k];
+	name[nameLen] = '\0';
+	SZ_batchAddVar(var_id, name, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, *r3, *r2, *r1);
+}
+/* Fortran binding for SZ_batchAddVar() with type SZ_DOUBLE and dims (0,*r4,*r3,*r2,*r1).
+ * The first *len characters of varName are copied into a NUL-terminated local name;
+ * the literal 0.1 is passed as the argument after relBoundRatio. */
+void sz_batchaddvar_d4_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	const int nameLen = *len;
+	char name[nameLen+1];
+	int k;
+	for(k = 0; k < nameLen; k++)
+		name[k] = varName[k];
+	name[nameLen] = '\0';
+	SZ_batchAddVar(var_id, name, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, *r4, *r3, *r2, *r1);
+}
+/* Fortran binding for SZ_batchAddVar() with type SZ_DOUBLE and dims (*r5,*r4,*r3,*r2,*r1).
+ * The first *len characters of varName are copied into a NUL-terminated local name;
+ * the literal 0.1 is passed as the argument after relBoundRatio. */
+void sz_batchaddvar_d5_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	const int nameLen = *len;
+	char name[nameLen+1];
+	int k;
+	for(k = 0; k < nameLen; k++)
+		name[k] = varName[k];
+	name[nameLen] = '\0';
+	SZ_batchAddVar(var_id, name, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, *r5, *r4, *r3, *r2, *r1);
+}
+/* Fortran binding for SZ_batchDelVar(): the first *len characters of varName are copied
+ * into a NUL-terminated local name and the call's result is stored in *errState. */
+void sz_batchdelvar_c_(char* varName, int *len, int *errState)
+{
+	const int nameLen = *len;
+	char name[nameLen+1];
+	int k;
+	for(k = 0; k < nameLen; k++)
+		name[k] = varName[k];
+	name[nameLen] = '\0';
+	*errState = SZ_batchDelVar(name);
+}
+
+/*@deprecated*/
+/* Deprecated stub: the body is entirely commented out, so calling this is a no-op and
+ * neither bytes nor *outSize is written. The commented lines show the former
+ * SZ_batch_compress()-based implementation. */
+void sz_batch_compress_c_(unsigned char* bytes, size_t *outSize)
+{
+	//unsigned char* tmp_bytes = SZ_batch_compress(outSize);
+	//memcpy(bytes, tmp_bytes, *outSize);
+	//free(tmp_bytes);
+}
+/*@deprecated*/
+/* Deprecated stub: the body is commented out, so calling this is a no-op and *ierr is
+ * left untouched. The commented line shows the former SZ_batch_decompress() call. */
+void sz_batch_decompress_c_(unsigned char* bytes, size_t *byteLength, int *ierr)
+{
+	//SZ_batch_decompress(bytes, *byteLength, ierr);
+}
+
+/* Fortran binding: look up the variable named by the first *len characters of varName via
+ * SZ_getVarData() (which fills *r5..*r1; its return value is ignored here) and store
+ * computeDimension() of those sizes in *dim. */
+void sz_getvardim_c_(char* varName, int *len, int *dim, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	const int nameLen = *len;
+	char name[nameLen+1];
+	int k;
+	for(k = 0; k < nameLen; k++)
+		name[k] = varName[k];
+	name[nameLen] = '\0';
+
+	SZ_getVarData(name, r5, r4, r3, r2, r1);
+	*dim = computeDimension(*r5, *r4, *r3, *r2, *r1);
+}
+
+/* Fortran binding: store the result of compute_total_batch_size() in *totalSize. */
+void compute_total_batch_size_c_(size_t *totalSize)
+{
+	const size_t total = compute_total_batch_size();
+	*totalSize = total;
+}
+
+/* Fortran binding: fetch the float data of the variable named by the first *len characters
+ * of varName through SZ_getVarData(), copy computeDataLength(r5..r1) floats into data and
+ * free the pointer SZ_getVarData() returned.
+ * The element count is kept in a size_t: storing it in an int truncated large lengths
+ * (and could turn negative) before it was multiplied into the memcpy size. */
+void sz_getvardata_float_(char* varName, int *len, float* data)
+{
+	int i;
+	size_t r1, r2, r3, r4, r5;
+    char s2[*len+1];
+    for(i=0;i<*len;i++)
+        s2[i]=varName[i];
+    s2[*len]='\0';
+
+	float* tmp_data = (float*)SZ_getVarData(s2, &r5, &r4, &r3, &r2, &r1);
+	size_t size = computeDataLength(r5, r4, r3, r2, r1);
+	memcpy(data, tmp_data, size*sizeof(float));
+	free(tmp_data);
+}
+/* Fortran binding: fetch the double data of the variable named by the first *len characters
+ * of varName through SZ_getVarData() and copy computeDataLength(r5..r1) doubles into data.
+ * The element count is kept in a size_t: storing it in an int truncated large lengths
+ * (and could turn negative) before it was multiplied into the memcpy size.
+ * NOTE(review): unlike sz_getvardata_float_, the returned pointer is deliberately not
+ * freed here (the free is commented out) - confirm who owns the SZ_getVarData() result. */
+void sz_getvardata_double_(char* varName, int *len, double* data)
+{
+	int i;
+	size_t r1, r2, r3, r4, r5;
+    char s2[*len+1];
+    for(i=0;i<*len;i++)
+        s2[i]=varName[i];
+    s2[*len]='\0';
+
+	double* tmp_data = (double*)SZ_getVarData(s2, &r5, &r4, &r3, &r2, &r1);
+	size_t size = computeDataLength(r5, r4, r3, r2, r1);
+	memcpy(data, tmp_data, size*sizeof(double));
+	//free(tmp_data);
+}
+
+/* Fortran binding: forwards the dereferenced mode to SZ_freeVarSet(). */
+void sz_freevarset_c_(int *mode)
+{
+	const int freeMode = *mode;
+	SZ_freeVarSet(freeMode);
+}
+
diff --git a/deps/SZ/sz/src/utility.c b/deps/SZ/sz/src/utility.c
new file mode 100644
index 0000000000000000000000000000000000000000..64788522064cdc724b6d60c3f0ec60583e790d61
--- /dev/null
+++ b/deps/SZ/sz/src/utility.c
@@ -0,0 +1,652 @@
+/**
+ *  @file utility.c
+ *  @author Sheng Di, Sihuan Li
+ *  @date Aug, 2018
+ *  @brief 
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "utility.h"
+#include "sz.h"
+#include "callZlib.h"
+#include "zstd.h"
+
+/**
+ * qsort() comparator ordering struct sort_ast_particle entries by ascending id.
+ * Returns -1, 0 or 1. The ids are compared instead of subtracted: the difference of
+ * two wide ids does not fit the int return value, so the truncated subtraction could
+ * report the wrong sign (or 0 for distinct ids) and corrupt the sort order.
+ */
+int compare_struct(const void* obj1, const void* obj2){
+	const struct sort_ast_particle * srt1 = (const struct sort_ast_particle*)obj1;
+	const struct sort_ast_particle * srt2 = (const struct sort_ast_particle*)obj2;
+	return (srt1->id > srt2->id) - (srt1->id < srt2->id);
+}
+
+/**
+ * Sort the first seven variables of vset in place by particle id.
+ * The seventh variable (v[6]) is read as an int64_t id array and the first six as float
+ * arrays of the same length (taken from v[0]'s dimensions). Each (id, 6 floats) tuple is
+ * packed into a temporary array, sorted with qsort()/compare_struct, and written back.
+ * The element index is a size_t so lengths beyond INT_MAX are handled, an empty data set
+ * returns early, and an allocation failure is reported instead of dereferencing NULL.
+ * NOTE(review): assumes the list holds at least seven variables - confirm with callers.
+ */
+void reorder_vars(SZ_VarSet* vset){
+	SZ_Variable* v[7];
+	SZ_Variable* v_tmp;
+	size_t i;
+	int j;
+	//collect the first seven variables of the linked list
+	for (v_tmp = vset->header->next, j = 0; j < 7; j++){
+		v[j] = v_tmp;
+		v_tmp = v_tmp->next;
+	}
+	size_t dataLen = computeDataLength(v[0]->r5, v[0]->r4, v[0]->r3, v[0]->r2, v[0]->r1);
+	if (dataLen == 0)
+		return; //nothing to sort
+
+	struct sort_ast_particle* particle = (struct sort_ast_particle*) malloc(sizeof(struct sort_ast_particle)*dataLen);
+	if (particle == NULL){
+		printf("Error: failed to allocate the sorting buffer in reorder_vars()\n");
+		return;
+	}
+
+	//gather: one record per particle
+	for (i = 0; i < dataLen; i++){
+		particle[i].id = ((int64_t*)v[6]->data)[i];
+		for (j = 0; j < 6; j++)
+			particle[i].var[j] = ((float*)v[j]->data)[i];
+	}
+
+	qsort(particle, dataLen, sizeof(struct sort_ast_particle), compare_struct);
+
+	//scatter the sorted records back into the variables
+	for (i = 0; i < dataLen; i++){
+		((int64_t*)v[6]->data)[i] = particle[i].id;
+		for (j = 0; j < 6; j++)
+			((float*)v[j]->data)[i] = particle[i].var[j];
+	}
+	free(particle);
+}
+
+/**
+ * Intersect the id list preIndex (preLen entries) with the ids of the current variable set
+ * and move the matching particles to the front of the current arrays.
+ * The first seven variables of curVar are used: v[6] is read as the int64_t id array and
+ * v[0..5] as float arrays, each with dataLen entries.
+ * bitarray[0..preLen) is first set to '0'; an entry becomes '1' when its preIndex id is
+ * smaller than the current id under inspection (i.e. it is skipped without a match).
+ * Prints a summary line and returns the number of matching ids.
+ * NOTE(review): the two-cursor walk presumably relies on both id lists being sorted
+ * ascending - confirm against the callers.
+ */
+size_t intersectAndsort(int64_t* preIndex, size_t preLen, SZ_VarSet* curVar, size_t dataLen, unsigned char* bitarray){
+	size_t i, j, k, m, cnt;
+	i = j = k = m = cnt = 0;
+	SZ_Variable* v[7];
+	SZ_Variable* v_tmp;
+	//v[0]
+	for (v_tmp = curVar->header->next, i = 0; i < 7; i++){
+		v[i] = v_tmp;
+		v_tmp = v_tmp->next;
+	}
+	for (i = 0; i < preLen; i++)
+		bitarray[i] = '0';
+	i = 0;
+	// i walks preIndex, j walks the current ids, k is the next slot for a matched particle
+	while(i < preLen && j < dataLen){
+		if (preIndex[i] == ((int64_t*)v[6]->data)[j]){
+			cnt++;
+			// swap the matched particle (id and its six float values) from slot j into slot k
+			int64_t tmp;
+			tmp = ((int64_t*)v[6]->data)[k];
+			((int64_t*)v[6]->data)[k] = ((int64_t*)v[6]->data)[j];
+			((int64_t*)v[6]->data)[j] = tmp;
+			float data_tmp;
+			for (m = 0; m < 6; m++){
+				data_tmp = ((float*)v[m]->data)[k];
+				((float*)v[m]->data)[k] = ((float*)v[m]->data)[j];
+				((float*)v[m]->data)[j] = data_tmp;
+			}
+			k++; i++; j++;
+		}
+		else if (preIndex[i] < ((int64_t*)v[6]->data)[j]){
+			// previous id has no counterpart at the current position: flag it
+			bitarray[i] = '1';
+			i++;
+		}
+		else j++;
+	}
+	// entries of bitarray past the final i keep their initial '0'
+	printf("intersect count is: %zu, i j k pre curlen is: %zu, %zu, %zu, %zu, %zu\n\n", cnt, i, j, k, preLen, dataLen);
+	return cnt;
+}
+
+/**
+ * Write the first six variables of curVar to files, one file per variable.
+ * Each variable's data is treated as a float array of dataLen entries and handed to
+ * writeFloatData_inBytes() under the name "reordered_input_<currentStep>_<index>.in",
+ * where currentStep comes from the global sz_tsc. The write status is not inspected.
+ */
+void write_reordered_tofile(SZ_VarSet* curVar, size_t dataLen){
+	SZ_Variable* vars[7];
+	SZ_Variable* node = curVar->header->next;
+	char fileName[256];
+	int idx;
+
+	/* collect the first six list nodes */
+	for (idx = 0; idx < 6; idx++){
+		vars[idx] = node;
+		node = node->next;
+	}
+
+	/* idx: 0 for x, 1 for y..., 3 for vx...5 for vz */
+	for (idx = 0; idx < 6; idx++){
+		int writeStatus;
+		sprintf(fileName, "reordered_input_%d_%d.in", sz_tsc->currentStep, idx);
+		writeFloatData_inBytes((float*)vars[idx]->data, dataLen, fileName, &writeStatus);
+	}
+}
+
+/**
+ * Estimate a time step from the "x" and "vx" variables of the global sz_varset.
+ * Computes sum(vx0[j]*(x1[i]-x0[j])) / sum(vx0[j]*vx0[j]) over size entries, where x1 is the
+ * current "x" data and x0/vx0 are the hist_data of "x" and "vx". Index j skips entries
+ * whose sz_tsc->bit_array flag is '1'. Prints the value and returns it as a float.
+ * Note the local naming: denom accumulates the numerator and div the divisor.
+ * NOTE(review): the name searches do not stop at the end of the list, and "vx" is only
+ * searched from the "x" node onward - presumably both always exist in that order; confirm.
+ */
+float calculate_delta_t(size_t size){
+	SZ_Variable* v_tmp = sz_varset->header->next;
+	while(strcmp(v_tmp->varName, "x")) v_tmp = v_tmp->next;
+	float* x1 = (float*) v_tmp->data;
+	float* x0 = (float*) v_tmp->multisteps->hist_data;
+	while(strcmp(v_tmp->varName, "vx")) v_tmp = v_tmp->next;
+	float* vx0 = (float*) v_tmp->multisteps->hist_data;
+	int i, j;
+	double denom = 0.0;
+	double div = 0.0;
+	for (i = 0, j = 0; i < size; i++, j++){
+		// advance j past history entries flagged '1' in the bit array
+		while(sz_tsc->bit_array[j] == '1') j++;
+		denom += vx0[j] * (x1[i] - x0[j]);
+		div   += vx0[j] * vx0[j];
+	}
+	printf("the calculated delta_t is: %.10f\n", denom/div);
+	return denom/div;
+}
+
+/**
+ * Guess which lossless compressor produced compressedBytes (cmpSize bytes long).
+ * Returns ZSTD_COMPRESSOR when zstd reports a frame content size for the buffer,
+ * GZIP_COMPRESSOR when the first two bytes pass isZlibFormat(), and -1 otherwise.
+ * A NULL buffer or one shorter than two bytes cannot carry either header and yields -1;
+ * without this guard the zlib check read compressedBytes[0..1] past the end of the input.
+ */
+int is_lossless_compressed_data(unsigned char* compressedBytes, size_t cmpSize)
+{
+	if(compressedBytes == NULL || cmpSize < 2)
+		return -1;
+
+#if ZSTD_VERSION_NUMBER >= 10300
+	unsigned long long frameContentSize = ZSTD_getFrameContentSize(compressedBytes, cmpSize);
+	if(frameContentSize != ZSTD_CONTENTSIZE_ERROR)
+		return ZSTD_COMPRESSOR;
+#else
+	//older zstd: 0 means the size is unknown or the input is not a valid frame
+	unsigned long long frameContentSize = ZSTD_getDecompressedSize(compressedBytes, cmpSize);
+	if(frameContentSize != 0)
+		return ZSTD_COMPRESSOR;
+#endif
+	int flag = isZlibFormat(compressedBytes[0], compressedBytes[1]);
+	if(flag)
+		return GZIP_COMPRESSOR;
+
+	return -1; //fast mode (without GZIP or ZSTD)
+}
+
+/**
+ * Compress dataLength bytes of data with the selected lossless compressor.
+ * GZIP_COMPRESSOR delegates to zlib_compress5(); ZSTD_COMPRESSOR allocates *compressBytes
+ * (200 bytes for inputs under 100 bytes, otherwise 1.2x the input) and calls ZSTD_compress().
+ * Returns the value reported by the compressor, or 0 after printing an error for an
+ * unknown compressor id. showme() is invoked first on every call.
+ * NOTE(review): the ZSTD_compress() result is returned without a ZSTD_isError() check.
+ */
+unsigned long sz_lossless_compress(int losslessCompressor, int level, unsigned char* data, unsigned long dataLength, unsigned char** compressBytes)
+{
+	unsigned long written = 0;
+
+	showme();
+	if(losslessCompressor == GZIP_COMPRESSOR)
+	{
+		written = zlib_compress5(data, dataLength, compressBytes, level);
+	}
+	else if(losslessCompressor == ZSTD_COMPRESSOR)
+	{
+		size_t capacity = 0;
+		if(dataLength >= 100)
+			capacity = dataLength*1.2;
+		else
+			capacity = 200;
+		*compressBytes = (unsigned char*)malloc(capacity);
+		written = ZSTD_compress(*compressBytes, capacity, data, dataLength, level); //default setting of level is 3
+	}
+	else
+	{
+		printf("Error: Unrecognized lossless compressor in sz_lossless_compress()\n");
+	}
+	return written;
+}
+
+/**
+ * Decompress cmpSize bytes with the selected lossless compressor into *oriData.
+ * GZIP_COMPRESSOR delegates to zlib_uncompress5() and returns its result; ZSTD_COMPRESSOR
+ * allocates targetOriSize bytes, calls ZSTD_decompress() and returns targetOriSize.
+ * An unknown compressor id prints an error and returns 0.
+ * NOTE(review): the ZSTD_decompress() return value is not inspected.
+ */
+unsigned long sz_lossless_decompress(int losslessCompressor, unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+	unsigned long produced = 0;
+
+	if(losslessCompressor == GZIP_COMPRESSOR)
+	{
+		produced = zlib_uncompress5(compressBytes, cmpSize, oriData, targetOriSize);
+	}
+	else if(losslessCompressor == ZSTD_COMPRESSOR)
+	{
+		*oriData = (unsigned char*)malloc(targetOriSize);
+		ZSTD_decompress(*oriData, targetOriSize, compressBytes, cmpSize);
+		produced = targetOriSize;
+	}
+	else
+	{
+		printf("Error: Unrecognized lossless compressor in sz_lossless_decompress()\n");
+	}
+	return produced;
+}
+
+/**
+ * Decompress into a 65536-byte output with the selected lossless compressor.
+ * GZIP_COMPRESSOR delegates to zlib_uncompress65536bytes() and returns its result;
+ * ZSTD_COMPRESSOR allocates and zeroes 65536 bytes in *oriData, calls ZSTD_decompress()
+ * with that capacity and returns 65536. An unknown id prints an error and returns 0.
+ */
+unsigned long sz_lossless_decompress65536bytes(int losslessCompressor, unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData)
+{
+	unsigned long produced = 0;
+
+	if(losslessCompressor == GZIP_COMPRESSOR)
+	{
+		produced = zlib_uncompress65536bytes(compressBytes, cmpSize, oriData);
+	}
+	else if(losslessCompressor == ZSTD_COMPRESSOR)
+	{
+		*oriData = (unsigned char*)malloc(65536);
+		memset(*oriData, 0, 65536);
+		ZSTD_decompress(*oriData, 65536, compressBytes, cmpSize);	//the first 32768 bytes should be exact the same.
+		produced = 65536;
+	}
+	else
+	{
+		printf("Error: Unrecognized lossless compressor\n");
+	}
+	return produced;
+}
+
+/* Typed worker for detransposeData(): reads the [inner][outer] input sequentially and
+ * scatters it into [outer][inner] order. Relies on the locals data/out/inner/outer. */
+#define SZ_DETRANSPOSE_AS(TYPE) \
+	do { \
+		const TYPE* src = (const TYPE*)data; \
+		TYPE* dst = (TYPE*)out; \
+		size_t p, q, s = 0; \
+		for(p=0;p<inner;p++) \
+			for(q=0;q<outer;q++) \
+				dst[q*inner+p] = src[s++]; \
+	} while(0)
+
+/**
+ * Inverse of transposeData(): returns a newly allocated copy of data in which the
+ * fastest-varying index of the input becomes the slowest-varying one again.
+ *
+ * The dimensions are the ones of the restored array, i.e. the same r5..r1 that were
+ * given to transposeData(). With outer = r<dim> and inner = product of the lower
+ * dimensions, the input is read as [inner][outer] and written as [outer][inner].
+ *
+ * Changes against the previous implementation:
+ *  - the 2-D case used to repeat the forward transposition, which is only its own
+ *    inverse for square inputs; it now follows the same rule as the 3-D/4-D cases;
+ *  - 5-D input is reordered by the same rule instead of returning an unfilled buffer;
+ *  - a failed allocation returns NULL instead of being written through.
+ *
+ * @param data     input elements of type dataType (not modified)
+ * @param dataType SZ_FLOAT, SZ_DOUBLE, SZ_UINT16 or SZ_INT16
+ * @param r5..r1   dimension sizes, forwarded to computeDataLength()/computeDimension()
+ * @return malloc()ed array owned by the caller, or NULL for an unsupported dataType
+ *         or when the allocation fails. If computeDimension() reports a value outside
+ *         1..5 the buffer is returned without any element being copied (as before).
+ */
+void* detransposeData(void* data, int dataType, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	size_t len = computeDataLength(r5, r4, r3, r2, r1);
+	int dim = computeDimension(r5, r4, r3, r2, r1);
+	size_t elemSize, outer = 0, inner = 0;
+	void* out;
+
+	if(dataType == SZ_FLOAT)
+		elemSize = sizeof(float);
+	else if(dataType == SZ_DOUBLE)
+		elemSize = sizeof(double);
+	else if(dataType == SZ_UINT16)
+		elemSize = sizeof(uint16_t);
+	else if(dataType == SZ_INT16)
+		elemSize = sizeof(int16_t);
+	else
+		return NULL;
+
+	out = malloc(elemSize*len);
+	if(out == NULL)
+		return NULL;
+
+	if(dim==1)
+	{
+		/* one dimension: nothing to reorder */
+		memcpy(out, data, elemSize*len);
+		return out;
+	}
+
+	switch(dim)
+	{
+	case 2: outer = r2; inner = r1; break;
+	case 3: outer = r3; inner = r2*r1; break;
+	case 4: outer = r4; inner = r3*r2*r1; break;
+	case 5: outer = r5; inner = r4*r3*r2*r1; break;
+	default: break; /* outer == inner == 0: the loops below copy nothing */
+	}
+
+	if(dataType == SZ_FLOAT)
+		SZ_DETRANSPOSE_AS(float);
+	else if(dataType == SZ_DOUBLE)
+		SZ_DETRANSPOSE_AS(double);
+	else if(dataType == SZ_UINT16)
+		SZ_DETRANSPOSE_AS(uint16_t);
+	else
+		SZ_DETRANSPOSE_AS(int16_t);
+
+	return out;
+}
+#undef SZ_DETRANSPOSE_AS
+
+/* Typed worker for transposeData(): reads the [outer][inner] input sequentially and
+ * scatters it into [inner][outer] order. Relies on the locals data/out/inner/outer. */
+#define SZ_TRANSPOSE_AS(TYPE) \
+	do { \
+		const TYPE* src = (const TYPE*)data; \
+		TYPE* dst = (TYPE*)out; \
+		size_t p, q, s = 0; \
+		for(q=0;q<outer;q++) \
+			for(p=0;p<inner;p++) \
+				dst[p*outer+q] = src[s++]; \
+	} while(0)
+
+/**
+ * Returns a newly allocated copy of data in which the slowest-varying dimension
+ * becomes the fastest-varying one.
+ *
+ * With outer = r<dim> and inner = product of the lower dimensions, the input is read
+ * as [outer][inner] and written as [inner][outer]; for 2-D, 3-D and 4-D input this is
+ * exactly the index mapping of the previous per-dimension loops.
+ *
+ * Changes against the previous implementation:
+ *  - 5-D input is reordered by the same rule instead of returning an unfilled buffer;
+ *  - a failed allocation returns NULL instead of being written through.
+ *
+ * @param data     input elements of type dataType (not modified)
+ * @param dataType SZ_FLOAT, SZ_DOUBLE, SZ_UINT16 or SZ_INT16
+ * @param r5..r1   dimension sizes, forwarded to computeDataLength()/computeDimension()
+ * @return malloc()ed array owned by the caller; NULL (after printing an error) for an
+ *         unsupported dataType, or NULL when the allocation fails. If
+ *         computeDimension() reports a value outside 1..5 the buffer is returned
+ *         without any element being copied (as before).
+ */
+void* transposeData(void* data, int dataType, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	size_t len = computeDataLength(r5, r4, r3, r2, r1);
+	int dim = computeDimension(r5, r4, r3, r2, r1);
+	size_t elemSize, outer = 0, inner = 0;
+	void* out;
+
+	if(dataType == SZ_FLOAT)
+		elemSize = sizeof(float);
+	else if(dataType == SZ_DOUBLE)
+		elemSize = sizeof(double);
+	else if(dataType == SZ_UINT16)
+		elemSize = sizeof(uint16_t);
+	else if(dataType == SZ_INT16)
+		elemSize = sizeof(int16_t);
+	else
+	{
+		printf("Error. transpose data doesn't support data type %d\n", dataType);
+		return NULL;
+	}
+
+	out = malloc(elemSize*len);
+	if(out == NULL)
+		return NULL;
+
+	if(dim==1)
+	{
+		/* one dimension: nothing to reorder */
+		memcpy(out, data, elemSize*len);
+		return out;
+	}
+
+	switch(dim)
+	{
+	case 2: outer = r2; inner = r1; break;
+	case 3: outer = r3; inner = r2*r1; break;
+	case 4: outer = r4; inner = r3*r2*r1; break;
+	case 5: outer = r5; inner = r4*r3*r2*r1; break;
+	default: break; /* outer == inner == 0: the loops below copy nothing */
+	}
+
+	if(dataType == SZ_FLOAT)
+		SZ_TRANSPOSE_AS(float);
+	else if(dataType == SZ_DOUBLE)
+		SZ_TRANSPOSE_AS(double);
+	else if(dataType == SZ_UINT16)
+		SZ_TRANSPOSE_AS(uint16_t);
+	else
+		SZ_TRANSPOSE_AS(int16_t);
+
+	return out;
+}
+#undef SZ_TRANSPOSE_AS
diff --git a/deps/SZ/zlib/CMakeLists.txt b/deps/SZ/zlib/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..79306ea17c65aeeaffbe8af401daf632fa0e1294
--- /dev/null
+++ b/deps/SZ/zlib/CMakeLists.txt
@@ -0,0 +1,24 @@
+# zlib sources kept in this directory; no library type is given, so BUILD_SHARED_LIBS decides static vs. shared.
+add_library(ZLIB
+  adler32.c
+  compress.c
+  crc32.c
+  deflate.c
+  gzclose.c
+  gzlib.c
+  gzread.c
+  gzwrite.c
+  infback.c
+  inffast.c
+  inflate.c
+  inftrees.c
+  trees.c
+  uncompr.c
+  zutil.c
+)
+
+# Targets that link ZLIB inside this build tree also get this directory on their include path.
+target_include_directories(ZLIB
+  PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+)
diff --git a/deps/SZ/zlib/Makefile.am b/deps/SZ/zlib/Makefile.am
new file mode 100644
index 0000000000000000000000000000000000000000..1ad34fd9065130bcf5d8abe52bfb8ad193d04457
--- /dev/null
+++ b/deps/SZ/zlib/Makefile.am
@@ -0,0 +1,6 @@
+## Automake input for the zlib sources in this directory: builds the libtool
+## library libzlib.la and installs the headers named in include_HEADERS.
+## These double-hash lines are Automake-only comments and are not copied into Makefile.in.
+##
+## "foreign" relaxes Automake's checks for the files a GNU package is expected to ship.
+AUTOMAKE_OPTIONS=foreign
+include_HEADERS=inffixed.h inflate.h inftrees.h trees.h zconf.h crc32.h deflate.h gzguts.h inffast.h zlib.h zutil.h
+lib_LTLIBRARIES=libzlib.la
+## Per-library compile flags: search the current directory for headers.
+libzlib_la_CFLAGS=-I./
+libzlib_la_SOURCES=adler32.c crc32.c deflate.c gzclose.c gzlib.c gzwrite.c inffast.c zutil.c compress.c \
+		gzread.c infback.c inflate.c inftrees.c trees.c uncompr.c  
diff --git a/deps/SZ/zlib/Makefile.in b/deps/SZ/zlib/Makefile.in
new file mode 100644
index 0000000000000000000000000000000000000000..a0327365f124057ebfe2bdf851a121fe1933ca5b
--- /dev/null
+++ b/deps/SZ/zlib/Makefile.in
@@ -0,0 +1,864 @@
+# Makefile.in generated by automake 1.16.2 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2020 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+
+VPATH = @srcdir@
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = zlib
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(include_HEADERS) \
+	$(am__DIST_COMMON)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+  for p in $$list; do echo "$$p $$p"; done | \
+  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+    if (++n[$$2] == $(am__install_max)) \
+      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+    END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
+am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)"
+LTLIBRARIES = $(lib_LTLIBRARIES)
+libzlib_la_LIBADD =
+am_libzlib_la_OBJECTS = libzlib_la-adler32.lo libzlib_la-crc32.lo \
+	libzlib_la-deflate.lo libzlib_la-gzclose.lo \
+	libzlib_la-gzlib.lo libzlib_la-gzwrite.lo \
+	libzlib_la-inffast.lo libzlib_la-zutil.lo \
+	libzlib_la-compress.lo libzlib_la-gzread.lo \
+	libzlib_la-infback.lo libzlib_la-inflate.lo \
+	libzlib_la-inftrees.lo libzlib_la-trees.lo \
+	libzlib_la-uncompr.lo
+libzlib_la_OBJECTS = $(am_libzlib_la_OBJECTS)
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 = 
+libzlib_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(libzlib_la_CFLAGS) \
+	$(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__maybe_remake_depfiles = depfiles
+am__depfiles_remade = ./$(DEPDIR)/libzlib_la-adler32.Plo \
+	./$(DEPDIR)/libzlib_la-compress.Plo \
+	./$(DEPDIR)/libzlib_la-crc32.Plo \
+	./$(DEPDIR)/libzlib_la-deflate.Plo \
+	./$(DEPDIR)/libzlib_la-gzclose.Plo \
+	./$(DEPDIR)/libzlib_la-gzlib.Plo \
+	./$(DEPDIR)/libzlib_la-gzread.Plo \
+	./$(DEPDIR)/libzlib_la-gzwrite.Plo \
+	./$(DEPDIR)/libzlib_la-infback.Plo \
+	./$(DEPDIR)/libzlib_la-inffast.Plo \
+	./$(DEPDIR)/libzlib_la-inflate.Plo \
+	./$(DEPDIR)/libzlib_la-inftrees.Plo \
+	./$(DEPDIR)/libzlib_la-trees.Plo \
+	./$(DEPDIR)/libzlib_la-uncompr.Plo \
+	./$(DEPDIR)/libzlib_la-zutil.Plo
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
+	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+	$(AM_CFLAGS) $(CFLAGS)
+AM_V_CC = $(am__v_CC_@AM_V@)
+am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
+am__v_CC_0 = @echo "  CC      " $@;
+am__v_CC_1 = 
+CCLD = $(CC)
+LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CCLD = $(am__v_CCLD_@AM_V@)
+am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
+am__v_CCLD_0 = @echo "  CCLD    " $@;
+am__v_CCLD_1 = 
+SOURCES = $(libzlib_la_SOURCES)
+DIST_SOURCES = $(libzlib_la_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+HEADERS = $(include_HEADERS)
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates.  Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+  BEGIN { nonempty = 0; } \
+  { items[$$0] = 1; nonempty = 1; } \
+  END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique.  This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+  list='$(am__tagged_files)'; \
+  unique=`for i in $$list; do \
+    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+  done | $(am__uniquify_input)`
+ETAGS = etags
+CTAGS = ctags
+am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+FC = @FC@
+FCFLAGS = @FCFLAGS@
+FGREP = @FGREP@
+GREP = @GREP@
+GSL_CFLAGS = @GSL_CFLAGS@
+GSL_CONFIG = @GSL_CONFIG@
+GSL_HDR = @GSL_HDR@
+GSL_LIB = @GSL_LIB@
+GSL_LIBS = @GSL_LIBS@
+GSL_STATIC = @GSL_STATIC@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OPENMP_FLAGS = @OPENMP_FLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PASTRI_FLAGS = @PASTRI_FLAGS@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+RANDOMACCESS_FLAGS = @RANDOMACCESS_FLAGS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+STRIP = @STRIP@
+TIMECMPR_FLAGS = @TIMECMPR_FLAGS@
+VERSION = @VERSION@
+WRITESTATS_FLAGS = @WRITESTATS_FLAGS@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_FC = @ac_ct_FC@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AUTOMAKE_OPTIONS = foreign
+include_HEADERS = inffixed.h inflate.h inftrees.h trees.h zconf.h crc32.h deflate.h gzguts.h inffast.h zlib.h zutil.h
+lib_LTLIBRARIES = libzlib.la
+libzlib_la_CFLAGS = -I./
+libzlib_la_SOURCES = adler32.c crc32.c deflate.c gzclose.c gzlib.c gzwrite.c inffast.c zutil.c compress.c \
+		gzread.c infback.c inflate.c inftrees.c trees.c uncompr.c  
+
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign zlib/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --foreign zlib/Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+install-libLTLIBRARIES: $(lib_LTLIBRARIES)
+	@$(NORMAL_INSTALL)
+	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+	list2=; for p in $$list; do \
+	  if test -f $$p; then \
+	    list2="$$list2 $$p"; \
+	  else :; fi; \
+	done; \
+	test -z "$$list2" || { \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \
+	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \
+	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \
+	}
+
+uninstall-libLTLIBRARIES:
+	@$(NORMAL_UNINSTALL)
+	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+	for p in $$list; do \
+	  $(am__strip_dir) \
+	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \
+	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \
+	done
+
+clean-libLTLIBRARIES:
+	-test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
+	@list='$(lib_LTLIBRARIES)'; \
+	locs=`for p in $$list; do echo $$p; done | \
+	      sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
+	      sort -u`; \
+	test -z "$$locs" || { \
+	  echo rm -f $${locs}; \
+	  rm -f $${locs}; \
+	}
+
+libzlib.la: $(libzlib_la_OBJECTS) $(libzlib_la_DEPENDENCIES) $(EXTRA_libzlib_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(libzlib_la_LINK) -rpath $(libdir) $(libzlib_la_OBJECTS) $(libzlib_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-adler32.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-compress.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-crc32.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-deflate.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-gzclose.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-gzlib.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-gzread.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-gzwrite.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-infback.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-inffast.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-inflate.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-inftrees.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-trees.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-uncompr.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libzlib_la-zutil.Plo@am__quote@ # am--include-marker
+
+$(am__depfiles_remade):
+	@$(MKDIR_P) $(@D)
+	@echo '# dummy' >$@-t && $(am__mv) $@-t $@
+
+am--depfiles: $(am__depfiles_remade)
+
+.c.o:
+@am__fastdepCC_TRUE@	$(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\
+@am__fastdepCC_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\
+@am__fastdepCC_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\
+@am__fastdepCC_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
+
+libzlib_la-adler32.lo: adler32.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-adler32.lo -MD -MP -MF $(DEPDIR)/libzlib_la-adler32.Tpo -c -o libzlib_la-adler32.lo `test -f 'adler32.c' || echo '$(srcdir)/'`adler32.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-adler32.Tpo $(DEPDIR)/libzlib_la-adler32.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='adler32.c' object='libzlib_la-adler32.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-adler32.lo `test -f 'adler32.c' || echo '$(srcdir)/'`adler32.c
+
+libzlib_la-crc32.lo: crc32.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-crc32.lo -MD -MP -MF $(DEPDIR)/libzlib_la-crc32.Tpo -c -o libzlib_la-crc32.lo `test -f 'crc32.c' || echo '$(srcdir)/'`crc32.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-crc32.Tpo $(DEPDIR)/libzlib_la-crc32.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='crc32.c' object='libzlib_la-crc32.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-crc32.lo `test -f 'crc32.c' || echo '$(srcdir)/'`crc32.c
+
+libzlib_la-deflate.lo: deflate.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-deflate.lo -MD -MP -MF $(DEPDIR)/libzlib_la-deflate.Tpo -c -o libzlib_la-deflate.lo `test -f 'deflate.c' || echo '$(srcdir)/'`deflate.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-deflate.Tpo $(DEPDIR)/libzlib_la-deflate.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='deflate.c' object='libzlib_la-deflate.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-deflate.lo `test -f 'deflate.c' || echo '$(srcdir)/'`deflate.c
+
+libzlib_la-gzclose.lo: gzclose.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-gzclose.lo -MD -MP -MF $(DEPDIR)/libzlib_la-gzclose.Tpo -c -o libzlib_la-gzclose.lo `test -f 'gzclose.c' || echo '$(srcdir)/'`gzclose.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-gzclose.Tpo $(DEPDIR)/libzlib_la-gzclose.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gzclose.c' object='libzlib_la-gzclose.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-gzclose.lo `test -f 'gzclose.c' || echo '$(srcdir)/'`gzclose.c
+
+libzlib_la-gzlib.lo: gzlib.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-gzlib.lo -MD -MP -MF $(DEPDIR)/libzlib_la-gzlib.Tpo -c -o libzlib_la-gzlib.lo `test -f 'gzlib.c' || echo '$(srcdir)/'`gzlib.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-gzlib.Tpo $(DEPDIR)/libzlib_la-gzlib.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gzlib.c' object='libzlib_la-gzlib.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-gzlib.lo `test -f 'gzlib.c' || echo '$(srcdir)/'`gzlib.c
+
+libzlib_la-gzwrite.lo: gzwrite.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-gzwrite.lo -MD -MP -MF $(DEPDIR)/libzlib_la-gzwrite.Tpo -c -o libzlib_la-gzwrite.lo `test -f 'gzwrite.c' || echo '$(srcdir)/'`gzwrite.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-gzwrite.Tpo $(DEPDIR)/libzlib_la-gzwrite.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gzwrite.c' object='libzlib_la-gzwrite.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-gzwrite.lo `test -f 'gzwrite.c' || echo '$(srcdir)/'`gzwrite.c
+
+libzlib_la-inffast.lo: inffast.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-inffast.lo -MD -MP -MF $(DEPDIR)/libzlib_la-inffast.Tpo -c -o libzlib_la-inffast.lo `test -f 'inffast.c' || echo '$(srcdir)/'`inffast.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-inffast.Tpo $(DEPDIR)/libzlib_la-inffast.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='inffast.c' object='libzlib_la-inffast.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-inffast.lo `test -f 'inffast.c' || echo '$(srcdir)/'`inffast.c
+
+libzlib_la-zutil.lo: zutil.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-zutil.lo -MD -MP -MF $(DEPDIR)/libzlib_la-zutil.Tpo -c -o libzlib_la-zutil.lo `test -f 'zutil.c' || echo '$(srcdir)/'`zutil.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-zutil.Tpo $(DEPDIR)/libzlib_la-zutil.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='zutil.c' object='libzlib_la-zutil.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-zutil.lo `test -f 'zutil.c' || echo '$(srcdir)/'`zutil.c
+
+libzlib_la-compress.lo: compress.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-compress.lo -MD -MP -MF $(DEPDIR)/libzlib_la-compress.Tpo -c -o libzlib_la-compress.lo `test -f 'compress.c' || echo '$(srcdir)/'`compress.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-compress.Tpo $(DEPDIR)/libzlib_la-compress.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='compress.c' object='libzlib_la-compress.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-compress.lo `test -f 'compress.c' || echo '$(srcdir)/'`compress.c
+
+libzlib_la-gzread.lo: gzread.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-gzread.lo -MD -MP -MF $(DEPDIR)/libzlib_la-gzread.Tpo -c -o libzlib_la-gzread.lo `test -f 'gzread.c' || echo '$(srcdir)/'`gzread.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-gzread.Tpo $(DEPDIR)/libzlib_la-gzread.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='gzread.c' object='libzlib_la-gzread.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-gzread.lo `test -f 'gzread.c' || echo '$(srcdir)/'`gzread.c
+
+libzlib_la-infback.lo: infback.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-infback.lo -MD -MP -MF $(DEPDIR)/libzlib_la-infback.Tpo -c -o libzlib_la-infback.lo `test -f 'infback.c' || echo '$(srcdir)/'`infback.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-infback.Tpo $(DEPDIR)/libzlib_la-infback.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='infback.c' object='libzlib_la-infback.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-infback.lo `test -f 'infback.c' || echo '$(srcdir)/'`infback.c
+
+libzlib_la-inflate.lo: inflate.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-inflate.lo -MD -MP -MF $(DEPDIR)/libzlib_la-inflate.Tpo -c -o libzlib_la-inflate.lo `test -f 'inflate.c' || echo '$(srcdir)/'`inflate.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-inflate.Tpo $(DEPDIR)/libzlib_la-inflate.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='inflate.c' object='libzlib_la-inflate.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-inflate.lo `test -f 'inflate.c' || echo '$(srcdir)/'`inflate.c
+
+libzlib_la-inftrees.lo: inftrees.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-inftrees.lo -MD -MP -MF $(DEPDIR)/libzlib_la-inftrees.Tpo -c -o libzlib_la-inftrees.lo `test -f 'inftrees.c' || echo '$(srcdir)/'`inftrees.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-inftrees.Tpo $(DEPDIR)/libzlib_la-inftrees.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='inftrees.c' object='libzlib_la-inftrees.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-inftrees.lo `test -f 'inftrees.c' || echo '$(srcdir)/'`inftrees.c
+
+libzlib_la-trees.lo: trees.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-trees.lo -MD -MP -MF $(DEPDIR)/libzlib_la-trees.Tpo -c -o libzlib_la-trees.lo `test -f 'trees.c' || echo '$(srcdir)/'`trees.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-trees.Tpo $(DEPDIR)/libzlib_la-trees.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='trees.c' object='libzlib_la-trees.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-trees.lo `test -f 'trees.c' || echo '$(srcdir)/'`trees.c
+
+libzlib_la-uncompr.lo: uncompr.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -MT libzlib_la-uncompr.lo -MD -MP -MF $(DEPDIR)/libzlib_la-uncompr.Tpo -c -o libzlib_la-uncompr.lo `test -f 'uncompr.c' || echo '$(srcdir)/'`uncompr.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libzlib_la-uncompr.Tpo $(DEPDIR)/libzlib_la-uncompr.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='uncompr.c' object='libzlib_la-uncompr.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzlib_la_CFLAGS) $(CFLAGS) -c -o libzlib_la-uncompr.lo `test -f 'uncompr.c' || echo '$(srcdir)/'`uncompr.c
+
+mostlyclean-libtool:  # remove every *.lo file in the current directory; the '-' prefix ignores rm failures
+	-rm -f *.lo
+
+clean-libtool:  # recursively remove the .libs and _libs directories; the '-' prefix ignores rm failures
+	-rm -rf .libs _libs
+install-includeHEADERS: $(include_HEADERS)
+	@$(NORMAL_INSTALL)
+	@list='$(include_HEADERS)'; test -n "$(includedir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(includedir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(includedir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(includedir)'"; \
+	  $(INSTALL_HEADER) $$files "$(DESTDIR)$(includedir)" || exit $$?; \
+	done
+
+uninstall-includeHEADERS:
+	@$(NORMAL_UNINSTALL)
+	@list='$(include_HEADERS)'; test -n "$(includedir)" || list=; \
+	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
+	dir='$(DESTDIR)$(includedir)'; $(am__uninstall_files_from_dir)
+
+ID: $(am__tagged_files)
+	$(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-am
+TAGS: tags
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	set x; \
+	here=`pwd`; \
+	$(am__define_uniq_tagged_files); \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: ctags-am
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	$(am__define_uniq_tagged_files); \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-am
+
+cscopelist-am: $(am__tagged_files)
+	list='$(am__tagged_files)'; \
+	case "$(srcdir)" in \
+	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+	  *) sdir=$(subdir)/$(srcdir) ;; \
+	esac; \
+	for i in $$list; do \
+	  if test -f "$$i"; then \
+	    echo "$(subdir)/$$i"; \
+	  else \
+	    echo "$$sdir/$$i"; \
+	  fi; \
+	done >> $(top_builddir)/cscope.files
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) distdir-am
+
+distdir-am: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LTLIBRARIES) $(HEADERS)
+installdirs:  # create the DESTDIR-prefixed libdir and includedir with MKDIR_P, skipping any path that expands to empty
+	for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)"; do \
+	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+	done
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:  # re-invoke 'install' with INSTALL_STRIP_PROGRAM and INSTALL_STRIP_FLAG=-s; a non-empty STRIP is also passed on as STRIPPROG
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \
+	mostlyclean-am
+
+distclean: distclean-am  # after distclean-am, delete each per-object .Plo file under DEPDIR and then the Makefile itself
+		-rm -f ./$(DEPDIR)/libzlib_la-adler32.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-compress.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-crc32.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-deflate.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-gzclose.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-gzlib.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-gzread.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-gzwrite.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-infback.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-inffast.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-inflate.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-inftrees.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-trees.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-uncompr.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-zutil.Plo
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am: install-includeHEADERS
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am: install-libLTLIBRARIES
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am  # after maintainer-clean-am, delete the same .Plo files and Makefile that distclean removes
+		-rm -f ./$(DEPDIR)/libzlib_la-adler32.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-compress.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-crc32.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-deflate.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-gzclose.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-gzlib.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-gzread.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-gzwrite.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-infback.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-inffast.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-inflate.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-inftrees.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-trees.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-uncompr.Plo
+	-rm -f ./$(DEPDIR)/libzlib_la-zutil.Plo
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-includeHEADERS uninstall-libLTLIBRARIES
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \
+	clean-generic clean-libLTLIBRARIES clean-libtool cscopelist-am \
+	ctags ctags-am distclean distclean-compile distclean-generic \
+	distclean-libtool distclean-tags distdir dvi dvi-am html \
+	html-am info info-am install install-am install-data \
+	install-data-am install-dvi install-dvi-am install-exec \
+	install-exec-am install-html install-html-am \
+	install-includeHEADERS install-info install-info-am \
+	install-libLTLIBRARIES install-man install-pdf install-pdf-am \
+	install-ps install-ps-am install-strip installcheck \
+	installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags tags-am uninstall uninstall-am uninstall-includeHEADERS \
+	uninstall-libLTLIBRARIES
+
+.PRECIOUS: Makefile
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/deps/SZ/zlib/adler32.c b/deps/SZ/zlib/adler32.c
new file mode 100644
index 0000000000000000000000000000000000000000..d0be4380a39c9c5bf439b1552c43585b5aafad0a
--- /dev/null
+++ b/deps/SZ/zlib/adler32.c
@@ -0,0 +1,186 @@
+/* adler32.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#include "zutil.h"
+
+local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
+
+#define BASE 65521U     /* largest prime smaller than 65536 */
+#define NMAX 5552       /* bytes adler32_z sums per block before it applies MOD() */
+/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
+
+#define DO1(buf,i)  {adler += (buf)[i]; sum2 += adler;}  /* add byte buf[i]; updates the caller's adler and sum2 */
+#define DO2(buf,i)  DO1(buf,i); DO1(buf,i+1);            /* bytes i .. i+1 */
+#define DO4(buf,i)  DO2(buf,i); DO2(buf,i+2);            /* bytes i .. i+3 */
+#define DO8(buf,i)  DO4(buf,i); DO4(buf,i+4);            /* bytes i .. i+7 */
+#define DO16(buf)   DO8(buf,0); DO8(buf,8);              /* bytes 0 .. 15, fully unrolled */
+
+/* use NO_DIVIDE if your processor does not do division in hardware --
+   try it both ways to see which is faster */
+#ifdef NO_DIVIDE        /* division-free reductions built from shifts, masks and subtractions */
+/* note that this assumes BASE is 65521, where 65536 % 65521 == 15
+   (thank you to John Reiser for pointing this out) */
+#  define CHOP(a) \
+    do { \
+        unsigned long tmp = a >> 16; \
+        a &= 0xffffUL; \
+        a += (tmp << 4) - tmp; \
+    } while (0)
+#  define MOD28(a) \
+    do { \
+        CHOP(a); \
+        if (a >= BASE) a -= BASE; \
+    } while (0)
+#  define MOD(a) \
+    do { \
+        CHOP(a); \
+        MOD28(a); \
+    } while (0)
+#  define MOD63(a) \
+    do { /* this assumes a is not negative */ \
+        z_off64_t tmp = a >> 32; \
+        a &= 0xffffffffL; \
+        a += (tmp << 8) - (tmp << 5) + tmp; \
+        tmp = a >> 16; \
+        a &= 0xffffL; \
+        a += (tmp << 4) - tmp; \
+        tmp = a >> 16; \
+        a &= 0xffffL; \
+        a += (tmp << 4) - tmp; \
+        if (a >= BASE) a -= BASE; \
+    } while (0)
+#else                   /* !NO_DIVIDE: all three macros are the same in-place remainder */
+#  define MOD(a) a %= BASE      /* used by adler32_z after each block and by adler32_combine_ */
+#  define MOD28(a) a %= BASE    /* used by adler32_z on the short-input (len < 16) path */
+#  define MOD63(a) a %= BASE    /* used by adler32_combine_ on its z_off64_t length */
+#endif                  /* NO_DIVIDE */
+
+/* ========================================================================= */
+uLong ZEXPORT adler32_z(adler, buf, len)   /* fold len bytes of buf into the running checksum adler and return the new value */
+    uLong adler;                /* checksum so far; low 16 bits and high 16 bits are split out below */
+    const Bytef *buf;           /* input bytes; Z_NULL makes the function return 1 (except when len == 1) */
+    z_size_t len;               /* number of bytes to read from buf */
+{
+    unsigned long sum2;         /* second component sum, kept in the high 16 bits of the result */
+    unsigned n;                 /* 16-byte groups still to process in the current NMAX block */
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (len == 1) {
+        adler += buf[0];        /* buf is dereferenced here, before the Z_NULL test further down */
+        if (adler >= BASE)
+            adler -= BASE;
+        sum2 += adler;
+        if (sum2 >= BASE)
+            sum2 -= BASE;
+        return adler | (sum2 << 16);
+    }
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (buf == Z_NULL)          /* never reached with len == 1: that path has already returned */
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (len < 16) {
+        while (len--) {
+            adler += *buf++;
+            sum2 += adler;
+        }
+        if (adler >= BASE)      /* adler grew by at most 15 * 255, so one subtraction is enough */
+            adler -= BASE;
+        MOD28(sum2);            /* only added so many BASE's */
+        return adler | (sum2 << 16);
+    }
+
+    /* do length NMAX blocks -- requires just one modulo operation */
+    while (len >= NMAX) {
+        len -= NMAX;
+        n = NMAX / 16;          /* NMAX is divisible by 16 */
+        do {
+            DO16(buf);          /* 16 sums unrolled */
+            buf += 16;
+        } while (--n);
+        MOD(adler);             /* reduce both sums once per NMAX-byte block */
+        MOD(sum2);
+    }
+
+    /* do remaining bytes (less than NMAX, still just one modulo) */
+    if (len) {                  /* avoid modulos if none remaining */
+        while (len >= 16) {     /* whole 16-byte groups first */
+            len -= 16;
+            DO16(buf);
+            buf += 16;
+        }
+        while (len--) {         /* then the last 0..15 bytes one at a time */
+            adler += *buf++;
+            sum2 += adler;
+        }
+        MOD(adler);
+        MOD(sum2);
+    }
+
+    /* return recombined sums */
+    return adler | (sum2 << 16);
+}
+
+/* ========================================================================= */
+uLong ZEXPORT adler32(adler, buf, len)     /* uInt-length entry point; forwards unchanged to adler32_z */
+    uLong adler;                /* running checksum, passed through as-is */
+    const Bytef *buf;           /* input bytes, passed through as-is */
+    uInt len;                   /* byte count; adler32_z declares this parameter as z_size_t */
+{
+    return adler32_z(adler, buf, len);
+}
+
+/* ========================================================================= */
+local uLong adler32_combine_(adler1, adler2, len2)   /* shared worker for adler32_combine and adler32_combine64 */
+    uLong adler1;               /* first checksum; its 16-bit halves are read separately below */
+    uLong adler2;               /* second checksum; its 16-bit halves are read separately below */
+    z_off64_t len2;             /* length paired with adler2 (see zlib.h); a negative value is rejected */
+{
+    unsigned long sum1;         /* becomes the low 16 bits of the result */
+    unsigned long sum2;         /* becomes the high 16 bits of the result */
+    unsigned rem;               /* len2 after MOD63, i.e. reduced modulo BASE */
+
+    /* for negative len, return invalid adler32 as a clue for debugging */
+    if (len2 < 0)
+        return 0xffffffffUL;
+
+    /* the derivation of this formula is left as an exercise for the reader */
+    MOD63(len2);                /* assumes len2 >= 0 */
+    rem = (unsigned)len2;
+    sum1 = adler1 & 0xffff;
+    sum2 = rem * sum1;
+    MOD(sum2);
+    sum1 += (adler2 & 0xffff) + BASE - 1;
+    sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
+    if (sum1 >= BASE) sum1 -= BASE;   /* conditional subtractions stand in for a modulo; */
+    if (sum1 >= BASE) sum1 -= BASE;   /* sum1 may need BASE removed twice */
+    if (sum2 >= ((unsigned long)BASE << 1)) sum2 -= ((unsigned long)BASE << 1);   /* remove 2 * BASE in one step */
+    if (sum2 >= BASE) sum2 -= BASE;
+    return sum1 | (sum2 << 16);
+}
+
+/* ========================================================================= */
+uLong ZEXPORT adler32_combine(adler1, adler2, len2)   /* z_off_t-length entry point; forwards to adler32_combine_ */
+    uLong adler1;               /* passed through unchanged */
+    uLong adler2;               /* passed through unchanged */
+    z_off_t len2;               /* adler32_combine_ declares this parameter as z_off64_t */
+{
+    return adler32_combine_(adler1, adler2, len2);
+}
+
+uLong ZEXPORT adler32_combine64(adler1, adler2, len2) /* z_off64_t-length entry point; forwards to adler32_combine_ */
+    uLong adler1;               /* passed through unchanged */
+    uLong adler2;               /* passed through unchanged */
+    z_off64_t len2;             /* same type adler32_combine_ declares for this parameter */
+{
+    return adler32_combine_(adler1, adler2, len2);
+}
diff --git a/deps/SZ/zlib/compress.c b/deps/SZ/zlib/compress.c
new file mode 100644
index 0000000000000000000000000000000000000000..e2db404abf888bd2c85844985b5ae9784b955c63
--- /dev/null
+++ b/deps/SZ/zlib/compress.c
@@ -0,0 +1,86 @@
+/* compress.c -- compress a memory buffer
+ * Copyright (C) 1995-2005, 2014, 2016 Jean-loup Gailly, Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#define ZLIB_INTERNAL
+#include "zlib.h"
+
+/* ===========================================================================
+     Compresses the source buffer into the destination buffer. The level
+   parameter has the same meaning as in deflateInit.  sourceLen is the byte
+   length of the source buffer. Upon entry, destLen is the total size of the
+   destination buffer, which must be at least compressBound(sourceLen) bytes.
+   Upon exit, destLen is the actual size of the compressed buffer.
+
+     compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+   Z_STREAM_ERROR if the level parameter is invalid.
+*/
+int ZEXPORT compress2 (dest, destLen, source, sourceLen, level)   /* one-shot deflate of source into dest */
+    Bytef *dest;                /* output buffer */
+    uLongf *destLen;            /* in: capacity of dest; out: stream.total_out (0 if deflateInit fails) */
+    const Bytef *source;        /* input buffer */
+    uLong sourceLen;            /* input size; reused below as the count not yet handed to the stream */
+    int level;                  /* passed straight to deflateInit */
+{
+    z_stream stream;
+    int err;
+    const uInt max = (uInt)-1;  /* largest uInt: cap on each avail_in / avail_out chunk */
+    uLong left;                 /* output capacity not yet handed to the stream */
+
+    left = *destLen;
+    *destLen = 0;               /* so the early return below reports zero bytes written */
+
+    stream.zalloc = (alloc_func)0;   /* null allocator hooks -- presumably selects zlib's defaults; see zlib.h */
+    stream.zfree = (free_func)0;
+    stream.opaque = (voidpf)0;
+
+    err = deflateInit(&stream, level);
+    if (err != Z_OK) return err;
+
+    stream.next_out = dest;
+    stream.avail_out = 0;       /* zero here so the first loop pass assigns the first output chunk */
+    stream.next_in = (z_const Bytef *)source;
+    stream.avail_in = 0;        /* zero here so the first loop pass assigns the first input chunk */
+
+    do {
+        if (stream.avail_out == 0) {      /* output chunk used up: hand over the next one, at most max bytes */
+            stream.avail_out = left > (uLong)max ? max : (uInt)left;
+            left -= stream.avail_out;
+        }
+        if (stream.avail_in == 0) {       /* input chunk used up: hand over the next one, at most max bytes */
+            stream.avail_in = sourceLen > (uLong)max ? max : (uInt)sourceLen;
+            sourceLen -= stream.avail_in;
+        }
+        err = deflate(&stream, sourceLen ? Z_NO_FLUSH : Z_FINISH);   /* Z_FINISH once all input is handed over */
+    } while (err == Z_OK);
+
+    *destLen = stream.total_out;
+    deflateEnd(&stream);        /* return value is not checked */
+    return err == Z_STREAM_END ? Z_OK : err;   /* Z_STREAM_END is reported as Z_OK; anything else is returned as-is */
+}
+
+/* ===========================================================================
+ */
+int ZEXPORT compress (dest, destLen, source, sourceLen)   /* compress2 with level fixed to Z_DEFAULT_COMPRESSION */
+    Bytef *dest;                /* passed through to compress2 */
+    uLongf *destLen;            /* passed through to compress2 */
+    const Bytef *source;        /* passed through to compress2 */
+    uLong sourceLen;            /* passed through to compress2 */
+{
+    return compress2(dest, destLen, source, sourceLen, Z_DEFAULT_COMPRESSION);
+}
+
+/* ===========================================================================
+     If the default memLevel or windowBits for deflateInit() is changed, then
+   this function needs to be updated.
+ */
+uLong ZEXPORT compressBound (sourceLen)    /* size figure derived from sourceLen; documented in zlib.h as an output-size bound */
+    uLong sourceLen;            /* input size the figure is computed from */
+{
+    return sourceLen + (sourceLen >> 12) + (sourceLen >> 14) +   /* sourceLen plus sourceLen/4096 plus sourceLen/16384 */
+           (sourceLen >> 25) + 13;                               /* plus sourceLen/2^25 plus a constant 13 */
+}
diff --git a/deps/SZ/zlib/crc32.c b/deps/SZ/zlib/crc32.c
new file mode 100644
index 0000000000000000000000000000000000000000..9580440c0e6b673c43e57daab03274ebdca8f77e
--- /dev/null
+++ b/deps/SZ/zlib/crc32.c
@@ -0,0 +1,442 @@
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors.  This results in about a
+ * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+/* @(#) $Id$ */
+
+/*
+  Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
+  protection on the static variables used to control the first-use generation
+  of the crc tables.  Therefore, if you #define DYNAMIC_CRC_TABLE, you should
+  first call get_crc_table() to initialize the tables before allowing more than
+  one thread to use crc32().
+
+  DYNAMIC_CRC_TABLE and MAKECRCH can be #defined to write out crc32.h.
+ */
+
+#ifdef MAKECRCH
+#  include <stdio.h>
+#  ifndef DYNAMIC_CRC_TABLE
+#    define DYNAMIC_CRC_TABLE
+#  endif /* !DYNAMIC_CRC_TABLE */
+#endif /* MAKECRCH */
+
+#include "zutil.h"      /* for STDC and FAR definitions */
+
+/* Definitions for doing the crc four data bytes at a time. */
+#if !defined(NOBYFOUR) && defined(Z_U4)
+#  define BYFOUR
+#endif
+#ifdef BYFOUR
+   local unsigned long crc32_little OF((unsigned long,
+                        const unsigned char FAR *, z_size_t));
+   local unsigned long crc32_big OF((unsigned long,
+                        const unsigned char FAR *, z_size_t));
+#  define TBLS 8
+#else
+#  define TBLS 1
+#endif /* BYFOUR */
+
+/* Local functions for crc concatenation */
+local unsigned long gf2_matrix_times OF((unsigned long *mat,
+                                         unsigned long vec));
+local void gf2_matrix_square OF((unsigned long *square, unsigned long *mat));
+local uLong crc32_combine_ OF((uLong crc1, uLong crc2, z_off64_t len2));
+
+
+#ifdef DYNAMIC_CRC_TABLE
+
+local volatile int crc_table_empty = 1;     /* set to 0 by make_crc_table() */
+local z_crc_t FAR crc_table[TBLS][256];     /* filled in by make_crc_table() */
+local void make_crc_table OF((void));
+#ifdef MAKECRCH
+   local void write_table OF((FILE *, const z_crc_t FAR *));
+#endif /* MAKECRCH */
+/*
+  Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
+  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
+
+  Polynomials over GF(2) are represented in binary, one bit per coefficient,
+  with the lowest powers in the most significant bit.  Then adding polynomials
+  is just exclusive-or, and multiplying a polynomial by x is a right shift by
+  one.  If we call the above polynomial p, and represent a byte as the
+  polynomial q, also with the lowest power in the most significant bit (so the
+  byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
+  where a mod b means the remainder after dividing a by b.
+
+  This calculation is done using the shift-register method of multiplying and
+  taking the remainder.  The register is initialized to zero, and for each
+  incoming bit, x^32 is added mod p to the register if the bit is a one (where
+  x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
+  x (which is shifting right by one and adding x^32 mod p if the bit shifted
+  out is a one).  We start with the highest power (least significant bit) of
+  q and repeat for all eight bits of q.
+
+  The first table is simply the CRC of all possible eight bit values.  This is
+  all the information needed to generate CRCs on data a byte at a time for all
+  combinations of CRC register values and incoming bytes.  The remaining tables
+  allow for word-at-a-time CRC calculation for both big-endian and little-
+  endian machines, where a word is four bytes.
+*/
+local void make_crc_table()
+{
+    z_crc_t c;                          /* table entry being computed */
+    int n, k;                           /* loop counters */
+    z_crc_t poly;                       /* polynomial exclusive-or pattern */
+    /* terms of polynomial defining this crc (except x^32): */
+    static volatile int first = 1;      /* flag to limit concurrent making */
+    static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
+
+    /* See if another task is already doing this (not thread-safe, but better
+       than nothing -- significantly reduces duration of vulnerability in
+       case the advice about DYNAMIC_CRC_TABLE is ignored) */
+    if (first) {
+        first = 0;                      /* later callers take the else branch */
+
+        /* make exclusive-or pattern from polynomial (0xedb88320UL) */
+        poly = 0;
+        for (n = 0; n < (int)(sizeof(p)/sizeof(unsigned char)); n++)
+            poly |= (z_crc_t)1 << (31 - p[n]);  /* exponent e sets bit 31-e */
+
+        /* generate a crc for every 8-bit value */
+        for (n = 0; n < 256; n++) {
+            c = (z_crc_t)n;
+            for (k = 0; k < 8; k++)     /* one shift step per bit of the byte */
+                c = c & 1 ? poly ^ (c >> 1) : c >> 1;
+            crc_table[0][n] = c;
+        }
+
+#ifdef BYFOUR
+        /* generate crc for each value followed by one, two, and three zeros,
+           and then the byte reversal of those as well as the first table */
+        for (n = 0; n < 256; n++) {
+            c = crc_table[0][n];
+            crc_table[4][n] = ZSWAP32(c);
+            for (k = 1; k < 4; k++) {
+                c = crc_table[0][c & 0xff] ^ (c >> 8);  /* one more zero byte */
+                crc_table[k][n] = c;
+                crc_table[k + 4][n] = ZSWAP32(c);
+            }
+        }
+#endif /* BYFOUR */
+
+        crc_table_empty = 0;            /* ends the spin loop in the else branch */
+    }
+    else {      /* not first */
+        /* wait for the other guy to finish (not efficient, but rare) */
+        while (crc_table_empty)
+            ;
+    }
+    /* The MAKECRCH block sits after the if/else, so it runs on every call. */
+#ifdef MAKECRCH
+    /* write out CRC tables to crc32.h */
+    {
+        FILE *out;
+
+        out = fopen("crc32.h", "w");
+        if (out == NULL) return;        /* cannot open the file: write nothing */
+        fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n");
+        fprintf(out, " * Generated automatically by crc32.c\n */\n\n");
+        fprintf(out, "local const z_crc_t FAR ");
+        fprintf(out, "crc_table[TBLS][256] =\n{\n  {\n");
+        write_table(out, crc_table[0]);
+#  ifdef BYFOUR
+        fprintf(out, "#ifdef BYFOUR\n");    /* guard emitted into crc32.h */
+        for (k = 1; k < 8; k++) {
+            fprintf(out, "  },\n  {\n");
+            write_table(out, crc_table[k]);
+        }
+        fprintf(out, "#endif\n");
+#  endif /* BYFOUR */
+        fprintf(out, "  }\n};\n");
+        fclose(out);
+    }
+#endif /* MAKECRCH */
+}
+
+#ifdef MAKECRCH
+local void write_table(out, table)
+    FILE *out;                      /* stream the initializer text is written to */
+    const z_crc_t FAR *table;       /* entries 0..255 are printed */
+{
+    int n;
+    /* Print 256 hex constants, five per line, each line indented 4 spaces. */
+    for (n = 0; n < 256; n++)
+        fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : "    ",
+                (unsigned long)(table[n]),
+                n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", "));
+}
+#endif /* MAKECRCH */
+
+#else /* !DYNAMIC_CRC_TABLE */
+/* ========================================================================
+ * Tables of CRC-32s of all single-byte values, made by make_crc_table().
+ */
+#include "crc32.h"
+#endif /* DYNAMIC_CRC_TABLE */
+
+/* =========================================================================
+ * This function can be used by asm versions of crc32()
+ * Returns the table array, building it first under DYNAMIC_CRC_TABLE. */
+const z_crc_t FAR * ZEXPORT get_crc_table()
+{
+#ifdef DYNAMIC_CRC_TABLE
+    if (crc_table_empty)            /* tables not built yet: build them now */
+        make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+    return (const z_crc_t FAR *)crc_table;  /* start of the [TBLS][256] array */
+}
+
+/* DO1 folds the next byte at buf into crc via table 0; DO8 is eight DO1s. */
+#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
+#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+
+/* Update crc over len bytes at buf; a Z_NULL buf yields 0 instead. */
+unsigned long ZEXPORT crc32_z(crc, buf, len)
+    unsigned long crc;              /* CRC value to continue from */
+    const unsigned char FAR *buf;   /* bytes to process, or Z_NULL */
+    z_size_t len;                   /* number of bytes at buf */
+{
+    if (buf == Z_NULL) return 0UL;
+
+#ifdef DYNAMIC_CRC_TABLE
+    if (crc_table_empty)            /* tables still empty: build them */
+        make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+    /* Word-at-a-time path: only when void * and ptrdiff_t are the same size. */
+#ifdef BYFOUR
+    if (sizeof(void *) == sizeof(ptrdiff_t)) {
+        z_crc_t endian;
+
+        endian = 1;                 /* its first byte is nonzero iff little-endian */
+        if (*((unsigned char *)(&endian)))
+            return crc32_little(crc, buf, len);
+        else
+            return crc32_big(crc, buf, len);
+    }
+#endif /* BYFOUR */
+    crc = crc ^ 0xffffffffUL;       /* byte-wise path: invert on entry ... */
+    while (len >= 8) {              /* eight bytes per iteration */
+        DO8;
+        len -= 8;
+    }
+    if (len) do {                   /* remaining 1..7 bytes */
+        DO1;
+    } while (--len);
+    return crc ^ 0xffffffffUL;      /* ... and invert again on exit */
+}
+
+/* Same as crc32_z() but with the length given as a uInt. */
+unsigned long ZEXPORT crc32(crc, buf, len)
+    unsigned long crc;              /* forwarded to crc32_z() */
+    const unsigned char FAR *buf;   /* forwarded to crc32_z() */
+    uInt len;                       /* passed where crc32_z() takes z_size_t */
+{
+    return crc32_z(crc, buf, len);
+}
+
+#ifdef BYFOUR
+
+/*
+   This BYFOUR code accesses the passed unsigned char * buffer with a 32-bit
+   integer pointer type. This violates the strict aliasing rule, where a
+   compiler can assume, for optimization purposes, that two pointers to
+   fundamentally different types won't ever point to the same memory. This can
+   manifest as a problem only if one of the pointers is written to. This code
+   only reads from those pointers. So long as this code remains isolated in
+   this compilation unit, there won't be a problem. For this reason, this code
+   should not be copied and pasted into a compilation unit in which other code
+   writes to the buffer that is passed to these routines.
+ */
+
+/* DOLIT4 folds one 32-bit word at buf4 into c; DOLIT32 is eight DOLIT4s. */
+#define DOLIT4 c ^= *buf4++; \
+        c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+            crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+
+/* Word-at-a-time CRC, picked by crc32_z() on little-endian hosts. */
+local unsigned long crc32_little(crc, buf, len)
+    unsigned long crc;              /* CRC value to continue from */
+    const unsigned char FAR *buf;   /* bytes to process */
+    z_size_t len;                   /* number of bytes at buf */
+{
+    register z_crc_t c;             /* working CRC, inverted while in use */
+    register const z_crc_t FAR *buf4;   /* buf read through a z_crc_t pointer */
+
+    c = (z_crc_t)crc;
+    c = ~c;
+    while (len && ((ptrdiff_t)buf & 3)) {   /* bytes before 4-byte alignment */
+        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+        len--;
+    }
+
+    buf4 = (const z_crc_t FAR *)(const void FAR *)buf;
+    while (len >= 32) {             /* 32 bytes per iteration */
+        DOLIT32;
+        len -= 32;
+    }
+    while (len >= 4) {              /* remaining whole 4-byte groups */
+        DOLIT4;
+        len -= 4;
+    }
+    buf = (const unsigned char FAR *)buf4;  /* back to byte access */
+
+    if (len) do {                   /* trailing 1..3 bytes */
+        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+    } while (--len);
+    c = ~c;
+    return (unsigned long)c;
+}
+
+/* DOBIG4 folds one 32-bit word at buf4 into c; DOBIG32 is eight DOBIG4s. */
+#define DOBIG4 c ^= *buf4++; \
+        c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+            crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+
+/* Word-at-a-time CRC, picked by crc32_z() on hosts that are not little-endian. */
+local unsigned long crc32_big(crc, buf, len)
+    unsigned long crc;              /* CRC value to continue from */
+    const unsigned char FAR *buf;   /* bytes to process */
+    z_size_t len;                   /* number of bytes at buf */
+{
+    register z_crc_t c;             /* working CRC: byte-swapped and inverted */
+    register const z_crc_t FAR *buf4;   /* buf read through a z_crc_t pointer */
+
+    c = ZSWAP32((z_crc_t)crc);
+    c = ~c;
+    while (len && ((ptrdiff_t)buf & 3)) {   /* bytes before 4-byte alignment */
+        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+        len--;
+    }
+
+    buf4 = (const z_crc_t FAR *)(const void FAR *)buf;
+    while (len >= 32) {             /* 32 bytes per iteration */
+        DOBIG32;
+        len -= 32;
+    }
+    while (len >= 4) {              /* remaining whole 4-byte groups */
+        DOBIG4;
+        len -= 4;
+    }
+    buf = (const unsigned char FAR *)buf4;  /* back to byte access */
+
+    if (len) do {                   /* trailing 1..3 bytes */
+        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+    } while (--len);
+    c = ~c;
+    return (unsigned long)(ZSWAP32(c));     /* swap back before returning */
+}
+
+#endif /* BYFOUR */
+
+#define GF2_DIM 32      /* dimension of GF(2) vectors (length of CRC) */
+
+/* Xor together the entries of mat selected by the set bits of vec. */
+local unsigned long gf2_matrix_times(mat, vec)
+    unsigned long *mat;         /* entry i pairs with bit i of vec */
+    unsigned long vec;          /* bit vector choosing entries */
+{
+    unsigned long acc = 0;      /* running xor of the chosen entries */
+    const unsigned long *row;   /* entry paired with vec's current low bit */
+
+    /* Consume vec from bit 0 upward; stop as soon as no set bit remains. */
+    for (row = mat; vec != 0; vec >>= 1, row++) {
+        if ((vec & 1) != 0)
+            acc ^= *row;
+    }
+
+    return acc;
+}
+
+/* Fill square[] with mat applied to each of mat's own GF2_DIM entries. */
+local void gf2_matrix_square(square, mat)
+    unsigned long *square;      /* output: GF2_DIM entries are written */
+    unsigned long *mat;         /* input: GF2_DIM entries are read */
+{
+    int idx;
+
+    for (idx = 0; idx != GF2_DIM; ++idx)
+        square[idx] = gf2_matrix_times(mat, mat[idx]);
+}
+
+/* Core combiner: advance crc1 over len2 zero bytes, then xor in crc2. */
+local uLong crc32_combine_(crc1, crc2, len2)
+    uLong crc1;                     /* CRC that the zeros operator is applied to */
+    uLong crc2;                     /* CRC xor-ed into the result at the end */
+    z_off64_t len2;                 /* count of zero bytes; <= 0 returns crc1 */
+{
+    int n;
+    unsigned long row;              /* single-bit value, shifted left per entry */
+    unsigned long even[GF2_DIM];    /* even-power-of-two zeros operator */
+    unsigned long odd[GF2_DIM];     /* odd-power-of-two zeros operator */
+
+    /* degenerate case (also disallow negative lengths) */
+    if (len2 <= 0)
+        return crc1;
+
+    /* put operator for one zero bit in odd */
+    odd[0] = 0xedb88320UL;          /* CRC-32 polynomial */
+    row = 1;
+    for (n = 1; n < GF2_DIM; n++) {
+        odd[n] = row;
+        row <<= 1;
+    }
+
+    /* put operator for two zero bits in even */
+    gf2_matrix_square(even, odd);
+
+    /* put operator for four zero bits in odd */
+    gf2_matrix_square(odd, even);
+
+    /* apply len2 zeros to crc1 (first square will put the operator for one
+       zero byte, eight zero bits, in even) */
+    do {
+        /* apply zeros operator for this bit of len2 */
+        gf2_matrix_square(even, odd);
+        if (len2 & 1)
+            crc1 = gf2_matrix_times(even, crc1);
+        len2 >>= 1;
+
+        /* if no more bits set, then done */
+        if (len2 == 0)
+            break;
+
+        /* another iteration of the loop with odd and even swapped */
+        gf2_matrix_square(odd, even);
+        if (len2 & 1)
+            crc1 = gf2_matrix_times(odd, crc1);
+        len2 >>= 1;
+
+        /* if no more bits set, then done */
+    } while (len2 != 0);
+
+    /* return combined crc */
+    crc1 ^= crc2;
+    return crc1;
+}
+
+/* Public combiner taking the length as z_off_t; defers to crc32_combine_(). */
+uLong ZEXPORT crc32_combine(crc1, crc2, len2)
+    uLong crc1;                     /* forwarded to crc32_combine_() */
+    uLong crc2;                     /* forwarded to crc32_combine_() */
+    z_off_t len2;                   /* passed where z_off64_t is expected */
+{
+    return crc32_combine_(crc1, crc2, len2);
+}
+/* Public combiner taking the length as z_off64_t; same delegation. */
+uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
+    uLong crc1;                     /* forwarded to crc32_combine_() */
+    uLong crc2;                     /* forwarded to crc32_combine_() */
+    z_off64_t len2;                 /* forwarded to crc32_combine_() */
+{
+    return crc32_combine_(crc1, crc2, len2);
+}
diff --git a/deps/SZ/zlib/crc32.h b/deps/SZ/zlib/crc32.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e0c7781025148380d130d6f7b6e590117ad3a8c
--- /dev/null
+++ b/deps/SZ/zlib/crc32.h
@@ -0,0 +1,441 @@
+/* crc32.h -- tables for rapid CRC calculation
+ * Generated automatically by crc32.c
+ */
+
+local const z_crc_t FAR crc_table[TBLS][256] =
+{
+  {
+    0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
+    0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
+    0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
+    0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
+    0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
+    0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
+    0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
+    0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
+    0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
+    0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
+    0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
+    0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
+    0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
+    0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
+    0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
+    0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
+    0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
+    0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
+    0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
+    0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
+    0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
+    0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
+    0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
+    0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
+    0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
+    0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
+    0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
+    0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
+    0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
+    0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
+    0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
+    0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
+    0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
+    0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
+    0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
+    0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
+    0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
+    0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
+    0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
+    0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
+    0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
+    0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
+    0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
+    0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
+    0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
+    0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
+    0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
+    0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
+    0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
+    0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
+    0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
+    0x2d02ef8dUL
+#ifdef BYFOUR
+  },
+  {
+    0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL,
+    0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL,
+    0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL,
+    0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL,
+    0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL,
+    0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL,
+    0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL,
+    0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL,
+    0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL,
+    0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL,
+    0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL,
+    0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL,
+    0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL,
+    0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL,
+    0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL,
+    0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL,
+    0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL,
+    0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL,
+    0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL,
+    0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL,
+    0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL,
+    0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL,
+    0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL,
+    0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL,
+    0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL,
+    0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL,
+    0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL,
+    0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL,
+    0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL,
+    0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL,
+    0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL,
+    0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL,
+    0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL,
+    0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL,
+    0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL,
+    0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL,
+    0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL,
+    0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL,
+    0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL,
+    0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL,
+    0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL,
+    0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL,
+    0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL,
+    0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL,
+    0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL,
+    0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL,
+    0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL,
+    0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL,
+    0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL,
+    0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL,
+    0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL,
+    0x9324fd72UL
+  },
+  {
+    0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL,
+    0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL,
+    0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL,
+    0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL,
+    0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL,
+    0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL,
+    0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL,
+    0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL,
+    0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL,
+    0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL,
+    0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL,
+    0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL,
+    0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL,
+    0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL,
+    0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL,
+    0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL,
+    0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL,
+    0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL,
+    0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL,
+    0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL,
+    0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL,
+    0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL,
+    0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL,
+    0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL,
+    0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL,
+    0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL,
+    0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL,
+    0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL,
+    0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL,
+    0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL,
+    0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL,
+    0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL,
+    0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL,
+    0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL,
+    0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL,
+    0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL,
+    0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL,
+    0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL,
+    0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL,
+    0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL,
+    0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL,
+    0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL,
+    0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL,
+    0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL,
+    0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL,
+    0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL,
+    0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL,
+    0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL,
+    0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL,
+    0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL,
+    0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL,
+    0xbe9834edUL
+  },
+  {
+    0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL,
+    0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL,
+    0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL,
+    0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL,
+    0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL,
+    0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL,
+    0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL,
+    0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL,
+    0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL,
+    0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL,
+    0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL,
+    0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL,
+    0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL,
+    0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL,
+    0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL,
+    0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL,
+    0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL,
+    0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL,
+    0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL,
+    0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL,
+    0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL,
+    0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL,
+    0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL,
+    0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL,
+    0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL,
+    0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL,
+    0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL,
+    0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL,
+    0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL,
+    0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL,
+    0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL,
+    0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL,
+    0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL,
+    0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL,
+    0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL,
+    0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL,
+    0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL,
+    0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL,
+    0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL,
+    0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL,
+    0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL,
+    0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL,
+    0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL,
+    0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL,
+    0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL,
+    0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL,
+    0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL,
+    0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL,
+    0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL,
+    0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL,
+    0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL,
+    0xde0506f1UL
+  },
+  {
+    0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL,
+    0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL,
+    0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL,
+    0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL,
+    0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL,
+    0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL,
+    0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL,
+    0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL,
+    0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL,
+    0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL,
+    0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL,
+    0x0fa5bdb8UL, 0x9eb80228UL, 0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL,
+    0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL,
+    0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL,
+    0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL,
+    0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL,
+    0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL,
+    0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL,
+    0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL,
+    0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL,
+    0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL,
+    0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL,
+    0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL,
+    0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL,
+    0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL,
+    0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL,
+    0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL,
+    0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL,
+    0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL,
+    0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL,
+    0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL,
+    0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL,
+    0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL,
+    0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL,
+    0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL,
+    0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL,
+    0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL,
+    0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL,
+    0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL,
+    0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL,
+    0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL,
+    0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL,
+    0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL,
+    0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL,
+    0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL,
+    0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL,
+    0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL,
+    0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL,
+    0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL,
+    0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL,
+    0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL,
+    0x8def022dUL
+  },
+  {
+    0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL,
+    0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL,
+    0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL,
+    0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL,
+    0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL,
+    0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL,
+    0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL,
+    0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL,
+    0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL,
+    0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL,
+    0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL,
+    0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL,
+    0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL,
+    0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL,
+    0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL,
+    0xce968d13UL, 0x0900cc5cUL, 0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL,
+    0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL,
+    0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL,
+    0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL,
+    0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL,
+    0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL,
+    0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL,
+    0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL,
+    0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL,
+    0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL,
+    0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL,
+    0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL,
+    0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL,
+    0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL,
+    0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL,
+    0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL,
+    0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL,
+    0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL,
+    0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL,
+    0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL,
+    0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL,
+    0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL,
+    0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL,
+    0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL,
+    0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL,
+    0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL,
+    0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL,
+    0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL,
+    0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL,
+    0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL,
+    0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL,
+    0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL,
+    0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL,
+    0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL,
+    0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL,
+    0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL,
+    0x72fd2493UL
+  },
+  {
+    0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL,
+    0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL,
+    0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL,
+    0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL,
+    0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL,
+    0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL,
+    0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL,
+    0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL,
+    0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL,
+    0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL,
+    0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL,
+    0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL,
+    0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL,
+    0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL,
+    0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL,
+    0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL,
+    0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL,
+    0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL,
+    0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL,
+    0x8d69e067UL, 0x20cbd748UL, 0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL,
+    0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL,
+    0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL,
+    0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL,
+    0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL,
+    0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL,
+    0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL,
+    0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL,
+    0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL,
+    0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL,
+    0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL,
+    0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL,
+    0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL,
+    0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL,
+    0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL,
+    0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL,
+    0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL,
+    0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL,
+    0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL,
+    0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL,
+    0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL,
+    0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL,
+    0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL,
+    0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL,
+    0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL,
+    0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL,
+    0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL,
+    0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL,
+    0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL,
+    0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL,
+    0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL,
+    0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL,
+    0xed3498beUL
+  },
+  {
+    0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL,
+    0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL,
+    0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL,
+    0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL,
+    0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL,
+    0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL,
+    0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL,
+    0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL,
+    0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL,
+    0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL,
+    0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL,
+    0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL,
+    0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL,
+    0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL,
+    0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL,
+    0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL,
+    0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL,
+    0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL,
+    0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL,
+    0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL,
+    0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL,
+    0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL,
+    0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL,
+    0x720f8a79UL, 0xcb375de4UL, 0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL,
+    0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL,
+    0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL,
+    0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL,
+    0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL,
+    0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL,
+    0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL,
+    0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL,
+    0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL,
+    0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL,
+    0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL,
+    0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL,
+    0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL,
+    0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL,
+    0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL,
+    0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL,
+    0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL,
+    0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL,
+    0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL,
+    0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL,
+    0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL,
+    0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL,
+    0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL,
+    0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL,
+    0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL,
+    0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL,
+    0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL,
+    0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL,
+    0xf10605deUL
+#endif
+  }
+};
diff --git a/deps/SZ/zlib/deflate.c b/deps/SZ/zlib/deflate.c
new file mode 100644
index 0000000000000000000000000000000000000000..1ec761448de926724c359256bbff0e8d9e851415
--- /dev/null
+++ b/deps/SZ/zlib/deflate.c
@@ -0,0 +1,2163 @@
+/* deflate.c -- compress data using the deflation algorithm
+ * Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/*
+ *  ALGORITHM
+ *
+ *      The "deflation" process depends on being able to identify portions
+ *      of the input text which are identical to earlier input (within a
+ *      sliding window trailing behind the input currently being processed).
+ *
+ *      The most straightforward technique turns out to be the fastest for
+ *      most input files: try all possible matches and select the longest.
+ *      The key feature of this algorithm is that insertions into the string
+ *      dictionary are very simple and thus fast, and deletions are avoided
+ *      completely. Insertions are performed at each input character, whereas
+ *      string matches are performed only when the previous match ends. So it
+ *      is preferable to spend more time in matches to allow very fast string
+ *      insertions and avoid deletions. The matching algorithm for small
+ *      strings is inspired by that of Rabin & Karp. A brute force approach
+ *      is used to find longer strings when a small match has been found.
+ *      A similar algorithm is used in comic (by Jan-Mark Wams) and freeze
+ *      (by Leonid Broukhis).
+ *         A previous version of this file used a more sophisticated algorithm
+ *      (by Fiala and Greene) which is guaranteed to run in linear amortized
+ *      time, but has a larger average cost, uses more memory and is patented.
+ *      However the F&G algorithm may be faster for some highly redundant
+ *      files if the parameter max_chain_length (described below) is too large.
+ *
+ *  ACKNOWLEDGEMENTS
+ *
+ *      The idea of lazy evaluation of matches is due to Jan-Mark Wams, and
+ *      I found it in 'freeze' written by Leonid Broukhis.
+ *      Thanks to many people for bug reports and testing.
+ *
+ *  REFERENCES
+ *
+ *      Deutsch, L.P.,"DEFLATE Compressed Data Format Specification".
+ *      Available in http://tools.ietf.org/html/rfc1951
+ *
+ *      A description of the Rabin and Karp algorithm is given in the book
+ *         "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
+ *
+ *      Fiala,E.R., and Greene,D.H.
+ *         Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-505
+ *
+ */
+
+/* @(#) $Id$ */
+
+#include "deflate.h"
+
+const char deflate_copyright[] =
+   " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
+/*
+  If you use the zlib library in a product, an acknowledgment is welcome
+  in the documentation of your product. If for some reason you cannot
+  include such an acknowledgment, I would appreciate that you keep this
+  copyright string in the executable of your product.
+ */
+
+/* ===========================================================================
+ *  Function prototypes.
+ */
+typedef enum {      /* outcome reported by a compress_func after each call */
+    need_more,      /* block not completed, need more input or more output */
+    block_done,     /* block flush performed */
+    finish_started, /* finish started, need only more output at next deflate */
+    finish_done     /* finish done, accept no more input or output */
+} block_state;
+
+typedef block_state (*compress_func) OF((deflate_state *s, int flush));
+/* Compression function (config.func). Returns the block state after the call. */
+
+local int deflateStateCheck      OF((z_streamp strm)); /* 0 if ok, 1 if not */
+local void slide_hash     OF((deflate_state *s)); /* rebase head[]/prev[] */
+local void fill_window    OF((deflate_state *s));
+local block_state deflate_stored OF((deflate_state *s, int flush)); /* level 0 */
+local block_state deflate_fast   OF((deflate_state *s, int flush)); /* 1..3 */
+#ifndef FASTEST
+local block_state deflate_slow   OF((deflate_state *s, int flush)); /* 4..9 */
+#endif
+local block_state deflate_rle    OF((deflate_state *s, int flush));
+local block_state deflate_huff   OF((deflate_state *s, int flush));
+local void lm_init        OF((deflate_state *s));
+local void putShortMSB    OF((deflate_state *s, uInt b));
+local void flush_pending  OF((z_streamp strm));
+local unsigned read_buf   OF((z_streamp strm, Bytef *buf, unsigned size));
+#ifdef ASMV     /* longest_match comes from the assembler code, so not local */
+#  pragma message("Assembler code may have bugs -- use at your own risk")
+      void match_init OF((void)); /* asm code initialization */
+      uInt longest_match  OF((deflate_state *s, IPos cur_match));
+#else
+local uInt longest_match  OF((deflate_state *s, IPos cur_match));
+#endif
+
+#ifdef ZLIB_DEBUG       /* check_match is only declared for debug builds */
+local  void check_match OF((deflate_state *s, IPos start, IPos match,
+                            int length));
+#endif
+
+/* ===========================================================================
+ * Local data
+ */
+
+#define NIL 0
+/* Tail of hash chains; also what slide_hash() stores for positions < w_size */
+
+#ifndef TOO_FAR     /* a value predefined by the build overrides the default */
+#  define TOO_FAR 4096
+#endif
+/* Matches of length 3 are discarded if their distance exceeds TOO_FAR */
+
+/* Values for max_lazy_match, good_match and max_chain_length, depending on
+ * the desired pack level (0..9). The values given below have been tuned to
+ * exclude worst case performance for pathological files. Better values may be
+ * found for specific files. One config entry describes one pack level.
+ */
+typedef struct config_s {
+   ush good_length; /* reduce lazy search above this match length */
+   ush max_lazy;    /* do not perform lazy search above this match length */
+   ush nice_length; /* quit search above this match length */
+   ush max_chain;   /* max_chain_length for this level ("chain" column) */
+   compress_func func; /* compression function used at this level */
+} config;
+
+#ifdef FASTEST  /* two levels only: deflateInit2_ maps every level != 0 to 1 */
+local const config configuration_table[2] = {
+/*      good lazy nice chain  func */
+/* 0 */ {0,    0,  0,    0, deflate_stored},  /* store only */
+/* 1 */ {4,    4,  8,    4, deflate_fast}}; /* max speed, no lazy matches */
+#else           /* one row per pack level 0..9, indexed by level */
+local const config configuration_table[10] = {
+/*      good lazy nice chain  func */
+/* 0 */ {0,    0,  0,    0, deflate_stored},  /* store only */
+/* 1 */ {4,    4,  8,    4, deflate_fast}, /* max speed, no lazy matches */
+/* 2 */ {4,    5, 16,    8, deflate_fast},
+/* 3 */ {4,    6, 32,   32, deflate_fast},
+
+/* 4 */ {4,    4, 16,   16, deflate_slow},  /* lazy matches */
+/* 5 */ {8,   16, 32,   32, deflate_slow},
+/* 6 */ {8,   16, 128, 128, deflate_slow}, /* Z_DEFAULT_COMPRESSION maps here */
+/* 7 */ {8,   32, 128, 256, deflate_slow},
+/* 8 */ {32, 128, 258, 1024, deflate_slow},
+/* 9 */ {32, 258, 258, 4096, deflate_slow}}; /* max compression */
+#endif
+
+/* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4
+ * For deflate_fast() (levels <= 3) good is ignored and lazy has a different
+ * meaning.
+ */
+
+/* rank Z_BLOCK between Z_NO_FLUSH and Z_PARTIAL_FLUSH: 2f, or 2f - 9 if f > 4 */
+#define RANK(f) (((f) * 2) - ((f) > 4 ? 9 : 0))   /* note: evaluates f twice */
+
+/* ===========================================================================
+ * Update hash value h in place with input byte c:
+ *    h = ((h << s->hash_shift) ^ c) & s->hash_mask   (h must be an lvalue)
+ * IN  assertion: all calls to UPDATE_HASH are made with consecutive input
+ *    characters, so the running key need not be recomputed from scratch.
+ */
+#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
+
+
+/* ===========================================================================
+ * Insert string str in the dictionary and set match_head to the previous head
+ * of the hash chain (the most recent string with same hash key). The macro
+ * expression itself evaluates to the new head, (Pos)(str).
+ * If this file is compiled with -DFASTEST, any non-zero compression level is
+ * forced to 1, and no hash chains are maintained (prev[] is not written).
+ * IN  assertion: all calls to INSERT_STRING are made with consecutive input
+ *    characters and the first MIN_MATCH bytes of str are valid (except for
+ *    the last MIN_MATCH-1 bytes of the input file).
+ */
+#ifdef FASTEST
+#define INSERT_STRING(s, str, match_head) \
+   (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+    match_head = s->head[s->ins_h], \
+    s->head[s->ins_h] = (Pos)(str))
+#else
+#define INSERT_STRING(s, str, match_head) \
+   (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+    match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
+    s->head[s->ins_h] = (Pos)(str))
+#endif
+
+/* ===========================================================================
+ * Initialize the hash table (avoiding 64K overflow for 16 bit systems).
+ * prev[] will be initialized on the fly.
+ */
+#define CLEAR_HASH(s) \
+    s->head[s->hash_size-1] = NIL; \
+    zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head));
+
+/* ===========================================================================
+ * Slide the hash table when the window slides down: each position stored in
+ * head[] (and in prev[] unless FASTEST) is reduced by w_size, or set to NIL
+ * if it is below w_size. (32 bit values would avoid this at the expense of
+ * memory.) Done even when level == 0 so the table stays consistent if we
+ * switch back to level > 0 later.
+ */
+local void slide_hash(s)
+    deflate_state *s;
+{
+    uInt wsize = s->w_size;
+    unsigned left, pos;
+    Posf *slot;
+
+    slot = s->head + s->hash_size;
+    left = s->hash_size;
+    do {                                /* head[], walked from the top down */
+        pos = *--slot;
+        if (pos >= wsize) *slot = (Pos)(pos - wsize); else *slot = (Pos)NIL;
+    } while (--left != 0);
+
+    left = wsize;
+#ifndef FASTEST
+    slot = s->prev + left;
+    do {    /* a prev[] slot on no hash chain holds garbage, but is never used */
+        pos = *--slot;
+        if (pos >= wsize) *slot = (Pos)(pos - wsize); else *slot = (Pos)NIL;
+    } while (--left != 0);
+#endif
+}
+
+/* ========================================================================= */
+int ZEXPORT deflateInit_(strm, level, version, stream_size)
+    z_streamp strm;
+    const char *version;
+    int level, stream_size;
+{   /* Forwards to deflateInit2_() with Z_DEFLATED, MAX_WBITS, DEF_MEM_LEVEL
+     * and Z_DEFAULT_STRATEGY. To do: ignore strm->next_in if used as window */
+    int status = deflateInit2_(strm, level, Z_DEFLATED, MAX_WBITS,
+                     DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY, version, stream_size);
+    return status;
+}
+
+/* ========================================================================= */
+int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
+                  version, stream_size)
+    z_streamp strm;
+    int  level;
+    int  method;
+    int  windowBits;
+    int  memLevel;
+    int  strategy;
+    const char *version;
+    int stream_size;
+{   /* validate the arguments, allocate state + buffers, then deflateReset() */
+    deflate_state *s;
+    int wrap = 1;   /* 1 = zlib wrapper, 0 = wrapper suppressed, 2 = gzip */
+    static const char my_version[] = ZLIB_VERSION;
+
+    ushf *overlay;
+    /* We overlay pending_buf and d_buf+l_buf. This works since the average
+     * output size for (length,distance) codes is <= 24 bits.
+     */
+
+    if (version == Z_NULL || version[0] != my_version[0] ||
+        stream_size != sizeof(z_stream)) {
+        return Z_VERSION_ERROR; /* first version char or z_stream size differs */
+    }
+    if (strm == Z_NULL) return Z_STREAM_ERROR;
+
+    strm->msg = Z_NULL;
+    if (strm->zalloc == (alloc_func)0) {
+#ifdef Z_SOLO
+        return Z_STREAM_ERROR;  /* no fallback allocator in Z_SOLO builds */
+#else
+        strm->zalloc = zcalloc; /* fallback when the caller supplied none */
+        strm->opaque = (voidpf)0;
+#endif
+    }
+    if (strm->zfree == (free_func)0)
+#ifdef Z_SOLO
+        return Z_STREAM_ERROR;
+#else
+        strm->zfree = zcfree;   /* fallback when the caller supplied none */
+#endif
+
+#ifdef FASTEST
+    if (level != 0) level = 1;  /* FASTEST builds only know levels 0 and 1 */
+#else
+    if (level == Z_DEFAULT_COMPRESSION) level = 6;
+#endif
+
+    if (windowBits < 0) { /* suppress zlib wrapper */
+        wrap = 0;
+        windowBits = -windowBits;
+    }
+#ifdef GZIP
+    else if (windowBits > 15) {
+        wrap = 2;       /* write gzip wrapper instead */
+        windowBits -= 16;
+    }
+#endif
+    if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || method != Z_DEFLATED ||
+        windowBits < 8 || windowBits > 15 || level < 0 || level > 9 ||
+        strategy < 0 || strategy > Z_FIXED || (windowBits == 8 && wrap != 1)) {
+        return Z_STREAM_ERROR;  /* nothing has been allocated at this point */
+    }
+    if (windowBits == 8) windowBits = 9;  /* until 256-byte window bug fixed */
+    s = (deflate_state *) ZALLOC(strm, 1, sizeof(deflate_state));
+    if (s == Z_NULL) return Z_MEM_ERROR;
+    strm->state = (struct internal_state FAR *)s;
+    s->strm = strm;     /* back-pointer that deflateStateCheck() verifies */
+    s->status = INIT_STATE;     /* to pass state test in deflateReset() */
+
+    s->wrap = wrap;
+    s->gzhead = Z_NULL;
+    s->w_bits = (uInt)windowBits;
+    s->w_size = 1 << s->w_bits;
+    s->w_mask = s->w_size - 1;
+
+    s->hash_bits = (uInt)memLevel + 7;
+    s->hash_size = 1 << s->hash_bits;
+    s->hash_mask = s->hash_size - 1;
+    s->hash_shift =  ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH); /* rounded up */
+
+    s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte));
+    s->prev   = (Posf *)  ZALLOC(strm, s->w_size, sizeof(Pos));
+    s->head   = (Posf *)  ZALLOC(strm, s->hash_size, sizeof(Pos));
+
+    s->high_water = 0;      /* nothing written to s->window yet */
+
+    s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */
+
+    overlay = (ushf *) ZALLOC(strm, s->lit_bufsize, sizeof(ush)+2);
+    s->pending_buf = (uchf *) overlay;
+    s->pending_buf_size = (ulg)s->lit_bufsize * (sizeof(ush)+2L);
+
+    if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL ||
+        s->pending_buf == Z_NULL) {     /* all four are tested only here */
+        s->status = FINISH_STATE;   /* a status deflateStateCheck() accepts */
+        strm->msg = ERR_MSG(Z_MEM_ERROR);
+        deflateEnd (strm);  /* NOTE(review): presumably frees the partial set */
+        return Z_MEM_ERROR;
+    }
+    s->d_buf = overlay + s->lit_bufsize/sizeof(ush); /* lit_bufsize bytes in */
+    s->l_buf = s->pending_buf + (1+sizeof(ush))*s->lit_bufsize;
+
+    s->level = level;
+    s->strategy = strategy;
+    s->method = (Byte)method;
+
+    return deflateReset(strm);
+}
+
+/* =========================================================================
+ * Check for a valid deflate stream state: 0 if every test passes, 1 if not.
+ */
+local int deflateStateCheck (strm)
+    z_streamp strm;
+{
+    deflate_state *state;
+
+    /* the stream must exist and carry both allocator hooks */
+    if (strm == Z_NULL) return 1;
+    if (strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0) return 1;
+    /* its state must exist and point back at this very stream */
+    state = strm->state;
+    if (state == Z_NULL || state->strm != strm) return 1;
+    if (state->status == INIT_STATE || state->status == EXTRA_STATE ||
+        state->status == NAME_STATE || state->status == COMMENT_STATE ||
+        state->status == HCRC_STATE || state->status == BUSY_STATE ||
+#ifdef GZIP
+        state->status == GZIP_STATE ||
+#endif
+        state->status == FINISH_STATE)
+        return 0;       /* status is one of the known state codes */
+    return 1;
+}
+
+/* ========================================================================= */
+int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
+    z_streamp strm;
+    const Bytef *dictionary;
+    uInt  dictLength;
+{   /* preload the window and the hash table from dictionary */
+    deflate_state *s;
+    uInt str, n;
+    int wrap;               /* s->wrap as found on entry, restored at the end */
+    unsigned avail;         /* caller's strm->avail_in, restored at the end */
+    z_const unsigned char *next;    /* caller's strm->next_in, likewise */
+
+    if (deflateStateCheck(strm) || dictionary == Z_NULL)
+        return Z_STREAM_ERROR;
+    s = strm->state;
+    wrap = s->wrap;
+    if (wrap == 2 || (wrap == 1 && s->status != INIT_STATE) || s->lookahead)
+        return Z_STREAM_ERROR;  /* gzip wrap, zlib wrap past INIT, or lookahead */
+
+    /* when using zlib wrappers, compute Adler-32 for provided dictionary */
+    if (wrap == 1)
+        strm->adler = adler32(strm->adler, dictionary, dictLength);
+    s->wrap = 0;                    /* avoid computing Adler-32 in read_buf */
+
+    /* if dictionary would fill window, just replace the history */
+    if (dictLength >= s->w_size) {
+        if (wrap == 0) {            /* already empty otherwise */
+            CLEAR_HASH(s);
+            s->strstart = 0;
+            s->block_start = 0L;
+            s->insert = 0;
+        }
+        dictionary += dictLength - s->w_size;  /* use the tail */
+        dictLength = s->w_size;
+    }
+
+    /* insert dictionary into window and hash */
+    avail = strm->avail_in;
+    next = strm->next_in;
+    strm->avail_in = dictLength;    /* present the dictionary as the input */
+    strm->next_in = (z_const Bytef *)dictionary;
+    fill_window(s);
+    while (s->lookahead >= MIN_MATCH) {
+        str = s->strstart;
+        n = s->lookahead - (MIN_MATCH-1);   /* positions with MIN_MATCH bytes */
+        do {
+            UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
+#ifndef FASTEST
+            s->prev[str & s->w_mask] = s->head[s->ins_h];
+#endif
+            s->head[s->ins_h] = (Pos)str;
+            str++;
+        } while (--n);
+        s->strstart = str;
+        s->lookahead = MIN_MATCH-1;     /* the unhashed tail stays lookahead */
+        fill_window(s);
+    }
+    s->strstart += s->lookahead;    /* step past the remaining tail bytes */
+    s->block_start = (long)s->strstart;
+    s->insert = s->lookahead;       /* record how many bytes were not hashed */
+    s->lookahead = 0;
+    s->match_length = s->prev_length = MIN_MATCH-1;
+    s->match_available = 0;
+    strm->next_in = next;           /* hand the caller's input back */
+    strm->avail_in = avail;
+    s->wrap = wrap;                 /* undo the temporary s->wrap = 0 above */
+    return Z_OK;
+}
+
+/* ========================================================================= */
+int ZEXPORT deflateGetDictionary (strm, dictionary, dictLength)
+    z_streamp strm;
+    Bytef *dictionary;
+    uInt  *dictLength;
+{   /* Copy out the last bytes of window data; either output may be Z_NULL */
+    deflate_state *state;
+    uInt end, count;
+
+    if (deflateStateCheck(strm) != 0) return Z_STREAM_ERROR;
+    state = strm->state;
+    /* data ends at strstart + lookahead; report at most w_size bytes of it */
+    end = state->strstart + state->lookahead;
+    count = end > state->w_size ? state->w_size : end;
+    if (count != 0 && dictionary != Z_NULL)
+        zmemcpy(dictionary,
+                state->window + state->strstart + state->lookahead - count,
+                count);
+    if (dictLength != Z_NULL) *dictLength = count;
+    return Z_OK;
+}
+
+/* ========================================================================= */
+int ZEXPORT deflateResetKeep (strm)
+    z_streamp strm;
+{   /* Reset the stream counters, pending output, status and check value,
+     * then call _tr_init(). Returns Z_STREAM_ERROR for an invalid stream. */
+    deflate_state *state;
+
+    if (deflateStateCheck(strm) != 0) return Z_STREAM_ERROR;
+    state = (deflate_state *)strm->state;
+
+    strm->total_out = 0;
+    strm->total_in = 0;
+    strm->data_type = Z_UNKNOWN;
+    strm->msg = Z_NULL; /* use zfree if we ever allocate msg dynamically */
+
+    /* nothing is pending any more */
+    state->pending_out = state->pending_buf;
+    state->pending = 0;
+
+    /* deflate(..., Z_FINISH) leaves wrap negative; make it positive again */
+    if (state->wrap < 0) state->wrap = -state->wrap;
+
+    /* the wrapper kind selects both the start status and the check value */
+#ifdef GZIP
+    if (state->wrap == 2) {
+        state->status = GZIP_STATE;
+        strm->adler = crc32(0L, Z_NULL, 0);
+    } else
+#endif
+    {
+        state->status = state->wrap ? INIT_STATE : BUSY_STATE;
+        strm->adler = adler32(0L, Z_NULL, 0);
+    }
+    state->last_flush = Z_NO_FLUSH;
+    _tr_init(state);
+    return Z_OK;
+}
+
+/* =========================================================================
+ * Fully reset the stream: deflateResetKeep() followed, on success, by
+ * lm_init() to reinitialize the match-finder state.  Returns whatever
+ * deflateResetKeep() returned.
+ */
+int ZEXPORT deflateReset (strm)
+    z_streamp strm;
+{
+    int status = deflateResetKeep(strm);
+
+    if (status != Z_OK)
+        return status;
+
+    lm_init(strm->state);
+    return Z_OK;
+}
+
+/* =========================================================================
+ * Attach a caller-supplied gzip header description to the stream.
+ *
+ * Only the pointer is stored (in state->gzhead); nothing is copied here.
+ * Returns Z_STREAM_ERROR if deflateStateCheck() rejects strm or the stream
+ * is not using wrap == 2, Z_OK otherwise.
+ */
+int ZEXPORT deflateSetHeader (strm, head)
+    z_streamp strm;
+    gz_headerp head;
+{
+    if (deflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    if (strm->state->wrap != 2)
+        return Z_STREAM_ERROR;
+
+    strm->state->gzhead = head;
+    return Z_OK;
+}
+
+/* =========================================================================
+ * Report how much output is buffered inside the compressor.
+ *
+ * Stores state->pending in *pending and state->bi_valid in *bits; either
+ * pointer may be Z_NULL to skip that value.  Returns Z_STREAM_ERROR if
+ * deflateStateCheck() rejects strm, Z_OK otherwise.
+ */
+int ZEXPORT deflatePending (strm, pending, bits)
+    z_streamp strm;
+    unsigned *pending;
+    int *bits;
+{
+    deflate_state *state;
+
+    if (deflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+
+    state = strm->state;
+    if (pending != Z_NULL)
+        *pending = state->pending;
+    if (bits != Z_NULL)
+        *bits = state->bi_valid;
+
+    return Z_OK;
+}
+
+/* =========================================================================
+ * Insert the low `bits` bits of `value` into the bit buffer (bi_buf),
+ * flushing completed bytes through _tr_flush_bits() as the buffer fills.
+ *
+ * Returns Z_STREAM_ERROR if deflateStateCheck() rejects strm, Z_BUF_ERROR
+ * if bits is outside 0..16 or there is not enough room between pending_out
+ * and d_buf for the inserted bits, and Z_OK otherwise.
+ */
+int ZEXPORT deflatePrime (strm, bits, value)
+    z_streamp strm;
+    int bits;
+    int value;
+{
+    deflate_state *s;
+    int put;
+
+    if (deflateStateCheck(strm)) return Z_STREAM_ERROR;
+    s = strm->state;
+    /* Reject bit counts the loop below cannot handle.  A negative count
+     * would end up in `put` and be used as a shift amount, which is
+     * undefined behavior, and the loop would not reach its bits == 0 exit.
+     * The upper limit of 16 is the one zlib.h documents for this call.
+     */
+    if (bits < 0 || bits > 16 ||
+        (Bytef *)(s->d_buf) < s->pending_out + ((Buf_size + 7) >> 3))
+        return Z_BUF_ERROR;
+    do {
+        /* take as many bits as still fit in the bit buffer */
+        put = Buf_size - s->bi_valid;
+        if (put > bits)
+            put = bits;
+        s->bi_buf |= (ush)((value & ((1 << put) - 1)) << s->bi_valid);
+        s->bi_valid += put;
+        _tr_flush_bits(s);
+        value >>= put;
+        bits -= put;
+    } while (bits);
+    return Z_OK;
+}
+
+/* =========================================================================
+ * Change the compression level and strategy of an existing stream.
+ *
+ * level may be Z_DEFAULT_COMPRESSION (mapped to 6) or 0..9; when FASTEST is
+ * defined every non-zero level is mapped to 1.  strategy must lie in
+ * 0..Z_FIXED.  If the strategy or the compression function changes and
+ * s->high_water is non-zero, the data buffered so far is first flushed with
+ * deflate(strm, Z_BLOCK).
+ *
+ * Returns Z_STREAM_ERROR for an invalid stream or parameter (or when the
+ * flush reports it), Z_BUF_ERROR if the flush left no output space, and
+ * Z_OK otherwise.
+ */
+int ZEXPORT deflateParams(strm, level, strategy)
+    z_streamp strm;
+    int level;
+    int strategy;
+{
+    deflate_state *s;
+    compress_func func;
+
+    if (deflateStateCheck(strm)) return Z_STREAM_ERROR;
+    s = strm->state;
+
+#ifdef FASTEST
+    if (level != 0) level = 1;
+#else
+    if (level == Z_DEFAULT_COMPRESSION) level = 6;
+#endif
+    if (level < 0 || level > 9 || strategy < 0 || strategy > Z_FIXED) {
+        return Z_STREAM_ERROR;
+    }
+    func = configuration_table[s->level].func;
+
+    if ((strategy != s->strategy || func != configuration_table[level].func) &&
+        s->high_water) {
+        /* Flush the last buffer: */
+        int err = deflate(strm, Z_BLOCK);
+        if (err == Z_STREAM_ERROR)
+            return err;
+        if (strm->avail_out == 0)
+            return Z_BUF_ERROR;
+    }
+    if (s->level != level) {
+        if (s->level == 0 && s->matches != 0) {
+            /* CLEAR_HASH is a macro defined elsewhere in this file; bracing
+             * both arms keeps its whole expansion inside the else arm even
+             * if it consists of more than one statement.
+             */
+            if (s->matches == 1) {
+                slide_hash(s);
+            } else {
+                CLEAR_HASH(s);
+            }
+            s->matches = 0;
+        }
+        /* adopt the tuning parameters of the new level */
+        s->level = level;
+        s->max_lazy_match   = configuration_table[level].max_lazy;
+        s->good_match       = configuration_table[level].good_length;
+        s->nice_match       = configuration_table[level].nice_length;
+        s->max_chain_length = configuration_table[level].max_chain;
+    }
+    s->strategy = strategy;
+    return Z_OK;
+}
+
+/* =========================================================================
+ * Override the four match-finder tuning parameters of a stream.
+ *
+ * The values are stored as given, without range checking, into good_match,
+ * max_lazy_match, nice_match and max_chain_length.  Returns Z_STREAM_ERROR
+ * if deflateStateCheck() rejects strm, Z_OK otherwise.
+ */
+int ZEXPORT deflateTune(strm, good_length, max_lazy, nice_length, max_chain)
+    z_streamp strm;
+    int good_length;
+    int max_lazy;
+    int nice_length;
+    int max_chain;
+{
+    deflate_state *state;
+
+    if (deflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+
+    state = strm->state;
+    state->max_chain_length = (uInt)max_chain;
+    state->nice_match       = nice_length;
+    state->max_lazy_match   = (uInt)max_lazy;
+    state->good_match       = (uInt)good_length;
+
+    return Z_OK;
+}
+
+/* =========================================================================
+ * Return an upper bound on the compressed size of sourceLen input bytes.
+ *
+ * With the default windowBits of 15 and memLevel of 8 the result is a close
+ * to exact, small bound.  Those defaults are coded as constants below on
+ * purpose: if the #define's change, this function has to change with them,
+ * and the tight formula is valid for exactly those settings only.
+ *
+ * For every other combination the result is a conservative worst case for
+ * the expansion caused by deflate emitting fixed blocks instead of stored
+ * blocks, which can happen on compressed data for some parameter choices.
+ *
+ * Closer bounds per windowBits/memLevel combination would be possible, but
+ * the conservative figure of about 14% expansion does not seem onerous for
+ * output buffer allocation.
+ */
+uLong ZEXPORT deflateBound(strm, sourceLen)
+    z_streamp strm;
+    uLong sourceLen;
+{
+    deflate_state *state;
+    uLong worst;                /* conservative bound, wrapper excluded */
+    uLong wrapper;              /* bytes added by the stream wrapper */
+
+    worst = sourceLen + ((sourceLen + 7) >> 3) + ((sourceLen + 63) >> 6) + 5;
+
+    /* if can't get parameters, return conservative bound plus zlib wrapper */
+    if (deflateStateCheck(strm))
+        return worst + 6;
+
+    /* work out how many bytes the wrapper adds */
+    state = strm->state;
+    if (state->wrap == 0) {                 /* raw deflate */
+        wrapper = 0;
+    }
+    else if (state->wrap == 1) {            /* zlib wrapper */
+        wrapper = 6;
+        if (state->strstart)
+            wrapper += 4;
+    }
+#ifdef GZIP
+    else if (state->wrap == 2) {            /* gzip wrapper */
+        gz_headerp hdr = state->gzhead;
+
+        wrapper = 18;
+        if (hdr != Z_NULL) {                /* user-supplied gzip header */
+            Bytef *p;
+
+            if (hdr->extra != Z_NULL)
+                wrapper += 2 + hdr->extra_len;
+
+            /* name and comment each count their bytes plus the final zero */
+            p = hdr->name;
+            if (p != Z_NULL) {
+                wrapper++;
+                while (*p++)
+                    wrapper++;
+            }
+            p = hdr->comment;
+            if (p != Z_NULL) {
+                wrapper++;
+                while (*p++)
+                    wrapper++;
+            }
+
+            if (hdr->hcrc)
+                wrapper += 2;
+        }
+    }
+#endif
+    else {                                  /* for compiler happiness */
+        wrapper = 6;
+    }
+
+    /* default settings get the tight bound, anything else the worst case */
+    if (state->w_bits == 15 && state->hash_bits == 8 + 7)
+        return sourceLen + (sourceLen >> 12) + (sourceLen >> 14) +
+               (sourceLen >> 25) + 13 - 6 + wrapper;
+
+    return worst + wrapper;
+}
+
+/* =========================================================================
+ * Append the low 16 bits of b to the pending buffer, high byte first.
+ * IN assertion: the stream state is correct and there is enough room in
+ * pending_buf.
+ */
+local void putShortMSB (s, b)
+    deflate_state *s;
+    uInt b;
+{
+    Byte high = (Byte)(b >> 8);
+    Byte low  = (Byte)(b & 0xff);
+
+    put_byte(s, high);
+    put_byte(s, low);
+}
+
+/* =========================================================================
+ * Move as much pending output as fits into strm->next_out.  All deflate()
+ * output, except for some deflate_stored() output, passes through here, so
+ * an application that wants to avoid allocating a large strm->next_out
+ * buffer and copying into it may wish to modify this function.
+ * (See also read_buf()).
+ */
+local void flush_pending(strm)
+    z_streamp strm;
+{
+    deflate_state *state = strm->state;
+    unsigned count;
+
+    _tr_flush_bits(state);
+
+    /* move no more than the caller has room for */
+    count = state->pending;
+    if (count > strm->avail_out)
+        count = strm->avail_out;
+    if (count == 0)
+        return;
+
+    zmemcpy(strm->next_out, state->pending_out, count);
+
+    strm->next_out  += count;
+    strm->avail_out -= count;
+    strm->total_out += count;
+
+    state->pending_out += count;
+    state->pending     -= count;
+
+    /* once drained, start filling from the front of the buffer again */
+    if (state->pending == 0)
+        state->pending_out = state->pending_buf;
+}
+
+/* ===========================================================================
+ * Update the header CRC with the bytes s->pending_buf[beg..s->pending - 1].
+ * Nothing happens unless s->gzhead->hcrc is set and s->pending > beg.  The
+ * macro takes `s` and `strm` from the scope in which it is expanded and
+ * stores the updated CRC in strm->adler.
+ */
+#define HCRC_UPDATE(beg) \
+    do { \
+        if (s->gzhead->hcrc && s->pending > (beg)) \
+            strm->adler = crc32(strm->adler, s->pending_buf + (beg), \
+                                s->pending - (beg)); \
+    } while (0)
+
+/* =========================================================================
+ * Main compression entry point.
+ *
+ * In order, this function: validates the stream, the flush value and the
+ * buffer pointers; flushes output left pending by an earlier call; writes
+ * the zlib or gzip header if s->status says it is still due; runs the
+ * block compressor selected by s->level and s->strategy; completes the
+ * requested flush; and, for Z_FINISH on a wrapped stream, writes the
+ * trailer.
+ *
+ * Result codes appearing below: Z_STREAM_ERROR for an invalid stream,
+ * flush value or buffer pointer, Z_BUF_ERROR when the call cannot make
+ * progress, Z_OK when further calls are expected, and Z_STREAM_END once
+ * everything has been written after Z_FINISH.
+ */
+int ZEXPORT deflate (strm, flush)
+    z_streamp strm;
+    int flush;
+{
+    int old_flush; /* value of flush param for previous deflate call */
+    deflate_state *s;
+
+    if (deflateStateCheck(strm) || flush > Z_BLOCK || flush < 0) {
+        return Z_STREAM_ERROR;
+    }
+    s = strm->state;
+
+    if (strm->next_out == Z_NULL ||
+        (strm->avail_in != 0 && strm->next_in == Z_NULL) ||
+        (s->status == FINISH_STATE && flush != Z_FINISH)) {
+        ERR_RETURN(strm, Z_STREAM_ERROR);
+    }
+    if (strm->avail_out == 0) ERR_RETURN(strm, Z_BUF_ERROR);
+
+    old_flush = s->last_flush;
+    s->last_flush = flush;
+
+    /* Flush as much pending output as possible */
+    if (s->pending != 0) {
+        flush_pending(strm);
+        if (strm->avail_out == 0) {
+            /* Since avail_out is 0, deflate will be called again with
+             * more output space, but possibly with both pending and
+             * avail_in equal to zero. There won't be anything to do,
+             * but this is not an error situation so make sure we
+             * return OK instead of BUF_ERROR at next call of deflate:
+             */
+            s->last_flush = -1;
+            return Z_OK;
+        }
+
+    /* Make sure there is something to do and avoid duplicate consecutive
+     * flushes. For repeated and useless calls with Z_FINISH, we keep
+     * returning Z_STREAM_END instead of Z_BUF_ERROR.
+     */
+    } else if (strm->avail_in == 0 && RANK(flush) <= RANK(old_flush) &&
+               flush != Z_FINISH) {
+        ERR_RETURN(strm, Z_BUF_ERROR);
+    }
+
+    /* User must not provide more input after the first FINISH: */
+    if (s->status == FINISH_STATE && strm->avail_in != 0) {
+        ERR_RETURN(strm, Z_BUF_ERROR);
+    }
+
+    /* Write the header */
+    if (s->status == INIT_STATE) {
+        /* zlib header */
+        uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8;
+        uInt level_flags;
+
+        if (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2)
+            level_flags = 0;
+        else if (s->level < 6)
+            level_flags = 1;
+        else if (s->level == 6)
+            level_flags = 2;
+        else
+            level_flags = 3;
+        header |= (level_flags << 6);
+        if (s->strstart != 0) header |= PRESET_DICT;
+        header += 31 - (header % 31); /* make the header a multiple of 31 */
+
+        putShortMSB(s, header);
+
+        /* Save the adler32 of the preset dictionary: */
+        if (s->strstart != 0) {
+            putShortMSB(s, (uInt)(strm->adler >> 16));
+            putShortMSB(s, (uInt)(strm->adler & 0xffff));
+        }
+        strm->adler = adler32(0L, Z_NULL, 0);
+        s->status = BUSY_STATE;
+
+        /* Compression must start with an empty pending buffer */
+        flush_pending(strm);
+        if (s->pending != 0) {
+            s->last_flush = -1;
+            return Z_OK;
+        }
+    }
+#ifdef GZIP
+    if (s->status == GZIP_STATE) {
+        /* gzip header */
+        strm->adler = crc32(0L, Z_NULL, 0);
+        put_byte(s, 31);  /* ID1 (RFC 1952) */
+        put_byte(s, 139); /* ID2 (RFC 1952) */
+        put_byte(s, 8);   /* CM: deflate (RFC 1952) */
+        if (s->gzhead == Z_NULL) {
+            /* no user-supplied header: flags and the four time bytes are 0 */
+            put_byte(s, 0);
+            put_byte(s, 0);
+            put_byte(s, 0);
+            put_byte(s, 0);
+            put_byte(s, 0);
+            put_byte(s, s->level == 9 ? 2 :
+                     (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2 ?
+                      4 : 0));
+            put_byte(s, OS_CODE);
+            s->status = BUSY_STATE;
+
+            /* Compression must start with an empty pending buffer */
+            flush_pending(strm);
+            if (s->pending != 0) {
+                s->last_flush = -1;
+                return Z_OK;
+            }
+        }
+        else {
+            put_byte(s, (s->gzhead->text ? 1 : 0) +
+                     (s->gzhead->hcrc ? 2 : 0) +
+                     (s->gzhead->extra == Z_NULL ? 0 : 4) +
+                     (s->gzhead->name == Z_NULL ? 0 : 8) +
+                     (s->gzhead->comment == Z_NULL ? 0 : 16)
+                     );
+            put_byte(s, (Byte)(s->gzhead->time & 0xff));
+            put_byte(s, (Byte)((s->gzhead->time >> 8) & 0xff));
+            put_byte(s, (Byte)((s->gzhead->time >> 16) & 0xff));
+            put_byte(s, (Byte)((s->gzhead->time >> 24) & 0xff));
+            put_byte(s, s->level == 9 ? 2 :
+                     (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2 ?
+                      4 : 0));
+            put_byte(s, s->gzhead->os & 0xff);
+            if (s->gzhead->extra != Z_NULL) {
+                put_byte(s, s->gzhead->extra_len & 0xff);
+                put_byte(s, (s->gzhead->extra_len >> 8) & 0xff);
+            }
+            if (s->gzhead->hcrc)
+                strm->adler = crc32(strm->adler, s->pending_buf,
+                                    s->pending);
+            s->gzindex = 0;
+            s->status = EXTRA_STATE;
+        }
+    }
+    if (s->status == EXTRA_STATE) {
+        if (s->gzhead->extra != Z_NULL) {
+            ulg beg = s->pending;   /* start of bytes to update crc */
+            uInt left = (s->gzhead->extra_len & 0xffff) - s->gzindex;
+            /* copy the extra field in pieces, as pending_buf fills up;
+             * s->gzindex remembers how far we got across calls
+             */
+            while (s->pending + left > s->pending_buf_size) {
+                uInt copy = s->pending_buf_size - s->pending;
+                zmemcpy(s->pending_buf + s->pending,
+                        s->gzhead->extra + s->gzindex, copy);
+                s->pending = s->pending_buf_size;
+                HCRC_UPDATE(beg);
+                s->gzindex += copy;
+                flush_pending(strm);
+                if (s->pending != 0) {
+                    s->last_flush = -1;
+                    return Z_OK;
+                }
+                beg = 0;
+                left -= copy;
+            }
+            zmemcpy(s->pending_buf + s->pending,
+                    s->gzhead->extra + s->gzindex, left);
+            s->pending += left;
+            HCRC_UPDATE(beg);
+            s->gzindex = 0;
+        }
+        s->status = NAME_STATE;
+    }
+    if (s->status == NAME_STATE) {
+        if (s->gzhead->name != Z_NULL) {
+            ulg beg = s->pending;   /* start of bytes to update crc */
+            int val;
+            /* copy the name including its terminating zero byte */
+            do {
+                if (s->pending == s->pending_buf_size) {
+                    HCRC_UPDATE(beg);
+                    flush_pending(strm);
+                    if (s->pending != 0) {
+                        s->last_flush = -1;
+                        return Z_OK;
+                    }
+                    beg = 0;
+                }
+                val = s->gzhead->name[s->gzindex++];
+                put_byte(s, val);
+            } while (val != 0);
+            HCRC_UPDATE(beg);
+            s->gzindex = 0;
+        }
+        s->status = COMMENT_STATE;
+    }
+    if (s->status == COMMENT_STATE) {
+        if (s->gzhead->comment != Z_NULL) {
+            ulg beg = s->pending;   /* start of bytes to update crc */
+            int val;
+            /* copy the comment including its terminating zero byte */
+            do {
+                if (s->pending == s->pending_buf_size) {
+                    HCRC_UPDATE(beg);
+                    flush_pending(strm);
+                    if (s->pending != 0) {
+                        s->last_flush = -1;
+                        return Z_OK;
+                    }
+                    beg = 0;
+                }
+                val = s->gzhead->comment[s->gzindex++];
+                put_byte(s, val);
+            } while (val != 0);
+            HCRC_UPDATE(beg);
+        }
+        s->status = HCRC_STATE;
+    }
+    if (s->status == HCRC_STATE) {
+        if (s->gzhead->hcrc) {
+            if (s->pending + 2 > s->pending_buf_size) {
+                flush_pending(strm);
+                if (s->pending != 0) {
+                    s->last_flush = -1;
+                    return Z_OK;
+                }
+            }
+            /* low 16 bits of the header CRC, least significant byte first */
+            put_byte(s, (Byte)(strm->adler & 0xff));
+            put_byte(s, (Byte)((strm->adler >> 8) & 0xff));
+            strm->adler = crc32(0L, Z_NULL, 0);
+        }
+        s->status = BUSY_STATE;
+
+        /* Compression must start with an empty pending buffer */
+        flush_pending(strm);
+        if (s->pending != 0) {
+            s->last_flush = -1;
+            return Z_OK;
+        }
+    }
+#endif
+
+    /* Start a new block or continue the current one.
+     */
+    if (strm->avail_in != 0 || s->lookahead != 0 ||
+        (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) {
+        block_state bstate;
+
+        /* level 0 and the two special strategies have dedicated
+         * compressors; everything else goes through configuration_table
+         */
+        bstate = s->level == 0 ? deflate_stored(s, flush) :
+                 s->strategy == Z_HUFFMAN_ONLY ? deflate_huff(s, flush) :
+                 s->strategy == Z_RLE ? deflate_rle(s, flush) :
+                 (*(configuration_table[s->level].func))(s, flush);
+
+        if (bstate == finish_started || bstate == finish_done) {
+            s->status = FINISH_STATE;
+        }
+        if (bstate == need_more || bstate == finish_started) {
+            if (strm->avail_out == 0) {
+                s->last_flush = -1; /* avoid BUF_ERROR next call, see above */
+            }
+            return Z_OK;
+            /* If flush != Z_NO_FLUSH && avail_out == 0, the next call
+             * of deflate should use the same flush parameter to make sure
+             * that the flush is complete. So we don't have to output an
+             * empty block here, this will be done at next call. This also
+             * ensures that for a very small output buffer, we emit at most
+             * one empty block.
+             */
+        }
+        if (bstate == block_done) {
+            if (flush == Z_PARTIAL_FLUSH) {
+                _tr_align(s);
+            } else if (flush != Z_BLOCK) { /* FULL_FLUSH or SYNC_FLUSH */
+                _tr_stored_block(s, (char*)0, 0L, 0);
+                /* For a full flush, this empty block will be recognized
+                 * as a special marker by inflate_sync().
+                 */
+                if (flush == Z_FULL_FLUSH) {
+                    CLEAR_HASH(s);             /* forget history */
+                    if (s->lookahead == 0) {
+                        s->strstart = 0;
+                        s->block_start = 0L;
+                        s->insert = 0;
+                    }
+                }
+            }
+            flush_pending(strm);
+            if (strm->avail_out == 0) {
+              s->last_flush = -1; /* avoid BUF_ERROR at next call, see above */
+              return Z_OK;
+            }
+        }
+    }
+
+    if (flush != Z_FINISH) return Z_OK;
+    if (s->wrap <= 0) return Z_STREAM_END; /* no wrapper, or trailer done */
+
+    /* Write the trailer */
+#ifdef GZIP
+    if (s->wrap == 2) {
+        /* check value, then total_in, each as 4 bytes, low byte first */
+        put_byte(s, (Byte)(strm->adler & 0xff));
+        put_byte(s, (Byte)((strm->adler >> 8) & 0xff));
+        put_byte(s, (Byte)((strm->adler >> 16) & 0xff));
+        put_byte(s, (Byte)((strm->adler >> 24) & 0xff));
+        put_byte(s, (Byte)(strm->total_in & 0xff));
+        put_byte(s, (Byte)((strm->total_in >> 8) & 0xff));
+        put_byte(s, (Byte)((strm->total_in >> 16) & 0xff));
+        put_byte(s, (Byte)((strm->total_in >> 24) & 0xff));
+    }
+    else
+#endif
+    {
+        putShortMSB(s, (uInt)(strm->adler >> 16));
+        putShortMSB(s, (uInt)(strm->adler & 0xffff));
+    }
+    flush_pending(strm);
+    /* If avail_out is zero, the application will call deflate again
+     * to flush the rest.
+     */
+    if (s->wrap > 0) s->wrap = -s->wrap; /* write the trailer only once! */
+    return s->pending != 0 ? Z_OK : Z_STREAM_END;
+}
+
+/* =========================================================================
+ * Release every buffer owned by the compressor and then the state itself,
+ * leaving strm->state set to Z_NULL.
+ *
+ * Returns Z_STREAM_ERROR if deflateStateCheck() rejects strm, Z_DATA_ERROR
+ * if the stream was still in BUSY_STATE when it was freed, Z_OK otherwise.
+ */
+int ZEXPORT deflateEnd (strm)
+    z_streamp strm;
+{
+    int was_busy;
+
+    if (deflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+
+    /* remember the status now; the state is gone by the time we return */
+    was_busy = (strm->state->status == BUSY_STATE);
+
+    /* Deallocate in reverse order of allocations: */
+    TRY_FREE(strm, strm->state->pending_buf);
+    TRY_FREE(strm, strm->state->head);
+    TRY_FREE(strm, strm->state->prev);
+    TRY_FREE(strm, strm->state->window);
+
+    ZFREE(strm, strm->state);
+    strm->state = Z_NULL;
+
+    if (was_busy)
+        return Z_DATA_ERROR;
+    return Z_OK;
+}
+
+/* =========================================================================
+ * Duplicate a compression stream: dest receives a copy of source's stream
+ * structure plus its own freshly allocated state and buffers holding the
+ * same contents.
+ * To keep the source simple this is not supported for 16-bit MSDOS (which
+ * doesn't have enough memory anyway to duplicate compression states); there
+ * the function just returns Z_STREAM_ERROR.
+ *
+ * Returns Z_STREAM_ERROR if source fails deflateStateCheck() or dest is
+ * Z_NULL, Z_MEM_ERROR if an allocation fails, Z_OK otherwise.
+ */
+int ZEXPORT deflateCopy (dest, source)
+    z_streamp dest;
+    z_streamp source;
+{
+#ifdef MAXSEG_64K
+    return Z_STREAM_ERROR;
+#else
+    deflate_state *dst;         /* state being built for dest */
+    deflate_state *src;         /* state being copied */
+    ushf *overlay;
+
+    if (deflateStateCheck(source) || dest == Z_NULL)
+        return Z_STREAM_ERROR;
+
+    src = source->state;
+
+    /* start from a byte copy of the stream structure ... */
+    zmemcpy((voidpf)dest, (voidpf)source, sizeof(z_stream));
+
+    /* ... and of the state, then point the new state back at dest */
+    dst = (deflate_state *) ZALLOC(dest, 1, sizeof(deflate_state));
+    if (dst == Z_NULL)
+        return Z_MEM_ERROR;
+    dest->state = (struct internal_state FAR *) dst;
+    zmemcpy((voidpf)dst, (voidpf)src, sizeof(deflate_state));
+    dst->strm = dest;
+
+    /* give the copy its own buffers */
+    dst->window = (Bytef *) ZALLOC(dest, dst->w_size, 2*sizeof(Byte));
+    dst->prev   = (Posf *)  ZALLOC(dest, dst->w_size, sizeof(Pos));
+    dst->head   = (Posf *)  ZALLOC(dest, dst->hash_size, sizeof(Pos));
+    overlay     = (ushf *)  ZALLOC(dest, dst->lit_bufsize, sizeof(ush)+2);
+    dst->pending_buf = (uchf *) overlay;
+
+    if (dst->window == Z_NULL || dst->prev == Z_NULL ||
+        dst->head == Z_NULL || dst->pending_buf == Z_NULL) {
+        deflateEnd (dest);
+        return Z_MEM_ERROR;
+    }
+
+    /* following zmemcpy do not work for 16-bit MSDOS */
+    zmemcpy(dst->window, src->window, dst->w_size * 2 * sizeof(Byte));
+    zmemcpy((voidpf)dst->prev, (voidpf)src->prev, dst->w_size * sizeof(Pos));
+    zmemcpy((voidpf)dst->head, (voidpf)src->head,
+            dst->hash_size * sizeof(Pos));
+    zmemcpy(dst->pending_buf, src->pending_buf, (uInt)dst->pending_buf_size);
+
+    /* re-derive every pointer that pointed into source's buffers */
+    dst->pending_out = dst->pending_buf + (src->pending_out - src->pending_buf);
+    dst->d_buf = overlay + dst->lit_bufsize/sizeof(ush);
+    dst->l_buf = dst->pending_buf + (1+sizeof(ush))*dst->lit_bufsize;
+
+    dst->l_desc.dyn_tree  = dst->dyn_ltree;
+    dst->d_desc.dyn_tree  = dst->dyn_dtree;
+    dst->bl_desc.dyn_tree = dst->bl_tree;
+
+    return Z_OK;
+#endif /* MAXSEG_64K */
+}
+
+/* ===========================================================================
+ * Copy up to size bytes from the input stream into buf, update the running
+ * check value (adler32 or crc32, depending on wrap) and the total number of
+ * bytes read, and return how many bytes were copied.  All deflate() input
+ * passes through here, so an application that wants to avoid allocating a
+ * large strm->next_in buffer and copying from it may wish to modify this
+ * function.  (See also flush_pending()).
+ */
+local unsigned read_buf(strm, buf, size)
+    z_streamp strm;
+    Bytef *buf;
+    unsigned size;
+{
+    unsigned count = strm->avail_in;
+
+    if (count > size)
+        count = size;
+    if (count == 0)
+        return 0;
+
+    strm->avail_in -= count;
+    zmemcpy(buf, strm->next_in, count);
+
+    /* fold the new bytes into the check value of the wrapper in use */
+    switch (strm->state->wrap) {
+    case 1:
+        strm->adler = adler32(strm->adler, buf, count);
+        break;
+#ifdef GZIP
+    case 2:
+        strm->adler = crc32(strm->adler, buf, count);
+        break;
+#endif
+    default:
+        break;
+    }
+
+    strm->next_in  += count;
+    strm->total_in += count;
+
+    return count;
+}
+
+/* ===========================================================================
+ * Prepare the "longest match" machinery for a new zlib stream: size the
+ * window, empty the hash table, load the tuning parameters of the current
+ * level and rewind all match-finder positions.
+ */
+local void lm_init (s)
+    deflate_state *s;
+{
+    s->window_size = (ulg)2L*s->w_size;
+
+    CLEAR_HASH(s);
+
+    /* tuning parameters come from the table entry for the current level */
+    s->good_match       = configuration_table[s->level].good_length;
+    s->max_lazy_match   = configuration_table[s->level].max_lazy;
+    s->nice_match       = configuration_table[s->level].nice_length;
+    s->max_chain_length = configuration_table[s->level].max_chain;
+
+    /* nothing read, nothing inserted, no match in progress */
+    s->block_start = 0L;
+    s->strstart = 0;
+    s->insert = 0;
+    s->lookahead = 0;
+    s->prev_length = MIN_MATCH-1;
+    s->match_length = MIN_MATCH-1;
+    s->match_available = 0;
+    s->ins_h = 0;
+#if !defined(FASTEST) && defined(ASMV)
+    match_init(); /* initialize the asm code */
+#endif
+}
+
+#ifndef FASTEST
+/* ===========================================================================
+ * Set match_start to the longest match starting at the given string and
+ * return its length. Matches shorter or equal to prev_length are discarded,
+ * in which case the result is equal to prev_length and match_start is
+ * garbage.
+ * IN assertions: cur_match is the head of the hash chain for the current
+ *   string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
+ * OUT assertion: the match length is not greater than s->lookahead.
+ */
+#ifndef ASMV
+/* For 80x86 and 680x0, an optimized version will be provided in match.asm or
+ * match.S. The code will be functionally equivalent.
+ */
+local uInt longest_match(s, cur_match)
+    deflate_state *s;
+    IPos cur_match;                             /* current match */
+{
+    unsigned chain_length = s->max_chain_length;/* max hash chain length */
+    register Bytef *scan = s->window + s->strstart; /* current string */
+    register Bytef *match;                      /* matched string */
+    register int len;                           /* length of current match */
+    int best_len = (int)s->prev_length;         /* best match length so far */
+    int nice_match = s->nice_match;             /* stop if match long enough */
+    IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
+        s->strstart - (IPos)MAX_DIST(s) : NIL;
+    /* Stop when cur_match becomes <= limit. To simplify the code,
+     * we prevent matches with the string of window index 0.
+     */
+    Posf *prev = s->prev;   /* chain links, indexed by position & wmask */
+    uInt wmask = s->w_mask;
+
+#ifdef UNALIGNED_OK
+    /* Compare two bytes at a time. Note: this is not always beneficial.
+     * Try with and without -DUNALIGNED_OK to check.
+     */
+    register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
+    register ush scan_start = *(ushf*)scan;
+    register ush scan_end   = *(ushf*)(scan+best_len-1);
+#else
+    register Bytef *strend = s->window + s->strstart + MAX_MATCH;
+    register Byte scan_end1  = scan[best_len-1];
+    register Byte scan_end   = scan[best_len];
+#endif
+
+    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
+     * It is easy to get rid of this optimization if necessary.
+     */
+    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
+
+    /* Do not waste too much time if we already have a good match: */
+    if (s->prev_length >= s->good_match) {
+        chain_length >>= 2;
+    }
+    /* Do not look for matches beyond the end of the input. This is necessary
+     * to make deflate deterministic.
+     */
+    if ((uInt)nice_match > s->lookahead) nice_match = (int)s->lookahead;
+
+    Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
+
+    /* Walk the hash chain starting at cur_match, keeping the longest
+     * candidate seen so far in best_len / s->match_start.
+     */
+    do {
+        Assert(cur_match < s->strstart, "no future");
+        match = s->window + cur_match;
+
+        /* Skip to next match if the match length cannot increase
+         * or if the match length is less than 2.  Note that the checks below
+         * for insufficient lookahead only occur occasionally for performance
+         * reasons.  Therefore uninitialized memory will be accessed, and
+         * conditional jumps will be made that depend on those values.
+         * However the length of the match is limited to the lookahead, so
+         * the output of deflate is not affected by the uninitialized values.
+         */
+#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
+        /* This code assumes sizeof(unsigned short) == 2. Do not use
+         * UNALIGNED_OK if your compiler uses a different size.
+         */
+        if (*(ushf*)(match+best_len-1) != scan_end ||
+            *(ushf*)match != scan_start) continue;
+
+        /* It is not necessary to compare scan[2] and match[2] since they are
+         * always equal when the other bytes match, given that the hash keys
+         * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
+         * strstart+3, +5, ... up to strstart+257. We check for insufficient
+         * lookahead only every 4th comparison; the 128th check will be made
+         * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
+         * necessary to put more guard bytes at the end of the window, or
+         * to check more often for insufficient lookahead.
+         */
+        Assert(scan[2] == match[2], "scan[2]?");
+        scan++, match++;
+        do {
+        } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+                 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+                 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+                 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+                 scan < strend);
+        /* The funny "do {}" generates better code on most compilers */
+
+        /* Here, scan <= window+strstart+257 */
+        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+        if (*scan == *match) scan++;
+
+        len = (MAX_MATCH - 1) - (int)(strend-scan);
+        scan = strend - (MAX_MATCH-1); /* rewind scan to the string start */
+
+#else /* UNALIGNED_OK */
+
+        if (match[best_len]   != scan_end  ||
+            match[best_len-1] != scan_end1 ||
+            *match            != *scan     ||
+            *++match          != scan[1])      continue;
+
+        /* The check at best_len-1 can be removed because it will be made
+         * again later. (This heuristic is not always a win.)
+         * It is not necessary to compare scan[2] and match[2] since they
+         * are always equal when the other bytes match, given that
+         * the hash keys are equal and that HASH_BITS >= 8.
+         */
+        scan += 2, match++;
+        Assert(*scan == *match, "match[2]?");
+
+        /* We check for insufficient lookahead only every 8th comparison;
+         * the 256th check will be made at strstart+258.
+         */
+        do {
+        } while (*++scan == *++match && *++scan == *++match &&
+                 *++scan == *++match && *++scan == *++match &&
+                 *++scan == *++match && *++scan == *++match &&
+                 *++scan == *++match && *++scan == *++match &&
+                 scan < strend);
+
+        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+
+        len = MAX_MATCH - (int)(strend - scan);
+        scan = strend - MAX_MATCH; /* rewind scan to the string start */
+
+#endif /* UNALIGNED_OK */
+
+        if (len > best_len) {
+            s->match_start = cur_match;
+            best_len = len;
+            if (len >= nice_match) break;
+            /* refresh the bytes used for the quick rejection test above */
+#ifdef UNALIGNED_OK
+            scan_end = *(ushf*)(scan+best_len-1);
+#else
+            scan_end1  = scan[best_len-1];
+            scan_end   = scan[best_len];
+#endif
+        }
+    } while ((cur_match = prev[cur_match & wmask]) > limit
+             && --chain_length != 0);
+
+    /* never report a match longer than the available lookahead */
+    if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
+    return s->lookahead;
+}
+#endif /* ASMV */
+
+#else /* FASTEST */
+
+/* ---------------------------------------------------------------------------
+ * Optimized version for FASTEST only: a single candidate (cur_match) is
+ * examined instead of a whole hash chain.  Returns the match length, capped
+ * at s->lookahead, and sets s->match_start; returns MIN_MATCH-1 when the
+ * candidate does not give a match of at least MIN_MATCH bytes.
+ */
+local uInt longest_match(s, cur_match)
+    deflate_state *s;
+    IPos cur_match;                             /* current match */
+{
+    register Bytef *cur = s->window + s->strstart;  /* current string */
+    register Bytef *cand;                           /* candidate string */
+    register int matched;                           /* bytes that agree */
+    register Bytef *stop = s->window + s->strstart + MAX_MATCH;
+
+    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
+     * It is easy to get rid of this optimization if necessary.
+     */
+    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
+
+    Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
+
+    Assert(cur_match < s->strstart, "no future");
+
+    cand = s->window + cur_match;
+
+    /* Give up at once unless the first two bytes agree:
+     */
+    if (!(cand[0] == cur[0] && cand[1] == cur[1]))
+        return MIN_MATCH-1;
+
+    /* The third byte needs no comparison: it is always equal when the
+     * other bytes match, given that the hash keys are equal and that
+     * HASH_BITS >= 8.
+     */
+    cur += 2;
+    cand += 2;
+    Assert(*cur == *cand, "match[2]?");
+
+    /* We check for insufficient lookahead only every 8th comparison;
+     * the 256th check will be made at strstart+258.
+     */
+    do {
+    } while (*++cur == *++cand && *++cur == *++cand &&
+             *++cur == *++cand && *++cur == *++cand &&
+             *++cur == *++cand && *++cur == *++cand &&
+             *++cur == *++cand && *++cur == *++cand &&
+             cur < stop);
+
+    Assert(cur <= s->window+(unsigned)(s->window_size-1), "wild scan");
+
+    matched = MAX_MATCH - (int)(stop - cur);
+
+    if (matched < MIN_MATCH)
+        return MIN_MATCH - 1;
+
+    s->match_start = cur_match;
+
+    /* never report a match longer than the available lookahead */
+    if ((uInt)matched > s->lookahead)
+        return s->lookahead;
+    return (uInt)matched;
+}
+
+#endif /* FASTEST */
+
+#ifdef ZLIB_DEBUG
+
+#define EQUAL 0
+/* result of memcmp for equal strings */
+
+/* ===========================================================================
+ * Debug check: the length bytes at window[match] must equal window[start].
+ */
+local void check_match(s, start, match, length)
+    deflate_state *s;
+    IPos start, match;
+    int length;
+{
+    /* on mismatch, dump both strings to stderr and call z_error() */
+    if (zmemcmp(s->window + match,
+                s->window + start, length) != EQUAL) {
+        fprintf(stderr, " start %u, match %u, length %d\n",
+                start, match, length);
+        do {
+            fprintf(stderr, "%c%c", s->window[match++], s->window[start++]);
+        } while (--length != 0);
+        z_error("invalid match");
+    }
+    if (z_verbose > 1) { /* trace distance, length and the matched bytes */
+        fprintf(stderr,"\\[%d,%d]", start-match, length);
+        do { putc(s->window[start++], stderr); } while (--length != 0);
+    }
+}
+#else
+#  define check_match(s, start, match, length) /* no-op without ZLIB_DEBUG */
+#endif /* ZLIB_DEBUG */
+
+/* ===========================================================================
+ * Fill the window when the lookahead becomes insufficient.
+ * Updates strstart and lookahead.
+ *
+ * IN assertion: lookahead < MIN_LOOKAHEAD
+ * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD
+ *    At least one byte has been read, or avail_in == 0; reads are
+ *    performed for at least two bytes (required for the zip translate_eol
+ *    option -- not supported here).
+ */
+local void fill_window(s)
+    deflate_state *s;
+{
+    unsigned n;       /* number of bytes just read by read_buf() */
+    unsigned more;    /* Amount of free space at the end of the window. */
+    uInt wsize = s->w_size;
+
+    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
+
+    do {
+        more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
+
+        /* Deal with the 64K limit of 16-bit ints (sizeof(int) <= 2): */
+        if (sizeof(int) <= 2) {
+            if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
+                more = wsize;
+
+            } else if (more == (unsigned)(-1)) {
+                /* Very unlikely, but possible on 16 bit machine if
+                 * strstart == 0 && lookahead == 1 (input done a byte at time)
+                 */
+                more--;
+            }
+        }
+
+        /* If the window is almost full and there is insufficient lookahead,
+         * move the upper half to the lower one to make room in the upper half.
+         */
+        if (s->strstart >= wsize+MAX_DIST(s)) {
+
+            zmemcpy(s->window, s->window+wsize, (unsigned)wsize - more);
+            s->match_start -= wsize;
+            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
+            s->block_start -= (long) wsize;
+            slide_hash(s);
+            more += wsize;
+        }
+        if (s->strm->avail_in == 0) break; /* no input left to read */
+
+        /* If there was no sliding:
+         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
+         *    more == window_size - lookahead - strstart
+         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
+         * => more >= window_size - 2*WSIZE + 2
+         * In the BIG_MEM or MMAP case (not yet supported),
+         *   window_size == input_size + MIN_LOOKAHEAD  &&
+         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
+         * Otherwise, window_size == 2*WSIZE so more >= 2.
+         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
+         */
+        Assert(more >= 2, "more < 2");
+
+        n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
+        s->lookahead += n;
+
+        /* Initialize the hash value now that we have some input: */
+        if (s->lookahead + s->insert >= MIN_MATCH) {
+            uInt str = s->strstart - s->insert;
+            s->ins_h = s->window[str];
+            UPDATE_HASH(s, s->ins_h, s->window[str + 1]);
+#if MIN_MATCH != 3
+            Call UPDATE_HASH() MIN_MATCH-3 more times
+#endif
+            while (s->insert) {
+                UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
+#ifndef FASTEST
+                s->prev[str & s->w_mask] = s->head[s->ins_h];
+#endif
+                s->head[s->ins_h] = (Pos)str;
+                str++;
+                s->insert--;
+                if (s->lookahead + s->insert < MIN_MATCH)
+                    break;
+            }
+        }
+        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
+         * but this is not important since only literal bytes will be emitted.
+         */
+
+    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
+
+    /* If the WIN_INIT bytes after the end of the current data have never been
+     * written, then zero those bytes in order to avoid memory check reports of
+     * the use of uninitialized (or uninitialised as Julian writes) bytes by
+     * the longest match routines.  Update the high water mark for the next
+     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
+     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
+     */
+    if (s->high_water < s->window_size) {
+        ulg curr = s->strstart + (ulg)(s->lookahead);
+        ulg init;
+
+        if (s->high_water < curr) {
+            /* Previous high water mark below current data -- zero WIN_INIT
+             * bytes or up to end of window, whichever is less.
+             */
+            init = s->window_size - curr;
+            if (init > WIN_INIT)
+                init = WIN_INIT;
+            zmemzero(s->window + curr, (unsigned)init);
+            s->high_water = curr + init;
+        }
+        else if (s->high_water < (ulg)curr + WIN_INIT) {
+            /* High water mark at or above current data, but below current data
+             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
+             * to end of window, whichever is less.
+             */
+            init = (ulg)curr + WIN_INIT - s->high_water;
+            if (init > s->window_size - s->high_water)
+                init = s->window_size - s->high_water;
+            zmemzero(s->window + s->high_water, (unsigned)init);
+            s->high_water += init;
+        }
+    }
+
+    Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
+           "not enough room for search");
+}
+
+/* ===========================================================================
+ * Flush the current block, with given end-of-file flag.
+ * IN assertion: strstart is set to the end of the current match.
+ */
+#define FLUSH_BLOCK_ONLY(s, last) { \
+   _tr_flush_block(s, (s->block_start >= 0L ? \
+                   (charf *)&s->window[(unsigned)s->block_start] : \
+                   (charf *)Z_NULL), \
+                (ulg)((long)s->strstart - s->block_start), \
+                (last)); \
+   s->block_start = s->strstart; \
+   flush_pending(s->strm); \
+   Tracev((stderr,"[FLUSH]")); \
+}
+
+/* Same, but return from the calling function if avail_out is zero. */
+#define FLUSH_BLOCK(s, last) { \
+   FLUSH_BLOCK_ONLY(s, last); \
+   if (s->strm->avail_out == 0) return (last) ? finish_started : need_more; \
+}
+
+/* Maximum stored block length in deflate format (not including header). */
+#define MAX_STORED 65535
+
+/* Minimum of a and b (each argument may be evaluated twice). */
+#define MIN(a, b) ((a) > (b) ? (b) : (a))
+
+/* ===========================================================================
+ * Copy without compression as much as possible from the input stream, return
+ * the current block state.
+ *
+ * In case deflateParams() is used to later switch to a non-zero compression
+ * level, s->matches (otherwise unused when storing) keeps track of the number
+ * of hash table slides to perform. If s->matches is 1, then one hash table
+ * slide will be done when switching. If s->matches is 2, the maximum value
+ * allowed here, then the hash table will be cleared, since two or more slides
+ * is the same as a clear.
+ *
+ * deflate_stored() is written to minimize the number of times an input byte is
+ * copied. It is most efficient with large input and output buffers, which
+ * maximizes the opportunities to have a single copy from next_in to next_out.
+ */
+local block_state deflate_stored(s, flush)
+    deflate_state *s;
+    int flush;
+{
+    /* Smallest worthy block size when not flushing or finishing. By default
+     * this is 32K. This can be as small as 507 bytes for memLevel == 1. For
+     * large input and output buffers, the stored block size will be larger.
+     */
+    unsigned min_block = MIN(s->pending_buf_size - 5, s->w_size);
+    /* NOTE(review): later zlib releases reportedly patched this function; verify. */
+    /* Copy as many min_block or larger stored blocks directly to next_out as
+     * possible. If flushing, copy the remaining available input to next_out as
+     * stored blocks, if there is enough space.
+     */
+    unsigned len, left, have, last = 0;
+    unsigned used = s->strm->avail_in;      /* avail_in on entry */
+    do {
+        /* Set len to the maximum size block that we can copy directly with the
+         * available input data and output space. Set left to how much of that
+         * would be copied from what's left in the window.
+         */
+        len = MAX_STORED;       /* maximum deflate stored block length */
+        have = (s->bi_valid + 42) >> 3;         /* number of header bytes */
+        if (s->strm->avail_out < have)          /* need room for header */
+            break;
+        /* maximum stored block length that will fit in avail_out: */
+        have = s->strm->avail_out - have;
+        left = s->strstart - s->block_start;    /* bytes left in window */
+        if (len > (ulg)left + s->strm->avail_in)
+            len = left + s->strm->avail_in;     /* limit len to the input */
+        if (len > have)
+            len = have;                         /* limit len to the output */
+
+        /* If the stored block would be less than min_block in length, or if
+         * unable to copy all of the available input when flushing, then try
+         * copying to the window and the pending buffer instead. Also don't
+         * write an empty block when flushing -- deflate() does that.
+         */
+        if (len < min_block && ((len == 0 && flush != Z_FINISH) ||
+                                flush == Z_NO_FLUSH ||
+                                len != left + s->strm->avail_in))
+            break;
+
+        /* Make a dummy stored block in pending to get the header bytes,
+         * including any pending bits. This also updates the debugging counts.
+         */
+        last = flush == Z_FINISH && len == left + s->strm->avail_in ? 1 : 0;
+        _tr_stored_block(s, (char *)0, 0L, last);
+
+        /* Replace the lengths in the dummy stored block with len. */
+        s->pending_buf[s->pending - 4] = len;
+        s->pending_buf[s->pending - 3] = len >> 8;
+        s->pending_buf[s->pending - 2] = ~len;
+        s->pending_buf[s->pending - 1] = ~len >> 8;
+
+        /* Write the stored block header bytes. */
+        flush_pending(s->strm);
+
+#ifdef ZLIB_DEBUG
+        /* Update debugging counts for the data about to be copied. */
+        s->compressed_len += len << 3;
+        s->bits_sent += len << 3;
+#endif
+
+        /* Copy uncompressed bytes from the window to next_out. */
+        if (left) {
+            if (left > len)
+                left = len;
+            zmemcpy(s->strm->next_out, s->window + s->block_start, left);
+            s->strm->next_out += left;
+            s->strm->avail_out -= left;
+            s->strm->total_out += left;
+            s->block_start += left;
+            len -= left;
+        }
+
+        /* Copy uncompressed bytes directly from next_in to next_out, updating
+         * the check value.
+         */
+        if (len) {
+            read_buf(s->strm, s->strm->next_out, len);
+            s->strm->next_out += len;
+            s->strm->avail_out -= len;
+            s->strm->total_out += len;
+        }
+    } while (last == 0);
+
+    /* Update the sliding window with the last s->w_size bytes of the copied
+     * data, or append all of the copied data to the existing window if less
+     * than s->w_size bytes were copied. Also update the number of bytes to
+     * insert in the hash tables, in the event that deflateParams() switches to
+     * a non-zero compression level.
+     */
+    used -= s->strm->avail_in;      /* number of input bytes directly copied */
+    if (used) {
+        /* If any input was used, then no unused input remains in the window,
+         * therefore s->block_start == s->strstart.
+         */
+        if (used >= s->w_size) {    /* supplant the previous history */
+            s->matches = 2;         /* clear hash */
+            zmemcpy(s->window, s->strm->next_in - s->w_size, s->w_size);
+            s->strstart = s->w_size;
+        }
+        else {
+            if (s->window_size - s->strstart <= used) {
+                /* Slide the window down. */
+                s->strstart -= s->w_size;
+                zmemcpy(s->window, s->window + s->w_size, s->strstart);
+                if (s->matches < 2)
+                    s->matches++;   /* add a pending slide_hash() */
+            }
+            zmemcpy(s->window + s->strstart, s->strm->next_in - used, used);
+            s->strstart += used;
+        }
+        s->block_start = s->strstart;
+        s->insert += MIN(used, s->w_size - s->insert);
+    }
+    if (s->high_water < s->strstart)
+        s->high_water = s->strstart;
+
+    /* If the last block was written to next_out, then done. */
+    if (last)
+        return finish_done;
+
+    /* If flushing and all input has been consumed, then done. */
+    if (flush != Z_NO_FLUSH && flush != Z_FINISH &&
+        s->strm->avail_in == 0 && (long)s->strstart == s->block_start)
+        return block_done;
+
+    /* Fill the window with any remaining input. */
+    have = s->window_size - s->strstart - 1;
+    if (s->strm->avail_in > have && s->block_start >= (long)s->w_size) {
+        /* Slide the window down. */
+        s->block_start -= s->w_size;
+        s->strstart -= s->w_size;
+        zmemcpy(s->window, s->window + s->w_size, s->strstart);
+        if (s->matches < 2)
+            s->matches++;           /* add a pending slide_hash() */
+        have += s->w_size;          /* more space now */
+    }
+    if (have > s->strm->avail_in)
+        have = s->strm->avail_in;
+    if (have) {
+        read_buf(s->strm, s->window + s->strstart, have);
+        s->strstart += have;
+    }
+    if (s->high_water < s->strstart)
+        s->high_water = s->strstart;
+
+    /* There was not enough avail_out to write a complete worthy or flushed
+     * stored block to next_out. Write a stored block to pending instead, if we
+     * have enough input for a worthy block, or if flushing and there is enough
+     * room for the remaining input as a stored block in the pending buffer.
+     */
+    have = (s->bi_valid + 42) >> 3;         /* number of header bytes */
+    /* maximum stored block length that will fit in pending: */
+    have = MIN(s->pending_buf_size - have, MAX_STORED);
+    min_block = MIN(have, s->w_size);
+    left = s->strstart - s->block_start;
+    if (left >= min_block ||
+        ((left || flush == Z_FINISH) && flush != Z_NO_FLUSH &&
+         s->strm->avail_in == 0 && left <= have)) {
+        len = MIN(left, have);
+        last = flush == Z_FINISH && s->strm->avail_in == 0 &&
+               len == left ? 1 : 0;
+        _tr_stored_block(s, (charf *)s->window + s->block_start, len, last);
+        s->block_start += len;
+        flush_pending(s->strm);
+    }
+
+    /* We've done all we can with the available input and output. */
+    return last ? finish_started : need_more;
+}
+
+/* ===========================================================================
+ * Compress as much as possible from the input stream, return the current
+ * block state.
+ * This function does not perform lazy evaluation of matches and inserts
+ * new strings in the dictionary only for unmatched strings or for short
+ * matches. It is used only for the fast compression options.
+ */
+local block_state deflate_fast(s, flush)
+    deflate_state *s;
+    int flush;
+{
+    IPos hash_head;       /* head of the hash chain */
+    int bflush;           /* set if current block must be flushed */
+
+    for (;;) {
+        /* Make sure that we always have enough lookahead, except
+         * at the end of the input file. We need MAX_MATCH bytes
+         * for the next match, plus MIN_MATCH bytes to insert the
+         * string following the next match.
+         */
+        if (s->lookahead < MIN_LOOKAHEAD) {
+            fill_window(s);
+            if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
+                return need_more;
+            }
+            if (s->lookahead == 0) break; /* flush the current block */
+        }
+
+        /* Insert the string window[strstart .. strstart+2] in the
+         * dictionary, and set hash_head to the head of the hash chain:
+         */
+        hash_head = NIL;
+        if (s->lookahead >= MIN_MATCH) {
+            INSERT_STRING(s, s->strstart, hash_head);
+        }
+
+        /* Find the longest match for the string at strstart.
+         * At this point we have always match_length < MIN_MATCH
+         */
+        if (hash_head != NIL && s->strstart - hash_head <= MAX_DIST(s)) {
+            /* To simplify the code, we prevent matches with the string
+             * of window index 0 (in particular we have to avoid a match
+             * of the string with itself at the start of the input file).
+             */
+            s->match_length = longest_match (s, hash_head);
+            /* longest_match() sets match_start */
+        }
+        if (s->match_length >= MIN_MATCH) {
+            check_match(s, s->strstart, s->match_start, s->match_length);
+
+            _tr_tally_dist(s, s->strstart - s->match_start,
+                           s->match_length - MIN_MATCH, bflush);
+
+            s->lookahead -= s->match_length;
+
+            /* Insert new strings in the hash table only if the match length
+             * is not too large. This saves time but degrades compression.
+             */
+#ifndef FASTEST
+            if (s->match_length <= s->max_insert_length &&
+                s->lookahead >= MIN_MATCH) {
+                s->match_length--; /* string at strstart already in table */
+                do {
+                    s->strstart++;
+                    INSERT_STRING(s, s->strstart, hash_head);
+                    /* strstart never exceeds WSIZE-MAX_MATCH, so there are
+                     * always MIN_MATCH bytes ahead.
+                     */
+                } while (--s->match_length != 0);
+                s->strstart++;
+            } else
+#endif
+            {
+                s->strstart += s->match_length;
+                s->match_length = 0;
+                s->ins_h = s->window[s->strstart];
+                UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
+#if MIN_MATCH != 3
+                Call UPDATE_HASH() MIN_MATCH-3 more times
+#endif
+                /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not
+                 * matter since it will be recomputed at next deflate call.
+                 */
+            }
+        } else {
+            /* No match, output a literal byte */
+            Tracevv((stderr,"%c", s->window[s->strstart]));
+            _tr_tally_lit (s, s->window[s->strstart], bflush);
+            s->lookahead--;
+            s->strstart++;
+        }
+        if (bflush) FLUSH_BLOCK(s, 0);
+    }
+    s->insert = s->strstart < MIN_MATCH-1 ? s->strstart : MIN_MATCH-1;
+    if (flush == Z_FINISH) {
+        FLUSH_BLOCK(s, 1);
+        return finish_done;
+    }
+    if (s->last_lit)
+        FLUSH_BLOCK(s, 0);
+    return block_done;
+}
+
+#ifndef FASTEST
+/* ===========================================================================
+ * Same as deflate_fast(), but achieves better compression. We use a lazy
+ * evaluation for matches: a match is finally adopted only if there is
+ * no better match at the next window position.
+ */
+local block_state deflate_slow(s, flush)
+    deflate_state *s;
+    int flush;
+{
+    IPos hash_head;          /* head of hash chain */
+    int bflush;              /* set if current block must be flushed */
+
+    /* Process the input block. */
+    for (;;) {
+        /* Make sure that we always have enough lookahead, except
+         * at the end of the input file. We need MAX_MATCH bytes
+         * for the next match, plus MIN_MATCH bytes to insert the
+         * string following the next match.
+         */
+        if (s->lookahead < MIN_LOOKAHEAD) {
+            fill_window(s);
+            if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
+                return need_more;
+            }
+            if (s->lookahead == 0) break; /* flush the current block */
+        }
+
+        /* Insert the string window[strstart .. strstart+2] in the
+         * dictionary, and set hash_head to the head of the hash chain:
+         */
+        hash_head = NIL;
+        if (s->lookahead >= MIN_MATCH) {
+            INSERT_STRING(s, s->strstart, hash_head);
+        }
+
+        /* Find the longest match, discarding those <= prev_length.
+         */
+        s->prev_length = s->match_length, s->prev_match = s->match_start;
+        s->match_length = MIN_MATCH-1;
+
+        if (hash_head != NIL && s->prev_length < s->max_lazy_match &&
+            s->strstart - hash_head <= MAX_DIST(s)) {
+            /* To simplify the code, we prevent matches with the string
+             * of window index 0 (in particular we have to avoid a match
+             * of the string with itself at the start of the input file).
+             */
+            s->match_length = longest_match (s, hash_head);
+            /* longest_match() sets match_start */
+
+            if (s->match_length <= 5 && (s->strategy == Z_FILTERED
+#if TOO_FAR <= 32767
+                || (s->match_length == MIN_MATCH &&
+                    s->strstart - s->match_start > TOO_FAR)
+#endif
+                )) {
+
+                /* If prev_match is also MIN_MATCH, match_start is garbage
+                 * but we will ignore the current match anyway.
+                 */
+                s->match_length = MIN_MATCH-1;
+            }
+        }
+        /* If there was a match at the previous step and the current
+         * match is not better, output the previous match:
+         */
+        if (s->prev_length >= MIN_MATCH && s->match_length <= s->prev_length) {
+            uInt max_insert = s->strstart + s->lookahead - MIN_MATCH;
+            /* Do not insert strings in hash table beyond this. */
+
+            check_match(s, s->strstart-1, s->prev_match, s->prev_length);
+
+            _tr_tally_dist(s, s->strstart -1 - s->prev_match,
+                           s->prev_length - MIN_MATCH, bflush);
+
+            /* Insert in hash table all strings up to the end of the match.
+             * strstart-1 and strstart are already inserted. If there is not
+             * enough lookahead, the last two strings are not inserted in
+             * the hash table.
+             */
+            s->lookahead -= s->prev_length-1;
+            s->prev_length -= 2;
+            do {
+                if (++s->strstart <= max_insert) {
+                    INSERT_STRING(s, s->strstart, hash_head);
+                }
+            } while (--s->prev_length != 0);
+            s->match_available = 0;
+            s->match_length = MIN_MATCH-1;
+            s->strstart++;
+
+            if (bflush) FLUSH_BLOCK(s, 0);
+
+        } else if (s->match_available) {
+            /* If there was no match at the previous position, output a
+             * single literal. If there was a match but the current match
+             * is longer, truncate the previous match to a single literal.
+             */
+            Tracevv((stderr,"%c", s->window[s->strstart-1]));
+            _tr_tally_lit(s, s->window[s->strstart-1], bflush);
+            if (bflush) {
+                FLUSH_BLOCK_ONLY(s, 0);
+            }
+            s->strstart++;
+            s->lookahead--;
+            /* checked only after advancing strstart/lookahead above */
+            if (s->strm->avail_out == 0) return need_more;
+        } else {
+            /* There is no previous match to compare with, wait for
+             * the next step to decide.
+             */
+            s->match_available = 1;
+            s->strstart++;
+            s->lookahead--;
+        }
+    }
+    Assert (flush != Z_NO_FLUSH, "no flush?");
+    if (s->match_available) {
+        Tracevv((stderr,"%c", s->window[s->strstart-1]));
+        _tr_tally_lit(s, s->window[s->strstart-1], bflush);
+        s->match_available = 0;
+    }
+    s->insert = s->strstart < MIN_MATCH-1 ? s->strstart : MIN_MATCH-1;
+    if (flush == Z_FINISH) {
+        FLUSH_BLOCK(s, 1);
+        return finish_done;
+    }
+    if (s->last_lit)
+        FLUSH_BLOCK(s, 0);
+    return block_done;
+}
+#endif /* FASTEST */
+
+/* ===========================================================================
+ * For Z_RLE, simply look for runs of bytes, generate matches only of distance
+ * one.  Do not maintain a hash table.  (It will be regenerated if this run of
+ * deflate switches away from Z_RLE.)
+ */
+local block_state deflate_rle(s, flush)
+    deflate_state *s;
+    int flush;
+{
+    int bflush;             /* set if current block must be flushed */
+    uInt prev;              /* byte at distance one to match */
+    Bytef *scan, *strend;   /* scan goes up to strend for length of run */
+
+    for (;;) {
+        /* Make sure that we always have enough lookahead, except
+         * at the end of the input file. We need MAX_MATCH bytes
+         * for the longest run, plus one for the unrolled loop.
+         */
+        if (s->lookahead <= MAX_MATCH) {
+            fill_window(s);
+            if (s->lookahead <= MAX_MATCH && flush == Z_NO_FLUSH) {
+                return need_more;
+            }
+            if (s->lookahead == 0) break; /* flush the current block */
+        }
+
+        /* See how many times the previous byte repeats */
+        s->match_length = 0;
+        if (s->lookahead >= MIN_MATCH && s->strstart > 0) {
+            scan = s->window + s->strstart - 1;
+            prev = *scan;
+            if (prev == *++scan && prev == *++scan && prev == *++scan) {
+                strend = s->window + s->strstart + MAX_MATCH;
+                do {
+                } while (prev == *++scan && prev == *++scan &&
+                         prev == *++scan && prev == *++scan &&
+                         prev == *++scan && prev == *++scan &&
+                         prev == *++scan && prev == *++scan &&
+                         scan < strend);
+                s->match_length = MAX_MATCH - (uInt)(strend - scan);
+                if (s->match_length > s->lookahead) /* clamp to lookahead */
+                    s->match_length = s->lookahead;
+            }
+            Assert(scan <= s->window+(uInt)(s->window_size-1), "wild scan");
+        }
+
+        /* Emit match if have run of MIN_MATCH or longer, else emit literal */
+        if (s->match_length >= MIN_MATCH) {
+            check_match(s, s->strstart, s->strstart - 1, s->match_length);
+
+            _tr_tally_dist(s, 1, s->match_length - MIN_MATCH, bflush);
+
+            s->lookahead -= s->match_length;
+            s->strstart += s->match_length;
+            s->match_length = 0;
+        } else {
+            /* No match, output a literal byte */
+            Tracevv((stderr,"%c", s->window[s->strstart]));
+            _tr_tally_lit (s, s->window[s->strstart], bflush);
+            s->lookahead--;
+            s->strstart++;
+        }
+        if (bflush) FLUSH_BLOCK(s, 0);
+    }
+    s->insert = 0;          /* this function does not maintain a hash table */
+    if (flush == Z_FINISH) {
+        FLUSH_BLOCK(s, 1);
+        return finish_done;
+    }
+    if (s->last_lit)
+        FLUSH_BLOCK(s, 0);
+    return block_done;
+}
+
+/* ===========================================================================
+ * For Z_HUFFMAN_ONLY, do not look for matches.  Do not maintain a hash table.
+ * (It will be regenerated if this run of deflate switches away from Huffman.)
+ */
+local block_state deflate_huff(s, flush)
+    deflate_state *s;
+    int flush;
+{
+    int bflush;             /* set if current block must be flushed */
+
+    for (;;) {
+        /* Make sure that we have a literal to write. */
+        if (s->lookahead == 0) {
+            fill_window(s);
+            if (s->lookahead == 0) {
+                if (flush == Z_NO_FLUSH)
+                    return need_more;
+                break;      /* flush the current block */
+            }
+        }
+
+        /* Output a literal byte */
+        s->match_length = 0;
+        Tracevv((stderr,"%c", s->window[s->strstart]));
+        _tr_tally_lit (s, s->window[s->strstart], bflush);
+        s->lookahead--;
+        s->strstart++;
+        if (bflush) FLUSH_BLOCK(s, 0);
+    }
+    s->insert = 0;          /* this function does not maintain a hash table */
+    if (flush == Z_FINISH) {
+        FLUSH_BLOCK(s, 1);
+        return finish_done;
+    }
+    if (s->last_lit)
+        FLUSH_BLOCK(s, 0);
+    return block_done;
+}
diff --git a/deps/SZ/zlib/deflate.h b/deps/SZ/zlib/deflate.h
new file mode 100644
index 0000000000000000000000000000000000000000..23ecdd312bc06eb41a40dce73358e62dea8772d2
--- /dev/null
+++ b/deps/SZ/zlib/deflate.h
@@ -0,0 +1,349 @@
+/* deflate.h -- internal compression state
+ * Copyright (C) 1995-2016 Jean-loup Gailly
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+/* @(#) $Id$ */
+
+#ifndef DEFLATE_H
+#define DEFLATE_H
+
+#include "zutil.h"
+
+/* define NO_GZIP when compiling if you want to disable gzip header and
+   trailer creation by deflate().  NO_GZIP would be used to avoid linking in
+   the crc code when it is not needed.  For shared libraries, gzip encoding
+   should be left enabled. */
+#ifndef NO_GZIP
+#  define GZIP
+#endif
+
+/* ===========================================================================
+ * Internal compression state.
+ */
+
+#define LENGTH_CODES 29
+/* number of length codes, not counting the special END_BLOCK code */
+
+#define LITERALS  256
+/* number of literal bytes 0..255 */
+
+#define L_CODES (LITERALS+1+LENGTH_CODES)
+/* number of Literal or Length codes, including the END_BLOCK code (286) */
+
+#define D_CODES   30
+/* number of distance codes */
+
+#define BL_CODES  19
+/* number of codes used to transfer the bit lengths */
+
+#define HEAP_SIZE (2*L_CODES+1)
+/* maximum heap size (573); also the element count of dyn_ltree */
+
+#define MAX_BITS 15
+/* No code may exceed MAX_BITS bits */
+
+#define Buf_size 16
+/* size of the bit buffer bi_buf, in bits */
+
+#define INIT_STATE    42    /* zlib header -> BUSY_STATE */
+#ifdef GZIP
+#  define GZIP_STATE  57    /* gzip header -> BUSY_STATE | EXTRA_STATE */
+#endif
+#define EXTRA_STATE   69    /* gzip extra block -> NAME_STATE */
+#define NAME_STATE    73    /* gzip file name -> COMMENT_STATE */
+#define COMMENT_STATE 91    /* gzip comment -> HCRC_STATE */
+#define HCRC_STATE   103    /* gzip header CRC -> BUSY_STATE */
+#define BUSY_STATE   113    /* deflate -> FINISH_STATE */
+#define FINISH_STATE 666    /* stream complete */
+/* Stream status values; each comment reads "what is handled -> next state" */
+
+
+/* Data structure describing a single value and its code string (see the Freq/Code/Dad/Len shorthands below). */
+typedef struct ct_data_s {
+    union {
+        ush  freq;       /* frequency count */
+        ush  code;       /* bit string */
+    } fc;                /* freq and code share the same storage */
+    union {
+        ush  dad;        /* father node in Huffman tree */
+        ush  len;        /* length of bit string */
+    } dl;                /* dad and len share the same storage */
+} FAR ct_data;
+
+#define Freq fc.freq
+#define Code fc.code
+#define Dad  dl.dad
+#define Len  dl.len
+
+typedef struct static_tree_desc_s  static_tree_desc;  /* only declared here; not defined in this header */
+
+typedef struct tree_desc_s {
+    ct_data *dyn_tree;           /* the dynamic tree */
+    int     max_code;            /* largest code with non-zero frequency */
+    const static_tree_desc *stat_desc;  /* the corresponding static tree */
+} FAR tree_desc;
+
+typedef ush Pos;
+typedef Pos FAR Posf;
+typedef unsigned IPos;
+
+/* A Pos is an index in the character window. We use a short type (ush) instead
+ * of int to save space in the various tables. IPos is used only for parameter passing.
+ */
+
+typedef struct internal_state {
+    z_streamp strm;      /* pointer back to this zlib stream */
+    int   status;        /* stream status: one of the *_STATE values */
+    Bytef *pending_buf;  /* output still pending */
+    ulg   pending_buf_size; /* size of pending_buf */
+    Bytef *pending_out;  /* next pending byte to output to the stream */
+    ulg   pending;       /* number of bytes in the pending buffer */
+    int   wrap;          /* bit 0 true for zlib, bit 1 true for gzip */
+    gz_headerp  gzhead;  /* gzip header information to write */
+    ulg   gzindex;       /* where in extra, name, or comment */
+    Byte  method;        /* can only be DEFLATED */
+    int   last_flush;    /* value of flush param for previous deflate call */
+
+                /* used by deflate.c: */
+
+    uInt  w_size;        /* LZ77 window size (32K by default) */
+    uInt  w_bits;        /* log2(w_size)  (8..16) */
+    uInt  w_mask;        /* w_size - 1 */
+
+    Bytef *window;
+    /* Sliding window. Input bytes are read into the second half of the window,
+     * and move to the first half later to keep a dictionary of at least w_size
+     * bytes. With this organization, matches are limited to a distance of
+     * w_size-MAX_MATCH bytes, but this ensures that IO is always
+     * performed with a length multiple of the block size. Also, it limits
+     * the window size to 64K, which is quite useful on MSDOS.
+     * To do: use the user input buffer as sliding window.
+     */
+
+    ulg window_size;
+    /* Actual size of window: 2*w_size, except when the user input buffer
+     * is directly used as sliding window.
+     */
+
+    Posf *prev;
+    /* Link to older string with same hash index. To limit the size of this
+     * array to 64K, this link is maintained only for the last 32K strings.
+     * An index in this array is thus a window index modulo 32K.
+     */
+
+    Posf *head; /* Heads of the hash chains or NIL. */
+
+    uInt  ins_h;          /* hash index of string to be inserted */
+    uInt  hash_size;      /* number of elements in hash table */
+    uInt  hash_bits;      /* log2(hash_size) */
+    uInt  hash_mask;      /* hash_size-1 */
+
+    uInt  hash_shift;
+    /* Number of bits by which ins_h must be shifted at each input
+     * step. It must be such that after MIN_MATCH steps, the oldest
+     * byte no longer takes part in the hash key, that is:
+     *   hash_shift * MIN_MATCH >= hash_bits
+     */
+
+    long block_start;
+    /* Window position at the beginning of the current output block. Gets
+     * negative when the window is moved backwards.
+     */
+
+    uInt match_length;           /* length of best match */
+    IPos prev_match;             /* previous match */
+    int match_available;         /* set if previous match exists */
+    uInt strstart;               /* start of string to insert */
+    uInt match_start;            /* start of matching string */
+    uInt lookahead;              /* number of valid bytes ahead in window */
+
+    uInt prev_length;
+    /* Length of the best match at previous step. Matches not greater than this
+     * are discarded. This is used in the lazy match evaluation.
+     */
+
+    uInt max_chain_length;
+    /* To speed up deflation, hash chains are never searched beyond this
+     * length.  A higher limit improves compression ratio but degrades the
+     * speed.
+     */
+
+    uInt max_lazy_match;
+    /* Attempt to find a better match only when the current match is strictly
+     * smaller than this value. This mechanism is used only for compression
+     * levels >= 4.
+     */
+#   define max_insert_length  max_lazy_match
+    /* Insert new strings in the hash table only if the match length is not
+     * greater than this length. This saves time but degrades compression.
+     * max_insert_length is used only for compression levels <= 3.
+     */
+
+    int level;    /* compression level (1..9) */
+    int strategy; /* favor or force Huffman coding */
+
+    uInt good_match;
+    /* Use a faster search when the previous match is longer than this */
+
+    int nice_match; /* Stop searching when current match exceeds this */
+
+                /* used by trees.c: */
+    /* Didn't use ct_data typedef below to suppress compiler warning */
+    struct ct_data_s dyn_ltree[HEAP_SIZE];   /* literal and length tree */
+    struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
+    struct ct_data_s bl_tree[2*BL_CODES+1];  /* Huffman tree for bit lengths */
+
+    struct tree_desc_s l_desc;               /* desc. for literal tree */
+    struct tree_desc_s d_desc;               /* desc. for distance tree */
+    struct tree_desc_s bl_desc;              /* desc. for bit length tree */
+
+    ush bl_count[MAX_BITS+1];
+    /* number of codes at each bit length for an optimal tree */
+
+    int heap[2*L_CODES+1];      /* heap used to build the Huffman trees */
+    int heap_len;               /* number of elements in the heap */
+    int heap_max;               /* element of largest frequency */
+    /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
+     * The same heap array is used to build all trees.
+     */
+
+    uch depth[2*L_CODES+1];
+    /* Depth of each subtree used as tie breaker for trees of equal frequency
+     */
+
+    uchf *l_buf;          /* buffer for literals or lengths */
+
+    uInt  lit_bufsize;
+    /* Size of match buffer for literals/lengths.  There are 4 reasons for
+     * limiting lit_bufsize to 64K:
+     *   - frequencies can be kept in 16 bit counters
+     *   - if compression is not successful for the first block, all input
+     *     data is still in the window so we can still emit a stored block even
+     *     when input comes from standard input.  (This can also be done for
+     *     all blocks if lit_bufsize is not greater than 32K.)
+     *   - if compression is not successful for a file smaller than 64K, we can
+     *     even emit a stored file instead of a stored block (saving 5 bytes).
+     *     This is applicable only for zip (not gzip or zlib).
+     *   - creating new Huffman trees less frequently may not provide fast
+     *     adaptation to changes in the input data statistics. (Take for
+     *     example a binary file with poorly compressible code followed by
+     *     a highly compressible string table.) Smaller buffer sizes give
+     *     fast adaptation but have of course the overhead of transmitting
+     *     trees more frequently.
+     *   - I can't count above 4
+     */
+
+    uInt last_lit;      /* running index in l_buf */
+
+    ushf *d_buf;
+    /* Buffer for distances. To simplify the code, d_buf and l_buf have
+     * the same number of elements. To use different lengths, an extra flag
+     * array would be necessary.
+     */
+
+    ulg opt_len;        /* bit length of current block with optimal trees */
+    ulg static_len;     /* bit length of current block with static trees */
+    uInt matches;       /* number of string matches in current block */
+    uInt insert;        /* bytes at end of window left to insert */
+
+#ifdef ZLIB_DEBUG
+    ulg compressed_len; /* total bit length of compressed file mod 2^32 */
+    ulg bits_sent;      /* bit length of compressed data sent mod 2^32 */
+#endif
+
+    ush bi_buf;
+    /* Output buffer. bits are inserted starting at the bottom (least
+     * significant bits).
+     */
+    int bi_valid;
+    /* Number of valid bits in bi_buf.  All bits above the last valid bit
+     * are always zero.
+     */
+
+    ulg high_water;
+    /* High water mark offset in window for initialized bytes -- bytes above
+     * this are set to zero in order to avoid memory check warnings when
+     * longest match routines access bytes past the input.  This is then
+     * updated to the new high water mark.
+     */
+
+} FAR deflate_state;
+
+/* Output a byte on the stream: store c at pending_buf[pending], then bump pending.
+ * IN assertion: there is enough room in pending_buf (the macro does no bounds check).
+ */
+#define put_byte(s, c) {s->pending_buf[s->pending++] = (Bytef)(c);}
+
+
+#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
+/* Minimum amount of lookahead, except at the end of the input file.
+ * See deflate.c for comments about the MIN_MATCH+1.
+ */
+
+#define MAX_DIST(s)  ((s)->w_size-MIN_LOOKAHEAD)
+/* In order to simplify the code, particularly on 16-bit machines, match
+ * distances are limited to MAX_DIST instead of WSIZE.
+ */
+
+#define WIN_INIT MAX_MATCH
+/* Number of bytes after end of data in window to initialize in order to avoid
+   memory checker errors from longest match routines */
+
+        /* functions implemented in trees.c */
+void ZLIB_INTERNAL _tr_init OF((deflate_state *s));
+int ZLIB_INTERNAL _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc));
+void ZLIB_INTERNAL _tr_flush_block OF((deflate_state *s, charf *buf,
+                        ulg stored_len, int last));
+void ZLIB_INTERNAL _tr_flush_bits OF((deflate_state *s));
+void ZLIB_INTERNAL _tr_align OF((deflate_state *s));
+void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
+                        ulg stored_len, int last));
+
+#define d_code(dist) \
+   ((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)])
+/* Mapping from a distance to a distance code. dist is the distance - 1 and
+ * must not have side effects (the macro evaluates it twice). _dist_code[256]
+ * and _dist_code[257] are never used.
+ */
+
+#ifndef ZLIB_DEBUG
+/* Inline versions of _tr_tally for speed; flush is set once last_lit equals lit_bufsize-1: */
+
+#if defined(GEN_TREES_H) || !defined(STDC)
+  extern uch ZLIB_INTERNAL _length_code[];
+  extern uch ZLIB_INTERNAL _dist_code[];
+#else /* STDC and not GEN_TREES_H: the tables are declared const */
+  extern const uch ZLIB_INTERNAL _length_code[];
+  extern const uch ZLIB_INTERNAL _dist_code[];
+#endif
+
+# define _tr_tally_lit(s, c, flush) \
+  { uch cc = (c); \
+    s->d_buf[s->last_lit] = 0; \
+    s->l_buf[s->last_lit++] = cc; \
+    s->dyn_ltree[cc].Freq++; \
+    flush = (s->last_lit == s->lit_bufsize-1); \
+   }
+# define _tr_tally_dist(s, distance, length, flush) \
+  { uch len = (uch)(length); \
+    ush dist = (ush)(distance); \
+    s->d_buf[s->last_lit] = dist; \
+    s->l_buf[s->last_lit++] = len; \
+    dist--; \
+    s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \
+    s->dyn_dtree[d_code(dist)].Freq++; \
+    flush = (s->last_lit == s->lit_bufsize-1); \
+  }
+#else /* ZLIB_DEBUG: both macros forward to the _tr_tally function */
+# define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c)
+# define _tr_tally_dist(s, distance, length, flush) \
+              flush = _tr_tally(s, distance, length)
+#endif /* !ZLIB_DEBUG */
+
+#endif /* DEFLATE_H */
diff --git a/deps/SZ/zlib/gzclose.c b/deps/SZ/zlib/gzclose.c
new file mode 100644
index 0000000000000000000000000000000000000000..caeb99a3177f477d622870255a00ac2b72f10cad
--- /dev/null
+++ b/deps/SZ/zlib/gzclose.c
@@ -0,0 +1,25 @@
+/* gzclose.c -- zlib gzclose() function
+ * Copyright (C) 2004, 2010 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "gzguts.h"
+
+/* gzclose() is in a separate file so that it is linked in only if it is used.
+   That way the other gzclose functions can be used instead to avoid linking in
+   unneeded compression or decompression routines. */
+int ZEXPORT gzclose(file)
+    gzFile file;
+{
+#ifndef NO_GZCOMPRESS
+    gz_statep state;
+
+    if (file == NULL)
+        return Z_STREAM_ERROR;          /* no handle was passed in */
+    state = (gz_statep)file;
+
+    return state->mode == GZ_READ ? gzclose_r(file) : gzclose_w(file);  /* pick by open mode */
+#else /* NO_GZCOMPRESS */
+    return gzclose_r(file);             /* file is handed on without a NULL check here */
+#endif /* !NO_GZCOMPRESS */
+}
diff --git a/deps/SZ/zlib/gzguts.h b/deps/SZ/zlib/gzguts.h
new file mode 100644
index 0000000000000000000000000000000000000000..990a4d2514933709883a7d949ed52146675fe2c1
--- /dev/null
+++ b/deps/SZ/zlib/gzguts.h
@@ -0,0 +1,218 @@
+/* gzguts.h -- zlib internal header definitions for gz* operations
+ * Copyright (C) 2004, 2005, 2010, 2011, 2012, 2013, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef _LARGEFILE64_SOURCE
+#  ifndef _LARGEFILE_SOURCE
+#    define _LARGEFILE_SOURCE 1
+#  endif
+#  ifdef _FILE_OFFSET_BITS
+#    undef _FILE_OFFSET_BITS    /* dropped whenever _LARGEFILE64_SOURCE is set */
+#  endif
+#endif /* _LARGEFILE64_SOURCE */
+
+#ifdef HAVE_HIDDEN
+#  define ZLIB_INTERNAL __attribute__((visibility ("hidden")))
+#else
+#  define ZLIB_INTERNAL         /* expands to nothing */
+#endif /* HAVE_HIDDEN */
+
+#include <stdio.h>
+#include "zlib.h"
+#ifdef STDC
+#  include <string.h>
+#  include <stdlib.h>
+#  include <limits.h>
+#endif
+
+#ifndef _POSIX_SOURCE
+#  define _POSIX_SOURCE
+#endif
+#include <fcntl.h>
+
+#ifdef _WIN32
+#  include <stddef.h>
+#endif
+
+#if defined(__TURBOC__) || defined(_MSC_VER) || defined(_WIN32)
+#  include <io.h>
+#endif
+
+#if defined(_WIN32) || defined(__CYGWIN__)  /* WIDECHAR guards the wchar_t path code in gzlib.c */
+#  define WIDECHAR
+#endif
+
+#ifdef WINAPI_FAMILY    /* map the plain i/o names onto the underscore-prefixed ones */
+#  define open _open
+#  define read _read
+#  define write _write
+#  define close _close
+#endif /* WINAPI_FAMILY */
+
+#ifdef NO_DEFLATE       /* for compatibility with old definition */
+#  define NO_GZCOMPRESS
+#endif
+
+#if defined(STDC99) || (defined(__TURBOC__) && __TURBOC__ >= 0x550)
+#  ifndef HAVE_VSNPRINTF
+#    define HAVE_VSNPRINTF
+#  endif
+#endif /* STDC99 || __TURBOC__ >= 0x550 */
+
+#if defined(__CYGWIN__)
+#  ifndef HAVE_VSNPRINTF
+#    define HAVE_VSNPRINTF
+#  endif
+#endif /* __CYGWIN__ */
+
+#if defined(MSDOS) && defined(__BORLANDC__) && (__BORLANDC__ > 0x410)
+#  ifndef HAVE_VSNPRINTF
+#    define HAVE_VSNPRINTF    /* these Borland releases are taken to have vsnprintf */
+#  endif
+#endif /* MSDOS && __BORLANDC__ > 0x410 */
+
+#ifndef HAVE_VSNPRINTF  /* not known to exist: flag the platforms assumed to lack it */
+#  ifdef MSDOS
+/* vsnprintf may exist on some MS-DOS compilers (DJGPP?),
+   but for now we just assume it doesn't. */
+#    define NO_vsnprintf
+#  endif
+#  ifdef __TURBOC__
+#    define NO_vsnprintf
+#  endif
+#  ifdef WIN32
+/* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */
+#    if !defined(vsnprintf) && !defined(NO_vsnprintf)
+#      if !defined(_MSC_VER) || ( defined(_MSC_VER) && _MSC_VER < 1500 )
+#         define vsnprintf _vsnprintf
+#      endif
+#    endif
+#  endif /* WIN32 */
+#  ifdef __SASC
+#    define NO_vsnprintf
+#  endif
+#  ifdef VMS
+#    define NO_vsnprintf
+#  endif
+#  ifdef __OS400__
+#    define NO_vsnprintf
+#  endif
+#  ifdef __MVS__
+#    define NO_vsnprintf
+#  endif
+#endif /* !HAVE_VSNPRINTF */
+
+/* Unlike snprintf (which is required in C99), _snprintf does not guarantee
+   null termination of the result -- however this is only used in gzlib.c where
+   the result is assured to fit in the space provided */
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#  define snprintf _snprintf
+#endif /* _MSC_VER < 1900 */
+
+#ifndef local
+#  define local static
+#endif
+/* since "static" is used to mean two completely different things in C, we
+   define "local" for the non-static (file-private) meaning of "static", for
+   readability (compile with -Dlocal if your debugger can't find static symbols) */
+
+/* gz* functions always use library allocation functions */
+#ifndef STDC
+  extern voidp  malloc OF((uInt size));
+  extern void   free   OF((voidpf ptr));
+#endif /* !STDC */
+
+/* get errno and strerror definition; zstrerror() expands to a message string expression */
+#if defined UNDER_CE
+#  include <windows.h>
+#  define zstrerror() gz_strwinerror((DWORD)GetLastError())
+#else /* not UNDER_CE */
+#  ifndef NO_STRERROR
+#    include <errno.h>
+#    define zstrerror() strerror(errno)
+#  else
+#    define zstrerror() "stdio error (consult errno)"
+#  endif
+#endif /* UNDER_CE */
+
+/* provide prototypes for these when building zlib without LFS */
+#if !defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0
+    ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
+    ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int));
+    ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile));
+    ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile));
+#endif
+
+/* default memLevel: 8, or MAX_MEM_LEVEL when that is smaller */
+#if MAX_MEM_LEVEL >= 8
+#  define DEF_MEM_LEVEL 8
+#else
+#  define DEF_MEM_LEVEL  MAX_MEM_LEVEL
+#endif
+
+/* default i/o buffer size -- double this for output when reading (this and
+   twice this must be able to fit in an unsigned type) */
+#define GZBUFSIZE 8192
+
+/* gzip modes, also provide a little integrity check on the passed structure */
+#define GZ_NONE 0       /* no "r", "w" or "a" seen yet in the mode string */
+#define GZ_READ 7247
+#define GZ_WRITE 31153
+#define GZ_APPEND 1     /* mode set to GZ_WRITE after the file is opened */
+
+/* values for the how field of gz_state (that field is used only for reading) */
+#define LOOK 0      /* look for a gzip header */
+#define COPY 1      /* copy input directly */
+#define GZIP 2      /* decompress a gzip stream */
+
+/* internal gzip file state data structure */
+typedef struct {
+        /* exposed contents for gzgetc() macro */
+    struct gzFile_s x;      /* "x" for exposed */
+                            /* x.have: number of bytes available at x.next */
+                            /* x.next: next output data to deliver or write */
+                            /* x.pos: current position in uncompressed data */
+        /* used for both reading and writing */
+    int mode;               /* GZ_READ or GZ_WRITE once open (see gzip modes above) */
+    int fd;                 /* file descriptor */
+    char *path;             /* path or fd for error messages */
+    unsigned size;          /* buffer size, zero if not allocated yet */
+    unsigned want;          /* requested buffer size, default is GZBUFSIZE */
+    unsigned char *in;      /* input buffer (double-sized when writing) */
+    unsigned char *out;     /* output buffer (double-sized when reading) */
+    int direct;             /* 0 if processing gzip, 1 if transparent */
+        /* just for reading */
+    int how;                /* LOOK: get header, COPY: copy, GZIP: decompress */
+    z_off64_t start;        /* where the gzip data started, for rewinding */
+    int eof;                /* true if end of input file reached */
+    int past;               /* true if read requested past end */
+        /* just for writing */
+    int level;              /* compression level */
+    int strategy;           /* compression strategy */
+        /* seek request */
+    z_off64_t skip;         /* amount to skip (already rewound if backwards) */
+    int seek;               /* true if seek request pending */
+        /* error information */
+    int err;                /* error code */
+    char *msg;              /* error message */
+        /* zlib inflate or deflate stream */
+    z_stream strm;          /* stream structure in-place (not a pointer) */
+} gz_state;
+typedef gz_state FAR *gz_statep;    /* gzFile handles are cast to this type */
+
+/* functions shared between the gz* source files */
+void ZLIB_INTERNAL gz_error OF((gz_statep, int, const char *));
+#if defined UNDER_CE
+char ZLIB_INTERNAL *gz_strwinerror OF((DWORD error));
+#endif /* UNDER_CE */
+
+/* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t
+   value -- needed when comparing unsigned to z_off64_t, which is signed
+   (possible z_off64_t types off_t, off64_t, and long are all signed) */
+#ifdef INT_MAX
+#  define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > INT_MAX)
+#else /* no INT_MAX macro: compare against the gz_intmax() function instead */
+unsigned ZLIB_INTERNAL gz_intmax OF((void));
+#  define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > gz_intmax())
+#endif /* INT_MAX */
diff --git a/deps/SZ/zlib/gzlib.c b/deps/SZ/zlib/gzlib.c
new file mode 100644
index 0000000000000000000000000000000000000000..4105e6aff92594fb9cfa557aa8349cea5a5d4a2b
--- /dev/null
+++ b/deps/SZ/zlib/gzlib.c
@@ -0,0 +1,637 @@
+/* gzlib.c -- zlib functions common to reading and writing gzip files
+ * Copyright (C) 2004-2017 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "gzguts.h"
+
+#if defined(_WIN32) && !defined(__BORLANDC__) && !defined(__MINGW32__)
+#  define LSEEK _lseeki64
+#else
+#if defined(_LARGEFILE64_SOURCE) && _LFS64_LARGEFILE-0
+#  define LSEEK lseek64
+#else
+#  define LSEEK lseek
+#endif /* _LARGEFILE64_SOURCE && _LFS64_LARGEFILE */
+#endif /* _WIN32 without Borland or MinGW */
+
+/* Local functions ("local" is defined as static in gzguts.h unless predefined) */
+local void gz_reset OF((gz_statep));
+local gzFile gz_open OF((const void *, int, const char *));
+
+#if defined UNDER_CE
+
+/* Map the Windows error number in ERROR to a locale-dependent error message
+   string and return a pointer to it.  Typically, the values for ERROR come
+   from GetLastError.
+
+   The string pointed to shall not be modified by the application, but may be
+   overwritten by a subsequent call to gz_strwinerror.
+
+   The gz_strwinerror function does not change the current setting of
+   GetLastError. */
+char ZLIB_INTERNAL *gz_strwinerror (error)
+     DWORD error;
+{
+    static char buf[1024];          /* static: every call reuses this buffer */
+
+    wchar_t *msgbuf;
+    DWORD lasterr = GetLastError(); /* saved here, restored before returning */
+    DWORD chars = FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM
+        | FORMAT_MESSAGE_ALLOCATE_BUFFER,
+        NULL,
+        error,
+        0, /* Default language */
+        (LPVOID)&msgbuf,
+        0,
+        NULL);
+    if (chars != 0) {
+        /* If there is an \r\n appended, zap it.  */
+        if (chars >= 2
+            && msgbuf[chars - 2] == '\r' && msgbuf[chars - 1] == '\n') {
+            chars -= 2;
+            msgbuf[chars] = 0;
+        }
+
+        if (chars > sizeof (buf) - 1) {     /* truncate to what buf can hold */
+            chars = sizeof (buf) - 1;
+            msgbuf[chars] = 0;
+        }
+
+        wcstombs(buf, msgbuf, chars + 1);
+        LocalFree(msgbuf);
+    }
+    else {
+        sprintf(buf, "unknown win32 error (%ld)", error);
+    }
+
+    SetLastError(lasterr);
+    return buf;
+}
+
+#endif /* UNDER_CE */
+
+/* Reset gzip file state: buffered output, pending seek, error, position and input count */
+local void gz_reset(state)
+    gz_statep state;
+{
+    state->x.have = 0;              /* no output data available */
+    if (state->mode == GZ_READ) {   /* for reading ... */
+        state->eof = 0;             /* not at end of file */
+        state->past = 0;            /* have not read past end yet */
+        state->how = LOOK;          /* look for gzip header */
+    }
+    state->seek = 0;                /* no seek request pending */
+    gz_error(state, Z_OK, NULL);    /* clear error */
+    state->x.pos = 0;               /* no uncompressed data yet */
+    state->strm.avail_in = 0;       /* no input data yet */
+}
+
+/* Open a gzip file by name (fd == -1), by wide name (fd == -2, WIDECHAR only) or by descriptor (fd > -1); NULL on failure. */
+local gzFile gz_open(path, fd, mode)
+    const void *path;
+    int fd;
+    const char *mode;
+{
+    gz_statep state;
+    z_size_t len;
+    int oflag;
+#ifdef O_CLOEXEC
+    int cloexec = 0;
+#endif
+#ifdef O_EXCL
+    int exclusive = 0;
+#endif
+
+    /* check input */
+    if (path == NULL)
+        return NULL;
+
+    /* allocate gzFile structure to return */
+    state = (gz_statep)malloc(sizeof(gz_state));
+    if (state == NULL)
+        return NULL;
+    state->size = 0;            /* no buffers allocated yet */
+    state->want = GZBUFSIZE;    /* requested buffer size */
+    state->msg = NULL;          /* no error message yet */
+
+    /* interpret mode */
+    state->mode = GZ_NONE;
+    state->level = Z_DEFAULT_COMPRESSION;
+    state->strategy = Z_DEFAULT_STRATEGY;
+    state->direct = 0;
+    while (*mode) {
+        if (*mode >= '0' && *mode <= '9')
+            state->level = *mode - '0';     /* a digit sets the compression level */
+        else
+            switch (*mode) {
+            case 'r':
+                state->mode = GZ_READ;
+                break;
+#ifndef NO_GZCOMPRESS
+            case 'w':
+                state->mode = GZ_WRITE;
+                break;
+            case 'a':
+                state->mode = GZ_APPEND;
+                break;
+#endif
+            case '+':       /* can't read and write at the same time */
+                free(state);
+                return NULL;
+            case 'b':       /* ignore -- will request binary anyway */
+                break;
+#ifdef O_CLOEXEC
+            case 'e':
+                cloexec = 1;
+                break;
+#endif
+#ifdef O_EXCL
+            case 'x':
+                exclusive = 1;
+                break;
+#endif
+            case 'f':
+                state->strategy = Z_FILTERED;
+                break;
+            case 'h':
+                state->strategy = Z_HUFFMAN_ONLY;
+                break;
+            case 'R':
+                state->strategy = Z_RLE;
+                break;
+            case 'F':
+                state->strategy = Z_FIXED;
+                break;
+            case 'T':
+                state->direct = 1;
+                break;
+            default:        /* could consider as an error, but just ignore */
+                ;
+            }
+        mode++;
+    }
+
+    /* must provide an "r", "w", or "a" */
+    if (state->mode == GZ_NONE) {
+        free(state);
+        return NULL;
+    }
+
+    /* can't force transparent read */
+    if (state->mode == GZ_READ) {
+        if (state->direct) {
+            free(state);
+            return NULL;
+        }
+        state->direct = 1;      /* for empty file */
+    }
+
+    /* save the path name for error messages */
+#ifdef WIDECHAR
+    if (fd == -2) {
+        len = wcstombs(NULL, path, 0);
+        if (len == (z_size_t)-1)
+            len = 0;            /* conversion failed: an empty path is stored */
+    }
+    else
+#endif
+        len = strlen((const char *)path);
+    state->path = (char *)malloc(len + 1);
+    if (state->path == NULL) {
+        free(state);
+        return NULL;
+    }
+#ifdef WIDECHAR
+    if (fd == -2)
+        if (len)
+            wcstombs(state->path, path, len + 1);
+        else
+            *(state->path) = 0;
+    else
+#endif
+#if !defined(NO_snprintf) && !defined(NO_vsnprintf)
+        (void)snprintf(state->path, len + 1, "%s", (const char *)path);
+#else
+        strcpy(state->path, path);
+#endif
+
+    /* compute the flags for open() */
+    oflag =
+#ifdef O_LARGEFILE
+        O_LARGEFILE |
+#endif
+#ifdef O_BINARY
+        O_BINARY |
+#endif
+#ifdef O_CLOEXEC
+        (cloexec ? O_CLOEXEC : 0) |
+#endif
+        (state->mode == GZ_READ ?
+         O_RDONLY :
+         (O_WRONLY | O_CREAT |
+#ifdef O_EXCL
+          (exclusive ? O_EXCL : 0) |
+#endif
+          (state->mode == GZ_WRITE ?
+           O_TRUNC :
+           O_APPEND)));
+
+    /* open the file with the appropriate flags (or just use fd) */
+    state->fd = fd > -1 ? fd : (
+#ifdef WIDECHAR
+        fd == -2 ? _wopen(path, oflag, 0666) :
+#endif
+        open((const char *)path, oflag, 0666));
+    if (state->fd == -1) {
+        free(state->path);
+        free(state);
+        return NULL;
+    }
+    if (state->mode == GZ_APPEND) {
+        LSEEK(state->fd, 0, SEEK_END);  /* so gzoffset() is correct */
+        state->mode = GZ_WRITE;         /* simplify later checks */
+    }
+
+    /* save the current position for rewinding (only if reading) */
+    if (state->mode == GZ_READ) {
+        state->start = LSEEK(state->fd, 0, SEEK_CUR);
+        if (state->start == -1) state->start = 0;   /* LSEEK failed: use 0 */
+    }
+
+    /* initialize stream */
+    gz_reset(state);
+
+    /* return stream */
+    return (gzFile)state;
+}
+
+/* -- see zlib.h -- */
+gzFile ZEXPORT gzopen(path, mode)
+    const char *path;
+    const char *mode;
+{
+    return gz_open(path, -1, mode);     /* fd of -1: open by path name */
+}
+
+/* -- see zlib.h -- */
+gzFile ZEXPORT gzopen64(path, mode)
+    const char *path;
+    const char *mode;
+{
+    return gz_open(path, -1, mode);     /* same call as gzopen() makes */
+}
+
+/* -- see zlib.h -- */
+gzFile ZEXPORT gzdopen(fd, mode)
+    int fd;
+    const char *mode;
+{
+    char *path;         /* identifier for error messages */
+    gzFile gz;
+
+    if (fd == -1 || (path = (char *)malloc(7 + 3 * sizeof(int))) == NULL)   /* 3 digits per int byte, plus 7 */
+        return NULL;
+#if !defined(NO_snprintf) && !defined(NO_vsnprintf)
+    (void)snprintf(path, 7 + 3 * sizeof(int), "<fd:%d>", fd);
+#else
+    sprintf(path, "<fd:%d>", fd);   /* for debugging */
+#endif
+    gz = gz_open(path, fd, mode);
+    free(path);                     /* gz_open stores its own copy of path */
+    return gz;
+}
+
+/* -- see zlib.h -- */
+#ifdef WIDECHAR
+gzFile ZEXPORT gzopen_w(path, mode)
+    const wchar_t *path;
+    const char *mode;
+{
+    return gz_open(path, -2, mode);     /* fd of -2: path is a wide string */
+}
+#endif /* WIDECHAR */
+
+/* -- see zlib.h -- */
+int ZEXPORT gzbuffer(file, size)
+    gzFile file;
+    unsigned size;
+{
+    gz_statep state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
+        return -1;
+
+    /* make sure we haven't already allocated memory */
+    if (state->size != 0)
+        return -1;
+
+    /* check and set requested size */
+    if ((size << 1) < size)
+        return -1;              /* need to be able to double it */
+    if (size < 2)
+        size = 2;               /* need two bytes to check magic header */
+    state->want = size;         /* only recorded; nothing is allocated here */
+    return 0;
+}
+
+/* -- see zlib.h -- */
+int ZEXPORT gzrewind(file)
+    gzFile file;
+{
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+
+    /* check that we're reading and that there's no error */
+    if (state->mode != GZ_READ ||
+            (state->err != Z_OK && state->err != Z_BUF_ERROR))
+        return -1;
+
+    /* back up and start over */
+    if (LSEEK(state->fd, state->start, SEEK_SET) == -1)
+        return -1;
+    gz_reset(state);            /* clears position, error and pending seek */
+    return 0;
+}
+
+/* -- see zlib.h -- */
+z_off64_t ZEXPORT gzseek64(file, offset, whence)
+    gzFile file;
+    z_off64_t offset;
+    int whence;
+{
+    unsigned n;                 /* bytes taken from the output buffer below */
+    z_off64_t ret;
+    gz_statep state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
+        return -1;
+
+    /* check that there's no error */
+    if (state->err != Z_OK && state->err != Z_BUF_ERROR)
+        return -1;
+
+    /* can only seek from start or relative to current position */
+    if (whence != SEEK_SET && whence != SEEK_CUR)
+        return -1;
+
+    /* normalize offset to a SEEK_CUR specification */
+    if (whence == SEEK_SET)
+        offset -= state->x.pos;
+    else if (state->seek)
+        offset += state->skip;  /* fold in the skip that is still pending */
+    state->seek = 0;
+
+    /* if within raw area while reading, just go there */
+    if (state->mode == GZ_READ && state->how == COPY &&
+            state->x.pos + offset >= 0) {
+        ret = LSEEK(state->fd, offset - state->x.have, SEEK_CUR);
+        if (ret == -1)
+            return -1;
+        state->x.have = 0;
+        state->eof = 0;
+        state->past = 0;
+        state->seek = 0;
+        gz_error(state, Z_OK, NULL);
+        state->strm.avail_in = 0;
+        state->x.pos += offset;
+        return state->x.pos;
+    }
+
+    /* calculate skip amount, rewinding if needed for back seek when reading */
+    if (offset < 0) {
+        if (state->mode != GZ_READ)         /* writing -- can't go backwards */
+            return -1;
+        offset += state->x.pos;
+        if (offset < 0)                     /* before start of file! */
+            return -1;
+        if (gzrewind(file) == -1)           /* rewind, then skip to offset */
+            return -1;
+    }
+
+    /* if reading, skip what's in output buffer (one less gzgetc() check) */
+    if (state->mode == GZ_READ) {
+        n = GT_OFF(state->x.have) || (z_off64_t)state->x.have > offset ?
+            (unsigned)offset : state->x.have;
+        state->x.have -= n;
+        state->x.next += n;
+        state->x.pos += n;
+        offset -= n;
+    }
+
+    /* request skip (if not zero) */
+    if (offset) {
+        state->seek = 1;
+        state->skip = offset;
+    }
+    return state->x.pos + offset;   /* position once the requested skip is done */
+}
+
+/* -- see zlib.h -- */
+/* z_off_t flavour of gzseek64().  The 64-bit result is handed back only when
+   it survives a round trip through z_off_t; otherwise -1 is returned. */
+z_off_t ZEXPORT gzseek(file, offset, whence)
+    gzFile file;
+    z_off_t offset;
+    int whence;
+{
+    z_off64_t wide;
+    z_off_t narrow;
+
+    wide = gzseek64(file, (z_off64_t)offset, whence);
+    narrow = (z_off_t)wide;
+    if (wide != narrow)
+        return -1;
+    return narrow;
+}
+
+/* -- see zlib.h -- */
+/* Report the logical position of file: state->x.pos plus any skip that has
+   been requested but not yet carried out.  Returns -1 if file is NULL or is
+   not open for reading or writing. */
+z_off64_t ZEXPORT gztell64(file)
+    gzFile file;
+{
+    gz_statep gz = (gz_statep)file;
+
+    if (gz == NULL || (gz->mode != GZ_READ && gz->mode != GZ_WRITE))
+        return -1;
+
+    /* no skip pending -- the stored position is the answer */
+    if (!gz->seek)
+        return gz->x.pos + 0;
+
+    /* a pending skip counts as already done */
+    return gz->x.pos + gz->skip;
+}
+
+/* -- see zlib.h -- */
+/* z_off_t flavour of gztell64().  Returns -1 when the 64-bit position cannot
+   be represented in a z_off_t. */
+z_off_t ZEXPORT gztell(file)
+    gzFile file;
+{
+    z_off64_t wide;
+    z_off_t narrow;
+
+    wide = gztell64(file);
+    narrow = (z_off_t)wide;
+    if (wide != narrow)
+        return -1;
+    return narrow;
+}
+
+/* -- see zlib.h -- */
+/* Report the current offset of the underlying descriptor, as obtained from
+   LSEEK().  When reading, input that has been loaded but not yet consumed
+   (strm.avail_in) is subtracted.  Returns -1 if file is NULL, is not open for
+   reading or writing, or the descriptor offset cannot be obtained. */
+z_off64_t ZEXPORT gzoffset64(file)
+    gzFile file;
+{
+    gz_statep gz = (gz_statep)file;
+    z_off64_t where;
+
+    if (gz == NULL || (gz->mode != GZ_READ && gz->mode != GZ_WRITE))
+        return -1;
+
+    /* ask the descriptor where it is without moving it */
+    where = LSEEK(gz->fd, 0, SEEK_CUR);
+    if (where == -1)
+        return -1;
+
+    /* don't count buffered input when reading */
+    if (gz->mode == GZ_READ)
+        where -= gz->strm.avail_in;
+    return where;
+}
+
+/* -- see zlib.h -- */
+/* z_off_t flavour of gzoffset64().  Returns -1 when the 64-bit offset cannot
+   be represented in a z_off_t. */
+z_off_t ZEXPORT gzoffset(file)
+    gzFile file;
+{
+    z_off64_t wide;
+    z_off_t narrow;
+
+    wide = gzoffset64(file);
+    narrow = (z_off_t)wide;
+    if (wide != narrow)
+        return -1;
+    return narrow;
+}
+
+/* -- see zlib.h -- */
+/* Return the "read past end" flag (state->past) of a stream open for
+   reading.  Returns 0 for a NULL handle and for any stream that is not in
+   read mode. */
+int ZEXPORT gzeof(file)
+    gzFile file;
+{
+    gz_statep gz = (gz_statep)file;
+
+    if (gz == NULL)
+        return 0;
+
+    /* write streams and unrecognized handles are never at end-of-file */
+    if (gz->mode != GZ_READ)
+        return 0;
+
+    return gz->past;
+}
+
+/* -- see zlib.h -- */
+/* Return the message for the last error recorded on file and, when errnum is
+   not NULL, store the error code through it.  Z_MEM_ERROR always yields the
+   literal "out of memory"; a stream with no stored message yields "".
+   Returns NULL (leaving *errnum untouched) if file is NULL or is not open for
+   reading or writing. */
+const char * ZEXPORT gzerror(file, errnum)
+    gzFile file;
+    int *errnum;
+{
+    gz_statep gz = (gz_statep)file;
+
+    if (gz == NULL || (gz->mode != GZ_READ && gz->mode != GZ_WRITE))
+        return NULL;
+
+    /* hand back the code when the caller asked for it */
+    if (errnum != NULL)
+        *errnum = gz->err;
+
+    /* pick the text */
+    if (gz->err == Z_MEM_ERROR)
+        return "out of memory";
+    if (gz->msg == NULL)
+        return "";
+    return gz->msg;
+}
+
+/* -- see zlib.h -- */
+/* Reset the error state of file to Z_OK with no message.  For a read stream
+   the eof and past flags are cleared as well.  Does nothing if file is NULL
+   or is not open for reading or writing. */
+void ZEXPORT gzclearerr(file)
+    gzFile file;
+{
+    gz_statep gz = (gz_statep)file;
+
+    if (gz == NULL)
+        return;
+
+    if (gz->mode == GZ_READ) {
+        /* reading -- also forget that the end was reached */
+        gz->eof = 0;
+        gz->past = 0;
+    }
+    else if (gz->mode != GZ_WRITE)
+        return;                 /* not a handle we recognize */
+
+    gz_error(gz, Z_OK, NULL);
+}
+
+/* Create an error message in allocated memory and set state->err and
+   state->msg accordingly.  Free any previous error message already there.  Do
+   not try to free or allocate space if the error is Z_MEM_ERROR (out of
+   memory).  Simply save the error message as a static string.  If there is an
+   allocation failure constructing the error message, then convert the error to
+   out of memory. */
+void ZLIB_INTERNAL gz_error(state, err, msg)
+    gz_statep state;
+    int err;
+    const char *msg;
+{
+    /* free previously allocated message and clear */
+    if (state->msg != NULL) {
+        if (state->err != Z_MEM_ERROR)
+            free(state->msg);
+        state->msg = NULL;
+    }
+
+    /* if fatal, set state->x.have to 0 so that the gzgetc() macro fails */
+    if (err != Z_OK && err != Z_BUF_ERROR)
+        state->x.have = 0;
+
+    /* set error code, and if no message, then done */
+    state->err = err;
+    if (msg == NULL)
+        return;
+
+    /* for an out of memory error, return literal string when requested */
+    if (err == Z_MEM_ERROR)
+        return;
+
+    /* construct error message with path */
+    if ((state->msg = (char *)malloc(strlen(state->path) + strlen(msg) + 3)) ==
+            NULL) {
+        state->err = Z_MEM_ERROR;
+        return;
+    }
+#if !defined(NO_snprintf) && !defined(NO_vsnprintf)
+    (void)snprintf(state->msg, strlen(state->path) + strlen(msg) + 3,
+                   "%s%s%s", state->path, ": ", msg);
+#else
+    strcpy(state->msg, state->path);
+    strcat(state->msg, ": ");
+    strcat(state->msg, msg);
+#endif
+}
+
+#ifndef INT_MAX
+/* Compute the largest int without relying on limits.h.  An unsigned value is
+   filled with one bits from the bottom up until it stops growing, and half of
+   the last value is returned.  This avoids assuming 2's complement, which a
+   plain ((unsigned)-1) >> 1 would do. */
+unsigned ZLIB_INTERNAL gz_intmax()
+{
+    unsigned cur = 1;
+    unsigned prev;
+
+    for (;;) {
+        prev = cur;
+        cur = (cur << 1) + 1;   /* shift in one more one bit */
+        if (cur <= prev)        /* no longer growing -- every bit is set */
+            break;
+    }
+    return prev >> 1;
+}
+#endif
diff --git a/deps/SZ/zlib/gzread.c b/deps/SZ/zlib/gzread.c
new file mode 100644
index 0000000000000000000000000000000000000000..956b91ea7d9e2a7cd554f7d6561142509b655244
--- /dev/null
+++ b/deps/SZ/zlib/gzread.c
@@ -0,0 +1,654 @@
+/* gzread.c -- zlib functions for reading gzip files
+ * Copyright (C) 2004, 2005, 2010, 2011, 2012, 2013, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "gzguts.h"
+
+/* Local functions */
+local int gz_load OF((gz_statep, unsigned char *, unsigned, unsigned *));
+local int gz_avail OF((gz_statep));
+local int gz_look OF((gz_statep));
+local int gz_decomp OF((gz_statep));
+local int gz_fetch OF((gz_statep));
+local int gz_skip OF((gz_statep, z_off64_t));
+local z_size_t gz_read OF((gz_statep, voidp, z_size_t));
+
+/* Fill buf with up to len bytes from state->fd, storing the number of bytes
+   obtained in *have.  read() is called repeatedly because a single call may
+   deliver less than was asked for.  Returns 0 on success, setting state->eof
+   when read() reports end of file, or -1 after recording Z_ERRNO on state
+   when read() fails.  Note that read() is called at least once even when len
+   is zero. */
+local int gz_load(state, buf, len, have)
+    gz_statep state;
+    unsigned char *buf;
+    unsigned len;
+    unsigned *have;
+{
+    int got;
+    unsigned want;
+    /* largest single request (NOTE(review): presumably chosen so that the
+       count returned by read() always fits in an int -- confirm) */
+    const unsigned limit = ((unsigned)-1 >> 2) + 1;
+
+    *have = 0;
+    for (;;) {
+        want = len - *have;
+        if (want > limit)
+            want = limit;
+
+        got = read(state->fd, buf + *have, want);
+        if (got < 0) {
+            gz_error(state, Z_ERRNO, zstrerror());
+            return -1;
+        }
+        if (got == 0) {
+            state->eof = 1;
+            return 0;
+        }
+
+        *have += (unsigned)got;
+        if (*have >= len)
+            return 0;
+    }
+}
+
+/* Load up input buffer and set eof flag if last data loaded -- return -1 on
+   error, 0 otherwise.  Note that the eof flag is set when the end of the input
+   file is reached, even though there may be unused data in the buffer.  Once
+   that data has been used, no more attempts will be made to read the file.
+   If strm->avail_in != 0, then the current data is moved to the beginning of
+   the input buffer, and then the remainder of the buffer is loaded with the
+   available data from the input file. */
+local int gz_avail(state)
+    gz_statep state;
+{
+    unsigned got;
+    z_streamp strm = &(state->strm);
+
+    if (state->err != Z_OK && state->err != Z_BUF_ERROR)
+        return -1;
+    if (state->eof == 0) {
+        if (strm->avail_in) {       /* copy what's there to the start */
+            unsigned char *p = state->in;
+            unsigned const char *q = strm->next_in;
+            unsigned n = strm->avail_in;
+            do {
+                *p++ = *q++;
+            } while (--n);
+        }
+        if (gz_load(state, state->in + strm->avail_in,
+                    state->size - strm->avail_in, &got) == -1)
+            return -1;
+        strm->avail_in += got;
+        strm->next_in = state->in;
+    }
+    return 0;
+}
+
+/* Look for gzip header, set up for inflate or copy.  state->x.have must be 0.
+   If this is the first time in, allocate required memory.  state->how will be
+   left unchanged if there is no more input data available, will be set to COPY
+   if there is no gzip header and direct copying will be performed, or it will
+   be set to GZIP for decompression.  If direct copying, then leftover input
+   data from the input buffer will be copied to the output buffer.  In that
+   case, all further file reads will be directly to either the output buffer or
+   a user buffer.  If decompressing, the inflate state will be initialized.
+   gz_look() will return 0 on success or -1 on failure.  On an allocation or
+   inflateInit2() failure, Z_MEM_ERROR is recorded on state and state->size is
+   left at zero, so a later call tries the allocation again. */
+local int gz_look(state)
+    gz_statep state;
+{
+    z_streamp strm = &(state->strm);
+
+    /* allocate read buffers and inflate memory */
+    if (state->size == 0) {
+        /* allocate buffers -- the output buffer is twice the requested size */
+        state->in = (unsigned char *)malloc(state->want);
+        state->out = (unsigned char *)malloc(state->want << 1);
+        if (state->in == NULL || state->out == NULL) {
+            /* free(NULL) is a no-op, so both can be released unconditionally */
+            free(state->out);
+            free(state->in);
+            gz_error(state, Z_MEM_ERROR, "out of memory");
+            return -1;
+        }
+        state->size = state->want;
+
+        /* allocate inflate memory */
+        state->strm.zalloc = Z_NULL;
+        state->strm.zfree = Z_NULL;
+        state->strm.opaque = Z_NULL;
+        state->strm.avail_in = 0;
+        state->strm.next_in = Z_NULL;
+        if (inflateInit2(&(state->strm), 15 + 16) != Z_OK) {    /* gunzip */
+            free(state->out);
+            free(state->in);
+            state->size = 0;        /* mark the buffers as not allocated */
+            gz_error(state, Z_MEM_ERROR, "out of memory");
+            return -1;
+        }
+    }
+
+    /* get at least the magic bytes in the input buffer */
+    if (strm->avail_in < 2) {
+        if (gz_avail(state) == -1)
+            return -1;
+        if (strm->avail_in == 0)
+            return 0;               /* no input at all -- how is unchanged */
+    }
+
+    /* look for gzip magic bytes -- if there, do gzip decoding (note: there is
+       a logical dilemma here when considering the case of a partially written
+       gzip file, to wit, if a single 31 byte is written, then we cannot tell
+       whether this is a single-byte file, or just a partially written gzip
+       file -- for here we assume that if a gzip file is being written, then
+       the header will be written in a single operation, so that reading a
+       single byte is sufficient indication that it is not a gzip file) */
+    if (strm->avail_in > 1 &&
+            strm->next_in[0] == 31 && strm->next_in[1] == 139) {
+        inflateReset(strm);
+        state->how = GZIP;
+        state->direct = 0;
+        return 0;
+    }
+
+    /* no gzip header -- if we were decoding gzip before, then this is trailing
+       garbage.  Ignore the trailing garbage and finish. */
+    if (state->direct == 0) {
+        strm->avail_in = 0;
+        state->eof = 1;
+        state->x.have = 0;
+        return 0;
+    }
+
+    /* doing raw i/o, copy any leftover input to output -- this assumes that
+       the output buffer is larger than the input buffer, which also assures
+       space for gzungetc() */
+    state->x.next = state->out;
+    if (strm->avail_in) {
+        memcpy(state->x.next, strm->next_in, strm->avail_in);
+        state->x.have = strm->avail_in;
+        strm->avail_in = 0;
+    }
+    state->how = COPY;
+    state->direct = 1;
+    return 0;
+}
+
+/* Run inflate() into the output area described by strm->next_out and
+   strm->avail_out until that area is full, the deflate stream ends, or the
+   input runs dry.  On return state->x.next and state->x.have describe the
+   bytes just produced.  When the gzip stream ends, state->how goes back to
+   LOOK so that a following gzip stream or raw data is detected once x.have
+   has been used up.  Running out of input records Z_BUF_ERROR but still
+   counts as success.  Returns 0 on success, -1 on failure. */
+local int gz_decomp(state)
+    gz_statep state;
+{
+    z_streamp strm = &(state->strm);
+    unsigned room = strm->avail_out;    /* space available on entry */
+    int status = Z_OK;
+
+    for (;;) {
+        /* inflate() needs input -- load some when there is none */
+        if (strm->avail_in == 0) {
+            if (gz_avail(state) == -1)
+                return -1;
+            if (strm->avail_in == 0) {
+                gz_error(state, Z_BUF_ERROR, "unexpected end of file");
+                break;
+            }
+        }
+
+        /* decompress, giving up on anything unrecoverable */
+        status = inflate(strm, Z_NO_FLUSH);
+        switch (status) {
+        case Z_STREAM_ERROR:
+        case Z_NEED_DICT:
+            gz_error(state, Z_STREAM_ERROR,
+                     "internal error: inflate stream corrupt");
+            return -1;
+        case Z_MEM_ERROR:
+            gz_error(state, Z_MEM_ERROR, "out of memory");
+            return -1;
+        case Z_DATA_ERROR:              /* deflate stream invalid */
+            gz_error(state, Z_DATA_ERROR,
+                     strm->msg == NULL ? "compressed data error" : strm->msg);
+            return -1;
+        default:
+            break;
+        }
+
+        /* stop when the output is full or the stream has ended */
+        if (strm->avail_out == 0 || status == Z_STREAM_END)
+            break;
+    }
+
+    /* describe what was produced */
+    state->x.have = room - strm->avail_out;
+    state->x.next = strm->next_out - state->x.have;
+
+    /* a finished gzip stream may be followed by another one */
+    if (status == Z_STREAM_END)
+        state->how = LOOK;
+
+    return 0;
+}
+
+/* Produce data in the output buffer.  Assumes state->x.have is 0.  Depending
+   on state->how, the data is copied straight from the file (COPY) or
+   decompressed from it (GZIP); with LOOK, gz_look() first decides which of
+   the two applies.  Returns -1 on error, otherwise 0.  state->how is left as
+   COPY or GZIP unless the end of the input file has been reached and all
+   data has been processed. */
+local int gz_fetch(state)
+    gz_statep state;
+{
+    z_streamp strm = &(state->strm);
+
+    do {
+        if (state->how == LOOK) {
+            /* -> LOOK, COPY (only if never GZIP), or GZIP */
+            if (gz_look(state) == -1)
+                return -1;
+            if (state->how == LOOK)
+                return 0;
+        }
+        else if (state->how == COPY) {
+            /* -> COPY */
+            if (gz_load(state, state->out, state->size << 1, &(state->x.have))
+                    == -1)
+                return -1;
+            state->x.next = state->out;
+            return 0;
+        }
+        else if (state->how == GZIP) {
+            /* -> GZIP or LOOK (if end of gzip stream) */
+            strm->avail_out = state->size << 1;
+            strm->next_out = state->out;
+            if (gz_decomp(state) == -1)
+                return -1;
+        }
+
+        /* go around again while nothing was produced and input remains */
+    } while (state->x.have == 0 && (!state->eof || strm->avail_in));
+    return 0;
+}
+
+/* Discard len bytes of uncompressed output, stopping early if the end of the
+   input is reached first.  state->x.pos advances by the number of bytes
+   discarded.  Returns -1 on error, 0 on success. */
+local int gz_skip(state, len)
+    gz_statep state;
+    z_off64_t len;
+{
+    unsigned step;
+
+    while (len) {
+        if (state->x.have == 0) {
+            /* output buffer empty -- done if we're at the end of the input,
+               otherwise get more output, looking for header if required */
+            if (state->eof && state->strm.avail_in == 0)
+                break;
+            if (gz_fetch(state) == -1)
+                return -1;
+            continue;
+        }
+
+        /* take the smaller of what is buffered and what is left to skip */
+        step = state->x.have;
+        if (GT_OFF(state->x.have) || (z_off64_t)state->x.have > len)
+            step = (unsigned)len;
+
+        state->x.next += step;
+        state->x.have -= step;
+        state->x.pos += step;
+        len -= step;
+    }
+    return 0;
+}
+
+/* Read len bytes into buf from file, or less than len up to the end of the
+   input.  Return the number of bytes read.  If zero is returned, either the
+   end of file was reached, or there was an error.  state->err must be
+   consulted in that case to determine which.  Note that zero is also returned
+   when an error occurs after some bytes have already been delivered to buf.
+   A pending skip request (state->seek) is carried out first.  Small requests
+   are served through the internal output buffer; requests of at least
+   twice state->size are read or decompressed straight into buf. */
+local z_size_t gz_read(state, buf, len)
+    gz_statep state;
+    voidp buf;
+    z_size_t len;
+{
+    z_size_t got;
+    unsigned n;
+
+    /* if len is zero, avoid unnecessary operations */
+    if (len == 0)
+        return 0;
+
+    /* process a skip request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_skip(state, state->skip) == -1)
+            return 0;
+    }
+
+    /* get len bytes to buf, or less than len if at the end */
+    got = 0;
+    do {
+        /* set n to the maximum amount of len that fits in an unsigned int
+           (assigning -1 to an unsigned yields its largest value) */
+        n = -1;
+        if (n > len)
+            n = len;
+
+        /* first just try copying data from the output buffer */
+        if (state->x.have) {
+            if (state->x.have < n)
+                n = state->x.have;
+            memcpy(buf, state->x.next, n);
+            state->x.next += n;
+            state->x.have -= n;
+        }
+
+        /* output buffer empty -- return if we're at the end of the input */
+        else if (state->eof && state->strm.avail_in == 0) {
+            state->past = 1;        /* tried to read past end */
+            break;
+        }
+
+        /* need output data -- for small len or new stream load up our output
+           buffer */
+        else if (state->how == LOOK || n < (state->size << 1)) {
+            /* get more output, looking for header if required */
+            if (gz_fetch(state) == -1)
+                return 0;
+            continue;       /* no progress yet -- go back to copy above */
+            /* the copy above assures that we will leave with space in the
+               output buffer, allowing at least one gzungetc() to succeed */
+            /* (continue jumps to the while (len) test below, which still
+               holds because len has not been changed in this pass) */
+        }
+
+        /* large len -- read directly into user buffer */
+        else if (state->how == COPY) {      /* read directly */
+            /* n is passed by value as the length, so it is safe to reuse it
+               as the place where the byte count is returned */
+            if (gz_load(state, (unsigned char *)buf, n, &n) == -1)
+                return 0;
+        }
+
+        /* large len -- decompress directly into user buffer */
+        else {  /* state->how == GZIP */
+            state->strm.avail_out = n;
+            state->strm.next_out = (unsigned char *)buf;
+            if (gz_decomp(state) == -1)
+                return 0;
+            /* the bytes went to buf, not the internal buffer, so take the
+               count and leave the internal buffer marked empty */
+            n = state->x.have;
+            state->x.have = 0;
+        }
+
+        /* update progress */
+        len -= n;
+        buf = (char *)buf + n;
+        got += n;
+        state->x.pos += n;
+    } while (len);
+
+    /* return number of bytes read into user buffer */
+    return got;
+}
+
+/* -- see zlib.h -- */
+/* Read up to len uncompressed bytes from file into buf.  Returns the number
+   of bytes read, which may be less than len, or -1 on error.  An error is
+   reported for a NULL handle, a stream not open for reading, a stream already
+   carrying an error other than Z_BUF_ERROR, a len that does not fit in an
+   int, or a read that delivers nothing and leaves such an error behind. */
+int ZEXPORT gzread(file, buf, len)
+    gzFile file;
+    voidp buf;
+    unsigned len;
+{
+    gz_statep gz;
+    unsigned count;
+
+    if (file == NULL)
+        return -1;
+    gz = (gz_statep)file;
+
+    /* must be reading, with nothing worse than Z_BUF_ERROR pending */
+    if (gz->mode != GZ_READ)
+        return -1;
+    if (gz->err != Z_OK && gz->err != Z_BUF_ERROR)
+        return -1;
+
+    /* the result is an int, so the request has to fit in one as well (this
+       avoids a flaw in the interface) */
+    if ((int)len < 0) {
+        gz_error(gz, Z_STREAM_ERROR, "request does not fit in an int");
+        return -1;
+    }
+
+    /* read len or fewer bytes to buf */
+    count = gz_read(gz, buf, len);
+
+    /* nothing read and a serious error recorded means failure */
+    if (count == 0 && gz->err != Z_OK && gz->err != Z_BUF_ERROR)
+        return -1;
+
+    /* the count is assured to fit in an int */
+    return (int)count;
+}
+
+/* -- see zlib.h -- */
+/* Read up to nitems items of size bytes each from file into buf and return
+   the number of complete items read.  Returns 0 for a NULL handle, a stream
+   not open for reading, a stream carrying an error other than Z_BUF_ERROR, a
+   byte count of zero, or when nitems * size overflows z_size_t (in which case
+   Z_STREAM_ERROR is recorded). */
+z_size_t ZEXPORT gzfread(buf, size, nitems, file)
+    voidp buf;
+    z_size_t size;
+    z_size_t nitems;
+    gzFile file;
+{
+    gz_statep gz;
+    z_size_t total;
+
+    if (file == NULL)
+        return 0;
+    gz = (gz_statep)file;
+
+    /* must be reading, with nothing worse than Z_BUF_ERROR pending */
+    if (gz->mode != GZ_READ)
+        return 0;
+    if (gz->err != Z_OK && gz->err != Z_BUF_ERROR)
+        return 0;
+
+    /* bytes wanted -- dividing back detects a wrapped multiplication */
+    total = nitems * size;
+    if (size != 0 && total / size != nitems) {
+        gz_error(gz, Z_STREAM_ERROR, "request does not fit in a size_t");
+        return 0;
+    }
+
+    /* nothing requested */
+    if (total == 0)
+        return 0;
+
+    /* read, then convert the byte count into whole items */
+    return gz_read(gz, buf, total) / size;
+}
+
+/* -- see zlib.h -- */
+#ifdef Z_PREFIX_SET
+#  undef z_gzgetc
+#else
+#  undef gzgetc
+#endif
+/* Read one byte from file and return it as a value from 0 to 255, or -1 at
+   end of file or on error.  A byte waiting in the output buffer is taken
+   directly; otherwise gz_read() is asked for a single byte. */
+int ZEXPORT gzgetc(file)
+    gzFile file;
+{
+    gz_statep gz;
+    unsigned char one[1];
+    int got;
+
+    if (file == NULL)
+        return -1;
+    gz = (gz_statep)file;
+
+    /* must be reading, with nothing worse than Z_BUF_ERROR pending */
+    if (gz->mode != GZ_READ)
+        return -1;
+    if (gz->err != Z_OK && gz->err != Z_BUF_ERROR)
+        return -1;
+
+    /* try output buffer (no need to check for skip request) */
+    if (gz->x.have) {
+        int ch = *(gz->x.next);
+
+        gz->x.next++;
+        gz->x.have--;
+        gz->x.pos++;
+        return ch;
+    }
+
+    /* nothing there -- try gz_read() */
+    got = gz_read(gz, one, 1);
+    if (got < 1)
+        return -1;
+    return one[0];
+}
+
+/* Plain function that forwards to gzgetc() and returns its result.
+   NOTE(review): presumably kept so that code which sees gzgetc as a macro
+   still has a real function to call -- confirm against zlib.h. */
+int ZEXPORT gzgetc_(file)
+    gzFile file;
+{
+    int ch;
+
+    ch = gzgetc(file);
+    return ch;
+}
+
+/* -- see zlib.h -- */
+/* Push the byte c back onto the read stream file so that it is the next byte
+   read.  Returns c on success, or -1 if file is NULL, the stream is not open
+   for reading, it carries an error other than Z_BUF_ERROR, a pending skip
+   fails, c is negative, or the output buffer (twice state->size bytes) is
+   already full.  On success state->x.pos is decremented and the "read past
+   end" flag is cleared.
+
+   The buffers are allocated lazily by gz_look(), so a stream that has not
+   been read from yet has state->size == 0 and no output buffer.  The buffers
+   are therefore set up here first; otherwise the very first push would store
+   through an unallocated state->out. */
+int ZEXPORT gzungetc(c, file)
+    int c;
+    gzFile file;
+{
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+
+    /* in case this was just opened, set up the input buffer -- a failure is
+       recorded in state->err and caught by the check that follows */
+    if (state->mode == GZ_READ && state->how == LOOK && state->x.have == 0)
+        (void)gz_look(state);
+
+    /* check that we're reading and that there's no (serious) error */
+    if (state->mode != GZ_READ ||
+        (state->err != Z_OK && state->err != Z_BUF_ERROR))
+        return -1;
+
+    /* process a skip request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_skip(state, state->skip) == -1)
+            return -1;
+    }
+
+    /* can't push EOF */
+    if (c < 0)
+        return -1;
+
+    /* if output buffer empty, put byte at end (allows more pushing) */
+    if (state->x.have == 0) {
+        state->x.have = 1;
+        state->x.next = state->out + (state->size << 1) - 1;
+        state->x.next[0] = (unsigned char)c;
+        state->x.pos--;
+        state->past = 0;
+        return c;
+    }
+
+    /* if no room, give up (must have already done a gzungetc()) */
+    if (state->x.have == (state->size << 1)) {
+        gz_error(state, Z_DATA_ERROR, "out of room to push characters");
+        return -1;
+    }
+
+    /* slide output data if needed and insert byte before existing data --
+       the data is moved to the end of the buffer, copying backwards so that
+       overlapping source and destination are handled correctly */
+    if (state->x.next == state->out) {
+        unsigned char *src = state->out + state->x.have;
+        unsigned char *dest = state->out + (state->size << 1);
+        while (src > state->out)
+            *--dest = *--src;
+        state->x.next = dest;
+    }
+    state->x.have++;
+    state->x.next--;
+    state->x.next[0] = (unsigned char)c;
+    state->x.pos--;
+    state->past = 0;
+    return c;
+}
+
+/* -- see zlib.h -- */
+/* Read bytes from file into buf until len - 1 bytes have been stored, a
+   newline has been stored, or the end of the input is reached, then append a
+   terminating zero.  Returns buf, or NULL if nothing was stored.  NULL is
+   also returned for a NULL file or buf, len < 1, a stream not open for
+   reading, a stream carrying an error other than Z_BUF_ERROR, or a failure
+   while skipping or fetching data.  Note that len == 1 leaves no room for
+   data, so it returns NULL as well, without writing to buf. */
+char * ZEXPORT gzgets(file, buf, len)
+    gzFile file;
+    char *buf;
+    int len;
+{
+    unsigned left, n;
+    char *str;
+    unsigned char *eol;
+    gz_statep state;
+
+    /* check parameters and get internal structure */
+    if (file == NULL || buf == NULL || len < 1)
+        return NULL;
+    state = (gz_statep)file;
+
+    /* check that we're reading and that there's no (serious) error */
+    if (state->mode != GZ_READ ||
+        (state->err != Z_OK && state->err != Z_BUF_ERROR))
+        return NULL;
+
+    /* process a skip request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_skip(state, state->skip) == -1)
+            return NULL;
+    }
+
+    /* copy output bytes up to new line or len - 1, whichever comes first --
+       append a terminating zero to the string (we don't check for a zero in
+       the contents, let the user worry about that) */
+    str = buf;
+    left = (unsigned)len - 1;
+    /* the loop is skipped entirely when there is no room for data; eol is
+       only read in the loop condition, after it has been assigned */
+    if (left) do {
+        /* assure that something is in the output buffer */
+        if (state->x.have == 0 && gz_fetch(state) == -1)
+            return NULL;                /* error */
+        if (state->x.have == 0) {       /* end of file */
+            state->past = 1;            /* read past end */
+            break;                      /* return what we have */
+        }
+
+        /* look for end-of-line in current output buffer */
+        n = state->x.have > left ? left : state->x.have;
+        eol = (unsigned char *)memchr(state->x.next, '\n', n);
+        if (eol != NULL)
+            n = (unsigned)(eol - state->x.next) + 1;
+
+        /* copy through end-of-line, or remainder if not found */
+        memcpy(buf, state->x.next, n);
+        state->x.have -= n;
+        state->x.next += n;
+        state->x.pos += n;
+        left -= n;
+        buf += n;
+    } while (left && eol == NULL);
+
+    /* return terminated string, or if nothing, end of file */
+    if (buf == str)
+        return NULL;
+    buf[0] = 0;
+    return str;
+}
+
+/* -- see zlib.h -- */
+/* Return state->direct: 1 if the stream is handled transparently, 0 if a
+   gzip stream is being processed.  Returns 0 for a NULL handle.  For a read
+   stream whose kind has not been determined yet, gz_look() is run first so
+   that the answer reflects the actual input. */
+int ZEXPORT gzdirect(file)
+    gzFile file;
+{
+    gz_statep gz = (gz_statep)file;
+    int undecided;
+
+    if (gz == NULL)
+        return 0;
+
+    /* if the state is not known, but we can find out, then do so (this is
+       mainly for right after a gzopen() or gzdopen()) */
+    undecided = gz->mode == GZ_READ && gz->how == LOOK && gz->x.have == 0;
+    if (undecided) {
+        (void)gz_look(gz);
+    }
+
+    return gz->direct;
+}
+
+/* -- see zlib.h -- */
+/* Close a stream that was opened for reading and release everything it
+   owns: the inflate state and both buffers (if they were ever allocated), any
+   error message, the path string, the descriptor and the state itself.
+   Returns Z_STREAM_ERROR if file is NULL or is not a read stream, Z_ERRNO if
+   close() fails, Z_BUF_ERROR if that error was pending on the stream, and
+   Z_OK otherwise. */
+int ZEXPORT gzclose_r(file)
+    gzFile file;
+{
+    gz_statep gz = (gz_statep)file;
+    int closed, result;
+
+    if (gz == NULL || gz->mode != GZ_READ)
+        return Z_STREAM_ERROR;
+
+    /* buffers and inflate state exist only once size is non-zero */
+    if (gz->size) {
+        inflateEnd(&(gz->strm));
+        free(gz->out);
+        free(gz->in);
+    }
+
+    /* remember a pending Z_BUF_ERROR before the error state is cleared */
+    result = Z_OK;
+    if (gz->err == Z_BUF_ERROR)
+        result = Z_BUF_ERROR;
+    gz_error(gz, Z_OK, NULL);
+
+    free(gz->path);
+    closed = close(gz->fd);
+    free(gz);
+
+    if (closed)
+        return Z_ERRNO;
+    return result;
+}
diff --git a/deps/SZ/zlib/gzwrite.c b/deps/SZ/zlib/gzwrite.c
new file mode 100644
index 0000000000000000000000000000000000000000..c7b5651d70b994e20222a734c620f68e11e0dc84
--- /dev/null
+++ b/deps/SZ/zlib/gzwrite.c
@@ -0,0 +1,665 @@
+/* gzwrite.c -- zlib functions for writing gzip files
+ * Copyright (C) 2004-2017 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "gzguts.h"
+
+/* Local functions */
+/* gz_init:  allocate the buffers (and deflate state when compressing) */
+local int gz_init OF((gz_statep));
+/* gz_comp:  compress or directly write pending input, with a flush value */
+local int gz_comp OF((gz_statep, int));
+/* gz_zero:  emit the given number of zero bytes (used for pending seeks) */
+local int gz_zero OF((gz_statep, z_off64_t));
+/* gz_write: common write path shared by the public write functions */
+local z_size_t gz_write OF((gz_statep, voidpc, z_size_t));
+
+/* Set up the state for writing a gzip file.  A non-zero state->size marks the
+   state as initialized.  Returns 0 on success, or -1 if memory could not be
+   allocated (the error is recorded with gz_error()). */
+local int gz_init(state)
+    gz_statep state;
+{
+    z_streamp strm = &(state->strm);
+
+    /* the input buffer is twice the requested size (needed by gzprintf) */
+    state->in = (unsigned char *)malloc(state->want << 1);
+    if (state->in == NULL) {
+        gz_error(state, Z_MEM_ERROR, "out of memory");
+        return -1;
+    }
+
+    /* direct (uncompressed) output needs nothing beyond the input buffer */
+    if (state->direct) {
+        state->size = state->want;
+        return 0;
+    }
+
+    /* compressing: an output buffer is required as well */
+    state->out = (unsigned char *)malloc(state->want);
+    if (state->out == NULL) {
+        free(state->in);
+        gz_error(state, Z_MEM_ERROR, "out of memory");
+        return -1;
+    }
+
+    /* set up deflate for gzip output using the default allocators */
+    strm->zalloc = Z_NULL;
+    strm->zfree = Z_NULL;
+    strm->opaque = Z_NULL;
+    if (deflateInit2(strm, state->level, Z_DEFLATED,
+                     MAX_WBITS + 16, DEF_MEM_LEVEL, state->strategy) != Z_OK) {
+        free(state->out);
+        free(state->in);
+        gz_error(state, Z_MEM_ERROR, "out of memory");
+        return -1;
+    }
+    strm->next_in = NULL;
+
+    /* mark the state as initialized and point deflate at the empty output
+       buffer; x.next starts at the same place as next_out */
+    state->size = state->want;
+    strm->avail_out = state->size;
+    strm->next_out = state->out;
+    state->x.next = strm->next_out;
+    return 0;
+}
+
+/* Compress whatever is at avail_in and next_in and write to the output file.
+   Return -1 if there is an error writing to the output file or if gz_init()
+   fails to allocate memory, otherwise 0.  flush is assumed to be a valid
+   deflate() flush value.  If flush is Z_FINISH, then the deflate() state is
+   reset to start a new gzip stream.  If gz->direct is true, then simply write
+   to the output file without compressing, and ignore flush. */
+local int gz_comp(state, flush)
+    gz_statep state;
+    int flush;
+{
+    int ret, writ;
+    /* max is the upper bound on the byte count of any single write() below */
+    unsigned have, put, max = ((unsigned)-1 >> 2) + 1;
+    z_streamp strm = &(state->strm);
+
+    /* allocate memory if this is the first time through */
+    if (state->size == 0 && gz_init(state) == -1)
+        return -1;
+
+    /* write directly if requested */
+    if (state->direct) {
+        /* keep writing until all input is consumed; write() may accept fewer
+           bytes than requested, so advance by what it actually took */
+        while (strm->avail_in) {
+            put = strm->avail_in > max ? max : strm->avail_in;
+            writ = write(state->fd, strm->next_in, put);
+            if (writ < 0) {
+                gz_error(state, Z_ERRNO, zstrerror());
+                return -1;
+            }
+            strm->avail_in -= (unsigned)writ;
+            strm->next_in += writ;
+        }
+        return 0;
+    }
+
+    /* run deflate() on provided input until it produces no more output */
+    ret = Z_OK;
+    do {
+        /* write out current buffer contents if full, or if flushing, but if
+           doing Z_FINISH then don't write until we get to Z_STREAM_END */
+        if (strm->avail_out == 0 || (flush != Z_NO_FLUSH &&
+            (flush != Z_FINISH || ret == Z_STREAM_END))) {
+            /* state->x.next is the first byte not yet written to the file;
+               strm->next_out is the end of the compressed data so far */
+            while (strm->next_out > state->x.next) {
+                put = strm->next_out - state->x.next > (int)max ? max :
+                      (unsigned)(strm->next_out - state->x.next);
+                writ = write(state->fd, state->x.next, put);
+                if (writ < 0) {
+                    gz_error(state, Z_ERRNO, zstrerror());
+                    return -1;
+                }
+                state->x.next += writ;
+            }
+            /* output buffer was full: start over at its beginning */
+            if (strm->avail_out == 0) {
+                strm->avail_out = state->size;
+                strm->next_out = state->out;
+                state->x.next = state->out;
+            }
+        }
+
+        /* compress */
+        /* have becomes the number of output bytes this deflate() call made */
+        have = strm->avail_out;
+        ret = deflate(strm, flush);
+        if (ret == Z_STREAM_ERROR) {
+            gz_error(state, Z_STREAM_ERROR,
+                      "internal error: deflate stream corrupt");
+            return -1;
+        }
+        have -= strm->avail_out;
+    } while (have);
+
+    /* if that completed a deflate stream, allow another to start */
+    if (flush == Z_FINISH)
+        deflateReset(strm);
+
+    /* all done, no errors */
+    return 0;
+}
+
+/* Feed len zero bytes through gz_comp().  Returns 0 on success, or -1 if
+   gz_comp() reports a write error or memory allocation failure. */
+local int gz_zero(state, len)
+    gz_statep state;
+    z_off64_t len;
+{
+    int zeroed;
+    unsigned chunk;
+    z_streamp strm = &(state->strm);
+
+    /* push out any input that is still pending */
+    if (strm->avail_in && gz_comp(state, Z_NO_FLUSH) == -1)
+        return -1;
+
+    /* emit the zeros in pieces of at most state->size bytes (len is
+       guaranteed > 0 by the callers) */
+    for (zeroed = 0; len != 0; len -= chunk) {
+        if (GT_OFF(state->size) || (z_off64_t)state->size > len)
+            chunk = (unsigned)len;
+        else
+            chunk = state->size;
+
+        /* the input buffer only has to be cleared for the first piece */
+        if (!zeroed) {
+            memset(state->in, 0, chunk);
+            zeroed = 1;
+        }
+
+        strm->avail_in = chunk;
+        strm->next_in = state->in;
+        state->x.pos += chunk;
+        if (gz_comp(state, Z_NO_FLUSH) == -1)
+            return -1;
+    }
+    return 0;
+}
+
+/* Write len bytes from buf to file.  Return the number of bytes written.  If
+   the returned value is less than len, then there was an error.  In this
+   implementation every failure path returns 0, and success returns len. */
+local z_size_t gz_write(state, buf, len)
+    gz_statep state;
+    voidpc buf;
+    z_size_t len;
+{
+    /* remember the requested length; len is counted down below */
+    z_size_t put = len;
+
+    /* if len is zero, avoid unnecessary operations */
+    if (len == 0)
+        return 0;
+
+    /* allocate memory if this is the first time through */
+    if (state->size == 0 && gz_init(state) == -1)
+        return 0;
+
+    /* check for seek request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_zero(state, state->skip) == -1)
+            return 0;
+    }
+
+    /* for small len, copy to input buffer, otherwise compress directly */
+    if (len < state->size) {
+        /* copy to input buffer, compress when full */
+        do {
+            unsigned have, copy;
+
+            /* an empty buffer is refilled from its start */
+            if (state->strm.avail_in == 0)
+                state->strm.next_in = state->in;
+            /* have = offset in state->in just past the pending input */
+            have = (unsigned)((state->strm.next_in + state->strm.avail_in) -
+                              state->in);
+            /* copy = room left within the first state->size bytes */
+            copy = state->size - have;
+            if (copy > len)
+                copy = len;
+            memcpy(state->in + have, buf, copy);
+            state->strm.avail_in += copy;
+            state->x.pos += copy;
+            buf = (const char *)buf + copy;
+            len -= copy;
+            /* only compress when data remains, i.e. the buffer filled up */
+            if (len && gz_comp(state, Z_NO_FLUSH) == -1)
+                return 0;
+        } while (len);
+    }
+    else {
+        /* consume whatever's left in the input buffer */
+        if (state->strm.avail_in && gz_comp(state, Z_NO_FLUSH) == -1)
+            return 0;
+
+        /* directly compress user buffer to file */
+        state->strm.next_in = (z_const Bytef *)buf;
+        do {
+            /* avail_in is unsigned, so feed at most (unsigned)-1 bytes per
+               gz_comp() call */
+            unsigned n = (unsigned)-1;
+            if (n > len)
+                n = len;
+            state->strm.avail_in = n;
+            state->x.pos += n;
+            if (gz_comp(state, Z_NO_FLUSH) == -1)
+                return 0;
+            len -= n;
+        } while (len);
+    }
+
+    /* input was all buffered or compressed */
+    return put;
+}
+
+/* -- see zlib.h -- */
+/* Write len bytes from buf to the file.  Returns the number of bytes written,
+   or 0 on a NULL handle, wrong mode, pending error, or a len that does not
+   fit in an int. */
+int ZEXPORT gzwrite(file, buf, len)
+    gzFile file;
+    voidpc buf;
+    unsigned len;
+{
+    gz_statep gz = (gz_statep)file;
+
+    /* need a handle that is open for writing with no error pending */
+    if (gz == NULL || gz->mode != GZ_WRITE || gz->err != Z_OK)
+        return 0;
+
+    /* the result is returned as an int, so refuse lengths that would not fit
+       in one (works around a flaw in the interface) */
+    if ((int)len < 0) {
+        gz_error(gz, Z_DATA_ERROR, "requested length does not fit in int");
+        return 0;
+    }
+
+    /* hand off to the common write path; the result fits in an int */
+    return (int)gz_write(gz, buf, len);
+}
+
+/* -- see zlib.h -- */
+/* fwrite()-style write of nitems items of size bytes each from buf.  Returns
+   the number of complete items written, or 0 on a NULL handle, wrong mode,
+   pending error, size_t overflow, or an empty request. */
+z_size_t ZEXPORT gzfwrite(buf, size, nitems, file)
+    voidpc buf;
+    z_size_t size;
+    z_size_t nitems;
+    gzFile file;
+{
+    z_size_t total;
+    gz_statep gz = (gz_statep)file;
+
+    /* need a handle that is open for writing with no error pending */
+    if (gz == NULL || gz->mode != GZ_WRITE || gz->err != Z_OK)
+        return 0;
+
+    /* total number of bytes to write -- fail if the product overflows */
+    total = nitems * size;
+    if (size != 0 && total / size != nitems) {
+        gz_error(gz, Z_STREAM_ERROR, "request does not fit in a size_t");
+        return 0;
+    }
+
+    /* nothing to do for an empty request (also avoids dividing by zero) */
+    if (total == 0)
+        return 0;
+
+    /* write the bytes from buf and convert the byte count to whole items */
+    return gz_write(gz, buf, total) / size;
+}
+
+/* -- see zlib.h -- */
+/* Write the single byte c to the file.  Returns the byte written (c & 0xff),
+   or -1 on a NULL handle, wrong mode, pending error, or write failure. */
+int ZEXPORT gzputc(file, c)
+    gzFile file;
+    int c;
+{
+    unsigned used;
+    unsigned char byte;
+    gz_statep gz;
+    z_streamp strm;
+
+    /* need a handle */
+    if (file == NULL)
+        return -1;
+    gz = (gz_statep)file;
+    strm = &(gz->strm);
+
+    /* must be open for writing with no error pending */
+    if (gz->mode != GZ_WRITE || gz->err != Z_OK)
+        return -1;
+
+    /* carry out a pending seek by writing the skipped zeros */
+    if (gz->seek) {
+        gz->seek = 0;
+        if (gz_zero(gz, gz->skip) == -1)
+            return -1;
+    }
+
+    /* fast path: drop the byte straight into the input buffer when it has
+       been allocated (gz->size != 0) and still has room */
+    if (gz->size != 0) {
+        if (strm->avail_in == 0)
+            strm->next_in = gz->in;
+        used = (unsigned)((strm->next_in + strm->avail_in) - gz->in);
+        if (used < gz->size) {
+            gz->in[used] = (unsigned char)c;
+            strm->avail_in++;
+            gz->x.pos++;
+            return c & 0xff;
+        }
+    }
+
+    /* buffer full or not yet allocated: go through gz_write() */
+    byte = (unsigned char)c;
+    if (gz_write(gz, &byte, 1) != 1)
+        return -1;
+    return c & 0xff;
+}
+
+/* -- see zlib.h -- */
+/* Write the null-terminated string str (without the terminator) to the file.
+   Returns the number of characters written, or -1 on a NULL handle, wrong
+   mode, pending error, write failure, or when the string length cannot be
+   represented in the int return value. */
+int ZEXPORT gzputs(file, str)
+    gzFile file;
+    const char *str;
+{
+    z_size_t len, put;
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+
+    /* check that we're writing and that there's no error */
+    if (state->mode != GZ_WRITE || state->err != Z_OK)
+        return -1;
+
+    /* the count is returned as an int, so refuse strings whose length would
+       be truncated or turn negative when converted */
+    len = strlen(str);
+    if ((int)len < 0 || (unsigned)len != len) {
+        gz_error(state, Z_STREAM_ERROR, "string length does not fit in int");
+        return -1;
+    }
+
+    /* write string -- keep the count in z_size_t so it is not truncated
+       before being compared with len */
+    put = gz_write(state, str, len);
+    return put < len ? -1 : (int)len;
+}
+
+#if defined(STDC) || defined(Z_HAVE_STDARG_H)
+#include <stdarg.h>
+
+/* -- see zlib.h -- */
+/* Format into the input buffer using a va_list and queue the result for
+   compression.  Returns the number of bytes formatted, 0 if nothing was
+   formatted or the output did not fit in state->size bytes, Z_STREAM_ERROR
+   for a NULL handle / wrong mode / pending error, or state->err when an
+   internal step fails. */
+int ZEXPORTVA gzvprintf(gzFile file, const char *format, va_list va)
+{
+    int len;
+    unsigned left;
+    char *next;
+    gz_statep state;
+    z_streamp strm;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_statep)file;
+    strm = &(state->strm);
+
+    /* check that we're writing and that there's no error */
+    if (state->mode != GZ_WRITE || state->err != Z_OK)
+        return Z_STREAM_ERROR;
+
+    /* make sure we have some buffer space */
+    if (state->size == 0 && gz_init(state) == -1)
+        return state->err;
+
+    /* check for seek request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_zero(state, state->skip) == -1)
+            return state->err;
+    }
+
+    /* do the printf() into the input buffer, put length in len -- the input
+       buffer is double-sized just for this function, so there is guaranteed to
+       be state->size bytes available after the current contents */
+    if (strm->avail_in == 0)
+        strm->next_in = state->in;
+    next = (char *)(state->in + (strm->next_in - state->in) + strm->avail_in);
+    /* sentinel: cleared here and re-checked after formatting to detect output
+       that reached the last byte of the available space */
+    next[state->size - 1] = 0;
+#ifdef NO_vsnprintf
+#  ifdef HAS_vsprintf_void
+    /* vsprintf() result unusable here: find the length by scanning */
+    (void)vsprintf(next, format, va);
+    for (len = 0; len < state->size; len++)
+        if (next[len] == 0) break;
+#  else
+    len = vsprintf(next, format, va);
+#  endif
+#else
+#  ifdef HAS_vsnprintf_void
+    /* vsnprintf() result unusable here: measure the formatted string */
+    (void)vsnprintf(next, state->size, format, va);
+    len = strlen(next);
+#  else
+    len = vsnprintf(next, state->size, format, va);
+#  endif
+#endif
+
+    /* check that printf() results fit in buffer */
+    if (len == 0 || (unsigned)len >= state->size || next[state->size - 1] != 0)
+        return 0;
+
+    /* update buffer and position, compress first half if past that */
+    strm->avail_in += (unsigned)len;
+    state->x.pos += len;
+    if (strm->avail_in >= state->size) {
+        /* left = bytes beyond the first state->size; they are moved to the
+           front of the buffer once the first part has been compressed */
+        left = strm->avail_in - state->size;
+        strm->avail_in = state->size;
+        if (gz_comp(state, Z_NO_FLUSH) == -1)
+            return state->err;
+        memcpy(state->in, state->in + state->size, left);
+        strm->next_in = state->in;
+        strm->avail_in = left;
+    }
+    return len;
+}
+
+/* Variadic front end: collect the arguments into a va_list and let
+   gzvprintf() do the work, returning whatever it returns. */
+int ZEXPORTVA gzprintf(gzFile file, const char *format, ...)
+{
+    int written;
+    va_list args;
+
+    va_start(args, format);
+    written = gzvprintf(file, format, args);
+    va_end(args);
+    return written;
+}
+
+#else /* !STDC && !Z_HAVE_STDARG_H */
+
+/* -- see zlib.h -- */
+/* gzprintf() for builds without <stdarg.h>: accepts up to 20 int-sized
+   arguments and forwards them to sprintf()/snprintf().  Returns the number of
+   bytes formatted, 0 if nothing was formatted or the output did not fit in
+   state->size bytes, Z_STREAM_ERROR for a NULL handle / wrong mode / pending
+   error or when pointers cannot be passed as ints, or state->err when an
+   internal step fails. */
+int ZEXPORTVA gzprintf (file, format, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10,
+                       a11, a12, a13, a14, a15, a16, a17, a18, a19, a20)
+    gzFile file;
+    const char *format;
+    int a1, a2, a3, a4, a5, a6, a7, a8, a9, a10,
+        a11, a12, a13, a14, a15, a16, a17, a18, a19, a20;
+{
+    unsigned len, left;
+    char *next;
+    gz_statep state;
+    z_streamp strm;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_statep)file;
+    strm = &(state->strm);
+
+    /* check that can really pass pointer in ints */
+    if (sizeof(int) != sizeof(void *))
+        return Z_STREAM_ERROR;
+
+    /* check that we're writing and that there's no error */
+    if (state->mode != GZ_WRITE || state->err != Z_OK)
+        return Z_STREAM_ERROR;
+
+    /* make sure we have some buffer space (the error code lives in
+       state->err, the same field every other function in this file uses) */
+    if (state->size == 0 && gz_init(state) == -1)
+        return state->err;
+
+    /* check for seek request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_zero(state, state->skip) == -1)
+            return state->err;
+    }
+
+    /* do the printf() into the input buffer, put length in len -- the input
+       buffer is double-sized just for this function, so there is guaranteed to
+       be state->size bytes available after the current contents */
+    if (strm->avail_in == 0)
+        strm->next_in = state->in;
+    next = (char *)(strm->next_in + strm->avail_in);
+    /* sentinel: cleared here and re-checked after formatting to detect output
+       that reached the last byte of the available space */
+    next[state->size - 1] = 0;
+#ifdef NO_snprintf
+#  ifdef HAS_sprintf_void
+    sprintf(next, format, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
+            a13, a14, a15, a16, a17, a18, a19, a20);
+    /* sprintf() result unusable here: find the length by scanning, bounded by
+       the available space state->size */
+    for (len = 0; len < state->size; len++)
+        if (next[len] == 0)
+            break;
+#  else
+    len = sprintf(next, format, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11,
+                  a12, a13, a14, a15, a16, a17, a18, a19, a20);
+#  endif
+#else
+#  ifdef HAS_snprintf_void
+    snprintf(next, state->size, format, a1, a2, a3, a4, a5, a6, a7, a8, a9,
+             a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20);
+    len = strlen(next);
+#  else
+    len = snprintf(next, state->size, format, a1, a2, a3, a4, a5, a6, a7, a8,
+                   a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20);
+#  endif
+#endif
+
+    /* check that printf() results fit in buffer */
+    if (len == 0 || len >= state->size || next[state->size - 1] != 0)
+        return 0;
+
+    /* update buffer and position, compress first half if past that */
+    strm->avail_in += len;
+    state->x.pos += len;
+    if (strm->avail_in >= state->size) {
+        /* left = bytes beyond the first state->size; they are moved to the
+           front of the buffer once the first part has been compressed */
+        left = strm->avail_in - state->size;
+        strm->avail_in = state->size;
+        if (gz_comp(state, Z_NO_FLUSH) == -1)
+            return state->err;
+        memcpy(state->in, state->in + state->size, left);
+        strm->next_in = state->in;
+        strm->avail_in = left;
+    }
+    return (int)len;
+}
+
+#endif
+
+/* -- see zlib.h -- */
+/* Compress all pending input using the given flush value.  Returns
+   Z_STREAM_ERROR for a NULL handle, wrong mode, pending error, or a flush
+   value outside 0..Z_FINISH; otherwise returns state->err as left by the
+   operation. */
+int ZEXPORT gzflush(file, flush)
+    gzFile file;
+    int flush;
+{
+    gz_statep gz = (gz_statep)file;
+
+    /* need a handle that is open for writing with no error pending, and a
+       flush value within the accepted range */
+    if (gz == NULL || gz->mode != GZ_WRITE || gz->err != Z_OK ||
+        flush < 0 || flush > Z_FINISH)
+        return Z_STREAM_ERROR;
+
+    /* carry out a pending seek by writing the skipped zeros */
+    if (gz->seek) {
+        gz->seek = 0;
+        if (gz_zero(gz, gz->skip) == -1)
+            return gz->err;
+    }
+
+    /* compress what is left; any failure is reported through gz->err */
+    (void)gz_comp(gz, flush);
+    return gz->err;
+}
+
+/* -- see zlib.h -- */
+/* Change the compression level and strategy used for subsequent writes.
+   Returns Z_OK on success (including when nothing changes), Z_STREAM_ERROR
+   for a NULL handle / wrong mode / pending error, or state->err if flushing
+   pending data fails. */
+int ZEXPORT gzsetparams(file, level, strategy)
+    gzFile file;
+    int level;
+    int strategy;
+{
+    gz_statep state;
+    z_streamp strm;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_statep)file;
+    strm = &(state->strm);
+
+    /* check that we're writing and that there's no error */
+    if (state->mode != GZ_WRITE || state->err != Z_OK)
+        return Z_STREAM_ERROR;
+
+    /* if no change is requested, then do nothing */
+    if (level == state->level && strategy == state->strategy)
+        return Z_OK;
+
+    /* check for seek request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_zero(state, state->skip) == -1)
+            return state->err;
+    }
+
+    /* change compression parameters for subsequent input */
+    if (state->size) {
+        /* flush previous input with previous parameters before changing */
+        if (strm->avail_in && gz_comp(state, Z_BLOCK) == -1)
+            return state->err;
+        /* gz_init() only sets up deflate when not writing directly, so there
+           is no deflate state to update in direct mode */
+        if (!state->direct)
+            deflateParams(strm, level, strategy);
+    }
+    state->level = level;
+    state->strategy = strategy;
+    return Z_OK;
+}
+
+/* -- see zlib.h -- */
+/* Finish and close a file opened for writing, releasing everything attached
+   to it.  Returns Z_STREAM_ERROR for a NULL handle or a handle not in write
+   mode, Z_ERRNO if close() fails, the recorded error if a pending seek or the
+   final flush fails, and Z_OK otherwise. */
+int ZEXPORT gzclose_w(file)
+    gzFile file;
+{
+    int status = Z_OK;
+    gz_statep gz = (gz_statep)file;
+
+    /* need a valid handle that is in write mode */
+    if (gz == NULL || gz->mode != GZ_WRITE)
+        return Z_STREAM_ERROR;
+
+    /* carry out a pending seek by writing the skipped zeros */
+    if (gz->seek) {
+        gz->seek = 0;
+        if (gz_zero(gz, gz->skip) == -1)
+            status = gz->err;
+    }
+
+    /* finish the stream; cleanup continues even if this fails */
+    if (gz_comp(gz, Z_FINISH) == -1)
+        status = gz->err;
+
+    /* buffers exist only once size has been set; the output buffer and
+       deflate state exist only when compressing */
+    if (gz->size) {
+        if (!gz->direct) {
+            (void)deflateEnd(&(gz->strm));
+            free(gz->out);
+        }
+        free(gz->in);
+    }
+
+    /* clear the error state, then release the path, descriptor, and state */
+    gz_error(gz, Z_OK, NULL);
+    free(gz->path);
+    if (close(gz->fd) == -1)
+        status = Z_ERRNO;
+    free(gz);
+    return status;
+}
diff --git a/deps/SZ/zlib/infback.c b/deps/SZ/zlib/infback.c
new file mode 100644
index 0000000000000000000000000000000000000000..59679ecbfc5d778ca85d9ced87565f69bcb4635c
--- /dev/null
+++ b/deps/SZ/zlib/infback.c
@@ -0,0 +1,640 @@
+/* infback.c -- inflate using a call-back interface
+ * Copyright (C) 1995-2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/*
+   This code is largely copied from inflate.c.  Normally either infback.o or
+   inflate.o would be linked into an application--not both.  The interface
+   with inffast.c is retained so that optimized assembler-coded versions of
+   inflate_fast() can be used with either inflate.c or infback.c.
+ */
+
+#include "zutil.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "inffast.h"
+
+/* function prototypes */
+/* fixedtables: point the state's length/distance decoding tables at the
+   fixed-code tables */
+local void fixedtables OF((struct inflate_state FAR *state));
+
+/*
+   Prepare strm for use with inflateBack().
+
+   strm supplies the allocation routines in zalloc and zfree; when either is
+   Z_NULL the library's own routines are installed (unless built with Z_SOLO,
+   in which case Z_STREAM_ERROR is returned).
+
+   windowBits must be in 8..15, and window is a caller-provided window and
+   output buffer of 2**windowBits bytes.
+
+   Returns Z_OK, Z_VERSION_ERROR on a version or z_stream size mismatch,
+   Z_STREAM_ERROR on bad arguments, or Z_MEM_ERROR if the state could not be
+   allocated.
+ */
+int ZEXPORT inflateBackInit_(strm, windowBits, window, version, stream_size)
+z_streamp strm;
+int windowBits;
+unsigned char FAR *window;
+const char *version;
+int stream_size;
+{
+    struct inflate_state FAR *st;
+
+    /* caller and library must agree on major version and z_stream layout */
+    if (version == Z_NULL || version[0] != ZLIB_VERSION[0])
+        return Z_VERSION_ERROR;
+    if (stream_size != (int)(sizeof(z_stream)))
+        return Z_VERSION_ERROR;
+
+    /* validate the stream, the window, and the window size */
+    if (strm == Z_NULL || window == Z_NULL)
+        return Z_STREAM_ERROR;
+    if (windowBits < 8 || windowBits > 15)
+        return Z_STREAM_ERROR;
+
+    strm->msg = Z_NULL;                 /* in case we return an error */
+
+    /* fall back to the library allocator when none was supplied */
+    if (strm->zalloc == (alloc_func)0) {
+#ifdef Z_SOLO
+        return Z_STREAM_ERROR;
+#else
+        strm->zalloc = zcalloc;
+        strm->opaque = (voidpf)0;
+#endif
+    }
+    if (strm->zfree == (free_func)0) {
+#ifdef Z_SOLO
+        return Z_STREAM_ERROR;
+#else
+        strm->zfree = zcfree;
+#endif
+    }
+
+    /* allocate the inflate state and attach it to the stream */
+    st = (struct inflate_state FAR *)ZALLOC(strm, 1,
+                                            sizeof(struct inflate_state));
+    if (st == Z_NULL)
+        return Z_MEM_ERROR;
+    Tracev((stderr, "inflate: allocated\n"));
+    strm->state = (struct internal_state FAR *)st;
+
+    /* record the caller's window; it starts out empty */
+    st->dmax = 32768U;
+    st->wbits = (uInt)windowBits;
+    st->wsize = 1U << windowBits;
+    st->window = window;
+    st->wnext = 0;
+    st->whave = 0;
+    return Z_OK;
+}
+
+/*
+   Set the state's length and distance decoding tables, and their index bit
+   counts, for fixed-code decoding.  Normally the tables come from inffixed.h.
+   When BUILDFIXED is defined, the tables are instead built on the first call
+   and reused on every later call.  That saves about 2K bytes of code in
+   exchange for a little execution time.  BUILDFIXED should not be used for
+   threaded applications, since writing the tables and the first-call flag
+   may not be thread-safe.
+ */
+local void fixedtables(state)
+struct inflate_state FAR *state;
+{
+#ifdef BUILDFIXED
+    static int first_call = 1;
+    static code *lenfix, *distfix;
+    static code fixed[544];
+
+    /* build fixed huffman tables if first call (may not be thread safe) */
+    if (first_call) {
+        unsigned sym, bits;
+        static code *next;
+
+        /* literal/length table: code lengths 8, 9, 7, 8 over four ranges */
+        for (sym = 0; sym < 144; sym++)
+            state->lens[sym] = 8;
+        for (; sym < 256; sym++)
+            state->lens[sym] = 9;
+        for (; sym < 280; sym++)
+            state->lens[sym] = 7;
+        for (; sym < 288; sym++)
+            state->lens[sym] = 8;
+        next = fixed;
+        lenfix = next;
+        bits = 9;
+        inflate_table(LENS, state->lens, 288, &(next), &(bits), state->work);
+
+        /* distance table: 32 symbols, all of length 5; it is placed where
+           inflate_table() left next after the literal/length table */
+        for (sym = 0; sym < 32; sym++)
+            state->lens[sym] = 5;
+        distfix = next;
+        bits = 5;
+        inflate_table(DISTS, state->lens, 32, &(next), &(bits), state->work);
+
+        /* do this just once */
+        first_call = 0;
+    }
+#else /* !BUILDFIXED */
+#   include "inffixed.h"
+#endif /* BUILDFIXED */
+    state->lencode = lenfix;
+    state->lenbits = 9;
+    state->distcode = distfix;
+    state->distbits = 5;
+}
+
+/* Macros for inflateBack(): */
+/* These macros expand in place and refer to names that must exist at the
+   point of use: the locals put, left, next, have, hold, bits and ret, the
+   pointers strm and state, the call-backs in/in_desc and out/out_desc, and
+   the label inf_leave (jumped to on a call-back failure). */
+
+/* Load returned state from inflate_fast() */
+/* (copies the stream/state fields into the local working variables) */
+#define LOAD() \
+    do { \
+        put = strm->next_out; \
+        left = strm->avail_out; \
+        next = strm->next_in; \
+        have = strm->avail_in; \
+        hold = state->hold; \
+        bits = state->bits; \
+    } while (0)
+
+/* Set state from registers for inflate_fast() */
+/* (the inverse of LOAD(): writes the local working variables back) */
+#define RESTORE() \
+    do { \
+        strm->next_out = put; \
+        strm->avail_out = left; \
+        strm->next_in = next; \
+        strm->avail_in = have; \
+        state->hold = hold; \
+        state->bits = bits; \
+    } while (0)
+
+/* Clear the input bit accumulator */
+#define INITBITS() \
+    do { \
+        hold = 0; \
+        bits = 0; \
+    } while (0)
+
+/* Assure that some input is available.  If input is requested, but denied,
+   then return a Z_BUF_ERROR from inflateBack(). */
+/* (on denial next is set to Z_NULL before jumping to inf_leave) */
+#define PULL() \
+    do { \
+        if (have == 0) { \
+            have = in(in_desc, &next); \
+            if (have == 0) { \
+                next = Z_NULL; \
+                ret = Z_BUF_ERROR; \
+                goto inf_leave; \
+            } \
+        } \
+    } while (0)
+
+/* Get a byte of input into the bit accumulator, or return from inflateBack()
+   with an error if there is no input available. */
+/* (the new byte is added above the bits already held) */
+#define PULLBYTE() \
+    do { \
+        PULL(); \
+        have--; \
+        hold += (unsigned long)(*next++) << bits; \
+        bits += 8; \
+    } while (0)
+
+/* Assure that there are at least n bits in the bit accumulator.  If there is
+   not enough available input to do that, then return from inflateBack() with
+   an error. */
+#define NEEDBITS(n) \
+    do { \
+        while (bits < (unsigned)(n)) \
+            PULLBYTE(); \
+    } while (0)
+
+/* Return the low n bits of the bit accumulator (n < 16) */
+#define BITS(n) \
+    ((unsigned)hold & ((1U << (n)) - 1))
+
+/* Remove n bits from the bit accumulator */
+#define DROPBITS(n) \
+    do { \
+        hold >>= (n); \
+        bits -= (unsigned)(n); \
+    } while (0)
+
+/* Remove zero to seven bits as needed to go to a byte boundary */
+#define BYTEBITS() \
+    do { \
+        hold >>= bits & 7; \
+        bits -= bits & 7; \
+    } while (0)
+
+/* Assure that some output space is available, by writing out the window
+   if it's full.  If the write fails, return from inflateBack() with a
+   Z_BUF_ERROR. */
+/* (put/left are reset to the whole window before out() is called) */
+#define ROOM() \
+    do { \
+        if (left == 0) { \
+            put = state->window; \
+            left = state->wsize; \
+            state->whave = left; \
+            if (out(out_desc, put, left)) { \
+                ret = Z_BUF_ERROR; \
+                goto inf_leave; \
+            } \
+        } \
+    } while (0)
+
+/*
+   strm provides the memory allocation functions and window buffer on input,
+   and provides information on the unused input on return.  For Z_DATA_ERROR
+   returns, strm will also provide an error message.
+
+   in() and out() are the call-back input and output functions.  When
+   inflateBack() needs more input, it calls in().  When inflateBack() has
+   filled the window with output, or when it completes with data in the
+   window, it calls out() to write out the data.  The application must not
+   change the provided input until in() is called again or inflateBack()
+   returns.  The application must not change the window/output buffer until
+   inflateBack() returns.
+
+   in() and out() are called with a descriptor parameter provided in the
+   inflateBack() call.  This parameter can be a structure that provides the
+   information required to do the read or write, as well as accumulated
+   information on the input and output such as totals and check values.
+
+   in() should return zero on failure.  out() should return non-zero on
+   failure.  If either in() or out() fails, then inflateBack() returns a
+   Z_BUF_ERROR.  strm->next_in can be checked for Z_NULL to see whether it
+   was in() or out() that caused the error.  Otherwise, inflateBack()
+   returns Z_STREAM_END on success, Z_DATA_ERROR for a deflate format
+   error, or Z_MEM_ERROR if it could not allocate memory for the state.
+   inflateBack() can also return Z_STREAM_ERROR if the input parameters
+   are not correct, i.e. strm is Z_NULL or the state was not initialized.
+ */
+int ZEXPORT inflateBack(strm, in, in_desc, out, out_desc)
+z_streamp strm;
+in_func in;
+void FAR *in_desc;
+out_func out;
+void FAR *out_desc;
+{
+    struct inflate_state FAR *state;
+    z_const unsigned char FAR *next;    /* next input */
+    unsigned char FAR *put;     /* next output */
+    unsigned have, left;        /* available input and output */
+    unsigned long hold;         /* bit buffer */
+    unsigned bits;              /* bits in bit buffer */
+    unsigned copy;              /* number of stored or match bytes to copy */
+    unsigned char FAR *from;    /* where to copy match bytes from */
+    code here;                  /* current decoding table entry */
+    code last;                  /* parent table entry */
+    unsigned len;               /* length to copy for repeats, bits to drop */
+    int ret;                    /* return code */
+    static const unsigned short order[19] = /* permutation of code lengths */
+        {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+    /* Check that the strm exists and that the state was initialized */
+    if (strm == Z_NULL || strm->state == Z_NULL)
+        return Z_STREAM_ERROR;
+    state = (struct inflate_state FAR *)strm->state;
+
+    /* Reset the state */
+    strm->msg = Z_NULL;
+    state->mode = TYPE;
+    state->last = 0;
+    state->whave = 0;
+    next = strm->next_in;
+    have = next != Z_NULL ? strm->avail_in : 0;
+    hold = 0;
+    bits = 0;
+    put = state->window;
+    left = state->wsize;
+
+    /* Inflate until end of block marked as last */
+    for (;;)
+        switch (state->mode) {
+        case TYPE:
+            /* determine and dispatch block type */
+            if (state->last) {
+                BYTEBITS();
+                state->mode = DONE;
+                break;
+            }
+            NEEDBITS(3);
+            state->last = BITS(1);
+            DROPBITS(1);
+            switch (BITS(2)) {
+            case 0:                             /* stored block */
+                Tracev((stderr, "inflate:     stored block%s\n",
+                        state->last ? " (last)" : ""));
+                state->mode = STORED;
+                break;
+            case 1:                             /* fixed block */
+                fixedtables(state);
+                Tracev((stderr, "inflate:     fixed codes block%s\n",
+                        state->last ? " (last)" : ""));
+                state->mode = LEN;              /* decode codes */
+                break;
+            case 2:                             /* dynamic block */
+                Tracev((stderr, "inflate:     dynamic codes block%s\n",
+                        state->last ? " (last)" : ""));
+                state->mode = TABLE;
+                break;
+            case 3:
+                strm->msg = (char *)"invalid block type";
+                state->mode = BAD;
+            }
+            DROPBITS(2);
+            break;
+
+        case STORED:
+            /* get and verify stored block length */
+            BYTEBITS();                         /* go to byte boundary */
+            NEEDBITS(32);
+            if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) {
+                strm->msg = (char *)"invalid stored block lengths";
+                state->mode = BAD;
+                break;
+            }
+            state->length = (unsigned)hold & 0xffff;
+            Tracev((stderr, "inflate:       stored length %u\n",
+                    state->length));
+            INITBITS();
+
+            /* copy stored block from input to output */
+            while (state->length != 0) {
+                copy = state->length;
+                PULL();
+                ROOM();
+                if (copy > have) copy = have;
+                if (copy > left) copy = left;
+                zmemcpy(put, next, copy);
+                have -= copy;
+                next += copy;
+                left -= copy;
+                put += copy;
+                state->length -= copy;
+            }
+            Tracev((stderr, "inflate:       stored end\n"));
+            state->mode = TYPE;
+            break;
+
+        case TABLE:
+            /* get dynamic table entries descriptor */
+            NEEDBITS(14);
+            state->nlen = BITS(5) + 257;
+            DROPBITS(5);
+            state->ndist = BITS(5) + 1;
+            DROPBITS(5);
+            state->ncode = BITS(4) + 4;
+            DROPBITS(4);
+#ifndef PKZIP_BUG_WORKAROUND
+            if (state->nlen > 286 || state->ndist > 30) {
+                strm->msg = (char *)"too many length or distance symbols";
+                state->mode = BAD;
+                break;
+            }
+#endif
+            Tracev((stderr, "inflate:       table sizes ok\n"));
+
+            /* get code length code lengths (not a typo) */
+            state->have = 0;
+            while (state->have < state->ncode) {
+                NEEDBITS(3);
+                state->lens[order[state->have++]] = (unsigned short)BITS(3);
+                DROPBITS(3);
+            }
+            while (state->have < 19)
+                state->lens[order[state->have++]] = 0;
+            state->next = state->codes;
+            state->lencode = (code const FAR *)(state->next);
+            state->lenbits = 7;
+            ret = inflate_table(CODES, state->lens, 19, &(state->next),
+                                &(state->lenbits), state->work);
+            if (ret) {
+                strm->msg = (char *)"invalid code lengths set";
+                state->mode = BAD;
+                break;
+            }
+            Tracev((stderr, "inflate:       code lengths ok\n"));
+
+            /* get length and distance code code lengths */
+            state->have = 0;
+            while (state->have < state->nlen + state->ndist) {
+                for (;;) {
+                    here = state->lencode[BITS(state->lenbits)];
+                    if ((unsigned)(here.bits) <= bits) break;
+                    PULLBYTE();
+                }
+                if (here.val < 16) {
+                    DROPBITS(here.bits);
+                    state->lens[state->have++] = here.val;
+                }
+                else {
+                    if (here.val == 16) {
+                        NEEDBITS(here.bits + 2);
+                        DROPBITS(here.bits);
+                        if (state->have == 0) {
+                            strm->msg = (char *)"invalid bit length repeat";
+                            state->mode = BAD;
+                            break;
+                        }
+                        len = (unsigned)(state->lens[state->have - 1]);
+                        copy = 3 + BITS(2);
+                        DROPBITS(2);
+                    }
+                    else if (here.val == 17) {
+                        NEEDBITS(here.bits + 3);
+                        DROPBITS(here.bits);
+                        len = 0;
+                        copy = 3 + BITS(3);
+                        DROPBITS(3);
+                    }
+                    else {
+                        NEEDBITS(here.bits + 7);
+                        DROPBITS(here.bits);
+                        len = 0;
+                        copy = 11 + BITS(7);
+                        DROPBITS(7);
+                    }
+                    if (state->have + copy > state->nlen + state->ndist) {
+                        strm->msg = (char *)"invalid bit length repeat";
+                        state->mode = BAD;
+                        break;
+                    }
+                    while (copy--)
+                        state->lens[state->have++] = (unsigned short)len;
+                }
+            }
+
+            /* handle error breaks in while */
+            if (state->mode == BAD) break;
+
+            /* check for end-of-block code (better have one) */
+            if (state->lens[256] == 0) {
+                strm->msg = (char *)"invalid code -- missing end-of-block";
+                state->mode = BAD;
+                break;
+            }
+
+            /* build code tables -- note: do not change the lenbits or distbits
+               values here (9 and 6) without reading the comments in inftrees.h
+               concerning the ENOUGH constants, which depend on those values */
+            state->next = state->codes;
+            state->lencode = (code const FAR *)(state->next);
+            state->lenbits = 9;
+            ret = inflate_table(LENS, state->lens, state->nlen, &(state->next),
+                                &(state->lenbits), state->work);
+            if (ret) {
+                strm->msg = (char *)"invalid literal/lengths set";
+                state->mode = BAD;
+                break;
+            }
+            state->distcode = (code const FAR *)(state->next);
+            state->distbits = 6;
+            ret = inflate_table(DISTS, state->lens + state->nlen, state->ndist,
+                            &(state->next), &(state->distbits), state->work);
+            if (ret) {
+                strm->msg = (char *)"invalid distances set";
+                state->mode = BAD;
+                break;
+            }
+            Tracev((stderr, "inflate:       codes ok\n"));
+            state->mode = LEN;
+
+        case LEN:
+            /* use inflate_fast() if we have enough input and output */
+            if (have >= 6 && left >= 258) {
+                RESTORE();
+                if (state->whave < state->wsize)
+                    state->whave = state->wsize - left;
+                inflate_fast(strm, state->wsize);
+                LOAD();
+                break;
+            }
+
+            /* get a literal, length, or end-of-block code */
+            for (;;) {
+                here = state->lencode[BITS(state->lenbits)];
+                if ((unsigned)(here.bits) <= bits) break;
+                PULLBYTE();
+            }
+            if (here.op && (here.op & 0xf0) == 0) {
+                last = here;
+                for (;;) {
+                    here = state->lencode[last.val +
+                            (BITS(last.bits + last.op) >> last.bits)];
+                    if ((unsigned)(last.bits + here.bits) <= bits) break;
+                    PULLBYTE();
+                }
+                DROPBITS(last.bits);
+            }
+            DROPBITS(here.bits);
+            state->length = (unsigned)here.val;
+
+            /* process literal */
+            if (here.op == 0) {
+                Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ?
+                        "inflate:         literal '%c'\n" :
+                        "inflate:         literal 0x%02x\n", here.val));
+                ROOM();
+                *put++ = (unsigned char)(state->length);
+                left--;
+                state->mode = LEN;
+                break;
+            }
+
+            /* process end of block */
+            if (here.op & 32) {
+                Tracevv((stderr, "inflate:         end of block\n"));
+                state->mode = TYPE;
+                break;
+            }
+
+            /* invalid code */
+            if (here.op & 64) {
+                strm->msg = (char *)"invalid literal/length code";
+                state->mode = BAD;
+                break;
+            }
+
+            /* length code -- get extra bits, if any */
+            state->extra = (unsigned)(here.op) & 15;
+            if (state->extra != 0) {
+                NEEDBITS(state->extra);
+                state->length += BITS(state->extra);
+                DROPBITS(state->extra);
+            }
+            Tracevv((stderr, "inflate:         length %u\n", state->length));
+
+            /* get distance code */
+            for (;;) {
+                here = state->distcode[BITS(state->distbits)];
+                if ((unsigned)(here.bits) <= bits) break;
+                PULLBYTE();
+            }
+            if ((here.op & 0xf0) == 0) {
+                last = here;
+                for (;;) {
+                    here = state->distcode[last.val +
+                            (BITS(last.bits + last.op) >> last.bits)];
+                    if ((unsigned)(last.bits + here.bits) <= bits) break;
+                    PULLBYTE();
+                }
+                DROPBITS(last.bits);
+            }
+            DROPBITS(here.bits);
+            if (here.op & 64) {
+                strm->msg = (char *)"invalid distance code";
+                state->mode = BAD;
+                break;
+            }
+            state->offset = (unsigned)here.val;
+
+            /* get distance extra bits, if any */
+            state->extra = (unsigned)(here.op) & 15;
+            if (state->extra != 0) {
+                NEEDBITS(state->extra);
+                state->offset += BITS(state->extra);
+                DROPBITS(state->extra);
+            }
+            if (state->offset > state->wsize - (state->whave < state->wsize ?
+                                                left : 0)) {
+                strm->msg = (char *)"invalid distance too far back";
+                state->mode = BAD;
+                break;
+            }
+            Tracevv((stderr, "inflate:         distance %u\n", state->offset));
+
+            /* copy match from window to output */
+            do {
+                ROOM();
+                copy = state->wsize - state->offset;
+                if (copy < left) {
+                    from = put + copy;
+                    copy = left - copy;
+                }
+                else {
+                    from = put - state->offset;
+                    copy = left;
+                }
+                if (copy > state->length) copy = state->length;
+                state->length -= copy;
+                left -= copy;
+                do {
+                    *put++ = *from++;
+                } while (--copy);
+            } while (state->length != 0);
+            break;
+
+        case DONE:
+            /* inflate stream terminated properly -- write leftover output */
+            ret = Z_STREAM_END;
+            if (left < state->wsize) {
+                if (out(out_desc, state->window, state->wsize - left))
+                    ret = Z_BUF_ERROR;
+            }
+            goto inf_leave;
+
+        case BAD:
+            ret = Z_DATA_ERROR;
+            goto inf_leave;
+
+        default:                /* can't happen, but makes compilers happy */
+            ret = Z_STREAM_ERROR;
+            goto inf_leave;
+        }
+
+    /* Return unused input */
+  inf_leave:
+    strm->next_in = next;
+    strm->avail_in = have;
+    return ret;
+}
+
+int ZEXPORT inflateBackEnd(strm)
+z_streamp strm;
+{   /* release strm->state; Z_STREAM_ERROR if there is nothing to free */
+    if (!strm || !strm->state || !strm->zfree)  /* stream, state, or zfree unset */
+        return Z_STREAM_ERROR;
+    ZFREE(strm, strm->state);
+    strm->state = Z_NULL;           /* a repeated call now fails the check */
+    Tracev((stderr, "inflate: end\n"));
+    return Z_OK;
+}
diff --git a/deps/SZ/zlib/inffast.c b/deps/SZ/zlib/inffast.c
new file mode 100644
index 0000000000000000000000000000000000000000..0dbd1dbc09f2f69425405863bfe1080e3ca2b3f5
--- /dev/null
+++ b/deps/SZ/zlib/inffast.c
@@ -0,0 +1,323 @@
+/* inffast.c -- fast decoding
+ * Copyright (C) 1995-2017 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zutil.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "inffast.h"
+
+#ifdef ASMINF
+#  pragma message("Assembler code may have bugs -- use at your own risk")
+#else
+
+/*
+   Decode literal, length, and distance codes and write out the resulting
+   literal and match bytes until either not enough input or output is
+   available, an end-of-block is encountered, or a data error is encountered.
+   When large enough input and output buffers are supplied to inflate(), for
+   example, a 16K input buffer and a 64K output buffer, more than 95% of the
+   inflate execution time is spent in this routine.
+
+   Entry assumptions:
+
+        state->mode == LEN
+        strm->avail_in >= 6
+        strm->avail_out >= 258
+        start >= strm->avail_out
+        state->bits < 8
+
+   On return, state->mode is one of:
+
+        LEN -- ran out of enough output space or enough available input
+        TYPE -- reached end of block code, inflate() to interpret next block
+        BAD -- error in block data
+
+   Notes:
+
+    - The maximum input bits used by a length/distance pair is 15 bits for the
+      length code, 5 bits for the length extra, 15 bits for the distance code,
+      and 13 bits for the distance extra.  This totals 48 bits, or six bytes.
+      Therefore if strm->avail_in >= 6, then there is enough input to avoid
+      checking for available input while decoding.
+
+    - The maximum bytes that a single length/distance pair can output is 258
+      bytes, which is the maximum length that can be coded.  inflate_fast()
+      requires strm->avail_out >= 258 for each loop to avoid checking for
+      output space.
+ */
+void ZLIB_INTERNAL inflate_fast(strm, start)
+z_streamp strm;         /* stream whose in/out fields are updated on return */
+unsigned start;         /* inflate()'s starting value for strm->avail_out */
+{
+    struct inflate_state FAR *state;
+    z_const unsigned char FAR *in;      /* local strm->next_in */
+    z_const unsigned char FAR *last;    /* have enough input while in < last */
+    unsigned char FAR *out;     /* local strm->next_out */
+    unsigned char FAR *beg;     /* inflate()'s initial strm->next_out */
+    unsigned char FAR *end;     /* while out < end, enough space available */
+#ifdef INFLATE_STRICT
+    unsigned dmax;              /* maximum distance from zlib header */
+#endif
+    unsigned wsize;             /* window size or zero if not using window */
+    unsigned whave;             /* valid bytes in the window */
+    unsigned wnext;             /* window write index */
+    unsigned char FAR *window;  /* allocated sliding window, if wsize != 0 */
+    unsigned long hold;         /* local state->hold (bit accumulator) */
+    unsigned bits;              /* local state->bits (bit count in hold) */
+    code const FAR *lcode;      /* local state->lencode */
+    code const FAR *dcode;      /* local state->distcode */
+    unsigned lmask;             /* mask for first level of length codes */
+    unsigned dmask;             /* mask for first level of distance codes */
+    code here;                  /* retrieved table entry */
+    unsigned op;                /* code bits, operation, extra bits, or */
+                                /*  window position, window bytes to copy */
+    unsigned len;               /* match length, unused bytes */
+    unsigned dist;              /* match distance */
+    unsigned char FAR *from;    /* where to copy match from */
+
+    /* copy state to local variables */
+    state = (struct inflate_state FAR *)strm->state;
+    in = strm->next_in;
+    last = in + (strm->avail_in - 5);       /* in < last: >= 6 bytes left */
+    out = strm->next_out;
+    beg = out - (start - strm->avail_out);  /* back up over bytes written */
+    end = out + (strm->avail_out - 257);    /* out < end: >= 258 free */
+#ifdef INFLATE_STRICT
+    dmax = state->dmax;
+#endif
+    wsize = state->wsize;
+    whave = state->whave;
+    wnext = state->wnext;
+    window = state->window;
+    hold = state->hold;
+    bits = state->bits;
+    lcode = state->lencode;
+    dcode = state->distcode;
+    lmask = (1U << state->lenbits) - 1;
+    dmask = (1U << state->distbits) - 1;
+
+    /* decode literals and length/distances until end-of-block or not enough
+       input data or output space */
+    do {
+        if (bits < 15) {                /* refill so a 15-bit code fits */
+            hold += (unsigned long)(*in++) << bits;
+            bits += 8;
+            hold += (unsigned long)(*in++) << bits;
+            bits += 8;
+        }
+        here = lcode[hold & lmask];
+      dolen:
+        op = (unsigned)(here.bits);
+        hold >>= op;                    /* drop the bits of this code */
+        bits -= op;
+        op = (unsigned)(here.op);
+        if (op == 0) {                          /* literal */
+            Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ?
+                    "inflate:         literal '%c'\n" :
+                    "inflate:         literal 0x%02x\n", here.val));
+            *out++ = (unsigned char)(here.val);
+        }
+        else if (op & 16) {                     /* length base */
+            len = (unsigned)(here.val);
+            op &= 15;                           /* number of extra bits */
+            if (op) {
+                if (bits < op) {        /* length extra <= 5 bits: one byte */
+                    hold += (unsigned long)(*in++) << bits;
+                    bits += 8;
+                }
+                len += (unsigned)hold & ((1U << op) - 1);
+                hold >>= op;
+                bits -= op;
+            }
+            Tracevv((stderr, "inflate:         length %u\n", len));
+            if (bits < 15) {            /* refill again for the distance code */
+                hold += (unsigned long)(*in++) << bits;
+                bits += 8;
+                hold += (unsigned long)(*in++) << bits;
+                bits += 8;
+            }
+            here = dcode[hold & dmask];
+          dodist:
+            op = (unsigned)(here.bits);
+            hold >>= op;
+            bits -= op;
+            op = (unsigned)(here.op);
+            if (op & 16) {                      /* distance base */
+                dist = (unsigned)(here.val);
+                op &= 15;                       /* number of extra bits */
+                if (bits < op) {        /* up to 13 extra bits: 1-2 bytes */
+                    hold += (unsigned long)(*in++) << bits;
+                    bits += 8;
+                    if (bits < op) {
+                        hold += (unsigned long)(*in++) << bits;
+                        bits += 8;
+                    }
+                }
+                dist += (unsigned)hold & ((1U << op) - 1);
+#ifdef INFLATE_STRICT
+                if (dist > dmax) {
+                    strm->msg = (char *)"invalid distance too far back";
+                    state->mode = BAD;
+                    break;
+                }
+#endif
+                hold >>= op;
+                bits -= op;
+                Tracevv((stderr, "inflate:         distance %u\n", dist));
+                op = (unsigned)(out - beg);     /* max distance in output */
+                if (dist > op) {                /* see if copy from window */
+                    op = dist - op;             /* distance back in window */
+                    if (op > whave) {
+                        if (state->sane) {
+                            strm->msg =
+                                (char *)"invalid distance too far back";
+                            state->mode = BAD;
+                            break;
+                        }
+#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
+                        if (len <= op - whave) {
+                            do {
+                                *out++ = 0;
+                            } while (--len);
+                            continue;
+                        }
+                        len -= op - whave;
+                        do {
+                            *out++ = 0;
+                        } while (--op > whave);
+                        if (op == 0) {
+                            from = out - dist;
+                            do {
+                                *out++ = *from++;
+                            } while (--len);
+                            continue;
+                        }
+#endif
+                    }
+                    from = window;
+                    if (wnext == 0) {           /* very common case */
+                        from += wsize - op;
+                        if (op < len) {         /* some from window */
+                            len -= op;
+                            do {
+                                *out++ = *from++;
+                            } while (--op);
+                            from = out - dist;  /* rest from output */
+                        }
+                    }
+                    else if (wnext < op) {      /* wrap around window */
+                        from += wsize + wnext - op;
+                        op -= wnext;
+                        if (op < len) {         /* some from end of window */
+                            len -= op;
+                            do {
+                                *out++ = *from++;
+                            } while (--op);
+                            from = window;
+                            if (wnext < len) {  /* some from start of window */
+                                op = wnext;
+                                len -= op;
+                                do {
+                                    *out++ = *from++;
+                                } while (--op);
+                                from = out - dist;      /* rest from output */
+                            }
+                        }
+                    }
+                    else {                      /* contiguous in window */
+                        from += wnext - op;
+                        if (op < len) {         /* some from window */
+                            len -= op;
+                            do {
+                                *out++ = *from++;
+                            } while (--op);
+                            from = out - dist;  /* rest from output */
+                        }
+                    }
+                    while (len > 2) {           /* remaining bytes, 3 per pass */
+                        *out++ = *from++;
+                        *out++ = *from++;
+                        *out++ = *from++;
+                        len -= 3;
+                    }
+                    if (len) {                  /* one or two bytes left over */
+                        *out++ = *from++;
+                        if (len > 1)
+                            *out++ = *from++;
+                    }
+                }
+                else {
+                    from = out - dist;          /* copy direct from output */
+                    do {                        /* minimum length is three */
+                        *out++ = *from++;
+                        *out++ = *from++;
+                        *out++ = *from++;
+                        len -= 3;
+                    } while (len > 2);
+                    if (len) {
+                        *out++ = *from++;
+                        if (len > 1)
+                            *out++ = *from++;
+                    }
+                }
+            }
+            else if ((op & 64) == 0) {          /* 2nd level distance code */
+                here = dcode[here.val + (hold & ((1U << op) - 1))];
+                goto dodist;
+            }
+            else {
+                strm->msg = (char *)"invalid distance code";
+                state->mode = BAD;
+                break;
+            }
+        }
+        else if ((op & 64) == 0) {              /* 2nd level length code */
+            here = lcode[here.val + (hold & ((1U << op) - 1))];
+            goto dolen;
+        }
+        else if (op & 32) {                     /* end-of-block */
+            Tracevv((stderr, "inflate:         end of block\n"));
+            state->mode = TYPE;
+            break;
+        }
+        else {
+            strm->msg = (char *)"invalid literal/length code";
+            state->mode = BAD;
+            break;
+        }
+    } while (in < last && out < end);   /* enough in and out for one more */
+
+    /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
+    len = bits >> 3;                    /* whole bytes still unread in hold */
+    in -= len;
+    bits -= len << 3;
+    hold &= (1U << bits) - 1;           /* keep only the leftover bits */
+
+    /* update state and return -- 5 and 257 undo the last/end margins */
+    strm->next_in = in;
+    strm->next_out = out;
+    strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
+    strm->avail_out = (unsigned)(out < end ?
+                                 257 + (end - out) : 257 - (out - end));
+    state->hold = hold;
+    state->bits = bits;
+    return;
+}
+
+/*
+   inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
+   - Using bit fields for code structure
+   - Different op definition to avoid & for extra bits (do & for table bits)
+   - Three separate decoding do-loops for direct, window, and wnext == 0
+   - Special case for distance > 1 copies to do overlapped load and store copy
+   - Explicit branch predictions (based on measured branch probabilities)
+   - Deferring match copy and interspersing it with decoding subsequent codes
+   - Swapping literal/length else
+   - Swapping window/direct else
+   - Larger unrolled copy loops (three is about right)
+   - Moving len -= 3 statement into middle of loop
+ */
+
+#endif /* !ASMINF */
diff --git a/deps/SZ/zlib/inffast.h b/deps/SZ/zlib/inffast.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5c1aa4ca8cd5244423680865609c71ab68f9ab6
--- /dev/null
+++ b/deps/SZ/zlib/inffast.h
@@ -0,0 +1,11 @@
+/* inffast.h -- header to use inffast.c
+ * Copyright (C) 1995-2003, 2010 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+void ZLIB_INTERNAL inflate_fast OF((z_streamp strm, unsigned start));
diff --git a/deps/SZ/zlib/inffixed.h b/deps/SZ/zlib/inffixed.h
new file mode 100644
index 0000000000000000000000000000000000000000..d6283277694802ce7938f537f12990d6eead4924
--- /dev/null
+++ b/deps/SZ/zlib/inffixed.h
@@ -0,0 +1,94 @@
+    /* inffixed.h -- table for decoding fixed codes
+     * Generated automatically by makefixed().
+     */
+
+    /* WARNING: this file should *not* be used by applications.
+       It is part of the implementation of this library and is
+       subject to change. Applications should only use zlib.h.
+     */
+
+    static const code lenfix[512] = {
+        {96,7,0},{0,8,80},{0,8,16},{20,8,115},{18,7,31},{0,8,112},{0,8,48},
+        {0,9,192},{16,7,10},{0,8,96},{0,8,32},{0,9,160},{0,8,0},{0,8,128},
+        {0,8,64},{0,9,224},{16,7,6},{0,8,88},{0,8,24},{0,9,144},{19,7,59},
+        {0,8,120},{0,8,56},{0,9,208},{17,7,17},{0,8,104},{0,8,40},{0,9,176},
+        {0,8,8},{0,8,136},{0,8,72},{0,9,240},{16,7,4},{0,8,84},{0,8,20},
+        {21,8,227},{19,7,43},{0,8,116},{0,8,52},{0,9,200},{17,7,13},{0,8,100},
+        {0,8,36},{0,9,168},{0,8,4},{0,8,132},{0,8,68},{0,9,232},{16,7,8},
+        {0,8,92},{0,8,28},{0,9,152},{20,7,83},{0,8,124},{0,8,60},{0,9,216},
+        {18,7,23},{0,8,108},{0,8,44},{0,9,184},{0,8,12},{0,8,140},{0,8,76},
+        {0,9,248},{16,7,3},{0,8,82},{0,8,18},{21,8,163},{19,7,35},{0,8,114},
+        {0,8,50},{0,9,196},{17,7,11},{0,8,98},{0,8,34},{0,9,164},{0,8,2},
+        {0,8,130},{0,8,66},{0,9,228},{16,7,7},{0,8,90},{0,8,26},{0,9,148},
+        {20,7,67},{0,8,122},{0,8,58},{0,9,212},{18,7,19},{0,8,106},{0,8,42},
+        {0,9,180},{0,8,10},{0,8,138},{0,8,74},{0,9,244},{16,7,5},{0,8,86},
+        {0,8,22},{64,8,0},{19,7,51},{0,8,118},{0,8,54},{0,9,204},{17,7,15},
+        {0,8,102},{0,8,38},{0,9,172},{0,8,6},{0,8,134},{0,8,70},{0,9,236},
+        {16,7,9},{0,8,94},{0,8,30},{0,9,156},{20,7,99},{0,8,126},{0,8,62},
+        {0,9,220},{18,7,27},{0,8,110},{0,8,46},{0,9,188},{0,8,14},{0,8,142},
+        {0,8,78},{0,9,252},{96,7,0},{0,8,81},{0,8,17},{21,8,131},{18,7,31},
+        {0,8,113},{0,8,49},{0,9,194},{16,7,10},{0,8,97},{0,8,33},{0,9,162},
+        {0,8,1},{0,8,129},{0,8,65},{0,9,226},{16,7,6},{0,8,89},{0,8,25},
+        {0,9,146},{19,7,59},{0,8,121},{0,8,57},{0,9,210},{17,7,17},{0,8,105},
+        {0,8,41},{0,9,178},{0,8,9},{0,8,137},{0,8,73},{0,9,242},{16,7,4},
+        {0,8,85},{0,8,21},{16,8,258},{19,7,43},{0,8,117},{0,8,53},{0,9,202},
+        {17,7,13},{0,8,101},{0,8,37},{0,9,170},{0,8,5},{0,8,133},{0,8,69},
+        {0,9,234},{16,7,8},{0,8,93},{0,8,29},{0,9,154},{20,7,83},{0,8,125},
+        {0,8,61},{0,9,218},{18,7,23},{0,8,109},{0,8,45},{0,9,186},{0,8,13},
+        {0,8,141},{0,8,77},{0,9,250},{16,7,3},{0,8,83},{0,8,19},{21,8,195},
+        {19,7,35},{0,8,115},{0,8,51},{0,9,198},{17,7,11},{0,8,99},{0,8,35},
+        {0,9,166},{0,8,3},{0,8,131},{0,8,67},{0,9,230},{16,7,7},{0,8,91},
+        {0,8,27},{0,9,150},{20,7,67},{0,8,123},{0,8,59},{0,9,214},{18,7,19},
+        {0,8,107},{0,8,43},{0,9,182},{0,8,11},{0,8,139},{0,8,75},{0,9,246},
+        {16,7,5},{0,8,87},{0,8,23},{64,8,0},{19,7,51},{0,8,119},{0,8,55},
+        {0,9,206},{17,7,15},{0,8,103},{0,8,39},{0,9,174},{0,8,7},{0,8,135},
+        {0,8,71},{0,9,238},{16,7,9},{0,8,95},{0,8,31},{0,9,158},{20,7,99},
+        {0,8,127},{0,8,63},{0,9,222},{18,7,27},{0,8,111},{0,8,47},{0,9,190},
+        {0,8,15},{0,8,143},{0,8,79},{0,9,254},{96,7,0},{0,8,80},{0,8,16},
+        {20,8,115},{18,7,31},{0,8,112},{0,8,48},{0,9,193},{16,7,10},{0,8,96},
+        {0,8,32},{0,9,161},{0,8,0},{0,8,128},{0,8,64},{0,9,225},{16,7,6},
+        {0,8,88},{0,8,24},{0,9,145},{19,7,59},{0,8,120},{0,8,56},{0,9,209},
+        {17,7,17},{0,8,104},{0,8,40},{0,9,177},{0,8,8},{0,8,136},{0,8,72},
+        {0,9,241},{16,7,4},{0,8,84},{0,8,20},{21,8,227},{19,7,43},{0,8,116},
+        {0,8,52},{0,9,201},{17,7,13},{0,8,100},{0,8,36},{0,9,169},{0,8,4},
+        {0,8,132},{0,8,68},{0,9,233},{16,7,8},{0,8,92},{0,8,28},{0,9,153},
+        {20,7,83},{0,8,124},{0,8,60},{0,9,217},{18,7,23},{0,8,108},{0,8,44},
+        {0,9,185},{0,8,12},{0,8,140},{0,8,76},{0,9,249},{16,7,3},{0,8,82},
+        {0,8,18},{21,8,163},{19,7,35},{0,8,114},{0,8,50},{0,9,197},{17,7,11},
+        {0,8,98},{0,8,34},{0,9,165},{0,8,2},{0,8,130},{0,8,66},{0,9,229},
+        {16,7,7},{0,8,90},{0,8,26},{0,9,149},{20,7,67},{0,8,122},{0,8,58},
+        {0,9,213},{18,7,19},{0,8,106},{0,8,42},{0,9,181},{0,8,10},{0,8,138},
+        {0,8,74},{0,9,245},{16,7,5},{0,8,86},{0,8,22},{64,8,0},{19,7,51},
+        {0,8,118},{0,8,54},{0,9,205},{17,7,15},{0,8,102},{0,8,38},{0,9,173},
+        {0,8,6},{0,8,134},{0,8,70},{0,9,237},{16,7,9},{0,8,94},{0,8,30},
+        {0,9,157},{20,7,99},{0,8,126},{0,8,62},{0,9,221},{18,7,27},{0,8,110},
+        {0,8,46},{0,9,189},{0,8,14},{0,8,142},{0,8,78},{0,9,253},{96,7,0},
+        {0,8,81},{0,8,17},{21,8,131},{18,7,31},{0,8,113},{0,8,49},{0,9,195},
+        {16,7,10},{0,8,97},{0,8,33},{0,9,163},{0,8,1},{0,8,129},{0,8,65},
+        {0,9,227},{16,7,6},{0,8,89},{0,8,25},{0,9,147},{19,7,59},{0,8,121},
+        {0,8,57},{0,9,211},{17,7,17},{0,8,105},{0,8,41},{0,9,179},{0,8,9},
+        {0,8,137},{0,8,73},{0,9,243},{16,7,4},{0,8,85},{0,8,21},{16,8,258},
+        {19,7,43},{0,8,117},{0,8,53},{0,9,203},{17,7,13},{0,8,101},{0,8,37},
+        {0,9,171},{0,8,5},{0,8,133},{0,8,69},{0,9,235},{16,7,8},{0,8,93},
+        {0,8,29},{0,9,155},{20,7,83},{0,8,125},{0,8,61},{0,9,219},{18,7,23},
+        {0,8,109},{0,8,45},{0,9,187},{0,8,13},{0,8,141},{0,8,77},{0,9,251},
+        {16,7,3},{0,8,83},{0,8,19},{21,8,195},{19,7,35},{0,8,115},{0,8,51},
+        {0,9,199},{17,7,11},{0,8,99},{0,8,35},{0,9,167},{0,8,3},{0,8,131},
+        {0,8,67},{0,9,231},{16,7,7},{0,8,91},{0,8,27},{0,9,151},{20,7,67},
+        {0,8,123},{0,8,59},{0,9,215},{18,7,19},{0,8,107},{0,8,43},{0,9,183},
+        {0,8,11},{0,8,139},{0,8,75},{0,9,247},{16,7,5},{0,8,87},{0,8,23},
+        {64,8,0},{19,7,51},{0,8,119},{0,8,55},{0,9,207},{17,7,15},{0,8,103},
+        {0,8,39},{0,9,175},{0,8,7},{0,8,135},{0,8,71},{0,9,239},{16,7,9},
+        {0,8,95},{0,8,31},{0,9,159},{20,7,99},{0,8,127},{0,8,63},{0,9,223},
+        {18,7,27},{0,8,111},{0,8,47},{0,9,191},{0,8,15},{0,8,143},{0,8,79},
+        {0,9,255}
+    };
+
+    static const code distfix[32] = {
+        {16,5,1},{23,5,257},{19,5,17},{27,5,4097},{17,5,5},{25,5,1025},
+        {21,5,65},{29,5,16385},{16,5,3},{24,5,513},{20,5,33},{28,5,8193},
+        {18,5,9},{26,5,2049},{22,5,129},{64,5,0},{16,5,2},{23,5,385},
+        {19,5,25},{27,5,6145},{17,5,7},{25,5,1537},{21,5,97},{29,5,24577},
+        {16,5,4},{24,5,769},{20,5,49},{28,5,12289},{18,5,13},{26,5,3073},
+        {22,5,193},{64,5,0}
+    };
diff --git a/deps/SZ/zlib/inflate.c b/deps/SZ/zlib/inflate.c
new file mode 100644
index 0000000000000000000000000000000000000000..ac333e8c2edae90ec1145d06d9852002dd5d0617
--- /dev/null
+++ b/deps/SZ/zlib/inflate.c
@@ -0,0 +1,1561 @@
+/* inflate.c -- zlib decompression
+ * Copyright (C) 1995-2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/*
+ * Change history:
+ *
+ * 1.2.beta0    24 Nov 2002
+ * - First version -- complete rewrite of inflate to simplify code, avoid
+ *   creation of window when not needed, minimize use of window when it is
+ *   needed, make inffast.c even faster, implement gzip decoding, and to
+ *   improve code readability and style over the previous zlib inflate code
+ *
+ * 1.2.beta1    25 Nov 2002
+ * - Use pointers for available input and output checking in inffast.c
+ * - Remove input and output counters in inffast.c
+ * - Change inffast.c entry and loop from avail_in >= 7 to >= 6
+ * - Remove unnecessary second byte pull from length extra in inffast.c
+ * - Unroll direct copy to three copies per loop in inffast.c
+ *
+ * 1.2.beta2    4 Dec 2002
+ * - Change external routine names to reduce potential conflicts
+ * - Correct filename to inffixed.h for fixed tables in inflate.c
+ * - Make hbuf[] unsigned char to match parameter type in inflate.c
+ * - Change strm->next_out[-state->offset] to *(strm->next_out - state->offset)
+ *   to avoid negation problem on Alphas (64 bit) in inflate.c
+ *
+ * 1.2.beta3    22 Dec 2002
+ * - Add comments on state->bits assertion in inffast.c
+ * - Add comments on op field in inftrees.h
+ * - Fix bug in reuse of allocated window after inflateReset()
+ * - Remove bit fields--back to byte structure for speed
+ * - Remove distance extra == 0 check in inflate_fast()--only helps for lengths
+ * - Change post-increments to pre-increments in inflate_fast(), PPC biased?
+ * - Add compile time option, POSTINC, to use post-increments instead (Intel?)
+ * - Make MATCH copy in inflate() much faster for when inflate_fast() not used
+ * - Use local copies of stream next and avail values, as well as local bit
+ *   buffer and bit count in inflate()--for speed when inflate_fast() not used
+ *
+ * 1.2.beta4    1 Jan 2003
+ * - Split ptr - 257 statements in inflate_table() to avoid compiler warnings
+ * - Move a comment on output buffer sizes from inffast.c to inflate.c
+ * - Add comments in inffast.c to introduce the inflate_fast() routine
+ * - Rearrange window copies in inflate_fast() for speed and simplification
+ * - Unroll last copy for window match in inflate_fast()
+ * - Use local copies of window variables in inflate_fast() for speed
+ * - Pull out common wnext == 0 case for speed in inflate_fast()
+ * - Make op and len in inflate_fast() unsigned for consistency
+ * - Add FAR to lcode and dcode declarations in inflate_fast()
+ * - Simplified bad distance check in inflate_fast()
+ * - Added inflateBackInit(), inflateBack(), and inflateBackEnd() in new
+ *   source file infback.c to provide a call-back interface to inflate for
+ *   programs like gzip and unzip -- uses window as output buffer to avoid
+ *   window copying
+ *
+ * 1.2.beta5    1 Jan 2003
+ * - Improved inflateBack() interface to allow the caller to provide initial
+ *   input in strm.
+ * - Fixed stored blocks bug in inflateBack()
+ *
+ * 1.2.beta6    4 Jan 2003
+ * - Added comments in inffast.c on effectiveness of POSTINC
+ * - Typecasting all around to reduce compiler warnings
+ * - Changed loops from while (1) or do {} while (1) to for (;;), again to
+ *   make compilers happy
+ * - Changed type of window in inflateBackInit() to unsigned char *
+ *
+ * 1.2.beta7    27 Jan 2003
+ * - Changed many types to unsigned or unsigned short to avoid warnings
+ * - Added inflateCopy() function
+ *
+ * 1.2.0        9 Mar 2003
+ * - Changed inflateBack() interface to provide separate opaque descriptors
+ *   for the in() and out() functions
+ * - Changed inflateBack() argument and in_func typedef to swap the length
+ *   and buffer address return values for the input function
+ * - Check next_in and next_out for Z_NULL on entry to inflate()
+ *
+ * The history for versions after 1.2.0 is in ChangeLog in the zlib distribution.
+ */
+
+#include "zutil.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "inffast.h"
+
+#ifdef MAKEFIXED
+#  ifndef BUILDFIXED
+#    define BUILDFIXED
+#  endif
+#endif
+
+/* local helper prototypes; makefixed() is declared only under BUILDFIXED */
+local int inflateStateCheck OF((z_streamp strm));
+local void fixedtables OF((struct inflate_state FAR *state));
+local int updatewindow OF((z_streamp strm, const unsigned char FAR *end,
+                           unsigned copy));
+#ifdef BUILDFIXED
+   void makefixed OF((void));
+#endif
+local unsigned syncsearch OF((unsigned FAR *have, const unsigned char FAR *buf,
+                              unsigned len));
+
+local int inflateStateCheck(strm)
+z_streamp strm;
+{
+    struct inflate_state FAR *st;
+
+    /* the stream itself must exist and carry both allocator hooks */
+    if (strm == Z_NULL || strm->zalloc == (alloc_func)0) return 1;
+    if (strm->zfree == (free_func)0) return 1;
+    /* its state must exist, point back at strm, and be in a known mode */
+    st = (struct inflate_state FAR *)strm->state;
+    if (st == Z_NULL || st->strm != strm) return 1;
+    return (st->mode < HEAD || st->mode > SYNC) ? 1 : 0;
+}
+
+int ZEXPORT inflateResetKeep(strm)
+z_streamp strm;
+{
+    struct inflate_state FAR *st;
+    if (inflateStateCheck(strm) != 0) return Z_STREAM_ERROR;
+    st = (struct inflate_state FAR *)strm->state;
+    /* restart decoding at the header with an empty bit accumulator */
+    st->mode = HEAD;
+    st->hold = 0;
+    st->bits = 0;
+    st->last = st->havedict = 0;
+    st->head = Z_NULL;
+    st->dmax = 32768U;
+    st->sane = 1;
+    st->back = -1;
+    st->next = st->codes;
+    st->lencode = st->distcode = st->codes;
+    /* clear the stream totals and message; set adler from the low wrap bit */
+    strm->total_in = strm->total_out = st->total = 0;
+    strm->msg = Z_NULL;
+    if (st->wrap != 0) strm->adler = st->wrap & 1;  /* for Java test suite */
+    Tracev((stderr, "inflate: reset\n"));
+    return Z_OK;
+}
+
+int ZEXPORT inflateReset(strm)
+z_streamp strm;
+{
+    struct inflate_state FAR *st;
+
+    if (inflateStateCheck(strm) != 0)
+        return Z_STREAM_ERROR;
+    /* zero the window bookkeeping, then reset the rest of the state */
+    st = (struct inflate_state FAR *)strm->state;
+    st->wsize = st->whave = st->wnext = 0;
+    return inflateResetKeep(strm);
+}
+
+int ZEXPORT inflateReset2(strm, windowBits)
+z_streamp strm;
+int windowBits;
+{
+    struct inflate_state FAR *st;
+    int wrap = 0;               /* stays 0 when windowBits is negative */
+
+    /* the stream must already carry a valid inflate state */
+    if (inflateStateCheck(strm) != 0)
+        return Z_STREAM_ERROR;
+    st = (struct inflate_state FAR *)strm->state;
+
+    /* split the wrap request off the windowBits parameter */
+    if (windowBits < 0)
+        windowBits = -windowBits;
+    else {
+        wrap = (windowBits >> 4) + 5;
+#ifdef GUNZIP
+        if (windowBits < 48)
+            windowBits &= 15;
+#endif
+    }
+
+    /* zero is accepted; any other value must lie in 8..15 */
+    if (windowBits != 0 && (windowBits < 8 || windowBits > 15))
+        return Z_STREAM_ERROR;
+    /* an existing window allocated for a different wbits is released */
+    if (st->window != Z_NULL && st->wbits != (unsigned)windowBits) {
+        ZFREE(strm, st->window);
+        st->window = Z_NULL;
+    }
+
+    /* record the new settings and reset everything else */
+    st->wbits = (unsigned)windowBits;
+    st->wrap = wrap;
+    return inflateReset(strm);
+}
+
+int ZEXPORT inflateInit2_(strm, windowBits, version, stream_size)
+z_streamp strm;
+int windowBits;
+const char *version;
+int stream_size;
+{
+    struct inflate_state FAR *st;
+    int status;
+
+    /* caller and library must agree on version[0] and sizeof(z_stream) */
+    if (version == Z_NULL || version[0] != ZLIB_VERSION[0])
+        return Z_VERSION_ERROR;
+    if (stream_size != (int)(sizeof(z_stream)))
+        return Z_VERSION_ERROR;
+    if (strm == Z_NULL) return Z_STREAM_ERROR;
+    strm->msg = Z_NULL;                 /* in case we return an error */
+    if (strm->zalloc == (alloc_func)0) {
+#ifdef Z_SOLO
+        return Z_STREAM_ERROR;
+#else
+        strm->zalloc = zcalloc;
+        strm->opaque = (voidpf)0;
+#endif
+    }
+    if (strm->zfree == (free_func)0)
+#ifdef Z_SOLO
+        return Z_STREAM_ERROR;
+#else
+        strm->zfree = zcfree;
+#endif
+    st = (struct inflate_state FAR *)ZALLOC(strm, 1, sizeof(*st));
+    if (st == Z_NULL) return Z_MEM_ERROR;
+    Tracev((stderr, "inflate: allocated\n"));
+    st->window = Z_NULL;
+    st->mode = HEAD;        /* to pass state test in inflateReset2() */
+    st->strm = strm;
+    strm->state = (struct internal_state FAR *)st;
+    status = inflateReset2(strm, windowBits);
+    if (status == Z_OK) return Z_OK;
+    ZFREE(strm, st);        /* reset failed: release the state again */
+    strm->state = Z_NULL;
+    return status;
+}
+
+int ZEXPORT inflateInit_(strm, version, stream_size)
+z_streamp strm;
+const char *version;
+int stream_size;
+{   /* forwards to inflateInit2_() with DEF_WBITS as windowBits */
+    return inflateInit2_(strm, DEF_WBITS, version, stream_size);
+}
+
+int ZEXPORT inflatePrime(strm, bits, value)
+z_streamp strm;
+int bits;
+int value;
+{
+    struct inflate_state FAR *state;
+    /* insert the low bits of value above the bits already buffered */
+    if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
+    if (bits == 0) return Z_OK;     /* nothing to add; avoids a shift by 32 */
+    state = (struct inflate_state FAR *)strm->state;
+    if (bits < 0) {
+        state->hold = state->bits = 0;      /* negative bits: empty the buffer */
+        return Z_OK;
+    }
+    if (bits > 16 || state->bits + (uInt)bits > 32) return Z_STREAM_ERROR;
+    value &= (1L << bits) - 1;
+    state->hold += (unsigned)value << state->bits;
+    state->bits += (uInt)bits;
+    return Z_OK;
+}
+
+/*
+   Return state with length and distance decoding tables and index sizes set to
+   fixed code decoding.  Normally this returns fixed tables from inffixed.h.
+   If BUILDFIXED is defined, then instead this routine builds the tables the
+   first time it's called, and returns those tables the first time and
+   thereafter.  This reduces the size of the code by about 2K bytes, in
+   exchange for a little execution time.  However, BUILDFIXED should not be
+   used for threaded applications, since the rewriting of the tables and virgin
+   may not be thread-safe.
+ */
+local void fixedtables(state)
+struct inflate_state FAR *state;
+{
+#ifdef BUILDFIXED
+    static code fixed[544];             /* storage for both built tables */
+    static code *lenfix, *distfix;
+    static int virgin = 1;
+
+    /* the tables are constructed on the first call only (not thread safe) */
+    if (virgin) {
+        static code *next;
+        unsigned sym, bits;
+
+        /* literal/length code lengths: 8, 9, 7, 8 bits across four ranges */
+        for (sym = 0; sym < 144; sym++) state->lens[sym] = 8;
+        for (; sym < 256; sym++) state->lens[sym] = 9;
+        for (; sym < 280; sym++) state->lens[sym] = 7;
+        for (; sym < 288; sym++) state->lens[sym] = 8;
+        next = fixed;
+        lenfix = fixed;
+        bits = 9;
+        inflate_table(LENS, state->lens, 288, &(next), &(bits), state->work);
+
+        /* every one of the 32 distance symbols gets a 5-bit code length */
+        for (sym = 0; sym < 32; sym++) state->lens[sym] = 5;
+        distfix = next;
+        bits = 5;
+        inflate_table(DISTS, state->lens, 32, &(next), &(bits), state->work);
+
+        /* remember that the work is done */
+        virgin = 0;
+    }
+#else /* !BUILDFIXED */
+#   include "inffixed.h"
+#endif /* BUILDFIXED */
+
+    /* point the state at the fixed tables and their index sizes */
+    state->lenbits = 9;
+    state->lencode = lenfix;
+    state->distbits = 5;
+    state->distcode = distfix;
+}
+
+#ifdef MAKEFIXED
+#include <stdio.h>
+
+/*
+   Write out the inffixed.h that is #include'd above.  Defining MAKEFIXED also
+   defines BUILDFIXED, so the tables are built on the fly.  makefixed() writes
+   those tables to stdout, which would be piped to inffixed.h.  A small program
+   can simply call makefixed to do this:
+
+    void makefixed(void);
+
+    int main(void)
+    {
+        makefixed();
+        return 0;
+    }
+
+   Then that can be linked with zlib built with MAKEFIXED defined and run:
+
+    a.out > inffixed.h
+ */
+void makefixed()
+{
+    struct inflate_state state;
+    unsigned idx, count;
+
+    fixedtables(&state);
+    puts("    /* inffixed.h -- table for decoding fixed codes");
+    puts("     * Generated automatically by makefixed().");
+    puts("     */");
+    puts("");
+    puts("    /* WARNING: this file should *not* be used by applications.");
+    puts("       It is part of the implementation of this library and is");
+    puts("       subject to change. Applications should only use zlib.h.");
+    puts("     */");
+    puts("");
+
+    /* literal/length table: seven entries per row, comma separated */
+    count = 1U << 9;
+    printf("    static const code lenfix[%u] = {", count);
+    for (idx = 0; idx < count; idx++) {
+        if (idx != 0) putchar(',');
+        if ((idx % 7) == 0) printf("\n        ");
+        printf("{%u,%u,%d}", (idx & 127) == 99 ? 64 : state.lencode[idx].op,
+               state.lencode[idx].bits, state.lencode[idx].val);
+    }
+    puts("\n    };");
+
+    /* distance table: six entries per row, comma separated */
+    count = 1U << 5;
+    printf("\n    static const code distfix[%u] = {", count);
+    for (idx = 0; idx < count; idx++) {
+        if (idx != 0) putchar(',');
+        if ((idx % 6) == 0) printf("\n        ");
+        printf("{%u,%u,%d}", state.distcode[idx].op, state.distcode[idx].bits,
+               state.distcode[idx].val);
+    }
+    puts("\n    };");
+}
+#endif /* MAKEFIXED */
+
+/*
+   Update the window with the last wsize (normally 32K) bytes written before
+   returning.  If window does not exist yet, create it.  This is only called
+   when a window is already in use, or when output has been written during this
+   inflate call, but the end of the deflate stream has not been reached yet.
+   It is also called to create a window for dictionary data when a dictionary
+   is loaded.
+
+   Providing output buffers larger than 32K to inflate() should provide a speed
+   advantage, since only the last 32K of output is copied to the sliding window
+   upon return from inflate(), and since all distances after the first 32K of
+   output will fall in the output data, making match copies simpler and faster.
+   The advantage may be dependent on the size of the processor's data caches.
+ */
+local int updatewindow(strm, end, copy)
+z_streamp strm;
+const Bytef *end;
+unsigned copy;
+{
+    struct inflate_state FAR *st = (struct inflate_state FAR *)strm->state;
+    unsigned tail;              /* bytes that fit before the window wraps */
+
+    /* create the window storage on first use */
+    if (st->window == Z_NULL) {
+        st->window = (unsigned char FAR *)
+                     ZALLOC(strm, 1U << st->wbits, sizeof(unsigned char));
+        if (st->window == Z_NULL)
+            return 1;
+    }
+
+    /* a zero wsize marks a window that is not in use yet */
+    if (st->wsize == 0) {
+        st->wsize = 1U << st->wbits;
+        st->whave = st->wnext = 0;
+    }
+
+    /* copy >= wsize: keep only the last wsize bytes of the output */
+    if (copy >= st->wsize) {
+        zmemcpy(st->window, end - st->wsize, st->wsize);
+        st->wnext = 0;
+        st->whave = st->wsize;
+        return 0;
+    }
+
+    /* otherwise append at wnext, wrapping to the window start if needed */
+    tail = st->wsize - st->wnext;
+    if (tail > copy) tail = copy;
+    zmemcpy(st->window + st->wnext, end - copy, tail);
+    copy -= tail;
+    if (copy != 0) {
+        /* wrapped: the remainder lands at the front of the window */
+        zmemcpy(st->window, end - copy, copy);
+        st->wnext = copy;
+        st->whave = st->wsize;
+        return 0;
+    }
+    st->wnext += tail;
+    if (st->wnext == st->wsize)
+        st->wnext = 0;
+    if (st->whave < st->wsize)
+        st->whave += tail;
+    return 0;
+}
+
+/* Macros for inflate(): */
+
+/* check update: adler32(), or with GUNZIP crc32() when state->flags is set */
+#ifdef GUNZIP
+#  define UPDATE(check, buf, len) \
+    (state->flags ? crc32(check, buf, len) : adler32(check, buf, len))
+#else
+#  define UPDATE(check, buf, len) adler32(check, buf, len)
+#endif
+
+/* header crc: crc32 the low 2 or 4 bytes of word, low byte first, via hbuf */
+#ifdef GUNZIP
+#  define CRC2(check, word) \
+    do { \
+        hbuf[0] = (unsigned char)(word); \
+        hbuf[1] = (unsigned char)((word) >> 8); \
+        check = crc32(check, hbuf, 2); \
+    } while (0)
+
+#  define CRC4(check, word) \
+    do { \
+        hbuf[0] = (unsigned char)(word); \
+        hbuf[1] = (unsigned char)((word) >> 8); \
+        hbuf[2] = (unsigned char)((word) >> 16); \
+        hbuf[3] = (unsigned char)((word) >> 24); \
+        check = crc32(check, hbuf, 4); \
+    } while (0)
+#endif
+
+/* Copy strm's in/out pointers and counts plus the bit buffer into locals */
+#define LOAD() \
+    do { \
+        put = strm->next_out; \
+        left = strm->avail_out; \
+        next = strm->next_in; \
+        have = strm->avail_in; \
+        hold = state->hold; \
+        bits = state->bits; \
+    } while (0)
+
+/* Write those locals back to strm and state (the inverse of LOAD()) */
+#define RESTORE() \
+    do { \
+        strm->next_out = put; \
+        strm->avail_out = left; \
+        strm->next_in = next; \
+        strm->avail_in = have; \
+        state->hold = hold; \
+        state->bits = bits; \
+    } while (0)
+
+/* Clear the input bit accumulator and its bit count */
+#define INITBITS() \
+    do { \
+        hold = 0; \
+        bits = 0; \
+    } while (0)
+
+/* Add one input byte above the bits already held in the accumulator, or
+   jump to inf_leave in inflate() if there is no input available. */
+#define PULLBYTE() \
+    do { \
+        if (have == 0) goto inf_leave; \
+        have--; \
+        hold += (unsigned long)(*next++) << bits; \
+        bits += 8; \
+    } while (0)
+
+/* Pull bytes until the accumulator holds at least n bits.  If the input
+   runs out first, PULLBYTE() leaves inflate() through inf_leave. */
+#define NEEDBITS(n) \
+    do { \
+        while (bits < (unsigned)(n)) \
+            PULLBYTE(); \
+    } while (0)
+
+/* Peek at the low n bits of the bit accumulator (n < 16) */
+#define BITS(n) \
+    ((unsigned)hold & ((1U << (n)) - 1))
+
+/* Discard the low n bits of the accumulator and lower the bit count by n */
+#define DROPBITS(n) \
+    do { \
+        hold >>= (n); \
+        bits -= (unsigned)(n); \
+    } while (0)
+
+/* Discard bits & 7 bits (zero to seven) to reach a byte boundary */
+#define BYTEBITS() \
+    do { \
+        hold >>= bits & 7; \
+        bits -= bits & 7; \
+    } while (0)
+
+/*
+   inflate() uses a state machine to process as much input data and generate as
+   much output data as possible before returning.  The state machine is
+   structured roughly as follows:
+
+    for (;;) switch (state) {
+    ...
+    case STATEn:
+        if (not enough input data or output space to make progress)
+            return;
+        ... make progress ...
+        state = STATEm;
+        break;
+    ...
+    }
+
+   so when inflate() is called again, the same case is attempted again, and
+   if the appropriate resources are provided, the machine proceeds to the
+   next state.  The NEEDBITS() macro is usually the way the state evaluates
+   whether it can proceed or should return.  NEEDBITS() does the return if
+   the requested bits are not available.  The typical use of the BITS macros
+   is:
+
+        NEEDBITS(n);
+        ... do something with BITS(n) ...
+        DROPBITS(n);
+
+   where NEEDBITS(n) either returns from inflate() if there isn't enough
+   input left to load n bits into the accumulator, or it continues.  BITS(n)
+   gives the low n bits in the accumulator.  When done, DROPBITS(n) drops
+   the low n bits off the accumulator.  INITBITS() clears the accumulator
+   and sets the number of available bits to zero.  BYTEBITS() discards just
+   enough bits to put the accumulator on a byte boundary.  After BYTEBITS()
+   and a NEEDBITS(8), then BITS(8) would return the next byte in the stream.
+
+   NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return
+   if there is no input available.  The decoding of variable length codes uses
+   PULLBYTE() directly in order to pull just enough bytes to decode the next
+   code, and no more.
+
+   Some states loop until they get enough input, making sure that enough
+   state information is maintained to continue the loop where it left off
+   if NEEDBITS() returns in the loop.  For example, want, need, and keep
+   would all have to actually be part of the saved state in case NEEDBITS()
+   returns:
+
+    case STATEw:
+        while (want < need) {
+            NEEDBITS(n);
+            keep[want++] = BITS(n);
+            DROPBITS(n);
+        }
+        state = STATEx;
+    case STATEx:
+
+   As shown above, if the next state is also the next case, then the break
+   is omitted.
+
+   A state may also return if there is not enough output space available to
+   complete that state.  Those states are copying stored data, writing a
+   literal byte, and copying a matching string.
+
+   When returning, a "goto inf_leave" is used to update the total counters,
+   update the check value, and determine whether any progress has been made
+   during that inflate() call in order to return the proper return code.
+   Progress is defined as a change in either strm->avail_in or strm->avail_out.
+   When there is a window, goto inf_leave will update the window with the last
+   output written.  If a goto inf_leave occurs in the middle of decompression
+   and there is no window currently, goto inf_leave will create one and copy
+   output to the window for the next call of inflate().
+
+   In this implementation, the flush parameter of inflate() only affects the
+   return code (per zlib.h).  inflate() always writes as much as possible to
+   strm->next_out, given the space available and the provided input--the effect
+   documented in zlib.h of Z_SYNC_FLUSH.  Furthermore, inflate() always defers
+   the allocation of and copying into a sliding window until necessary, which
+   provides the effect documented in zlib.h for Z_FINISH when the entire input
+   stream is available.  So the only thing the flush parameter actually does
+   is: when flush is set to Z_FINISH, inflate() cannot return Z_OK.  Instead it
+   will return Z_BUF_ERROR if it has not reached the end of the stream.
+ */
+
+int ZEXPORT inflate(strm, flush)
+z_streamp strm;
+int flush;
+{
+    struct inflate_state FAR *state;
+    z_const unsigned char FAR *next;    /* next input */
+    unsigned char FAR *put;     /* next output */
+    unsigned have, left;        /* available input and output */
+    unsigned long hold;         /* bit buffer */
+    unsigned bits;              /* bits in bit buffer */
+    unsigned in, out;           /* save starting available input and output */
+    unsigned copy;              /* number of stored or match bytes to copy */
+    unsigned char FAR *from;    /* where to copy match bytes from */
+    code here;                  /* current decoding table entry */
+    code last;                  /* parent table entry */
+    unsigned len;               /* length to copy for repeats, bits to drop */
+    int ret;                    /* return code */
+#ifdef GUNZIP
+    unsigned char hbuf[4];      /* buffer for gzip header crc calculation */
+#endif
+    static const unsigned short order[19] = /* permutation of code lengths */
+        {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+    if (inflateStateCheck(strm) || strm->next_out == Z_NULL ||
+        (strm->next_in == Z_NULL && strm->avail_in != 0))
+        return Z_STREAM_ERROR;
+
+    state = (struct inflate_state FAR *)strm->state;
+    if (state->mode == TYPE) state->mode = TYPEDO;      /* skip check */
+    LOAD();
+    in = have;
+    out = left;
+    ret = Z_OK;
+    for (;;)
+        switch (state->mode) {
+        case HEAD:
+            if (state->wrap == 0) {
+                state->mode = TYPEDO;
+                break;
+            }
+            NEEDBITS(16);
+#ifdef GUNZIP
+            if ((state->wrap & 2) && hold == 0x8b1f) {  /* gzip header */
+                if (state->wbits == 0)
+                    state->wbits = 15;
+                state->check = crc32(0L, Z_NULL, 0);
+                CRC2(state->check, hold);
+                INITBITS();
+                state->mode = FLAGS;
+                break;
+            }
+            state->flags = 0;           /* expect zlib header */
+            if (state->head != Z_NULL)
+                state->head->done = -1;
+            if (!(state->wrap & 1) ||   /* check if zlib header allowed */
+#else
+            if (
+#endif
+                ((BITS(8) << 8) + (hold >> 8)) % 31) {
+                strm->msg = (char *)"incorrect header check";
+                state->mode = BAD;
+                break;
+            }
+            if (BITS(4) != Z_DEFLATED) {
+                strm->msg = (char *)"unknown compression method";
+                state->mode = BAD;
+                break;
+            }
+            DROPBITS(4);
+            len = BITS(4) + 8;
+            if (state->wbits == 0)
+                state->wbits = len;
+            if (len > 15 || len > state->wbits) {
+                strm->msg = (char *)"invalid window size";
+                state->mode = BAD;
+                break;
+            }
+            state->dmax = 1U << len;
+            Tracev((stderr, "inflate:   zlib header ok\n"));
+            strm->adler = state->check = adler32(0L, Z_NULL, 0);
+            state->mode = hold & 0x200 ? DICTID : TYPE;
+            INITBITS();
+            break;
+#ifdef GUNZIP
+        case FLAGS:
+            NEEDBITS(16);
+            state->flags = (int)(hold);
+            if ((state->flags & 0xff) != Z_DEFLATED) {
+                strm->msg = (char *)"unknown compression method";
+                state->mode = BAD;
+                break;
+            }
+            if (state->flags & 0xe000) {
+                strm->msg = (char *)"unknown header flags set";
+                state->mode = BAD;
+                break;
+            }
+            if (state->head != Z_NULL)
+                state->head->text = (int)((hold >> 8) & 1);
+            if ((state->flags & 0x0200) && (state->wrap & 4))
+                CRC2(state->check, hold);
+            INITBITS();
+            state->mode = TIME;
+        case TIME:
+            NEEDBITS(32);
+            if (state->head != Z_NULL)
+                state->head->time = hold;
+            if ((state->flags & 0x0200) && (state->wrap & 4))
+                CRC4(state->check, hold);
+            INITBITS();
+            state->mode = OS;
+        case OS:
+            NEEDBITS(16);
+            if (state->head != Z_NULL) {
+                state->head->xflags = (int)(hold & 0xff);
+                state->head->os = (int)(hold >> 8);
+            }
+            if ((state->flags & 0x0200) && (state->wrap & 4))
+                CRC2(state->check, hold);
+            INITBITS();
+            state->mode = EXLEN;
+        case EXLEN:
+            if (state->flags & 0x0400) {
+                NEEDBITS(16);
+                state->length = (unsigned)(hold);
+                if (state->head != Z_NULL)
+                    state->head->extra_len = (unsigned)hold;
+                if ((state->flags & 0x0200) && (state->wrap & 4))
+                    CRC2(state->check, hold);
+                INITBITS();
+            }
+            else if (state->head != Z_NULL)
+                state->head->extra = Z_NULL;
+            state->mode = EXTRA;
+        case EXTRA:
+            if (state->flags & 0x0400) {
+                copy = state->length;
+                if (copy > have) copy = have;
+                if (copy) {
+                    if (state->head != Z_NULL &&
+                        state->head->extra != Z_NULL) {
+                        len = state->head->extra_len - state->length;
+                        zmemcpy(state->head->extra + len, next,
+                                len + copy > state->head->extra_max ?
+                                state->head->extra_max - len : copy);
+                    }
+                    if ((state->flags & 0x0200) && (state->wrap & 4))
+                        state->check = crc32(state->check, next, copy);
+                    have -= copy;
+                    next += copy;
+                    state->length -= copy;
+                }
+                if (state->length) goto inf_leave;
+            }
+            state->length = 0;
+            state->mode = NAME;
+        case NAME:
+            if (state->flags & 0x0800) {
+                if (have == 0) goto inf_leave;
+                copy = 0;
+                do {
+                    len = (unsigned)(next[copy++]);
+                    if (state->head != Z_NULL &&
+                            state->head->name != Z_NULL &&
+                            state->length < state->head->name_max)
+                        state->head->name[state->length++] = (Bytef)len;
+                } while (len && copy < have);
+                if ((state->flags & 0x0200) && (state->wrap & 4))
+                    state->check = crc32(state->check, next, copy);
+                have -= copy;
+                next += copy;
+                if (len) goto inf_leave;
+            }
+            else if (state->head != Z_NULL)
+                state->head->name = Z_NULL;
+            state->length = 0;
+            state->mode = COMMENT;
+        case COMMENT:
+            if (state->flags & 0x1000) {
+                if (have == 0) goto inf_leave;
+                copy = 0;
+                do {
+                    len = (unsigned)(next[copy++]);
+                    if (state->head != Z_NULL &&
+                            state->head->comment != Z_NULL &&
+                            state->length < state->head->comm_max)
+                        state->head->comment[state->length++] = (Bytef)len;
+                } while (len && copy < have);
+                if ((state->flags & 0x0200) && (state->wrap & 4))
+                    state->check = crc32(state->check, next, copy);
+                have -= copy;
+                next += copy;
+                if (len) goto inf_leave;
+            }
+            else if (state->head != Z_NULL)
+                state->head->comment = Z_NULL;
+            state->mode = HCRC;
+        case HCRC:
+            if (state->flags & 0x0200) {
+                NEEDBITS(16);
+                if ((state->wrap & 4) && hold != (state->check & 0xffff)) {
+                    strm->msg = (char *)"header crc mismatch";
+                    state->mode = BAD;
+                    break;
+                }
+                INITBITS();
+            }
+            if (state->head != Z_NULL) {
+                state->head->hcrc = (int)((state->flags >> 9) & 1);
+                state->head->done = 1;
+            }
+            strm->adler = state->check = crc32(0L, Z_NULL, 0);
+            state->mode = TYPE;
+            break;
+#endif
+        case DICTID:
+            NEEDBITS(32);
+            strm->adler = state->check = ZSWAP32(hold);
+            INITBITS();
+            state->mode = DICT;
+        case DICT:
+            if (state->havedict == 0) {
+                RESTORE();
+                return Z_NEED_DICT;
+            }
+            strm->adler = state->check = adler32(0L, Z_NULL, 0);
+            state->mode = TYPE;
+        case TYPE:
+            if (flush == Z_BLOCK || flush == Z_TREES) goto inf_leave;
+        case TYPEDO:
+            if (state->last) {
+                BYTEBITS();
+                state->mode = CHECK;
+                break;
+            }
+            NEEDBITS(3);
+            state->last = BITS(1);
+            DROPBITS(1);
+            switch (BITS(2)) {
+            case 0:                             /* stored block */
+                Tracev((stderr, "inflate:     stored block%s\n",
+                        state->last ? " (last)" : ""));
+                state->mode = STORED;
+                break;
+            case 1:                             /* fixed block */
+                fixedtables(state);
+                Tracev((stderr, "inflate:     fixed codes block%s\n",
+                        state->last ? " (last)" : ""));
+                state->mode = LEN_;             /* decode codes */
+                if (flush == Z_TREES) {
+                    DROPBITS(2);
+                    goto inf_leave;
+                }
+                break;
+            case 2:                             /* dynamic block */
+                Tracev((stderr, "inflate:     dynamic codes block%s\n",
+                        state->last ? " (last)" : ""));
+                state->mode = TABLE;
+                break;
+            case 3:
+                strm->msg = (char *)"invalid block type";
+                state->mode = BAD;
+            }
+            DROPBITS(2);
+            break;
+        case STORED:
+            BYTEBITS();                         /* go to byte boundary */
+            NEEDBITS(32);
+            if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) {
+                strm->msg = (char *)"invalid stored block lengths";
+                state->mode = BAD;
+                break;
+            }
+            state->length = (unsigned)hold & 0xffff;
+            Tracev((stderr, "inflate:       stored length %u\n",
+                    state->length));
+            INITBITS();
+            state->mode = COPY_;
+            if (flush == Z_TREES) goto inf_leave;
+        case COPY_:
+            state->mode = COPY;
+        case COPY:
+            copy = state->length;
+            if (copy) {
+                if (copy > have) copy = have;
+                if (copy > left) copy = left;
+                if (copy == 0) goto inf_leave;
+                zmemcpy(put, next, copy);
+                have -= copy;
+                next += copy;
+                left -= copy;
+                put += copy;
+                state->length -= copy;
+                break;
+            }
+            Tracev((stderr, "inflate:       stored end\n"));
+            state->mode = TYPE;
+            break;
+        case TABLE:
+            NEEDBITS(14);
+            state->nlen = BITS(5) + 257;
+            DROPBITS(5);
+            state->ndist = BITS(5) + 1;
+            DROPBITS(5);
+            state->ncode = BITS(4) + 4;
+            DROPBITS(4);
+#ifndef PKZIP_BUG_WORKAROUND
+            if (state->nlen > 286 || state->ndist > 30) {
+                strm->msg = (char *)"too many length or distance symbols";
+                state->mode = BAD;
+                break;
+            }
+#endif
+            Tracev((stderr, "inflate:       table sizes ok\n"));
+            state->have = 0;
+            state->mode = LENLENS;
+        case LENLENS:
+            while (state->have < state->ncode) {
+                NEEDBITS(3);
+                state->lens[order[state->have++]] = (unsigned short)BITS(3);
+                DROPBITS(3);
+            }
+            while (state->have < 19)
+                state->lens[order[state->have++]] = 0;
+            state->next = state->codes;
+            state->lencode = (const code FAR *)(state->next);
+            state->lenbits = 7;
+            ret = inflate_table(CODES, state->lens, 19, &(state->next),
+                                &(state->lenbits), state->work);
+            if (ret) {
+                strm->msg = (char *)"invalid code lengths set";
+                state->mode = BAD;
+                break;
+            }
+            Tracev((stderr, "inflate:       code lengths ok\n"));
+            state->have = 0;
+            state->mode = CODELENS;
+        case CODELENS:
+            while (state->have < state->nlen + state->ndist) {
+                for (;;) {
+                    here = state->lencode[BITS(state->lenbits)];
+                    if ((unsigned)(here.bits) <= bits) break;
+                    PULLBYTE();
+                }
+                if (here.val < 16) {
+                    DROPBITS(here.bits);
+                    state->lens[state->have++] = here.val;
+                }
+                else {
+                    if (here.val == 16) {
+                        NEEDBITS(here.bits + 2);
+                        DROPBITS(here.bits);
+                        if (state->have == 0) {
+                            strm->msg = (char *)"invalid bit length repeat";
+                            state->mode = BAD;
+                            break;
+                        }
+                        len = state->lens[state->have - 1];
+                        copy = 3 + BITS(2);
+                        DROPBITS(2);
+                    }
+                    else if (here.val == 17) {
+                        NEEDBITS(here.bits + 3);
+                        DROPBITS(here.bits);
+                        len = 0;
+                        copy = 3 + BITS(3);
+                        DROPBITS(3);
+                    }
+                    else {
+                        NEEDBITS(here.bits + 7);
+                        DROPBITS(here.bits);
+                        len = 0;
+                        copy = 11 + BITS(7);
+                        DROPBITS(7);
+                    }
+                    if (state->have + copy > state->nlen + state->ndist) {
+                        strm->msg = (char *)"invalid bit length repeat";
+                        state->mode = BAD;
+                        break;
+                    }
+                    while (copy--)
+                        state->lens[state->have++] = (unsigned short)len;
+                }
+            }
+
+            /* handle error breaks in while */
+            if (state->mode == BAD) break;
+
+            /* check for end-of-block code (better have one) */
+            if (state->lens[256] == 0) {
+                strm->msg = (char *)"invalid code -- missing end-of-block";
+                state->mode = BAD;
+                break;
+            }
+
+            /* build code tables -- note: do not change the lenbits or distbits
+               values here (9 and 6) without reading the comments in inftrees.h
+               concerning the ENOUGH constants, which depend on those values */
+            state->next = state->codes;
+            state->lencode = (const code FAR *)(state->next);
+            state->lenbits = 9;
+            ret = inflate_table(LENS, state->lens, state->nlen, &(state->next),
+                                &(state->lenbits), state->work);
+            if (ret) {
+                strm->msg = (char *)"invalid literal/lengths set";
+                state->mode = BAD;
+                break;
+            }
+            state->distcode = (const code FAR *)(state->next);
+            state->distbits = 6;
+            ret = inflate_table(DISTS, state->lens + state->nlen, state->ndist,
+                            &(state->next), &(state->distbits), state->work);
+            if (ret) {
+                strm->msg = (char *)"invalid distances set";
+                state->mode = BAD;
+                break;
+            }
+            Tracev((stderr, "inflate:       codes ok\n"));
+            state->mode = LEN_;
+            if (flush == Z_TREES) goto inf_leave;
+        case LEN_:
+            state->mode = LEN;
+        case LEN:
+            if (have >= 6 && left >= 258) {
+                RESTORE();
+                inflate_fast(strm, out);
+                LOAD();
+                if (state->mode == TYPE)
+                    state->back = -1;
+                break;
+            }
+            state->back = 0;
+            for (;;) {
+                here = state->lencode[BITS(state->lenbits)];
+                if ((unsigned)(here.bits) <= bits) break;
+                PULLBYTE();
+            }
+            if (here.op && (here.op & 0xf0) == 0) {
+                last = here;
+                for (;;) {
+                    here = state->lencode[last.val +
+                            (BITS(last.bits + last.op) >> last.bits)];
+                    if ((unsigned)(last.bits + here.bits) <= bits) break;
+                    PULLBYTE();
+                }
+                DROPBITS(last.bits);
+                state->back += last.bits;
+            }
+            DROPBITS(here.bits);
+            state->back += here.bits;
+            state->length = (unsigned)here.val;
+            if ((int)(here.op) == 0) {
+                Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ?
+                        "inflate:         literal '%c'\n" :
+                        "inflate:         literal 0x%02x\n", here.val));
+                state->mode = LIT;
+                break;
+            }
+            if (here.op & 32) {
+                Tracevv((stderr, "inflate:         end of block\n"));
+                state->back = -1;
+                state->mode = TYPE;
+                break;
+            }
+            if (here.op & 64) {
+                strm->msg = (char *)"invalid literal/length code";
+                state->mode = BAD;
+                break;
+            }
+            state->extra = (unsigned)(here.op) & 15;
+            state->mode = LENEXT;
+        case LENEXT:
+            if (state->extra) {
+                NEEDBITS(state->extra);
+                state->length += BITS(state->extra);
+                DROPBITS(state->extra);
+                state->back += state->extra;
+            }
+            Tracevv((stderr, "inflate:         length %u\n", state->length));
+            state->was = state->length;
+            state->mode = DIST;
+        case DIST:
+            for (;;) {
+                here = state->distcode[BITS(state->distbits)];
+                if ((unsigned)(here.bits) <= bits) break;
+                PULLBYTE();
+            }
+            if ((here.op & 0xf0) == 0) {
+                last = here;
+                for (;;) {
+                    here = state->distcode[last.val +
+                            (BITS(last.bits + last.op) >> last.bits)];
+                    if ((unsigned)(last.bits + here.bits) <= bits) break;
+                    PULLBYTE();
+                }
+                DROPBITS(last.bits);
+                state->back += last.bits;
+            }
+            DROPBITS(here.bits);
+            state->back += here.bits;
+            if (here.op & 64) {
+                strm->msg = (char *)"invalid distance code";
+                state->mode = BAD;
+                break;
+            }
+            state->offset = (unsigned)here.val;
+            state->extra = (unsigned)(here.op) & 15;
+            state->mode = DISTEXT;
+        case DISTEXT:
+            if (state->extra) {
+                NEEDBITS(state->extra);
+                state->offset += BITS(state->extra);
+                DROPBITS(state->extra);
+                state->back += state->extra;
+            }
+#ifdef INFLATE_STRICT
+            if (state->offset > state->dmax) {
+                strm->msg = (char *)"invalid distance too far back";
+                state->mode = BAD;
+                break;
+            }
+#endif
+            Tracevv((stderr, "inflate:         distance %u\n", state->offset));
+            state->mode = MATCH;
+        case MATCH:
+            if (left == 0) goto inf_leave;
+            copy = out - left;
+            if (state->offset > copy) {         /* copy from window */
+                copy = state->offset - copy;
+                if (copy > state->whave) {
+                    if (state->sane) {
+                        strm->msg = (char *)"invalid distance too far back";
+                        state->mode = BAD;
+                        break;
+                    }
+#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
+                    Trace((stderr, "inflate.c too far\n"));
+                    copy -= state->whave;
+                    if (copy > state->length) copy = state->length;
+                    if (copy > left) copy = left;
+                    left -= copy;
+                    state->length -= copy;
+                    do {
+                        *put++ = 0;
+                    } while (--copy);
+                    if (state->length == 0) state->mode = LEN;
+                    break;
+#endif
+                }
+                if (copy > state->wnext) {
+                    copy -= state->wnext;
+                    from = state->window + (state->wsize - copy);
+                }
+                else
+                    from = state->window + (state->wnext - copy);
+                if (copy > state->length) copy = state->length;
+            }
+            else {                              /* copy from output */
+                from = put - state->offset;
+                copy = state->length;
+            }
+            if (copy > left) copy = left;
+            left -= copy;
+            state->length -= copy;
+            do {
+                *put++ = *from++;
+            } while (--copy);
+            if (state->length == 0) state->mode = LEN;
+            break;
+        case LIT:
+            if (left == 0) goto inf_leave;
+            *put++ = (unsigned char)(state->length);
+            left--;
+            state->mode = LEN;
+            break;
+        case CHECK:
+            if (state->wrap) {
+                NEEDBITS(32);
+                out -= left;
+                strm->total_out += out;
+                state->total += out;
+                if ((state->wrap & 4) && out)
+                    strm->adler = state->check =
+                        UPDATE(state->check, put - out, out);
+                out = left;
+                if ((state->wrap & 4) && (
+#ifdef GUNZIP
+                     state->flags ? hold :
+#endif
+                     ZSWAP32(hold)) != state->check) {
+                    strm->msg = (char *)"incorrect data check";
+                    state->mode = BAD;
+                    break;
+                }
+                INITBITS();
+                Tracev((stderr, "inflate:   check matches trailer\n"));
+            }
+#ifdef GUNZIP
+            state->mode = LENGTH;
+        case LENGTH:
+            if (state->wrap && state->flags) {
+                NEEDBITS(32);
+                if (hold != (state->total & 0xffffffffUL)) {
+                    strm->msg = (char *)"incorrect length check";
+                    state->mode = BAD;
+                    break;
+                }
+                INITBITS();
+                Tracev((stderr, "inflate:   length matches trailer\n"));
+            }
+#endif
+            state->mode = DONE;
+        case DONE:
+            ret = Z_STREAM_END;
+            goto inf_leave;
+        case BAD:
+            ret = Z_DATA_ERROR;
+            goto inf_leave;
+        case MEM:
+            return Z_MEM_ERROR;
+        case SYNC:
+        default:
+            return Z_STREAM_ERROR;
+        }
+
+    /*
+       Return from inflate(), updating the total counts and the check value.
+       If there was no progress during the inflate() call, return a buffer
+       error.  Call updatewindow() to create and/or update the window state.
+       Note: a memory error from inflate() is non-recoverable.
+     */
+  inf_leave:
+    RESTORE();
+    if (state->wsize || (out != strm->avail_out && state->mode < BAD &&
+            (state->mode < CHECK || flush != Z_FINISH)))
+        if (updatewindow(strm, strm->next_out, out - strm->avail_out)) {
+            state->mode = MEM;
+            return Z_MEM_ERROR;
+        }
+    in -= strm->avail_in;
+    out -= strm->avail_out;
+    strm->total_in += in;
+    strm->total_out += out;
+    state->total += out;
+    if ((state->wrap & 4) && out)
+        strm->adler = state->check =
+            UPDATE(state->check, strm->next_out - out, out);
+    strm->data_type = (int)state->bits + (state->last ? 64 : 0) +
+                      (state->mode == TYPE ? 128 : 0) +
+                      (state->mode == LEN_ || state->mode == COPY_ ? 256 : 0);
+    if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK)
+        ret = Z_BUF_ERROR;
+    return ret;
+}
+
+int ZEXPORT inflateEnd(strm)
+z_streamp strm;
+{
+    struct inflate_state FAR *st;
+    if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
+    st = (struct inflate_state FAR *)strm->state;
+    if (st->window != Z_NULL)       /* free the window if one was allocated */
+        ZFREE(strm, st->window);
+    ZFREE(strm, st);                /* then the state itself */
+    strm->state = Z_NULL;
+    Tracev((stderr, "inflate: end\n"));
+    return Z_OK;
+}
+
+int ZEXPORT inflateGetDictionary(strm, dictionary, dictLength)
+z_streamp strm;
+Bytef *dictionary;
+uInt *dictLength;
+{
+    struct inflate_state FAR *st;
+
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    st = (struct inflate_state FAR *)strm->state;
+
+    /* Output is window[wnext..whave-1] followed by window[0..wnext-1].
+       Nothing is copied when the window holds no data or when the caller
+       passed no buffer. */
+    if (st->whave != 0 && dictionary != Z_NULL) {
+        zmemcpy(dictionary, st->window + st->wnext, st->whave - st->wnext);
+        zmemcpy(dictionary + st->whave - st->wnext, st->window, st->wnext);
+    }
+    /* the length is reported whenever the caller gave a place to store it */
+    if (dictLength != Z_NULL) *dictLength = st->whave;
+    return Z_OK;
+}
+
+int ZEXPORT inflateSetDictionary(strm, dictionary, dictLength)
+z_streamp strm;
+const Bytef *dictionary;
+uInt dictLength;
+{
+    struct inflate_state FAR *st;
+    unsigned long id;
+
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    st = (struct inflate_state FAR *)strm->state;
+
+    if (st->mode == DICT) {
+        /* inflate() asked for a dictionary: its Adler-32 has to equal the
+           check value saved in the state */
+        id = adler32(adler32(0L, Z_NULL, 0), dictionary, dictLength);
+        if (id != st->check) return Z_DATA_ERROR;
+    }
+    else if (st->wrap != 0) {
+        /* a wrapped stream accepts a dictionary only while in DICT mode */
+        return Z_STREAM_ERROR;
+    }
+
+    /* Hand the end of the dictionary to updatewindow(), which copies it
+       into the window (amending an existing dictionary if appropriate);
+       a non-zero result is treated as an allocation failure. */
+    if (updatewindow(strm, dictionary + dictLength, dictLength)) {
+        st->mode = MEM;
+        return Z_MEM_ERROR;
+    }
+    st->havedict = 1;
+    Tracev((stderr, "inflate:   dictionary set\n"));
+    return Z_OK;
+}
+
+int ZEXPORT inflateGetHeader(strm, head)
+z_streamp strm;
+gz_headerp head;
+{
+    struct inflate_state FAR *st;
+
+    /* requires a valid stream whose wrap has the gzip bit (bit 1) set */
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    st = (struct inflate_state FAR *)strm->state;
+    if (!(st->wrap & 2))
+        return Z_STREAM_ERROR;
+    st->head = head;    /* remembered for inflate() to fill in */
+    head->done = 0;     /* inflate() sets this to 1 after the header */
+    return Z_OK;
+}
+
+/*
+   Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff.  Return when found
+   or when out of input.  When called, *have is the number of pattern bytes
+   found in order so far, in 0..3.  On return *have is updated to the new
+   state.  If on return *have equals four, then the pattern was found and the
+   return value is how many bytes were read including the last byte of the
+   pattern.  If *have is less than four, then the pattern has not been found
+   yet and the return value is len.  In the latter case, syncsearch() can be
+   called again with more data and the *have state.  *have is initialized to
+   zero for the first call.
+ */
+local unsigned syncsearch(have, buf, len)
+unsigned FAR *have;
+const unsigned char FAR *buf;
+unsigned len;
+{
+    unsigned matched = *have;   /* pattern bytes matched so far */
+    unsigned pos;               /* bytes of buf examined */
+
+    for (pos = 0; pos < len && matched < 4; pos++) {
+        int want = matched < 2 ? 0 : 0xff;      /* pattern: 0, 0, 0xff, 0xff */
+        if ((int)(buf[pos]) == want)
+            matched++;
+        else if (buf[pos] != 0)
+            matched = 0;            /* non-zero mismatch: start over */
+        else
+            /* a zero where 0xff was expected: the zeros seen remain a
+               valid prefix, so 2 stays 2 and 3 drops to 1 */
+            matched = 4 - matched;
+    }
+    *have = matched;
+    return pos;
+}
+
+/* Search the input for a 0, 0, 0xff, 0xff marker (see syncsearch() above). */
+int ZEXPORT inflateSync(strm)
+z_streamp strm;
+{
+    unsigned len;               /* number of bytes to look at or looked at */
+    unsigned long in, out;      /* temporary to save total_in and total_out */
+    unsigned char buf[4];       /* to restore bit buffer to byte string */
+    struct inflate_state FAR *state;
+
+    /* check parameters */
+    if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
+    state = (struct inflate_state FAR *)strm->state;
+    if (strm->avail_in == 0 && state->bits < 8) return Z_BUF_ERROR;
+
+    /* if first time, start search in bit buffer -- the low bits of hold are
+       the oldest, so shift right to discard the partial byte first */
+    if (state->mode != SYNC) {
+        state->mode = SYNC;
+        state->hold >>= state->bits & 7;
+        state->bits -= state->bits & 7;
+        for (len = 0; state->bits >= 8; state->bits -= 8) {
+            buf[len++] = (unsigned char)(state->hold);
+            state->hold >>= 8;
+        }
+        state->have = 0;
+        syncsearch(&(state->have), buf, len);
+    }
+
+    /* search available input */
+    len = syncsearch(&(state->have), strm->next_in, strm->avail_in);
+    strm->avail_in -= len;
+    strm->next_in += len;
+    strm->total_in += len;
+
+    /* return no joy or set up to restart inflate() on a new block */
+    if (state->have != 4) return Z_DATA_ERROR;
+    in = strm->total_in;  out = strm->total_out;
+    inflateReset(strm);
+    strm->total_in = in;  strm->total_out = out;
+    state->mode = TYPE;
+    return Z_OK;
+}
+
+/*
+   Returns true if inflate is currently at the end of a block generated by
+   Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP
+   implementation to provide an additional safety check. PPP uses
+   Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored
+   block. When decompressing, PPP checks that at the end of input packet,
+   inflate is waiting for these length bytes.
+ */
+int ZEXPORT inflateSyncPoint(strm)
+z_streamp strm;
+{
+    struct inflate_state FAR *st;
+    if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
+    st = (struct inflate_state FAR *)strm->state;
+    if (st->mode != STORED) return 0;   /* not waiting for stored lengths */
+    return st->bits == 0;               /* 1 iff the bit buffer is empty */
+}
+
+int ZEXPORT inflateCopy(dest, source)
+z_streamp dest;
+z_streamp source;
+{
+    struct inflate_state FAR *src_state;    /* state owned by source */
+    struct inflate_state FAR *dst_state;    /* freshly allocated for dest */
+    unsigned char FAR *win = Z_NULL;        /* duplicate window, if any */
+    unsigned wsize;                         /* window size in bytes */
+
+    /* validate arguments */
+    if (inflateStateCheck(source) || dest == Z_NULL) return Z_STREAM_ERROR;
+    src_state = (struct inflate_state FAR *)source->state;
+
+    /* Allocate the duplicate state, then a duplicate window if the source
+       has one (the state is released again should that allocation fail). */
+    dst_state = (struct inflate_state FAR *)
+                ZALLOC(source, 1, sizeof(struct inflate_state));
+    if (dst_state == Z_NULL) return Z_MEM_ERROR;
+    if (src_state->window != Z_NULL) {
+        wsize = 1U << src_state->wbits;
+        win = (unsigned char FAR *)
+              ZALLOC(source, wsize, sizeof(unsigned char));
+        if (win == Z_NULL) {
+            ZFREE(source, dst_state);
+            return Z_MEM_ERROR;
+        }
+        zmemcpy(win, src_state->window, wsize);
+    }
+
+    /* Duplicate both structures byte for byte, then repair every pointer
+       that would otherwise still refer to the source. */
+    zmemcpy((voidpf)dest, (voidpf)source, sizeof(z_stream));
+    zmemcpy((voidpf)dst_state, (voidpf)src_state, sizeof(struct inflate_state));
+    dst_state->strm = dest;
+    dst_state->window = win;
+    dst_state->next = dst_state->codes + (src_state->next - src_state->codes);
+    if (src_state->lencode >= src_state->codes &&
+        src_state->lencode <= src_state->codes + ENOUGH - 1) {
+        /* tables that live inside codes[] are rebased onto the copy; any
+           other table pointers keep the values copied above */
+        dst_state->lencode = dst_state->codes + (src_state->lencode - src_state->codes);
+        dst_state->distcode = dst_state->codes + (src_state->distcode - src_state->codes);
+    }
+    dest->state = (struct internal_state FAR *)dst_state;
+    return Z_OK;
+}
+
+int ZEXPORT inflateUndermine(strm, subvert)
+z_streamp strm;
+int subvert;
+{
+    struct inflate_state FAR *st;
+
+    if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
+    st = (struct inflate_state FAR *)strm->state;
+#ifndef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
+    (void)subvert;          /* support not compiled in: always stay strict */
+    st->sane = 1;
+    return Z_DATA_ERROR;
+#else
+    st->sane = subvert ? 0 : 1;     /* sane == 0 tolerates too-far distances */
+    return Z_OK;
+#endif
+}
+
+int ZEXPORT inflateValidate(strm, check)
+z_streamp strm;
+int check;
+{
+    struct inflate_state FAR *state;
+
+    if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
+    state = (struct inflate_state FAR *)strm->state;
+    if (check && state->wrap)   /* raw (wrap == 0) has no trailer to check */
+        state->wrap |= 4;       /* bit 2: validate the check value */
+    else
+        state->wrap &= ~4;
+    return Z_OK;
+}
+
+long ZEXPORT inflateMark(strm)
+z_streamp strm;
+{
+    struct inflate_state FAR *st;
+    unsigned low = 0;   /* length in COPY, was - length in MATCH, else 0 */
+    if (inflateStateCheck(strm)) return -(1L << 16);
+    st = (struct inflate_state FAR *)strm->state;
+    if (st->mode == COPY) low = st->length;
+    else if (st->mode == MATCH) low = st->was - st->length;
+    /* back is shifted up by 16 bits and the value chosen above is added */
+    return (long)(((unsigned long)((long)st->back)) << 16) + low;
+}
+
+unsigned long ZEXPORT inflateCodesUsed(strm)
+z_streamp strm;
+{
+    struct inflate_state FAR *st;
+    if (inflateStateCheck(strm)) return ~0UL;   /* i.e. (unsigned long)-1 */
+    st = (struct inflate_state FAR *)strm->state;
+    return (unsigned long)(st->next - st->codes);   /* codes[] entries used */
+}
diff --git a/deps/SZ/zlib/inflate.h b/deps/SZ/zlib/inflate.h
new file mode 100644
index 0000000000000000000000000000000000000000..a46cce6b6d05ef994d2a386257cf09068f0aa298
--- /dev/null
+++ b/deps/SZ/zlib/inflate.h
@@ -0,0 +1,125 @@
+/* inflate.h -- internal inflate state definition
+ * Copyright (C) 1995-2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+/* define NO_GZIP when compiling if you want to disable gzip header and
+   trailer decoding by inflate().  NO_GZIP would be used to avoid linking in
+   the crc code when it is not needed.  For shared libraries, gzip decoding
+   should be left enabled. */
+#ifndef NO_GZIP
+#  define GUNZIP
+#endif
+
+/* Inflate modes between inflate() calls ("i:" awaits input, "o:" output) */
+typedef enum {
+    HEAD = 16180,   /* i: waiting for magic header */
+    FLAGS,      /* i: waiting for method and flags (gzip) */
+    TIME,       /* i: waiting for modification time (gzip) */
+    OS,         /* i: waiting for extra flags and operating system (gzip) */
+    EXLEN,      /* i: waiting for extra length (gzip) */
+    EXTRA,      /* i: waiting for extra bytes (gzip) */
+    NAME,       /* i: waiting for end of file name (gzip) */
+    COMMENT,    /* i: waiting for end of comment (gzip) */
+    HCRC,       /* i: waiting for header crc (gzip) */
+    DICTID,     /* i: waiting for dictionary check value */
+    DICT,       /* waiting for inflateSetDictionary() call */
+        TYPE,       /* i: waiting for type bits, including last-flag bit */
+        TYPEDO,     /* i: same, but skip check to exit inflate on new block */
+        STORED,     /* i: waiting for stored size (length and complement) */
+        COPY_,      /* i/o: same as COPY below, but only first time in */
+        COPY,       /* i/o: waiting for input or output to copy stored block */
+        TABLE,      /* i: waiting for dynamic block table lengths */
+        LENLENS,    /* i: waiting for code length code lengths */
+        CODELENS,   /* i: waiting for length/lit and distance code lengths */
+            LEN_,       /* i: same as LEN below, but only first time in */
+            LEN,        /* i: waiting for length/lit/eob code */
+            LENEXT,     /* i: waiting for length extra bits */
+            DIST,       /* i: waiting for distance code */
+            DISTEXT,    /* i: waiting for distance extra bits */
+            MATCH,      /* o: waiting for output space to copy string */
+            LIT,        /* o: waiting for output space to write literal */
+    CHECK,      /* i: waiting for 32-bit check value */
+    LENGTH,     /* i: waiting for 32-bit length (gzip) */
+    DONE,       /* finished check, done -- remain here until reset */
+    BAD,        /* got a data error -- remain here until reset */
+    MEM,        /* got an inflate() memory error -- remain here until reset */
+    SYNC        /* looking for synchronization bytes to restart inflate() */
+} inflate_mode;     /* order matters: inflate() compares modes with < */
+
+/*
+    State transitions between above modes -
+
+    (most modes can go to BAD or MEM on error -- not shown for clarity)
+
+    Process header:
+        HEAD -> (gzip) or (zlib) or (raw)
+        (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME -> COMMENT ->
+                  HCRC -> TYPE
+        (zlib) -> DICTID or TYPE
+        DICTID -> DICT -> TYPE
+        (raw) -> TYPEDO
+    Read deflate blocks:
+            TYPE -> TYPEDO -> STORED or TABLE or LEN_ or CHECK
+            STORED -> COPY_ -> COPY -> TYPE
+            TABLE -> LENLENS -> CODELENS -> LEN_
+            LEN_ -> LEN
+    Read deflate codes in fixed or dynamic block:
+                LEN -> LENEXT or LIT or TYPE
+                LENEXT -> DIST -> DISTEXT -> MATCH -> LEN
+                LIT -> LEN
+    Process trailer:
+        CHECK -> LENGTH -> DONE
+ */
+
+/* State maintained between inflate() calls -- approximately 7K bytes, not
+   including the allocated sliding window, which is up to 32K bytes. */
+struct inflate_state {
+    z_streamp strm;             /* pointer back to this zlib stream */
+    inflate_mode mode;          /* current inflate mode */
+    int last;                   /* true if processing last block */
+    int wrap;                   /* bit 0 true for zlib, bit 1 true for gzip,
+                                   bit 2 true to validate check value */
+    int havedict;               /* true if dictionary provided */
+    int flags;                  /* gzip header method and flags (0 if zlib) */
+    unsigned dmax;              /* zlib header max distance (INFLATE_STRICT) */
+    unsigned long check;        /* protected copy of check value */
+    unsigned long total;        /* protected copy of output count */
+    gz_headerp head;            /* where to save gzip header information */
+        /* sliding window */
+    unsigned wbits;             /* log base 2 of requested window size */
+    unsigned wsize;             /* window size or zero if not using window */
+    unsigned whave;             /* valid bytes in the window */
+    unsigned wnext;             /* window write index */
+    unsigned char FAR *window;  /* allocated sliding window, if needed */
+        /* bit accumulator */
+    unsigned long hold;         /* input bit accumulator */
+    unsigned bits;              /* number of bits in "hold" */
+        /* for string and stored block copying */
+    unsigned length;            /* literal or length of data to copy */
+    unsigned offset;            /* distance back to copy string from */
+        /* for table and code decoding */
+    unsigned extra;             /* extra bits needed */
+        /* fixed and dynamic code tables */
+    code const FAR *lencode;    /* starting table for length/literal codes */
+    code const FAR *distcode;   /* starting table for distance codes */
+    unsigned lenbits;           /* index bits for lencode */
+    unsigned distbits;          /* index bits for distcode */
+        /* dynamic table building */
+    unsigned ncode;             /* number of code length code lengths */
+    unsigned nlen;              /* number of length code lengths */
+    unsigned ndist;             /* number of distance code lengths */
+    unsigned have;              /* lens[] count; sync bytes matched in SYNC */
+    code FAR *next;             /* next available space in codes[] */
+    unsigned short lens[320];   /* temporary storage for code lengths */
+    unsigned short work[288];   /* work area for code table building */
+    code codes[ENOUGH];         /* space for code tables */
+    int sane;                   /* if false, allow invalid distance too far */
+    int back;                   /* bits back of last unprocessed length/lit */
+    unsigned was;               /* initial length of match */
+};
diff --git a/deps/SZ/zlib/inftrees.c b/deps/SZ/zlib/inftrees.c
new file mode 100644
index 0000000000000000000000000000000000000000..2ea08fc13ea8ec50fad1f7574fa287aa6362abc4
--- /dev/null
+++ b/deps/SZ/zlib/inftrees.c
@@ -0,0 +1,304 @@
+/* inftrees.c -- generate Huffman trees for efficient decoding
+ * Copyright (C) 1995-2017 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zutil.h"
+#include "inftrees.h"
+
+#define MAXBITS 15
+
+const char inflate_copyright[] =
+   " inflate 1.2.11 Copyright 1995-2017 Mark Adler ";
+/*
+  If you use the zlib library in a product, an acknowledgment is welcome
+  in the documentation of your product. If for some reason you cannot
+  include such an acknowledgment, I would appreciate that you keep this
+  copyright string in the executable of your product.
+ */
+
+/*
+   Build a set of tables to decode the provided canonical Huffman code.
+   The code lengths are lens[0..codes-1].  The result starts at *table,
+   whose indices are 0..2^bits-1.  work is a writable array of at least
+   codes shorts, which is used as a work area.  type is the type of code
+   to be generated, CODES, LENS, or DISTS.  On return, zero is success,
+   -1 is an invalid code, and +1 means that ENOUGH isn't enough.  table
+   on return points to the next available entry's address.  bits is the
+   requested root table index bits, and on return it is the actual root
+   table index bits.  It will differ if the request is greater than the
+   longest code or if it is less than the shortest code.
+ */
+int ZLIB_INTERNAL inflate_table(type, lens, codes, table, bits, work)
+codetype type;
+unsigned short FAR *lens;
+unsigned codes;
+code FAR * FAR *table;
+unsigned FAR *bits;
+unsigned short FAR *work;
+{
+    unsigned len;               /* a code's length in bits */
+    unsigned sym;               /* index of code symbols */
+    unsigned min, max;          /* minimum and maximum code lengths */
+    unsigned root;              /* number of index bits for root table */
+    unsigned curr;              /* number of index bits for current table */
+    unsigned drop;              /* code bits to drop for sub-table */
+    int left;                   /* number of prefix codes available */
+    unsigned used;              /* code entries in table used */
+    unsigned huff;              /* Huffman code */
+    unsigned incr;              /* for incrementing code, index */
+    unsigned fill;              /* index for replicating entries */
+    unsigned low;               /* low bits for current root entry */
+    unsigned mask;              /* mask for low root bits */
+    code here;                  /* table entry for duplication */
+    code FAR *next;             /* next available space in table */
+    const unsigned short FAR *base;     /* base value table to use */
+    const unsigned short FAR *extra;    /* extra bits table to use */
+    unsigned match;             /* use base and extra for symbol >= match */
+    unsigned short count[MAXBITS+1];    /* number of codes of each length */
+    unsigned short offs[MAXBITS+1];     /* offsets in table for each length */
+    static const unsigned short lbase[31] = { /* Length codes 257..285 base */
+        3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
+        35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
+    static const unsigned short lext[31] = { /* Length codes 257..285 extra */
+        16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18,
+        19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 77, 202};
+    static const unsigned short dbase[32] = { /* Distance codes 0..29 base */
+        1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
+        257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
+        8193, 12289, 16385, 24577, 0, 0};
+    static const unsigned short dext[32] = { /* Distance codes 0..29 extra */
+        16, 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22,
+        23, 23, 24, 24, 25, 25, 26, 26, 27, 27,
+        28, 28, 29, 29, 64, 64};
+
+    /*
+       Process a set of code lengths to create a canonical Huffman code.  The
+       code lengths are lens[0..codes-1].  Each length corresponds to the
+       symbols 0..codes-1.  The Huffman code is generated by first sorting the
+       symbols by length from short to long, and retaining the symbol order
+       for codes with equal lengths.  Then the code starts with all zero bits
+       for the first code of the shortest length, and the codes are integer
+       increments for the same length, and zeros are appended as the length
+       increases.  For the deflate format, these bits are stored backwards
+       from their more natural integer increment ordering, and so when the
+       decoding tables are built in the large loop below, the integer codes
+       are incremented backwards.
+
+       This routine assumes, but does not check, that all of the entries in
+       lens[] are in the range 0..MAXBITS.  The caller must assure this.
+       1..MAXBITS is interpreted as that code length.  zero means that that
+       symbol does not occur in this code.
+
+       The codes are sorted by computing a count of codes for each length,
+       creating from that a table of starting indices for each length in the
+       sorted table, and then entering the symbols in order in the sorted
+       table.  The sorted table is work[], with that space being provided by
+       the caller.
+
+       The length counts are used for other purposes as well, i.e. finding
+       the minimum and maximum length codes, determining if there are any
+       codes at all, checking for a valid set of lengths, and looking ahead
+       at length counts to determine sub-table sizes when building the
+       decoding tables.
+     */
+
+    /* accumulate lengths for codes (assumes lens[] all in 0..MAXBITS) */
+    for (len = 0; len <= MAXBITS; len++)
+        count[len] = 0;
+    for (sym = 0; sym < codes; sym++)
+        count[lens[sym]]++;
+
+    /* bound code lengths, force root to be within code lengths */
+    root = *bits;
+    for (max = MAXBITS; max >= 1; max--)
+        if (count[max] != 0) break;
+    if (root > max) root = max;
+    if (max == 0) {                     /* no symbols to code at all */
+        here.op = (unsigned char)64;    /* invalid code marker */
+        here.bits = (unsigned char)1;
+        here.val = (unsigned short)0;
+        *(*table)++ = here;             /* make a table to force an error */
+        *(*table)++ = here;
+        *bits = 1;
+        return 0;     /* no symbols, but wait for decoding to report error */
+    }
+    for (min = 1; min < max; min++)
+        if (count[min] != 0) break;
+    if (root < min) root = min;
+
+    /* check for an over-subscribed or incomplete set of lengths */
+    left = 1;
+    for (len = 1; len <= MAXBITS; len++) {
+        left <<= 1;
+        left -= count[len];
+        if (left < 0) return -1;        /* over-subscribed */
+    }
+    if (left > 0 && (type == CODES || max != 1))
+        return -1;                      /* incomplete set */
+
+    /* generate offsets into symbol table for each length for sorting */
+    offs[1] = 0;
+    for (len = 1; len < MAXBITS; len++)
+        offs[len + 1] = offs[len] + count[len];
+
+    /* sort symbols by length, by symbol order within each length */
+    for (sym = 0; sym < codes; sym++)
+        if (lens[sym] != 0) work[offs[lens[sym]]++] = (unsigned short)sym;
+
+    /*
+       Create and fill in decoding tables.  In this loop, the table being
+       filled is at next and has curr index bits.  The code being used is huff
+       with length len.  That code is converted to an index by dropping drop
+       bits off of the bottom.  For codes where len is less than drop + curr,
+       those top drop + curr - len bits are incremented through all values to
+       fill the table with replicated entries.
+
+       root is the number of index bits for the root table.  When len exceeds
+       root, sub-tables are created pointed to by the root entry with an index
+       of the low root bits of huff.  This is saved in low to check for when a
+       new sub-table should be started.  drop is zero when the root table is
+       being filled, and drop is root when sub-tables are being filled.
+
+       When a new sub-table is needed, it is necessary to look ahead in the
+       code lengths to determine what size sub-table is needed.  The length
+       counts are used for this, and so count[] is decremented as codes are
+       entered in the tables.
+
+       used keeps track of how many table entries have been allocated from the
+       provided *table space.  It is checked for LENS and DISTS tables against
+       the constants ENOUGH_LENS and ENOUGH_DISTS to guard against changes in
+       the initial root table size constants.  See the comments in inftrees.h
+       for more information.
+
+       sym increments through all symbols, and the loop terminates when
+       all codes of length max, i.e. all codes, have been processed.  This
+       routine permits incomplete codes, so another loop after this one fills
+       in the rest of the decoding tables with invalid code markers.
+     */
+
+    /* set up for code type */
+    switch (type) {
+    case CODES:
+        base = extra = work;    /* dummy value--not used */
+        match = 20;
+        break;
+    case LENS:
+        base = lbase;
+        extra = lext;
+        match = 257;
+        break;
+    default:    /* DISTS */
+        base = dbase;
+        extra = dext;
+        match = 0;
+    }
+
+    /* initialize state for loop */
+    huff = 0;                   /* starting code */
+    sym = 0;                    /* starting code symbol */
+    len = min;                  /* starting code length */
+    next = *table;              /* current table to fill in */
+    curr = root;                /* current table index bits */
+    drop = 0;                   /* current bits to drop from code for index */
+    low = (unsigned)(-1);       /* trigger new sub-table when len > root */
+    used = 1U << root;          /* use root table entries */
+    mask = used - 1;            /* mask for comparing low */
+
+    /* check available table space */
+    if ((type == LENS && used > ENOUGH_LENS) ||
+        (type == DISTS && used > ENOUGH_DISTS))
+        return 1;
+
+    /* process all codes and make table entries */
+    for (;;) {
+        /* create table entry */
+        here.bits = (unsigned char)(len - drop);
+        if (work[sym] + 1U < match) {
+            here.op = (unsigned char)0;
+            here.val = work[sym];
+        }
+        else if (work[sym] >= match) {
+            here.op = (unsigned char)(extra[work[sym] - match]);
+            here.val = base[work[sym] - match];
+        }
+        else {
+            here.op = (unsigned char)(32 + 64);         /* end of block */
+            here.val = 0;
+        }
+
+        /* replicate for those indices with low len bits equal to huff */
+        incr = 1U << (len - drop);
+        fill = 1U << curr;
+        min = fill;                 /* save offset to next table */
+        do {
+            fill -= incr;
+            next[(huff >> drop) + fill] = here;
+        } while (fill != 0);
+
+        /* backwards increment the len-bit code huff */
+        incr = 1U << (len - 1);
+        while (huff & incr)
+            incr >>= 1;
+        if (incr != 0) {
+            huff &= incr - 1;
+            huff += incr;
+        }
+        else
+            huff = 0;
+
+        /* go to next symbol, update count, len */
+        sym++;
+        if (--(count[len]) == 0) {
+            if (len == max) break;
+            len = lens[work[sym]];
+        }
+
+        /* create new sub-table if needed */
+        if (len > root && (huff & mask) != low) {
+            /* if first time, transition to sub-tables */
+            if (drop == 0)
+                drop = root;
+
+            /* increment past last table */
+            next += min;            /* here min is 1 << curr */
+
+            /* determine length of next table */
+            curr = len - drop;
+            left = (int)(1 << curr);
+            while (curr + drop < max) {
+                left -= count[curr + drop];
+                if (left <= 0) break;
+                curr++;
+                left <<= 1;
+            }
+
+            /* check for enough space */
+            used += 1U << curr;
+            if ((type == LENS && used > ENOUGH_LENS) ||
+                (type == DISTS && used > ENOUGH_DISTS))
+                return 1;
+
+            /* point entry in root table to sub-table */
+            low = huff & mask;
+            (*table)[low].op = (unsigned char)curr;
+            (*table)[low].bits = (unsigned char)root;
+            (*table)[low].val = (unsigned short)(next - *table);
+        }
+    }
+
+    /* fill in remaining table entry if code is incomplete (guaranteed to have
+       at most one remaining entry, since if the code is incomplete, the
+       maximum code length that was allowed to get this far is one bit) */
+    if (huff != 0) {
+        here.op = (unsigned char)64;            /* invalid code marker */
+        here.bits = (unsigned char)(len - drop);
+        here.val = (unsigned short)0;
+        next[huff] = here;
+    }
+
+    /* set return parameters */
+    *table += used;
+    *bits = root;
+    return 0;
+}
diff --git a/deps/SZ/zlib/inftrees.h b/deps/SZ/zlib/inftrees.h
new file mode 100644
index 0000000000000000000000000000000000000000..baa53a0b1a199ce6ea4c3f99d0306502ab4fab2c
--- /dev/null
+++ b/deps/SZ/zlib/inftrees.h
@@ -0,0 +1,62 @@
+/* inftrees.h -- header to use inftrees.c
+ * Copyright (C) 1995-2005, 2010 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+/* Structure for decoding tables.  Each entry provides either the
+   information needed to do the operation requested by the code that
+   indexed that table entry, or it provides a pointer to another
+   table that indexes more bits of the code.  op indicates whether
+   the entry is a pointer to another table, a literal, a length or
+   distance, an end-of-block, or an invalid code.  For a table
+   pointer, the low four bits of op is the number of index bits of
+   that table.  For a length or distance, the low four bits of op
+   is the number of extra bits to get after the code.  bits is
+   the number of bits in this code or part of the code to drop off
+   of the bit buffer.  val is the actual byte to output in the case
+   of a literal, the base length or distance, or the offset from
+   the current table to the next table.  Each entry is four bytes. */
+typedef struct {
+    unsigned char op;           /* operation, extra bits, table bits */
+    unsigned char bits;         /* bits in this part of the code */
+    unsigned short val;         /* offset in table or code value */
+} code;
+
+/* op values as set by inflate_table():
+    00000000 - literal
+    0000tttt - table link, tttt != 0 is the number of table index bits
+    0001eeee - length or distance, eeee is the number of extra bits
+    01100000 - end of block
+    01000000 - invalid code
+ */
+
+/* Maximum size of the dynamic table.  The maximum number of code structures is
+   1444, which is the sum of 852 for literal/length codes and 592 for distance
+   codes.  These values were found by exhaustive searches using the program
+   examples/enough.c found in the zlib distribution.  The arguments to that
+   program are the number of symbols, the initial root table size, and the
+   maximum bit length of a code.  "enough 286 9 15" for literal/length codes
+   returns 852, and "enough 30 6 15" for distance codes returns 592.
+   The initial root table size (9 or 6) is found in the fifth argument of the
+   inflate_table() calls in inflate.c and infback.c.  If the root table size is
+   changed, then these maximum sizes would need to be recalculated and
+   updated. */
+#define ENOUGH_LENS 852
+#define ENOUGH_DISTS 592
+#define ENOUGH (ENOUGH_LENS+ENOUGH_DISTS)
+
+/* Type of code to build for inflate_table() */
+typedef enum {
+    CODES,
+    LENS,
+    DISTS
+} codetype;
+
+int ZLIB_INTERNAL inflate_table OF((codetype type, unsigned short FAR *lens,
+                             unsigned codes, code FAR * FAR *table,
+                             unsigned FAR *bits, unsigned short FAR *work));
diff --git a/deps/SZ/zlib/trees.c b/deps/SZ/zlib/trees.c
new file mode 100644
index 0000000000000000000000000000000000000000..50cf4b4571cfec347ce5891b76fcb6675fcb580d
--- /dev/null
+++ b/deps/SZ/zlib/trees.c
@@ -0,0 +1,1203 @@
+/* trees.c -- output deflated data using Huffman coding
+ * Copyright (C) 1995-2017 Jean-loup Gailly
+ * detect_data_type() function provided freely by Cosmin Truta, 2006
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/*
+ *  ALGORITHM
+ *
+ *      The "deflation" process uses several Huffman trees. The more
+ *      common source values are represented by shorter bit sequences.
+ *
+ *      Each code tree is stored in a compressed form which is itself
+ * a Huffman encoding of the lengths of all the code strings (in
+ * ascending order by source values).  The actual code strings are
+ * reconstructed from the lengths in the inflate process, as described
+ * in the deflate specification.
+ *
+ *  REFERENCES
+ *
+ *      Deutsch, L.P.,"'Deflate' Compressed Data Format Specification".
+ *      Available in ftp.uu.net:/pub/archiving/zip/doc/deflate-1.1.doc
+ *
+ *      Storer, James A.
+ *          Data Compression:  Methods and Theory, pp. 49-50.
+ *          Computer Science Press, 1988.  ISBN 0-7167-8156-5.
+ *
+ *      Sedgewick, R.
+ *          Algorithms, p290.
+ *          Addison-Wesley, 1983. ISBN 0-201-06672-6.
+ */
+
+/* @(#) $Id$ */
+
+/* #define GEN_TREES_H */
+
+#include "deflate.h"
+
+#ifdef ZLIB_DEBUG
+#  include <ctype.h>
+#endif
+
+/* ===========================================================================
+ * Constants
+ */
+
+#define MAX_BL_BITS 7
+/* Bit length codes must not exceed MAX_BL_BITS bits */
+
+#define END_BLOCK 256
+/* end of block literal code */
+
+#define REP_3_6      16
+/* repeat previous bit length 3-6 times (2 bits of repeat count) */
+
+#define REPZ_3_10    17
+/* repeat a zero length 3-10 times  (3 bits of repeat count) */
+
+#define REPZ_11_138  18
+/* repeat a zero length 11-138 times  (7 bits of repeat count) */
+
+local const int extra_lbits[LENGTH_CODES] /* extra bits for each length code */
+   = {0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0};
+
+local const int extra_dbits[D_CODES] /* extra bits for each distance code */
+   = {0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
+
+local const int extra_blbits[BL_CODES]/* extra bits for each bit length code */
+   = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,7};
+
+local const uch bl_order[BL_CODES]
+   = {16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15};
+/* The lengths of the bit length codes are sent in order of decreasing
+ * probability, to avoid transmitting the lengths for unused bit length codes.
+ */
+
+/* ===========================================================================
+ * Local data. These are initialized only once.
+ */
+
+#define DIST_CODE_LEN  512 /* see definition of array dist_code below */
+
+#if defined(GEN_TREES_H) || !defined(STDC)
+/* non ANSI compilers may not accept trees.h */
+
+local ct_data static_ltree[L_CODES+2];
+/* The static literal tree. Since the bit lengths are imposed, there is no
+ * need for the L_CODES extra codes used during heap construction. However
+ * the codes 286 and 287 are needed to build a canonical tree (see _tr_init
+ * below).
+ */
+
+local ct_data static_dtree[D_CODES];
+/* The static distance tree. (Actually a trivial tree since all codes use
+ * 5 bits.)
+ */
+
+uch _dist_code[DIST_CODE_LEN];
+/* Distance codes. The first 256 values correspond to the distances
+ * 3 .. 258, the last 256 values correspond to the top 8 bits of
+ * the 15 bit distances.
+ */
+
+uch _length_code[MAX_MATCH-MIN_MATCH+1];
+/* length code for each normalized match length (0 == MIN_MATCH) */
+
+local int base_length[LENGTH_CODES];
+/* First normalized length for each code (0 = MIN_MATCH) */
+
+local int base_dist[D_CODES];
+/* First normalized distance for each code (0 = distance of 1) */
+
+#else
+#  include "trees.h"
+#endif /* GEN_TREES_H */
+
+struct static_tree_desc_s {
+    const ct_data *static_tree;  /* static tree or NULL */
+    const intf *extra_bits;      /* extra bits for each code or NULL */
+    int     extra_base;          /* base index for extra_bits */
+    int     elems;               /* max number of elements in the tree */
+    int     max_length;          /* max bit length for the codes */
+};
+
+local const static_tree_desc  static_l_desc =
+{static_ltree, extra_lbits, LITERALS+1, L_CODES, MAX_BITS};
+
+local const static_tree_desc  static_d_desc =
+{static_dtree, extra_dbits, 0,          D_CODES, MAX_BITS};
+
+local const static_tree_desc  static_bl_desc =
+{(const ct_data *)0, extra_blbits, 0,   BL_CODES, MAX_BL_BITS};
+
+/* ===========================================================================
+ * Local (static) routines in this file.
+ */
+
+local void tr_static_init OF((void));
+local void init_block     OF((deflate_state *s));
+local void pqdownheap     OF((deflate_state *s, ct_data *tree, int k));
+local void gen_bitlen     OF((deflate_state *s, tree_desc *desc));
+local void gen_codes      OF((ct_data *tree, int max_code, ushf *bl_count));
+local void build_tree     OF((deflate_state *s, tree_desc *desc));
+local void scan_tree      OF((deflate_state *s, ct_data *tree, int max_code));
+local void send_tree      OF((deflate_state *s, ct_data *tree, int max_code));
+local int  build_bl_tree  OF((deflate_state *s));
+local void send_all_trees OF((deflate_state *s, int lcodes, int dcodes,
+                              int blcodes));
+local void compress_block OF((deflate_state *s, const ct_data *ltree,
+                              const ct_data *dtree));
+local int  detect_data_type OF((deflate_state *s));
+local unsigned bi_reverse OF((unsigned value, int length));
+local void bi_windup      OF((deflate_state *s));
+local void bi_flush       OF((deflate_state *s));
+
+#ifdef GEN_TREES_H
+local void gen_trees_header OF((void));
+#endif
+
+#ifndef ZLIB_DEBUG
+#  define send_code(s, c, tree) send_bits(s, tree[c].Code, tree[c].Len)
+   /* Send a code of the given tree. c and tree must not have side effects */
+
+#else /* ZLIB_DEBUG */
+#  define send_code(s, c, tree) \
+     { if (z_verbose>2) fprintf(stderr,"\ncd %3d ",(c)); \
+       send_bits(s, tree[c].Code, tree[c].Len); }
+#endif
+
+/* ===========================================================================
+ * Output a short LSB first on the stream.
+ * IN assertion: there is enough room in pendingBuf.
+ */
+#define put_short(s, w) { \
+    put_byte(s, (uch)((w) & 0xff)); \
+    put_byte(s, (uch)((ush)(w) >> 8)); \
+}
+
+/* ===========================================================================
+ * Send a value on a given number of bits.
+ * IN assertion: length <= 16 and value fits in length bits.
+ */
+#ifdef ZLIB_DEBUG
+local void send_bits      OF((deflate_state *s, int value, int length));
+
+local void send_bits(s, value, length)
+    deflate_state *s;
+    int value;  /* value to send */
+    int length; /* number of bits */
+{
+    Tracevv((stderr," l %2d v %4x ", length, value));
+    Assert(length > 0 && length <= 15, "invalid length");
+    s->bits_sent += (ulg)length;
+
+    /* If not enough room in bi_buf, use (valid) bits from bi_buf and
+     * (16 - bi_valid) bits from value, leaving (width - (16-bi_valid))
+     * unused bits in value.
+     */
+    if (s->bi_valid > (int)Buf_size - length) {
+        s->bi_buf |= (ush)value << s->bi_valid;
+        put_short(s, s->bi_buf);
+        s->bi_buf = (ush)value >> (Buf_size - s->bi_valid);
+        s->bi_valid += length - Buf_size;
+    } else {
+        s->bi_buf |= (ush)value << s->bi_valid;
+        s->bi_valid += length;
+    }
+}
+#else /* !ZLIB_DEBUG */
+
+#define send_bits(s, value, length) \
+{ int len = length;\
+  if (s->bi_valid > (int)Buf_size - len) {\
+    int val = (int)value;\
+    s->bi_buf |= (ush)val << s->bi_valid;\
+    put_short(s, s->bi_buf);\
+    s->bi_buf = (ush)val >> (Buf_size - s->bi_valid);\
+    s->bi_valid += len - Buf_size;\
+  } else {\
+    s->bi_buf |= (ush)(value) << s->bi_valid;\
+    s->bi_valid += len;\
+  }\
+}
+#endif /* ZLIB_DEBUG */
+
+
+/* the arguments must not have side effects */
+
+/* ===========================================================================
+ * Initialize the various 'constant' tables.
+ */
+local void tr_static_init()
+{
+#if defined(GEN_TREES_H) || !defined(STDC)
+    static int static_init_done = 0;
+    int n;        /* iterates over tree elements */
+    int bits;     /* bit counter */
+    int length;   /* length value */
+    int code;     /* code value */
+    int dist;     /* distance index */
+    ush bl_count[MAX_BITS+1];
+    /* number of codes at each bit length for an optimal tree */
+
+    if (static_init_done) return;
+
+    /* For some embedded targets, global variables are not initialized: */
+#ifdef NO_INIT_GLOBAL_POINTERS
+    static_l_desc.static_tree = static_ltree;
+    static_l_desc.extra_bits = extra_lbits;
+    static_d_desc.static_tree = static_dtree;
+    static_d_desc.extra_bits = extra_dbits;
+    static_bl_desc.extra_bits = extra_blbits;
+#endif
+
+    /* Initialize the mapping length (0..255) -> length code (0..28) */
+    length = 0;
+    for (code = 0; code < LENGTH_CODES-1; code++) {
+        base_length[code] = length;
+        for (n = 0; n < (1<<extra_lbits[code]); n++) {
+            _length_code[length++] = (uch)code;
+        }
+    }
+    Assert (length == 256, "tr_static_init: length != 256");
+    /* Note that the length 255 (match length 258) can be represented
+     * in two different ways: code 284 + 5 bits or code 285, so we
+     * overwrite length_code[255] to use the best encoding:
+     */
+    _length_code[length-1] = (uch)code;
+
+    /* Initialize the mapping dist (0..32K) -> dist code (0..29) */
+    dist = 0;
+    for (code = 0 ; code < 16; code++) {
+        base_dist[code] = dist;
+        for (n = 0; n < (1<<extra_dbits[code]); n++) {
+            _dist_code[dist++] = (uch)code;
+        }
+    }
+    Assert (dist == 256, "tr_static_init: dist != 256");
+    dist >>= 7; /* from now on, all distances are divided by 128 */
+    for ( ; code < D_CODES; code++) {
+        base_dist[code] = dist << 7;
+        for (n = 0; n < (1<<(extra_dbits[code]-7)); n++) {
+            _dist_code[256 + dist++] = (uch)code;
+        }
+    }
+    Assert (dist == 256, "tr_static_init: 256+dist != 512");
+
+    /* Construct the codes of the static literal tree */
+    for (bits = 0; bits <= MAX_BITS; bits++) bl_count[bits] = 0;
+    n = 0;
+    while (n <= 143) static_ltree[n++].Len = 8, bl_count[8]++;
+    while (n <= 255) static_ltree[n++].Len = 9, bl_count[9]++;
+    while (n <= 279) static_ltree[n++].Len = 7, bl_count[7]++;
+    while (n <= 287) static_ltree[n++].Len = 8, bl_count[8]++;
+    /* Codes 286 and 287 do not exist, but we must include them in the
+     * tree construction to get a canonical Huffman tree (longest code
+     * all ones)
+     */
+    gen_codes((ct_data *)static_ltree, L_CODES+1, bl_count);
+
+    /* The static distance tree is trivial: */
+    for (n = 0; n < D_CODES; n++) {
+        static_dtree[n].Len = 5;
+        static_dtree[n].Code = bi_reverse((unsigned)n, 5);
+    }
+    static_init_done = 1;
+
+#  ifdef GEN_TREES_H
+    gen_trees_header();
+#  endif
+#endif /* defined(GEN_TREES_H) || !defined(STDC) */
+}
+
+/* ===========================================================================
+ * Generate the file trees.h describing the static trees.
+ */
+#ifdef GEN_TREES_H
+#  ifndef ZLIB_DEBUG
+#    include <stdio.h>
+#  endif
+
+#  define SEPARATOR(i, last, width) \
+      ((i) == (last)? "\n};\n\n" :    \
+       ((i) % (width) == (width)-1 ? ",\n" : ", "))
+
+void gen_trees_header()
+{
+    FILE *header = fopen("trees.h", "w");
+    int i;
+
+    if (header == NULL) { Assert(0, "Can't open trees.h"); return; }
+    fprintf(header,
+            "/* header created automatically with -DGEN_TREES_H */\n\n");
+
+    fprintf(header, "local const ct_data static_ltree[L_CODES+2] = {\n");
+    for (i = 0; i < L_CODES+2; i++) {
+        fprintf(header, "{{%3u},{%3u}}%s", static_ltree[i].Code,
+                static_ltree[i].Len, SEPARATOR(i, L_CODES+1, 5));
+    }
+
+    fprintf(header, "local const ct_data static_dtree[D_CODES] = {\n");
+    for (i = 0; i < D_CODES; i++) {
+        fprintf(header, "{{%2u},{%2u}}%s", static_dtree[i].Code,
+                static_dtree[i].Len, SEPARATOR(i, D_CODES-1, 5));
+    }
+
+    fprintf(header, "const uch ZLIB_INTERNAL _dist_code[DIST_CODE_LEN] = {\n");
+    for (i = 0; i < DIST_CODE_LEN; i++) {
+        fprintf(header, "%2u%s", _dist_code[i],
+                SEPARATOR(i, DIST_CODE_LEN-1, 20));
+    }
+
+    fprintf(header,
+        "const uch ZLIB_INTERNAL _length_code[MAX_MATCH-MIN_MATCH+1]= {\n");
+    for (i = 0; i < MAX_MATCH-MIN_MATCH+1; i++) {
+        fprintf(header, "%2u%s", _length_code[i],
+                SEPARATOR(i, MAX_MATCH-MIN_MATCH, 20));
+    }
+
+    fprintf(header, "local const int base_length[LENGTH_CODES] = {\n");
+    for (i = 0; i < LENGTH_CODES; i++) {
+        fprintf(header, "%1u%s", base_length[i],
+                SEPARATOR(i, LENGTH_CODES-1, 20));
+    }
+
+    fprintf(header, "local const int base_dist[D_CODES] = {\n");
+    for (i = 0; i < D_CODES; i++) {
+        fprintf(header, "%5u%s", base_dist[i],
+                SEPARATOR(i, D_CODES-1, 10));
+    }
+
+    fclose(header);
+}
+#endif /* GEN_TREES_H */
+
+/* ===========================================================================
+ * Initialize the tree data structures for a new zlib stream.
+ */
+void ZLIB_INTERNAL _tr_init(s)
+    deflate_state *s;
+{
+    tr_static_init();
+
+    s->l_desc.dyn_tree = s->dyn_ltree;
+    s->l_desc.stat_desc = &static_l_desc;
+
+    s->d_desc.dyn_tree = s->dyn_dtree;
+    s->d_desc.stat_desc = &static_d_desc;
+
+    s->bl_desc.dyn_tree = s->bl_tree;
+    s->bl_desc.stat_desc = &static_bl_desc;
+
+    s->bi_buf = 0;
+    s->bi_valid = 0;
+#ifdef ZLIB_DEBUG
+    s->compressed_len = 0L;
+    s->bits_sent = 0L;
+#endif
+
+    /* Initialize the first block of the first file: */
+    init_block(s);
+}
+
+/* ===========================================================================
+ * Initialize a new block.
+ */
+local void init_block(s)
+    deflate_state *s;
+{
+    int n; /* iterates over tree elements */
+
+    /* Initialize the trees. */
+    for (n = 0; n < L_CODES;  n++) s->dyn_ltree[n].Freq = 0;
+    for (n = 0; n < D_CODES;  n++) s->dyn_dtree[n].Freq = 0;
+    for (n = 0; n < BL_CODES; n++) s->bl_tree[n].Freq = 0;
+
+    s->dyn_ltree[END_BLOCK].Freq = 1;
+    s->opt_len = s->static_len = 0L;
+    s->last_lit = s->matches = 0;
+}
+
+#define SMALLEST 1
+/* Index within the heap array of least frequent node in the Huffman tree */
+
+
+/* ===========================================================================
+ * Remove the smallest element from the heap and recreate the heap with
+ * one less element. Updates heap and heap_len.
+ */
+#define pqremove(s, tree, top) \
+{\
+    top = s->heap[SMALLEST]; \
+    s->heap[SMALLEST] = s->heap[s->heap_len--]; \
+    pqdownheap(s, tree, SMALLEST); \
+}
+
+/* ===========================================================================
+ * Compares two subtrees, using the tree depth as tie breaker when
+ * the subtrees have equal frequency. This minimizes the worst case length.
+ */
+#define smaller(tree, n, m, depth) \
+   (tree[n].Freq < tree[m].Freq || \
+   (tree[n].Freq == tree[m].Freq && depth[n] <= depth[m]))
+
+/* ===========================================================================
+ * Sift the node at heap position k down until the heap property holds
+ * again, that is until it is smaller() than both of its sons or has no
+ * son left. The moved node is stored only once, at its final position.
+ *   s    - deflate state holding heap, heap_len and depth
+ *   tree - tree whose Freq fields order the heap
+ *   k    - heap position of the node to move down
+ */
+local void pqdownheap(deflate_state *s, ct_data *tree, int k)
+{
+    int moved = s->heap[k];   /* node being sifted down */
+    int son;                  /* heap position of the son to compare with */
+
+    /* son starts as the left son of k and follows k down the tree. */
+    for (son = k << 1; son <= s->heap_len; son <<= 1) {
+        /* Switch to the right son when it exists and is the smaller one. */
+        if (son < s->heap_len &&
+            smaller(tree, s->heap[son+1], s->heap[son], s->depth)) {
+            son++;
+        }
+
+        /* Stop as soon as the moved node is smaller than both sons. */
+        if (smaller(tree, moved, s->heap[son], s->depth)) break;
+
+        /* Otherwise pull the smaller son up one level and descend. */
+        s->heap[k] = s->heap[son];
+        k = son;
+    }
+    s->heap[k] = moved;
+}
+
+/* ===========================================================================
+ * Compute the bit lengths for the tree of desc, capped at the max_length of
+ * its static description, and update the total bit length of the block.
+ * IN assertion: the fields freq and dad are set, heap[heap_max] and
+ *    above are the tree nodes sorted by increasing frequency.
+ * OUT assertions: the field len is set to the optimal bit length, the
+ *     array bl_count contains the frequencies for each bit length.
+ *     The length opt_len is updated; static_len is also updated if stree is
+ *     not null.
+ */
+local void gen_bitlen(s, desc)
+    deflate_state *s;
+    tree_desc *desc;    /* the tree descriptor */
+{
+    ct_data *tree        = desc->dyn_tree;
+    int max_code         = desc->max_code; /* nodes above are not leaves */
+    const ct_data *stree = desc->stat_desc->static_tree; /* may be null */
+    const intf *extra    = desc->stat_desc->extra_bits;
+    int base             = desc->stat_desc->extra_base; /* extra[] origin */
+    int max_length       = desc->stat_desc->max_length; /* length cap */
+    int h;              /* heap index */
+    int n, m;           /* iterate over the tree elements */
+    int bits;           /* bit length */
+    int xbits;          /* extra bits */
+    ush f;              /* frequency */
+    int overflow = 0;   /* number of elements with bit length too large */
+
+    for (bits = 0; bits <= MAX_BITS; bits++) s->bl_count[bits] = 0;
+
+    /* In a first pass, compute the optimal bit lengths (which may
+     * overflow in the case of the bit length tree).
+     */
+    tree[s->heap[s->heap_max]].Len = 0; /* root of the heap */
+
+    for (h = s->heap_max+1; h < HEAP_SIZE; h++) {
+        n = s->heap[h];
+        bits = tree[tree[n].Dad].Len + 1; /* one level below the father */
+        if (bits > max_length) bits = max_length, overflow++;
+        tree[n].Len = (ush)bits;
+        /* We overwrite tree[n].Dad which is no longer needed */
+
+        if (n > max_code) continue; /* not a leaf node */
+
+        s->bl_count[bits]++;
+        xbits = 0;
+        if (n >= base) xbits = extra[n-base];
+        f = tree[n].Freq;
+        s->opt_len += (ulg)f * (unsigned)(bits + xbits);
+        if (stree) s->static_len += (ulg)f * (unsigned)(stree[n].Len + xbits);
+    }
+    if (overflow == 0) return; /* no length had to be clamped: done */
+
+    Tracev((stderr,"\nbit length overflow\n"));
+    /* This happens for example on obj2 and pic of the Calgary corpus */
+
+    /* Find the first bit length which could increase: */
+    do {
+        bits = max_length-1;
+        while (s->bl_count[bits] == 0) bits--;
+        s->bl_count[bits]--;      /* move one leaf down the tree */
+        s->bl_count[bits+1] += 2; /* move one overflow item as its brother */
+        s->bl_count[max_length]--;
+        /* The brother of the overflow item also moves one step up,
+         * but this does not affect bl_count[max_length]
+         */
+        overflow -= 2;
+    } while (overflow > 0);
+
+    /* Now recompute all bit lengths, scanning in increasing frequency.
+     * h is still equal to HEAP_SIZE. (It is simpler to reconstruct all
+     * lengths instead of fixing only the wrong ones. This idea is taken
+     * from 'ar' written by Haruhiko Okumura.)
+     */
+    for (bits = max_length; bits != 0; bits--) {
+        n = s->bl_count[bits]; /* leaves still to be given this length */
+        while (n != 0) {
+            m = s->heap[--h];
+            if (m > max_code) continue; /* not a leaf node */
+            if ((unsigned) tree[m].Len != (unsigned) bits) {
+                Tracev((stderr,"code %d bits %d->%d\n", m, tree[m].Len, bits));
+                s->opt_len += ((ulg)bits - tree[m].Len) * tree[m].Freq;
+                tree[m].Len = (ush)bits;
+            }
+            n--;
+        }
+    }
+}
+
+/* ===========================================================================
+ * Assign a code to every used element of a tree, given its code lengths
+ * and the number of codes of each length (which need not be optimal).
+ * Elements above max_code are left untouched.
+ * IN assertion: the array bl_count contains the bit length statistics for
+ * the given tree and the field Len is set for all tree elements.
+ * OUT assertion: the field Code is set for all tree elements of non
+ *     zero code length.
+ */
+local void gen_codes(ct_data *tree, int max_code, ushf *bl_count)
+{
+    ush next_code[MAX_BITS+1]; /* next code value for each bit length */
+    unsigned code = 0;         /* first code of the current bit length */
+    int bits;                  /* bit length being processed */
+    int n;                     /* tree element being processed */
+
+    /* Derive the first code of every bit length from the distribution
+     * counts; the values are not bit-reversed yet.
+     */
+    for (bits = 1; bits <= MAX_BITS; bits++) {
+        code += bl_count[bits-1];
+        code <<= 1;
+        next_code[bits] = (ush)code;
+    }
+
+    /* The counts are consistent only if the last code is all ones. */
+    Assert (code + bl_count[MAX_BITS]-1 == (1<<MAX_BITS)-1,
+            "inconsistent bit counts");
+    Tracev((stderr,"\ngen_codes: max_code %d ", max_code));
+
+    for (n = 0;  n <= max_code; n++) {
+        int len = tree[n].Len;
+
+        if (len != 0) {
+            /* Hand out the next code of this length, bit-reversed. */
+            tree[n].Code = (ush)bi_reverse(next_code[len]++, len);
+
+            Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
+                 n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
+        }
+    }
+}
+
+/* ===========================================================================
+ * Construct one Huffman tree and assign the code bit strings and lengths.
+ * Update the total bit length for the current block.
+ * IN assertion: the field freq is set for all tree elements.
+ * OUT assertions: the fields len and code are set to the optimal bit length
+ *     and corresponding code. The length opt_len is updated; static_len is
+ *     also updated if stree is not null. The field max_code is set.
+ */
+local void build_tree(s, desc)
+    deflate_state *s;
+    tree_desc *desc; /* the tree descriptor */
+{
+    ct_data *tree         = desc->dyn_tree;
+    const ct_data *stree  = desc->stat_desc->static_tree; /* may be null */
+    int elems             = desc->stat_desc->elems; /* elements scanned below */
+    int n, m;          /* iterate over heap elements */
+    int max_code = -1; /* largest code with non zero frequency */
+    int node;          /* new node being created */
+
+    /* Construct the initial heap, with least frequent element in
+     * heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n+1].
+     * heap[0] is not used.
+     */
+    s->heap_len = 0, s->heap_max = HEAP_SIZE;
+
+    for (n = 0; n < elems; n++) {
+        if (tree[n].Freq != 0) {
+            s->heap[++(s->heap_len)] = max_code = n;
+            s->depth[n] = 0;
+        } else {
+            tree[n].Len = 0; /* unused elements get a zero length */
+        }
+    }
+
+    /* The pkzip format requires that at least one distance code exists,
+     * and that at least one bit should be sent even if there is only one
+     * possible code. So to avoid special checks later on we force at least
+     * two codes of non zero frequency.
+     */
+    while (s->heap_len < 2) {
+        node = s->heap[++(s->heap_len)] = (max_code < 2 ? ++max_code : 0);
+        tree[node].Freq = 1;
+        s->depth[node] = 0;
+        s->opt_len--; if (stree) s->static_len -= stree[node].Len;
+        /* node is 0 or 1 so it does not have extra bits */
+    }
+    desc->max_code = max_code;
+
+    /* The elements heap[heap_len/2+1 .. heap_len] are leaves of the tree,
+     * establish sub-heaps of increasing lengths:
+     */
+    for (n = s->heap_len/2; n >= 1; n--) pqdownheap(s, tree, n);
+
+    /* Construct the Huffman tree by repeatedly combining the least two
+     * frequent nodes.
+     */
+    node = elems;              /* next internal node of the tree */
+    do {
+        pqremove(s, tree, n);  /* n = node of least frequency */
+        m = s->heap[SMALLEST]; /* m = node of next least frequency */
+
+        s->heap[--(s->heap_max)] = n; /* keep the nodes sorted by frequency */
+        s->heap[--(s->heap_max)] = m;
+
+        /* Create a new node father of n and m */
+        tree[node].Freq = tree[n].Freq + tree[m].Freq;
+        s->depth[node] = (uch)((s->depth[n] >= s->depth[m] ?
+                                s->depth[n] : s->depth[m]) + 1);
+        tree[n].Dad = tree[m].Dad = (ush)node;
+#ifdef DUMP_BL_TREE
+        if (tree == s->bl_tree) {
+            fprintf(stderr,"\nnode %d(%d), sons %d(%d) %d(%d)",
+                    node, tree[node].Freq, n, tree[n].Freq, m, tree[m].Freq);
+        }
+#endif
+        /* and insert the new node in the heap, in place of m */
+        s->heap[SMALLEST] = node++;
+        pqdownheap(s, tree, SMALLEST);
+
+    } while (s->heap_len >= 2);
+
+    s->heap[--(s->heap_max)] = s->heap[SMALLEST]; /* the root comes last */
+
+    /* At this point, the fields freq and dad are set. We can now
+     * generate the bit lengths.
+     */
+    gen_bitlen(s, (tree_desc *)desc);
+
+    /* The field len is now set, we can generate the bit codes */
+    gen_codes ((ct_data *)tree, max_code, s->bl_count);
+}
+
+/* ===========================================================================
+ * Scan the code lengths of a literal or distance tree and add to bl_tree the
+ * frequency of every length code and repeat code counted for its runs.
+ */
+local void scan_tree (s, tree, max_code)
+    deflate_state *s;
+    ct_data *tree;   /* the tree to be scanned */
+    int max_code;    /* and its largest code of non zero frequency */
+{
+    int n;                     /* iterates over all tree elements */
+    int prevlen = -1;          /* last emitted length */
+    int curlen;                /* length of current code */
+    int nextlen = tree[0].Len; /* length of next code */
+    int count = 0;             /* repeat count of the current code */
+    int max_count = 7;         /* max repeat count */
+    int min_count = 4;         /* min repeat count */
+
+    if (nextlen == 0) max_count = 138, min_count = 3;
+    tree[max_code+1].Len = (ush)0xffff; /* guard: ends the last run */
+
+    for (n = 0; n <= max_code; n++) {
+        curlen = nextlen; nextlen = tree[n+1].Len;
+        if (++count < max_count && curlen == nextlen) {
+            continue; /* the run goes on */
+        } else if (count < min_count) {
+            s->bl_tree[curlen].Freq += count; /* too short for a repeat code */
+        } else if (curlen != 0) {
+            if (curlen != prevlen) s->bl_tree[curlen].Freq++;
+            s->bl_tree[REP_3_6].Freq++;
+        } else if (count <= 10) {
+            s->bl_tree[REPZ_3_10].Freq++;
+        } else {
+            s->bl_tree[REPZ_11_138].Freq++;
+        }
+        /* Start a new run and pick its limits from the lengths around it. */
+        count = 0; prevlen = curlen;
+        if (nextlen == 0) {
+            max_count = 138, min_count = 3;
+        } else if (curlen == nextlen) {
+            max_count = 6, min_count = 3;
+        } else {
+            max_count = 7, min_count = 4;
+        }
+    }
+}
+
+/* ===========================================================================
+ * Send a literal or distance tree in compressed form, using the codes in
+ * bl_tree. Runs are split with the same rules as in scan_tree.
+ */
+local void send_tree (s, tree, max_code)
+    deflate_state *s;
+    ct_data *tree; /* the tree to be scanned */
+    int max_code;       /* and its largest code of non zero frequency */
+{
+    int n;                     /* iterates over all tree elements */
+    int prevlen = -1;          /* last emitted length */
+    int curlen;                /* length of current code */
+    int nextlen = tree[0].Len; /* length of next code */
+    int count = 0;             /* repeat count of the current code */
+    int max_count = 7;         /* max repeat count */
+    int min_count = 4;         /* min repeat count */
+
+    /* tree[max_code+1].Len = -1; */  /* guard already set by scan_tree */
+    if (nextlen == 0) max_count = 138, min_count = 3;
+
+    for (n = 0; n <= max_code; n++) {
+        curlen = nextlen; nextlen = tree[n+1].Len;
+        if (++count < max_count && curlen == nextlen) {
+            continue; /* the run goes on */
+        } else if (count < min_count) {
+            do { send_code(s, curlen, s->bl_tree); } while (--count != 0);
+
+        } else if (curlen != 0) {
+            if (curlen != prevlen) {
+                send_code(s, curlen, s->bl_tree); count--;
+            }
+            Assert(count >= 3 && count <= 6, " 3_6?");
+            send_code(s, REP_3_6, s->bl_tree); send_bits(s, count-3, 2);
+
+        } else if (count <= 10) {
+            send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count-3, 3);
+
+        } else {
+            send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count-11, 7);
+        }
+        /* Start a new run and pick its limits from the lengths around it. */
+        count = 0; prevlen = curlen;
+        if (nextlen == 0) {
+            max_count = 138, min_count = 3;
+        } else if (curlen == nextlen) {
+            max_count = 6, min_count = 3;
+        } else {
+            max_count = 7, min_count = 4;
+        }
+    }
+}
+
+/* ===========================================================================
+ * Build the Huffman tree used to encode the code lengths of the literal
+ * and distance trees. Returns the index in bl_order of the last bit length
+ * code to send; opt_len is updated to include that tree and the counts.
+ */
+local int build_bl_tree(deflate_state *s)
+{
+    int last = BL_CODES-1; /* candidate for the last bit length code sent */
+
+    /* Gather the bit length frequencies of both trees into bl_tree. */
+    scan_tree(s, (ct_data *)s->dyn_ltree, s->l_desc.max_code);
+    scan_tree(s, (ct_data *)s->dyn_dtree, s->d_desc.max_code);
+
+    build_tree(s, (tree_desc *)&s->bl_desc);
+    /* opt_len now includes the length of the tree representations, except
+     * the lengths of the bit length codes and the 5+5+4 bits for the counts.
+     */
+
+    /* Skip the trailing bit length codes that are unused. The pkzip format
+     * requires that at least 4 bit length codes be sent (appnote.txt says
+     * 3 but the actual value used is 4), so indexes below 3 are not examined.
+     */
+    while (last >= 3 && s->bl_tree[bl_order[last]].Len == 0) {
+        last--;
+    }
+
+    /* 3 bits per bit length code sent, plus the 5+5+4 bits of the counts. */
+    s->opt_len += 3*((ulg)last+1) + 5+5+4;
+    Tracev((stderr, "\ndyn trees: dyn %ld, stat %ld",
+            s->opt_len, s->static_len));
+
+    return last;
+}
+
+/* ===========================================================================
+ * Emit the header of a dynamic Huffman block: the three code counts, the
+ * lengths of the bit length codes in bl_order, then the literal tree and
+ * the distance tree. IN assertion: lcodes >= 257, dcodes >= 1, blcodes >= 4.
+ */
+local void send_all_trees(deflate_state *s, int lcodes, int dcodes,
+                          int blcodes)
+{
+    int pos = 0;                 /* position in bl_order */
+
+    Assert (lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes");
+    Assert (lcodes <= L_CODES && dcodes <= D_CODES && blcodes <= BL_CODES,
+            "too many codes");
+    Tracev((stderr, "\nbl counts: "));
+    send_bits(s, lcodes-257, 5); /* not +255 as stated in appnote.txt */
+    send_bits(s, dcodes-1,   5);
+    send_bits(s, blcodes-4,  4); /* not -3 as stated in appnote.txt */
+    while (pos < blcodes) {
+        Tracev((stderr, "\nbl code %2d ", bl_order[pos]));
+        send_bits(s, s->bl_tree[bl_order[pos]].Len, 3);
+        pos++;
+    }
+    Tracev((stderr, "\nbl tree: sent %ld", s->bits_sent));
+
+    send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */
+    Tracev((stderr, "\nlit tree: sent %ld", s->bits_sent));
+
+    send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */
+    Tracev((stderr, "\ndist tree: sent %ld", s->bits_sent));
+}
+
+/* ===========================================================================
+ * Send a stored block: the block type, padding to a byte boundary, the
+ * length and its one's complement, then stored_len bytes copied from buf.
+ * The copy is skipped for an empty block; last is one for the final block.
+ */
+void ZLIB_INTERNAL _tr_stored_block(deflate_state *s, charf *buf,
+                                    ulg stored_len, int last)
+{
+    send_bits(s, (STORED_BLOCK<<1)+last, 3);    /* send block type */
+    bi_windup(s);        /* align on byte boundary */
+    put_short(s, (ush)stored_len);
+    put_short(s, (ush)~stored_len);
+    if (stored_len)      /* never hand a null buf to the copy */
+        zmemcpy(s->pending_buf + s->pending, (Bytef *)buf, stored_len);
+    s->pending += stored_len;
+#ifdef ZLIB_DEBUG
+    s->compressed_len = (s->compressed_len + 3 + 7) & (ulg)~7L;
+    s->compressed_len += (stored_len + 4) << 3;
+    s->bits_sent += 2*16;
+    s->bits_sent += stored_len<<3;
+#endif
+}
+
+/* ===========================================================================
+ * Externally visible wrapper around bi_flush(): flush the bits in the bit
+ * buffer to pending output, leaving at most 7 bits behind.
+ */
+void ZLIB_INTERNAL _tr_flush_bits(deflate_state *s)
+{
+    bi_flush(s);
+}
+
+/* ===========================================================================
+ * Emit an empty static block (block type followed by END_BLOCK) to give
+ * enough lookahead for inflate, then flush the bit buffer. The block
+ * takes 10 bits, of which 7 may remain in the bit buffer.
+ */
+void ZLIB_INTERNAL _tr_align(deflate_state *s)
+{
+    send_bits(s, STATIC_TREES<<1, 3);      /* block type, not the last block */
+    send_code(s, END_BLOCK, static_ltree); /* and nothing but the EOB */
+#ifdef ZLIB_DEBUG
+    s->compressed_len += 10L; /* 3 for block type, 7 for EOB */
+#endif
+    bi_flush(s);
+}
+
+/* ===========================================================================
+ * Determine the best encoding for the current block: dynamic trees, static
+ * trees or store, write out the encoded block and start a new block.
+ */
+void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
+    deflate_state *s;
+    charf *buf;       /* input block, or NULL if too old */
+    ulg stored_len;   /* length of input block */
+    int last;         /* one if this is the last block for a file */
+{
+    ulg opt_lenb, static_lenb; /* opt_len and static_len in bytes */
+    int max_blindex = 0;  /* index of last bit length code of non zero freq */
+
+    /* Build the Huffman trees unless a stored block is forced */
+    if (s->level > 0) {
+
+        /* Check if the file is binary or text */
+        if (s->strm->data_type == Z_UNKNOWN)
+            s->strm->data_type = detect_data_type(s);
+
+        /* Construct the literal and distance trees */
+        build_tree(s, (tree_desc *)(&(s->l_desc)));
+        Tracev((stderr, "\nlit data: dyn %ld, stat %ld", s->opt_len,
+                s->static_len));
+
+        build_tree(s, (tree_desc *)(&(s->d_desc)));
+        Tracev((stderr, "\ndist data: dyn %ld, stat %ld", s->opt_len,
+                s->static_len));
+        /* At this point, opt_len and static_len are the total bit lengths of
+         * the compressed block data, excluding the tree representations.
+         */
+
+        /* Build the bit length tree for the above two trees, and get the index
+         * in bl_order of the last bit length code to send.
+         */
+        max_blindex = build_bl_tree(s);
+
+        /* Determine the best encoding. Compute the block lengths in bytes,
+         * rounded up and including the 3 bits of the block type. */
+        opt_lenb = (s->opt_len+3+7)>>3;
+        static_lenb = (s->static_len+3+7)>>3;
+
+        Tracev((stderr, "\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ",
+                opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len,
+                s->last_lit));
+
+        if (static_lenb <= opt_lenb) opt_lenb = static_lenb; /* the smaller */
+
+    } else {
+        Assert(buf != (char*)0, "lost buf");
+        opt_lenb = static_lenb = stored_len + 5; /* force a stored block */
+    }
+
+#ifdef FORCE_STORED
+    if (buf != (char*)0) { /* force stored block */
+#else
+    if (stored_len+4 <= opt_lenb && buf != (char*)0) {
+                       /* 4: two words for the lengths */
+#endif
+        /* The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE.
+         * Otherwise we can't have processed more than WSIZE input bytes since
+         * the last block flush, because compression would have been
+         * successful. If LIT_BUFSIZE <= WSIZE, it is never too late to
+         * transform a block into a stored block.
+         */
+        _tr_stored_block(s, buf, stored_len, last);
+
+#ifdef FORCE_STATIC
+    } else if (static_lenb >= 0) { /* force static trees */
+#else
+    } else if (s->strategy == Z_FIXED || static_lenb == opt_lenb) {
+#endif
+        /* Static trees: no tree description has to be sent. */
+        send_bits(s, (STATIC_TREES<<1)+last, 3);
+        compress_block(s, (const ct_data *)static_ltree,
+                       (const ct_data *)static_dtree);
+#ifdef ZLIB_DEBUG
+        s->compressed_len += 3 + s->static_len;
+#endif
+    } else {
+        /* Dynamic trees: the trees are sent ahead of the block data. */
+        send_bits(s, (DYN_TREES<<1)+last, 3);
+        send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1,
+                       max_blindex+1);
+        compress_block(s, (const ct_data *)s->dyn_ltree,
+                       (const ct_data *)s->dyn_dtree);
+#ifdef ZLIB_DEBUG
+        s->compressed_len += 3 + s->opt_len;
+#endif
+    }
+    Assert (s->compressed_len == s->bits_sent, "bad compressed size");
+    /* The above check is made mod 2^32, for files larger than 512 MB
+     * and uLong implemented on 32 bits.
+     */
+    init_block(s);
+
+    if (last) {
+        bi_windup(s);
+#ifdef ZLIB_DEBUG
+        s->compressed_len += 7;  /* align on byte boundary */
+#endif
+    }
+    Tracev((stderr,"\ncomprlen %lu(%lu) ", s->compressed_len>>3,
+           s->compressed_len-7*last));
+}
+
+/* ===========================================================================
+ * Record one literal (dist == 0) or match in d_buf/l_buf and tally its
+ * frequency counts. Return true if the current block must be flushed.
+ */
+int ZLIB_INTERNAL _tr_tally (s, dist, lc)
+    deflate_state *s;
+    unsigned dist;  /* distance of matched string */
+    unsigned lc;    /* match length-MIN_MATCH or unmatched char (if dist==0) */
+{
+    s->d_buf[s->last_lit] = (ush)dist;
+    s->l_buf[s->last_lit++] = (uch)lc;
+    if (dist == 0) {
+        /* lc is the unmatched char */
+        s->dyn_ltree[lc].Freq++;
+    } else {
+        s->matches++;
+        /* Here, lc is the match length - MIN_MATCH */
+        dist--;             /* dist = match distance - 1 */
+        Assert((ush)dist < (ush)MAX_DIST(s) &&
+               (ush)lc <= (ush)(MAX_MATCH-MIN_MATCH) &&
+               (ush)d_code(dist) < (ush)D_CODES,  "_tr_tally: bad match");
+
+        /* Length codes follow the literals and END_BLOCK in dyn_ltree. */
+        s->dyn_ltree[_length_code[lc]+LITERALS+1].Freq++;
+        s->dyn_dtree[d_code(dist)].Freq++;
+    }
+
+#ifdef TRUNCATE_BLOCK
+    /* Try to guess if it is profitable to stop the current block here */
+    if ((s->last_lit & 0x1fff) == 0 && s->level > 2) {
+        /* Compute an upper bound for the compressed length */
+        ulg out_length = (ulg)s->last_lit*8L;
+        ulg in_length = (ulg)((long)s->strstart - s->block_start);
+        int dcode;
+        for (dcode = 0; dcode < D_CODES; dcode++) {
+            out_length += (ulg)s->dyn_dtree[dcode].Freq *
+                (5L+extra_dbits[dcode]);
+        }
+        out_length >>= 3; /* bits to bytes */
+        Tracev((stderr,"\nlast_lit %u, in %ld, out ~%ld(%ld%%) ",
+               s->last_lit, in_length, out_length,
+               100L - out_length*100L/in_length));
+        if (s->matches < s->last_lit/2 && out_length < in_length/2) return 1;
+    }
+#endif
+    return (s->last_lit == s->lit_bufsize-1);
+    /* We avoid equality with lit_bufsize because of wraparound at 64K
+     * on 16 bit machines and because stored blocks are restricted to
+     * 64K-1 bytes.
+     */
+}
+
+/* ===========================================================================
+ * Send the symbols buffered in d_buf/l_buf using the given Huffman trees
+ */
+local void compress_block(s, ltree, dtree)
+    deflate_state *s;
+    const ct_data *ltree; /* literal tree */
+    const ct_data *dtree; /* distance tree */
+{
+    unsigned dist;      /* distance of matched string */
+    int lc;             /* match length or unmatched char (if dist == 0) */
+    unsigned lx = 0;    /* running index in l_buf */
+    unsigned code;      /* the code to send */
+    int extra;          /* number of extra bits to send */
+
+    if (s->last_lit != 0) do {
+        dist = s->d_buf[lx];
+        lc = s->l_buf[lx++];
+        if (dist == 0) {
+            send_code(s, lc, ltree); /* send a literal byte */
+            Tracecv(isgraph(lc), (stderr," '%c' ", lc));
+        } else {
+            /* Here, lc is the match length - MIN_MATCH */
+            code = _length_code[lc];
+            send_code(s, code+LITERALS+1, ltree); /* send the length code */
+            extra = extra_lbits[code];
+            if (extra != 0) {
+                lc -= base_length[code];
+                send_bits(s, lc, extra);       /* send the extra length bits */
+            }
+            dist--; /* dist is now the match distance - 1 */
+            code = d_code(dist);
+            Assert (code < D_CODES, "bad d_code");
+
+            send_code(s, code, dtree);       /* send the distance code */
+            extra = extra_dbits[code];
+            if (extra != 0) {
+                dist -= (unsigned)base_dist[code];
+                send_bits(s, dist, extra);   /* send the extra distance bits */
+            }
+        } /* literal or match pair ? */
+
+        /* Check that the overlay between pending_buf and d_buf+l_buf is ok: */
+        Assert((uInt)(s->pending) < s->lit_bufsize + 2*lx,
+               "pendingBuf overflow");
+
+    } while (lx < s->last_lit);
+
+    send_code(s, END_BLOCK, ltree); /* sent even when no symbol is buffered */
+}
+
+/* ===========================================================================
+ * Classify the data as Z_TEXT or Z_BINARY from the literal frequencies,
+ * using the following algorithm:
+ * - TEXT if the two conditions below are satisfied:
+ *    a) There are no non-portable control characters belonging to the
+ *       "black list" (0..6, 14..25, 28..31).
+ *    b) There is at least one printable character belonging to the
+ *       "white list" (9 {TAB}, 10 {LF}, 13 {CR}, 32..255).
+ * - BINARY otherwise.
+ * - The partially-portable control characters of the "gray list"
+ *   (7 {BEL}, 8 {BS}, 11 {VT}, 12 {FF}, 26 {SUB}, 27 {ESC}) are ignored.
+ * IN assertion: the fields Freq of dyn_ltree are set.
+ */
+local int detect_data_type(deflate_state *s)
+{
+    /* Bit i of black_list is set when byte value i is black-listed:
+     * bits 0..6, 14..25, and 28..31
+     * 0xf3ffc07f = binary 11110011111111111100000001111111
+     */
+    const unsigned long black_list = 0xf3ffc07fUL;
+    int lit;
+
+    /* Any black-listed byte makes the data binary. */
+    for (lit = 0; lit < 32; lit++) {
+        if (((black_list >> lit) & 1UL) == 0) continue;
+        if (s->dyn_ltree[lit].Freq != 0) return Z_BINARY;
+    }
+
+    /* Otherwise a single white-listed byte is enough to call it text. */
+    if (s->dyn_ltree[9].Freq != 0) return Z_TEXT;
+    if (s->dyn_ltree[10].Freq != 0) return Z_TEXT;
+    if (s->dyn_ltree[13].Freq != 0) return Z_TEXT;
+    for (lit = 32; lit < LITERALS; lit++) {
+        if (s->dyn_ltree[lit].Freq != 0) return Z_TEXT;
+    }
+
+    /* Neither black-listed nor white-listed bytes: the stream is empty or
+     * holds tolerated ("gray-listed") bytes only.
+     */
+    return Z_BINARY;
+}
+
+/* ===========================================================================
+ * Return the first len bits of code in reversed order, one bit at a time
+ * (a faster method would use a table).
+ * IN assertion: 1 <= len <= 15
+ */
+local unsigned bi_reverse(unsigned code, int len)
+{
+    unsigned reversed = 0;
+    for (;;) {
+        reversed |= code & 1;   /* append the lowest remaining bit */
+        reversed <<= 1;         /* make room for the next one */
+        code >>= 1;
+        if (--len <= 0) break;
+    }
+    return reversed >> 1;       /* undo the last, superfluous shift */
+}
+
+/* ===========================================================================
+ * Flush the complete bytes of the bit buffer, keeping at most 7 bits in it.
+ */
+local void bi_flush(deflate_state *s)
+{
+    if (s->bi_valid < 8) return;   /* no complete byte to write */
+    if (s->bi_valid != 16) {
+        put_byte(s, (Byte)s->bi_buf);
+        s->bi_valid -= 8;
+        s->bi_buf >>= 8;
+        return;
+    }
+    put_short(s, s->bi_buf);       /* the buffer is full: write both bytes */
+    s->bi_valid = 0;
+    s->bi_buf = 0;
+}
+
+/* ===========================================================================
+ * Write out what is left in the bit buffer, up to a byte boundary; empty it
+ */
+local void bi_windup(deflate_state *s)
+{
+    if (s->bi_valid > 0) {
+        if (s->bi_valid <= 8) {
+            put_byte(s, (Byte)s->bi_buf);
+        } else {
+            put_short(s, s->bi_buf);
+        }
+    }
+    s->bi_valid = s->bi_buf = 0;
+#ifdef ZLIB_DEBUG
+    s->bits_sent = (s->bits_sent+7) & ~7;
+#endif
+}
diff --git a/deps/SZ/zlib/trees.h b/deps/SZ/zlib/trees.h
new file mode 100644
index 0000000000000000000000000000000000000000..d35639d82a27807e49ea35c334f8bbcf64720f82
--- /dev/null
+++ b/deps/SZ/zlib/trees.h
@@ -0,0 +1,128 @@
+/* header created automatically with -DGEN_TREES_H */
+
+local const ct_data static_ltree[L_CODES+2] = {
+{{ 12},{  8}}, {{140},{  8}}, {{ 76},{  8}}, {{204},{  8}}, {{ 44},{  8}},
+{{172},{  8}}, {{108},{  8}}, {{236},{  8}}, {{ 28},{  8}}, {{156},{  8}},
+{{ 92},{  8}}, {{220},{  8}}, {{ 60},{  8}}, {{188},{  8}}, {{124},{  8}},
+{{252},{  8}}, {{  2},{  8}}, {{130},{  8}}, {{ 66},{  8}}, {{194},{  8}},
+{{ 34},{  8}}, {{162},{  8}}, {{ 98},{  8}}, {{226},{  8}}, {{ 18},{  8}},
+{{146},{  8}}, {{ 82},{  8}}, {{210},{  8}}, {{ 50},{  8}}, {{178},{  8}},
+{{114},{  8}}, {{242},{  8}}, {{ 10},{  8}}, {{138},{  8}}, {{ 74},{  8}},
+{{202},{  8}}, {{ 42},{  8}}, {{170},{  8}}, {{106},{  8}}, {{234},{  8}},
+{{ 26},{  8}}, {{154},{  8}}, {{ 90},{  8}}, {{218},{  8}}, {{ 58},{  8}},
+{{186},{  8}}, {{122},{  8}}, {{250},{  8}}, {{  6},{  8}}, {{134},{  8}},
+{{ 70},{  8}}, {{198},{  8}}, {{ 38},{  8}}, {{166},{  8}}, {{102},{  8}},
+{{230},{  8}}, {{ 22},{  8}}, {{150},{  8}}, {{ 86},{  8}}, {{214},{  8}},
+{{ 54},{  8}}, {{182},{  8}}, {{118},{  8}}, {{246},{  8}}, {{ 14},{  8}},
+{{142},{  8}}, {{ 78},{  8}}, {{206},{  8}}, {{ 46},{  8}}, {{174},{  8}},
+{{110},{  8}}, {{238},{  8}}, {{ 30},{  8}}, {{158},{  8}}, {{ 94},{  8}},
+{{222},{  8}}, {{ 62},{  8}}, {{190},{  8}}, {{126},{  8}}, {{254},{  8}},
+{{  1},{  8}}, {{129},{  8}}, {{ 65},{  8}}, {{193},{  8}}, {{ 33},{  8}},
+{{161},{  8}}, {{ 97},{  8}}, {{225},{  8}}, {{ 17},{  8}}, {{145},{  8}},
+{{ 81},{  8}}, {{209},{  8}}, {{ 49},{  8}}, {{177},{  8}}, {{113},{  8}},
+{{241},{  8}}, {{  9},{  8}}, {{137},{  8}}, {{ 73},{  8}}, {{201},{  8}},
+{{ 41},{  8}}, {{169},{  8}}, {{105},{  8}}, {{233},{  8}}, {{ 25},{  8}},
+{{153},{  8}}, {{ 89},{  8}}, {{217},{  8}}, {{ 57},{  8}}, {{185},{  8}},
+{{121},{  8}}, {{249},{  8}}, {{  5},{  8}}, {{133},{  8}}, {{ 69},{  8}},
+{{197},{  8}}, {{ 37},{  8}}, {{165},{  8}}, {{101},{  8}}, {{229},{  8}},
+{{ 21},{  8}}, {{149},{  8}}, {{ 85},{  8}}, {{213},{  8}}, {{ 53},{  8}},
+{{181},{  8}}, {{117},{  8}}, {{245},{  8}}, {{ 13},{  8}}, {{141},{  8}},
+{{ 77},{  8}}, {{205},{  8}}, {{ 45},{  8}}, {{173},{  8}}, {{109},{  8}},
+{{237},{  8}}, {{ 29},{  8}}, {{157},{  8}}, {{ 93},{  8}}, {{221},{  8}},
+{{ 61},{  8}}, {{189},{  8}}, {{125},{  8}}, {{253},{  8}}, {{ 19},{  9}},
+{{275},{  9}}, {{147},{  9}}, {{403},{  9}}, {{ 83},{  9}}, {{339},{  9}},
+{{211},{  9}}, {{467},{  9}}, {{ 51},{  9}}, {{307},{  9}}, {{179},{  9}},
+{{435},{  9}}, {{115},{  9}}, {{371},{  9}}, {{243},{  9}}, {{499},{  9}},
+{{ 11},{  9}}, {{267},{  9}}, {{139},{  9}}, {{395},{  9}}, {{ 75},{  9}},
+{{331},{  9}}, {{203},{  9}}, {{459},{  9}}, {{ 43},{  9}}, {{299},{  9}},
+{{171},{  9}}, {{427},{  9}}, {{107},{  9}}, {{363},{  9}}, {{235},{  9}},
+{{491},{  9}}, {{ 27},{  9}}, {{283},{  9}}, {{155},{  9}}, {{411},{  9}},
+{{ 91},{  9}}, {{347},{  9}}, {{219},{  9}}, {{475},{  9}}, {{ 59},{  9}},
+{{315},{  9}}, {{187},{  9}}, {{443},{  9}}, {{123},{  9}}, {{379},{  9}},
+{{251},{  9}}, {{507},{  9}}, {{  7},{  9}}, {{263},{  9}}, {{135},{  9}},
+{{391},{  9}}, {{ 71},{  9}}, {{327},{  9}}, {{199},{  9}}, {{455},{  9}},
+{{ 39},{  9}}, {{295},{  9}}, {{167},{  9}}, {{423},{  9}}, {{103},{  9}},
+{{359},{  9}}, {{231},{  9}}, {{487},{  9}}, {{ 23},{  9}}, {{279},{  9}},
+{{151},{  9}}, {{407},{  9}}, {{ 87},{  9}}, {{343},{  9}}, {{215},{  9}},
+{{471},{  9}}, {{ 55},{  9}}, {{311},{  9}}, {{183},{  9}}, {{439},{  9}},
+{{119},{  9}}, {{375},{  9}}, {{247},{  9}}, {{503},{  9}}, {{ 15},{  9}},
+{{271},{  9}}, {{143},{  9}}, {{399},{  9}}, {{ 79},{  9}}, {{335},{  9}},
+{{207},{  9}}, {{463},{  9}}, {{ 47},{  9}}, {{303},{  9}}, {{175},{  9}},
+{{431},{  9}}, {{111},{  9}}, {{367},{  9}}, {{239},{  9}}, {{495},{  9}},
+{{ 31},{  9}}, {{287},{  9}}, {{159},{  9}}, {{415},{  9}}, {{ 95},{  9}},
+{{351},{  9}}, {{223},{  9}}, {{479},{  9}}, {{ 63},{  9}}, {{319},{  9}},
+{{191},{  9}}, {{447},{  9}}, {{127},{  9}}, {{383},{  9}}, {{255},{  9}},
+{{511},{  9}}, {{  0},{  7}}, {{ 64},{  7}}, {{ 32},{  7}}, {{ 96},{  7}},
+{{ 16},{  7}}, {{ 80},{  7}}, {{ 48},{  7}}, {{112},{  7}}, {{  8},{  7}},
+{{ 72},{  7}}, {{ 40},{  7}}, {{104},{  7}}, {{ 24},{  7}}, {{ 88},{  7}},
+{{ 56},{  7}}, {{120},{  7}}, {{  4},{  7}}, {{ 68},{  7}}, {{ 36},{  7}},
+{{100},{  7}}, {{ 20},{  7}}, {{ 84},{  7}}, {{ 52},{  7}}, {{116},{  7}},
+{{  3},{  8}}, {{131},{  8}}, {{ 67},{  8}}, {{195},{  8}}, {{ 35},{  8}},
+{{163},{  8}}, {{ 99},{  8}}, {{227},{  8}}
+};
+
+local const ct_data static_dtree[D_CODES] = {
+{{ 0},{ 5}}, {{16},{ 5}}, {{ 8},{ 5}}, {{24},{ 5}}, {{ 4},{ 5}},
+{{20},{ 5}}, {{12},{ 5}}, {{28},{ 5}}, {{ 2},{ 5}}, {{18},{ 5}},
+{{10},{ 5}}, {{26},{ 5}}, {{ 6},{ 5}}, {{22},{ 5}}, {{14},{ 5}},
+{{30},{ 5}}, {{ 1},{ 5}}, {{17},{ 5}}, {{ 9},{ 5}}, {{25},{ 5}},
+{{ 5},{ 5}}, {{21},{ 5}}, {{13},{ 5}}, {{29},{ 5}}, {{ 3},{ 5}},
+{{19},{ 5}}, {{11},{ 5}}, {{27},{ 5}}, {{ 7},{ 5}}, {{23},{ 5}}
+};
+
+const uch ZLIB_INTERNAL _dist_code[DIST_CODE_LEN] = {
+ 0,  1,  2,  3,  4,  4,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  8,
+ 8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10,
+10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13,
+13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15,
+15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  0,  0, 16, 17,
+18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29
+};
+
+const uch ZLIB_INTERNAL _length_code[MAX_MATCH-MIN_MATCH+1]= {
+ 0,  1,  2,  3,  4,  5,  6,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 12, 12,
+13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
+19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
+22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
+26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28
+};
+
+local const int base_length[LENGTH_CODES] = {
+0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
+64, 80, 96, 112, 128, 160, 192, 224, 0
+};
+
+local const int base_dist[D_CODES] = {
+    0,     1,     2,     3,     4,     6,     8,    12,    16,    24,
+   32,    48,    64,    96,   128,   192,   256,   384,   512,   768,
+ 1024,  1536,  2048,  3072,  4096,  6144,  8192, 12288, 16384, 24576
+};
+
diff --git a/deps/SZ/zlib/uncompr.c b/deps/SZ/zlib/uncompr.c
new file mode 100644
index 0000000000000000000000000000000000000000..f03a1a865e347d10ac16f6a70b2bc2fdc5235f9c
--- /dev/null
+++ b/deps/SZ/zlib/uncompr.c
@@ -0,0 +1,93 @@
+/* uncompr.c -- decompress a memory buffer
+ * Copyright (C) 1995-2003, 2010, 2014, 2016 Jean-loup Gailly, Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#define ZLIB_INTERNAL
+#include "zlib.h"
+
+/* ===========================================================================
+     Decompresses the source buffer into the destination buffer.  *sourceLen is
+   the byte length of the source buffer. Upon entry, *destLen is the total size
+   of the destination buffer, which must be large enough to hold the entire
+   uncompressed data. (The size of the uncompressed data must have been saved
+   previously by the compressor and transmitted to the decompressor by some
+   mechanism outside the scope of this compression library.) Upon exit,
+   *destLen is the size of the decompressed data and *sourceLen is the number
+   of source bytes consumed. Upon return, source + *sourceLen points to the
+   first unused input byte.
+
+     uncompress returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_BUF_ERROR if there was not enough room in the output buffer, or
+   Z_DATA_ERROR if the input data was corrupted, including if the input data is
+   an incomplete zlib stream.
+*/
+int ZEXPORT uncompress2 (dest, destLen, source, sourceLen)
+    Bytef *dest;            /* output buffer */
+    uLongf *destLen;        /* in: capacity of dest; out: bytes written to dest */
+    const Bytef *source;    /* compressed input */
+    uLong *sourceLen;       /* in: input length; out: input bytes consumed */
+{
+    z_stream stream;
+    int err;
+    const uInt max = (uInt)-1;  /* largest uInt: cap for a single refill below */
+    uLong len, left;            /* input / output amounts not yet given to stream */
+    Byte buf[1];    /* for detection of incomplete stream when *destLen == 0 */
+
+    len = *sourceLen;
+    if (*destLen) {
+        left = *destLen;
+        *destLen = 0;           /* stays 0 if inflateInit fails below */
+    }
+    else {
+        left = 1;
+        dest = buf;             /* no caller room: decode into the 1-byte scratch */
+    }
+
+    stream.next_in = (z_const Bytef *)source;
+    stream.avail_in = 0;        /* input is supplied chunk by chunk in the loop */
+    stream.zalloc = (alloc_func)0;
+    stream.zfree = (free_func)0;
+    stream.opaque = (voidpf)0;
+
+    err = inflateInit(&stream);
+    if (err != Z_OK) return err;
+
+    stream.next_out = dest;
+    stream.avail_out = 0;       /* output room is granted chunk by chunk in the loop */
+
+    do {
+        if (stream.avail_out == 0) {    /* grant more room, at most max at a time */
+            stream.avail_out = left > (uLong)max ? max : (uInt)left;
+            left -= stream.avail_out;
+        }
+        if (stream.avail_in == 0) {     /* supply more input, at most max at a time */
+            stream.avail_in = len > (uLong)max ? max : (uInt)len;
+            len -= stream.avail_in;
+        }
+        err = inflate(&stream, Z_NO_FLUSH);
+    } while (err == Z_OK);      /* any other code, success or failure, ends the loop */
+
+    *sourceLen -= len + stream.avail_in;    /* subtract the input never consumed */
+    if (dest != buf)
+        *destLen = stream.total_out;
+    else if (stream.total_out && err == Z_BUF_ERROR)
+        left = 1;               /* makes left + avail_out nonzero in the test below */
+
+    inflateEnd(&stream);
+    return err == Z_STREAM_END ? Z_OK :
+           err == Z_NEED_DICT ? Z_DATA_ERROR  :
+           err == Z_BUF_ERROR && left + stream.avail_out ? Z_DATA_ERROR :
+           err;                 /* every other code is passed through unchanged */
+}
+
+int ZEXPORT uncompress (dest, destLen, source, sourceLen)
+    Bytef *dest;
+    uLongf *destLen;
+    const Bytef *source;
+    uLong sourceLen;    /* by value: the consumed-byte count is not reported back */
+{
+    return uncompress2(dest, destLen, source, &sourceLen); /* updates the local copy */
+}
diff --git a/deps/SZ/zlib/zconf.h b/deps/SZ/zlib/zconf.h
new file mode 100644
index 0000000000000000000000000000000000000000..77398c11a1e2c5cd2262559cc4c9d033d56b22eb
--- /dev/null
+++ b/deps/SZ/zlib/zconf.h
@@ -0,0 +1,534 @@
+/* zconf.h -- configuration of the zlib compression library
+ * Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#ifndef ZCONF_H
+#define ZCONF_H
+
+/*
+ * If you *really* need a unique prefix for all types and library functions,
+ * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it.
+ * Even better than compiling with -DZ_PREFIX would be to use configure to set
+ * this permanently in zconf.h using "./configure --zprefix".
+ */
+#ifdef Z_PREFIX     /* may be set to #if 1 by ./configure */
+#  define Z_PREFIX_SET
+
+/* all linked symbols and init macros */
+#  define _dist_code            z__dist_code
+#  define _length_code          z__length_code
+#  define _tr_align             z__tr_align
+#  define _tr_flush_bits        z__tr_flush_bits
+#  define _tr_flush_block       z__tr_flush_block
+#  define _tr_init              z__tr_init
+#  define _tr_stored_block      z__tr_stored_block
+#  define _tr_tally             z__tr_tally
+#  define adler32               z_adler32
+#  define adler32_combine       z_adler32_combine
+#  define adler32_combine64     z_adler32_combine64
+#  define adler32_z             z_adler32_z
+#  ifndef Z_SOLO
+#    define compress              z_compress
+#    define compress2             z_compress2
+#    define compressBound         z_compressBound
+#  endif
+#  define crc32                 z_crc32
+#  define crc32_combine         z_crc32_combine
+#  define crc32_combine64       z_crc32_combine64
+#  define crc32_z               z_crc32_z
+#  define deflate               z_deflate
+#  define deflateBound          z_deflateBound
+#  define deflateCopy           z_deflateCopy
+#  define deflateEnd            z_deflateEnd
+#  define deflateGetDictionary  z_deflateGetDictionary
+#  define deflateInit           z_deflateInit
+#  define deflateInit2          z_deflateInit2
+#  define deflateInit2_         z_deflateInit2_
+#  define deflateInit_          z_deflateInit_
+#  define deflateParams         z_deflateParams
+#  define deflatePending        z_deflatePending
+#  define deflatePrime          z_deflatePrime
+#  define deflateReset          z_deflateReset
+#  define deflateResetKeep      z_deflateResetKeep
+#  define deflateSetDictionary  z_deflateSetDictionary
+#  define deflateSetHeader      z_deflateSetHeader
+#  define deflateTune           z_deflateTune
+#  define deflate_copyright     z_deflate_copyright
+#  define get_crc_table         z_get_crc_table
+#  ifndef Z_SOLO
+#    define gz_error              z_gz_error
+#    define gz_intmax             z_gz_intmax
+#    define gz_strwinerror        z_gz_strwinerror
+#    define gzbuffer              z_gzbuffer
+#    define gzclearerr            z_gzclearerr
+#    define gzclose               z_gzclose
+#    define gzclose_r             z_gzclose_r
+#    define gzclose_w             z_gzclose_w
+#    define gzdirect              z_gzdirect
+#    define gzdopen               z_gzdopen
+#    define gzeof                 z_gzeof
+#    define gzerror               z_gzerror
+#    define gzflush               z_gzflush
+#    define gzfread               z_gzfread
+#    define gzfwrite              z_gzfwrite
+#    define gzgetc                z_gzgetc
+#    define gzgetc_               z_gzgetc_
+#    define gzgets                z_gzgets
+#    define gzoffset              z_gzoffset
+#    define gzoffset64            z_gzoffset64
+#    define gzopen                z_gzopen
+#    define gzopen64              z_gzopen64
+#    ifdef _WIN32
+#      define gzopen_w              z_gzopen_w
+#    endif
+#    define gzprintf              z_gzprintf
+#    define gzputc                z_gzputc
+#    define gzputs                z_gzputs
+#    define gzread                z_gzread
+#    define gzrewind              z_gzrewind
+#    define gzseek                z_gzseek
+#    define gzseek64              z_gzseek64
+#    define gzsetparams           z_gzsetparams
+#    define gztell                z_gztell
+#    define gztell64              z_gztell64
+#    define gzungetc              z_gzungetc
+#    define gzvprintf             z_gzvprintf
+#    define gzwrite               z_gzwrite
+#  endif
+#  define inflate               z_inflate
+#  define inflateBack           z_inflateBack
+#  define inflateBackEnd        z_inflateBackEnd
+#  define inflateBackInit       z_inflateBackInit
+#  define inflateBackInit_      z_inflateBackInit_
+#  define inflateCodesUsed      z_inflateCodesUsed
+#  define inflateCopy           z_inflateCopy
+#  define inflateEnd            z_inflateEnd
+#  define inflateGetDictionary  z_inflateGetDictionary
+#  define inflateGetHeader      z_inflateGetHeader
+#  define inflateInit           z_inflateInit
+#  define inflateInit2          z_inflateInit2
+#  define inflateInit2_         z_inflateInit2_
+#  define inflateInit_          z_inflateInit_
+#  define inflateMark           z_inflateMark
+#  define inflatePrime          z_inflatePrime
+#  define inflateReset          z_inflateReset
+#  define inflateReset2         z_inflateReset2
+#  define inflateResetKeep      z_inflateResetKeep
+#  define inflateSetDictionary  z_inflateSetDictionary
+#  define inflateSync           z_inflateSync
+#  define inflateSyncPoint      z_inflateSyncPoint
+#  define inflateUndermine      z_inflateUndermine
+#  define inflateValidate       z_inflateValidate
+#  define inflate_copyright     z_inflate_copyright
+#  define inflate_fast          z_inflate_fast
+#  define inflate_table         z_inflate_table
+#  ifndef Z_SOLO
+#    define uncompress            z_uncompress
+#    define uncompress2           z_uncompress2
+#  endif
+#  define zError                z_zError
+#  ifndef Z_SOLO
+#    define zcalloc               z_zcalloc
+#    define zcfree                z_zcfree
+#  endif
+#  define zlibCompileFlags      z_zlibCompileFlags
+#  define zlibVersion           z_zlibVersion
+
+/* all zlib typedefs in zlib.h and zconf.h */
+#  define Byte                  z_Byte
+#  define Bytef                 z_Bytef
+#  define alloc_func            z_alloc_func
+#  define charf                 z_charf
+#  define free_func             z_free_func
+#  ifndef Z_SOLO
+#    define gzFile                z_gzFile
+#  endif
+#  define gz_header             z_gz_header
+#  define gz_headerp            z_gz_headerp
+#  define in_func               z_in_func
+#  define intf                  z_intf
+#  define out_func              z_out_func
+#  define uInt                  z_uInt
+#  define uIntf                 z_uIntf
+#  define uLong                 z_uLong
+#  define uLongf                z_uLongf
+#  define voidp                 z_voidp
+#  define voidpc                z_voidpc
+#  define voidpf                z_voidpf
+
+/* all zlib structs in zlib.h and zconf.h */
+#  define gz_header_s           z_gz_header_s
+#  define internal_state        z_internal_state
+
+#endif
+
+#if defined(__MSDOS__) && !defined(MSDOS)
+#  define MSDOS
+#endif
+#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2)
+#  define OS2
+#endif
+#if defined(_WINDOWS) && !defined(WINDOWS)
+#  define WINDOWS
+#endif
+#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__)
+#  ifndef WIN32
+#    define WIN32
+#  endif
+#endif
+#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32)
+#  if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__)
+#    ifndef SYS16BIT
+#      define SYS16BIT
+#    endif
+#  endif
+#endif
+
+/*
+ * Compile with -DMAXSEG_64K if the alloc function cannot allocate more
+ * than 64k bytes at a time (needed on systems with 16-bit int).
+ */
+#ifdef SYS16BIT
+#  define MAXSEG_64K
+#endif
+#ifdef MSDOS
+#  define UNALIGNED_OK
+#endif
+
+#ifdef __STDC_VERSION__
+#  ifndef STDC
+#    define STDC
+#  endif
+#  if __STDC_VERSION__ >= 199901L
+#    ifndef STDC99
+#      define STDC99
+#    endif
+#  endif
+#endif
+#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus))
+#  define STDC
+#endif
+#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__))
+#  define STDC
+#endif
+#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32))
+#  define STDC
+#endif
+#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__))
+#  define STDC
+#endif
+
+#if defined(__OS400__) && !defined(STDC)    /* iSeries (formerly AS/400). */
+#  define STDC
+#endif
+
+#ifndef STDC
+#  ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */
+#    define const       /* note: need a more gentle solution here */
+#  endif
+#endif
+
+#if defined(ZLIB_CONST) && !defined(z_const)
+#  define z_const const   /* opt-in: only with ZLIB_CONST does z_const mean const */
+#else
+#  define z_const         /* otherwise z_const expands to nothing */
+#endif
+
+#ifdef Z_SOLO
+   typedef unsigned long z_size_t;
+#else
+#  define z_longlong long long   /* exists only until the #undef at the end of this block */
+#  if defined(NO_SIZE_T)         /* NOTE(review): presumably NO_SIZE_T may name z_longlong - confirm */
+     typedef unsigned NO_SIZE_T z_size_t;
+#  elif defined(STDC)
+#    include <stddef.h>          /* for size_t */
+     typedef size_t z_size_t;
+#  else
+     typedef unsigned long z_size_t;
+#  endif
+#  undef z_longlong
+#endif
+
+/* Maximum value for memLevel in deflateInit2 */
+#ifndef MAX_MEM_LEVEL
+#  ifdef MAXSEG_64K
+#    define MAX_MEM_LEVEL 8
+#  else
+#    define MAX_MEM_LEVEL 9
+#  endif
+#endif
+
+/* Maximum value for windowBits in deflateInit2 and inflateInit2.
+ * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files
+ * created by gzip. (Files created by minigzip can still be extracted by
+ * gzip.)
+ */
+#ifndef MAX_WBITS
+#  define MAX_WBITS   15 /* 32K LZ77 window */
+#endif
+
+/* The memory requirements for deflate are (in bytes):
+            (1 << (windowBits+2)) +  (1 << (memLevel+9))
+ that is: 128K for windowBits=15  +  128K for memLevel = 8  (default values)
+ plus a few kilobytes for small objects. For example, if you want to reduce
+ the default memory requirements from 256K to 128K, compile with
+     make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7"
+ Of course this will generally degrade compression (there's no free lunch).
+
+   The memory requirements for inflate are (in bytes) 1 << windowBits
+ that is, 32K for windowBits=15 (default value) plus about 7 kilobytes
+ for small objects.
+*/
+
+                        /* Type declarations */
+
+#ifndef OF /* function prototypes */
+#  ifdef STDC
+#    define OF(args)  args
+#  else
+#    define OF(args)  ()
+#  endif
+#endif
+
+#ifndef Z_ARG /* function prototypes for stdarg */
+#  if defined(STDC) || defined(Z_HAVE_STDARG_H)
+#    define Z_ARG(args)  args
+#  else
+#    define Z_ARG(args)  ()
+#  endif
+#endif
+
+/* The following definitions for FAR are needed only for MSDOS mixed
+ * model programming (small or medium model with some far allocations).
+ * This was tested only with MSC; for other MSDOS compilers you may have
+ * to define NO_MEMCPY in zutil.h.  If you don't need the mixed model,
+ * just define FAR to be empty.
+ */
+#ifdef SYS16BIT
+#  if defined(M_I86SM) || defined(M_I86MM)
+     /* MSC small or medium model */
+#    define SMALL_MEDIUM
+#    ifdef _MSC_VER
+#      define FAR _far
+#    else
+#      define FAR far
+#    endif
+#  endif
+#  if (defined(__SMALL__) || defined(__MEDIUM__))
+     /* Turbo C small or medium model */
+#    define SMALL_MEDIUM
+#    ifdef __BORLANDC__
+#      define FAR _far
+#    else
+#      define FAR far
+#    endif
+#  endif
+#endif
+
+#if defined(WINDOWS) || defined(WIN32)
+   /* If building or using zlib as a DLL, define ZLIB_DLL.
+    * This is not mandatory, but it offers a little performance increase.
+    */
+#  ifdef ZLIB_DLL
+#    if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500))
+#      ifdef ZLIB_INTERNAL
+#        define ZEXTERN extern __declspec(dllexport)
+#      else
+#        define ZEXTERN extern __declspec(dllimport)
+#      endif
+#    endif
+#  endif  /* ZLIB_DLL */
+   /* If building or using zlib with the WINAPI/WINAPIV calling convention,
+    * define ZLIB_WINAPI.
+    * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI.
+    */
+#  ifdef ZLIB_WINAPI
+#    ifdef FAR
+#      undef FAR
+#    endif
+#    include <windows.h>
+     /* No need for _export, use ZLIB.DEF instead. */
+     /* For complete Windows compatibility, use WINAPI, not __stdcall. */
+#    define ZEXPORT WINAPI
+#    ifdef WIN32
+#      define ZEXPORTVA WINAPIV
+#    else
+#      define ZEXPORTVA FAR CDECL
+#    endif
+#  endif
+#endif
+
+#if defined (__BEOS__)
+#  ifdef ZLIB_DLL
+#    ifdef ZLIB_INTERNAL
+#      define ZEXPORT   __declspec(dllexport)
+#      define ZEXPORTVA __declspec(dllexport)
+#    else
+#      define ZEXPORT   __declspec(dllimport)
+#      define ZEXPORTVA __declspec(dllimport)
+#    endif
+#  endif
+#endif
+
+#ifndef ZEXTERN
+#  define ZEXTERN extern
+#endif
+#ifndef ZEXPORT
+#  define ZEXPORT
+#endif
+#ifndef ZEXPORTVA
+#  define ZEXPORTVA
+#endif
+
+#ifndef FAR
+#  define FAR
+#endif
+
+#if !defined(__MACTYPES__)
+typedef unsigned char  Byte;  /* 8 bits */
+#endif
+typedef unsigned int   uInt;  /* 16 bits or more */
+typedef unsigned long  uLong; /* 32 bits or more */
+
+#ifdef SMALL_MEDIUM
+   /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */
+#  define Bytef Byte FAR
+#else
+   typedef Byte  FAR Bytef;
+#endif
+typedef char  FAR charf;
+typedef int   FAR intf;
+typedef uInt  FAR uIntf;
+typedef uLong FAR uLongf;
+
+#ifdef STDC
+   typedef void const *voidpc;
+   typedef void FAR   *voidpf;
+   typedef void       *voidp;
+#else
+   typedef Byte const *voidpc;
+   typedef Byte FAR   *voidpf;
+   typedef Byte       *voidp;
+#endif
+
+#if !defined(Z_U4) && !defined(Z_SOLO) && defined(STDC)
+#  include <limits.h>              /* for UINT_MAX, ULONG_MAX, USHRT_MAX */
+#  if (UINT_MAX == 0xffffffffUL)   /* first type whose maximum is 0xffffffff wins */
+#    define Z_U4 unsigned
+#  elif (ULONG_MAX == 0xffffffffUL)
+#    define Z_U4 unsigned long
+#  elif (USHRT_MAX == 0xffffffffUL)
+#    define Z_U4 unsigned short
+#  endif                           /* no match: Z_U4 stays undefined */
+#endif
+
+#ifdef Z_U4
+   typedef Z_U4 z_crc_t;
+#else
+   typedef unsigned long z_crc_t;  /* fallback when Z_U4 is not defined */
+#endif
+
+#if 1    /* was set to #if 1 by ./configure */
+#  define Z_HAVE_UNISTD_H
+#endif
+
+#if 1    /* was set to #if 1 by ./configure */
+#  define Z_HAVE_STDARG_H
+#endif
+
+#ifdef STDC
+#  ifndef Z_SOLO
+#    include <sys/types.h>      /* for off_t */
+#  endif
+#endif
+
+#if defined(STDC) || defined(Z_HAVE_STDARG_H)
+#  ifndef Z_SOLO
+#    include <stdarg.h>         /* for va_list */
+#  endif
+#endif
+
+#ifdef _WIN32
+#  ifndef Z_SOLO
+#    include <stddef.h>         /* for wchar_t */
+#  endif
+#endif
+
+/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and
+ * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even
+ * though the former does not conform to the LFS document), but considering
+ * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as
+ * equivalently requesting no 64-bit operations
+ */
+#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1
+#  undef _LARGEFILE64_SOURCE
+#endif
+
+#if defined(__WATCOMC__) && !defined(Z_HAVE_UNISTD_H)
+#  define Z_HAVE_UNISTD_H
+#endif
+#ifndef Z_SOLO
+#  if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE)
+#    include <unistd.h>         /* for SEEK_*, off_t, and _LFS64_LARGEFILE */
+#    ifdef VMS
+#      include <unixio.h>       /* for off_t */
+#    endif
+#    ifndef z_off_t
+#      define z_off_t off_t
+#    endif
+#  endif
+#endif
+
+#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0
+#  define Z_LFS64
+#endif
+
+#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64)
+#  define Z_LARGE64
+#endif
+
+#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64)
+#  define Z_WANT64
+#endif
+
+#if !defined(SEEK_SET) && !defined(Z_SOLO)
+#  define SEEK_SET        0       /* Seek from beginning of file.  */
+#  define SEEK_CUR        1       /* Seek from current position.  */
+#  define SEEK_END        2       /* Set file pointer to EOF plus "offset" */
+#endif
+
+#ifndef z_off_t
+#  define z_off_t long            /* fallback when nothing earlier defined z_off_t */
+#endif
+
+#if !defined(_WIN32) && defined(Z_LARGE64)
+#  define z_off64_t off64_t       /* non-Windows build with Z_LARGE64 set */
+#else
+#  if defined(_WIN32) && !defined(__GNUC__) && !defined(Z_SOLO)
+#    define z_off64_t __int64     /* Windows, non-GNU compiler, not Z_SOLO */
+#  else
+#    define z_off64_t z_off_t     /* otherwise the same type as z_off_t */
+#  endif
+#endif
+
+/* MVS linker does not support external names larger than 8 bytes */
+#if defined(__MVS__)
+  #pragma map(deflateInit_,"DEIN")
+  #pragma map(deflateInit2_,"DEIN2")
+  #pragma map(deflateEnd,"DEEND")
+  #pragma map(deflateBound,"DEBND")
+  #pragma map(inflateInit_,"ININ")
+  #pragma map(inflateInit2_,"ININ2")
+  #pragma map(inflateEnd,"INEND")
+  #pragma map(inflateSync,"INSY")
+  #pragma map(inflateSetDictionary,"INSEDI")
+  #pragma map(compressBound,"CMBND")
+  #pragma map(inflate_table,"INTABL")
+  #pragma map(inflate_fast,"INFA")
+  #pragma map(inflate_copyright,"INCOPY")
+#endif
+
+#endif /* ZCONF_H */
diff --git a/deps/SZ/zlib/zlib.h b/deps/SZ/zlib/zlib.h
new file mode 100644
index 0000000000000000000000000000000000000000..f09cdaf1e0543de911d8220befdb51fa8632a9e6
--- /dev/null
+++ b/deps/SZ/zlib/zlib.h
@@ -0,0 +1,1912 @@
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+  version 1.2.11, January 15th, 2017
+
+  Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Jean-loup Gailly        Mark Adler
+  jloup@gzip.org          madler@alumni.caltech.edu
+
+
+  The data format used by the zlib library is described by RFCs (Request for
+  Comments) 1950 to 1952 in the files http://tools.ietf.org/html/rfc1950
+  (zlib format), rfc1951 (deflate format) and rfc1952 (gzip format).
+*/
+
+#ifndef ZLIB_H
+#define ZLIB_H
+
+#include "zconf.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZLIB_VERSION "1.2.11"       /* version string; applications can compare it with zlibVersion() */
+#define ZLIB_VERNUM 0x12b0          /* hex digits 1, 2, b, 0 match major, minor, revision, subrevision below */
+#define ZLIB_VER_MAJOR 1
+#define ZLIB_VER_MINOR 2
+#define ZLIB_VER_REVISION 11        /* 11 decimal, i.e. the "b" digit of ZLIB_VERNUM */
+#define ZLIB_VER_SUBREVISION 0
+
+/*
+    The 'zlib' compression library provides in-memory compression and
+  decompression functions, including integrity checks of the uncompressed data.
+  This version of the library supports only one compression method (deflation)
+  but other algorithms will be added later and will have the same stream
+  interface.
+
+    Compression can be done in a single step if the buffers are large enough,
+  or can be done by repeated calls of the compression function.  In the latter
+  case, the application must provide more input and/or consume the output
+  (providing more output space) before each call.
+
+    The compressed data format used by default by the in-memory functions is
+  the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped
+  around a deflate stream, which is itself documented in RFC 1951.
+
+    The library also supports reading and writing files in gzip (.gz) format
+  with an interface similar to that of stdio using the functions that start
+  with "gz".  The gzip format is different from the zlib format.  gzip is a
+  gzip wrapper, documented in RFC 1952, wrapped around a deflate stream.
+
+    This library can optionally read and write gzip and raw deflate streams in
+  memory as well.
+
+    The zlib format was designed to be compact and fast for use in memory
+  and on communications channels.  The gzip format was designed for single-
+  file compression on file systems, has a larger header than zlib to maintain
+  directory information, and uses a different, slower check method than zlib.
+
+    The library does not install any signal handler.  The decoder checks
+  the consistency of the compressed data, so the library should never crash
+  even in the case of corrupted input.
+*/
+
+typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size)); /* allocator callback type of z_stream.zalloc */
+typedef void   (*free_func)  OF((voidpf opaque, voidpf address));         /* release callback type of z_stream.zfree */
+
+struct internal_state; /* forward declaration only; z_stream.state points to it and applications do not see its layout */
+
+typedef struct z_stream_s {     /* stream descriptor handed to deflate(), inflate() and related calls via z_streamp */
+    z_const Bytef *next_in;     /* next input byte */
+    uInt     avail_in;  /* number of bytes available at next_in */
+    uLong    total_in;  /* total number of input bytes read so far */
+
+    Bytef    *next_out; /* next output byte will go here */
+    uInt     avail_out; /* remaining free space at next_out */
+    uLong    total_out; /* total number of bytes output so far */
+
+    z_const char *msg;  /* last error message, NULL if no error */
+    struct internal_state FAR *state; /* not visible by applications */
+
+    alloc_func zalloc;  /* used to allocate the internal state */
+    free_func  zfree;   /* used to free the internal state */
+    voidpf     opaque;  /* private data object passed to zalloc and zfree */
+
+    int     data_type;  /* best guess about the data type: binary or text
+                           for deflate, or the decoding state for inflate */
+    uLong   adler;      /* Adler-32 or CRC-32 value of the uncompressed data */
+    uLong   reserved;   /* reserved for future use */
+} z_stream;
+
+typedef z_stream FAR *z_streamp; /* pointer to a z_stream; the first parameter of the stream functions below */
+
+/*
+     gzip header information passed to and from zlib routines.  See RFC 1952
+  for more details on the meanings of these fields.
+*/
+typedef struct gz_header_s {    /* see RFC 1952 for the meaning of each field */
+    int     text;       /* true if compressed data believed to be text */
+    uLong   time;       /* modification time */
+    int     xflags;     /* extra flags (not used when writing a gzip file) */
+    int     os;         /* operating system */
+    Bytef   *extra;     /* pointer to extra field or Z_NULL if none */
+    uInt    extra_len;  /* extra field length (valid if extra != Z_NULL) */
+    uInt    extra_max;  /* space at extra (only when reading header) */
+    Bytef   *name;      /* pointer to zero-terminated file name or Z_NULL */
+    uInt    name_max;   /* space at name (only when reading header) */
+    Bytef   *comment;   /* pointer to zero-terminated comment or Z_NULL */
+    uInt    comm_max;   /* space at comment (only when reading header) */
+    int     hcrc;       /* true if there was or will be a header crc */
+    int     done;       /* true when done reading gzip header (not used
+                           when writing a gzip file) */
+} gz_header;
+
+typedef gz_header FAR *gz_headerp; /* pointer to a gz_header */
+
+/*
+     The application must update next_in and avail_in when avail_in has dropped
+   to zero.  It must update next_out and avail_out when avail_out has dropped
+   to zero.  The application must initialize zalloc, zfree and opaque before
+   calling the init function.  All other fields are set by the compression
+   library and must not be updated by the application.
+
+     The opaque value provided by the application will be passed as the first
+   parameter for calls of zalloc and zfree.  This can be useful for custom
+   memory management.  The compression library attaches no meaning to the
+   opaque value.
+
+     zalloc must return Z_NULL if there is not enough memory for the object.
+   If zlib is used in a multi-threaded application, zalloc and zfree must be
+   thread safe.  In that case, zlib is thread-safe.  When zalloc and zfree are
+   Z_NULL on entry to the initialization function, they are set to internal
+   routines that use the standard library functions malloc() and free().
+
+     On 16-bit systems, the functions zalloc and zfree must be able to allocate
+   exactly 65536 bytes, but will not be required to allocate more than this if
+   the symbol MAXSEG_64K is defined (see zconf.h).  WARNING: On MSDOS, pointers
+   returned by zalloc for objects of exactly 65536 bytes *must* have their
+   offset normalized to zero.  The default allocation function provided by this
+   library ensures this (see zutil.c).  To reduce memory requirements and avoid
+   any allocation of 64K objects, at the expense of compression ratio, compile
+   the library with -DMAX_WBITS=14 (see zconf.h).
+
+     The fields total_in and total_out can be used for statistics or progress
+   reports.  After compression, total_in holds the total size of the
+   uncompressed data and may be saved for use by the decompressor (particularly
+   if the decompressor wants to decompress everything in a single step).
+*/
+
+                        /* constants */
+
+#define Z_NO_FLUSH      0  /* deflate decides how much data to accumulate before producing output */
+#define Z_PARTIAL_FLUSH 1  /* flush all pending output, not aligned to a byte boundary */
+#define Z_SYNC_FLUSH    2  /* flush all pending output, aligned on a byte boundary */
+#define Z_FULL_FLUSH    3  /* as Z_SYNC_FLUSH, and the compression state is reset */
+#define Z_FINISH        4  /* process pending input and flush pending output to end the stream */
+#define Z_BLOCK         5  /* deflate: complete and emit a block; inflate: stop at the next block boundary */
+#define Z_TREES         6  /* inflate: as Z_BLOCK, but also return at the end of each block header */
+/* Allowed flush values; see deflate() and inflate() below for details */
+
+#define Z_OK            0     /* success, or some progress has been made */
+#define Z_STREAM_END    1     /* end of the stream reached and all output produced */
+#define Z_NEED_DICT     2     /* inflate needs a preset dictionary at this point */
+#define Z_ERRNO        (-1)
+#define Z_STREAM_ERROR (-2)   /* inconsistent stream state or invalid parameter */
+#define Z_DATA_ERROR   (-3)   /* inflate: corrupted input; deflateEnd: stream freed prematurely */
+#define Z_MEM_ERROR    (-4)   /* not enough memory */
+#define Z_BUF_ERROR    (-5)   /* no progress possible; not fatal, the call can be repeated with more space/input */
+#define Z_VERSION_ERROR (-6)  /* library version incompatible with the version assumed by the caller */
+/* Return codes for the compression/decompression functions. Negative values
+ * are errors, positive values are used for special but normal events.
+ */
+
+#define Z_NO_COMPRESSION         0     /* input is simply copied a block at a time */
+#define Z_BEST_SPEED             1
+#define Z_BEST_COMPRESSION       9
+#define Z_DEFAULT_COMPRESSION  (-1)    /* default speed/compression compromise (currently equivalent to level 6) */
+/* compression levels */
+
+#define Z_FILTERED            1        /* for data produced by a filter (or predictor) */
+#define Z_HUFFMAN_ONLY        2        /* Huffman encoding only, no string match */
+#define Z_RLE                 3        /* limit match distances to one (run-length encoding) */
+#define Z_FIXED               4        /* prevents the use of dynamic Huffman codes */
+#define Z_DEFAULT_STRATEGY    0        /* for normal data */
+/* compression strategy; see deflateInit2() below for details */
+
+#define Z_BINARY   0        /* also what deflate() assumes when in doubt about the data type */
+#define Z_TEXT     1
+#define Z_ASCII    Z_TEXT   /* for compatibility with 1.2.2 and earlier */
+#define Z_UNKNOWN  2
+/* Possible values of the data_type field for deflate() */
+
+#define Z_DEFLATED   8
+/* The deflate compression method (the only one supported in this version); value for the method parameter of deflateInit2() */
+
+#define Z_NULL  0  /* for initializing zalloc, zfree, opaque */
+
+#define zlib_version zlibVersion()
+/* for compatibility with versions < 1.0.2; expands to a call of zlibVersion() */
+
+
+                        /* basic functions */
+
+ZEXTERN const char * ZEXPORT zlibVersion OF((void));
+/* The application can compare zlibVersion and ZLIB_VERSION for consistency.
+   If the first character differs, the library code actually used is not
+   compatible with the zlib.h header file used by the application.  This check
+   is automatically made by deflateInit and inflateInit.
+ */
+
+/*
+ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level));
+
+     Initializes the internal stream state for compression.  The fields
+   zalloc, zfree and opaque must be initialized before by the caller.  If
+   zalloc and zfree are set to Z_NULL, deflateInit updates them to use default
+   allocation functions.
+
+     The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
+   1 gives best speed, 9 gives best compression, 0 gives no compression at all
+   (the input data is simply copied a block at a time).  Z_DEFAULT_COMPRESSION
+   requests a default compromise between speed and compression (currently
+   equivalent to level 6).
+
+     deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_STREAM_ERROR if level is not a valid compression level, or
+   Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible
+   with the version assumed by the caller (ZLIB_VERSION).  msg is set to null
+   if there is no error message.  deflateInit does not perform any compression:
+   this will be done by deflate().
+*/
+
+
+ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));
+/*
+    deflate compresses as much data as possible, and stops when the input
+  buffer becomes empty or the output buffer becomes full.  It may introduce
+  some output latency (reading input without producing any output) except when
+  forced to flush.
+
+    The detailed semantics are as follows.  deflate performs one or both of the
+  following actions:
+
+  - Compress more input starting at next_in and update next_in and avail_in
+    accordingly.  If not all input can be processed (because there is not
+    enough room in the output buffer), next_in and avail_in are updated and
+    processing will resume at this point for the next call of deflate().
+
+  - Generate more output starting at next_out and update next_out and avail_out
+    accordingly.  This action is forced if the parameter flush is non zero.
+    Forcing flush frequently degrades the compression ratio, so this parameter
+    should be set only when necessary.  Some output may be provided even if
+    flush is zero.
+
+    Before the call of deflate(), the application should ensure that at least
+  one of the actions is possible, by providing more input and/or consuming more
+  output, and updating avail_in or avail_out accordingly; avail_out should
+  never be zero before the call.  The application can consume the compressed
+  output when it wants, for example when the output buffer is full (avail_out
+  == 0), or after each call of deflate().  If deflate returns Z_OK and with
+  zero avail_out, it must be called again after making room in the output
+  buffer because there might be more output pending. See deflatePending(),
+  which can be used if desired to determine whether or not there is more output
+  in that case.
+
+    Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to
+  decide how much data to accumulate before producing output, in order to
+  maximize compression.
+
+    If the parameter flush is set to Z_SYNC_FLUSH, all pending output is
+  flushed to the output buffer and the output is aligned on a byte boundary, so
+  that the decompressor can get all input data available so far.  (In
+  particular avail_in is zero after the call if enough output space has been
+  provided before the call.) Flushing may degrade compression for some
+  compression algorithms and so it should be used only when necessary.  This
+  completes the current deflate block and follows it with an empty stored block
+  that is three bits plus filler bits to the next byte, followed by four bytes
+  (00 00 ff ff).
+
+    If flush is set to Z_PARTIAL_FLUSH, all pending output is flushed to the
+  output buffer, but the output is not aligned to a byte boundary.  All of the
+  input data so far will be available to the decompressor, as for Z_SYNC_FLUSH.
+  This completes the current deflate block and follows it with an empty fixed
+  codes block that is 10 bits long.  This assures that enough bytes are output
+  in order for the decompressor to finish the block before the empty fixed
+  codes block.
+
+    If flush is set to Z_BLOCK, a deflate block is completed and emitted, as
+  for Z_SYNC_FLUSH, but the output is not aligned on a byte boundary, and up to
+  seven bits of the current block are held to be written as the next byte after
+  the next deflate block is completed.  In this case, the decompressor may not
+  be provided enough bits at this point in order to complete decompression of
+  the data provided so far to the compressor.  It may need to wait for the next
+  block to be emitted.  This is for advanced applications that need to control
+  the emission of deflate blocks.
+
+    If flush is set to Z_FULL_FLUSH, all output is flushed as with
+  Z_SYNC_FLUSH, and the compression state is reset so that decompression can
+  restart from this point if previous compressed data has been damaged or if
+  random access is desired.  Using Z_FULL_FLUSH too often can seriously degrade
+  compression.
+
+    If deflate returns with avail_out == 0, this function must be called again
+  with the same value of the flush parameter and more output space (updated
+  avail_out), until the flush is complete (deflate returns with non-zero
+  avail_out).  In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
+  avail_out is greater than six to avoid repeated flush markers due to
+  avail_out == 0 on return.
+
+    If the parameter flush is set to Z_FINISH, pending input is processed,
+  pending output is flushed and deflate returns with Z_STREAM_END if there was
+  enough output space.  If deflate returns with Z_OK or Z_BUF_ERROR, this
+  function must be called again with Z_FINISH and more output space (updated
+  avail_out) but no more input data, until it returns with Z_STREAM_END or an
+  error.  After deflate has returned Z_STREAM_END, the only possible operations
+  on the stream are deflateReset or deflateEnd.
+
+    Z_FINISH can be used in the first deflate call after deflateInit if all the
+  compression is to be done in a single step.  In order to complete in one
+  call, avail_out must be at least the value returned by deflateBound (see
+  below).  Then deflate is guaranteed to return Z_STREAM_END.  If not enough
+  output space is provided, deflate will not return Z_STREAM_END, and it must
+  be called again as described above.
+
+    deflate() sets strm->adler to the Adler-32 checksum of all input read
+  so far (that is, total_in bytes).  If a gzip stream is being generated, then
+  strm->adler will be the CRC-32 checksum of the input read so far.  (See
+  deflateInit2 below.)
+
+    deflate() may update strm->data_type if it can make a good guess about
+  the input data type (Z_BINARY or Z_TEXT).  If in doubt, the data is
+  considered binary.  This field is only for information purposes and does not
+  affect the compression algorithm in any manner.
+
+    deflate() returns Z_OK if some progress has been made (more input
+  processed or more output produced), Z_STREAM_END if all input has been
+  consumed and all output has been produced (only when flush is set to
+  Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
+  if next_in or next_out was Z_NULL or the state was inadvertently written over
+  by the application), or Z_BUF_ERROR if no progress is possible (for example
+  avail_in or avail_out was zero).  Note that Z_BUF_ERROR is not fatal, and
+  deflate() can be called again with more input and more output space to
+  continue compressing.
+*/
+
+
+ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));
+/*
+     All dynamically allocated data structures for this stream are freed.
+   This function discards any unprocessed input and does not flush any pending
+   output.
+
+     deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the
+   stream state was inconsistent, Z_DATA_ERROR if the stream was freed
+   prematurely (some input or output was discarded).  In the error case, msg
+   may be set but then points to a static string (which must not be
+   deallocated).
+*/
+
+
+/*
+ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));
+
+     Initializes the internal stream state for decompression.  The fields
+   next_in, avail_in, zalloc, zfree and opaque must be initialized before by
+   the caller.  In the current version of inflate, the provided input is not
+   read or consumed.  The allocation of a sliding window will be deferred to
+   the first call of inflate (if the decompression does not complete on the
+   first call).  If zalloc and zfree are set to Z_NULL, inflateInit updates
+   them to use default allocation functions.
+
+     inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+   version assumed by the caller, or Z_STREAM_ERROR if the parameters are
+   invalid, such as a null pointer to the structure.  msg is set to null if
+   there is no error message.  inflateInit does not perform any decompression.
+   Actual decompression will be done by inflate().  So next_in, avail_in,
+   next_out, and avail_out are unused and unchanged.  The current
+   implementation of inflateInit() does not process any header information --
+   that is deferred until inflate() is called.
+*/
+
+
+ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush));
+/*
+    inflate decompresses as much data as possible, and stops when the input
+  buffer becomes empty or the output buffer becomes full.  It may introduce
+  some output latency (reading input without producing any output) except when
+  forced to flush.
+
+  The detailed semantics are as follows.  inflate performs one or both of the
+  following actions:
+
+  - Decompress more input starting at next_in and update next_in and avail_in
+    accordingly.  If not all input can be processed (because there is not
+    enough room in the output buffer), then next_in and avail_in are updated
+    accordingly, and processing will resume at this point for the next call of
+    inflate().
+
+  - Generate more output starting at next_out and update next_out and avail_out
+    accordingly.  inflate() provides as much output as possible, until there is
+    no more input data or no more space in the output buffer (see below about
+    the flush parameter).
+
+    Before the call of inflate(), the application should ensure that at least
+  one of the actions is possible, by providing more input and/or consuming more
+  output, and updating the next_* and avail_* values accordingly.  If the
+  caller of inflate() does not provide both available input and available
+  output space, it is possible that there will be no progress made.  The
+  application can consume the uncompressed output when it wants, for example
+  when the output buffer is full (avail_out == 0), or after each call of
+  inflate().  If inflate returns Z_OK and with zero avail_out, it must be
+  called again after making room in the output buffer because there might be
+  more output pending.
+
+    The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, Z_FINISH,
+  Z_BLOCK, or Z_TREES.  Z_SYNC_FLUSH requests that inflate() flush as much
+  output as possible to the output buffer.  Z_BLOCK requests that inflate()
+  stop if and when it gets to the next deflate block boundary.  When decoding
+  the zlib or gzip format, this will cause inflate() to return immediately
+  after the header and before the first block.  When doing a raw inflate,
+  inflate() will go ahead and process the first block, and will return when it
+  gets to the end of that block, or when it runs out of data.
+
+    The Z_BLOCK option assists in appending to or combining deflate streams.
+  To assist in this, on return inflate() always sets strm->data_type to the
+  number of unused bits in the last byte taken from strm->next_in, plus 64 if
+  inflate() is currently decoding the last block in the deflate stream, plus
+  128 if inflate() returned immediately after decoding an end-of-block code or
+  decoding the complete header up to just before the first byte of the deflate
+  stream.  The end-of-block will not be indicated until all of the uncompressed
+  data from that block has been written to strm->next_out.  The number of
+  unused bits may in general be greater than seven, except when bit 7 of
+  data_type is set, in which case the number of unused bits will be less than
+  eight.  data_type is set as noted here every time inflate() returns for all
+  flush options, and so can be used to determine the amount of currently
+  consumed input in bits.
+
+    The Z_TREES option behaves as Z_BLOCK does, but it also returns when the
+  end of each deflate block header is reached, before any actual data in that
+  block is decoded.  This allows the caller to determine the length of the
+  deflate block header for later use in random access within a deflate block.
+  256 is added to the value of strm->data_type when inflate() returns
+  immediately after reaching the end of the deflate block header.
+
+    inflate() should normally be called until it returns Z_STREAM_END or an
+  error.  However if all decompression is to be performed in a single step (a
+  single call of inflate), the parameter flush should be set to Z_FINISH.  In
+  this case all pending input is processed and all pending output is flushed;
+  avail_out must be large enough to hold all of the uncompressed data for the
+  operation to complete.  (The size of the uncompressed data may have been
+  saved by the compressor for this purpose.)  The use of Z_FINISH is not
+  required to perform an inflation in one step.  However it may be used to
+  inform inflate that a faster approach can be used for the single inflate()
+  call.  Z_FINISH also informs inflate to not maintain a sliding window if the
+  stream completes, which reduces inflate's memory footprint.  If the stream
+  does not complete, either because not all of the stream is provided or not
+  enough output space is provided, then a sliding window will be allocated and
+  inflate() can be called again to continue the operation as if Z_NO_FLUSH had
+  been used.
+
+     In this implementation, inflate() always flushes as much output as
+  possible to the output buffer, and always uses the faster approach on the
+  first call.  So the effects of the flush parameter in this implementation are
+  on the return value of inflate() as noted below, when inflate() returns early
+  when Z_BLOCK or Z_TREES is used, and when inflate() avoids the allocation of
+  memory for a sliding window when Z_FINISH is used.
+
+     If a preset dictionary is needed after this call (see inflateSetDictionary
+  below), inflate sets strm->adler to the Adler-32 checksum of the dictionary
+  chosen by the compressor and returns Z_NEED_DICT; otherwise it sets
+  strm->adler to the Adler-32 checksum of all output produced so far (that is,
+  total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described
+  below.  At the end of the stream, inflate() checks that its computed Adler-32
+  checksum is equal to that saved by the compressor and returns Z_STREAM_END
+  only if the checksum is correct.
+
+    inflate() can decompress and check either zlib-wrapped or gzip-wrapped
+  deflate data.  The header type is detected automatically, if requested when
+  initializing with inflateInit2().  Any information contained in the gzip
+  header is not retained unless inflateGetHeader() is used.  When processing
+  gzip-wrapped deflate data, strm->adler is set to the CRC-32 of the output
+  produced so far.  The CRC-32 is checked against the gzip trailer, as is the
+  uncompressed length, modulo 2^32.
+
+    inflate() returns Z_OK if some progress has been made (more input processed
+  or more output produced), Z_STREAM_END if the end of the compressed data has
+  been reached and all uncompressed output has been produced, Z_NEED_DICT if a
+  preset dictionary is needed at this point, Z_DATA_ERROR if the input data was
+  corrupted (input stream not conforming to the zlib format or incorrect check
+  value, in which case strm->msg points to a string with a more specific
+  error), Z_STREAM_ERROR if the stream structure was inconsistent (for example
+  next_in or next_out was Z_NULL, or the state was inadvertently written over
+  by the application), Z_MEM_ERROR if there was not enough memory, Z_BUF_ERROR
+  if no progress was possible or if there was not enough room in the output
+  buffer when Z_FINISH is used.  Note that Z_BUF_ERROR is not fatal, and
+  inflate() can be called again with more input and more output space to
+  continue decompressing.  If Z_DATA_ERROR is returned, the application may
+  then call inflateSync() to look for a good compression block if a partial
+  recovery of the data is to be attempted.
+*/
+
+
+ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm));
+/*
+     All dynamically allocated data structures for this stream are freed.
+   This function discards any unprocessed input and does not flush any pending
+   output.
+
+     inflateEnd returns Z_OK if success, or Z_STREAM_ERROR if the stream state
+   was inconsistent.
+*/
+
+
+                        /* Advanced functions */
+
+/*
+    The following functions are needed only in some special applications.
+*/
+
+/*
+ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm,
+                                     int  level,
+                                     int  method,
+                                     int  windowBits,
+                                     int  memLevel,
+                                     int  strategy));
+
+     This is another version of deflateInit with more compression options.  The
+   fields next_in, zalloc, zfree and opaque must be initialized before by the
+   caller.
+
+     The method parameter is the compression method.  It must be Z_DEFLATED in
+   this version of the library.
+
+     The windowBits parameter is the base two logarithm of the window size
+   (the size of the history buffer).  It should be in the range 8..15 for this
+   version of the library.  Larger values of this parameter result in better
+   compression at the expense of memory usage.  The default value is 15 if
+   deflateInit is used instead.
+
+     For the current implementation of deflate(), a windowBits value of 8 (a
+   window size of 256 bytes) is not supported.  As a result, a request for 8
+   will result in 9 (a 512-byte window).  In that case, providing 8 to
+   inflateInit2() will result in an error when the zlib header with 9 is
+   checked against the initialization of inflate().  The remedy is to not use 8
+   with deflateInit2() with this initialization, or at least in that case use 9
+   with inflateInit2().
+
+     windowBits can also be -8..-15 for raw deflate.  In this case, -windowBits
+   determines the window size.  deflate() will then generate raw deflate data
+   with no zlib header or trailer, and will not compute a check value.
+
+     windowBits can also be greater than 15 for optional gzip encoding.  Add
+   16 to windowBits to write a simple gzip header and trailer around the
+   compressed data instead of a zlib wrapper.  The gzip header will have no
+   file name, no extra data, no comment, no modification time (set to zero), no
+   header crc, and the operating system will be set to the appropriate value,
+   if the operating system was determined at compile time.  If a gzip stream is
+   being written, strm->adler is a CRC-32 instead of an Adler-32.
+
+     For raw deflate or gzip encoding, a request for a 256-byte window is
+   rejected as invalid, since only the zlib header provides a means of
+   transmitting the window size to the decompressor.
+
+     The memLevel parameter specifies how much memory should be allocated
+   for the internal compression state.  memLevel=1 uses minimum memory but is
+   slow and reduces compression ratio; memLevel=9 uses maximum memory for
+   optimal speed.  The default value is 8.  See zconf.h for total memory usage
+   as a function of windowBits and memLevel.
+
+     The strategy parameter is used to tune the compression algorithm.  Use the
+   value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
+   filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no
+   string match), or Z_RLE to limit match distances to one (run-length
+   encoding).  Filtered data consists mostly of small values with a somewhat
+   random distribution.  In this case, the compression algorithm is tuned to
+   compress them better.  The effect of Z_FILTERED is to force more Huffman
+   coding and less string matching; it is somewhat intermediate between
+   Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY.  Z_RLE is designed to be almost as
+   fast as Z_HUFFMAN_ONLY, but give better compression for PNG image data.  The
+   strategy parameter only affects the compression ratio but not the
+   correctness of the compressed output even if it is not set appropriately.
+   Z_FIXED prevents the use of dynamic Huffman codes, allowing for a simpler
+   decoder for special applications.
+
+     deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_STREAM_ERROR if any parameter is invalid (such as an invalid
+   method), or Z_VERSION_ERROR if the zlib library version (zlib_version) is
+   incompatible with the version assumed by the caller (ZLIB_VERSION).  msg is
+   set to null if there is no error message.  deflateInit2 does not perform any
+   compression: this will be done by deflate().
+*/
+
+ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm,
+                                             const Bytef *dictionary,
+                                             uInt  dictLength));
+/*
+     Initializes the compression dictionary from the given byte sequence
+   without producing any compressed output.  When using the zlib format, this
+   function must be called immediately after deflateInit, deflateInit2 or
+   deflateReset, and before any call of deflate.  When doing raw deflate, this
+   function must be called either before any call of deflate, or immediately
+   after the completion of a deflate block, i.e. after all input has been
+   consumed and all output has been delivered when using any of the flush
+   options Z_BLOCK, Z_PARTIAL_FLUSH, Z_SYNC_FLUSH, or Z_FULL_FLUSH.  The
+   compressor and decompressor must use exactly the same dictionary (see
+   inflateSetDictionary).
+
+     The dictionary should consist of strings (byte sequences) that are likely
+   to be encountered later in the data to be compressed, with the most commonly
+   used strings preferably put towards the end of the dictionary.  Using a
+   dictionary is most useful when the data to be compressed is short and can be
+   predicted with good accuracy; the data can then be compressed better than
+   with the default empty dictionary.
+
+     Depending on the size of the compression data structures selected by
+   deflateInit or deflateInit2, a part of the dictionary may in effect be
+   discarded, for example if the dictionary is larger than the window size
+   provided in deflateInit or deflateInit2.  Thus the strings most likely to be
+   useful should be put at the end of the dictionary, not at the front.  In
+   addition, the current implementation of deflate will use at most the window
+   size minus 262 bytes of the provided dictionary.
+
+     Upon return of this function, strm->adler is set to the Adler-32 value
+   of the dictionary; the decompressor may later use this value to determine
+   which dictionary has been used by the compressor.  (The Adler-32 value
+   applies to the whole dictionary even if only a subset of the dictionary is
+   actually used by the compressor.) If a raw deflate was requested, then the
+   Adler-32 value is not computed and strm->adler is not set.
+
+     deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
+   parameter is invalid (e.g.  dictionary being Z_NULL) or the stream state is
+   inconsistent (for example if deflate has already been called for this stream
+   or if not at a block boundary for raw deflate).  deflateSetDictionary does
+   not perform any compression: this will be done by deflate().
+*/
+
+ZEXTERN int ZEXPORT deflateGetDictionary OF((z_streamp strm,
+                                             Bytef *dictionary,
+                                             uInt  *dictLength));
+/*
+     Returns the sliding dictionary being maintained by deflate.  dictLength is
+   set to the number of bytes in the dictionary, and that many bytes are copied
+   to dictionary.  dictionary must have enough space, where 32768 bytes is
+   always enough.  If deflateGetDictionary() is called with dictionary equal to
+   Z_NULL, then only the dictionary length is returned, and nothing is copied.
+   Similarly, if dictLength is Z_NULL, then it is not set.
+
+     deflateGetDictionary() may return a length less than the window size, even
+   when more than the window size in input has been provided. It may return up
+   to 258 bytes less in that case, due to how zlib's implementation of deflate
+   manages the sliding window and lookahead for matches, where matches can be
+   up to 258 bytes long. If the application needs the last window-size bytes of
+   input, then that would need to be saved by the application outside of zlib.
+
+     deflateGetDictionary returns Z_OK on success, or Z_STREAM_ERROR if the
+   stream state is inconsistent.
+*/
+
+ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest,
+                                    z_streamp source));
+/*
+     Sets the destination stream as a complete copy of the source stream.
+
+     This function can be useful when several compression strategies will be
+   tried, for example when there are several ways of pre-processing the input
+   data with a filter.  The streams that will be discarded should then be freed
+   by calling deflateEnd.  Note that deflateCopy duplicates the internal
+   compression state which can be quite large, so this strategy is slow and can
+   consume lots of memory.
+
+     deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+   (such as zalloc being Z_NULL).  msg is left unchanged in both source and
+   destination.
+*/
+
+ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm));
+/*
+     This function is equivalent to deflateEnd followed by deflateInit, but
+   does not free and reallocate the internal compression state.  The stream
+   will leave the compression level and any other attributes that may have been
+   set unchanged.
+
+     deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent (such as zalloc or state being Z_NULL).
+*/
+
+ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm,
+                                      int level,
+                                      int strategy));
+/*
+     Dynamically update the compression level and compression strategy.  The
+   interpretation of level and strategy is as in deflateInit2().  This can be
+   used to switch between compression and straight copy of the input data, or
+   to switch to a different kind of input data requiring a different strategy.
+   If the compression approach (which is a function of the level) or the
+   strategy is changed, and if any input has been consumed in a previous
+   deflate() call, then the input available so far is compressed with the old
+   level and strategy using deflate(strm, Z_BLOCK).  There are three approaches
+   for the compression levels 0, 1..3, and 4..9 respectively.  The new level
+   and strategy will take effect at the next call of deflate().
+
+     If a deflate(strm, Z_BLOCK) is performed by deflateParams(), and it does
+   not have enough output space to complete, then the parameter change will not
+   take effect.  In this case, deflateParams() can be called again with the
+   same parameters and more output space to try again.
+
+     In order to assure a change in the parameters on the first try, the
+   deflate stream should be flushed using deflate() with Z_BLOCK or other flush
+   request until strm.avail_out is not zero, before calling deflateParams().
+   Then no more input data should be provided before the deflateParams() call.
+   If this is done, the old level and strategy will be applied to the data
+   compressed before deflateParams(), and the new level and strategy will be
+   applied to the data compressed after deflateParams().
+
+     deflateParams returns Z_OK on success, Z_STREAM_ERROR if the source stream
+   state was inconsistent or if a parameter was invalid, or Z_BUF_ERROR if
+   there was not enough output space to complete the compression of the
+   available input data before a change in the strategy or approach.  Note that
+   in the case of a Z_BUF_ERROR, the parameters are not changed.  A return
+   value of Z_BUF_ERROR is not fatal, in which case deflateParams() can be
+   retried with more output space.
+*/
+
+ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm,
+                                    int good_length,
+                                    int max_lazy,
+                                    int nice_length,
+                                    int max_chain));
+/*
+     Fine tune deflate's internal compression parameters.  This should only be
+   used by someone who understands the algorithm used by zlib's deflate for
+   searching for the best matching string, and even then only by the most
+   fanatic optimizer trying to squeeze out the last compressed bit for their
+   specific input data.  Read the deflate.c source code for the meaning of the
+   max_lazy, good_length, nice_length, and max_chain parameters.
+
+     deflateTune() can be called after deflateInit() or deflateInit2(), and
+   returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream.
+ */
+
+ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm,
+                                       uLong sourceLen));
+/*
+     deflateBound() returns an upper bound on the compressed size after
+   deflation of sourceLen bytes.  It must be called after deflateInit() or
+   deflateInit2(), and after deflateSetHeader(), if used.  This would be used
+   to allocate an output buffer for deflation in a single pass, and so would be
+   called before deflate().  If that first deflate() call is provided the
+   sourceLen input bytes, an output buffer allocated to the size returned by
+   deflateBound(), and the flush value Z_FINISH, then deflate() is guaranteed
+   to return Z_STREAM_END.  Note that it is possible for the compressed size to
+   be larger than the value returned by deflateBound() if flush options other
+   than Z_FINISH or Z_NO_FLUSH are used.
+*/
+
+ZEXTERN int ZEXPORT deflatePending OF((z_streamp strm,
+                                       unsigned *pending,
+                                       int *bits));
+/*
+     deflatePending() returns the number of bytes and bits of output that have
+   been generated, but not yet provided in the available output.  The bytes not
+   provided would be due to the available output space having been consumed.
+   The number of bits of output not provided are between 0 and 7, where they
+   await more bits to join them in order to fill out a full byte.  If pending
+   or bits are Z_NULL, then those values are not set.
+
+     deflatePending returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+ */
+
+ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm,
+                                     int bits,
+                                     int value));
+/*
+     deflatePrime() inserts bits in the deflate output stream.  The intent
+   is that this function is used to start off the deflate output with the bits
+   leftover from a previous deflate stream when appending to it.  As such, this
+   function can only be used for raw deflate, and must be used before the first
+   deflate() call after a deflateInit2() or deflateReset().  bits must be less
+   than or equal to 16, and that many of the least significant bits of value
+   will be inserted in the output.
+
+     deflatePrime returns Z_OK if success, Z_BUF_ERROR if there was not enough
+   room in the internal buffer to insert the bits, or Z_STREAM_ERROR if the
+   source stream state was inconsistent.
+*/
+
+ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
+                                         gz_headerp head));
+/*
+     deflateSetHeader() provides gzip header information for when a gzip
+   stream is requested by deflateInit2().  deflateSetHeader() may be called
+   after deflateInit2() or deflateReset() and before the first call of
+   deflate().  The text, time, os, extra field, name, and comment information
+   in the provided gz_header structure are written to the gzip header (xflag is
+   ignored -- the extra flags are set according to the compression level).  The
+   caller must assure that, if not Z_NULL, name and comment are terminated with
+   a zero byte, and that if extra is not Z_NULL, that extra_len bytes are
+   available there.  If hcrc is true, a gzip header crc is included.  Note that
+   the current versions of the command-line version of gzip (up through version
+   1.3.x) do not support header crc's, and will report that it is a "multi-part
+   gzip file" and give up.
+
+     If deflateSetHeader is not used, the default gzip header has text false,
+   the time set to zero, and os set to 255, with no extra, name, or comment
+   fields.  The gzip header is returned to the default state by deflateReset().
+
+     deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+*/
+
+/*
+ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm,
+                                     int  windowBits));
+
+     This is another version of inflateInit with an extra parameter.  The
+   fields next_in, avail_in, zalloc, zfree and opaque must be initialized
+   before by the caller.
+
+     The windowBits parameter is the base two logarithm of the maximum window
+   size (the size of the history buffer).  It should be in the range 8..15 for
+   this version of the library.  The default value is 15 if inflateInit is used
+   instead.  windowBits must be greater than or equal to the windowBits value
+   provided to deflateInit2() while compressing, or it must be equal to 15 if
+   deflateInit2() was not used.  If a compressed stream with a larger window
+   size is given as input, inflate() will return with the error code
+   Z_DATA_ERROR instead of trying to allocate a larger window.
+
+     windowBits can also be zero to request that inflate use the window size in
+   the zlib header of the compressed stream.
+
+     windowBits can also be -8..-15 for raw inflate.  In this case, -windowBits
+   determines the window size.  inflate() will then process raw deflate data,
+   not looking for a zlib or gzip header, not generating a check value, and not
+   looking for any check values for comparison at the end of the stream.  This
+   is for use with other formats that use the deflate compressed data format
+   such as zip.  Those formats provide their own check values.  If a custom
+   format is developed using the raw deflate format for compressed data, it is
+   recommended that a check value such as an Adler-32 or a CRC-32 be applied to
+   the uncompressed data as is done in the zlib, gzip, and zip formats.  For
+   most applications, the zlib format should be used as is.  Note that comments
+   above on the use in deflateInit2() apply to the magnitude of windowBits.
+
+     windowBits can also be greater than 15 for optional gzip decoding.  Add
+   32 to windowBits to enable zlib and gzip decoding with automatic header
+   detection, or add 16 to decode only the gzip format (the zlib format will
+   return a Z_DATA_ERROR).  If a gzip stream is being decoded, strm->adler is a
+   CRC-32 instead of an Adler-32.  Unlike the gunzip utility and gzread() (see
+   below), inflate() will not automatically decode concatenated gzip streams.
+   inflate() will return Z_STREAM_END at the end of the gzip stream.  The state
+   would need to be reset to continue decoding a subsequent gzip stream.
+
+     inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+   version assumed by the caller, or Z_STREAM_ERROR if the parameters are
+   invalid, such as a null pointer to the structure.  msg is set to null if
+   there is no error message.  inflateInit2 does not perform any decompression
+   apart from possibly reading the zlib header if present: actual decompression
+   will be done by inflate().  (So next_in and avail_in may be modified, but
+   next_out and avail_out are unused and unchanged.) The current implementation
+   of inflateInit2() does not process any header information -- that is
+   deferred until inflate() is called.
+*/
+
+ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm,
+                                             const Bytef *dictionary,
+                                             uInt  dictLength));
+/*
+     Initializes the decompression dictionary from the given uncompressed byte
+   sequence.  This function must be called immediately after a call of inflate,
+   if that call returned Z_NEED_DICT.  The dictionary chosen by the compressor
+   can be determined from the Adler-32 value returned by that call of inflate.
+   The compressor and decompressor must use exactly the same dictionary (see
+   deflateSetDictionary).  For raw inflate, this function can be called at any
+   time to set the dictionary.  If the provided dictionary is smaller than the
+   window and there is already data in the window, then the provided dictionary
+   will amend what's there.  The application must ensure that the dictionary
+   that was used for compression is provided.
+
+     inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
+   parameter is invalid (e.g.  dictionary being Z_NULL) or the stream state is
+   inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
+   expected one (incorrect Adler-32 value).  inflateSetDictionary does not
+   perform any decompression: this will be done by subsequent calls of
+   inflate().
+*/
+
+ZEXTERN int ZEXPORT inflateGetDictionary OF((z_streamp strm,
+                                             Bytef *dictionary,
+                                             uInt  *dictLength));
+/*
+     Returns the sliding dictionary being maintained by inflate.  dictLength is
+   set to the number of bytes in the dictionary, and that many bytes are copied
+   to dictionary.  dictionary must have enough space, where 32768 bytes is
+   always enough.  If inflateGetDictionary() is called with dictionary equal to
+   Z_NULL, then only the dictionary length is returned, and nothing is copied.
+   Similarly, if dictLength is Z_NULL, then it is not set.
+
+     inflateGetDictionary returns Z_OK on success, or Z_STREAM_ERROR if the
+   stream state is inconsistent.
+*/
+
+ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm));
+/*
+     Skips invalid compressed data until a possible full flush point (see above
+   for the description of deflate with Z_FULL_FLUSH) can be found, or until all
+   available input is skipped.  No output is provided.
+
+     inflateSync searches for a 00 00 FF FF pattern in the compressed data.
+   All full flush points have this pattern, but not all occurrences of this
+   pattern are full flush points.
+
+     inflateSync returns Z_OK if a possible full flush point has been found,
+   Z_BUF_ERROR if no more input was provided, Z_DATA_ERROR if no flush point
+   has been found, or Z_STREAM_ERROR if the stream structure was inconsistent.
+   In the success case, the application may save the current value of
+   total_in which indicates where valid compressed data was found.  In the
+   error case, the application may repeatedly call inflateSync, providing more
+   input each time, until success or end of the input data.
+*/
+
+ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest,
+                                    z_streamp source));
+/*
+     Sets the destination stream as a complete copy of the source stream.
+
+     This function can be useful when randomly accessing a large stream.  The
+   first pass through the stream can periodically record the inflate state,
+   allowing restarting inflate at those points when randomly accessing the
+   stream.
+
+     inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+   (such as zalloc being Z_NULL).  msg is left unchanged in both source and
+   destination.
+*/
+
+ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm));
+/*
+     This function is equivalent to inflateEnd followed by inflateInit,
+   but does not free and reallocate the internal decompression state.  The
+   stream will keep attributes that may have been set by inflateInit2.
+
+     inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent (such as zalloc or state being Z_NULL).
+*/
+
+ZEXTERN int ZEXPORT inflateReset2 OF((z_streamp strm,
+                                      int windowBits));
+/*
+     This function is the same as inflateReset, but it also permits changing
+   the wrap and window size requests.  The windowBits parameter is interpreted
+   the same as it is for inflateInit2.  If the window size is changed, then the
+   memory allocated for the window is freed, and the window will be reallocated
+   by inflate() if needed.
+
+     inflateReset2 returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent (such as zalloc or state being Z_NULL), or if
+   the windowBits parameter is invalid.
+*/
+
+ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm,
+                                     int bits,
+                                     int value));
+/*
+     This function inserts bits in the inflate input stream.  The intent is
+   that this function is used to start inflating at a bit position in the
+   middle of a byte.  The provided bits will be used before any bytes are used
+   from next_in.  This function should only be used with raw inflate, and
+   should be used before the first inflate() call after inflateInit2() or
+   inflateReset().  bits must be less than or equal to 16, and that many of the
+   least significant bits of value will be inserted in the input.
+
+     If bits is negative, then the input stream bit buffer is emptied.  Then
+   inflatePrime() can be called again to put bits in the buffer.  This is used
+   to clear out bits leftover after feeding inflate a block description prior
+   to feeding inflate codes.
+
+     inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+*/
+
+ZEXTERN long ZEXPORT inflateMark OF((z_streamp strm));
+/*
+     This function returns two values, one in the lower 16 bits of the return
+   value, and the other in the remaining upper bits, obtained by shifting the
+   return value down 16 bits.  If the upper value is -1 and the lower value is
+   zero, then inflate() is currently decoding information outside of a block.
+   If the upper value is -1 and the lower value is non-zero, then inflate is in
+   the middle of a stored block, with the lower value equaling the number of
+   bytes from the input remaining to copy.  If the upper value is not -1, then
+   it is the number of bits back from the current bit position in the input of
+   the code (literal or length/distance pair) currently being processed.  In
+   that case the lower value is the number of bytes already emitted for that
+   code.
+
+     A code is being processed if inflate is waiting for more input to complete
+   decoding of the code, or if it has completed decoding but is waiting for
+   more output space to write the literal or match data.
+
+     inflateMark() is used to mark locations in the input data for random
+   access, which may be at bit positions, and to note those cases where the
+   output of a code may span boundaries of random access blocks.  The current
+   location in the input stream can be determined from avail_in and data_type
+   as noted in the description for the Z_BLOCK flush parameter for inflate.
+
+     inflateMark returns the value noted above, or -65536 if the provided
+   source stream state was inconsistent.
+*/
+
+ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm,
+                                         gz_headerp head));
+/*
+     inflateGetHeader() requests that gzip header information be stored in the
+   provided gz_header structure.  inflateGetHeader() may be called after
+   inflateInit2() or inflateReset(), and before the first call of inflate().
+   As inflate() processes the gzip stream, head->done is zero until the header
+   is completed, at which time head->done is set to one.  If a zlib stream is
+   being decoded, then head->done is set to -1 to indicate that there will be
+   no gzip header information forthcoming.  Note that Z_BLOCK or Z_TREES can be
+   used to force inflate() to return immediately after header processing is
+   complete and before any actual data is decompressed.
+
+     The text, time, xflags, and os fields are filled in with the gzip header
+   contents.  hcrc is set to true if there is a header CRC.  (The header CRC
+   was valid if done is set to one.) If extra is not Z_NULL, then extra_max
+   contains the maximum number of bytes to write to extra.  Once done is true,
+   extra_len contains the actual extra field length, and extra contains the
+   extra field, or that field truncated if extra_max is less than extra_len.
+   If name is not Z_NULL, then up to name_max characters are written there,
+   terminated with a zero unless the length is greater than name_max.  If
+   comment is not Z_NULL, then up to comm_max characters are written there,
+   terminated with a zero unless the length is greater than comm_max.  When any
+   of extra, name, or comment are not Z_NULL and the respective field is not
+   present in the header, then that field is set to Z_NULL to signal its
+   absence.  This allows the use of deflateSetHeader() with the returned
+   structure to duplicate the header.  However if those fields are set to
+   allocated memory, then the application will need to save those pointers
+   elsewhere so that they can be eventually freed.
+
+     If inflateGetHeader is not used, then the header information is simply
+   discarded.  The header is always checked for validity, including the header
+   CRC if present.  inflateReset() will reset the process to discard the header
+   information.  The application would need to call inflateGetHeader() again to
+   retrieve the header from the next gzip stream.
+
+     inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+*/
+
+/*
+ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits,
+                                        unsigned char FAR *window));
+
+     Initialize the internal stream state for decompression using inflateBack()
+   calls.  The fields zalloc, zfree and opaque in strm must be initialized
+   before the call.  If zalloc and zfree are Z_NULL, then the default library-
+   derived memory allocation routines are used.  windowBits is the base two
+   logarithm of the window size, in the range 8..15.  window is a caller
+   supplied buffer of that size.  Except for special applications where it is
+   assured that deflate was used with small window sizes, windowBits must be 15
+   and a 32K byte window must be supplied to be able to decompress general
+   deflate streams.
+
+     See inflateBack() for the usage of these routines.
+
+     inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of
+   the parameters are invalid, Z_MEM_ERROR if the internal state could not be
+   allocated, or Z_VERSION_ERROR if the version of the library does not match
+   the version of the header file.
+*/
+
+typedef unsigned (*in_func) OF((void FAR *,
+                                z_const unsigned char FAR * FAR *));
+typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned));
+
+ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
+                                    in_func in, void FAR *in_desc,
+                                    out_func out, void FAR *out_desc));
+/*
+     inflateBack() does a raw inflate with a single call using a call-back
+   interface for input and output.  This is potentially more efficient than
+   inflate() for file i/o applications, in that it avoids copying between the
+   output and the sliding window by simply making the window itself the output
+   buffer.  inflate() can be faster on modern CPUs when used with large
+   buffers.  inflateBack() trusts the application to not change the output
+   buffer passed by the output function, at least until inflateBack() returns.
+
+     inflateBackInit() must be called first to allocate the internal state
+   and to initialize the state with the user-provided window buffer.
+   inflateBack() may then be used multiple times to inflate a complete, raw
+   deflate stream with each call.  inflateBackEnd() is then called to free the
+   allocated state.
+
+     A raw deflate stream is one with no zlib or gzip header or trailer.
+   This routine would normally be used in a utility that reads zip or gzip
+   files and writes out uncompressed files.  The utility would decode the
+   header and process the trailer on its own, hence this routine expects only
+   the raw deflate stream to decompress.  This is different from the default
+   behavior of inflate(), which expects a zlib header and trailer around the
+   deflate stream.
+
+     inflateBack() uses two subroutines supplied by the caller that are then
+   called by inflateBack() for input and output.  inflateBack() calls those
+   routines until it reads a complete deflate stream and writes out all of the
+   uncompressed data, or until it encounters an error.  The function's
+   parameters and return types are defined above in the in_func and out_func
+   typedefs.  inflateBack() will call in(in_desc, &buf) which should return the
+   number of bytes of provided input, and a pointer to that input in buf.  If
+   there is no input available, in() must return zero -- buf is ignored in that
+   case -- and inflateBack() will return a buffer error.  inflateBack() will
+   call out(out_desc, buf, len) to write the uncompressed data buf[0..len-1].
+   out() should return zero on success, or non-zero on failure.  If out()
+   returns non-zero, inflateBack() will return with an error.  Neither in() nor
+   out() are permitted to change the contents of the window provided to
+   inflateBackInit(), which is also the buffer that out() uses to write from.
+   The length written by out() will be at most the window size.  Any non-zero
+   amount of input may be provided by in().
+
+     For convenience, inflateBack() can be provided input on the first call by
+   setting strm->next_in and strm->avail_in.  If that input is exhausted, then
+   in() will be called.  Therefore strm->next_in must be initialized before
+   calling inflateBack().  If strm->next_in is Z_NULL, then in() will be called
+   immediately for input.  If strm->next_in is not Z_NULL, then strm->avail_in
+   must also be initialized, and then if strm->avail_in is not zero, input will
+   initially be taken from strm->next_in[0 ..  strm->avail_in - 1].
+
+     The in_desc and out_desc parameters of inflateBack() are passed as the
+   first parameter of in() and out() respectively when they are called.  These
+   descriptors can be optionally used to pass any information that the caller-
+   supplied in() and out() functions need to do their job.
+
+     On return, inflateBack() will set strm->next_in and strm->avail_in to
+   pass back any unused input that was provided by the last in() call.  The
+   return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR
+   if in() or out() returned an error, Z_DATA_ERROR if there was a format error
+   in the deflate stream (in which case strm->msg is set to indicate the nature
+   of the error), or Z_STREAM_ERROR if the stream was not properly initialized.
+   In the case of Z_BUF_ERROR, an input or output error can be distinguished
+   using strm->next_in which will be Z_NULL only if in() returned an error.  If
+   strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to out() returning
+   non-zero.  (in() will always be called before out(), so strm->next_in is
+   assured to be defined if out() returns non-zero.)  Note that inflateBack()
+   cannot return Z_OK.
+*/
+
+ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm));
+/*
+     All memory allocated by inflateBackInit() is freed.
+
+     inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream
+   state was inconsistent.
+*/
+
+ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void));
+/* Return flags indicating compile-time options.
+
+    Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other:
+     1.0: size of uInt
+     3.2: size of uLong
+     5.4: size of voidpf (pointer)
+     7.6: size of z_off_t
+
+    Compiler, assembler, and debug options:
+     8: ZLIB_DEBUG
+     9: ASMV or ASMINF -- use ASM code
+     10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention
+     11: 0 (reserved)
+
+    One-time table building (smaller code, but not thread-safe if true):
+     12: BUILDFIXED -- build static block decoding tables when needed
+     13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed
+     14,15: 0 (reserved)
+
+    Library content (indicates missing functionality):
+     16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking
+                          deflate code when not needed)
+     17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect
+                    and decode gzip streams (to avoid linking crc code)
+     18-19: 0 (reserved)
+
+    Operation variations (changes in library functionality):
+     20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate
+     21: FASTEST -- deflate algorithm with only one, lowest compression level
+     22,23: 0 (reserved)
+
+    The sprintf variant used by gzprintf (zero is best):
+     24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format
+     25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure!
+     26: 0 = returns value, 1 = void -- 1 means inferred string length returned
+
+    Remainder:
+     27-31: 0 (reserved)
+ */
+
+#ifndef Z_SOLO
+
+                        /* utility functions */
+
+/*
+     The following utility functions are implemented on top of the basic
+   stream-oriented functions.  To simplify the interface, some default options
+   are assumed (compression level and memory usage, standard memory allocation
+   functions).  The source code of these utility functions can be modified if
+   you need special options.
+*/
+
+ZEXTERN int ZEXPORT compress OF((Bytef *dest,   uLongf *destLen,
+                                 const Bytef *source, uLong sourceLen));
+/*
+     Compresses the source buffer into the destination buffer.  sourceLen is
+   the byte length of the source buffer.  Upon entry, destLen is the total size
+   of the destination buffer, which must be at least the value returned by
+   compressBound(sourceLen).  Upon exit, destLen is the actual size of the
+   compressed data.  compress() is equivalent to compress2() with a level
+   parameter of Z_DEFAULT_COMPRESSION.
+
+     compress returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_BUF_ERROR if there was not enough room in the output
+   buffer.
+*/
+
+ZEXTERN int ZEXPORT compress2 OF((Bytef *dest,   uLongf *destLen,
+                                  const Bytef *source, uLong sourceLen,
+                                  int level));
+/*
+     Compresses the source buffer into the destination buffer.  The level
+   parameter has the same meaning as in deflateInit.  sourceLen is the byte
+   length of the source buffer.  Upon entry, destLen is the total size of the
+   destination buffer, which must be at least the value returned by
+   compressBound(sourceLen).  Upon exit, destLen is the actual size of the
+   compressed data.
+
+     compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+   Z_STREAM_ERROR if the level parameter is invalid.
+*/
+
+ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen));
+/*
+     compressBound() returns an upper bound on the compressed size after
+   compress() or compress2() on sourceLen bytes.  It would be used before a
+   compress() or compress2() call to allocate the destination buffer.
+*/
+
+ZEXTERN int ZEXPORT uncompress OF((Bytef *dest,   uLongf *destLen,
+                                   const Bytef *source, uLong sourceLen));
+/*
+     Decompresses the source buffer into the destination buffer.  sourceLen is
+   the byte length of the source buffer.  Upon entry, destLen is the total size
+   of the destination buffer, which must be large enough to hold the entire
+   uncompressed data.  (The size of the uncompressed data must have been saved
+   previously by the compressor and transmitted to the decompressor by some
+   mechanism outside the scope of this compression library.) Upon exit, destLen
+   is the actual size of the uncompressed data.
+
+     uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_BUF_ERROR if there was not enough room in the output
+   buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete.  In
+   the case where there is not enough room, uncompress() will fill the output
+   buffer with the uncompressed data up to that point.
+*/
+
+ZEXTERN int ZEXPORT uncompress2 OF((Bytef *dest,   uLongf *destLen,
+                                    const Bytef *source, uLong *sourceLen));
+/*
+     Same as uncompress, except that sourceLen is a pointer, where the
+   length of the source is *sourceLen.  On return, *sourceLen is the number of
+   source bytes consumed.
+*/
+
+                        /* gzip file access functions */
+
+/*
+     This library supports reading and writing files in gzip (.gz) format with
+   an interface similar to that of stdio, using the functions that start with
+   "gz".  The gzip format is different from the zlib format.  gzip is a
+   wrapper, documented in RFC 1952, around a deflate stream.
+*/
+
+typedef struct gzFile_s *gzFile;    /* semi-opaque gzip file descriptor */
+
+/*
+ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode));
+
+     Opens a gzip (.gz) file for reading or writing.  The mode parameter is as
+   in fopen ("rb" or "wb") but can also include a compression level ("wb9") or
+   a strategy: 'f' for filtered data as in "wb6f", 'h' for Huffman-only
+   compression as in "wb1h", 'R' for run-length encoding as in "wb1R", or 'F'
+   for fixed code compression as in "wb9F".  (See the description of
+   deflateInit2 for more information about the strategy parameter.)  'T' will
+   request transparent writing or appending with no compression and not using
+   the gzip format.
+
+     "a" can be used instead of "w" to request that the gzip stream that will
+   be written be appended to the file.  "+" will result in an error, since
+   reading and writing to the same gzip file is not supported.  The addition of
+   "x" when writing will create the file exclusively, which fails if the file
+   already exists.  On systems that support it, the addition of "e" when
+   reading or writing will set the flag to close the file on an execve() call.
+
+     These functions, as well as gzip, will read and decode a sequence of gzip
+   streams in a file.  The append function of gzopen() can be used to create
+   such a file.  (Also see gzflush() for another way to do this.)  When
+   appending, gzopen does not test whether the file begins with a gzip stream,
+   nor does it look for the end of the gzip streams to begin appending.  gzopen
+   will simply append a gzip stream to the existing file.
+
+     gzopen can be used to read a file which is not in gzip format; in this
+   case gzread will directly read from the file without decompression.  When
+   reading, this will be detected automatically by looking for the magic two-
+   byte gzip header.
+
+     gzopen returns NULL if the file could not be opened, if there was
+   insufficient memory to allocate the gzFile state, or if an invalid mode was
+   specified (an 'r', 'w', or 'a' was not provided, or '+' was provided).
+   errno can be checked to determine if the reason gzopen failed was that the
+   file could not be opened.
+*/
+
+ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode));
+/*
+     gzdopen associates a gzFile with the file descriptor fd.  File descriptors
+   are obtained from calls like open, dup, creat, pipe or fileno (if the file
+   has been previously opened with fopen).  The mode parameter is as in gzopen.
+
+     The next call of gzclose on the returned gzFile will also close the file
+   descriptor fd, just like fclose(fdopen(fd, mode)) closes the file descriptor
+   fd.  If you want to keep fd open, use fd = dup(fd_keep); gz = gzdopen(fd,
+   mode);.  The duplicated descriptor should be saved to avoid a leak, since
+   gzdopen does not close fd if it fails.  If you are using fileno() to get the
+   file descriptor from a FILE *, then you will have to use dup() to avoid
+   double-close()ing the file descriptor.  Both gzclose() and fclose() will
+   close the associated file descriptor, so they need to have different file
+   descriptors.
+
+     gzdopen returns NULL if there was insufficient memory to allocate the
+   gzFile state, if an invalid mode was specified (an 'r', 'w', or 'a' was not
+   provided, or '+' was provided), or if fd is -1.  The file descriptor is not
+   used until the next gz* read, write, seek, or close operation, so gzdopen
+   will not detect if fd is invalid (unless fd is -1).
+*/
+
+ZEXTERN int ZEXPORT gzbuffer OF((gzFile file, unsigned size));
+/*
+     Set the internal buffer size used by this library's functions.  The
+   default buffer size is 8192 bytes.  This function must be called after
+   gzopen() or gzdopen(), and before any other calls that read or write the
+   file.  The buffer memory allocation is always deferred to the first read or
+   write.  Three times that size in buffer space is allocated.  A larger buffer
+   size of, for example, 64K or 128K bytes will noticeably increase the speed
+   of decompression (reading).
+
+     The new buffer size also affects the maximum length for gzprintf().
+
+     gzbuffer() returns 0 on success, or -1 on failure, such as being called
+   too late.
+*/
+
+ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy));
+/*
+     Dynamically update the compression level or strategy.  See the description
+   of deflateInit2 for the meaning of these parameters.  Previously provided
+   data is flushed before the parameter change.
+
+     gzsetparams returns Z_OK if success, Z_STREAM_ERROR if the file was not
+   opened for writing, Z_ERRNO if there is an error writing the flushed data,
+   or Z_MEM_ERROR if there is a memory allocation error.
+*/
+
+ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len));
+/*
+     Reads the given number of uncompressed bytes from the compressed file.  If
+   the input file is not in gzip format, gzread copies the given number of
+   bytes into the buffer directly from the file.
+
+     After reaching the end of a gzip stream in the input, gzread will continue
+   to read, looking for another gzip stream.  Any number of gzip streams may be
+   concatenated in the input file, and will all be decompressed by gzread().
+   If something other than a gzip stream is encountered after a gzip stream,
+   that remaining trailing garbage is ignored (and no error is returned).
+
+     gzread can be used to read a gzip file that is being concurrently written.
+   Upon reaching the end of the input, gzread will return with the available
+   data.  If the error code returned by gzerror is Z_OK or Z_BUF_ERROR, then
+   gzclearerr can be used to clear the end of file indicator in order to permit
+   gzread to be tried again.  Z_OK indicates that a gzip stream was completed
+   on the last gzread.  Z_BUF_ERROR indicates that the input file ended in the
+   middle of a gzip stream.  Note that gzread does not return -1 in the event
+   of an incomplete gzip stream.  This error is deferred until gzclose(), which
+   will return Z_BUF_ERROR if the last gzread ended in the middle of a gzip
+   stream.  Alternatively, gzerror can be used before gzclose to detect this
+   case.
+
+     gzread returns the number of uncompressed bytes actually read, less than
+   len for end of file, or -1 for error.  If len is too large to fit in an int,
+   then nothing is read, -1 is returned, and the error state is set to
+   Z_STREAM_ERROR.
+*/
+
+ZEXTERN z_size_t ZEXPORT gzfread OF((voidp buf, z_size_t size, z_size_t nitems,
+                                     gzFile file));
+/*
+     Read up to nitems items of size size from file to buf, otherwise operating
+   as gzread() does.  This duplicates the interface of stdio's fread(), with
+   size_t request and return types.  If the library defines size_t, then
+   z_size_t is identical to size_t.  If not, then z_size_t is an unsigned
+   integer type that can contain a pointer.
+
+     gzfread() returns the number of full items read of size size, or zero if
+   the end of the file was reached and a full item could not be read, or if
+   there was an error.  gzerror() must be consulted if zero is returned in
+   order to determine if there was an error.  If the multiplication of size and
+   nitems overflows, i.e. the product does not fit in a z_size_t, then nothing
+   is read, zero is returned, and the error state is set to Z_STREAM_ERROR.
+
+     In the event that the end of file is reached and only a partial item is
+   available at the end, i.e. the remaining uncompressed data length is not a
+   multiple of size, then the final partial item is nevertheless read into buf
+   and the end-of-file flag is set.  The length of the partial item read is not
+   provided, but could be inferred from the result of gztell().  This behavior
+   is the same as the behavior of fread() implementations in common libraries,
+   but it prevents the direct use of gzfread() to read a concurrently written
+   file, resetting and retrying on end-of-file, when size is not 1.
+*/
+
+ZEXTERN int ZEXPORT gzwrite OF((gzFile file,
+                                voidpc buf, unsigned len));
+/*
+     Writes the given number of uncompressed bytes into the compressed file.
+   gzwrite returns the number of uncompressed bytes written or 0 in case of
+   error.
+*/
+
+ZEXTERN z_size_t ZEXPORT gzfwrite OF((voidpc buf, z_size_t size,
+                                      z_size_t nitems, gzFile file));
+/*
+     gzfwrite() writes nitems items of size size from buf to file, duplicating
+   the interface of stdio's fwrite(), with size_t request and return types.  If
+   the library defines size_t, then z_size_t is identical to size_t.  If not,
+   then z_size_t is an unsigned integer type that can contain a pointer.
+
+     gzfwrite() returns the number of full items written of size size, or zero
+   if there was an error.  If the multiplication of size and nitems overflows,
+   i.e. the product does not fit in a z_size_t, then nothing is written, zero
+   is returned, and the error state is set to Z_STREAM_ERROR.
+*/
+
+ZEXTERN int ZEXPORTVA gzprintf Z_ARG((gzFile file, const char *format, ...));
+/*
+     Converts, formats, and writes the arguments to the compressed file under
+   control of the format string, as in fprintf.  gzprintf returns the number of
+   uncompressed bytes actually written, or a negative zlib error code in case
+   of error.  The number of uncompressed bytes written is limited to 8191, or
+   one less than the buffer size given to gzbuffer().  The caller should assure
+   that this limit is not exceeded.  If it is exceeded, then gzprintf() will
+   return an error (0) with nothing written.  In this case, there may also be a
+   buffer overflow with unpredictable consequences, which is possible only if
+   zlib was compiled with the insecure functions sprintf() or vsprintf()
+   because the secure snprintf() or vsnprintf() functions were not available.
+   This can be determined using zlibCompileFlags().
+*/
+
+ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s));
+/*
+     Writes the given null-terminated string to the compressed file, excluding
+   the terminating null character.
+
+     gzputs returns the number of characters written, or -1 in case of error.
+*/
+
+ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len));
+/*
+     Reads bytes from the compressed file until len-1 characters are read, or a
+   newline character is read and transferred to buf, or an end-of-file
+   condition is encountered.  If any characters are read or if len == 1, the
+   string is terminated with a null character.  If no characters are read due
+   to an end-of-file or len < 1, then the buffer is left untouched.
+
+     gzgets returns buf which is a null-terminated string, or it returns NULL
+   for end-of-file or in case of error.  If there was an error, the contents at
+   buf are indeterminate.
+*/
+
+ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c));
+/*
+     Writes c, converted to an unsigned char, into the compressed file.  gzputc
+   returns the value that was written, or -1 in case of error.
+*/
+
+ZEXTERN int ZEXPORT gzgetc OF((gzFile file));
+/*
+     Reads one byte from the compressed file.  gzgetc returns this byte or -1
+   in case of end of file or error.  This is implemented as a macro for speed.
+   As such, it does not do all of the checking the other functions do.  I.e.
+   it does not check to see if file is NULL, nor whether the structure file
+   points to has been clobbered or not.
+*/
+
+ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file));
+/*
+     Push one character back onto the stream to be read as the first character
+   on the next read.  At least one character of push-back is allowed.
+   gzungetc() returns the character pushed, or -1 on failure.  gzungetc() will
+   fail if c is -1, and may fail if a character has been pushed but not read
+   yet.  If gzungetc is used immediately after gzopen or gzdopen, at least the
+   output buffer size of pushed characters is allowed.  (See gzbuffer above.)
+   The pushed character will be discarded if the stream is repositioned with
+   gzseek() or gzrewind().
+*/
+
+ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush));
+/*
+     Flushes all pending output into the compressed file.  The parameter flush
+   is as in the deflate() function.  The return value is the zlib error number
+   (see function gzerror below).  gzflush is only permitted when writing.
+
+     If the flush parameter is Z_FINISH, the remaining data is written and the
+   gzip stream is completed in the output.  If gzwrite() is called again, a new
+   gzip stream will be started in the output.  gzread() is able to read such
+   concatenated gzip streams.
+
+     gzflush should be called only when strictly necessary because it will
+   degrade compression if called too often.
+*/
+
+/*
+ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file,
+                                   z_off_t offset, int whence));
+
+     Sets the starting position for the next gzread or gzwrite on the given
+   compressed file.  The offset represents a number of bytes in the
+   uncompressed data stream.  The whence parameter is defined as in lseek(2);
+   the value SEEK_END is not supported.
+
+     If the file is opened for reading, this function is emulated but can be
+   extremely slow.  If the file is opened for writing, only forward seeks are
+   supported; gzseek then compresses a sequence of zeroes up to the new
+   starting position.
+
+     gzseek returns the resulting offset location as measured in bytes from
+   the beginning of the uncompressed stream, or -1 in case of error, in
+   particular if the file is opened for writing and the new starting position
+   would be before the current position.
+*/
+
+ZEXTERN int ZEXPORT    gzrewind OF((gzFile file));
+/*
+     Rewinds the given file. This function is supported only for reading.
+
+     gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET)
+*/
+
+/*
+ZEXTERN z_off_t ZEXPORT    gztell OF((gzFile file));
+
+     Returns the starting position for the next gzread or gzwrite on the given
+   compressed file.  This position represents a number of bytes in the
+   uncompressed data stream, and is zero when starting, even if appending or
+   reading a gzip stream from the middle of a file using gzdopen().
+
+     gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR)
+*/
+
+/*
+ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile file));
+
+     Returns the current offset in the file being read or written.  This offset
+   includes the count of bytes that precede the gzip stream, for example when
+   appending or when using gzdopen() for reading.  When reading, the offset
+   does not include as yet unused buffered input.  This information can be used
+   for a progress indicator.  On error, gzoffset() returns -1.
+*/
+
+ZEXTERN int ZEXPORT gzeof OF((gzFile file));
+/*
+     Returns true (1) if the end-of-file indicator has been set while reading,
+   false (0) otherwise.  Note that the end-of-file indicator is set only if the
+   read tried to go past the end of the input, but came up short.  Therefore,
+   just like feof(), gzeof() may return false even if there is no more data to
+   read, in the event that the last read request was for the exact number of
+   bytes remaining in the input file.  This will happen if the input file size
+   is an exact multiple of the buffer size.
+
+     If gzeof() returns true, then the read functions will return no more data,
+   unless the end-of-file indicator is reset by gzclearerr() and the input file
+   has grown since the previous end of file was detected.
+*/
+
+ZEXTERN int ZEXPORT gzdirect OF((gzFile file));
+/*
+     Returns true (1) if file is being copied directly while reading, or false
+   (0) if file is a gzip stream being decompressed.
+
+     If the input file is empty, gzdirect() will return true, since the input
+   does not contain a gzip stream.
+
+     If gzdirect() is used immediately after gzopen() or gzdopen() it will
+   cause buffers to be allocated to allow reading the file to determine if it
+   is a gzip file.  Therefore if gzbuffer() is used, it should be called before
+   gzdirect().
+
+     When writing, gzdirect() returns true (1) if transparent writing was
+   requested ("wT" for the gzopen() mode), or false (0) otherwise.  (Note:
+   gzdirect() is not needed when writing.  Transparent writing must be
+   explicitly requested, so the application already knows the answer.  When
+   linking statically, using gzdirect() will include all of the zlib code for
+   gzip file reading and decompression, which may not be desired.)
+*/
+
+ZEXTERN int ZEXPORT    gzclose OF((gzFile file));
+/*
+     Flushes all pending output if necessary, closes the compressed file and
+   deallocates the (de)compression state.  Note that once file is closed, you
+   cannot call gzerror with file, since its structures have been deallocated.
+   gzclose must not be called more than once on the same file, just as free
+   must not be called more than once on the same allocation.
+
+     gzclose will return Z_STREAM_ERROR if file is not valid, Z_ERRNO on a
+   file operation error, Z_MEM_ERROR if out of memory, Z_BUF_ERROR if the
+   last read ended in the middle of a gzip stream, or Z_OK on success.
+*/
+
+ZEXTERN int ZEXPORT gzclose_r OF((gzFile file));
+ZEXTERN int ZEXPORT gzclose_w OF((gzFile file));
+/*
+     Same as gzclose(), but gzclose_r() is only for use when reading, and
+   gzclose_w() is only for use when writing or appending.  The advantage to
+   using these instead of gzclose() is that they avoid linking in zlib
+   compression or decompression code that is not used when only reading or only
+   writing respectively.  If gzclose() is used, then both compression and
+   decompression code will be included in the application when linking to a
+   static zlib library.
+*/
+
+ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum));
+/*
+     Returns the error message for the last error which occurred on the given
+   compressed file.  errnum is set to zlib error number.  If an error occurred
+   in the file system and not in the compression library, errnum is set to
+   Z_ERRNO and the application may consult errno to get the exact error code.
+
+     The application must not modify the returned string.  Future calls to
+   this function may invalidate the previously returned string.  If file is
+   closed, then the string previously returned by gzerror will no longer be
+   available.
+
+     gzerror() should be used to distinguish errors from end-of-file for those
+   functions above that do not distinguish those cases in their return values.
+*/
+
+ZEXTERN void ZEXPORT gzclearerr OF((gzFile file));
+/*
+     Clears the error and end-of-file flags for file.  This is analogous to the
+   clearerr() function in stdio.  This is useful for continuing to read a gzip
+   file that is being written concurrently.
+*/
+
+#endif /* !Z_SOLO */
+
+                        /* checksum functions */
+
+/*
+     These functions are not related to compression but are exported
+   anyway because they might be useful in applications using the compression
+   library.
+*/
+
+ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
+/*
+     Update a running Adler-32 checksum with the bytes buf[0..len-1] and
+   return the updated checksum.  If buf is Z_NULL, this function returns the
+   required initial value for the checksum.
+
+     An Adler-32 checksum is almost as reliable as a CRC-32 but can be computed
+   much faster.
+
+   Usage example:
+
+     uLong adler = adler32(0L, Z_NULL, 0);
+
+     while (read_buffer(buffer, length) != EOF) {
+       adler = adler32(adler, buffer, length);
+     }
+     if (adler != original_adler) error();
+*/
+
+ZEXTERN uLong ZEXPORT adler32_z OF((uLong adler, const Bytef *buf,
+                                    z_size_t len));
+/*
+     Same as adler32(), but with a size_t length.
+*/
+
+/*
+ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2,
+                                          z_off_t len2));
+
+     Combine two Adler-32 checksums into one.  For two sequences of bytes, seq1
+   and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for
+   each, adler1 and adler2.  adler32_combine() returns the Adler-32 checksum of
+   seq1 and seq2 concatenated, requiring only adler1, adler2, and len2.  Note
+   that the z_off_t type (like off_t) is a signed integer.  If len2 is
+   negative, the result has no meaning or utility.
+*/
+
+ZEXTERN uLong ZEXPORT crc32   OF((uLong crc, const Bytef *buf, uInt len));
+/*
+     Update a running CRC-32 with the bytes buf[0..len-1] and return the
+   updated CRC-32.  If buf is Z_NULL, this function returns the required
+   initial value for the crc.  Pre- and post-conditioning (one's complement) is
+   performed within this function so it shouldn't be done by the application.
+
+   Usage example:
+
+     uLong crc = crc32(0L, Z_NULL, 0);
+
+     while (read_buffer(buffer, length) != EOF) {
+       crc = crc32(crc, buffer, length);
+     }
+     if (crc != original_crc) error();
+*/
+
+ZEXTERN uLong ZEXPORT crc32_z OF((uLong crc, const Bytef *buf,
+                                  z_size_t len));
+/*
+     Same as crc32() (crc is the running CRC-32), but with a size_t length.
+*/
+
+/*
+ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2));
+
+     Combine two CRC-32 check values into one.  For two sequences of bytes,
+   seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
+   calculated for each, crc1 and crc2.  crc32_combine() returns the CRC-32
+   check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and
+   len2.
+*/
+
+
+                        /* various hacks, don't look :) */
+
+/* deflateInit and inflateInit are macros to allow checking the zlib version
+ * and the compiler's view of z_stream:
+ */
+ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level,
+                                     const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm,
+                                     const char *version, int stream_size));
+ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int  level, int  method,
+                                      int windowBits, int memLevel,
+                                      int strategy, const char *version,
+                                      int stream_size));
+ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int  windowBits,
+                                      const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits,
+                                         unsigned char FAR *window,
+                                         const char *version,
+                                         int stream_size));
+#ifdef Z_PREFIX_SET
+#  define z_deflateInit(strm, level) \
+          deflateInit_((strm), (level), ZLIB_VERSION, (int)sizeof(z_stream))
+#  define z_inflateInit(strm) \
+          inflateInit_((strm), ZLIB_VERSION, (int)sizeof(z_stream))
+#  define z_deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
+          deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
+                        (strategy), ZLIB_VERSION, (int)sizeof(z_stream))
+#  define z_inflateInit2(strm, windowBits) \
+          inflateInit2_((strm), (windowBits), ZLIB_VERSION, \
+                        (int)sizeof(z_stream))
+#  define z_inflateBackInit(strm, windowBits, window) \
+          inflateBackInit_((strm), (windowBits), (window), \
+                           ZLIB_VERSION, (int)sizeof(z_stream))
+#else
+#  define deflateInit(strm, level) \
+          deflateInit_((strm), (level), ZLIB_VERSION, (int)sizeof(z_stream))
+#  define inflateInit(strm) \
+          inflateInit_((strm), ZLIB_VERSION, (int)sizeof(z_stream))
+#  define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
+          deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
+                        (strategy), ZLIB_VERSION, (int)sizeof(z_stream))
+#  define inflateInit2(strm, windowBits) \
+          inflateInit2_((strm), (windowBits), ZLIB_VERSION, \
+                        (int)sizeof(z_stream))
+#  define inflateBackInit(strm, windowBits, window) \
+          inflateBackInit_((strm), (windowBits), (window), \
+                           ZLIB_VERSION, (int)sizeof(z_stream))
+#endif
+
+#ifndef Z_SOLO
+
+/* gzgetc() macro and its supporting function and exposed data structure.  Note
+ * that the real internal state is much larger than the exposed structure.
+ * This abbreviated structure exposes just enough for the gzgetc() macro.  The
+ * user should not mess with these exposed elements, since their names or
+ * behavior could change in the future, perhaps even capriciously.  They can
+ * only be used by the gzgetc() macro.  You have been warned.
+ */
+struct gzFile_s {
+    unsigned have;          /* gzgetc() fast path is taken only while non-zero; decremented per byte */
+    unsigned char *next;    /* byte the gzgetc() macro returns next; post-incremented on each read */
+    z_off64_t pos;          /* incremented once for every byte the gzgetc() macro consumes */
+};
+ZEXTERN int ZEXPORT gzgetc_ OF((gzFile file));  /* backward compatibility */
+#ifdef Z_PREFIX_SET
+#  undef z_gzgetc
+#  define z_gzgetc(g) \
+          ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : (gzgetc)(g))
+#else
+#  define gzgetc(g) \
+          ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : (gzgetc)(g))
+#endif
+
+/* provide 64-bit offset functions if _LARGEFILE64_SOURCE defined, and/or
+ * change the regular functions to 64 bits if _FILE_OFFSET_BITS is 64 (if
+ * both are true, the application gets the *64 functions, and the regular
+ * functions are changed to 64 bits) -- in case these are set on systems
+ * without large file support, _LFS64_LARGEFILE must also be true
+ */
+#ifdef Z_LARGE64
+   ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
+   ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int));
+   ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile));
+   ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile));
+   ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off64_t));
+   ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off64_t));
+#endif
+
+#if !defined(ZLIB_INTERNAL) && defined(Z_WANT64)
+#  ifdef Z_PREFIX_SET
+#    define z_gzopen z_gzopen64
+#    define z_gzseek z_gzseek64
+#    define z_gztell z_gztell64
+#    define z_gzoffset z_gzoffset64
+#    define z_adler32_combine z_adler32_combine64
+#    define z_crc32_combine z_crc32_combine64
+#  else
+#    define gzopen gzopen64
+#    define gzseek gzseek64
+#    define gztell gztell64
+#    define gzoffset gzoffset64
+#    define adler32_combine adler32_combine64
+#    define crc32_combine crc32_combine64
+#  endif
+#  ifndef Z_LARGE64
+     ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
+     ZEXTERN z_off_t ZEXPORT gzseek64 OF((gzFile, z_off_t, int));
+     ZEXTERN z_off_t ZEXPORT gztell64 OF((gzFile));
+     ZEXTERN z_off_t ZEXPORT gzoffset64 OF((gzFile));
+     ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t));
+     ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t));
+#  endif
+#else
+   ZEXTERN gzFile ZEXPORT gzopen OF((const char *, const char *));
+   ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile, z_off_t, int));
+   ZEXTERN z_off_t ZEXPORT gztell OF((gzFile));
+   ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile));
+   ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t));
+   ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t));
+#endif
+
+#else /* Z_SOLO */
+
+   ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t));
+   ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t));
+
+#endif /* !Z_SOLO */
+
+/* undocumented functions */
+ZEXTERN const char   * ZEXPORT zError           OF((int));
+ZEXTERN int            ZEXPORT inflateSyncPoint OF((z_streamp));
+ZEXTERN const z_crc_t FAR * ZEXPORT get_crc_table    OF((void));
+ZEXTERN int            ZEXPORT inflateUndermine OF((z_streamp, int));
+ZEXTERN int            ZEXPORT inflateValidate OF((z_streamp, int));
+ZEXTERN unsigned long  ZEXPORT inflateCodesUsed OF ((z_streamp));
+ZEXTERN int            ZEXPORT inflateResetKeep OF((z_streamp));
+ZEXTERN int            ZEXPORT deflateResetKeep OF((z_streamp));
+#if (defined(_WIN32) || defined(__CYGWIN__)) && !defined(Z_SOLO)
+ZEXTERN gzFile         ZEXPORT gzopen_w OF((const wchar_t *path,
+                                            const char *mode));
+#endif
+#if defined(STDC) || defined(Z_HAVE_STDARG_H)
+#  ifndef Z_SOLO
+ZEXTERN int            ZEXPORTVA gzvprintf Z_ARG((gzFile file,
+                                                  const char *format,
+                                                  va_list va));
+#  endif
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZLIB_H */
diff --git a/deps/SZ/zlib/zutil.c b/deps/SZ/zlib/zutil.c
new file mode 100644
index 0000000000000000000000000000000000000000..a76c6b0c7e557f8c29cfcf58a5ef9ef79c5e4e8a
--- /dev/null
+++ b/deps/SZ/zlib/zutil.c
@@ -0,0 +1,325 @@
+/* zutil.c -- target dependent utility functions for the compression library
+ * Copyright (C) 1995-2017 Jean-loup Gailly
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#include "zutil.h"
+#ifndef Z_SOLO
+#  include "gzguts.h"
+#endif
+
+z_const char * const z_errmsg[10] = { /* looked up by ERR_MSG() as z_errmsg[Z_NEED_DICT-(err)] */
+    (z_const char *)"need dictionary",     /* Z_NEED_DICT       2  */
+    (z_const char *)"stream end",          /* Z_STREAM_END      1  */
+    (z_const char *)"",                    /* Z_OK              0  */
+    (z_const char *)"file error",          /* Z_ERRNO         (-1) */
+    (z_const char *)"stream error",        /* Z_STREAM_ERROR  (-2) */
+    (z_const char *)"data error",          /* Z_DATA_ERROR    (-3) */
+    (z_const char *)"insufficient memory", /* Z_MEM_ERROR     (-4) */
+    (z_const char *)"buffer error",        /* Z_BUF_ERROR     (-5) */
+    (z_const char *)"incompatible version",/* Z_VERSION_ERROR (-6) */
+    (z_const char *)""                     /* tenth slot: empty; no code listed above maps to it */
+};
+
+
+const char * ZEXPORT zlibVersion() /* returns the ZLIB_VERSION value this file was built with */
+{
+    return ZLIB_VERSION;
+}
+
+uLong ZEXPORT zlibCompileFlags() /* packs type sizes and build options into one uLong bit mask */
+{
+    uLong flags;
+
+    flags = 0;
+    switch ((int)(sizeof(uInt))) { /* bits 0-1: size code for uInt (2 bytes=0, 4=1, 8=2, other=3) */
+    case 2:     break;
+    case 4:     flags += 1;     break;
+    case 8:     flags += 2;     break;
+    default:    flags += 3;
+    }
+    switch ((int)(sizeof(uLong))) { /* bits 2-3: same size code, for uLong */
+    case 2:     break;
+    case 4:     flags += 1 << 2;        break;
+    case 8:     flags += 2 << 2;        break;
+    default:    flags += 3 << 2;
+    }
+    switch ((int)(sizeof(voidpf))) { /* bits 4-5: same size code, for voidpf */
+    case 2:     break;
+    case 4:     flags += 1 << 4;        break;
+    case 8:     flags += 2 << 4;        break;
+    default:    flags += 3 << 4;
+    }
+    switch ((int)(sizeof(z_off_t))) { /* bits 6-7: same size code, for z_off_t */
+    case 2:     break;
+    case 4:     flags += 1 << 6;        break;
+    case 8:     flags += 2 << 6;        break;
+    default:    flags += 3 << 6;
+    }
+#ifdef ZLIB_DEBUG
+    flags += 1 << 8;    /* bit 8: ZLIB_DEBUG is defined */
+#endif
+#if defined(ASMV) || defined(ASMINF)
+    flags += 1 << 9;    /* bit 9: ASMV or ASMINF is defined */
+#endif
+#ifdef ZLIB_WINAPI
+    flags += 1 << 10;   /* bit 10: ZLIB_WINAPI is defined */
+#endif
+#ifdef BUILDFIXED
+    flags += 1 << 12;   /* bit 12: BUILDFIXED is defined */
+#endif
+#ifdef DYNAMIC_CRC_TABLE
+    flags += 1 << 13;   /* bit 13: DYNAMIC_CRC_TABLE is defined */
+#endif
+#ifdef NO_GZCOMPRESS
+    flags += 1L << 16;  /* bit 16: NO_GZCOMPRESS is defined */
+#endif
+#ifdef NO_GZIP
+    flags += 1L << 17;  /* bit 17: NO_GZIP is defined */
+#endif
+#ifdef PKZIP_BUG_WORKAROUND
+    flags += 1L << 20;  /* bit 20: PKZIP_BUG_WORKAROUND is defined */
+#endif
+#ifdef FASTEST
+    flags += 1L << 21;  /* bit 21: FASTEST is defined */
+#endif
+#if defined(STDC) || defined(Z_HAVE_STDARG_H)
+#  ifdef NO_vsnprintf
+    flags += 1L << 25;  /* bit 25: NO_vsnprintf is defined */
+#    ifdef HAS_vsprintf_void
+    flags += 1L << 26;  /* bit 26: HAS_vsprintf_void is defined */
+#    endif
+#  else
+#    ifdef HAS_vsnprintf_void
+    flags += 1L << 26;  /* bit 26: HAS_vsnprintf_void is defined */
+#    endif
+#  endif
+#else
+    flags += 1L << 24;  /* bit 24: neither STDC nor Z_HAVE_STDARG_H is defined */
+#  ifdef NO_snprintf
+    flags += 1L << 25;  /* bit 25: NO_snprintf is defined */
+#    ifdef HAS_sprintf_void
+    flags += 1L << 26;  /* bit 26: HAS_sprintf_void is defined */
+#    endif
+#  else
+#    ifdef HAS_snprintf_void
+    flags += 1L << 26;  /* bit 26: HAS_snprintf_void is defined */
+#    endif
+#  endif
+#endif
+    return flags;
+}
+
+#ifdef ZLIB_DEBUG
+#include <stdlib.h>
+#  ifndef verbose
+#    define verbose 0
+#  endif
+int ZLIB_INTERNAL z_verbose = verbose;
+
+void ZLIB_INTERNAL z_error (m) /* compiled only under ZLIB_DEBUG: report m and terminate */
+    char *m;
+{
+    fprintf(stderr, "%s\n", m); /* message followed by a newline, written to stderr */
+    exit(1);                    /* ends the process with exit status 1 */
+}
+#endif
+
+/* Return ERR_MSG(err), the message string for error code err (err is not
+ * validated here); exported so error codes from compress() and uncompress()
+ * can be converted to strings. */
+const char * ZEXPORT zError(err)
+    int err;
+{
+    return ERR_MSG(err);
+}
+
+#if defined(_WIN32_WCE)
+    /* The Microsoft C Run-Time Library for Windows CE doesn't have
+     * errno.  We define it as a global variable to simplify porting.
+     * Its value is always 0 and should not be used.
+     */
+    int errno = 0;
+#endif
+
+#ifndef HAVE_MEMCPY
+
+void ZLIB_INTERNAL zmemcpy(dest, source, len)
+    Bytef* dest;
+    const Bytef* source;
+    uInt  len;
+{
+    /* Byte-by-byte forward copy of len bytes from source to dest; this
+     * fallback is compiled only when HAVE_MEMCPY is not defined.  A zero
+     * len copies nothing. */
+    for (; len != 0; len--) *dest++ = *source++;
+}
+
+int ZLIB_INTERNAL zmemcmp(s1, s2, len)
+    const Bytef* s1;
+    const Bytef* s2;
+    uInt  len;
+{
+    /* Returns 0 when the first len bytes agree; otherwise 1 or -1 according
+     * to whether the first differing byte of s1 is greater or not. */
+    for (; len != 0; len--, s1++, s2++) {
+        if (*s1 != *s2) return *s1 > *s2 ? 1 : -1;
+    }
+    return 0;
+}
+
+void ZLIB_INTERNAL zmemzero(dest, len)
+    Bytef* dest;
+    uInt  len;
+{
+    /* Store zero into each of the first len bytes at dest; this fallback is
+     * compiled only when HAVE_MEMCPY is not defined.  A zero len writes
+     * nothing. */
+    for (; len != 0; len--) *dest++ = 0;
+}
+#endif
+
+#ifndef Z_SOLO
+
+#ifdef SYS16BIT
+
+#ifdef __TURBOC__
+/* Turbo C in 16-bit mode */
+
+#  define MY_ZCALLOC
+
+/* Turbo C malloc() does not allow dynamic allocation of 64K bytes
+ * and farmalloc(64K) returns a pointer with an offset of 8, so we
+ * must fix the pointer. Warning: the pointer must be put back to its
+ * original form in order to free it, use zcfree().
+ */
+
+#define MAX_PTR 10
+/* 10*64K = 640K */
+
+local int next_ptr = 0;
+
+typedef struct ptr_table_s {
+    voidpf org_ptr;
+    voidpf new_ptr;
+} ptr_table;
+
+local ptr_table table[MAX_PTR];
+/* This table is used to remember the original form of pointers
+ * to large buffers (64K). Such pointers are normalized with a zero offset.
+ * Since MSDOS is not a preemptive multitasking OS, this table is not
+ * protected from concurrent access. This hack doesn't work anyway on
+ * a protected system like OS/2. Use Microsoft C instead.
+ */
+
+voidpf ZLIB_INTERNAL zcalloc (voidpf opaque, unsigned items, unsigned size)
+{
+    voidpf buf;
+    ulg bsize = (ulg)items*size; /* byte count, computed in ulg (unsigned long) */
+
+    (void)opaque;
+
+    /* If we allocate less than 65520 bytes, we assume that farmalloc
+     * will return a usable pointer which doesn't have to be normalized.
+     */
+    if (bsize < 65520L) {
+        buf = farmalloc(bsize);
+        if (*(ush*)&buf != 0) return buf; /* first ush of the pointer is nonzero: hand it back unchanged */
+    } else {
+        buf = farmalloc(bsize + 16L); /* presumably slack for the round-up performed below -- confirm */
+    }
+    if (buf == NULL || next_ptr >= MAX_PTR) return NULL; /* NOTE(review): a non-NULL buf is not freed here when only the table is full */
+    table[next_ptr].org_ptr = buf; /* remembered so zcfree() can hand the original pointer to farfree() */
+
+    /* Normalize the pointer to seg:0 */
+    *((ush*)&buf+1) += ((ush)((uch*)buf-0) + 15) >> 4;
+    *(ush*)&buf = 0;
+    table[next_ptr++].new_ptr = buf; /* the adjusted pointer is what the caller receives */
+    return buf;
+}
+
+void ZLIB_INTERNAL zcfree (voidpf opaque, voidpf ptr)
+{
+    int n;
+
+    (void)opaque;
+
+    if (*(ush*)&ptr != 0) { /* object < 64K */
+        farfree(ptr); /* pointer was returned unmodified by zcalloc, so free it directly */
+        return;
+    }
+    /* Find the original pointer */
+    for (n = 0; n < next_ptr; n++) {
+        if (ptr != table[n].new_ptr) continue;
+
+        farfree(table[n].org_ptr); /* release using the pointer farmalloc originally returned */
+        while (++n < next_ptr) { /* close the gap: move each later entry down one slot */
+            table[n-1] = table[n];
+        }
+        next_ptr--;
+        return;
+    }
+    Assert(0, "zcfree: ptr not found"); /* expands to nothing unless ZLIB_DEBUG is defined */
+}
+
+#endif /* __TURBOC__ */
+
+
+#ifdef M_I86
+/* Microsoft C in 16-bit mode */
+
+#  define MY_ZCALLOC
+
+#if (!defined(_MSC_VER) || (_MSC_VER <= 600))
+#  define _halloc  halloc
+#  define _hfree   hfree
+#endif
+
+voidpf ZLIB_INTERNAL zcalloc (voidpf opaque, uInt items, uInt size) /* Microsoft C 16-bit variant */
+{
+    (void)opaque;
+    return _halloc((long)items, size); /* _halloc is mapped to halloc above unless _MSC_VER > 600 */
+}
+
+void ZLIB_INTERNAL zcfree (voidpf opaque, voidpf ptr) /* Microsoft C 16-bit variant */
+{
+    (void)opaque;
+    _hfree(ptr); /* _hfree is mapped to hfree above unless _MSC_VER > 600 */
+}
+
+#endif /* M_I86 */
+
+#endif /* SYS16BIT */
+
+
+#ifndef MY_ZCALLOC /* Any system without a special alloc function */
+
+#ifndef STDC
+extern voidp  malloc OF((uInt size));
+extern voidp  calloc OF((uInt items, uInt size));
+extern void   free   OF((voidpf ptr));
+#endif
+
+voidpf ZLIB_INTERNAL zcalloc (opaque, items, size)
+    voidpf opaque;
+    unsigned items;
+    unsigned size;
+{
+    (void)opaque;
+    return sizeof(uInt) > 2 ? (voidpf)malloc(items * size) :
+                              (voidpf)calloc(items, size);
+}
+
+void ZLIB_INTERNAL zcfree (opaque, ptr) /* generic variant, compiled when MY_ZCALLOC is not defined */
+    voidpf opaque; /* unused by this implementation */
+    voidpf ptr;    /* block to release */
+{
+    (void)opaque;
+    free(ptr); /* passed straight to the C library's free() */
+}
+
+#endif /* MY_ZCALLOC */
+
+#endif /* !Z_SOLO */
diff --git a/deps/SZ/zlib/zutil.h b/deps/SZ/zlib/zutil.h
new file mode 100644
index 0000000000000000000000000000000000000000..b079ea6a80f5abd23a6b2451d6eaee50ceda969b
--- /dev/null
+++ b/deps/SZ/zlib/zutil.h
@@ -0,0 +1,271 @@
+/* zutil.h -- internal interface and configuration of the compression library
+ * Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+/* @(#) $Id$ */
+
+#ifndef ZUTIL_H
+#define ZUTIL_H
+
+#ifdef HAVE_HIDDEN
+#  define ZLIB_INTERNAL __attribute__((visibility ("hidden")))
+#else
+#  define ZLIB_INTERNAL
+#endif
+
+#include "zlib.h"
+
+#if defined(STDC) && !defined(Z_SOLO)
+#  if !(defined(_WIN32_WCE) && defined(_MSC_VER))
+#    include <stddef.h>
+#  endif
+#  include <string.h>
+#  include <stdlib.h>
+#endif
+
+#ifdef Z_SOLO
+   typedef long ptrdiff_t;  /* guess -- will be caught if guess is wrong */
+#endif
+
+#ifndef local
+#  define local static
+#endif
+/* since "static" is used to mean two completely different things in C, we
+   define "local" for the file-scope (internal linkage) meaning of "static",
+   for readability (compile with -Dlocal if your debugger can't find static symbols) */
+
+typedef unsigned char  uch;
+typedef uch FAR uchf;
+typedef unsigned short ush;
+typedef ush FAR ushf;
+typedef unsigned long  ulg;
+
+extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
+/* (size given to avoid silly warnings with Visual C++) */
+
+#define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)] /* plain table lookup; err is not range-checked */
+
+#define ERR_RETURN(strm,err) \
+  return (strm->msg = ERR_MSG(err), (err))
+/* Stores the message in strm->msg and returns err. To be used only when the state is known to be valid */
+
+        /* common constants */
+
+#ifndef DEF_WBITS
+#  define DEF_WBITS MAX_WBITS
+#endif
+/* default windowBits for decompression. MAX_WBITS is for compression only */
+
+#if MAX_MEM_LEVEL >= 8
+#  define DEF_MEM_LEVEL 8
+#else
+#  define DEF_MEM_LEVEL  MAX_MEM_LEVEL
+#endif
+/* default memLevel */
+
+#define STORED_BLOCK 0
+#define STATIC_TREES 1
+#define DYN_TREES    2
+/* The three kinds of block type */
+
+#define MIN_MATCH  3
+#define MAX_MATCH  258
+/* The minimum and maximum match lengths */
+
+#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */
+
+        /* target dependencies */
+
+#if defined(MSDOS) || (defined(WINDOWS) && !defined(WIN32))
+#  define OS_CODE  0x00
+#  ifndef Z_SOLO
+#    if defined(__TURBOC__) || defined(__BORLANDC__)
+#      if (__STDC__ == 1) && (defined(__LARGE__) || defined(__COMPACT__))
+         /* Allow compilation with ANSI keywords only enabled */
+         void _Cdecl farfree( void *block );
+         void *_Cdecl farmalloc( unsigned long nbytes );
+#      else
+#        include <alloc.h>
+#      endif
+#    else /* MSC or DJGPP */
+#      include <malloc.h>
+#    endif
+#  endif
+#endif
+
+#ifdef AMIGA
+#  define OS_CODE  1
+#endif
+
+#if defined(VAXC) || defined(VMS)
+#  define OS_CODE  2
+#  define F_OPEN(name, mode) \
+     fopen((name), (mode), "mbc=60", "ctx=stm", "rfm=fix", "mrs=512")
+#endif
+
+#ifdef __370__
+#  if __TARGET_LIB__ < 0x20000000
+#    define OS_CODE 4
+#  elif __TARGET_LIB__ < 0x40000000
+#    define OS_CODE 11
+#  else
+#    define OS_CODE 8
+#  endif
+#endif
+
+#if defined(ATARI) || defined(atarist)
+#  define OS_CODE  5
+#endif
+
+#ifdef OS2
+#  define OS_CODE  6
+#  if defined(M_I86) && !defined(Z_SOLO)
+#    include <malloc.h>
+#  endif
+#endif
+
+#if defined(MACOS) || defined(TARGET_OS_MAC)
+#  define OS_CODE  7
+#  ifndef Z_SOLO
+#    if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os
+#      include <unix.h> /* for fdopen */
+#    else
+#      ifndef fdopen
+#        define fdopen(fd,mode) NULL /* No fdopen() */
+#      endif
+#    endif
+#  endif
+#endif
+
+#ifdef __acorn
+#  define OS_CODE 13
+#endif
+
+#if defined(WIN32) && !defined(__CYGWIN__)
+#  define OS_CODE  10
+#endif
+
+#ifdef _BEOS_
+#  define OS_CODE  16
+#endif
+
+#ifdef __TOS_OS400__
+#  define OS_CODE 18
+#endif
+
+#ifdef __APPLE__
+#  define OS_CODE 19
+#endif
+
+#if defined(_BEOS_) || defined(RISCOS)
+#  define fdopen(fd,mode) NULL /* No fdopen() */
+#endif
+
+#if (defined(_MSC_VER) && (_MSC_VER > 600)) && !defined __INTERIX
+#  if defined(_WIN32_WCE)
+#    define fdopen(fd,mode) NULL /* No fdopen() */
+#    ifndef _PTRDIFF_T_DEFINED
+       typedef int ptrdiff_t;
+#      define _PTRDIFF_T_DEFINED
+#    endif
+#  else
+#    define fdopen(fd,type)  _fdopen(fd,type)
+#  endif
+#endif
+
+#if defined(__BORLANDC__) && !defined(MSDOS)
+  #pragma warn -8004
+  #pragma warn -8008
+  #pragma warn -8066
+#endif
+
+/* provide prototypes for these when building zlib without LFS */
+#if !defined(_WIN32) && \
+    (!defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0)
+    ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t));
+    ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t));
+#endif
+
+        /* common defaults */
+
+#ifndef OS_CODE
+#  define OS_CODE  3     /* assume Unix */
+#endif
+
+#ifndef F_OPEN
+#  define F_OPEN(name, mode) fopen((name), (mode))
+#endif
+
+         /* functions */
+
+#if defined(pyr) || defined(Z_SOLO)
+#  define NO_MEMCPY
+#endif
+#if defined(SMALL_MEDIUM) && !defined(_MSC_VER) && !defined(__SC__)
+ /* Use our own functions for small and medium model with MSC <= 5.0.
+  * You may have to use the same strategy for Borland C (untested).
+  * The __SC__ check is for Symantec.
+  */
+#  define NO_MEMCPY
+#endif
+#if defined(STDC) && !defined(HAVE_MEMCPY) && !defined(NO_MEMCPY)
+#  define HAVE_MEMCPY
+#endif
+#ifdef HAVE_MEMCPY
+#  ifdef SMALL_MEDIUM /* MSDOS small or medium model */
+#    define zmemcpy _fmemcpy
+#    define zmemcmp _fmemcmp
+#    define zmemzero(dest, len) _fmemset(dest, 0, len)
+#  else
+#    define zmemcpy memcpy
+#    define zmemcmp memcmp
+#    define zmemzero(dest, len) memset(dest, 0, len)
+#  endif
+#else
+   void ZLIB_INTERNAL zmemcpy OF((Bytef* dest, const Bytef* source, uInt len));
+   int ZLIB_INTERNAL zmemcmp OF((const Bytef* s1, const Bytef* s2, uInt len));
+   void ZLIB_INTERNAL zmemzero OF((Bytef* dest, uInt len));
+#endif
+
+/* Diagnostic functions: active only when ZLIB_DEBUG is defined; otherwise every macro below expands to nothing */
+#ifdef ZLIB_DEBUG
+#  include <stdio.h>
+   extern int ZLIB_INTERNAL z_verbose; /* threshold tested by the Trace* macros below */
+   extern void ZLIB_INTERNAL z_error OF((char *m));
+#  define Assert(cond,msg) {if(!(cond)) z_error(msg);}
+#  define Trace(x) {if (z_verbose>=0) fprintf x ;} /* x is pasted after fprintf, so it must be a parenthesized argument list */
+#  define Tracev(x) {if (z_verbose>0) fprintf x ;}
+#  define Tracevv(x) {if (z_verbose>1) fprintf x ;}
+#  define Tracec(c,x) {if (z_verbose>0 && (c)) fprintf x ;} /* as Tracev, but only when c is also true */
+#  define Tracecv(c,x) {if (z_verbose>1 && (c)) fprintf x ;} /* as Tracevv, but only when c is also true */
+#else
+#  define Assert(cond,msg)
+#  define Trace(x)
+#  define Tracev(x)
+#  define Tracevv(x)
+#  define Tracec(c,x)
+#  define Tracecv(c,x)
+#endif
+
+#ifndef Z_SOLO /* zcalloc and zcfree are declared only when Z_SOLO is not defined */
+   voidpf ZLIB_INTERNAL zcalloc OF((voidpf opaque, unsigned items,
+                                    unsigned size));
+   void ZLIB_INTERNAL zcfree  OF((voidpf opaque, voidpf ptr));
+#endif
+
+#define ZALLOC(strm, items, size) \
+           (*((strm)->zalloc))((strm)->opaque, (items), (size))
+#define ZFREE(strm, addr)  (*((strm)->zfree))((strm)->opaque, (voidpf)(addr)) /* calls the stream's zfree hook with its opaque value */
+#define TRY_FREE(s, p) {if (p) ZFREE(s, p);} /* ZFREE only when p is non-null */
+
+/* Reverse the bytes in a 32-bit value */
+#define ZSWAP32(q) ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \
+                    (((q) & 0xff00) << 8) + (((q) & 0xff) << 24))
+
+#endif /* ZUTIL_H */
diff --git a/deps/SZ/zstd/CMakeLists.txt b/deps/SZ/zstd/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..59665ce0ac1ce52978c56a2ae8865f4e6c5eaff6
--- /dev/null
+++ b/deps/SZ/zstd/CMakeLists.txt
@@ -0,0 +1,52 @@
+add_library(zstd  # no STATIC/SHARED keyword, so the library type follows BUILD_SHARED_LIBS
+  ./common/entropy_common.c
+  ./common/pool.c
+  ./common/threading.c
+  ./common/debug.c
+  ./common/xxhash.c
+  ./common/fse_decompress.c
+  ./common/zstd_common.c
+  ./common/error_private.c
+  ./compress/zstd_ldm.c
+  ./compress/zstd_lazy.c
+  ./compress/huf_compress.c
+  ./compress/zstd_opt.c
+  ./compress/zstd_double_fast.c
+  ./compress/zstd_compress.c
+  ./compress/zstd_fast.c
+  ./compress/fse_compress.c
+  ./compress/zstdmt_compress.c
+  ./compress/hist.c
+  ./decompress/zstd_decompress.c
+  ./decompress/huf_decompress.c
+  ./deprecated/zbuff_common.c
+  ./deprecated/zbuff_compress.c
+  ./deprecated/zbuff_decompress.c
+  ./legacy/zstd_v05.c
+  ./legacy/zstd_v04.c
+  ./legacy/zstd_v06.c
+  ./legacy/zstd_v07.c
+  ./legacy/zstd_v03.c
+  ./legacy/zstd_v02.c
+  ./legacy/zstd_v01.c
+  ./dictBuilder/cover.c
+  ./dictBuilder/divsufsort.c
+  ./dictBuilder/zdict.c
+  )
+
+target_include_directories(zstd
+  PUBLIC  # also propagated to targets that link against zstd
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/>  # applies when zstd is consumed from this build tree
+  PRIVATE  # used only while compiling zstd's own sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/common
+    ${CMAKE_CURRENT_SOURCE_DIR}/compress
+    ${CMAKE_CURRENT_SOURCE_DIR}/decompress
+    ${CMAKE_CURRENT_SOURCE_DIR}/deprecated
+    ${CMAKE_CURRENT_SOURCE_DIR}/dictBuilder
+    ${CMAKE_CURRENT_SOURCE_DIR}/dll
+    ${CMAKE_CURRENT_SOURCE_DIR}/legacy
+  )
+
+
+
+
diff --git a/deps/SZ/zstd/LICENSE b/deps/SZ/zstd/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..a793a802892567f17d464a831e2e531dc8833f55
--- /dev/null
+++ b/deps/SZ/zstd/LICENSE
@@ -0,0 +1,30 @@
+BSD License
+
+For Zstandard software
+
+Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name Facebook nor the names of its contributors may be used to
+   endorse or promote products derived from this software without specific
+   prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/deps/SZ/zstd/Makefile.am b/deps/SZ/zstd/Makefile.am
new file mode 100644
index 0000000000000000000000000000000000000000..64713cbd2d0711f495c00abcf2fdd6c671f638fd
--- /dev/null
+++ b/deps/SZ/zstd/Makefile.am
@@ -0,0 +1,69 @@
+AUTOMAKE_OPTIONS=foreign
+include_HEADERS=./compress/zstdmt_compress.h \
+		./compress/zstd_opt.h \
+		./compress/zstd_ldm.h \
+		./compress/zstd_compress_internal.h \
+		./compress/hist.h \
+		./compress/zstd_fast.h \
+		./compress/zstd_double_fast.h \
+		./compress/zstd_lazy.h \
+		./common/fse.h \
+		./common/bitstream.h \
+		./common/mem.h \
+		./common/zstd_errors.h \
+		./common/compiler.h \
+		./common/debug.h \
+		./common/huf.h \
+		./common/zstd_internal.h \
+		./common/xxhash.h \
+		./common/cpu.h \
+		./common/pool.h \
+		./common/threading.h \
+		./common/error_private.h \
+		./deprecated/zbuff.h \
+		./dictBuilder/zdict.h \
+		./dictBuilder/divsufsort.h \
+		./legacy/zstd_v07.h \
+		./legacy/zstd_v02.h \
+		./legacy/zstd_v04.h \
+		./legacy/zstd_legacy.h \
+		./legacy/zstd_v06.h \
+		./legacy/zstd_v05.h \
+		./legacy/zstd_v01.h \
+		./legacy/zstd_v03.h \
+		./zstd.h
+lib_LTLIBRARIES=libzstd.la
+libzstd_la_CFLAGS=-I./ -I./compress -I./common -I./deprecated -I./dictBuilder -I./legacy
+libzstd_la_SOURCES=./decompress/zstd_decompress.c \
+		./decompress/huf_decompress.c \
+		./compress/zstd_lazy.c \
+		./compress/zstdmt_compress.c \
+		./compress/zstd_double_fast.c \
+		./compress/zstd_fast.c \
+		./compress/hist.c \
+		./compress/fse_compress.c \
+		./compress/zstd_opt.c \
+		./compress/zstd_compress.c \
+		./compress/huf_compress.c \
+		./compress/zstd_ldm.c \
+		./common/xxhash.c \
+		./common/fse_decompress.c \
+		./common/pool.c \
+		./common/zstd_common.c \
+		./common/error_private.c \
+		./common/debug.c \
+		./common/threading.c \
+		./common/entropy_common.c \
+		./deprecated/zbuff_compress.c \
+		./deprecated/zbuff_decompress.c \
+		./deprecated/zbuff_common.c \
+		./dictBuilder/zdict.c \
+		./dictBuilder/divsufsort.c \
+		./dictBuilder/cover.c \
+		./legacy/zstd_v03.c \
+		./legacy/zstd_v07.c \
+		./legacy/zstd_v06.c \
+		./legacy/zstd_v01.c \
+		./legacy/zstd_v02.c \
+		./legacy/zstd_v04.c \
+		./legacy/zstd_v05.c
diff --git a/deps/SZ/zstd/Makefile.in b/deps/SZ/zstd/Makefile.in
new file mode 100644
index 0000000000000000000000000000000000000000..f51b6e105ee5c5e565e3b10113153057f39d08bd
--- /dev/null
+++ b/deps/SZ/zstd/Makefile.in
@@ -0,0 +1,1284 @@
+# Makefile.in generated by automake 1.16.2 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2020 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+
+VPATH = @srcdir@
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = zstd
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(include_HEADERS) \
+	$(am__DIST_COMMON)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+  for p in $$list; do echo "$$p $$p"; done | \
+  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+    if (++n[$$2] == $(am__install_max)) \
+      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+    END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
+am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)"
+LTLIBRARIES = $(lib_LTLIBRARIES)
+libzstd_la_LIBADD =
+am__dirstamp = $(am__leading_dot)dirstamp
+am_libzstd_la_OBJECTS = ./decompress/libzstd_la-zstd_decompress.lo \
+	./decompress/libzstd_la-huf_decompress.lo \
+	./compress/libzstd_la-zstd_lazy.lo \
+	./compress/libzstd_la-zstdmt_compress.lo \
+	./compress/libzstd_la-zstd_double_fast.lo \
+	./compress/libzstd_la-zstd_fast.lo \
+	./compress/libzstd_la-hist.lo \
+	./compress/libzstd_la-fse_compress.lo \
+	./compress/libzstd_la-zstd_opt.lo \
+	./compress/libzstd_la-zstd_compress.lo \
+	./compress/libzstd_la-huf_compress.lo \
+	./compress/libzstd_la-zstd_ldm.lo \
+	./common/libzstd_la-xxhash.lo \
+	./common/libzstd_la-fse_decompress.lo \
+	./common/libzstd_la-pool.lo ./common/libzstd_la-zstd_common.lo \
+	./common/libzstd_la-error_private.lo \
+	./common/libzstd_la-debug.lo ./common/libzstd_la-threading.lo \
+	./common/libzstd_la-entropy_common.lo \
+	./deprecated/libzstd_la-zbuff_compress.lo \
+	./deprecated/libzstd_la-zbuff_decompress.lo \
+	./deprecated/libzstd_la-zbuff_common.lo \
+	./dictBuilder/libzstd_la-zdict.lo \
+	./dictBuilder/libzstd_la-divsufsort.lo \
+	./dictBuilder/libzstd_la-cover.lo \
+	./legacy/libzstd_la-zstd_v03.lo \
+	./legacy/libzstd_la-zstd_v07.lo \
+	./legacy/libzstd_la-zstd_v06.lo \
+	./legacy/libzstd_la-zstd_v01.lo \
+	./legacy/libzstd_la-zstd_v02.lo \
+	./legacy/libzstd_la-zstd_v04.lo \
+	./legacy/libzstd_la-zstd_v05.lo
+libzstd_la_OBJECTS = $(am_libzstd_la_OBJECTS)
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 = 
+libzstd_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(libzstd_la_CFLAGS) \
+	$(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__maybe_remake_depfiles = depfiles
+am__depfiles_remade = ./common/$(DEPDIR)/libzstd_la-debug.Plo \
+	./common/$(DEPDIR)/libzstd_la-entropy_common.Plo \
+	./common/$(DEPDIR)/libzstd_la-error_private.Plo \
+	./common/$(DEPDIR)/libzstd_la-fse_decompress.Plo \
+	./common/$(DEPDIR)/libzstd_la-pool.Plo \
+	./common/$(DEPDIR)/libzstd_la-threading.Plo \
+	./common/$(DEPDIR)/libzstd_la-xxhash.Plo \
+	./common/$(DEPDIR)/libzstd_la-zstd_common.Plo \
+	./compress/$(DEPDIR)/libzstd_la-fse_compress.Plo \
+	./compress/$(DEPDIR)/libzstd_la-hist.Plo \
+	./compress/$(DEPDIR)/libzstd_la-huf_compress.Plo \
+	./compress/$(DEPDIR)/libzstd_la-zstd_compress.Plo \
+	./compress/$(DEPDIR)/libzstd_la-zstd_double_fast.Plo \
+	./compress/$(DEPDIR)/libzstd_la-zstd_fast.Plo \
+	./compress/$(DEPDIR)/libzstd_la-zstd_lazy.Plo \
+	./compress/$(DEPDIR)/libzstd_la-zstd_ldm.Plo \
+	./compress/$(DEPDIR)/libzstd_la-zstd_opt.Plo \
+	./compress/$(DEPDIR)/libzstd_la-zstdmt_compress.Plo \
+	./decompress/$(DEPDIR)/libzstd_la-huf_decompress.Plo \
+	./decompress/$(DEPDIR)/libzstd_la-zstd_decompress.Plo \
+	./deprecated/$(DEPDIR)/libzstd_la-zbuff_common.Plo \
+	./deprecated/$(DEPDIR)/libzstd_la-zbuff_compress.Plo \
+	./deprecated/$(DEPDIR)/libzstd_la-zbuff_decompress.Plo \
+	./dictBuilder/$(DEPDIR)/libzstd_la-cover.Plo \
+	./dictBuilder/$(DEPDIR)/libzstd_la-divsufsort.Plo \
+	./dictBuilder/$(DEPDIR)/libzstd_la-zdict.Plo \
+	./legacy/$(DEPDIR)/libzstd_la-zstd_v01.Plo \
+	./legacy/$(DEPDIR)/libzstd_la-zstd_v02.Plo \
+	./legacy/$(DEPDIR)/libzstd_la-zstd_v03.Plo \
+	./legacy/$(DEPDIR)/libzstd_la-zstd_v04.Plo \
+	./legacy/$(DEPDIR)/libzstd_la-zstd_v05.Plo \
+	./legacy/$(DEPDIR)/libzstd_la-zstd_v06.Plo \
+	./legacy/$(DEPDIR)/libzstd_la-zstd_v07.Plo
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
+	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+	$(AM_CFLAGS) $(CFLAGS)
+AM_V_CC = $(am__v_CC_@AM_V@)
+am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
+am__v_CC_0 = @echo "  CC      " $@;
+am__v_CC_1 = 
+CCLD = $(CC)
+LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CCLD = $(am__v_CCLD_@AM_V@)
+am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
+am__v_CCLD_0 = @echo "  CCLD    " $@;
+am__v_CCLD_1 = 
+SOURCES = $(libzstd_la_SOURCES)
+DIST_SOURCES = $(libzstd_la_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+HEADERS = $(include_HEADERS)
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates.  Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+  BEGIN { nonempty = 0; } \
+  { items[$$0] = 1; nonempty = 1; } \
+  END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique.  This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+  list='$(am__tagged_files)'; \
+  unique=`for i in $$list; do \
+    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+  done | $(am__uniquify_input)`
+ETAGS = etags
+CTAGS = ctags
+am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+# Configure-time settings. Every value below is an at-sign-delimited
+# placeholder that is expected to be replaced when this template is turned
+# into the final Makefile (presumably by config.status; see the Makefile
+# rule further down in this file).
+# NOTE(review): GSL_*, OPENMP_FLAGS, PASTRI_FLAGS, RANDOMACCESS_FLAGS,
+# TIMECMPR_FLAGS and WRITESTATS_FLAGS look project-specific rather than
+# standard Autoconf outputs -- confirm against the project's configure.ac.
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+FC = @FC@
+FCFLAGS = @FCFLAGS@
+FGREP = @FGREP@
+GREP = @GREP@
+GSL_CFLAGS = @GSL_CFLAGS@
+GSL_CONFIG = @GSL_CONFIG@
+GSL_HDR = @GSL_HDR@
+GSL_LIB = @GSL_LIB@
+GSL_LIBS = @GSL_LIBS@
+GSL_STATIC = @GSL_STATIC@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OPENMP_FLAGS = @OPENMP_FLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PASTRI_FLAGS = @PASTRI_FLAGS@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+RANDOMACCESS_FLAGS = @RANDOMACCESS_FLAGS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+STRIP = @STRIP@
+TIMECMPR_FLAGS = @TIMECMPR_FLAGS@
+VERSION = @VERSION@
+WRITESTATS_FLAGS = @WRITESTATS_FLAGS@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_FC = @ac_ct_FC@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+# Automake strictness for this directory; the regeneration rule in this
+# file likewise invokes automake with --foreign.
+AUTOMAKE_OPTIONS = foreign
+# Header files exported through the include_HEADERS primary. The list
+# covers the top-level zstd.h as well as headers from the compress, common,
+# deprecated, dictBuilder and legacy subdirectories.
+include_HEADERS = ./compress/zstdmt_compress.h \
+		./compress/zstd_opt.h \
+		./compress/zstd_ldm.h \
+		./compress/zstd_compress_internal.h \
+		./compress/hist.h \
+		./compress/zstd_fast.h \
+		./compress/zstd_double_fast.h \
+		./compress/zstd_lazy.h \
+		./common/fse.h \
+		./common/bitstream.h \
+		./common/mem.h \
+		./common/zstd_errors.h \
+		./common/compiler.h \
+		./common/debug.h \
+		./common/huf.h \
+		./common/zstd_internal.h \
+		./common/xxhash.h \
+		./common/cpu.h \
+		./common/pool.h \
+		./common/threading.h \
+		./common/error_private.h \
+		./deprecated/zbuff.h \
+		./dictBuilder/zdict.h \
+		./dictBuilder/divsufsort.h \
+		./legacy/zstd_v07.h \
+		./legacy/zstd_v02.h \
+		./legacy/zstd_v04.h \
+		./legacy/zstd_legacy.h \
+		./legacy/zstd_v06.h \
+		./legacy/zstd_v05.h \
+		./legacy/zstd_v01.h \
+		./legacy/zstd_v03.h \
+		./zstd.h
+
+# The single libtool library built in this directory.
+lib_LTLIBRARIES = libzstd.la
+# Header search path for the libzstd objects. The directories are anchored
+# at $(srcdir) so the headers are also found when building outside the
+# source tree; for an in-tree build $(srcdir) is "." and the resulting
+# paths are the same as before.
+# NOTE: this template is generated -- mirror the change in the matching
+# Makefile.am so it survives the next automake run.
+libzstd_la_CFLAGS = -I$(srcdir) -I$(srcdir)/compress -I$(srcdir)/common -I$(srcdir)/deprecated -I$(srcdir)/dictBuilder -I$(srcdir)/legacy
+# C sources compiled into libzstd.la, taken from the decompress, compress,
+# common, deprecated, dictBuilder and legacy subdirectories. Each entry has
+# a matching explicit libzstd_la-<name>.lo compile rule later in this file.
+libzstd_la_SOURCES = ./decompress/zstd_decompress.c \
+		./decompress/huf_decompress.c \
+		./compress/zstd_lazy.c \
+		./compress/zstdmt_compress.c \
+		./compress/zstd_double_fast.c \
+		./compress/zstd_fast.c \
+		./compress/hist.c \
+		./compress/fse_compress.c \
+		./compress/zstd_opt.c \
+		./compress/zstd_compress.c \
+		./compress/huf_compress.c \
+		./compress/zstd_ldm.c \
+		./common/xxhash.c \
+		./common/fse_decompress.c \
+		./common/pool.c \
+		./common/zstd_common.c \
+		./common/error_private.c \
+		./common/debug.c \
+		./common/threading.c \
+		./common/entropy_common.c \
+		./deprecated/zbuff_compress.c \
+		./deprecated/zbuff_decompress.c \
+		./deprecated/zbuff_common.c \
+		./dictBuilder/zdict.c \
+		./dictBuilder/divsufsort.c \
+		./dictBuilder/cover.c \
+		./legacy/zstd_v03.c \
+		./legacy/zstd_v07.c \
+		./legacy/zstd_v06.c \
+		./legacy/zstd_v01.c \
+		./legacy/zstd_v02.c \
+		./legacy/zstd_v04.c \
+		./legacy/zstd_v05.c
+
+# Default goal; it simply depends on all-am.
+all: all-am
+
+# Clear the built-in suffix list, then register only the suffixes used by
+# the suffix rules of this file (.c.o, .c.obj and .c.lo).
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+# Regenerate this template when Makefile.am or a configure input changed.
+# For each changed prerequisite that appears in am__configure_deps, the
+# top-level am--refresh target is run: on success the recipe exits 0 if the
+# target exists (otherwise it falls through to the automake call below), and
+# on failure it exits 1. If no such prerequisite changed, automake is rerun
+# with --foreign for zstd/Makefile from the top source directory.
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign zstd/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --foreign zstd/Makefile
+# Rebuild the Makefile from its template. When config.status is among the
+# changed prerequisites, the top-level am--refresh target is run; otherwise
+# config.status is rerun for just this directory's Makefile (the command is
+# echoed first because the recipe itself is silenced).
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
+	esac;
+
+# config.status, configure and the aclocal output are all refreshed the
+# same way: by running the am--refresh target in the top build directory.
+# The last rule gives the aclocal inputs an empty recipe, so make has a
+# rule for them that does nothing.
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+# Install the libtool libraries into $(DESTDIR)$(libdir). The list is
+# emptied when libdir is empty, and only entries that exist as regular
+# files are kept. If anything remains, the destination directory is created
+# and libtool is run in install mode; each command is echoed beforehand.
+install-libLTLIBRARIES: $(lib_LTLIBRARIES)
+	@$(NORMAL_INSTALL)
+	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+	list2=; for p in $$list; do \
+	  if test -f $$p; then \
+	    list2="$$list2 $$p"; \
+	  else :; fi; \
+	done; \
+	test -z "$$list2" || { \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \
+	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \
+	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \
+	}
+
+# Remove the installed libraries from $(DESTDIR)$(libdir) through libtool's
+# uninstall mode, one library at a time; nothing is done when libdir is
+# empty. NOTE(review): am__strip_dir is defined outside this excerpt and
+# presumably sets the shell variable f to the file name of p -- confirm.
+uninstall-libLTLIBRARIES:
+	@$(NORMAL_UNINSTALL)
+	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+	for p in $$list; do \
+	  $(am__strip_dir) \
+	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \
+	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \
+	done
+
+# Delete the built libtool libraries, then delete a so_locations file from
+# each library's directory. The sed script maps a name without a slash to
+# ".", strips the last path component otherwise, and appends /so_locations;
+# the resulting paths are de-duplicated with sort -u.
+clean-libLTLIBRARIES:
+	-test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
+	@list='$(lib_LTLIBRARIES)'; \
+	locs=`for p in $$list; do echo $$p; done | \
+	      sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
+	      sort -u`; \
+	test -z "$$locs" || { \
+	  echo rm -f $${locs}; \
+	  rm -f $${locs}; \
+	}
+# Stamp-file rules for decompress/: they create the object directory and
+# its dependency subdirectory, then touch a stamp file in each. Both
+# decompress objects list the two stamps as prerequisites, so the
+# directories exist before compilation.
+decompress/$(am__dirstamp):
+	@$(MKDIR_P) ./decompress
+	@: > decompress/$(am__dirstamp)
+decompress/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) ./decompress/$(DEPDIR)
+	@: > decompress/$(DEPDIR)/$(am__dirstamp)
+./decompress/libzstd_la-zstd_decompress.lo:  \
+	decompress/$(am__dirstamp) \
+	decompress/$(DEPDIR)/$(am__dirstamp)
+./decompress/libzstd_la-huf_decompress.lo: decompress/$(am__dirstamp) \
+	decompress/$(DEPDIR)/$(am__dirstamp)
+# Stamp-file rules for compress/: create the object directory and its
+# dependency subdirectory, and make every compress object depend on both
+# stamps.
+compress/$(am__dirstamp):
+	@$(MKDIR_P) ./compress
+	@: > compress/$(am__dirstamp)
+compress/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) ./compress/$(DEPDIR)
+	@: > compress/$(DEPDIR)/$(am__dirstamp)
+./compress/libzstd_la-zstd_lazy.lo: compress/$(am__dirstamp) \
+	compress/$(DEPDIR)/$(am__dirstamp)
+./compress/libzstd_la-zstdmt_compress.lo: compress/$(am__dirstamp) \
+	compress/$(DEPDIR)/$(am__dirstamp)
+./compress/libzstd_la-zstd_double_fast.lo: compress/$(am__dirstamp) \
+	compress/$(DEPDIR)/$(am__dirstamp)
+./compress/libzstd_la-zstd_fast.lo: compress/$(am__dirstamp) \
+	compress/$(DEPDIR)/$(am__dirstamp)
+./compress/libzstd_la-hist.lo: compress/$(am__dirstamp) \
+	compress/$(DEPDIR)/$(am__dirstamp)
+./compress/libzstd_la-fse_compress.lo: compress/$(am__dirstamp) \
+	compress/$(DEPDIR)/$(am__dirstamp)
+./compress/libzstd_la-zstd_opt.lo: compress/$(am__dirstamp) \
+	compress/$(DEPDIR)/$(am__dirstamp)
+./compress/libzstd_la-zstd_compress.lo: compress/$(am__dirstamp) \
+	compress/$(DEPDIR)/$(am__dirstamp)
+./compress/libzstd_la-huf_compress.lo: compress/$(am__dirstamp) \
+	compress/$(DEPDIR)/$(am__dirstamp)
+./compress/libzstd_la-zstd_ldm.lo: compress/$(am__dirstamp) \
+	compress/$(DEPDIR)/$(am__dirstamp)
+# Stamp-file rules for common/: create the object directory and its
+# dependency subdirectory, and make every common object depend on both
+# stamps.
+common/$(am__dirstamp):
+	@$(MKDIR_P) ./common
+	@: > common/$(am__dirstamp)
+common/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) ./common/$(DEPDIR)
+	@: > common/$(DEPDIR)/$(am__dirstamp)
+./common/libzstd_la-xxhash.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
+./common/libzstd_la-fse_decompress.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
+./common/libzstd_la-pool.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
+./common/libzstd_la-zstd_common.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
+./common/libzstd_la-error_private.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
+./common/libzstd_la-debug.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
+./common/libzstd_la-threading.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
+./common/libzstd_la-entropy_common.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
+# Stamp-file rules for deprecated/: create the object directory and its
+# dependency subdirectory, and make the three zbuff objects depend on both
+# stamps.
+deprecated/$(am__dirstamp):
+	@$(MKDIR_P) ./deprecated
+	@: > deprecated/$(am__dirstamp)
+deprecated/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) ./deprecated/$(DEPDIR)
+	@: > deprecated/$(DEPDIR)/$(am__dirstamp)
+./deprecated/libzstd_la-zbuff_compress.lo: deprecated/$(am__dirstamp) \
+	deprecated/$(DEPDIR)/$(am__dirstamp)
+./deprecated/libzstd_la-zbuff_decompress.lo:  \
+	deprecated/$(am__dirstamp) \
+	deprecated/$(DEPDIR)/$(am__dirstamp)
+./deprecated/libzstd_la-zbuff_common.lo: deprecated/$(am__dirstamp) \
+	deprecated/$(DEPDIR)/$(am__dirstamp)
+# Stamp-file rules for dictBuilder/: create the object directory and its
+# dependency subdirectory, and make the dictBuilder objects depend on both
+# stamps.
+dictBuilder/$(am__dirstamp):
+	@$(MKDIR_P) ./dictBuilder
+	@: > dictBuilder/$(am__dirstamp)
+dictBuilder/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) ./dictBuilder/$(DEPDIR)
+	@: > dictBuilder/$(DEPDIR)/$(am__dirstamp)
+./dictBuilder/libzstd_la-zdict.lo: dictBuilder/$(am__dirstamp) \
+	dictBuilder/$(DEPDIR)/$(am__dirstamp)
+./dictBuilder/libzstd_la-divsufsort.lo: dictBuilder/$(am__dirstamp) \
+	dictBuilder/$(DEPDIR)/$(am__dirstamp)
+./dictBuilder/libzstd_la-cover.lo: dictBuilder/$(am__dirstamp) \
+	dictBuilder/$(DEPDIR)/$(am__dirstamp)
+# Stamp-file rules for legacy/: create the object directory and its
+# dependency subdirectory, and make the zstd_v01 through zstd_v07 objects
+# depend on both stamps.
+legacy/$(am__dirstamp):
+	@$(MKDIR_P) ./legacy
+	@: > legacy/$(am__dirstamp)
+legacy/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) ./legacy/$(DEPDIR)
+	@: > legacy/$(DEPDIR)/$(am__dirstamp)
+./legacy/libzstd_la-zstd_v03.lo: legacy/$(am__dirstamp) \
+	legacy/$(DEPDIR)/$(am__dirstamp)
+./legacy/libzstd_la-zstd_v07.lo: legacy/$(am__dirstamp) \
+	legacy/$(DEPDIR)/$(am__dirstamp)
+./legacy/libzstd_la-zstd_v06.lo: legacy/$(am__dirstamp) \
+	legacy/$(DEPDIR)/$(am__dirstamp)
+./legacy/libzstd_la-zstd_v01.lo: legacy/$(am__dirstamp) \
+	legacy/$(DEPDIR)/$(am__dirstamp)
+./legacy/libzstd_la-zstd_v02.lo: legacy/$(am__dirstamp) \
+	legacy/$(DEPDIR)/$(am__dirstamp)
+./legacy/libzstd_la-zstd_v04.lo: legacy/$(am__dirstamp) \
+	legacy/$(DEPDIR)/$(am__dirstamp)
+./legacy/libzstd_la-zstd_v05.lo: legacy/$(am__dirstamp) \
+	legacy/$(DEPDIR)/$(am__dirstamp)
+
+# Link the library from its objects with the library-specific link command,
+# recording $(libdir) as the run-time path and appending libzstd_la_LIBADD
+# and LIBS.
+libzstd.la: $(libzstd_la_OBJECTS) $(libzstd_la_DEPENDENCIES) $(EXTRA_libzstd_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(libzstd_la_LINK) -rpath $(libdir) $(libzstd_la_OBJECTS) $(libzstd_la_LIBADD) $(LIBS)
+
+# Remove compiled objects from the current directory and both plain and
+# libtool (.lo) objects from every source subdirectory. The leading dash
+# tells make to ignore failures of the individual rm commands.
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+	-rm -f ./common/*.$(OBJEXT)
+	-rm -f ./common/*.lo
+	-rm -f ./compress/*.$(OBJEXT)
+	-rm -f ./compress/*.lo
+	-rm -f ./decompress/*.$(OBJEXT)
+	-rm -f ./decompress/*.lo
+	-rm -f ./deprecated/*.$(OBJEXT)
+	-rm -f ./deprecated/*.lo
+	-rm -f ./dictBuilder/*.$(OBJEXT)
+	-rm -f ./dictBuilder/*.lo
+	-rm -f ./legacy/*.$(OBJEXT)
+	-rm -f ./legacy/*.lo
+
+# Remove any *.tab.c files from the current directory; failures are ignored.
+distclean-compile:
+	-rm -f *.tab.c
+
+# Per-object dependency fragments (.Plo), one for every libzstd source.
+# Each line is guarded by the AMDEP_TRUE placeholder and uses the
+# configured include directive and quoting, so the fragments are only
+# pulled in when dependency tracking is enabled at configure time.
+@AMDEP_TRUE@@am__include@ @am__quote@./common/$(DEPDIR)/libzstd_la-debug.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./common/$(DEPDIR)/libzstd_la-entropy_common.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./common/$(DEPDIR)/libzstd_la-error_private.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./common/$(DEPDIR)/libzstd_la-fse_decompress.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./common/$(DEPDIR)/libzstd_la-pool.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./common/$(DEPDIR)/libzstd_la-threading.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./common/$(DEPDIR)/libzstd_la-xxhash.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./common/$(DEPDIR)/libzstd_la-zstd_common.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./compress/$(DEPDIR)/libzstd_la-fse_compress.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./compress/$(DEPDIR)/libzstd_la-hist.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./compress/$(DEPDIR)/libzstd_la-huf_compress.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./compress/$(DEPDIR)/libzstd_la-zstd_compress.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./compress/$(DEPDIR)/libzstd_la-zstd_double_fast.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./compress/$(DEPDIR)/libzstd_la-zstd_fast.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./compress/$(DEPDIR)/libzstd_la-zstd_lazy.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./compress/$(DEPDIR)/libzstd_la-zstd_ldm.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./compress/$(DEPDIR)/libzstd_la-zstd_opt.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./compress/$(DEPDIR)/libzstd_la-zstdmt_compress.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./decompress/$(DEPDIR)/libzstd_la-huf_decompress.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./decompress/$(DEPDIR)/libzstd_la-zstd_decompress.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./deprecated/$(DEPDIR)/libzstd_la-zbuff_common.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./deprecated/$(DEPDIR)/libzstd_la-zbuff_compress.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./deprecated/$(DEPDIR)/libzstd_la-zbuff_decompress.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./dictBuilder/$(DEPDIR)/libzstd_la-cover.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./dictBuilder/$(DEPDIR)/libzstd_la-divsufsort.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./dictBuilder/$(DEPDIR)/libzstd_la-zdict.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./legacy/$(DEPDIR)/libzstd_la-zstd_v01.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./legacy/$(DEPDIR)/libzstd_la-zstd_v02.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./legacy/$(DEPDIR)/libzstd_la-zstd_v03.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./legacy/$(DEPDIR)/libzstd_la-zstd_v04.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./legacy/$(DEPDIR)/libzstd_la-zstd_v05.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./legacy/$(DEPDIR)/libzstd_la-zstd_v06.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./legacy/$(DEPDIR)/libzstd_la-zstd_v07.Plo@am__quote@ # am--include-marker
+
+# Create a placeholder for each dependency fragment in am__depfiles_remade:
+# make its directory, write "# dummy" to a temporary file and move that
+# into place. The am--depfiles target just requests all of them.
+$(am__depfiles_remade):
+	@$(MKDIR_P) $(@D)
+	@echo '# dummy' >$@-t && $(am__mv) $@-t $@
+
+am--depfiles: $(am__depfiles_remade)
+
+# Suffix rule: compile a C source into a .o object. Three alternatives are
+# present, selected by the fastdepCC / AMDEP placeholders at configure time:
+# compile while writing dependencies to a .Tpo file under DEPDIR and rename
+# it to .Po on success; compile through depcomp; or compile without any
+# dependency output.
+.c.o:
+@am__fastdepCC_TRUE@	$(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\
+@am__fastdepCC_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
+
+# Suffix rule: compile a C source into a .obj object. Same three
+# alternatives as the .c.o rule, except that the source path is passed
+# through $(CYGPATH_W) before being handed to the compiler.
+.c.obj:
+@am__fastdepCC_TRUE@	$(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\
+@am__fastdepCC_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+# Suffix rule: compile a C source into a libtool object (.lo) using
+# LTCOMPILE. Dependency output, when enabled, ends up in a .Plo file under
+# DEPDIR, and the depcomp variant is invoked with libtool=yes.
+.c.lo:
+@am__fastdepCC_TRUE@	$(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\
+@am__fastdepCC_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
+
+# Explicit compile rules for the decompress/ sources. Each object carries
+# the libzstd_la- prefix and is compiled through libtool with
+# libzstd_la_CFLAGS placed before CFLAGS. The source is taken from the
+# current directory when present there and from $(srcdir) otherwise. In the
+# fast-dependency variant a .Tpo file is written during compilation and
+# renamed to .Plo afterwards; the other variants use depcomp or compile
+# without dependency output.
+./decompress/libzstd_la-zstd_decompress.lo: ./decompress/zstd_decompress.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./decompress/libzstd_la-zstd_decompress.lo -MD -MP -MF ./decompress/$(DEPDIR)/libzstd_la-zstd_decompress.Tpo -c -o ./decompress/libzstd_la-zstd_decompress.lo `test -f './decompress/zstd_decompress.c' || echo '$(srcdir)/'`./decompress/zstd_decompress.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./decompress/$(DEPDIR)/libzstd_la-zstd_decompress.Tpo ./decompress/$(DEPDIR)/libzstd_la-zstd_decompress.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./decompress/zstd_decompress.c' object='./decompress/libzstd_la-zstd_decompress.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./decompress/libzstd_la-zstd_decompress.lo `test -f './decompress/zstd_decompress.c' || echo '$(srcdir)/'`./decompress/zstd_decompress.c
+
+./decompress/libzstd_la-huf_decompress.lo: ./decompress/huf_decompress.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./decompress/libzstd_la-huf_decompress.lo -MD -MP -MF ./decompress/$(DEPDIR)/libzstd_la-huf_decompress.Tpo -c -o ./decompress/libzstd_la-huf_decompress.lo `test -f './decompress/huf_decompress.c' || echo '$(srcdir)/'`./decompress/huf_decompress.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./decompress/$(DEPDIR)/libzstd_la-huf_decompress.Tpo ./decompress/$(DEPDIR)/libzstd_la-huf_decompress.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./decompress/huf_decompress.c' object='./decompress/libzstd_la-huf_decompress.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./decompress/libzstd_la-huf_decompress.lo `test -f './decompress/huf_decompress.c' || echo '$(srcdir)/'`./decompress/huf_decompress.c
+
+# Explicit compile rules for the compress/ sources, one per file. All of
+# them share one shape: a libtool compile with libzstd_la_CFLAGS, the
+# source looked up in the current directory first and in $(srcdir)
+# otherwise, and -- in the fast-dependency variant -- a .Tpo dependency
+# file that is renamed to .Plo once compilation succeeded.
+./compress/libzstd_la-zstd_lazy.lo: ./compress/zstd_lazy.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./compress/libzstd_la-zstd_lazy.lo -MD -MP -MF ./compress/$(DEPDIR)/libzstd_la-zstd_lazy.Tpo -c -o ./compress/libzstd_la-zstd_lazy.lo `test -f './compress/zstd_lazy.c' || echo '$(srcdir)/'`./compress/zstd_lazy.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./compress/$(DEPDIR)/libzstd_la-zstd_lazy.Tpo ./compress/$(DEPDIR)/libzstd_la-zstd_lazy.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./compress/zstd_lazy.c' object='./compress/libzstd_la-zstd_lazy.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./compress/libzstd_la-zstd_lazy.lo `test -f './compress/zstd_lazy.c' || echo '$(srcdir)/'`./compress/zstd_lazy.c
+
+./compress/libzstd_la-zstdmt_compress.lo: ./compress/zstdmt_compress.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./compress/libzstd_la-zstdmt_compress.lo -MD -MP -MF ./compress/$(DEPDIR)/libzstd_la-zstdmt_compress.Tpo -c -o ./compress/libzstd_la-zstdmt_compress.lo `test -f './compress/zstdmt_compress.c' || echo '$(srcdir)/'`./compress/zstdmt_compress.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./compress/$(DEPDIR)/libzstd_la-zstdmt_compress.Tpo ./compress/$(DEPDIR)/libzstd_la-zstdmt_compress.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./compress/zstdmt_compress.c' object='./compress/libzstd_la-zstdmt_compress.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./compress/libzstd_la-zstdmt_compress.lo `test -f './compress/zstdmt_compress.c' || echo '$(srcdir)/'`./compress/zstdmt_compress.c
+
+./compress/libzstd_la-zstd_double_fast.lo: ./compress/zstd_double_fast.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./compress/libzstd_la-zstd_double_fast.lo -MD -MP -MF ./compress/$(DEPDIR)/libzstd_la-zstd_double_fast.Tpo -c -o ./compress/libzstd_la-zstd_double_fast.lo `test -f './compress/zstd_double_fast.c' || echo '$(srcdir)/'`./compress/zstd_double_fast.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./compress/$(DEPDIR)/libzstd_la-zstd_double_fast.Tpo ./compress/$(DEPDIR)/libzstd_la-zstd_double_fast.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./compress/zstd_double_fast.c' object='./compress/libzstd_la-zstd_double_fast.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./compress/libzstd_la-zstd_double_fast.lo `test -f './compress/zstd_double_fast.c' || echo '$(srcdir)/'`./compress/zstd_double_fast.c
+
+./compress/libzstd_la-zstd_fast.lo: ./compress/zstd_fast.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./compress/libzstd_la-zstd_fast.lo -MD -MP -MF ./compress/$(DEPDIR)/libzstd_la-zstd_fast.Tpo -c -o ./compress/libzstd_la-zstd_fast.lo `test -f './compress/zstd_fast.c' || echo '$(srcdir)/'`./compress/zstd_fast.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./compress/$(DEPDIR)/libzstd_la-zstd_fast.Tpo ./compress/$(DEPDIR)/libzstd_la-zstd_fast.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./compress/zstd_fast.c' object='./compress/libzstd_la-zstd_fast.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./compress/libzstd_la-zstd_fast.lo `test -f './compress/zstd_fast.c' || echo '$(srcdir)/'`./compress/zstd_fast.c
+
+./compress/libzstd_la-hist.lo: ./compress/hist.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./compress/libzstd_la-hist.lo -MD -MP -MF ./compress/$(DEPDIR)/libzstd_la-hist.Tpo -c -o ./compress/libzstd_la-hist.lo `test -f './compress/hist.c' || echo '$(srcdir)/'`./compress/hist.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./compress/$(DEPDIR)/libzstd_la-hist.Tpo ./compress/$(DEPDIR)/libzstd_la-hist.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./compress/hist.c' object='./compress/libzstd_la-hist.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./compress/libzstd_la-hist.lo `test -f './compress/hist.c' || echo '$(srcdir)/'`./compress/hist.c
+
+./compress/libzstd_la-fse_compress.lo: ./compress/fse_compress.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./compress/libzstd_la-fse_compress.lo -MD -MP -MF ./compress/$(DEPDIR)/libzstd_la-fse_compress.Tpo -c -o ./compress/libzstd_la-fse_compress.lo `test -f './compress/fse_compress.c' || echo '$(srcdir)/'`./compress/fse_compress.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./compress/$(DEPDIR)/libzstd_la-fse_compress.Tpo ./compress/$(DEPDIR)/libzstd_la-fse_compress.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./compress/fse_compress.c' object='./compress/libzstd_la-fse_compress.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./compress/libzstd_la-fse_compress.lo `test -f './compress/fse_compress.c' || echo '$(srcdir)/'`./compress/fse_compress.c
+
+./compress/libzstd_la-zstd_opt.lo: ./compress/zstd_opt.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./compress/libzstd_la-zstd_opt.lo -MD -MP -MF ./compress/$(DEPDIR)/libzstd_la-zstd_opt.Tpo -c -o ./compress/libzstd_la-zstd_opt.lo `test -f './compress/zstd_opt.c' || echo '$(srcdir)/'`./compress/zstd_opt.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./compress/$(DEPDIR)/libzstd_la-zstd_opt.Tpo ./compress/$(DEPDIR)/libzstd_la-zstd_opt.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./compress/zstd_opt.c' object='./compress/libzstd_la-zstd_opt.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./compress/libzstd_la-zstd_opt.lo `test -f './compress/zstd_opt.c' || echo '$(srcdir)/'`./compress/zstd_opt.c
+
+./compress/libzstd_la-zstd_compress.lo: ./compress/zstd_compress.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./compress/libzstd_la-zstd_compress.lo -MD -MP -MF ./compress/$(DEPDIR)/libzstd_la-zstd_compress.Tpo -c -o ./compress/libzstd_la-zstd_compress.lo `test -f './compress/zstd_compress.c' || echo '$(srcdir)/'`./compress/zstd_compress.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./compress/$(DEPDIR)/libzstd_la-zstd_compress.Tpo ./compress/$(DEPDIR)/libzstd_la-zstd_compress.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./compress/zstd_compress.c' object='./compress/libzstd_la-zstd_compress.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./compress/libzstd_la-zstd_compress.lo `test -f './compress/zstd_compress.c' || echo '$(srcdir)/'`./compress/zstd_compress.c
+
+./compress/libzstd_la-huf_compress.lo: ./compress/huf_compress.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./compress/libzstd_la-huf_compress.lo -MD -MP -MF ./compress/$(DEPDIR)/libzstd_la-huf_compress.Tpo -c -o ./compress/libzstd_la-huf_compress.lo `test -f './compress/huf_compress.c' || echo '$(srcdir)/'`./compress/huf_compress.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./compress/$(DEPDIR)/libzstd_la-huf_compress.Tpo ./compress/$(DEPDIR)/libzstd_la-huf_compress.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./compress/huf_compress.c' object='./compress/libzstd_la-huf_compress.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./compress/libzstd_la-huf_compress.lo `test -f './compress/huf_compress.c' || echo '$(srcdir)/'`./compress/huf_compress.c
+
+./compress/libzstd_la-zstd_ldm.lo: ./compress/zstd_ldm.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./compress/libzstd_la-zstd_ldm.lo -MD -MP -MF ./compress/$(DEPDIR)/libzstd_la-zstd_ldm.Tpo -c -o ./compress/libzstd_la-zstd_ldm.lo `test -f './compress/zstd_ldm.c' || echo '$(srcdir)/'`./compress/zstd_ldm.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./compress/$(DEPDIR)/libzstd_la-zstd_ldm.Tpo ./compress/$(DEPDIR)/libzstd_la-zstd_ldm.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./compress/zstd_ldm.c' object='./compress/libzstd_la-zstd_ldm.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./compress/libzstd_la-zstd_ldm.lo `test -f './compress/zstd_ldm.c' || echo '$(srcdir)/'`./compress/zstd_ldm.c
+
+./common/libzstd_la-xxhash.lo: ./common/xxhash.c  # Libtool-compile xxhash.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./common/libzstd_la-xxhash.lo -MD -MP -MF ./common/$(DEPDIR)/libzstd_la-xxhash.Tpo -c -o ./common/libzstd_la-xxhash.lo `test -f './common/xxhash.c' || echo '$(srcdir)/'`./common/xxhash.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./common/$(DEPDIR)/libzstd_la-xxhash.Tpo ./common/$(DEPDIR)/libzstd_la-xxhash.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./common/xxhash.c' object='./common/libzstd_la-xxhash.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./common/libzstd_la-xxhash.lo `test -f './common/xxhash.c' || echo '$(srcdir)/'`./common/xxhash.c
+
+./common/libzstd_la-fse_decompress.lo: ./common/fse_decompress.c  # Libtool-compile fse_decompress.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./common/libzstd_la-fse_decompress.lo -MD -MP -MF ./common/$(DEPDIR)/libzstd_la-fse_decompress.Tpo -c -o ./common/libzstd_la-fse_decompress.lo `test -f './common/fse_decompress.c' || echo '$(srcdir)/'`./common/fse_decompress.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./common/$(DEPDIR)/libzstd_la-fse_decompress.Tpo ./common/$(DEPDIR)/libzstd_la-fse_decompress.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./common/fse_decompress.c' object='./common/libzstd_la-fse_decompress.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./common/libzstd_la-fse_decompress.lo `test -f './common/fse_decompress.c' || echo '$(srcdir)/'`./common/fse_decompress.c
+
+./common/libzstd_la-pool.lo: ./common/pool.c  # Libtool-compile pool.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./common/libzstd_la-pool.lo -MD -MP -MF ./common/$(DEPDIR)/libzstd_la-pool.Tpo -c -o ./common/libzstd_la-pool.lo `test -f './common/pool.c' || echo '$(srcdir)/'`./common/pool.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./common/$(DEPDIR)/libzstd_la-pool.Tpo ./common/$(DEPDIR)/libzstd_la-pool.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./common/pool.c' object='./common/libzstd_la-pool.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./common/libzstd_la-pool.lo `test -f './common/pool.c' || echo '$(srcdir)/'`./common/pool.c
+
+./common/libzstd_la-zstd_common.lo: ./common/zstd_common.c  # Libtool-compile zstd_common.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./common/libzstd_la-zstd_common.lo -MD -MP -MF ./common/$(DEPDIR)/libzstd_la-zstd_common.Tpo -c -o ./common/libzstd_la-zstd_common.lo `test -f './common/zstd_common.c' || echo '$(srcdir)/'`./common/zstd_common.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./common/$(DEPDIR)/libzstd_la-zstd_common.Tpo ./common/$(DEPDIR)/libzstd_la-zstd_common.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./common/zstd_common.c' object='./common/libzstd_la-zstd_common.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./common/libzstd_la-zstd_common.lo `test -f './common/zstd_common.c' || echo '$(srcdir)/'`./common/zstd_common.c
+
+./common/libzstd_la-error_private.lo: ./common/error_private.c  # Libtool-compile error_private.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./common/libzstd_la-error_private.lo -MD -MP -MF ./common/$(DEPDIR)/libzstd_la-error_private.Tpo -c -o ./common/libzstd_la-error_private.lo `test -f './common/error_private.c' || echo '$(srcdir)/'`./common/error_private.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./common/$(DEPDIR)/libzstd_la-error_private.Tpo ./common/$(DEPDIR)/libzstd_la-error_private.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./common/error_private.c' object='./common/libzstd_la-error_private.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./common/libzstd_la-error_private.lo `test -f './common/error_private.c' || echo '$(srcdir)/'`./common/error_private.c
+
+./common/libzstd_la-debug.lo: ./common/debug.c  # Libtool-compile debug.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./common/libzstd_la-debug.lo -MD -MP -MF ./common/$(DEPDIR)/libzstd_la-debug.Tpo -c -o ./common/libzstd_la-debug.lo `test -f './common/debug.c' || echo '$(srcdir)/'`./common/debug.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./common/$(DEPDIR)/libzstd_la-debug.Tpo ./common/$(DEPDIR)/libzstd_la-debug.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./common/debug.c' object='./common/libzstd_la-debug.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./common/libzstd_la-debug.lo `test -f './common/debug.c' || echo '$(srcdir)/'`./common/debug.c
+
+./common/libzstd_la-threading.lo: ./common/threading.c  # Libtool-compile threading.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./common/libzstd_la-threading.lo -MD -MP -MF ./common/$(DEPDIR)/libzstd_la-threading.Tpo -c -o ./common/libzstd_la-threading.lo `test -f './common/threading.c' || echo '$(srcdir)/'`./common/threading.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./common/$(DEPDIR)/libzstd_la-threading.Tpo ./common/$(DEPDIR)/libzstd_la-threading.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./common/threading.c' object='./common/libzstd_la-threading.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./common/libzstd_la-threading.lo `test -f './common/threading.c' || echo '$(srcdir)/'`./common/threading.c
+
+./common/libzstd_la-entropy_common.lo: ./common/entropy_common.c  # Libtool-compile entropy_common.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./common/libzstd_la-entropy_common.lo -MD -MP -MF ./common/$(DEPDIR)/libzstd_la-entropy_common.Tpo -c -o ./common/libzstd_la-entropy_common.lo `test -f './common/entropy_common.c' || echo '$(srcdir)/'`./common/entropy_common.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./common/$(DEPDIR)/libzstd_la-entropy_common.Tpo ./common/$(DEPDIR)/libzstd_la-entropy_common.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./common/entropy_common.c' object='./common/libzstd_la-entropy_common.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./common/libzstd_la-entropy_common.lo `test -f './common/entropy_common.c' || echo '$(srcdir)/'`./common/entropy_common.c
+
+./deprecated/libzstd_la-zbuff_compress.lo: ./deprecated/zbuff_compress.c  # Libtool-compile zbuff_compress.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./deprecated/libzstd_la-zbuff_compress.lo -MD -MP -MF ./deprecated/$(DEPDIR)/libzstd_la-zbuff_compress.Tpo -c -o ./deprecated/libzstd_la-zbuff_compress.lo `test -f './deprecated/zbuff_compress.c' || echo '$(srcdir)/'`./deprecated/zbuff_compress.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./deprecated/$(DEPDIR)/libzstd_la-zbuff_compress.Tpo ./deprecated/$(DEPDIR)/libzstd_la-zbuff_compress.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./deprecated/zbuff_compress.c' object='./deprecated/libzstd_la-zbuff_compress.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./deprecated/libzstd_la-zbuff_compress.lo `test -f './deprecated/zbuff_compress.c' || echo '$(srcdir)/'`./deprecated/zbuff_compress.c
+
+./deprecated/libzstd_la-zbuff_decompress.lo: ./deprecated/zbuff_decompress.c  # Libtool-compile zbuff_decompress.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./deprecated/libzstd_la-zbuff_decompress.lo -MD -MP -MF ./deprecated/$(DEPDIR)/libzstd_la-zbuff_decompress.Tpo -c -o ./deprecated/libzstd_la-zbuff_decompress.lo `test -f './deprecated/zbuff_decompress.c' || echo '$(srcdir)/'`./deprecated/zbuff_decompress.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./deprecated/$(DEPDIR)/libzstd_la-zbuff_decompress.Tpo ./deprecated/$(DEPDIR)/libzstd_la-zbuff_decompress.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./deprecated/zbuff_decompress.c' object='./deprecated/libzstd_la-zbuff_decompress.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./deprecated/libzstd_la-zbuff_decompress.lo `test -f './deprecated/zbuff_decompress.c' || echo '$(srcdir)/'`./deprecated/zbuff_decompress.c
+
+./deprecated/libzstd_la-zbuff_common.lo: ./deprecated/zbuff_common.c  # Libtool-compile zbuff_common.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./deprecated/libzstd_la-zbuff_common.lo -MD -MP -MF ./deprecated/$(DEPDIR)/libzstd_la-zbuff_common.Tpo -c -o ./deprecated/libzstd_la-zbuff_common.lo `test -f './deprecated/zbuff_common.c' || echo '$(srcdir)/'`./deprecated/zbuff_common.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./deprecated/$(DEPDIR)/libzstd_la-zbuff_common.Tpo ./deprecated/$(DEPDIR)/libzstd_la-zbuff_common.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./deprecated/zbuff_common.c' object='./deprecated/libzstd_la-zbuff_common.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./deprecated/libzstd_la-zbuff_common.lo `test -f './deprecated/zbuff_common.c' || echo '$(srcdir)/'`./deprecated/zbuff_common.c
+
+./dictBuilder/libzstd_la-zdict.lo: ./dictBuilder/zdict.c  # Libtool-compile zdict.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./dictBuilder/libzstd_la-zdict.lo -MD -MP -MF ./dictBuilder/$(DEPDIR)/libzstd_la-zdict.Tpo -c -o ./dictBuilder/libzstd_la-zdict.lo `test -f './dictBuilder/zdict.c' || echo '$(srcdir)/'`./dictBuilder/zdict.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./dictBuilder/$(DEPDIR)/libzstd_la-zdict.Tpo ./dictBuilder/$(DEPDIR)/libzstd_la-zdict.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./dictBuilder/zdict.c' object='./dictBuilder/libzstd_la-zdict.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./dictBuilder/libzstd_la-zdict.lo `test -f './dictBuilder/zdict.c' || echo '$(srcdir)/'`./dictBuilder/zdict.c
+
+./dictBuilder/libzstd_la-divsufsort.lo: ./dictBuilder/divsufsort.c  # Libtool-compile divsufsort.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./dictBuilder/libzstd_la-divsufsort.lo -MD -MP -MF ./dictBuilder/$(DEPDIR)/libzstd_la-divsufsort.Tpo -c -o ./dictBuilder/libzstd_la-divsufsort.lo `test -f './dictBuilder/divsufsort.c' || echo '$(srcdir)/'`./dictBuilder/divsufsort.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./dictBuilder/$(DEPDIR)/libzstd_la-divsufsort.Tpo ./dictBuilder/$(DEPDIR)/libzstd_la-divsufsort.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./dictBuilder/divsufsort.c' object='./dictBuilder/libzstd_la-divsufsort.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./dictBuilder/libzstd_la-divsufsort.lo `test -f './dictBuilder/divsufsort.c' || echo '$(srcdir)/'`./dictBuilder/divsufsort.c
+
+./dictBuilder/libzstd_la-cover.lo: ./dictBuilder/cover.c  # Libtool-compile cover.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./dictBuilder/libzstd_la-cover.lo -MD -MP -MF ./dictBuilder/$(DEPDIR)/libzstd_la-cover.Tpo -c -o ./dictBuilder/libzstd_la-cover.lo `test -f './dictBuilder/cover.c' || echo '$(srcdir)/'`./dictBuilder/cover.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./dictBuilder/$(DEPDIR)/libzstd_la-cover.Tpo ./dictBuilder/$(DEPDIR)/libzstd_la-cover.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./dictBuilder/cover.c' object='./dictBuilder/libzstd_la-cover.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./dictBuilder/libzstd_la-cover.lo `test -f './dictBuilder/cover.c' || echo '$(srcdir)/'`./dictBuilder/cover.c
+
+./legacy/libzstd_la-zstd_v03.lo: ./legacy/zstd_v03.c  # Libtool-compile zstd_v03.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./legacy/libzstd_la-zstd_v03.lo -MD -MP -MF ./legacy/$(DEPDIR)/libzstd_la-zstd_v03.Tpo -c -o ./legacy/libzstd_la-zstd_v03.lo `test -f './legacy/zstd_v03.c' || echo '$(srcdir)/'`./legacy/zstd_v03.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./legacy/$(DEPDIR)/libzstd_la-zstd_v03.Tpo ./legacy/$(DEPDIR)/libzstd_la-zstd_v03.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./legacy/zstd_v03.c' object='./legacy/libzstd_la-zstd_v03.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./legacy/libzstd_la-zstd_v03.lo `test -f './legacy/zstd_v03.c' || echo '$(srcdir)/'`./legacy/zstd_v03.c
+
+./legacy/libzstd_la-zstd_v07.lo: ./legacy/zstd_v07.c  # Libtool-compile zstd_v07.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./legacy/libzstd_la-zstd_v07.lo -MD -MP -MF ./legacy/$(DEPDIR)/libzstd_la-zstd_v07.Tpo -c -o ./legacy/libzstd_la-zstd_v07.lo `test -f './legacy/zstd_v07.c' || echo '$(srcdir)/'`./legacy/zstd_v07.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./legacy/$(DEPDIR)/libzstd_la-zstd_v07.Tpo ./legacy/$(DEPDIR)/libzstd_la-zstd_v07.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./legacy/zstd_v07.c' object='./legacy/libzstd_la-zstd_v07.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./legacy/libzstd_la-zstd_v07.lo `test -f './legacy/zstd_v07.c' || echo '$(srcdir)/'`./legacy/zstd_v07.c
+
+./legacy/libzstd_la-zstd_v06.lo: ./legacy/zstd_v06.c  # Libtool-compile zstd_v06.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./legacy/libzstd_la-zstd_v06.lo -MD -MP -MF ./legacy/$(DEPDIR)/libzstd_la-zstd_v06.Tpo -c -o ./legacy/libzstd_la-zstd_v06.lo `test -f './legacy/zstd_v06.c' || echo '$(srcdir)/'`./legacy/zstd_v06.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./legacy/$(DEPDIR)/libzstd_la-zstd_v06.Tpo ./legacy/$(DEPDIR)/libzstd_la-zstd_v06.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./legacy/zstd_v06.c' object='./legacy/libzstd_la-zstd_v06.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./legacy/libzstd_la-zstd_v06.lo `test -f './legacy/zstd_v06.c' || echo '$(srcdir)/'`./legacy/zstd_v06.c
+
+./legacy/libzstd_la-zstd_v01.lo: ./legacy/zstd_v01.c  # Libtool-compile zstd_v01.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./legacy/libzstd_la-zstd_v01.lo -MD -MP -MF ./legacy/$(DEPDIR)/libzstd_la-zstd_v01.Tpo -c -o ./legacy/libzstd_la-zstd_v01.lo `test -f './legacy/zstd_v01.c' || echo '$(srcdir)/'`./legacy/zstd_v01.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./legacy/$(DEPDIR)/libzstd_la-zstd_v01.Tpo ./legacy/$(DEPDIR)/libzstd_la-zstd_v01.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./legacy/zstd_v01.c' object='./legacy/libzstd_la-zstd_v01.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./legacy/libzstd_la-zstd_v01.lo `test -f './legacy/zstd_v01.c' || echo '$(srcdir)/'`./legacy/zstd_v01.c
+
+./legacy/libzstd_la-zstd_v02.lo: ./legacy/zstd_v02.c  # Libtool-compile zstd_v02.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./legacy/libzstd_la-zstd_v02.lo -MD -MP -MF ./legacy/$(DEPDIR)/libzstd_la-zstd_v02.Tpo -c -o ./legacy/libzstd_la-zstd_v02.lo `test -f './legacy/zstd_v02.c' || echo '$(srcdir)/'`./legacy/zstd_v02.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./legacy/$(DEPDIR)/libzstd_la-zstd_v02.Tpo ./legacy/$(DEPDIR)/libzstd_la-zstd_v02.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./legacy/zstd_v02.c' object='./legacy/libzstd_la-zstd_v02.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./legacy/libzstd_la-zstd_v02.lo `test -f './legacy/zstd_v02.c' || echo '$(srcdir)/'`./legacy/zstd_v02.c
+
+./legacy/libzstd_la-zstd_v04.lo: ./legacy/zstd_v04.c  # Libtool-compile zstd_v04.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./legacy/libzstd_la-zstd_v04.lo -MD -MP -MF ./legacy/$(DEPDIR)/libzstd_la-zstd_v04.Tpo -c -o ./legacy/libzstd_la-zstd_v04.lo `test -f './legacy/zstd_v04.c' || echo '$(srcdir)/'`./legacy/zstd_v04.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./legacy/$(DEPDIR)/libzstd_la-zstd_v04.Tpo ./legacy/$(DEPDIR)/libzstd_la-zstd_v04.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./legacy/zstd_v04.c' object='./legacy/libzstd_la-zstd_v04.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./legacy/libzstd_la-zstd_v04.lo `test -f './legacy/zstd_v04.c' || echo '$(srcdir)/'`./legacy/zstd_v04.c
+
+./legacy/libzstd_la-zstd_v05.lo: ./legacy/zstd_v05.c  # Libtool-compile zstd_v05.c into its .lo; the fastdep branch writes deps to a .Tpo that is renamed to .Plo, the non-fastdep branch is prefixed by depcomp when AMDEP is on.
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -MT ./legacy/libzstd_la-zstd_v05.lo -MD -MP -MF ./legacy/$(DEPDIR)/libzstd_la-zstd_v05.Tpo -c -o ./legacy/libzstd_la-zstd_v05.lo `test -f './legacy/zstd_v05.c' || echo '$(srcdir)/'`./legacy/zstd_v05.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) ./legacy/$(DEPDIR)/libzstd_la-zstd_v05.Tpo ./legacy/$(DEPDIR)/libzstd_la-zstd_v05.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='./legacy/zstd_v05.c' object='./legacy/libzstd_la-zstd_v05.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libzstd_la_CFLAGS) $(CFLAGS) -c -o ./legacy/libzstd_la-zstd_v05.lo `test -f './legacy/zstd_v05.c' || echo '$(srcdir)/'`./legacy/zstd_v05.c
+
+mostlyclean-libtool:  # Delete *.lo files in the current directory; the leading '-' tells make to ignore an rm failure.
+	-rm -f *.lo
+
+clean-libtool:  # Delete the .libs and _libs directories here and in each of the six source subdirectories.
+	-rm -rf .libs _libs
+	-rm -rf ./common/.libs ./common/_libs
+	-rm -rf ./compress/.libs ./compress/_libs
+	-rm -rf ./decompress/.libs ./decompress/_libs
+	-rm -rf ./deprecated/.libs ./deprecated/_libs
+	-rm -rf ./dictBuilder/.libs ./dictBuilder/_libs
+	-rm -rf ./legacy/.libs ./legacy/_libs
+install-includeHEADERS: $(include_HEADERS)  # Install every include_HEADERS file into DESTDIR+includedir (created first); does nothing when includedir is empty; files not found locally are taken from srcdir.
+	@$(NORMAL_INSTALL)
+	@list='$(include_HEADERS)'; test -n "$(includedir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(includedir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(includedir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(includedir)'"; \
+	  $(INSTALL_HEADER) $$files "$(DESTDIR)$(includedir)" || exit $$?; \
+	done
+
+uninstall-includeHEADERS:  # Reduce include_HEADERS to basenames and pass them, with the install dir, to am__uninstall_files_from_dir (defined outside this section; presumably it removes them - confirm).
+	@$(NORMAL_UNINSTALL)
+	@list='$(include_HEADERS)'; test -n "$(includedir)" || list=; \
+	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
+	dir='$(DESTDIR)$(includedir)'; $(am__uninstall_files_from_dir)
+
+ID: $(am__tagged_files)  # Run mkid to write the file ID from the shell variable 'unique' (presumably set by am__define_uniq_tagged_files - confirm).
+	$(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-am  # Alias for tags-am.
+TAGS: tags  # Upper-case alias for tags.
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)  # Run ETAGS over the unique tagged files (plus any positional arguments); skipped when ETAGS_ARGS, the arguments and the file list are all empty.
+	set x; \
+	here=`pwd`; \
+	$(am__define_uniq_tagged_files); \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: ctags-am  # Alias for ctags-am.
+
+CTAGS: ctags  # Upper-case alias for ctags.
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)  # Run CTAGS over the unique tagged files unless both CTAGS_ARGS and the file list are empty.
+	$(am__define_uniq_tagged_files); \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:  # Resolve top_builddir to an absolute path, change into top_srcdir and run 'gtags -i' with that path as its argument.
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-am  # Alias for cscopelist-am.
+
+cscopelist-am: $(am__tagged_files)  # Append one path per tagged file to top_builddir/cscope.files: subdir-relative when the file exists here, otherwise under the source directory.
+	list='$(am__tagged_files)'; \
+	case "$(srcdir)" in \
+	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+	  *) sdir=$(subdir)/$(srcdir) ;; \
+	esac; \
+	for i in $$list; do \
+	  if test -f "$$i"; then \
+	    echo "$(subdir)/$$i"; \
+	  else \
+	    echo "$$sdir/$$i"; \
+	  fi; \
+	done >> $(top_builddir)/cscope.files
+
+distclean-tags:  # Delete the tag and index files named in the recipe (TAGS, ID, GTAGS, GRTAGS, GSYMS, GPATH, tags).
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(BUILT_SOURCES)  # After BUILT_SOURCES are up to date, re-invoke make for distdir-am.
+	$(MAKE) $(AM_MAKEFLAGS) distdir-am
+
+distdir-am: $(DISTFILES)  # Copy every DISTFILES entry into distdir, creating subdirectories first; directories are copied recursively (the srcdir copy first, then the build-dir copy over it), and plain files are copied only if not already present.
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am  # No recipe of its own; only requires all-am.
+check: check-am  # Alias for check-am.
+all-am: Makefile $(LTLIBRARIES) $(HEADERS)  # Default build set: the Makefile itself, the libtool libraries and the headers.
+installdirs:  # Create the libdir and includedir install directories under DESTDIR.
+	for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)"; do \
+	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+	done
+install: install-am  # Alias for install-am.
+install-exec: install-exec-am  # Alias for install-exec-am.
+install-data: install-data-am  # Alias for install-data-am.
+uninstall: uninstall-am  # Alias for uninstall-am.
+
+install-am: all-am  # After building all-am, re-invoke make for install-exec-am and install-data-am.
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am  # Alias for installcheck-am.
+install-strip:  # Re-run 'install' with INSTALL_PROGRAM replaced by INSTALL_STRIP_PROGRAM and INSTALL_STRIP_FLAG=-s; when STRIP is non-empty it is also passed on as STRIPPROG.
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:  # Empty rule: no recipe and no prerequisites.
+
+clean-generic:  # Empty rule: no recipe and no prerequisites.
+
+distclean-generic:  # Remove CONFIG_CLEAN_FILES, CONFIG_CLEAN_VPATH_FILES (only when srcdir is not '.'), and the dirstamp files of each source subdirectory and its DEPDIR.
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+	-rm -f common/$(DEPDIR)/$(am__dirstamp)
+	-rm -f common/$(am__dirstamp)
+	-rm -f compress/$(DEPDIR)/$(am__dirstamp)
+	-rm -f compress/$(am__dirstamp)
+	-rm -f decompress/$(DEPDIR)/$(am__dirstamp)
+	-rm -f decompress/$(am__dirstamp)
+	-rm -f deprecated/$(DEPDIR)/$(am__dirstamp)
+	-rm -f deprecated/$(am__dirstamp)
+	-rm -f dictBuilder/$(DEPDIR)/$(am__dirstamp)
+	-rm -f dictBuilder/$(am__dirstamp)
+	-rm -f legacy/$(DEPDIR)/$(am__dirstamp)
+	-rm -f legacy/$(am__dirstamp)
+
+maintainer-clean-generic:  # Deletes nothing itself; only prints a two-line warning addressed to maintainers.
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am  # Alias for clean-am.
+
+clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \
+	mostlyclean-am  # No recipe; clean-am is just the union of the listed clean targets.
+
+distclean: distclean-am  # After distclean-am, delete each per-object .Plo dependency file and finally the generated Makefile; every rm is '-'-prefixed so a failure is ignored.
+	-rm -f ./common/$(DEPDIR)/libzstd_la-debug.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-entropy_common.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-error_private.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-fse_decompress.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-pool.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-threading.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-xxhash.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-zstd_common.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-fse_compress.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-hist.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-huf_compress.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstd_compress.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstd_double_fast.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstd_fast.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstd_lazy.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstd_ldm.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstd_opt.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstdmt_compress.Plo
+	-rm -f ./decompress/$(DEPDIR)/libzstd_la-huf_decompress.Plo
+	-rm -f ./decompress/$(DEPDIR)/libzstd_la-zstd_decompress.Plo
+	-rm -f ./deprecated/$(DEPDIR)/libzstd_la-zbuff_common.Plo
+	-rm -f ./deprecated/$(DEPDIR)/libzstd_la-zbuff_compress.Plo
+	-rm -f ./deprecated/$(DEPDIR)/libzstd_la-zbuff_decompress.Plo
+	-rm -f ./dictBuilder/$(DEPDIR)/libzstd_la-cover.Plo
+	-rm -f ./dictBuilder/$(DEPDIR)/libzstd_la-divsufsort.Plo
+	-rm -f ./dictBuilder/$(DEPDIR)/libzstd_la-zdict.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v01.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v02.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v03.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v04.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v05.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v06.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v07.Plo
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am: install-includeHEADERS
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am: install-libLTLIBRARIES
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+		-rm -f ./common/$(DEPDIR)/libzstd_la-debug.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-entropy_common.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-error_private.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-fse_decompress.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-pool.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-threading.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-xxhash.Plo
+	-rm -f ./common/$(DEPDIR)/libzstd_la-zstd_common.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-fse_compress.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-hist.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-huf_compress.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstd_compress.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstd_double_fast.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstd_fast.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstd_lazy.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstd_ldm.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstd_opt.Plo
+	-rm -f ./compress/$(DEPDIR)/libzstd_la-zstdmt_compress.Plo
+	-rm -f ./decompress/$(DEPDIR)/libzstd_la-huf_decompress.Plo
+	-rm -f ./decompress/$(DEPDIR)/libzstd_la-zstd_decompress.Plo
+	-rm -f ./deprecated/$(DEPDIR)/libzstd_la-zbuff_common.Plo
+	-rm -f ./deprecated/$(DEPDIR)/libzstd_la-zbuff_compress.Plo
+	-rm -f ./deprecated/$(DEPDIR)/libzstd_la-zbuff_decompress.Plo
+	-rm -f ./dictBuilder/$(DEPDIR)/libzstd_la-cover.Plo
+	-rm -f ./dictBuilder/$(DEPDIR)/libzstd_la-divsufsort.Plo
+	-rm -f ./dictBuilder/$(DEPDIR)/libzstd_la-zdict.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v01.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v02.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v03.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v04.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v05.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v06.Plo
+	-rm -f ./legacy/$(DEPDIR)/libzstd_la-zstd_v07.Plo
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-includeHEADERS uninstall-libLTLIBRARIES
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \
+	clean-generic clean-libLTLIBRARIES clean-libtool cscopelist-am \
+	ctags ctags-am distclean distclean-compile distclean-generic \
+	distclean-libtool distclean-tags distdir dvi dvi-am html \
+	html-am info info-am install install-am install-data \
+	install-data-am install-dvi install-dvi-am install-exec \
+	install-exec-am install-html install-html-am \
+	install-includeHEADERS install-info install-info-am \
+	install-libLTLIBRARIES install-man install-pdf install-pdf-am \
+	install-ps install-ps-am install-strip installcheck \
+	installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags tags-am uninstall uninstall-am uninstall-includeHEADERS \
+	uninstall-libLTLIBRARIES
+
+.PRECIOUS: Makefile
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/deps/SZ/zstd/README.md b/deps/SZ/zstd/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..75debe872f93fb19aca0f4367f88d76fbf7ccaf8
--- /dev/null
+++ b/deps/SZ/zstd/README.md
@@ -0,0 +1,119 @@
+Zstandard library files
+================================
+
+The __lib__ directory is split into several sub-directories,
+in order to make it easier to select or exclude features.
+
+
+#### Building
+
+`Makefile` script is provided, supporting all standard [Makefile conventions](https://www.gnu.org/prep/standards/html_node/Makefile-Conventions.html#Makefile-Conventions),
+including command variables, staged install, directory variables and standard targets.
+- `make` : generates both static and dynamic libraries
+- `make install` : install libraries in default system directories
+
+`libzstd` default scope includes compression, decompression, dictionary building,
+and decoding support for legacy formats >= v0.4.0.
+
+
+#### API
+
+Zstandard's stable API is exposed within [lib/zstd.h](zstd.h).
+
+
+#### Advanced API
+
+Optional advanced features are exposed via :
+
+- `lib/common/zstd_errors.h` : translates `size_t` function results
+                              into a `ZSTD_ErrorCode`, for accurate error handling.
+- `ZSTD_STATIC_LINKING_ONLY` : if this macro is defined _before_ including `zstd.h`,
+                          it unlocks access to advanced experimental API,
+                          exposed in second part of `zstd.h`.
+                          These APIs are not "stable", their definition may change in the future.
+                          As a consequence, it shall ___never be used with dynamic library___ !
+                          Only static linking is allowed.
+
+
+#### Modular build
+
+It's possible to compile only a limited set of features.
+
+- Directory `lib/common` is always required, for all variants.
+- Compression source code lies in `lib/compress`
+- Decompression source code lies in `lib/decompress`
+- It's possible to include only `compress` or only `decompress`, they don't depend on each other.
+- `lib/dictBuilder` : makes it possible to generate dictionaries from a set of samples.
+        The API is exposed in `lib/dictBuilder/zdict.h`.
+        This module depends on both `lib/common` and `lib/compress` .
+- `lib/legacy` : source code to decompress legacy zstd formats, starting from `v0.1.0`.
+        This module depends on `lib/common` and `lib/decompress`.
+        To enable this feature, it's required to define `ZSTD_LEGACY_SUPPORT` during compilation.
+        Typically, with `gcc`, add argument `-DZSTD_LEGACY_SUPPORT=1`.
+        Using a higher number limits the versions supported.
+        For example, `ZSTD_LEGACY_SUPPORT=2` means : "support legacy formats >= v0.2.0".
+        `ZSTD_LEGACY_SUPPORT=3` means : "support legacy formats >= v0.3.0", and so on.
+        Starting v0.8.0, all versions of `zstd` produce frames compliant with specification.
+        As a consequence, `ZSTD_LEGACY_SUPPORT=8` (or more) doesn't trigger legacy support.
+        Also, `ZSTD_LEGACY_SUPPORT=0` means "do __not__ support legacy formats".
+        Once enabled, this capability is transparently triggered within decompression functions.
+        It's also possible to invoke directly legacy API, as exposed in `lib/legacy/zstd_legacy.h`.
+        Each version also provides an additional dedicated set of advanced API.
+        For example, advanced API for version `v0.4` is exposed in `lib/legacy/zstd_v04.h` .
+        Note : `lib/legacy` only supports _decoding_ legacy formats.
+- Similarly, you can define `ZSTD_LIB_COMPRESSION`, `ZSTD_LIB_DECOMPRESSION`, `ZSTD_LIB_DICTBUILDER`,
+        and `ZSTD_LIB_DEPRECATED` as 0 to forgo compilation of the corresponding features. This will 
+        also disable compilation of all dependencies (eg. `ZSTD_LIB_COMPRESSION=0` will also disable
+        dictBuilder). 
+
+
+#### Multithreading support
+
+Multithreading is disabled by default when building with `make`.
+Enabling multithreading requires 2 conditions :
+- set macro `ZSTD_MULTITHREAD`
+- on POSIX systems : compile with pthread (`-pthread` compilation flag for `gcc`)
+
+Both conditions are automatically triggered by invoking `make lib-mt` target.
+Note that, when linking a POSIX program with a multithreaded version of `libzstd`,
+it's necessary to trigger `-pthread` flag during link stage.
+
+Multithreading capabilities are exposed
+via [advanced API `ZSTD_compress_generic()` defined in `lib/zstd.h`](https://github.com/facebook/zstd/blob/dev/lib/zstd.h#L919).
+This API is still considered experimental,
+but is expected to become "stable" at some point in the future.
+
+
+#### Windows : using MinGW+MSYS to create DLL
+
+DLL can be created using MinGW+MSYS with the `make libzstd` command.
+This command creates `dll\libzstd.dll` and the import library `dll\libzstd.lib`.
+The import library is only required with Visual C++.
+The header file `zstd.h` and the dynamic library `dll\libzstd.dll` are required to
+compile a project using gcc/MinGW.
+The dynamic library has to be added to linking options.
+It means that if a project that uses ZSTD consists of a single `test-dll.c`
+file it should be linked with `dll\libzstd.dll`. For example:
+```
+    gcc $(CFLAGS) -Iinclude/ test-dll.c -o test-dll dll\libzstd.dll
+```
+The compiled executable will require ZSTD DLL which is available at `dll\libzstd.dll`.
+
+
+#### Deprecated API
+
+Obsolete API on their way out are stored in directory `lib/deprecated`.
+At this stage, it contains older streaming prototypes, in `lib/deprecated/zbuff.h`.
+These prototypes will be removed in some future version.
+Consider migrating code towards supported streaming API exposed in `zstd.h`.
+
+
+#### Miscellaneous
+
+The other files are not source code. They are :
+
+ - `LICENSE` : contains the BSD license text
+ - `Makefile` : `make` script to build and install zstd library (static and dynamic)
+ - `BUCK` : support for `buck` build system (https://buckbuild.com/)
+ - `libzstd.pc.in` : for `pkg-config` (used in `make install`)
+ - `README.md` : this file
diff --git a/deps/SZ/zstd/common/bitstream.h b/deps/SZ/zstd/common/bitstream.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f91460c5eb0a0e319c8f0b91e4f8e041ff59da5
--- /dev/null
+++ b/deps/SZ/zstd/common/bitstream.h
@@ -0,0 +1,458 @@
+/* ******************************************************************
+   bitstream
+   Part of FSE library
+   Copyright (C) 2013-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*
+*  This API consists of small unitary functions, which must be inlined for best performance.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include "mem.h"            /* unaligned access routines */
+#include "debug.h"          /* assert(), DEBUGLOG(), RAWLOG() */
+#include "error_private.h"  /* error codes and messages */
+
+
+/*=========================================
+*  Target specific
+=========================================*/
+#if defined(__BMI__) && defined(__GNUC__)
+#  include <immintrin.h>   /* support for bextr (experimental) */
+#endif
+
+#define STREAM_ACCUMULATOR_MIN_32  25
+#define STREAM_ACCUMULATOR_MIN_64  57
+#define STREAM_ACCUMULATOR_MIN    ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
+
+
+/*-******************************************
+*  bitStream encoding API (write forward)
+********************************************/
+/* bitStream can mix input from multiple sources.
+ * A critical property of these streams is that they encode and decode in **reverse** direction.
+ * So the first bit sequence you add will be the last to be read, like a LIFO stack.
+ */
+typedef struct {
+    size_t bitContainer;   /* local accumulator : bits are OR-ed in here before being flushed to memory */
+    unsigned bitPos;       /* number of bits currently held in bitContainer */
+    char*  startPtr;       /* first byte of the destination buffer */
+    char*  ptr;            /* next write position within the destination buffer */
+    char*  endPtr;         /* startPtr + dstCapacity - sizeof(bitContainer) : BIT_flushBits() clamps ptr to it */
+} BIT_CStream_t;
+
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity);
+MEM_STATIC void   BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+MEM_STATIC void   BIT_flushBits(BIT_CStream_t* bitC);
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
+
+/* Start with initCStream, providing the size of buffer to write into.
+*  bitStream will never write outside of this buffer.
+*  `dstCapacity` must be > sizeof(bitC->bitContainer), otherwise @return will be an error code.
+*
+*  bits are first added to a local register.
+*  Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
+*  Writing data into memory is an explicit operation, performed by the flushBits function.
+*  Hence keep track of how many bits are potentially stored in the local register, to avoid register overflow.
+*  After a flushBits, a maximum of 7 bits might still be stored into local register.
+*
+*  Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
+*
+*  Last operation is to close the bitStream.
+*  The function returns the final size of CStream in bytes.
+*  If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable)
+*/
+
+
+/*-********************************************
+*  bitStream decoding API (read backward)
+**********************************************/
+typedef struct {
+    size_t   bitContainer;   /* local register : holds the chunk of the bitStream most recently loaded from memory */
+    unsigned bitsConsumed;   /* number of bits of bitContainer already read (or skipped) */
+    const char* ptr;         /* address bitContainer was last loaded from; moves toward `start` on reload */
+    const char* start;       /* first byte of the source buffer */
+    const char* limitPtr;    /* start + sizeof(bitContainer) : while ptr >= limitPtr, BIT_reloadDStream() rewinds without a bounds check */
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,
+               BIT_DStream_endOfBuffer = 1,
+               BIT_DStream_completed = 2,
+               BIT_DStream_overflow = 3 } BIT_DStream_status;  /* result of BIT_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+/* Start by invoking BIT_initDStream().
+*  A chunk of the bitStream is then stored into a local register.
+*  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+*  You can then retrieve bitFields stored into the local register, **in reverse order**.
+*  Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
+*  A reload guarantees a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
+*  Otherwise, it can be less than that, so proceed accordingly.
+*  Checking if DStream has reached its end can be performed with BIT_endOfDStream().
+*/
+
+
+/*-****************************************
+*  unsafe API
+******************************************/
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */
+
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
+/* unsafe version; does not check buffer overflow */
+
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/*-**************************************************************
+*  Internal functions
+****************************************************************/
+MEM_STATIC unsigned BIT_highbit32 (U32 val)   /* @return : index of the highest set bit of `val` (bit 0 == lowest) */
+{
+    assert(val != 0);   /* callers must pass a non-zero value : __builtin_clz(0) is undefined */
+    {
+#   if defined(_MSC_VER)   /* Visual */
+        unsigned long r=0;
+        _BitScanReverse ( &r, val );
+        return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+        return 31 - __builtin_clz (val);
+#   else   /* Software version */
+        static const unsigned DeBruijnClz[32] = { 0,  9,  1, 10, 13, 21,  2, 29,
+                                                 11, 14, 16, 18, 22, 25,  3, 30,
+                                                  8, 12, 20, 28, 15, 17, 24,  7,
+                                                 19, 27, 23,  6, 26,  5,  4, 31 };
+        U32 v = val;   /* the five steps below copy the highest set bit into every lower position */
+        v |= v >> 1;
+        v |= v >> 2;
+        v |= v >> 4;
+        v |= v >> 8;
+        v |= v >> 16;
+        return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];   /* top 5 bits of the 32-bit product select the table entry */
+#   endif
+    }
+}
+
+/*=====    Local Constants   =====*/
+static const unsigned BIT_mask[] = {   /* BIT_mask[n] == (1<<n)-1, i.e. the n lowest bits set */
+    0,          1,         3,         7,         0xF,       0x1F,
+    0x3F,       0x7F,      0xFF,      0x1FF,     0x3FF,     0x7FF,
+    0xFFF,      0x1FFF,    0x3FFF,    0x7FFF,    0xFFFF,    0x1FFFF,
+    0x3FFFF,    0x7FFFF,   0xFFFFF,   0x1FFFFF,  0x3FFFFF,  0x7FFFFF,
+    0xFFFFFF,   0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF,
+    0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */
+#define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0]))
+
+/*-**************************************************************
+*  bitStream encoding
+****************************************************************/
+/*! BIT_initCStream() :
+ *  `dstCapacity` must be > sizeof(size_t)
+ *  @return : 0 if success,
+ *            otherwise an error code (can be tested using ERR_isError()) */
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
+                                  void* startPtr, size_t dstCapacity)
+{
+    char* const dstBegin = (char*)startPtr;
+    /* empty accumulator, write cursor at the start of the buffer */
+    bitC->bitContainer = 0;
+    bitC->bitPos = 0;
+    bitC->startPtr = bitC->ptr = dstBegin;
+    bitC->endPtr = dstBegin + dstCapacity - sizeof(bitC->bitContainer);
+    return (dstCapacity <= sizeof(bitC->bitContainer)) ? ERROR(dstSize_tooSmall) : 0;
+}
+
+/*! BIT_addBits() :
+ *  can add up to 31 bits into `bitC`.
+ *  Note : does not check for register overflow ! */
+MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits)
+{
+    MEM_STATIC_ASSERT(BIT_MASK_SIZE == 32);
+    assert(nbBits < BIT_MASK_SIZE);
+    assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    {   size_t const keptBits = value & BIT_mask[nbBits];   /* discard everything above nbBits */
+        bitC->bitContainer |= keptBits << bitC->bitPos;
+        bitC->bitPos += nbBits; }
+}
+
+/*! BIT_addBitsFast() :
+ *  works only if `value` is _clean_,
+ *  meaning all high bits above nbBits are 0 */
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits)
+{
+    /* caller contract : `value` fits in nbBits, and the accumulator has room for them */
+    assert((value>>nbBits) == 0);
+    assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    bitC->bitContainer = bitC->bitContainer | (value << bitC->bitPos);
+    bitC->bitPos = bitC->bitPos + nbBits;
+}
+
+/*! BIT_flushBitsFast() :
+ *  assumption : bitContainer has not overflowed
+ *  unsafe version; does not check buffer overflow */
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC)
+{
+    size_t const wholeBytes = bitC->bitPos / 8;   /* complete bytes held by the accumulator */
+    assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+    bitC->ptr += wholeBytes;
+    assert(bitC->ptr <= bitC->endPtr);
+    bitC->bitContainer >>= wholeBytes * 8;   /* keep only the bits not yet committed */
+    bitC->bitPos %= 8;
+}
+
+/*! BIT_flushBits() :
+ *  assumption : bitContainer has not overflowed
+ *  safe version; check for buffer overflow, and prevents it.
+ *  note : does not signal buffer overflow.
+ *  overflow will be revealed later on using BIT_closeCStream() */
+MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC)
+{
+    size_t const wholeBytes = bitC->bitPos / 8;   /* complete bytes held by the accumulator */
+    assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+    /* advance the write cursor, but never beyond endPtr (overflow is reported by BIT_closeCStream()) */
+    bitC->ptr = (bitC->ptr + wholeBytes > bitC->endPtr) ? bitC->endPtr : bitC->ptr + wholeBytes;
+    bitC->bitContainer >>= wholeBytes * 8;
+    bitC->bitPos %= 8;
+}
+
+/*! BIT_closeCStream() :
+ *  @return : size of CStream, in bytes,
+ *            or 0 if it could not fit into dstBuffer */
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC)
+{
+    BIT_addBitsFast(bitC, 1, 1);   /* terminating 1-bit : marks where the stream ends */
+    BIT_flushBits(bitC);
+    if (bitC->ptr < bitC->endPtr) return (size_t)(bitC->ptr - bitC->startPtr) + (bitC->bitPos != 0);
+    return 0;   /* write cursor reached endPtr : report as not storable */
+}
+
+
+/*-********************************************************
+*  bitStream decoding
+**********************************************************/
+/*! BIT_initDStream() :
+ *  Initialize a BIT_DStream_t.
+ * `bitD` : a pointer to an already allocated BIT_DStream_t structure.
+ * `srcSize` must be the *exact* size of the bitStream, in bytes.
+ * @return : size of stream (== srcSize), or an errorCode if a problem is detected
+ */
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }   /* empty input : zero the whole state and report an error */
+
+    bitD->start = (const char*)srcBuffer;
+    bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer);   /* BIT_reloadDStream() rewinds without a bounds check while ptr >= limitPtr */
+
+    if (srcSize >=  sizeof(bitD->bitContainer)) {  /* normal case */
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;  /* ensures bitsConsumed is always set */
+          if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+    } else {   /* input shorter than the container : assemble it byte by byte */
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);   /* byte 0 occupies the lowest 8 bits */
+        switch(srcSize)
+        {
+        case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
+                /* fall-through */
+
+        case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
+                /* fall-through */
+
+        case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
+                /* fall-through */
+
+        case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24;
+                /* fall-through */
+
+        case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16;
+                /* fall-through */
+
+        case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) <<  8;
+                /* fall-through */
+
+        default: break;
+        }
+        {   BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+            bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+            if (lastByte == 0) return ERROR(corruption_detected);  /* endMark not present */
+        }
+        bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;   /* count the container's unfilled high bytes as already consumed */
+    }
+
+    return srcSize;
+}
+
+MEM_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start)   /* @return : bitContainer with its `start` lowest bits shifted out */
+{
+    return bitContainer >> start;
+}
+
+MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)   /* @return : the nbBits-wide field of bitContainer beginning at bit `start` */
+{
+#if defined(__BMI__) && defined(__GNUC__) && __GNUC__*1000+__GNUC_MINOR__ >= 4008  /* experimental */
+#  if defined(__x86_64__)
+    if (sizeof(bitContainer)==8)
+        return _bextr_u64(bitContainer, start, nbBits);
+    else
+#  endif
+        return _bextr_u32(bitContainer, start, nbBits);
+#else   /* portable path : shift, then mask through BIT_mask (so nbBits must stay below BIT_MASK_SIZE) */
+    assert(nbBits < BIT_MASK_SIZE);
+    return (bitContainer >> start) & BIT_mask[nbBits];
+#endif
+}
+
+MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)   /* @return : the nbBits lowest bits of bitContainer */
+{
+    assert(nbBits < BIT_MASK_SIZE);   /* BIT_mask only has entries for 0..31 bits */
+    return bitContainer & BIT_mask[nbBits];
+}
+
+/*! BIT_lookBits() :
+ *  Provides next n bits from local register.
+ *  local register is not modified.
+ *  On 32-bits, maxNbBits==24.
+ *  On 64-bits, maxNbBits==56.
+ * @return : value extracted */
+MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits)
+{
+#if defined(__BMI__) && defined(__GNUC__)   /* experimental; fails if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8 */
+    return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits);
+#else
+    U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;   /* register width in bits, minus 1 */
+    return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask);   /* right shift split in two so that nbBits==0 never shifts by the full register width */
+#endif
+}
+
+/*! BIT_lookBitsFast() :
+ *  unsafe version; only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
+{
+    U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;   /* register width in bits, minus 1 */
+    assert(nbBits >= 1);   /* with nbBits==0 the masked right-shift count below becomes 0, returning the whole register */
+    return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);   /* drop consumed bits off the top, then bring the next nbBits down to bit 0 */
+}
+
+MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)   /* marks nbBits as consumed; the register itself is left untouched */
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+/*! BIT_readBits() :
+ *  Read (consume) next n bits from local register and update.
+ *  Take care not to read more bits than the local register currently holds.
+ * @return : extracted value. */
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t const field = BIT_lookBits(bitD, nbBits);   /* peek first ... */
+    bitD->bitsConsumed += nbBits;                      /* ... then consume (same effect as BIT_skipBits()) */
+    return field;
+}
+
+/*! BIT_readBitsFast() :
+ *  unsafe version; works only if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t const field = BIT_lookBitsFast(bitD, nbBits);   /* peek, without the nbBits==0 safeguard */
+    assert(nbBits >= 1);
+    bitD->bitsConsumed += nbBits;   /* consume what was just peeked (same effect as BIT_skipBits()) */
+    return field;
+}
+
+/*! BIT_reloadDStream() :
+ *  Refill `bitD` from buffer previously set in BIT_initDStream() .
+ *  This function is safe, it guarantees it will not read beyond src buffer.
+ * @return : status of `BIT_DStream_t` internal register.
+ *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* overflow detected, like end of stream */
+        return BIT_DStream_overflow;
+
+    if (bitD->ptr >= bitD->limitPtr) {   /* at least one full container of input lies below ptr : rewind by the whole bytes consumed */
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BIT_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start) {   /* already at the buffer start : nothing more can be loaded */
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+        return BIT_DStream_completed;
+    }
+    /* start < ptr < limitPtr */
+    {   U32 nbBytes = bitD->bitsConsumed >> 3;
+        BIT_DStream_status result = BIT_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start) {   /* cannot rewind past start : clamp and flag endOfBuffer */
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BIT_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */
+        return result;
+    }
+}
+
+/*! BIT_endOfDStream() :
+ * @return : 1 if DStream has _exactly_ reached its end (all bits consumed).
+ */
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));   /* both required : ptr back at start AND every bit of the register consumed */
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
diff --git a/deps/SZ/zstd/common/compiler.h b/deps/SZ/zstd/common/compiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..366ed2b4b4fe00595a945bad50a2f88dae9e4682
--- /dev/null
+++ b/deps/SZ/zstd/common/compiler.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPILER_H
+#define ZSTD_COMPILER_H
+
+/*-*******************************************************
+*  Compiler specifics
+*********************************************************/
+/* force inlining */
+#if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#  define INLINE_KEYWORD inline
+#else
+#  define INLINE_KEYWORD
+#endif
+
+#if defined(__GNUC__)
+#  define FORCE_INLINE_ATTR __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#  define FORCE_INLINE_ATTR __forceinline
+#else
+#  define FORCE_INLINE_ATTR
+#endif
+
+/**
+ * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
+ * parameters. They must be inlined for the compiler to eliminate the constant
+ * branches.
+ */
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
+/**
+ * HINT_INLINE is used to help the compiler generate better code. It is *not*
+ * used for "templates", so it can be tweaked based on the compiler's
+ * performance.
+ *
+ * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the
+ * always_inline attribute.
+ *
+ * clang up to 5.0.0 (trunk) benefit tremendously from the always_inline
+ * attribute.
+ */
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
+#  define HINT_INLINE static INLINE_KEYWORD
+#else
+#  define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
+#endif
+
+/* force no inlining */
+#ifdef _MSC_VER
+#  define FORCE_NOINLINE static __declspec(noinline)
+#else
+#  ifdef __GNUC__
+#    define FORCE_NOINLINE static __attribute__((__noinline__))
+#  else
+#    define FORCE_NOINLINE static
+#  endif
+#endif
+
+/* target attribute */
+#ifndef __has_attribute
+  #define __has_attribute(x) 0  /* Compatibility with non-clang compilers. */
+#endif
+#if defined(__GNUC__)
+#  define TARGET_ATTRIBUTE(target) __attribute__((__target__(target)))
+#else
+#  define TARGET_ATTRIBUTE(target)
+#endif
+
+/* Enable runtime BMI2 dispatch based on the CPU.
+ * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
+ */
+#ifndef DYNAMIC_BMI2
+  #if ((defined(__clang__) && __has_attribute(__target__)) \
+      || (defined(__GNUC__) \
+          && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \
+      && (defined(__x86_64__) || defined(_M_X86)) \
+      && !defined(__BMI2__)
+  #  define DYNAMIC_BMI2 1
+  #else
+  #  define DYNAMIC_BMI2 0
+  #endif
+#endif
+
+/* prefetch : PREFETCH(ptr) issues a prefetch hint for address `ptr`, or expands to nothing when no intrinsic is selected */
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64. NOTE(review): 32-bit MSVC predefines _M_IX86, not _M_I86 - confirm which was intended */
+#  include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#  define PREFETCH(ptr)   _mm_prefetch((const char*)(ptr), _MM_HINT_T0)   /* (ptr) parenthesized so the cast applies to the whole argument expression */
+#elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#  define PREFETCH(ptr)   __builtin_prefetch((ptr), 0, 0)
+#else
+#  define PREFETCH(ptr)   /* disabled */
+#endif
+
+/* disable warnings */
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4100)        /* disable: C4100: unreferenced formal parameter */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+#endif /* ZSTD_COMPILER_H */
diff --git a/deps/SZ/zstd/common/cpu.h b/deps/SZ/zstd/common/cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..88e0ebf44f86207afe6ddd688f9cb2dea714f14e
--- /dev/null
+++ b/deps/SZ/zstd/common/cpu.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2018-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMMON_CPU_H
+#define ZSTD_COMMON_CPU_H
+
+/**
+ * Implementation taken from folly/CpuId.h
+ * https://github.com/facebook/folly/blob/master/folly/CpuId.h
+ */
+
+#include <string.h>
+
+#include "mem.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+typedef struct {
+    U32 f1c;
+    U32 f1d;
+    U32 f7b;
+    U32 f7c;
+} ZSTD_cpuid_t;
+
+MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {   /* queries CPUID leaves 1 and 7; a field stays 0 when its leaf is not reported or no branch below is compiled in */
+    U32 f1c = 0;   /* leaf 1, ECX */
+    U32 f1d = 0;   /* leaf 1, EDX */
+    U32 f7b = 0;   /* leaf 7 (sub-leaf 0), EBX */
+    U32 f7c = 0;   /* leaf 7 (sub-leaf 0), ECX */
+#ifdef _MSC_VER
+    int reg[4];
+    __cpuid((int*)reg, 0);   /* leaf 0 : reg[0] is compared against the leaf numbers requested below */
+    {
+        int const n = reg[0];
+        if (n >= 1) {
+            __cpuid((int*)reg, 1);
+            f1c = (U32)reg[2];
+            f1d = (U32)reg[3];
+        }
+        if (n >= 7) {
+            __cpuidex((int*)reg, 7, 0);
+            f7b = (U32)reg[1];
+            f7c = (U32)reg[2];
+        }
+    }
+#elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__)
+    /* The following block is like the normal cpuid branch below, but gcc
+     * reserves ebx for use of its pic register so we must specially
+     * handle the save and restore to avoid clobbering the register
+     */
+    U32 n;
+    __asm__(
+        "pushl %%ebx\n\t"
+        "cpuid\n\t"
+        "popl %%ebx\n\t"
+        : "=a"(n)
+        : "a"(0)
+        : "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "popl %%ebx\n\t"
+          : "=a"(f1a), "=c"(f1c), "=d"(f1d)
+          : "a"(1));
+    }
+    if (n >= 7) {
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "movl %%ebx, %%eax\n\t"   /* hand the EBX result back through EAX before ebx is restored; separator is "\n\t" like the other lines */
+          "popl %%ebx"
+          : "=a"(f7b), "=c"(f7c)
+          : "a"(7), "c"(0)
+          : "edx");
+    }
+#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__)
+    U32 n;
+    __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;
+      __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx");
+    }
+    if (n >= 7) {
+      U32 f7a;
+      __asm__("cpuid"
+              : "=a"(f7a), "=b"(f7b), "=c"(f7c)
+              : "a"(7), "c"(0)
+              : "edx");
+    }
+#endif
+    {   /* pack the four collected registers into the returned struct */
+        ZSTD_cpuid_t cpuid;
+        cpuid.f1c = f1c;
+        cpuid.f1d = f1d;
+        cpuid.f7b = f7b;
+        cpuid.f7c = f7c;
+        return cpuid;
+    }
+}
+
+#define X(name, r, bit)   /* defines ZSTD_cpuid_<name>() : non-zero when bit `bit` of field `r` is set */ \
+  MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) {                 \
+    return ((cpuid.r) & (1U << bit)) != 0;                                     \
+  }
+
+/* cpuid(1): Processor Info and Feature Bits. */
+#define C(name, bit) X(name, f1c, bit)
+  C(sse3, 0)
+  C(pclmuldq, 1)
+  C(dtes64, 2)
+  C(monitor, 3)
+  C(dscpl, 4)
+  C(vmx, 5)
+  C(smx, 6)
+  C(eist, 7)
+  C(tm2, 8)
+  C(ssse3, 9)
+  C(cnxtid, 10)
+  C(fma, 12)
+  C(cx16, 13)
+  C(xtpr, 14)
+  C(pdcm, 15)
+  C(pcid, 17)
+  C(dca, 18)
+  C(sse41, 19)
+  C(sse42, 20)
+  C(x2apic, 21)
+  C(movbe, 22)
+  C(popcnt, 23)
+  C(tscdeadline, 24)
+  C(aes, 25)
+  C(xsave, 26)
+  C(osxsave, 27)
+  C(avx, 28)
+  C(f16c, 29)
+  C(rdrand, 30)
+#undef C
+#define D(name, bit) X(name, f1d, bit)
+  D(fpu, 0)
+  D(vme, 1)
+  D(de, 2)
+  D(pse, 3)
+  D(tsc, 4)
+  D(msr, 5)
+  D(pae, 6)
+  D(mce, 7)
+  D(cx8, 8)
+  D(apic, 9)
+  D(sep, 11)
+  D(mtrr, 12)
+  D(pge, 13)
+  D(mca, 14)
+  D(cmov, 15)
+  D(pat, 16)
+  D(pse36, 17)
+  D(psn, 18)
+  D(clfsh, 19)
+  D(ds, 21)
+  D(acpi, 22)
+  D(mmx, 23)
+  D(fxsr, 24)
+  D(sse, 25)
+  D(sse2, 26)
+  D(ss, 27)
+  D(htt, 28)
+  D(tm, 29)
+  D(pbe, 31)
+#undef D
+
+/* cpuid(7): Extended Features. */
+#define B(name, bit) X(name, f7b, bit)
+  B(bmi1, 3)
+  B(hle, 4)
+  B(avx2, 5)
+  B(smep, 7)
+  B(bmi2, 8)
+  B(erms, 9)
+  B(invpcid, 10)
+  B(rtm, 11)
+  B(mpx, 14)
+  B(avx512f, 16)
+  B(avx512dq, 17)
+  B(rdseed, 18)
+  B(adx, 19)
+  B(smap, 20)
+  B(avx512ifma, 21)
+  B(pcommit, 22)
+  B(clflushopt, 23)
+  B(clwb, 24)
+  B(avx512pf, 26)
+  B(avx512er, 27)
+  B(avx512cd, 28)
+  B(sha, 29)
+  B(avx512bw, 30)
+  B(avx512vl, 31)
+#undef B
+#define C(name, bit) X(name, f7c, bit)
+  C(prefetchwt1, 0)
+  C(avx512vbmi, 1)
+#undef C
+
+#undef X
+
+#endif /* ZSTD_COMMON_CPU_H */
diff --git a/deps/SZ/zstd/common/debug.c b/deps/SZ/zstd/common/debug.c
new file mode 100644
index 0000000000000000000000000000000000000000..3ebdd1cb15a6288e871f361f5d3fbbd8835d1d90
--- /dev/null
+++ b/deps/SZ/zstd/common/debug.c
@@ -0,0 +1,44 @@
+/* ******************************************************************
+   debug
+   Part of FSE library
+   Copyright (C) 2013-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+
+
+/*
+ * This module only hosts one global variable
+ * which can be used to dynamically influence the verbosity of traces,
+ * such as DEBUGLOG and RAWLOG
+ */
+
+#include "debug.h"
+
+int g_debuglevel = DEBUGLEVEL;   /* starts at the compile-time DEBUGLEVEL value */
diff --git a/deps/SZ/zstd/common/debug.h b/deps/SZ/zstd/common/debug.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c04ad2cc98c4b949035141c56a6385ed9797012
--- /dev/null
+++ b/deps/SZ/zstd/common/debug.h
@@ -0,0 +1,123 @@
+/* ******************************************************************
+   debug
+   Part of FSE library
+   Copyright (C) 2013-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+
+
+/*
+ * The purpose of this header is to enable debug functions.
+ * They regroup assert(), DEBUGLOG() and RAWLOG() for run-time,
+ * and DEBUG_STATIC_ASSERT() for compile-time.
+ *
+ * By default, DEBUGLEVEL==0, which means run-time debug is disabled.
+ *
+ * Level 1 enables assert() only.
+ * Starting level 2, traces can be generated and pushed to stderr.
+ * The higher the level, the more verbose the traces.
+ *
+ * It's possible to dynamically adjust level using variable g_debuglevel,
+ * which is only declared if DEBUGLEVEL>=2,
+ * and is a global variable, not multi-thread protected (use with care)
+ */
+
+#ifndef DEBUG_H_12987983217
+#define DEBUG_H_12987983217
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* static assert is triggered at compile time, leaving no runtime artefact,
+ * but can only work with compile-time constants.
+ * This variant can only be used inside a function. */
+#define DEBUG_STATIC_ASSERT(c) (void)sizeof(char[(c) ? 1 : -1])
+
+
+/* DEBUGLEVEL is expected to be defined externally,
+ * typically through compiler command line.
+ * Value must be a number. */
+#ifndef DEBUGLEVEL
+#  define DEBUGLEVEL 0
+#endif
+
+/* recommended values for DEBUGLEVEL :
+ * 0 : no debug, all run-time functions disabled
+ * 1 : no display, enables assert() only
+ * 2 : reserved, for currently active debug path
+ * 3 : events once per object lifetime (CCtx, CDict, etc.)
+ * 4 : events once per frame
+ * 5 : events once per block
+ * 6 : events once per sequence (verbose)
+ * 7+: events at every position (*very* verbose)
+ *
+ * It's generally inconvenient to output traces > 5.
+ * In which case, it's possible to selectively enable higher verbosity levels
+ * by modifying g_debuglevel.
+ */
+
+#if (DEBUGLEVEL>=1)
+#  include <assert.h>
+#else
+#  ifndef assert   /* assert may be already defined, due to prior #include <assert.h> */
+#    define assert(condition) ((void)0)   /* disable assert (default) */
+#  endif
+#endif
+
+#if (DEBUGLEVEL>=2)
+#  include <stdio.h>
+extern int g_debuglevel; /* here, this variable is only declared,
+                           it actually lives in debug.c,
+                           and is shared by the whole process.
+                           It's typically used to enable very verbose levels
+                           on selective conditions (such as position in src) */
+/* RAWLOG(l, ...) / DEBUGLOG(l, ...) : fprintf to stderr when (l) <= g_debuglevel; DEBUGLOG also prefixes __FILE__ ": " and appends " \n". `l` is parenthesized so expression arguments compare as a whole. */
+#  define RAWLOG(l, ...) {                                      \
+                if ((l)<=g_debuglevel) {                        \
+                    fprintf(stderr, __VA_ARGS__);               \
+            }   }
+#  define DEBUGLOG(l, ...) {                                    \
+                if ((l)<=g_debuglevel) {                        \
+                    fprintf(stderr, __FILE__ ": " __VA_ARGS__); \
+                    fprintf(stderr, " \n");                     \
+            }   }
+#else
+#  define RAWLOG(l, ...)      {}    /* disabled */
+#  define DEBUGLOG(l, ...)    {}    /* disabled */
+#endif
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* DEBUG_H_12987983217 */
diff --git a/deps/SZ/zstd/common/entropy_common.c b/deps/SZ/zstd/common/entropy_common.c
new file mode 100644
index 0000000000000000000000000000000000000000..b12944e1de93ad10a0349fd9075bb0258128697c
--- /dev/null
+++ b/deps/SZ/zstd/common/entropy_common.c
@@ -0,0 +1,236 @@
+/*
+   Common functions of New Generation Entropy library
+   Copyright (C) 2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+*************************************************************************** */
+
+/* *************************************
+*  Dependencies
+***************************************/
+#include "mem.h"
+#include "error_private.h"       /* ERR_*, ERROR */
+#define FSE_STATIC_LINKING_ONLY  /* FSE_MIN_TABLELOG */
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY  /* HUF_TABLELOG_ABSOLUTEMAX */
+#include "huf.h"
+
+
+/*===   Version   ===*/
+unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
+
+
+/*===   Error Management   ===*/
+unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
+/* the HUF_* entry points forward to the same ERR_* helpers as the FSE_* ones */
+unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{   /* Decodes a count header : fills normalizedCounter[], sets *tableLogPtr and *maxSVPtr, @return bytes read from headerBuffer or an ERROR() code */
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+    /* short input : decode from a zero-padded 4-byte copy, then reject a result that claims more than hbSize bytes */
+    if (hbSize < 4) {
+        /* This function only works when hbSize >= 4 */
+        char buffer[4];
+        memset(buffer, 0, sizeof(buffer));
+        memcpy(buffer, headerBuffer, hbSize);
+        {   size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr,
+                                                    buffer, sizeof(buffer));
+            if (FSE_isError(countSize)) return countSize;
+            if (countSize > hbSize) return ERROR(corruption_detected);
+            return countSize;
+    }   }
+    assert(hbSize >= 4);
+
+    /* init */
+    memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0]));   /* all symbols not present in NCount have a frequency of 0 */
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+    /* one count is decoded per iteration, until `remaining` drops to 1 or charnum passes *maxSVPtr */
+    while ((remaining>1) & (charnum<=*maxSVPtr)) {
+        if (previous0) {   /* previous count was 0 : a run of additional zero-count symbols is encoded here */
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF) {   /* 16 set bits : 24 more zero-count symbols */
+                n0 += 24;
+                if (ip < iend-5) {
+                    ip += 2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                } else {
+                    bitStream >>= 16;
+                    bitCount   += 16;
+            }   }
+            while ((bitStream & 3) == 3) {   /* 2-bit value 3 : 3 more zero-count symbols, run continues */
+                n0 += 3;
+                bitStream >>= 2;
+                bitCount += 2;
+            }
+            n0 += bitStream & 3;   /* final 2-bit value (0..2) ends the run */
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                assert((bitCount >> 3) <= 3); /* For first condition to work */
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            } else {
+                bitStream >>= 2;
+        }   }
+        {   int const max = (2*threshold-1) - remaining;
+            int count;
+            /* values below `max` are coded on nbBits-1 bits, the others on nbBits bits */
+            if ((bitStream & (threshold-1)) < (U32)max) {
+                count = bitStream & (threshold-1);
+                bitCount += nbBits-1;
+            } else {
+                count = bitStream & (2*threshold-1);
+                if (count >= threshold) count -= max;
+                bitCount += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= count < 0 ? -count : count;   /* -1 means +1 */
+            normalizedCounter[charnum++] = (short)count;
+            previous0 = !count;
+            while (remaining < threshold) {   /* shrink the field width while the remaining budget is below threshold */
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                ip = iend - 4;   /* pin the 4-byte read to the last 4 bytes of the buffer */
+            }
+            bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+    }   }   /* while ((remaining>1) & (charnum<=*maxSVPtr)) */
+    if (remaining != 1) return ERROR(corruption_detected);   /* the budget must be used up exactly */
+    if (bitCount > 32) return ERROR(corruption_detected);
+    *maxSVPtr = charnum-1;   /* report the index of the last symbol decoded */
+
+    ip += (bitCount+7)>>3;   /* round consumed bits up to whole bytes */
+    return ip-istart;
+}
+
+
+/*! HUF_readStats() :
+    Read compact Huffman tree, saved by HUF_writeCTable().
+    `huffWeight` is destination buffer.
+    `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX+1 U32.
+    @return : size read from `src` , or an error Code .
+    Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
+*/
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+    /* the first byte selects the header format (>= 128 : raw 4-bit weights, otherwise FSE-compressed) and carries its size */
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    /* memset(huffWeight, 0, hwSize);   *//* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128) {  /* special header */
+        oSize = iSize - 127;   /* number of weights stored directly */
+        iSize = ((oSize+1)/2);   /* two 4-bit weights per input byte */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        if (oSize >= hwSize) return ERROR(corruption_detected);
+        ip += 1;
+        {   U32 n;
+            for (n=0; n<oSize; n+=2) {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }
+    else  {   /* header compressed with FSE (normal case) */
+        FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)];  /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
+    weightTotal = 0;
+    {   U32 n; for (n=0; n<oSize; n++) {
+            if (huffWeight[n] >= HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+            rankStats[huffWeight[n]]++;
+            weightTotal += (1 << huffWeight[n]) >> 1;   /* weight w adds 2^(w-1), and 0 when w == 0 */
+    }   }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    {   U32 const tableLog = BIT_highbit32(weightTotal) + 1;   /* presumably BIT_highbit32 yields the index of the highest set bit - confirm in bitstream.h */
+        if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+        *tableLogPtr = tableLog;
+        /* determine last weight */
+        {   U32 const total = 1 << tableLog;
+            U32 const rest = total - weightTotal;
+            U32 const verif = 1 << BIT_highbit32(rest);
+            U32 const lastWeight = BIT_highbit32(rest) + 1;
+            if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+            huffWeight[oSize] = (BYTE)lastWeight;
+            rankStats[lastWeight]++;
+    }   }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);   /* decoded weights plus the implied last one */
+    return iSize+1;   /* payload bytes plus the leading header byte */
+}
diff --git a/deps/SZ/zstd/common/error_private.c b/deps/SZ/zstd/common/error_private.c
new file mode 100644
index 0000000000000000000000000000000000000000..d004ee636c67971408ad81d457ac80d445c6097b
--- /dev/null
+++ b/deps/SZ/zstd/common/error_private.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* The purpose of this file is to have a single list of error strings embedded in binary */
+
+#include "error_private.h"
+
+const char* ERR_getErrorString(ERR_enum code)
+{   /* Maps an error enum value to a constant description string; maxCode and any unlisted value yield notErrorCode. */
+    static const char* const notErrorCode = "Unspecified error code";
+    switch( code )
+    {
+    case PREFIX(no_error): return "No error detected";
+    case PREFIX(GENERIC):  return "Error (generic)";
+    case PREFIX(prefix_unknown): return "Unknown frame descriptor";
+    case PREFIX(version_unsupported): return "Version not supported";
+    case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
+    case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
+    case PREFIX(corruption_detected): return "Corrupted block detected";
+    case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
+    case PREFIX(parameter_unsupported): return "Unsupported parameter";
+    case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
+    case PREFIX(init_missing): return "Context should be init first";
+    case PREFIX(memory_allocation): return "Allocation error : not enough memory";
+    case PREFIX(workSpace_tooSmall): return "workSpace buffer is not large enough";
+    case PREFIX(stage_wrong): return "Operation not authorized at current processing stage";
+    case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
+    case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
+    case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
+    case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
+    case PREFIX(dictionary_wrong): return "Dictionary mismatch";
+    case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
+    case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
+    case PREFIX(srcSize_wrong): return "Src size is incorrect";
+        /* following error codes are not stable and may be removed or changed in a future version */
+    case PREFIX(frameIndex_tooLarge): return "Frame index is too large";
+    case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
+    case PREFIX(maxCode):   /* deliberately shares the default answer */
+    default: return notErrorCode;
+    }
+}
diff --git a/deps/SZ/zstd/common/error_private.h b/deps/SZ/zstd/common/error_private.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d2fa7e34b01459d3f5afa858ae133cfdd9db144
--- /dev/null
+++ b/deps/SZ/zstd/common/error_private.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* Note : this module is expected to remain private, do not expose it */
+
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>        /* size_t */
+#include "zstd_errors.h"  /* enum list */
+
+
+/* ****************************************
+*  Compiler-specific
+******************************************/
+#if defined(__GNUC__)
+#  define ERR_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define ERR_STATIC static inline
+#elif defined(_MSC_VER)
+#  define ERR_STATIC static __inline
+#else
+#  define ERR_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-****************************************
+*  Customization (error_public.h)
+******************************************/
+typedef ZSTD_ErrorCode ERR_enum;
+#define PREFIX(name) ZSTD_error_##name
+
+
+/*-****************************************
+*  Error codes handling
+******************************************/
+#undef ERROR   /* reported already defined on VS 2015 (Rich Geldreich) */
+#define ERROR(name) ZSTD_ERROR(name)
+#define ZSTD_ERROR(name) ((size_t)-PREFIX(name))
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }   /* ERROR(name) is (size_t)-enumValue, so any code above (size_t)-maxCode counts as an error */
+/* ERR_getErrorCode() : converts a size_t result back to its enum value; non-error results map to (ERR_enum)0 */
+ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); }
+
+
+/*-****************************************
+*  Error Strings
+******************************************/
+
+const char* ERR_getErrorString(ERR_enum code);   /* error_private.c */
+
+ERR_STATIC const char* ERR_getErrorName(size_t code)
+{   /* size_t result -> enum value -> description string from error_private.c */
+    return ERR_getErrorString(ERR_getErrorCode(code));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_H_MODULE */
diff --git a/deps/SZ/zstd/common/fse.h b/deps/SZ/zstd/common/fse.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5a6b6d4db70062053511bcc3a278a1b3d9b7287
--- /dev/null
+++ b/deps/SZ/zstd/common/fse.h
@@ -0,0 +1,708 @@
+/* ******************************************************************
+   FSE : Finite State Entropy codec
+   Public Prototypes declaration
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef FSE_H
+#define FSE_H
+
+
+/*-*****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+
+
+/*-*****************************************
+*  FSE_PUBLIC_API : control library symbols visibility
+******************************************/
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+#  define FSE_PUBLIC_API __attribute__ ((visibility ("default")))   /* exporting with __GNUC__ >= 4 : default symbol visibility */
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
+#  define FSE_PUBLIC_API __declspec(dllexport)   /* exporting with any other compiler */
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+#  define FSE_PUBLIC_API __declspec(dllimport) /* Not required, but it lets the compiler generate better code, saving a function pointer load from the IAT and an indirect jump. */
+#else
+#  define FSE_PUBLIC_API   /* neither FSE_DLL_EXPORT nor FSE_DLL_IMPORT is set to 1 : no decoration */
+#endif
+
+/*------   Version   ------*/
+#define FSE_VERSION_MAJOR    0
+#define FSE_VERSION_MINOR    9
+#define FSE_VERSION_RELEASE  0
+
+#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE   /* unquoted token sequence major.minor.release */
+#define FSE_QUOTE(str) #str   /* stringifies its argument exactly as written */
+#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str)   /* extra macro level, so the argument is expanded before being stringified */
+#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION)   /* version as a string literal : "0.9.0" with the values above */
+
+#define FSE_VERSION_NUMBER  (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE)   /* 900 with the values above */
+FSE_PUBLIC_API unsigned FSE_versionNumber(void);   /**< library version number; to be used when checking dll version */
+
+
+/*-****************************************
+*  FSE simple functions
+******************************************/
+/*! FSE_compress() :
+    Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
+    'dst' buffer must be already allocated. Compression runs faster if dstCapacity >= FSE_compressBound(srcSize).
+    @return : size of compressed data (<= dstCapacity).
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead.
+                     if FSE_isError(return), compression failed (more details using FSE_getErrorName())
+*/
+FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize);
+
+/*! FSE_decompress():
+    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstCapacity'.
+    @return : size of regenerated data (<= dstCapacity),
+              or an error code, which can be tested using FSE_isError() .
+
+    ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+FSE_PUBLIC_API size_t FSE_decompress(void* dst,  size_t dstCapacity,
+                               const void* cSrc, size_t cSrcSize);
+
+
+/*-*****************************************
+*  Tool functions
+******************************************/
+FSE_PUBLIC_API size_t FSE_compressBound(size_t size);       /* maximum compressed size */
+
+/* Error Management */
+FSE_PUBLIC_API unsigned    FSE_isError(size_t code);        /* tells if a return value is an error code */
+FSE_PUBLIC_API const char* FSE_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
+
+/*-*****************************************
+*  FSE advanced functions
+******************************************/
+/*! FSE_compress2() :
+    Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
+    Both parameters can be defined as '0' to mean : use default value
+    @return : size of compressed data
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
+                     if FSE_isError(return), it's an error code.
+*/
+FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+
+
+/*-*****************************************
+*  FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrence from source[] into table count[] (see hist.h)
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+/* *** COMPRESSION *** */
+
+/*! FSE_optimalTableLog():
+    dynamically downsize 'tableLog' when conditions are met.
+    It saves CPU time, by using smaller tables, while preserving or even improving compression ratio.
+    @return : recommended tableLog (necessarily <= 'maxTableLog') */
+FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_normalizeCount():
+    normalize counts so that sum(count[]) == Power_of_2 (2^tableLog)
+    'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1).
+    @return : tableLog,
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog,
+                    const unsigned* count, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_NCountWriteBound():
+    Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'.
+    Typically useful for allocation purpose. */
+FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_writeNCount():
+    Compactly save 'normalizedCounter' into 'buffer'.
+    @return : size of the compressed table,
+              or an errorCode, which can be tested using FSE_isError(). */
+FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+                                 const short* normalizedCounter,
+                                 unsigned maxSymbolValue, unsigned tableLog);
+
+/*! Constructor and Destructor of FSE_CTable.
+    Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable;   /* don't allocate that. It's only meant to be more restrictive than void* */
+FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog);
+FSE_PUBLIC_API void        FSE_freeCTable (FSE_CTable* ct);
+
+/*! FSE_buildCTable():
+    Builds `ct`, which must be already allocated, using FSE_createCTable().
+    @return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_compress_usingCTable():
+    Compress `src` using `ct` into `dst` which must be already allocated.
+    @return : size of compressed data (<= `dstCapacity`),
+              or 0 if compressed data could not fit into `dst`,
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct);
+
+/*!
+Tutorial :
+----------
+The first step is to count all symbols. FSE_count() does this job very fast.
+Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells.
+'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0]
+maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value)
+FSE_count() will return the number of occurrence of the most frequent symbol.
+This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+The next step is to normalize the frequencies.
+FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'.
+It also guarantees a minimum of 1 to any Symbol with frequency >= 1.
+You can use 'tableLog'==0 to mean "use default tableLog value".
+If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(),
+which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default").
+
+The result of FSE_normalizeCount() will be saved into a table,
+called 'normalizedCounter', which is a table of signed short.
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells.
+The return value is tableLog if everything proceeded as expected.
+It is 0 if there is a single symbol within distribution.
+If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount().
+'buffer' must be already allocated.
+For guaranteed success, buffer size must be at least FSE_NCountWriteBound().
+The result of the function is the number of bytes written into 'buffer'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small).
+
+'normalizedCounter' can then be used to create the compression table 'CTable'.
+The space required by 'CTable' must be already allocated, using FSE_createCTable().
+You can then use FSE_buildCTable() to fill 'CTable'.
+If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()).
+
+'CTable' can then be used to compress 'src', with FSE_compress_usingCTable().
+Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize'
+The function returns the size of compressed data (without header), necessarily <= `dstCapacity`.
+If it returns '0', compressed data could not fit into 'dst'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+*/
+
+
+/* *** DECOMPRESSION *** */
+
+/*! FSE_readNCount():
+    Read compactly saved 'normalizedCounter' from 'rBuffer'.
+    @return : size read from 'rBuffer',
+              or an errorCode, which can be tested using FSE_isError().
+              maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter,
+                           unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
+                           const void* rBuffer, size_t rBuffSize);
+
+/*! Constructor and Destructor of FSE_DTable.
+    Note that its size depends on 'tableLog' */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog);
+FSE_PUBLIC_API void        FSE_freeDTable(FSE_DTable* dt);
+
+/*! FSE_buildDTable():
+    Builds 'dt', which must be already allocated, using FSE_createDTable().
+    return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_decompress_usingDTable():
+    Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
+    into `dst` which must be already allocated.
+    @return : size of regenerated data (necessarily <= `dstCapacity`),
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSE_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBuffSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'.
+This is performed by the function FSE_buildDTable().
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable().
+`cSrcSize` must be strictly correct, otherwise decompression will fail.
+FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`).
+If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
+*/
+
+#endif  /* FSE_H */
+
+#if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY)
+#define FSE_H_FSE_STATIC_LINKING_ONLY
+
+/* *** Dependency *** */
+#include "bitstream.h"
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512   /* bound, in bytes, reserved for the normalized-count header */
+#define FSE_BLOCKBOUND(size) ((size) + ((size)>>7))   /* argument parenthesized : any expression can be passed safely */
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<((maxTableLog)-1)) + (((maxSymbolValue)+1)*2))   /* in U32 cells; arguments parenthesized */
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<(maxTableLog)))   /* in U32 cells; argument parenthesized */
+
+/* or use the size to malloc() space directly. Pay attention to alignment restrictions though */
+#define FSE_CTABLE_SIZE(maxTableLog, maxSymbolValue)   (FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(FSE_CTable))   /* same size, in bytes */
+#define FSE_DTABLE_SIZE(maxTableLog)                   (FSE_DTABLE_SIZE_U32(maxTableLog) * sizeof(FSE_DTable))   /* same size, in bytes */
+
+
+/* *****************************************
+ *  FSE advanced API
+ ***************************************** */
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
+/**< same as FSE_optimalTableLog(), which uses `minus==2` */
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
+ * Its arguments are parenthesized, so any expression can be passed safely. */
+#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue)   ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + (((maxTableLog) > 12) ? (1 << ((maxTableLog) - 2)) : 1024) )
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
+/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
+
+size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
+/**< build a fake FSE_CTable, designed to compress always the same symbolValue */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be >= `(1<<tableLog)`.
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
+
+size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/**< build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog);
+/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */
+
+typedef enum {   /* how far a previously built table may be reused */
+   FSE_repeat_none,  /**< Cannot use the previous table */
+   FSE_repeat_check, /**< Can use the previous table but it must be checked */
+   FSE_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
+ } FSE_repeat;
+
+/* *****************************************
+*  FSE symbol compression API
+*******************************************/
+/*!
+   This API consists of small unitary functions, which highly benefit from being inlined.
+   Hence their bodies are included in the next section.
+*/
+typedef struct {
+    ptrdiff_t   value;        /* current state value; FSE_initCState() starts it at 1<<tableLog */
+    const void* stateTable;   /* read as a table of U16 by FSE_encodeSymbol() */
+    const void* symbolTT;     /* read as a table of FSE_symbolCompressionTransform, indexed by symbol */
+    unsigned    stateLog;     /* tableLog of the CTable; bit count passed to BIT_addBits() by FSE_flushCState() */
+} FSE_CState_t;
+
+static void FSE_initCState(FSE_CState_t* CStatePtr, const FSE_CTable* ct);
+
+static void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned symbol);
+
+static void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* CStatePtr);
+
+/**<
+These functions are inner components of FSE_compress_usingCTable().
+They allow the creation of custom streams, mixing multiple tables and bit sources.
+
+A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
+So the first symbol you will encode is the last you will decode, like a LIFO stack.
+
+You will need a few variables to track your CStream. They are :
+
+FSE_CTable*   ct;         // Allocated by FSE_createCTable(), filled by FSE_buildCTable()
+BIT_CStream_t bitStream;  // bitStream tracking structure
+FSE_CState_t  state;      // State tracking structure (can have several)
+
+
+The first thing to do is to init bitStream and state.
+    size_t errorCode = BIT_initCStream(&bitStream, dstBuffer, maxDstSize);
+    FSE_initCState(&state, ct);
+
+Note that BIT_initCStream() can produce an error code, so its result should be tested, using FSE_isError();
+You can then encode your input data, byte after byte.
+FSE_encodeSymbol() outputs a maximum of 'tableLog' bits at a time.
+Remember decoding will be done in reverse direction.
+    FSE_encodeSymbol(&bitStream, &state, symbol);
+
+At any time, you can also add any bit sequence.
+Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
+    BIT_addBits(&bitStream, bitField, nbBits);
+
+The above methods don't commit data to memory, they just store it into local register, for speed.
+Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+Writing data to memory is a manual operation, performed by the flushBits function.
+    BIT_flushBits(&bitStream);
+
+Your last FSE encoding operation shall be to flush your last state value(s).
+    FSE_flushCState(&bitStream, &state);
+
+Finally, you must close the bitStream.
+The function returns the size of CStream in bytes.
+If data couldn't fit into dstBuffer, it will return a 0 ( == not compressible)
+If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
+    size_t size = BIT_closeCStream(&bitStream);
+*/
+
+
+/* *****************************************
+*  FSE symbol decompression API
+*******************************************/
+typedef struct {
+    size_t      state;   /* index of the current cell within `table` */
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+static void     FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+/**<
+Let's now decompose FSE_decompress_usingDTable() into its unitary components.
+You will decode FSE-encoded symbols from the bitStream,
+and also any other bitFields you put in, **in reverse order**.
+
+You will need a few variables to track your bitStream. They are :
+
+BIT_DStream_t DStream;    // Stream context
+FSE_DState_t  DState;     // State context. Multiple ones are possible
+FSE_DTable*   DTablePtr;  // Decoding table, provided by FSE_buildDTable()
+
+The first thing to do is to init the bitStream.
+    errorCode = BIT_initDStream(&DStream, srcBuffer, srcSize);
+
+You should then retrieve your initial state(s)
+(in reverse flushing order if you have several ones) :
+    FSE_initDState(&DState, &DStream, DTablePtr);
+
+You can then decode your data, symbol after symbol.
+For information the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
+Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last in, first out).
+    unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
+
+You can retrieve any bitfield you eventually stored into the bitStream (in reverse order)
+Note : maximum allowed nbBits is 25, for 32-bits compatibility
+    size_t bitField = BIT_readBits(&DStream, nbBits);
+
+All above operations only read from local register (which size depends on size_t).
+Refueling the register from memory is manually performed by the reload method.
+    endSignal = BIT_reloadDStream(&DStream);
+
+BIT_reloadDStream() result tells if there is still some more data to read from DStream.
+BIT_DStream_unfinished : there is still some data left into the DStream.
+BIT_DStream_endOfBuffer : Dstream reached end of buffer. Its container may no longer be completely filled.
+BIT_DStream_completed : Dstream reached its exact end, corresponding in general to decompression completed.
+BIT_DStream_tooFar : Dstream went too far. Decompression result is corrupted.
+
+When reaching end of buffer (BIT_DStream_endOfBuffer), progress slowly, notably if you decode multiple symbols per loop,
+to properly detect the exact end of stream.
+After each decoded symbol, check if DStream is fully consumed using this simple test :
+    BIT_reloadDStream(&DStream) >= BIT_DStream_completed
+
+When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
+Checking if DStream has reached its end is performed by :
+    BIT_endOfDStream(&DStream);
+Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible.
+    FSE_endOfDState(&DState);
+*/
+
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+typedef struct {
+    int deltaFindState;   /* added to (state >> nbBitsOut) to index the state table */
+    U32 deltaNbBits;      /* (state + deltaNbBits) >> 16 is the number of bits to output */
+} FSE_symbolCompressionTransform; /* total 8 bytes */
+
+MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
+{   /* `ct` is read as : a 16-bit tableLog, a U16 state table starting at the 3rd U16, then the symbol transforms */
+    const void* const header = ct;
+    U32 const tableLog = MEM_read16(header);
+    int const stateTableCells = tableLog ? (1<<(tableLog-1)) : 1;   /* U32 cells skipped before symbolTT */
+    statePtr->stateLog = tableLog;
+    statePtr->value = (ptrdiff_t)1<<tableLog;
+    statePtr->stateTable = (const U16*)header + 2;
+    statePtr->symbolTT = (const U32*)ct + 1 + stateTableCells;
+}
+
+
+/*! FSE_initCState2() :
+*   Variant of FSE_initCState() which also takes the first symbol to encode (the last one to be read back).
+*   The state is started from the smallest value possible for that symbol, saving the cost of this symbol */
+MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)
+{
+    FSE_initCState(statePtr, ct);
+    {   const FSE_symbolCompressionTransform* const transforms = (const FSE_symbolCompressionTransform*)(statePtr->symbolTT);
+        FSE_symbolCompressionTransform const symbolTT = transforms[symbol];
+        U32 const nbBitsOut = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);
+        ptrdiff_t const startValue = (nbBitsOut << 16) - symbolTT.deltaNbBits;   /* computed in U32, then converted to ptrdiff_t */
+        statePtr->value = ((const U16*)(statePtr->stateTable))[(startValue >> nbBitsOut) + symbolTT.deltaFindState];
+    }
+}
+
+MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, U32 symbol)
+{   /* hands the current state and its bit count to BIT_addBits(), then steps to the next state for `symbol` */
+    ptrdiff_t const current = statePtr->value;
+    FSE_symbolCompressionTransform const transform = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+    U32 const nbBitsOut = (U32)((current + transform.deltaNbBits) >> 16);
+    BIT_addBits(bitC, current, nbBitsOut);
+    statePtr->value = ((const U16*)(statePtr->stateTable))[(current >> nbBitsOut) + transform.deltaFindState];
+}
+
+MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
+{   unsigned const nbBits = statePtr->stateLog;   /* the final state value is added with stateLog bits */
+    BIT_addBits(bitC, statePtr->value, nbBits);
+    BIT_flushBits(bitC);
+}
+
+
+/* FSE_getMaxNbBits() :
+ * Upper estimate of the cost of one symbol, in whole bits.
+ * Fractional costs are rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
+ * note 1 : symbolValue is assumed to be valid (<= maxSymbolValue)
+ * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
+MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue)
+{   U32 const roundUp = (1<<16)-1;   /* added before the >>16 so that any fractional part rounds up */
+    U32 const deltaNbBits = ((const FSE_symbolCompressionTransform*) symbolTTPtr)[symbolValue].deltaNbBits;
+    return (deltaNbBits + roundUp) >> 16;
+}
+
+/* FSE_bitCost() :
+ * Approximate symbol cost, as a fractional value, using fixed-point format (accuracyLog fractional bits)
+ * note 1 : symbolValue is assumed to be valid (<= maxSymbolValue)
+ * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
+MEM_STATIC U32 FSE_bitCost(const void* symbolTTPtr, U32 tableLog, U32 symbolValue, U32 accuracyLog)
+{
+    const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr;
+    U32 const minNbBits = symbolTT[symbolValue].deltaNbBits >> 16;   /* whole-bit part of deltaNbBits */
+    U32 const threshold = (minNbBits+1) << 16;   /* minNbBits+1, expressed in the same scale as deltaNbBits */
+    assert(tableLog < 16);
+    assert(accuracyLog < 31-tableLog);  /* ensure enough room for renormalization double shift */
+    {   U32 const tableSize = 1 << tableLog;
+        U32 const deltaFromThreshold = threshold - (symbolTT[symbolValue].deltaNbBits + tableSize);
+        U32 const normalizedDeltaFromThreshold = (deltaFromThreshold << accuracyLog) >> tableLog;   /* linear interpolation (very approximate) */
+        U32 const bitMultiplier = 1 << accuracyLog;   /* value standing for one whole bit in the result */
+        assert(symbolTT[symbolValue].deltaNbBits + tableSize <= threshold);
+        assert(normalizedDeltaFromThreshold <= bitMultiplier);
+        return (minNbBits+1)*bitMultiplier - normalizedDeltaFromThreshold;   /* upper whole-bit cost, minus the interpolated part */
+    }
+}
+
+
+/* ======    Decompression    ====== */
+
+typedef struct {
+    U16 tableLog;   /* number of bits FSE_initDState() reads to obtain the initial state */
+    U16 fastMode;   /* not read in this header; presumably selects the *Fast decoding path - TODO confirm in fse_decompress.c */
+} FSE_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;   /* base of the next state; the freshly read bits are added to it */
+    unsigned char  symbol;     /* symbol produced by this cell */
+    unsigned char  nbBits;     /* number of bits to read when leaving this cell */
+} FSE_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{   /* the first cell of `dt` is read as a FSE_DTableHeader; the cells after it become the state's table */
+    const void* const headerPtr = dt;
+    unsigned const tableLog = ((const FSE_DTableHeader*)headerPtr)->tableLog;
+    DStatePtr->state = BIT_readBits(bitD, tableLog);
+    BIT_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSE_peekSymbol(const FSE_DState_t* DStatePtr)
+{   /* symbol of the current decoding cell; the state itself is left unchanged */
+    const FSE_decode_t* const cells = (const FSE_decode_t*)(DStatePtr->table);
+    return cells[DStatePtr->state].symbol;
+}
+
+MEM_STATIC void FSE_updateState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{   /* steps to the next state without returning a symbol : newState of the current cell + nbBits freshly read bits */
+    const FSE_decode_t* const cells = (const FSE_decode_t*)(DStatePtr->table);
+    FSE_decode_t const cell = cells[DStatePtr->state];
+    size_t const freshBits = BIT_readBits(bitD, (U32)cell.nbBits);
+    DStatePtr->state = cell.newState + freshBits;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{   /* returns the symbol of the current cell, after moving the state to the next cell */
+    const FSE_decode_t* const cells = (const FSE_decode_t*)(DStatePtr->table);
+    FSE_decode_t const cell = cells[DStatePtr->state];
+    size_t const freshBits = BIT_readBits(bitD, (U32)cell.nbBits);
+
+    /* next state = base stored in the cell + the bits just read */
+    DStatePtr->state = cell.newState + freshBits;
+    return cell.symbol;
+}
+
+/*! FSE_decodeSymbolFast() :
+    unsafe variant of FSE_decodeSymbol() : only works if no symbol has a probability > 50% */
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{   /* same steps as FSE_decodeSymbol(), except that the bits come from BIT_readBitsFast() */
+    const FSE_decode_t* const cells = (const FSE_decode_t*)(DStatePtr->table);
+    FSE_decode_t const cell = cells[DStatePtr->state];
+    size_t const freshBits = BIT_readBitsFast(bitD, (U32)cell.nbBits);
+
+    /* next state = base stored in the cell + the bits just read */
+    DStatePtr->state = cell.newState + freshBits;
+    return cell.symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{   /* 1 when the state value is 0, else 0 */
+    return (0 == DStatePtr->state) ? 1 : 0;
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#ifndef FSE_MAX_MEMORY_USAGE   /* a definition made before this point takes precedence */
+#  define FSE_MAX_MEMORY_USAGE 14   /* default; FSE_MAX_TABLELOG is derived from it as FSE_MAX_MEMORY_USAGE-2 */
+#endif
+#ifndef FSE_DEFAULT_MEMORY_USAGE
+#  define FSE_DEFAULT_MEMORY_USAGE 13   /* default; FSE_DEFAULT_TABLELOG is derived from it as FSE_DEFAULT_MEMORY_USAGE-2 */
+#endif
+
+/*!FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#ifndef FSE_MAX_SYMBOL_VALUE
+#  define FSE_MAX_SYMBOL_VALUE 255   /* default; a definition made before this point takes precedence */
+#endif
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE   /* not used in this header; presumably the symbol type of the templated .c code - TODO confirm */
+#define FSE_FUNCTION_EXTENSION   /* expands to nothing */
+#define FSE_DECODE_TYPE FSE_decode_t   /* decoding cell type declared above */
+
+
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)   /* 12 with the default FSE_MAX_MEMORY_USAGE of 14 */
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)   /* number of cells of the largest table */
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)   /* all bits set below FSE_MAX_TABLESIZE */
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)   /* 11 with the default FSE_DEFAULT_MEMORY_USAGE of 13 */
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15   /* compile-time ceiling : a larger FSE_MAX_TABLELOG is rejected just below */
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#  error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSE_TABLESTEP(tableSize) (((tableSize)>>1) + ((tableSize)>>3) + 3)   /* argument parenthesized : any expression can be passed safely */
+
+
+#endif /* FSE_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/deps/SZ/zstd/common/fse_decompress.c b/deps/SZ/zstd/common/fse_decompress.c
new file mode 100644
index 0000000000000000000000000000000000000000..72bbead5beea3d5e73e5b4eaaa689ee8b98533d1
--- /dev/null
+++ b/deps/SZ/zstd/common/fse_decompress.c
@@ -0,0 +1,309 @@
+/* ******************************************************************
+   FSE : Finite State Entropy decoder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include "bitstream.h"
+#include "compiler.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#include "error_private.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)   /* use only *after* variable declarations */
+
+/* check and forward error code; wrapped in do/while(0) so that `CHECK_F(x);` is exactly one statement, safe inside an un-braced if/else */
+#define CHECK_F(f) do { size_t const e = (f); if (FSE_isError(e)) return e; } while (0)
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names : helpers pasting two tokens into a single identifier */
+#define FSE_CAT(X,Y) X##Y                      /* raw token paste, arguments are NOT macro-expanded first */
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)    /* extra level of indirection : X and Y get macro-expanded before being pasted */
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)        /* same expansion as FSE_FUNCTION_NAME */
+
+
+/* Function templates */
+FSE_DTable* FSE_createDTable (unsigned tableLog)
+{   /* heap-allocates a decoding table for `tableLog`, clamped to FSE_TABLELOG_ABSOLUTE_MAX; the result comes straight from malloc(), so it is NULL on failure */
+    unsigned const cappedLog = (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) ? FSE_TABLELOG_ABSOLUTE_MAX : tableLog;
+    return (FSE_DTable*)malloc( FSE_DTABLE_SIZE_U32(cappedLog) * sizeof (U32) );
+}
+
+/*! FSE_freeDTable() :
+ *  Releases a table allocated by FSE_createDTable().
+ *  `dt` is handed straight to free(), so NULL is accepted. */
+void FSE_freeDTable (FSE_DTable* dt) { free(dt); }
+
+size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{   /* fills `dt` (header + decoding cells) from the normalized counters of symbols 0..maxSymbolValue; @return : 0, or an error code */
+    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
+    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];   /* per symbol : next state value to hand out in the last pass */
+
+    U32 const maxSV1 = maxSymbolValue + 1;   /* nb of symbols to process */
+    U32 const tableSize = 1 << tableLog;   /* NOTE(review): evaluated before tableLog is range-checked below */
+    U32 highThreshold = tableSize-1;   /* highest cell still free; cells above it hold the -1 (lowprob) symbols */
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    {   FSE_DTableHeader DTableH;
+        DTableH.tableLog = (U16)tableLog;
+        DTableH.fastMode = 1;   /* cleared below as soon as one counter reaches largeLimit */
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));   /* half of tableSize */
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {
+                    tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;   /* one cell, taken from the top of the table */
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                    symbolNext[s] = normalizedCounter[s];
+        }   }   }
+        memcpy(dt, &DTableH, sizeof(DTableH));   /* header is written into the first element of dt */
+    }
+
+    /* Spread symbols */
+    {   U32 const tableMask = tableSize-1;
+        U32 const step = FSE_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {   /* no iteration for counters of 0 or -1 */
+                tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+        if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+            U32 const nextState = symbolNext[symbol]++;   /* post-increment : successive cells of one symbol get consecutive values */
+            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);   /* relies on the nbBits value stored on the previous line */
+    }   }
+
+    return 0;   /* success */
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{   /* builds the one-cell table used for RLE data, whose only cell carries `symbolValue`; always returns 0 */
+    void* const headerPtr = dt;       /* header lives in the first FSE_DTable element */
+    void* const cellPtr = dt + 1;     /* decoding cells start right after it */
+    FSE_DTableHeader* const header = (FSE_DTableHeader*)headerPtr;
+    FSE_decode_t* const onlyCell = (FSE_decode_t*)cellPtr;
+
+    header->fastMode = 0;
+    header->tableLog = 0;
+
+    /* the single cell : symbol is symbolValue, nbBits and newState are both 0 */
+    onlyCell->symbol = symbolValue;
+    onlyCell->nbBits = 0;
+    onlyCell->newState = 0;
+    return 0;
+}
+
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{   /* builds the table for raw (uncompressed) symbols of `nbBits` bits : cell s carries symbol s; @return : 0, or an error code */
+    void* const headerPtr = dt;
+    void* const cellsPtr = dt + 1;
+    FSE_DTableHeader* const header = (FSE_DTableHeader*)headerPtr;
+    FSE_decode_t* const cells = (FSE_decode_t*)cellsPtr;
+    const unsigned nbCells = 1 << nbBits;   /* one cell per value representable on nbBits bits */
+    unsigned cellIdx;
+
+    /* min size : at least 1 bit */
+    if (nbBits < 1) return ERROR(GENERIC);
+
+    /* header */
+    header->fastMode = 1;
+    header->tableLog = (U16)nbBits;
+
+    /* cells : every entry maps its own index, always costs nbBits, and points to state 0 */
+    for (cellIdx = 0; cellIdx < nbCells; ++cellIdx) {
+        FSE_decode_t* const cell = &cells[cellIdx];
+        cell->symbol = (BYTE)cellIdx;
+        cell->nbBits = (BYTE)nbBits;
+        cell->newState = 0;
+    }
+    return 0;
+}
+
+FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{   /* decodes cSrc into dst using table `dt`, alternating between two states; @return : nb of bytes written into dst, or an error code */
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;   /* main loop writes op[0..3], so it only runs while op < omax-3. NOTE(review): lies before dst when maxDstSize < 3 */
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+
+    /* Init : any error from BIT_initDStream() is returned as-is */
+    CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
+
+    FSE_initDState(&state1, &bitD, dt);   /* both states are initialized from the same bitstream, state1 first */
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)   /* `fast` selects the decoding routine */
+
+    /* 4 symbols per loop; note the bitwise `&` : both conditions are always evaluated, so the stream is reloaded on every test */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) & (op<olimit) ; op+=4) {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }   /* 2 symbols already written in this round */
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail : one symbol per state in turn; each step may write 2 bytes (its own symbol + the final one), hence the omax-2 checks */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    while (1) {
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state1);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state2);   /* last symbol comes from the other state */
+            break;
+        }
+
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state2);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state1);   /* last symbol comes from the other state */
+            break;
+    }   }
+
+    return op-ostart;   /* nb of bytes written */
+}
+
+
+size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{   /* decodes cSrc into dst with the prebuilt table `dt`; forwards the result of the generic decoder (size written, or error code) */
+    const void* const headerPtr = dt;   /* the table begins with its FSE_DTableHeader */
+    const FSE_DTableHeader* const header = (const FSE_DTableHeader*)headerPtr;
+
+    /* both calls pass `fast` as a literal, presumably so that each force-inlined copy gets specialized at compile time */
+    return header->fastMode
+        ? FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1)
+        : FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog)
+{   /* decodes one complete FSE block : table description first, then the payload; the DTable is built inside `workSpace` */
+    short normCount[FSE_MAX_SYMBOL_VALUE+1];
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    unsigned tableLog;
+    size_t headerSize;
+
+    /* 1. read the normalized counters stored at the front of cSrc */
+    headerSize = FSE_readNCount (normCount, &maxSymbolValue, &tableLog, cSrc, cSrcSize);
+    if (FSE_isError(headerSize)) return headerSize;
+    /* headerSize==cSrcSize is deliberately not rejected here :
+     * too small an input is supposed to be already caught by FSE_readNCount(), this is the only remaining case */
+    if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
+
+    /* 2. build the decoding table */
+    CHECK_F( FSE_buildDTable (workSpace, normCount, maxSymbolValue, tableLog) );
+
+    /* 3. decode what follows the header; always return, even if it is an error code */
+    return FSE_decompress_usingDTable (dst, dstCapacity, (const BYTE*)cSrc + headerSize, cSrcSize - headerSize, workSpace);
+}
+
+
+typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];   /* table sized for FSE_MAX_TABLELOG, the largest tableLog FSE_buildDTable() accepts */
+
+size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize)
+{   /* convenience entry point : FSE_decompress_wksp() with a DTable declared as a local variable and maxLog set to FSE_MAX_TABLELOG */
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG);
+}
+
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/deps/SZ/zstd/common/huf.h b/deps/SZ/zstd/common/huf.h
new file mode 100644
index 0000000000000000000000000000000000000000..de94641110641e094d741517e0676b291f7194e0
--- /dev/null
+++ b/deps/SZ/zstd/common/huf.h
@@ -0,0 +1,334 @@
+/* ******************************************************************
+   huff0 huffman codec,
+   part of Finite State Entropy library
+   Copyright (C) 2013-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef HUF_H_298734234
+#define HUF_H_298734234
+
+/* *** Dependencies *** */
+#include <stddef.h>    /* size_t */
+
+
+/* *** library symbols visibility *** */
+/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual,
+ *        HUF symbols remain "private" (internal symbols for library only).
+ *        Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+#  define HUF_PUBLIC_API __attribute__ ((visibility ("default")))
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
+#  define HUF_PUBLIC_API __declspec(dllexport)
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+#  define HUF_PUBLIC_API __declspec(dllimport)  /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */
+#else
+#  define HUF_PUBLIC_API
+#endif
+
+
+/* ========================== */
+/* ***  simple functions  *** */
+/* ========================== */
+
+/** HUF_compress() :
+ *  Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'.
+ * 'dst' buffer must be already allocated.
+ *  Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize).
+ * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB.
+ * @return : size of compressed data (<= `dstCapacity`).
+ *  Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+ *                   if HUF_isError(return), compression failed (more details using HUF_getErrorName())
+ */
+HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize);
+
+/** HUF_decompress() :
+ *  Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
+ *  into already allocated buffer 'dst', of minimum size 'originalSize'.
+ * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data.
+ *  Note : in contrast with FSE, HUF_decompress can regenerate
+ *         RLE (cSrcSize==1) and uncompressed (cSrcSize==originalSize) data,
+ *         because it knows size to regenerate (originalSize).
+ * @return : size of regenerated data (== originalSize),
+ *           or an error code, which can be tested using HUF_isError()
+ */
+HUF_PUBLIC_API size_t HUF_decompress(void* dst,  size_t originalSize,
+                               const void* cSrc, size_t cSrcSize);
+
+
+/* ***   Tool functions *** */
+#define HUF_BLOCKSIZE_MAX (128 * 1024)                  /**< maximum input size for a single block compressed with HUF_compress */
+HUF_PUBLIC_API size_t HUF_compressBound(size_t size);   /**< maximum compressed size (worst case) */
+
+/* Error Management */
+HUF_PUBLIC_API unsigned    HUF_isError(size_t code);       /**< tells if a return value is an error code */
+HUF_PUBLIC_API const char* HUF_getErrorName(size_t code);  /**< provides error code string (useful for debugging) */
+
+
+/* ***   Advanced function   *** */
+
+/** HUF_compress2() :
+ *  Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`.
+ * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX .
+ * `tableLog` must be `<= HUF_TABLELOG_MAX` . */
+HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                               unsigned maxSymbolValue, unsigned tableLog);
+
+/** HUF_compress4X_wksp() :
+ *  Same as HUF_compress2(), but uses externally allocated `workSpace`.
+ * `workspace` must have minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */
+#define HUF_WORKSPACE_SIZE (6 << 10)
+#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
+HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     unsigned maxSymbolValue, unsigned tableLog,
+                                     void* workSpace, size_t wkspSize);
+
+#endif   /* HUF_H_298734234 */
+
+/* ******************************************************************
+ *  WARNING !!
+ *  The following section contains advanced and experimental definitions
+ *  which shall never be used in the context of a dynamic library,
+ *  because they are not guaranteed to remain stable in the future.
+ *  Only consider them in association with static linking.
+ * *****************************************************************/
+#if defined(HUF_STATIC_LINKING_ONLY) && !defined(HUF_H_HUF_STATIC_LINKING_ONLY)
+#define HUF_H_HUF_STATIC_LINKING_ONLY
+
+/* *** Dependencies *** */
+#include "mem.h"   /* U32 */
+
+
+/* *** Constants *** */
+#define HUF_TABLELOG_MAX      12      /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
+#define HUF_TABLELOG_DEFAULT  11      /* default tableLog value when none specified */
+#define HUF_SYMBOLVALUE_MAX  255      /* upper bound for the `maxSymbolValue` argument of HUF_compress2() */
+
+#define HUF_TABLELOG_ABSOLUTEMAX  15  /* absolute limit of HUF_TABLELOG_MAX. Beyond that value, code does not work */
+#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
+#  error "HUF_TABLELOG_MAX is too large !"
+#endif
+
+
+/* ****************************************
+*  Static allocation
+******************************************/
+/* HUF buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) ((size) + ((size)>>8) + 8)   /* only true when incompressible is pre-filtered with fast heuristic; `size` is parenthesized so that any expression can be passed */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* static allocation of HUF's Compression Table */
+#define HUF_CTABLE_SIZE_U32(maxSymbolValue)   ((maxSymbolValue)+1)   /* Use tables of U32, for proper alignment */
+#define HUF_CTABLE_SIZE(maxSymbolValue)       (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32))
+#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
+    U32 name##hb[HUF_CTABLE_SIZE_U32(maxSymbolValue)]; \
+    void* name##hv = &(name##hb); \
+    HUF_CElt* name = (HUF_CElt*)(name##hv)   /* no final ; */
+
+/* static allocation of HUF's DTable */
+typedef U32 HUF_DTable;
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))
+#define HUF_CREATE_STATIC_DTABLEX1(DTable, maxTableLog) \
+        HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) }
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) }
+
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+
+size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< decodes RLE and uncompressed */
+size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
+
+
+/* ****************************************
+ *  HUF detailed API
+ * ****************************************/
+
+/*! HUF_compress() does the following:
+ *  1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h")
+ *  2. (optional) refine tableLog using HUF_optimalTableLog()
+ *  3. build Huffman table from count using HUF_buildCTable()
+ *  4. save Huffman table to memory buffer using HUF_writeCTable()
+ *  5. encode the data stream using HUF_compress4X_usingCTable()
+ *
+ *  The following API allows targeting specific sub-functions for advanced tasks.
+ *  For example, it's possible to compress several blocks using the same 'CTable',
+ *  or to save and regenerate 'CTable' using external methods.
+ */
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+typedef struct HUF_CElt_s HUF_CElt;   /* incomplete type */
+size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);   /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+
+typedef enum {
+   HUF_repeat_none,  /**< Cannot use the previous table */
+   HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
+   HUF_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
+ } HUF_repeat;
+/** HUF_compress4X_repeat() :
+ *  Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ *  If it uses hufTable it does not modify hufTable or repeat.
+ *  If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ *  If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,    /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
+
+/** HUF_buildCTable_wksp() :
+ *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
+ */
+#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1)
+#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
+size_t HUF_buildCTable_wksp (HUF_CElt* tree,
+                       const U32* count, U32 maxSymbolValue, U32 maxNbBits,
+                             void* workSpace, size_t wkspSize);
+
+/*! HUF_readStats() :
+ *  Read compact Huffman tree, saved by HUF_writeCTable().
+ * `huffWeight` is destination buffer.
+ * @return : size read from `src` , or an error Code .
+ *  Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize,
+                     U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize);
+
+/** HUF_readCTable() :
+ *  Loading a CTable saved with HUF_writeCTable() */
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+
+/** HUF_getNbBits() :
+ *  Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX
+ *  Note 1 : is not inlined, as HUF_CElt definition is private
+ *  Note 2 : const void* used, so that it can provide a statically allocated table as argument (which uses type U32) */
+U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue);
+
+/*
+ * HUF_decompress() does the following:
+ * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics
+ * 2. build Huffman table from save, using HUF_readDTableX?()
+ * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable()
+ */
+
+/** HUF_selectDecoder() :
+ *  Tells which decoder is likely to decode faster,
+ *  based on a set of pre-computed metrics.
+ * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
+ *  Assumption : 0 < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
+
+/**
+ *  The minimum workspace size for the `workSpace` used in
+ *  HUF_readDTableX1_wksp() and HUF_readDTableX2_wksp().
+ *
+ *  The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when
+ *  HUF_TABLELOG_MAX=12 to ~1850 bytes when HUF_TABLELOG_MAX=15.
+ *  Buffer overflow errors may potentially occur if code modifications result in
+ *  a required workspace size greater than that specified in the following
+ *  macro.
+ */
+#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10)   /* 2048 bytes */
+#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))   /* same size, counted in U32 units */
+
+size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
+size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
+
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+
+
+/* ====================== */
+/* single stream variants */
+/* ====================== */
+
+size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);  /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+/** HUF_compress1X_repeat() :
+ *  Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ *  If it uses hufTable it does not modify hufTable or repeat.
+ *  If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ *  If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,   /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
+
+size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
+
+size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
+size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
+size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /**< automatic selection of single or double symbol decoder, based on DTable */
+size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /**< X1 variant (X1 == single-symbol decoder, as for HUF_decompress1X1()) */
+size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /**< X2 variant (X2 == double-symbol decoder, as for HUF_decompress1X2()) */
+
+/* BMI2 variants.
+ * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
+ */
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+
+#endif /* HUF_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/deps/SZ/zstd/common/mem.h b/deps/SZ/zstd/common/mem.h
new file mode 100644
index 0000000000000000000000000000000000000000..47d2300177c0a1e618f95890b51077f2ab0a5c9c
--- /dev/null
+++ b/deps/SZ/zstd/common/mem.h
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>     /* size_t, ptrdiff_t */
+#include <string.h>     /* memcpy */
+
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+#if defined(__GNUC__)   /* GCC-compatible compilers : also tag helpers `unused` so unreferenced ones do not warn */
+#  define MEM_STATIC static __inline __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)   /* reached by MSVC only when neither C++ nor C99 was detected above */
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+/* code only tested on 32-bit and 64-bit systems */
+#define MEM_STATIC_ASSERT(c)   { enum { MEM_static_assert = 1/(int)(!!(c)) }; }   /* compile-time check : a false `c` turns the enum value into the constant expression 1/0 */
+MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); }   /* no runtime effect : only hosts the assertion on sizeof(size_t) */
+
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+#if  !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>   /* exact-width integer types */
+  typedef   uint8_t BYTE;
+  typedef  uint16_t U16;
+  typedef   int16_t S16;
+  typedef  uint32_t U32;
+  typedef   int32_t S32;
+  typedef  uint64_t U64;
+  typedef   int64_t S64;
+#else   /* <stdint.h> not used : built-in types stand in, and the widths implied by the names are not verified here */
+  typedef unsigned char      BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/*-**************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (i.e., not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ *            It can generate buggy code on targets depending on alignment.
+ *            In some circumstances, it's the only known way to get the most performance (i.e. GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2   /* GCC + ARMv6 : direct access */
+#  elif defined(__INTEL_COMPILER) || defined(__GNUC__)
+#    define MEM_FORCE_MEMORY_ACCESS 1   /* other GCC-compatible compilers and icc : packed structs */
+#  endif   /* any other compiler : macro stays undefined, which selects the memcpy() variant below */
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; }   /* @return : 1 when size_t is 4 bytes wide, else 0 */
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; }   /* @return : 1 when size_t is 8 bytes wide, else 0 */
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)   /* @return : 1 when the lowest-addressed byte of a U32 is its least significant one, else 0 */
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];   /* first byte of the U32 value 1 */
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* Method 2 : direct access through a cast pointer. Violates the C standard, by lying about alignment.
+   Only use when there is no other choice to achieve best performance on the target platform. */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* Method 1 : __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))   /* MSVC-style packing pragma */
+    __pragma( pack(push, 1) )
+    typedef struct { U16 v; } unalign16;
+    typedef struct { U32 v; } unalign32;
+    typedef struct { U64 v; } unalign64;
+    typedef struct { size_t v; } unalignArch;
+    __pragma( pack(pop) )
+#else   /* GCC-style packed attribute */
+    typedef struct { U16 v; } __attribute__((packed)) unalign16;
+    typedef struct { U32 v; } __attribute__((packed)) unalign32;
+    typedef struct { U64 v; } __attribute__((packed)) unalign64;
+    typedef struct { size_t v; } __attribute__((packed)) unalignArch;
+#endif
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; }   /* every access goes through member `v` of a packed wrapper struct */
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign32*)ptr)->v; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign64*)ptr)->v; }
+MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalignArch*)ptr)->v; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign16*)memPtr)->v = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign32*)memPtr)->v = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign64*)memPtr)->v = value; }
+
+#else
+
+/* Method 0 (default) : copy through memcpy(), safe and standard.
+   Can sometimes prove slower. */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;   /* byte copy into a local : no alignment requirement on memPtr */
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC size_t MEM_readST(const void* memPtr)
+{
+    size_t val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(value));   /* value is stored in host byte order */
+}
+
+MEM_STATIC void MEM_write32(void* memPtr, U32 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write64(void* memPtr, U64 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+MEM_STATIC U32 MEM_swap32(U32 in)   /* @return : `in` with its 4 bytes in reverse order */
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_ulong(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)   /* GCC >= 4.3, or a compiler reporting as such */
+    return __builtin_bswap32(in);
+#else   /* portable fallback : shift each byte into its mirrored position, then mask */
+    return  ((in << 24) & 0xff000000 ) |
+            ((in <<  8) & 0x00ff0000 ) |
+            ((in >>  8) & 0x0000ff00 ) |
+            ((in >> 24) & 0x000000ff );
+#endif
+}
+
+MEM_STATIC U64 MEM_swap64(U64 in)   /* @return : `in` with its 8 bytes in reverse order */
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_uint64(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)   /* GCC >= 4.3, or a compiler reporting as such */
+    return __builtin_bswap64(in);
+#else   /* portable fallback : shift each byte into its mirrored position, then mask */
+    return  ((in << 56) & 0xff00000000000000ULL) |
+            ((in << 40) & 0x00ff000000000000ULL) |
+            ((in << 24) & 0x0000ff0000000000ULL) |
+            ((in << 8)  & 0x000000ff00000000ULL) |
+            ((in >> 8)  & 0x00000000ff000000ULL) |
+            ((in >> 24) & 0x0000000000ff0000ULL) |
+            ((in >> 40) & 0x000000000000ff00ULL) |
+            ((in >> 56) & 0x00000000000000ffULL);
+#endif
+}
+
+MEM_STATIC size_t MEM_swapST(size_t in)   /* byte-swaps a size_t, using the 32-bit or 64-bit swap to match its width */
+{
+    if (MEM_32bits())
+        return (size_t)MEM_swap32((U32)in);
+    else
+        return (size_t)MEM_swap64((U64)in);
+}
+
+/*=== Little endian r/w ===*/
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)   /* reads a 16-bit little-endian value, whatever the host byte order */
+{
+    if (MEM_isLittleEndian())
+        return MEM_read16(memPtr);
+    else {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));   /* assemble by hand : byte 0 is the least significant */
+    }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)   /* stores `val` as 2 little-endian bytes, whatever the host byte order */
+{
+    if (MEM_isLittleEndian()) {
+        MEM_write16(memPtr, val);
+    } else {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;        /* low byte first */
+        p[1] = (BYTE)(val>>8);
+    }
+}
+
+MEM_STATIC U32 MEM_readLE24(const void* memPtr)   /* reads exactly 3 bytes as a little-endian value */
+{
+    return MEM_readLE16(memPtr) + (((const BYTE*)memPtr)[2] << 16);   /* low 16 bits from the first 2 bytes, third byte becomes bits 16-23 */
+}
+
+MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val)   /* stores the low 24 bits of `val` as 3 little-endian bytes */
+{
+    MEM_writeLE16(memPtr, (U16)val);
+    ((BYTE*)memPtr)[2] = (BYTE)(val>>16);   /* bits above 23 are discarded by the cast */
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)   /* reads a 32-bit little-endian value, whatever the host byte order */
+{
+    if (MEM_isLittleEndian())
+        return MEM_read32(memPtr);
+    else
+        return MEM_swap32(MEM_read32(memPtr));   /* host order differs : swap after the native read */
+}
+
+MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32)   /* stores `val32` as 4 little-endian bytes, whatever the host byte order */
+{
+    if (MEM_isLittleEndian())
+        MEM_write32(memPtr, val32);
+    else
+        MEM_write32(memPtr, MEM_swap32(val32));   /* host order differs : swap before the native write */
+}
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)   /* reads a 64-bit little-endian value, whatever the host byte order */
+{
+    if (MEM_isLittleEndian())
+        return MEM_read64(memPtr);
+    else
+        return MEM_swap64(MEM_read64(memPtr));   /* host order differs : swap after the native read */
+}
+
+MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64)   /* stores `val64` as 8 little-endian bytes, whatever the host byte order */
+{
+    if (MEM_isLittleEndian())
+        MEM_write64(memPtr, val64);
+    else
+        MEM_write64(memPtr, MEM_swap64(val64));   /* host order differs : swap before the native write */
+}
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)   /* reads a little-endian size_t : 4 bytes on 32-bit targets, 8 bytes otherwise */
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readLE32(memPtr);
+    else
+        return (size_t)MEM_readLE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val)   /* stores a little-endian size_t : 4 bytes on 32-bit targets, 8 bytes otherwise */
+{
+    if (MEM_32bits())
+        MEM_writeLE32(memPtr, (U32)val);
+    else
+        MEM_writeLE64(memPtr, (U64)val);
+}
+
+/*=== Big endian r/w ===*/
+
+MEM_STATIC U32 MEM_readBE32(const void* memPtr)   /* reads a 32-bit big-endian value, whatever the host byte order */
+{
+    if (MEM_isLittleEndian())
+        return MEM_swap32(MEM_read32(memPtr));   /* host order differs : swap after the native read */
+    else
+        return MEM_read32(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32)   /* stores `val32` as 4 big-endian bytes, whatever the host byte order */
+{
+    if (MEM_isLittleEndian())
+        MEM_write32(memPtr, MEM_swap32(val32));   /* host order differs : swap before the native write */
+    else
+        MEM_write32(memPtr, val32);
+}
+
+MEM_STATIC U64 MEM_readBE64(const void* memPtr)   /* reads a 64-bit big-endian value, whatever the host byte order */
+{
+    if (MEM_isLittleEndian())
+        return MEM_swap64(MEM_read64(memPtr));   /* host order differs : swap after the native read */
+    else
+        return MEM_read64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64)   /* stores `val64` as 8 big-endian bytes, whatever the host byte order */
+{
+    if (MEM_isLittleEndian())
+        MEM_write64(memPtr, MEM_swap64(val64));   /* host order differs : swap before the native write */
+    else
+        MEM_write64(memPtr, val64);
+}
+
+MEM_STATIC size_t MEM_readBEST(const void* memPtr)   /* reads a big-endian size_t : 4 bytes on 32-bit targets, 8 bytes otherwise */
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readBE32(memPtr);
+    else
+        return (size_t)MEM_readBE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val)   /* stores a big-endian size_t : 4 bytes on 32-bit targets, 8 bytes otherwise */
+{
+    if (MEM_32bits())
+        MEM_writeBE32(memPtr, (U32)val);
+    else
+        MEM_writeBE64(memPtr, (U64)val);
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
diff --git a/deps/SZ/zstd/common/pool.c b/deps/SZ/zstd/common/pool.c
new file mode 100644
index 0000000000000000000000000000000000000000..281b3824ac4dae59a993be1cc673fdf838466b55
--- /dev/null
+++ b/deps/SZ/zstd/common/pool.c
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ======   Dependencies   ======= */
+#include <stddef.h>    /* size_t */
+#include "debug.h"     /* assert */
+#include "zstd_internal.h"  /* ZSTD_malloc, ZSTD_free */
+#include "pool.h"
+
+/* ======   Compiler specifics   ====== */
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#endif
+
+
+#ifdef ZSTD_MULTITHREAD
+
+#include "threading.h"   /* pthread adaptation */
+
+/* A job is a function and an opaque argument */
+typedef struct POOL_job_s {
+    POOL_function function;   /* callback invoked by a worker thread */
+    void *opaque;   /* handed unchanged to `function` */
+} POOL_job;
+
+struct POOL_ctx_s {
+    ZSTD_customMem customMem;   /* allocator used for the context, the queue and the thread array */
+    /* Keep track of the threads */
+    ZSTD_pthread_t* threads;
+    size_t threadCapacity;   /* number of entries of `threads` holding a created thread (those POOL_join() joins) */
+    size_t threadLimit;   /* max number of threads allowed to run a job at the same time */
+
+    /* The queue is a circular buffer */
+    POOL_job *queue;
+    size_t queueHead;   /* index of the next job to pop */
+    size_t queueTail;   /* index of the next free slot to push into */
+    size_t queueSize;   /* number of slots in `queue` : requested size + 1 */
+
+    /* The number of threads working on jobs */
+    size_t numThreadsBusy;
+    /* Indicates if the queue is empty */
+    int queueEmpty;
+
+    /* The mutex protects the queue */
+    ZSTD_pthread_mutex_t queueMutex;
+    /* Condition variable for pushers to wait on when the queue is full */
+    ZSTD_pthread_cond_t queuePushCond;
+    /* Condition variables for poppers to wait on when the queue is empty */
+    ZSTD_pthread_cond_t queuePopCond;
+    /* Indicates if the queue is shutting down */
+    int shutdown;
+};
+
+/* POOL_thread() :
+ * Work thread for the thread pool.
+ * Waits for jobs and executes them, until the pool is shut down.
+ * @returns : NULL on failure else non-null.
+ */
+static void* POOL_thread(void* opaque) {
+    POOL_ctx* const ctx = (POOL_ctx*)opaque;
+    if (!ctx) { return NULL; }
+    for (;;) {
+        /* Lock the mutex and wait for a non-empty queue or until shutdown */
+        ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+
+        while ( ctx->queueEmpty
+            || (ctx->numThreadsBusy >= ctx->threadLimit) ) {   /* also wait while the busy-thread limit is reached */
+            if (ctx->shutdown) {
+                /* even if !queueEmpty, (possible if numThreadsBusy >= threadLimit),
+                 * a few threads will be shutdown while !queueEmpty,
+                 * but enough threads will remain active to finish the queue */
+                ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+                return opaque;   /* non-NULL : regular exit on shutdown */
+            }
+            ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex);
+        }
+        /* Pop a job off the queue */
+        {   POOL_job const job = ctx->queue[ctx->queueHead];   /* copied out, so the slot can be reused while the job runs */
+            ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize;
+            ctx->numThreadsBusy++;
+            ctx->queueEmpty = ctx->queueHead == ctx->queueTail;
+            /* Unlock the mutex, signal a pusher, and run the job */
+            ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+            ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+
+            job.function(job.opaque);   /* runs without holding the mutex */
+
+            /* If the intended queue size was 0, signal after finishing job */
+            ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+            ctx->numThreadsBusy--;
+            if (ctx->queueSize == 1) {   /* queueSize is requested size + 1, so 1 means "requested 0" */
+                ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+            }
+            ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+        }
+    }  /* for (;;) */
+    assert(0);  /* Unreachable */
+}
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {   /* POOL_create_advanced() with the default allocator */
+    return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
+                               ZSTD_customMem customMem) {   /* @return : new pool, or NULL on invalid parameter, allocation or thread-creation failure */
+    POOL_ctx* ctx;
+    /* Check parameters */
+    if (!numThreads) { return NULL; }
+    /* Allocate the context and zero initialize */
+    ctx = (POOL_ctx*)ZSTD_calloc(sizeof(POOL_ctx), customMem);
+    if (!ctx) { return NULL; }
+    /* Initialize the job queue.
+     * It needs one extra space since one space is wasted to differentiate
+     * empty and full queues.
+     */
+    ctx->queueSize = queueSize + 1;
+    ctx->queue = (POOL_job*)ZSTD_malloc(ctx->queueSize * sizeof(POOL_job), customMem);
+    ctx->queueHead = 0;
+    ctx->queueTail = 0;
+    ctx->numThreadsBusy = 0;
+    ctx->queueEmpty = 1;
+    (void)ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL);   /* note : the init return values are not checked */
+    (void)ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL);
+    (void)ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL);
+    ctx->shutdown = 0;
+    /* Allocate space for the thread handles */
+    ctx->threads = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), customMem);
+    ctx->threadCapacity = 0;   /* no thread created yet : POOL_free() below would join none */
+    ctx->customMem = customMem;
+    /* Check for errors */
+    if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; }
+    /* Initialize the threads */
+    {   size_t i;
+        for (i = 0; i < numThreads; ++i) {
+            if (ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) {
+                ctx->threadCapacity = i;   /* only threads [0, i) exist : POOL_free() joins exactly those */
+                POOL_free(ctx);
+                return NULL;
+        }   }
+        ctx->threadCapacity = numThreads;
+        ctx->threadLimit = numThreads;   /* was 0 (from calloc) until here */
+    }
+    return ctx;
+}
+
+/*! POOL_join() :
+    Shutdown the queue, wake any sleeping threads, and join all of the threads.
+*/
+static void POOL_join(POOL_ctx* ctx) {
+    /* Shut down the queue */
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    ctx->shutdown = 1;   /* set under the mutex, since workers and pushers read it under the mutex */
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    /* Wake up sleeping threads */
+    ZSTD_pthread_cond_broadcast(&ctx->queuePushCond);
+    ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
+    /* Join all of the threads */
+    {   size_t i;
+        for (i = 0; i < ctx->threadCapacity; ++i) {   /* threadCapacity counts the threads actually created */
+            ZSTD_pthread_join(ctx->threads[i], NULL);  /* note : could fail */
+    }   }
+}
+
+void POOL_free(POOL_ctx *ctx) {   /* joins all threads, then releases every resource of the pool; NULL is accepted */
+    if (!ctx) { return; }
+    POOL_join(ctx);   /* no worker is left running once this returns */
+    ZSTD_pthread_mutex_destroy(&ctx->queueMutex);
+    ZSTD_pthread_cond_destroy(&ctx->queuePushCond);
+    ZSTD_pthread_cond_destroy(&ctx->queuePopCond);
+    ZSTD_free(ctx->queue, ctx->customMem);
+    ZSTD_free(ctx->threads, ctx->customMem);
+    ZSTD_free(ctx, ctx->customMem);   /* the context itself goes last */
+}
+
+
+
+size_t POOL_sizeof(POOL_ctx *ctx) {   /* @return : bytes counted for the context, its job queue and its thread array */
+    if (ctx==NULL) return 0;  /* supports sizeof NULL */
+    return sizeof(*ctx)
+        + ctx->queueSize * sizeof(POOL_job)
+        + ctx->threadCapacity * sizeof(ZSTD_pthread_t);
+}
+
+
+/* @return : 0 on success, 1 on error. Caller (POOL_resize) holds queueMutex. */
+static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads)
+{
+    if (numThreads <= ctx->threadCapacity) {   /* shrink, or no change : existing threads stay, only the limit moves */
+        if (!numThreads) return 1;
+        ctx->threadLimit = numThreads;
+        return 0;
+    }
+    /* numThreads > threadCapacity */
+    {   ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem);
+        if (!threadPool) return 1;
+        /* replace existing thread pool */
+        memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool));
+        ZSTD_free(ctx->threads, ctx->customMem);   /* NOTE(review): the Windows wrapper hands &threads[i] to each running worker, which writes to it on exit - confirm freeing the old array is safe there */
+        ctx->threads = threadPool;
+        /* Initialize additional threads */
+        {   size_t threadId;
+            for (threadId = ctx->threadCapacity; threadId < numThreads; ++threadId) {
+                if (ZSTD_pthread_create(&threadPool[threadId], NULL, &POOL_thread, ctx)) {
+                    ctx->threadCapacity = threadId;   /* keep the threads created so far; threadLimit is left unchanged */
+                    return 1;
+            }   }
+    }   }
+    /* successfully expanded */
+    ctx->threadCapacity = numThreads;
+    ctx->threadLimit = numThreads;
+    return 0;
+}
+
+/* @return : 0 on success, 1 on error (including ctx==NULL) */
+int POOL_resize(POOL_ctx* ctx, size_t numThreads)
+{
+    int result;
+    if (ctx==NULL) return 1;
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    result = POOL_resize_internal(ctx, numThreads);
+    ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);   /* wake waiting workers so they re-check numThreadsBusy against the new threadLimit */
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    return result;
+}
+
+/**
+ * Returns 1 if the queue is full and 0 otherwise.
+ *
+ * When queueSize is 1 (pool was created with an intended queueSize of 0),
+ * then the queue counts as full unless a thread is free _and_ no job is waiting.
+ */
+static int isQueueFull(POOL_ctx const* ctx) {
+    if (ctx->queueSize > 1) {
+        return ctx->queueHead == ((ctx->queueTail + 1) % ctx->queueSize);   /* circular buffer : full when the tail is one slot behind the head */
+    } else {
+        return (ctx->numThreadsBusy == ctx->threadLimit) ||
+               !ctx->queueEmpty;
+    }
+}
+
+
+static void POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque)   /* pushes one job; callers hold queueMutex and have checked for room */
+{
+    POOL_job const job = {function, opaque};
+    assert(ctx != NULL);
+    if (ctx->shutdown) return;   /* the job is dropped, without notice, once shutdown has started */
+
+    ctx->queueEmpty = 0;
+    ctx->queue[ctx->queueTail] = job;
+    ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize;
+    ZSTD_pthread_cond_signal(&ctx->queuePopCond);   /* wake one worker waiting for a job */
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque)   /* queues function(opaque), blocking while the queue is full */
+{
+    assert(ctx != NULL);
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    /* Wait until there is space in the queue for the new job */
+    while (isQueueFull(ctx) && (!ctx->shutdown)) {   /* shutdown ends the wait too; POOL_add_internal() then drops the job */
+        ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);
+    }
+    POOL_add_internal(ctx, function, opaque);
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+}
+
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque)   /* non-blocking POOL_add() : @return 1 if accepted, 0 if the queue is full */
+{
+    assert(ctx != NULL);
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    if (isQueueFull(ctx)) {
+        ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+        return 0;
+    }
+    POOL_add_internal(ctx, function, opaque);
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    return 1;   /* NOTE(review): also returned when POOL_add_internal() dropped the job because of shutdown */
+}
+
+
+#else  /* ZSTD_MULTITHREAD  not defined */
+
+/* ========================== */
+/* No multi-threading support */
+/* ========================== */
+
+
+/* We don't need any data, but if it is empty, malloc() might return NULL. */
+struct POOL_ctx_s {
+    int dummy;   /* placeholder member, never read in this file section */
+};
+static POOL_ctx g_ctx;   /* the single shared instance handed out by POOL_create*() in this configuration */
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {   /* POOL_create_advanced() with the default allocator */
+    return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) {   /* all parameters are ignored; nothing is allocated */
+    (void)numThreads;
+    (void)queueSize;
+    (void)customMem;
+    return &g_ctx;   /* every caller receives the same static context */
+}
+
+void POOL_free(POOL_ctx* ctx) {   /* nothing to release : the context is static */
+    assert(!ctx || ctx == &g_ctx);
+    (void)ctx;
+}
+
+int POOL_resize(POOL_ctx* ctx, size_t numThreads) {   /* no threads to resize : does nothing and reports success */
+    (void)ctx; (void)numThreads;
+    return 0;
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) {   /* runs the job immediately, in the calling thread */
+    (void)ctx;
+    function(opaque);
+}
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) {   /* runs the job immediately, in the calling thread */
+    (void)ctx;
+    function(opaque);
+    return 1;   /* always reports success */
+}
+
+size_t POOL_sizeof(POOL_ctx* ctx) {   /* @return : size of the static context, or 0 for NULL */
+    if (ctx==NULL) return 0;  /* supports sizeof NULL */
+    assert(ctx == &g_ctx);
+    return sizeof(*ctx);
+}
+
+#endif  /* ZSTD_MULTITHREAD */
diff --git a/deps/SZ/zstd/common/pool.h b/deps/SZ/zstd/common/pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..458d37f13c3e805d0705871d6f762c6bf4a5263d
--- /dev/null
+++ b/deps/SZ/zstd/common/pool.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef POOL_H
+#define POOL_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+#include <stddef.h>   /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_customMem */
+#include "zstd.h"
+
+typedef struct POOL_ctx_s POOL_ctx;
+
+/*! POOL_create() :
+ *  Create a thread pool with at most `numThreads` threads.
+ * `numThreads` must be at least 1.
+ *  The maximum number of queued jobs before blocking is `queueSize`.
+ * @return : POOL_ctx pointer on success, else NULL.
+*/
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize);
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
+                               ZSTD_customMem customMem);   /**< same as POOL_create(), but takes the allocator to use as `customMem` */
+
+/*! POOL_free() :
+ *  Free a thread pool returned by POOL_create().
+ */
+void POOL_free(POOL_ctx* ctx);
+
+/*! POOL_resize() :
+ *  Expands or shrinks pool's number of threads.
+ *  This is more efficient than releasing + creating a new context,
+ *  since it tries to preserve and re-use existing threads.
+ * `numThreads` must be at least 1.
+ * @return : 0 when resize was successful,
+ *           !0 (typically 1) if there is an error.
+ *    note : only numThreads can be resized, queueSize remains unchanged.
+ */
+int POOL_resize(POOL_ctx* ctx, size_t numThreads);
+
+/*! POOL_sizeof() :
+ * @return threadpool memory usage
+ *  note : compatible with NULL (returns 0 in this case)
+ */
+size_t POOL_sizeof(POOL_ctx* ctx);
+
+/*! POOL_function :
+ *  The function type that can be added to a thread pool.
+ */
+typedef void (*POOL_function)(void*);
+
+/*! POOL_add() :
+ *  Add the job `function(opaque)` to the thread pool. `ctx` must be valid.
+ *  Possibly blocks until there is room in the queue.
+ *  Note : The function may be executed asynchronously,
+ *         therefore, `opaque` must live until function has been completed.
+ */
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque);
+
+
+/*! POOL_tryAdd() :
+ *  Add the job `function(opaque)` to thread pool _if_ a worker is available.
+ *  Returns immediately even if not (does not block).
+ * @return : 1 if successful, 0 if not.
+ */
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif
diff --git a/deps/SZ/zstd/common/threading.c b/deps/SZ/zstd/common/threading.c
new file mode 100644
index 0000000000000000000000000000000000000000..8be8c8da948a86fe9e7e0af9b6110fea277c1f2a
--- /dev/null
+++ b/deps/SZ/zstd/common/threading.c
@@ -0,0 +1,75 @@
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ */
+
+/**
+ * This file holds wrappers for systems that do not support pthreads
+ */
+
+/* create fake symbol to avoid empty translation unit warning */
+int g_ZSTD_threading_useles_symbol;
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper, based on :
+ * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+ */
+
+
+/* ===  Dependencies  === */
+#include <process.h>
+#include <errno.h>
+#include "threading.h"
+
+
+/* ===  Implementation  === */
+
+static unsigned __stdcall worker(void *arg)   /* _beginthreadex() entry point : adapts it to the pthread-style start routine */
+{
+    ZSTD_pthread_t* const thread = (ZSTD_pthread_t*) arg;
+    thread->arg = thread->start_routine(thread->arg);   /* run the routine; its result overwrites the input argument */
+    return 0;
+}
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+            void* (*start_routine) (void*), void* arg)   /* @return : 0 on success, else errno */
+{
+    (void)unused;
+    thread->arg = arg;
+    thread->start_routine = start_routine;
+    thread->handle = (HANDLE) _beginthreadex(NULL, 0, worker, thread, 0, NULL);   /* `*thread` itself is passed to worker(), which reads and writes it from the new thread */
+
+    if (!thread->handle)
+        return errno;
+    else
+        return 0;
+}
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread, void **value_ptr)
+{
+    DWORD result;
+
+    if (!thread.handle) return 0;
+
+    result = WaitForSingleObject(thread.handle, INFINITE);
+    switch (result) {
+    case WAIT_OBJECT_0:
+        if (value_ptr) *value_ptr = thread.arg;
+        return 0;
+    case WAIT_ABANDONED:
+        return EINVAL;
+    default:
+        return GetLastError();
+    }
+}
+
+#endif   /* ZSTD_MULTITHREAD */
diff --git a/deps/SZ/zstd/common/threading.h b/deps/SZ/zstd/common/threading.h
new file mode 100644
index 0000000000000000000000000000000000000000..d806c89d01c9cc907f9c0aad946064c9fd82b1b4
--- /dev/null
+++ b/deps/SZ/zstd/common/threading.h
@@ -0,0 +1,123 @@
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ */
+
+#ifndef THREADING_H_938743
+#define THREADING_H_938743
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper, based on :
+ * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+ */
+#ifdef WINVER
+#  undef WINVER
+#endif
+#define WINVER       0x0600
+
+#ifdef _WIN32_WINNT
+#  undef _WIN32_WINNT
+#endif
+#define _WIN32_WINNT 0x0600
+
+#ifndef WIN32_LEAN_AND_MEAN
+#  define WIN32_LEAN_AND_MEAN
+#endif
+
+#undef ERROR   /* reported already defined on VS 2015 (Rich Geldreich) */
+#include <windows.h>
+#undef ERROR
+#define ERROR(name) ZSTD_ERROR(name)
+
+
+/* mutex : mapped onto CRITICAL_SECTION; init ignores its attribute argument and always evaluates to 0 */
+#define ZSTD_pthread_mutex_t           CRITICAL_SECTION
+#define ZSTD_pthread_mutex_init(a, b)  ((void)(b), InitializeCriticalSection((a)), 0)
+#define ZSTD_pthread_mutex_destroy(a)  DeleteCriticalSection((a))
+#define ZSTD_pthread_mutex_lock(a)     EnterCriticalSection((a))
+#define ZSTD_pthread_mutex_unlock(a)   LeaveCriticalSection((a))
+
+/* condition variable : mapped onto CONDITION_VARIABLE; destroy is a no-op, wait never times out */
+#define ZSTD_pthread_cond_t             CONDITION_VARIABLE
+#define ZSTD_pthread_cond_init(a, b)    ((void)(b), InitializeConditionVariable((a)), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b)    SleepConditionVariableCS((a), (b), INFINITE)
+#define ZSTD_pthread_cond_signal(a)     WakeConditionVariable((a))
+#define ZSTD_pthread_cond_broadcast(a)  WakeAllConditionVariable((a))
+
+/* ZSTD_pthread_create() and ZSTD_pthread_join() */
+typedef struct {
+    HANDLE handle;   /* thread handle returned by _beginthreadex(); 0 when creation failed */
+    void* (*start_routine)(void*);   /* routine the thread runs */
+    void* arg;   /* in : argument for start_routine; overwritten with its return value when it finishes */
+} ZSTD_pthread_t;
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+                   void* (*start_routine) (void*), void* arg);   /* `unused` stands in for the pthread attribute argument */
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread, void** value_ptr);   /* note : `thread` is passed by value */
+
+/**
+ * add here more wrappers as required
+ */
+
+
+#elif defined(ZSTD_MULTITHREAD)   /* posix assumed ; need a better detection method */
+/* ===   POSIX Systems : every wrapper is a direct alias of the pthread call of the same name   === */
+#  include <pthread.h>
+
+#define ZSTD_pthread_mutex_t            pthread_mutex_t
+#define ZSTD_pthread_mutex_init(a, b)   pthread_mutex_init((a), (b))
+#define ZSTD_pthread_mutex_destroy(a)   pthread_mutex_destroy((a))
+#define ZSTD_pthread_mutex_lock(a)      pthread_mutex_lock((a))
+#define ZSTD_pthread_mutex_unlock(a)    pthread_mutex_unlock((a))
+
+#define ZSTD_pthread_cond_t             pthread_cond_t
+#define ZSTD_pthread_cond_init(a, b)    pthread_cond_init((a), (b))
+#define ZSTD_pthread_cond_destroy(a)    pthread_cond_destroy((a))
+#define ZSTD_pthread_cond_wait(a, b)    pthread_cond_wait((a), (b))
+#define ZSTD_pthread_cond_signal(a)     pthread_cond_signal((a))
+#define ZSTD_pthread_cond_broadcast(a)  pthread_cond_broadcast((a))
+
+#define ZSTD_pthread_t                  pthread_t
+#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
+#define ZSTD_pthread_join(a, b)         pthread_join((a),(b))
+
+#else  /* ZSTD_MULTITHREAD not defined */
+/* No multithreading support : every primitive is a no-op that only evaluates its arguments */
+
+typedef int ZSTD_pthread_mutex_t;
+#define ZSTD_pthread_mutex_init(a, b)   ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_mutex_destroy(a)   ((void)(a))
+#define ZSTD_pthread_mutex_lock(a)      ((void)(a))
+#define ZSTD_pthread_mutex_unlock(a)    ((void)(a))
+
+typedef int ZSTD_pthread_cond_t;
+#define ZSTD_pthread_cond_init(a, b)    ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b)    ((void)(a), (void)(b))
+#define ZSTD_pthread_cond_signal(a)     ((void)(a))
+#define ZSTD_pthread_cond_broadcast(a)  ((void)(a))
+
+/* ZSTD_pthread_t, create and join are not defined in this configuration : do not use them */
+
+#endif /* ZSTD_MULTITHREAD */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* THREADING_H_938743 */
diff --git a/deps/SZ/zstd/common/xxhash.c b/deps/SZ/zstd/common/xxhash.c
new file mode 100644
index 0000000000000000000000000000000000000000..9d9c0e963cbf5f09bc9f91bdd33ae5f75287c7cc
--- /dev/null
+++ b/deps/SZ/zstd/common/xxhash.c
@@ -0,0 +1,875 @@
+/*
+*  xxHash - Fast Hash algorithm
+*  Copyright (C) 2012-2016, Yann Collet
+*
+*  BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+*
+*  Redistribution and use in source and binary forms, with or without
+*  modification, are permitted provided that the following conditions are
+*  met:
+*
+*  * Redistributions of source code must retain the above copyright
+*  notice, this list of conditions and the following disclaimer.
+*  * Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following disclaimer
+*  in the documentation and/or other materials provided with the
+*  distribution.
+*
+*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+*  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+*  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+*  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+*  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+*  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+*  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+*  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+*  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+*  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+*  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*  You can contact the author at :
+*  - xxHash homepage: http://www.xxhash.com
+*  - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+/*!XXH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on a compiler extension (i.e., not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on the compiler but violates the C standard.
+ *            It can generate buggy code on targets which do not support unaligned memory accesses.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define XXH_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+/*!XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
+ * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
+ * By default, this option is disabled. To enable it, uncomment below define :
+ */
+/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
+
+/*!XXH_FORCE_NATIVE_FORMAT :
+ * By default, the xxHash library provides endian-independent hash values, based on the little-endian convention.
+ * Results are therefore identical for little-endian and big-endian CPUs.
+ * This comes at a performance cost for big-endian CPUs, since some swapping is required to emulate the little-endian format.
+ * Should endian independence be of no importance for your application, you may set the #define below to 1,
+ * to improve speed for Big-endian CPU.
+ * This option has no impact on Little_Endian CPU.
+ */
+#ifndef XXH_FORCE_NATIVE_FORMAT   /* can be defined externally */
+#  define XXH_FORCE_NATIVE_FORMAT 0
+#endif
+
+/*!XXH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash; set to 0 when the input data
+ * is guaranteed to be aligned.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+/* Modify the local functions below should you wish to use some other memory routines */
+/* for malloc(), free() */
+#include <stdlib.h>
+static void* XXH_malloc(size_t nbBytes) { void* const block = malloc(nbBytes); return block; }   /* allocation hook: forwards to malloc() */
+static void  XXH_free  (void* block)    { free(block); }   /* release hook: forwards to free() */
+/* for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+#ifndef XXH_STATIC_LINKING_ONLY
+#  define XXH_STATIC_LINKING_ONLY
+#endif
+#include "xxhash.h"
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#  define INLINE_KEYWORD inline
+#else
+#  define INLINE_KEYWORD
+#endif
+
+#if defined(__GNUC__)
+#  define FORCE_INLINE_ATTR __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#  define FORCE_INLINE_ATTR __forceinline
+#else
+#  define FORCE_INLINE_ATTR
+#endif
+
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
+
+
+#ifdef _MSC_VER
+#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* *************************************
+*  Basic Types
+***************************************/
+#ifndef MEM_MODULE
+# define MEM_MODULE
+# if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint8_t  BYTE;
+    typedef uint16_t U16;
+    typedef uint32_t U32;
+    typedef  int32_t S32;
+    typedef uint64_t U64;
+#  else
+    typedef unsigned char      BYTE;
+    typedef unsigned short     U16;
+    typedef unsigned int       U32;
+    typedef   signed int       S32;
+    typedef unsigned long long U64;   /* if your compiler doesn't support unsigned long long, replace by another 64-bit type here. Note that xxhash.h will also need to be updated. */
+#  endif
+#endif
+
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+/* XXH_read32() / XXH_read64() load a 32/64-bit word in host byte order; XXH_FORCE_MEMORY_ACCESS selects one of three implementations. */
+/* Force direct memory access. Only works on CPUs which support unaligned memory access in hardware */
+static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
+static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+
+static U32 XXH_read32(const void* memPtr)
+{
+    U32 val;
+    memcpy(&val, memPtr, sizeof(val));   /* byte-wise copy, so memPtr needs no particular alignment */
+    return val;
+}
+
+static U64 XXH_read64(const void* memPtr)
+{
+    U64 val;
+    memcpy(&val, memPtr, sizeof(val));   /* byte-wise copy, so memPtr needs no particular alignment */
+    return val;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#if defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
+#endif
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap32 _byteswap_ulong
+#  define XXH_swap64 _byteswap_uint64
+#elif GCC_VERSION >= 403   /* GCC_VERSION is __GNUC__*100 + __GNUC_MINOR__, so this means GCC 4.3 or later */
+#  define XXH_swap32 __builtin_bswap32
+#  define XXH_swap64 __builtin_bswap64
+#else
+static U32 XXH_swap32 (U32 x)   /* portable fallback: returns x with its four bytes in reverse order */
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+}
+static U64 XXH_swap64 (U64 x)   /* portable fallback: returns x with its eight bytes in reverse order */
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* *************************************
+*  Architecture Macros
+***************************************/
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+/* The enumerator values matter: callers cast XXH_CPU_LITTLE_ENDIAN (0 or 1) directly to XXH_endianess. */
+/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+    static const int g_one = 1;
+#   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&g_one))   /* first byte of the int 1: non-zero only when the low-order byte is stored first */
+#endif
+
+
+/* ***************************
+*  Memory reads
+*****************************/
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+FORCE_INLINE_TEMPLATE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    /* Load the raw word (direct dereference only on the aligned path), then byte-swap unless endian is XXH_littleEndian. */
+    U32 const raw = (align==XXH_unaligned) ? XXH_read32(ptr) : *(const U32*)ptr;
+    if (endian==XXH_littleEndian) return raw;
+    return XXH_swap32(raw);
+}
+
+FORCE_INLINE_TEMPLATE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+{   /* Convenience form of XXH_readLE32_align() that always takes the XXH_unaligned path. */
+    return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+}
+
+static U32 XXH_readBE32(const void* ptr)
+{   U32 const hostOrder = XXH_read32(ptr);
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(hostOrder) : hostOrder;   /* value is stored big-endian: swap only on little-endian hosts */
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    /* Load the raw word (direct dereference only on the aligned path), then byte-swap unless endian is XXH_littleEndian. */
+    U64 const raw = (align==XXH_unaligned) ? XXH_read64(ptr) : *(const U64*)ptr;
+    if (endian==XXH_littleEndian) return raw;
+    return XXH_swap64(raw);
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
+{   /* Convenience form of XXH_readLE64_align() that always takes the XXH_unaligned path. */
+    return XXH_readLE64_align(ptr, endian, XXH_unaligned);
+}
+
+static U64 XXH_readBE64(const void* ptr)
+{   U64 const hostOrder = XXH_read64(ptr);
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(hostOrder) : hostOrder;   /* value is stored big-endian: swap only on little-endian hosts */
+}
+
+
+/* *************************************
+*  Macros
+***************************************/
+#define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(int)(!!(c)) }; }    /* use only *after* variable declarations */
+
+
+/* *************************************
+*  Constants
+***************************************/
+static const U32 PRIME32_1 = 2654435761U;
+static const U32 PRIME32_2 = 2246822519U;
+static const U32 PRIME32_3 = 3266489917U;
+static const U32 PRIME32_4 =  668265263U;
+static const U32 PRIME32_5 =  374761393U;
+
+static const U64 PRIME64_1 = 11400714785074694791ULL;
+static const U64 PRIME64_2 = 14029467366897019727ULL;
+static const U64 PRIME64_3 =  1609587929392839161ULL;
+static const U64 PRIME64_4 =  9650029242287828579ULL;
+static const U64 PRIME64_5 =  2870177450012600261ULL;
+
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* **************************
+*  Utils
+****************************/
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dstState, const XXH32_state_t* restrict srcState)
+{
+    (void)memcpy(dstState, srcState, sizeof(XXH32_state_t));   /* whole-state copy; the restrict qualifiers mean the two states must not overlap */
+}
+
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dstState, const XXH64_state_t* restrict srcState)
+{
+    (void)memcpy(dstState, srcState, sizeof(XXH64_state_t));   /* whole-state copy; the restrict qualifiers mean the two states must not overlap */
+}
+
+
+/* ***************************
+*  Simple Hash Functions
+*****************************/
+
+static U32 XXH32_round(U32 seed, U32 input)
+{
+    /* One accumulator step: mix the input lane in, rotate left by 13, then multiply by PRIME32_1. */
+    U32 const mixed   = seed + input * PRIME32_2;
+    U32 const rotated = XXH_rotl32(mixed, 13);
+    return rotated * PRIME32_1;
+}
+
+FORCE_INLINE_TEMPLATE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{   /* One-shot XXH32 of the len bytes at input, starting from seed; endian and align select how 32-bit words are loaded. */
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U32 h32;
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+    /* NOTE: XXH_get32bits is also used by XXH64_endian_align further down, so it must stay defined. */
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) {
+        len=0;   /* treat a NULL input as an empty buffer */
+        bEnd=p=(const BYTE*)(size_t)16;
+    }
+#endif
+    /* Bulk phase: consume 16 bytes per iteration, one 32-bit word into each of four accumulators. */
+    if (len>=16) {
+        const BYTE* const limit = bEnd - 16;   /* last position from which a full 16-byte stripe can still be read */
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+
+        do {
+            v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4;
+            v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4;
+            v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4;
+            v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4;
+        } while (p<=limit);
+
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    } else {
+        h32  = seed + PRIME32_5;   /* short input: no accumulators, start from the seed directly */
+    }
+
+    h32 += (U32) len;   /* fold in the input length, narrowed to 32 bits */
+    /* Tail: remaining whole 32-bit words first, then any leftover single bytes. */
+    while (p+4<=bEnd) {
+        h32 += XXH_get32bits(p) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+        p++;
+    }
+    /* Final mixing: alternating xor-shift and multiply steps over h32. */
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed)
+{   /* Public one-shot entry point: picks the endianness/alignment variant of XXH32_endian_align and returns its result. */
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_CREATESTATE_STATIC(state);
+    XXH32_reset(state, seed);
+    XXH32_update(state, input, len);
+    return XXH32_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+    /* The aligned path is only considered when XXH_FORCE_ALIGN_CHECK is non-zero. */
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+                return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+            else
+                return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }   }
+    /* Otherwise use the unaligned readers; XXH_FORCE_NATIVE_FORMAT skips byte swapping even on big-endian hosts. */
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+static U64 XXH64_round(U64 acc, U64 input)
+{
+    /* One accumulator step: mix the input lane in, rotate left by 31, then multiply by PRIME64_1. */
+    U64 const mixed   = acc + input * PRIME64_2;
+    U64 const rotated = XXH_rotl64(mixed, 31);
+    return rotated * PRIME64_1;
+}
+
+static U64 XXH64_mergeRound(U64 acc, U64 val)
+{
+    /* Fold one finished accumulator (val) into the running hash (acc). */
+    U64 const scrambled = XXH64_round(0, val);
+    U64 const merged    = acc ^ scrambled;
+    return merged * PRIME64_1 + PRIME64_4;
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
+{   /* One-shot XXH64 of the len bytes at input, starting from seed; endian and align select how words are loaded. */
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;   /* not const-qualified: reassigned below when XXH_ACCEPT_NULL_INPUT_POINTER is defined */
+    U64 h64;
+#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) {
+        len=0;   /* treat a NULL input as an empty buffer */
+        bEnd=p=(const BYTE*)(size_t)32;
+    }
+#endif
+    /* Bulk phase: consume 32 bytes per iteration, one 64-bit word into each of four accumulators. */
+    if (len>=32) {
+        const BYTE* const limit = bEnd - 32;   /* last position from which a full 32-byte stripe can still be read */
+        U64 v1 = seed + PRIME64_1 + PRIME64_2;
+        U64 v2 = seed + PRIME64_2;
+        U64 v3 = seed + 0;
+        U64 v4 = seed - PRIME64_1;
+
+        do {
+            v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8;
+            v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8;
+            v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8;
+            v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8;
+        } while (p<=limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+
+    } else {
+        h64  = seed + PRIME64_5;   /* short input: no accumulators, start from the seed directly */
+    }
+
+    h64 += (U64) len;   /* fold in the input length */
+    /* Tail: remaining 64-bit words, then at most one 32-bit word, then leftover single bytes. */
+    while (p+8<=bEnd) {
+        U64 const k1 = XXH64_round(0, XXH_get64bits(p));
+        h64 ^= k1;
+        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd) {
+        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;   /* XXH_get32bits is the macro #defined inside XXH32_endian_align */
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+    /* Final mixing: alternating xor-shift and multiply steps over h64. */
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
+{   /* Public one-shot entry point: picks the endianness/alignment variant of XXH64_endian_align and returns its result. */
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_CREATESTATE_STATIC(state);
+    XXH64_reset(state, seed);
+    XXH64_update(state, input, len);
+    return XXH64_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+    /* The aligned path is only considered when XXH_FORCE_ALIGN_CHECK is non-zero. */
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
+            if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+                return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+            else
+                return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }   }
+    /* Otherwise use the unaligned readers; XXH_FORCE_NATIVE_FORMAT skips byte swapping even on big-endian hosts. */
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+/* **************************************************
+*  Advanced Hash Functions
+****************************************************/
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{   void* const raw = XXH_malloc(sizeof(XXH32_state_t));   /* contents are uninitialized; NULL when the allocation fails */
+    return (XXH32_state_t*)raw;
+}
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);   /* forwards to free(), so a NULL statePtr is accepted */
+    return XXH_OK;        /* there is no failure path */
+}
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{   void* const raw = XXH_malloc(sizeof(XXH64_state_t));   /* contents are uninitialized; NULL when the allocation fails */
+    return (XXH64_state_t*)raw;
+}
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);   /* forwards to free(), so a NULL statePtr is accepted */
+    return XXH_OK;        /* there is no failure path */
+}
+
+
+/*** Hash feed ***/
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed)
+{   /* (Re)starts a streaming hash: *statePtr is overwritten with a fresh state derived from seed. Always returns XXH_OK. */
+    XXH32_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state));   /* zero every byte: the whole struct is copied out below, so none may stay uninitialized */
+    state.v1 = seed + PRIME32_1 + PRIME32_2;
+    state.v2 = seed + PRIME32_2;
+    state.v3 = seed + 0;   /* v3 keeps the seed itself; XXH32_digest_endian relies on that for short inputs */
+    state.v4 = seed - PRIME32_1;
+    memcpy(statePtr, &state, sizeof(state));
+    return XXH_OK;
+}
+
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed)
+{   /* (Re)starts a streaming hash: *statePtr is overwritten with a fresh state derived from seed. Always returns XXH_OK. */
+    XXH64_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state));   /* zero every byte: the whole struct is copied out below, so none may stay uninitialized */
+    state.v1 = seed + PRIME64_1 + PRIME64_2;
+    state.v2 = seed + PRIME64_2;
+    state.v3 = seed + 0;   /* v3 keeps the seed itself; XXH64_digest_endian relies on that for short inputs */
+    state.v4 = seed - PRIME64_1;
+    memcpy(statePtr, &state, sizeof(state));
+    return XXH_OK;
+}
+
+
+FORCE_INLINE_TEMPLATE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian)
+{   /* Streaming update: feeds len bytes from input into state, buffering any partial 16-byte stripe in state->mem32. */
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len_32 += (unsigned)len;   /* len is narrowed to unsigned before being accumulated */
+    state->large_len |= (len>=16) | (state->total_len_32>=16);   /* sticky flag: set once 16 or more bytes have been seen */
+    /* Not enough for a full 16-byte stripe yet: append to the internal buffer and return. */
+    if (state->memsize + len < 16)  {   /* fill in tmp buffer */
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
+        state->memsize += (unsigned)len;
+        return XXH_OK;
+    }
+    /* Complete the buffered stripe with the head of input, then consume it from the buffer. */
+    if (state->memsize) {   /* some data left from previous update */
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
+        {   const U32* p32 = state->mem32;
+            state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++;
+            state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++;
+            state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++;
+            state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); p32++;
+        }
+        p += 16-state->memsize;   /* skip the input bytes that were just moved into the buffer */
+        state->memsize = 0;
+    }
+    /* Consume full 16-byte stripes straight from input, working on local copies of the accumulators. */
+    if (p <= bEnd-16) {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = state->v1;
+        U32 v2 = state->v2;
+        U32 v3 = state->v3;
+        U32 v4 = state->v4;
+
+        do {
+            v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4;
+            v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4;
+            v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4;
+            v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4;
+        } while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+    /* Stash the remaining (fewer than 16) bytes for the next update or the digest. */
+    if (p < bEnd) {
+        XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+        state->memsize = (unsigned)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
+{
+    /* Public wrapper: each call passes the endianness to XXH32_update_endian as a literal constant. */
+    XXH_endianess const hostOrder = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+    int const readNative = XXH_FORCE_NATIVE_FORMAT || (hostOrder==XXH_littleEndian);
+
+    if (!readNative) return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
+    return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+}
+
+
+
+FORCE_INLINE_TEMPLATE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
+{   /* Computes the hash of everything fed so far; state is only read, so more input may be added afterwards. */
+    const BYTE * p = (const BYTE*)state->mem32;
+    const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize;
+    U32 h32;
+
+    if (state->large_len) {
+        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+    } else {
+        h32 = state->v3 /* == seed */ + PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+    /* Buffered tail: whole 32-bit words first, then any leftover single bytes. */
+    while (p+4<=bEnd) {
+        h32 += XXH_readLE32(p, endian) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h32 += (*p) * PRIME32_5;
+        h32  = XXH_rotl32(h32, 11) * PRIME32_1;
+        p++;
+    }
+    /* Final mixing: alternating xor-shift and multiply steps over h32. */
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in)
+{
+    /* Public wrapper: each call passes the endianness to XXH32_digest_endian as a literal constant. */
+    XXH_endianess const hostOrder = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+    int const readNative = XXH_FORCE_NATIVE_FORMAT || (hostOrder==XXH_littleEndian);
+
+    if (!readNative) return XXH32_digest_endian(state_in, XXH_bigEndian);
+    return XXH32_digest_endian(state_in, XXH_littleEndian);
+}
+
+
+
+/* **** XXH64 **** */
+
+FORCE_INLINE_TEMPLATE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
+{   /* Streaming update: feeds len bytes from input into state, buffering any partial 32-byte stripe in state->mem64. */
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len += len;
+    /* Not enough for a full 32-byte stripe yet: append to the internal buffer and return. */
+    if (state->memsize + len < 32) {  /* fill in tmp buffer */
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
+        state->memsize += (U32)len;
+        return XXH_OK;
+    }
+    /* Complete the buffered stripe with the head of input, then consume it from the buffer. */
+    if (state->memsize) {   /* tmp buffer is full */
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
+        state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian));
+        state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian));
+        state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian));
+        state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian));
+        p += 32-state->memsize;   /* skip the input bytes that were just moved into the buffer */
+        state->memsize = 0;
+    }
+    /* Consume full 32-byte stripes straight from input, working on local copies of the accumulators. */
+    if (p+32 <= bEnd) {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = state->v1;
+        U64 v2 = state->v2;
+        U64 v3 = state->v3;
+        U64 v4 = state->v4;
+
+        do {
+            v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8;
+            v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8;
+            v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8;
+            v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8;
+        } while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+    /* Stash the remaining (fewer than 32) bytes for the next update or the digest. */
+    if (p < bEnd) {
+        XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+        state->memsize = (unsigned)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
+{
+    /* Public wrapper: each call passes the endianness to XXH64_update_endian as a literal constant. */
+    XXH_endianess const hostOrder = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+    int const readNative = XXH_FORCE_NATIVE_FORMAT || (hostOrder==XXH_littleEndian);
+
+    if (!readNative) return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
+    return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
+}
+
+
+
+FORCE_INLINE_TEMPLATE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
+{   /* Computes the hash of everything fed so far; state is only read, so more input may be added afterwards. */
+    const BYTE * p = (const BYTE*)state->mem64;
+    const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize;
+    U64 h64;
+
+    if (state->total_len >= 32) {   /* at least one full stripe went through the accumulators */
+        U64 const v1 = state->v1;
+        U64 const v2 = state->v2;
+        U64 const v3 = state->v3;
+        U64 const v4 = state->v4;
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+    } else {
+        h64  = state->v3 + PRIME64_5;   /* XXH64_reset stores seed + 0 in v3 */
+    }
+
+    h64 += (U64) state->total_len;
+    /* Buffered tail: 64-bit words, then at most one 32-bit word, then leftover single bytes. */
+    while (p+8<=bEnd) {
+        U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian));
+        h64 ^= k1;
+        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd) {
+        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
+        h64  = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h64 ^= (*p) * PRIME64_5;
+        h64  = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+    /* Final mixing: alternating xor-shift and multiply steps over h64. */
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in)
+{
+    /* Public wrapper: each call passes the endianness to XXH64_digest_endian as a literal constant. */
+    XXH_endianess const hostOrder = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+    int const readNative = XXH_FORCE_NATIVE_FORMAT || (hostOrder==XXH_littleEndian);
+
+    if (!readNative) return XXH64_digest_endian(state_in, XXH_bigEndian);
+    return XXH64_digest_endian(state_in, XXH_littleEndian);
+}
+
+
+/* **************************
+*  Canonical representation
+****************************/
+
+/*! Default XXH result types are basic unsigned 32 and 64 bits.
+*   The canonical representation follows human-readable write convention, aka big-endian (large digits first).
+*   These functions allow transformation of hash result into and from its canonical format.
+*   This way, hash values can be written into a file or buffer, and remain comparable across different systems and programs.
+*/
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH32_hash_t const bigEndianHash = XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(hash) : hash;   /* canonical form is big-endian */
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    memcpy(dst, &bigEndianHash, sizeof(XXH32_canonical_t));
+}
+
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+    XXH64_hash_t const bigEndianHash = XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(hash) : hash;   /* canonical form is big-endian */
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    memcpy(dst, &bigEndianHash, sizeof(XXH64_canonical_t));
+}
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{   /* Inverse of XXH32_canonicalFromHash: reads the bytes at src as a big-endian 32-bit value. */
+    return XXH_readBE32(src);
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{   /* Inverse of XXH64_canonicalFromHash: reads the bytes at src as a big-endian 64-bit value. */
+    return XXH_readBE64(src);
+}
diff --git a/deps/SZ/zstd/common/xxhash.h b/deps/SZ/zstd/common/xxhash.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bad1f59f63a93308341a1348efa9fbcaddbaa7c
--- /dev/null
+++ b/deps/SZ/zstd/common/xxhash.h
@@ -0,0 +1,305 @@
+/*
+   xxHash - Extremely Fast Hash algorithm
+   Header File
+   Copyright (C) 2012-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+A 64-bits version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bits applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64       13.8 GB/s            1.9 GB/s
+XXH32        6.8 GB/s            6.0 GB/s
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+
+/* ****************************
+*  Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/* ****************************
+*  API modifier
+******************************/
+/** XXH_PRIVATE_API
+*   This is useful if you want to include xxhash functions in `static` mode
+*   in order to inline them, and remove their symbol from the public list.
+*   Methodology :
+*     #define XXH_PRIVATE_API
+*     #include "xxhash.h"
+*   `xxhash.c` is automatically included.
+*   It's not useful to compile and link it as a separate module anymore.
+*/
+#ifdef XXH_PRIVATE_API
+#  ifndef XXH_STATIC_LINKING_ONLY
+#    define XXH_STATIC_LINKING_ONLY
+#  endif
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+#    define XXH_PUBLIC_API static   /* this version may generate warnings for unused static functions; disable the relevant warning */
+#  endif
+#else
+#  define XXH_PUBLIC_API   /* do nothing */
+#endif /* XXH_PRIVATE_API */
+
+/*!XXH_NAMESPACE, aka Namespace Emulation :
+
+If you want to include _and expose_ xxHash functions from within your own library,
+but also want to avoid symbol collisions with another library which also includes xxHash,
+
+you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
+with the value of XXH_NAMESPACE (so do not leave it empty, and avoid numeric values).
+
+Note that no change is required within the calling program as long as it includes `xxhash.h` :
+regular symbol name will be automatically translated by this header.
+*/
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    6
+#define XXH_VERSION_RELEASE  2
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Simple Hash Functions
+******************************/
+typedef unsigned int       XXH32_hash_t;
+typedef unsigned long long XXH64_hash_t;
+
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed);
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed);
+
+/*!
+XXH32() :
+    Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
+    The memory between input & input+length must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+XXH64() :
+    Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
+    "seed" can be used to alter the result predictably.
+    This function runs 2x faster on 64-bits systems, but slower on 32-bits systems (see benchmark).
+*/
+
+
+/* ****************************
+*  Streaming Hash Functions
+******************************/
+typedef struct XXH32_state_s XXH32_state_t;   /* incomplete type */
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+
+/*! State allocation, compatible with dynamic libraries */
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+
+
+/* hash streaming */
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned int seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
+
+/*
+These functions generate the xxHash of an input provided in multiple segments.
+Note that, for small input, they are slower than single-call functions, due to state management.
+For small input, prefer `XXH32()` and `XXH64()` .
+
+XXH state must first be allocated, using XXH*_createState() .
+
+Start a new hash by initializing state with a seed, using XXH*_reset().
+
+Then, feed the hash state by calling XXH*_update() as many times as necessary.
+Obviously, input must be allocated and read accessible.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+
+Finally, a hash value can be produced anytime, by using XXH*_digest().
+This function returns the nn-bits hash as an int or long long.
+
+It's still possible to continue inserting input into the hash state after a digest,
+and generate some new hashes later on, by calling again XXH*_digest().
+
+When done, free XXH state space if it was allocated dynamically.
+*/
+
+
+/* **************************
+*  Utils
+****************************/
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* ! C99 */
+#  define restrict   /* disable restrict */
+#endif
+
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dst_state, const XXH32_state_t* restrict src_state);
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dst_state, const XXH64_state_t* restrict src_state);
+
+
+/* **************************
+*  Canonical representation
+****************************/
+/* Default result type for XXH functions are primitive unsigned 32 and 64 bits.
+*  The canonical representation uses human-readable write convention, aka big-endian (large digits first).
+*  These functions allow transformation of hash result into and from its canonical format.
+*  This way, hash values can be written into a file / memory, and remain comparable on different systems and programs.
+*/
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+/* ================================================================================================
+   This section contains definitions which are not guaranteed to remain stable.
+   They may change in future versions, becoming incompatible with a different version of the library.
+   They shall only be used with static linking.
+   Never use these definitions in association with dynamic linking !
+=================================================================================================== */
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXH_STATIC_H_3543687687345)
+#define XXH_STATIC_H_3543687687345
+
+/* These definitions are only meant to allow allocation of XXH state
+   statically, on stack, or in a struct for example.
+   Do not use members directly. */
+
+   struct XXH32_state_s {   /* XXH32 streaming state; exposed only so callers can allocate it statically */
+       unsigned total_len_32;
+       unsigned large_len;
+       unsigned v1;         /* v1..v4: presumably the running accumulators -- confirm in xxhash.c */
+       unsigned v2;
+       unsigned v3;
+       unsigned v4;
+       unsigned mem32[4];   /* buffer declared as U32 words for alignment */
+       unsigned memsize;    /* presumably the number of bytes currently held in mem32 -- confirm in xxhash.c */
+       unsigned reserved;   /* never read nor written, will be removed in a future version */
+   };   /* typedef'd to XXH32_state_t */
+
+   struct XXH64_state_s {   /* XXH64 streaming state; exposed only so callers can allocate it statically */
+       unsigned long long total_len;
+       unsigned long long v1;         /* v1..v4: presumably the running accumulators -- confirm in xxhash.c */
+       unsigned long long v2;
+       unsigned long long v3;
+       unsigned long long v4;
+       unsigned long long mem64[4];   /* buffer declared as U64 words for alignment */
+       unsigned memsize;              /* presumably the number of bytes currently held in mem64 -- confirm in xxhash.c */
+       unsigned reserved[2];          /* never read nor written, will be removed in a future version */
+   };   /* typedef'd to XXH64_state_t */
+
+
+#  ifdef XXH_PRIVATE_API
+#    include "xxhash.c"   /* include xxhash functions as `static`, for inlining */
+#  endif
+
+#endif /* XXH_STATIC_LINKING_ONLY && XXH_STATIC_H_3543687687345 */
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/deps/SZ/zstd/common/zstd_common.c b/deps/SZ/zstd/common/zstd_common.c
new file mode 100644
index 0000000000000000000000000000000000000000..6f05d240e43cfdbec4009a916a7c6710c2418b30
--- /dev/null
+++ b/deps/SZ/zstd/common/zstd_common.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdlib.h>      /* malloc, calloc, free */
+#include <string.h>      /* memset */
+#include "error_private.h"
+#include "zstd_internal.h"
+
+
+/*-****************************************
+*  Version
+******************************************/
+unsigned ZSTD_versionNumber(void) { return ZSTD_VERSION_NUMBER; }
+
+const char* ZSTD_versionString(void) { return ZSTD_VERSION_STRING; }
+
+
+/*-****************************************
+*  ZSTD Error Management
+******************************************/
+/*! ZSTD_isError() :
+ *  tells if a return value is an error code */
+unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
+
+/*! ZSTD_getErrorName() :
+ *  provides error code string from function result (useful for debugging) */
+const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+/*! ZSTD_getErrorCode() :
+ *  convert a `size_t` function result into a proper ZSTD_ErrorCode enum */
+ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
+
+/*! ZSTD_getErrorString() :
+ *  provides error code string from enum */
+const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
+
+
+
+/*=**************************************************************
+*  Custom allocator
+****************************************************************/
+void* ZSTD_malloc(size_t size, ZSTD_customMem customMem)
+{   /* Allocate `size` bytes through the caller's allocator when one is set, else malloc(). */
+    if (!customMem.customAlloc)
+        return malloc(size);
+    return customMem.customAlloc(customMem.opaque, size);
+}
+
+void* ZSTD_calloc(size_t size, ZSTD_customMem customMem)
+{   /* Returns `size` zeroed bytes from customAlloc when set, else from calloc(); NULL on failure. */
+    if (customMem.customAlloc) {
+        /* calloc implemented as malloc+memset;
+         * not as efficient as calloc, but next best guess for custom malloc */
+        void* const ptr = customMem.customAlloc(customMem.opaque, size);
+        if (ptr != NULL) memset(ptr, 0, size);   /* the custom allocator may fail: never memset(NULL) */
+        return ptr;
+    }
+    return calloc(1, size);
+}
+
+void ZSTD_free(void* ptr, ZSTD_customMem customMem)
+{   /* NULL is ignored; otherwise release through customFree when set, else free(). */
+    if (ptr==NULL) return;
+    if (!customMem.customFree) {
+        free(ptr);
+        return;
+    }
+    customMem.customFree(customMem.opaque, ptr);
+}
diff --git a/deps/SZ/zstd/common/zstd_errors.h b/deps/SZ/zstd/common/zstd_errors.h
new file mode 100644
index 0000000000000000000000000000000000000000..57533f28696b38626162f8a22d3e609485eda8ba
--- /dev/null
+++ b/deps/SZ/zstd/common/zstd_errors.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_ERRORS_H_398273423
+#define ZSTD_ERRORS_H_398273423
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*===== dependency =====*/
+#include <stddef.h>   /* size_t */
+
+
+/* =====   ZSTDERRORLIB_API : control library symbols visibility   ===== */
+#ifndef ZSTDERRORLIB_VISIBILITY
+#  if defined(__GNUC__) && (__GNUC__ >= 4)
+#    define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#  else
+#    define ZSTDERRORLIB_VISIBILITY
+#  endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+#  define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+#endif
+
+/*-*********************************************
+ *  Error codes list
+ *-*********************************************
+ *  Error codes _values_ are pinned down since v1.3.1 only.
+ *  Therefore, don't rely on values if you may link to any version < v1.3.1.
+ *
+ *  Only values < 100 are considered stable.
+ *
+ *  note 1 : this API shall be used with static linking only.
+ *           dynamic linking is not yet officially supported.
+ *  note 2 : Prefer relying on the enum rather than on its value whenever possible
+ *           This is the only supported way to use the error list < v1.3.1
+ *  note 3 : ZSTD_isError() is always correct, whatever the library version.
+ **********************************************/
+typedef enum {
+  ZSTD_error_no_error = 0,
+  ZSTD_error_GENERIC  = 1,
+  ZSTD_error_prefix_unknown                = 10,
+  ZSTD_error_version_unsupported           = 12,
+  ZSTD_error_frameParameter_unsupported    = 14,
+  ZSTD_error_frameParameter_windowTooLarge = 16,
+  ZSTD_error_corruption_detected = 20,
+  ZSTD_error_checksum_wrong      = 22,
+  ZSTD_error_dictionary_corrupted      = 30,
+  ZSTD_error_dictionary_wrong          = 32,
+  ZSTD_error_dictionaryCreation_failed = 34,
+  ZSTD_error_parameter_unsupported   = 40,
+  ZSTD_error_parameter_outOfBound    = 42,
+  ZSTD_error_tableLog_tooLarge       = 44,
+  ZSTD_error_maxSymbolValue_tooLarge = 46,
+  ZSTD_error_maxSymbolValue_tooSmall = 48,
+  ZSTD_error_stage_wrong       = 60,
+  ZSTD_error_init_missing      = 62,
+  ZSTD_error_memory_allocation = 64,
+  ZSTD_error_workSpace_tooSmall= 66,
+  ZSTD_error_dstSize_tooSmall = 70,
+  ZSTD_error_srcSize_wrong    = 72,
+  /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+  ZSTD_error_frameIndex_tooLarge = 100,
+  ZSTD_error_seekableIO          = 102,
+  ZSTD_error_maxCode = 120  /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+} ZSTD_ErrorCode;
+
+/*! ZSTD_getErrorCode() :
+    convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+    which can be used to compare with enum list published above */
+ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code);   /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_ERRORS_H_398273423 */
diff --git a/deps/SZ/zstd/common/zstd_internal.h b/deps/SZ/zstd/common/zstd_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4c1af53f9b1fbc4a487574a6f10f3f47a2e298e
--- /dev/null
+++ b/deps/SZ/zstd/common/zstd_internal.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+/* this module contains definitions which must be identical
+ * across compression, decompression and dictBuilder.
+ * It also contains a few functions useful to at least 2 of them
+ * and which benefit from being inlined */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "compiler.h"
+#include "mem.h"
+#include "debug.h"                 /* assert, DEBUGLOG, RAWLOG, g_debuglevel */
+#include "error_private.h"
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#ifndef XXH_STATIC_LINKING_ONLY
+#  define XXH_STATIC_LINKING_ONLY  /* XXH64_state_t */
+#endif
+#include "xxhash.h"                /* XXH_reset, update, digest */
+
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ---- static assert (debug) --- */
+#define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)
+
+
+/*-*************************************
+*  shared macros
+***************************************/
+#undef MIN
+#undef MAX
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+#define CHECK_F(f) { size_t const errcod = f; if (ERR_isError(errcod)) return errcod; }  /* check and Forward error code */
+#define CHECK_E(f, e) { size_t const errcod = f; if (ERR_isError(errcod)) return ERROR(e); }  /* check and send Error code */
+
+
+/*-*************************************
+*  Common constants
+***************************************/
+#define ZSTD_OPT_NUM    (1<<12)
+
+#define ZSTD_REP_NUM      3                 /* number of repcodes */
+#define ZSTD_REP_MOVE     (ZSTD_REP_NUM-1)
+static const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 };
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10
+#define ZSTD_WINDOWLOG_DEFAULTMAX 27 /* Default maximum allowed window log */
+static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 };
+static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 };
+
+#define ZSTD_FRAMEIDSIZE 4
+static const size_t ZSTD_frameIdSize = ZSTD_FRAMEIDSIZE;  /* magic number size */
+
+#define ZSTD_BLOCKHEADERSIZE 3   /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
+static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
+typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+
+#define HufLog 12
+typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+
+#define Litbits  8
+#define MaxLit ((1<<Litbits) - 1)
+#define MaxML   52
+#define MaxLL   35
+#define DefaultMaxOff 28
+#define MaxOff  31
+#define MaxSeq MAX(MaxLL, MaxML)   /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog    9
+#define LLFSELog    9
+#define OffFSELog   8
+#define MaxFSELog  MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
+
+static const U32 LL_bits[MaxLL+1] = { 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3,
+                                      4, 6, 7, 8, 9,10,11,12,
+                                     13,14,15,16 };
+static const S16 LL_defaultNorm[MaxLL+1] = { 4, 3, 2, 2, 2, 2, 2, 2,
+                                             2, 2, 2, 2, 2, 1, 1, 1,
+                                             2, 2, 2, 2, 2, 2, 2, 2,
+                                             2, 3, 2, 1, 1, 1, 1, 1,
+                                            -1,-1,-1,-1 };
+#define LL_DEFAULTNORMLOG 6  /* for static allocation */
+static const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG;
+
+static const U32 ML_bits[MaxML+1] = { 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3,
+                                      4, 4, 5, 7, 8, 9,10,11,
+                                     12,13,14,15,16 };
+static const S16 ML_defaultNorm[MaxML+1] = { 1, 4, 3, 2, 2, 2, 2, 2,
+                                             2, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1,-1,-1,
+                                            -1,-1,-1,-1,-1 };
+#define ML_DEFAULTNORMLOG 6  /* for static allocation */
+static const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG;
+
+static const S16 OF_defaultNorm[DefaultMaxOff+1] = { 1, 1, 1, 1, 1, 1, 2, 2,
+                                                     2, 1, 1, 1, 1, 1, 1, 1,
+                                                     1, 1, 1, 1, 1, 1, 1, 1,
+                                                    -1,-1,-1,-1,-1 };
+#define OF_DEFAULTNORMLOG 5  /* for static allocation */
+static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
+
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTD_wildcopy() :
+ *  custom version of memcpy(), can overwrite up to WILDCOPY_OVERLENGTH bytes (if length==0) */
+#define WILDCOPY_OVERLENGTH 8
+MEM_STATIC void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    BYTE* out = (BYTE*)dst;
+    const BYTE* in = (const BYTE*)src;
+    BYTE* const outLimit = out + length;
+    do {   /* 8-byte steps; the first chunk is copied unconditionally */
+        ZSTD_copy8(out, in); out += 8; in += 8;
+    } while (out < outLimit);
+}
+
+MEM_STATIC void ZSTD_wildcopy_e(void* dst, const void* src, void* dstEnd)   /* should be faster for decoding, but strangely, not verified on all platforms */
+{
+    BYTE* out = (BYTE*)dst;
+    const BYTE* in = (const BYTE*)src;
+    BYTE* const outLimit = (BYTE*)dstEnd;
+    do {   /* 8-byte steps; the first chunk is copied unconditionally */
+        ZSTD_copy8(out, in); out += 8; in += 8;
+    } while (out < outLimit);
+}
+
+
+/*-*******************************************
+*  Private declarations
+*********************************************/
+typedef struct seqDef_s {
+    U32 offset;
+    U16 litLength;
+    U16 matchLength;
+} seqDef;
+
+typedef struct {
+    seqDef* sequencesStart;
+    seqDef* sequences;
+    BYTE* litStart;
+    BYTE* lit;
+    BYTE* llCode;
+    BYTE* mlCode;
+    BYTE* ofCode;
+    U32   longLengthID;   /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
+    U32   longLengthPos;
+} seqStore_t;
+
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx);   /* compress & dictBuilder */
+void ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
+
+/* custom memory allocation functions */
+void* ZSTD_malloc(size_t size, ZSTD_customMem customMem);
+void* ZSTD_calloc(size_t size, ZSTD_customMem customMem);
+void ZSTD_free(void* ptr, ZSTD_customMem customMem);
+
+
+MEM_STATIC U32 ZSTD_highbit32(U32 val)   /* compress, dictBuilder, decodeCorpus */
+{   /* Returns the position of the highest set bit of `val`; `val` must be non-zero. */
+    assert(val != 0);   /* zero has no set bit; __builtin_clz(0) is undefined */
+    {
+#   if defined(_MSC_VER)   /* Visual */
+        unsigned long r=0;
+        _BitScanReverse(&r, val);
+        return (unsigned)r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* GCC Intrinsic */
+        return 31 - __builtin_clz(val);   /* leading-zero count converted to a bit position */
+#   else   /* Software version */
+        static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+        U32 v = val;
+        v |= v >> 1;   /* this and the next four steps set every bit below the highest set bit */
+        v |= v >> 2;
+        v |= v >> 4;
+        v |= v >> 8;
+        v |= v >> 16;
+        return DeBruijnClz[(v * 0x07C4ACDDU) >> 27];   /* top 5 bits of the product index the lookup table */
+#   endif
+    }
+}
+
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ *        do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx);   /* zstdmt, adaptive_compression (shouldn't get this definition from here) */
+
+
+typedef struct {
+    blockType_e blockType;
+    U32 lastBlock;
+    U32 origSize;
+} blockProperties_t;
+
+/*! ZSTD_getcBlockSize() :
+ *  Provides the size of compressed block from block header `src` */
+/* Used by: decompress, fullbench (does not get its definition from here) */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+                          blockProperties_t* bpPtr);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* ZSTD_CCOMMON_H_MODULE */
diff --git a/deps/SZ/zstd/compress/fse_compress.c b/deps/SZ/zstd/compress/fse_compress.c
new file mode 100644
index 0000000000000000000000000000000000000000..07b3ab89bd7c02e573377d2952cd8fbd9ed79227
--- /dev/null
+++ b/deps/SZ/zstd/compress/fse_compress.c
@@ -0,0 +1,714 @@
+/* ******************************************************************
+   FSE : Finite State Entropy encoder
+   Copyright (C) 2013-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include "compiler.h"
+#include "mem.h"        /* U32, U16, etc. */
+#include "debug.h"      /* assert, DEBUGLOG */
+#include "hist.h"       /* HIST_count_wksp */
+#include "bitstream.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#include "error_private.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names : FSE_CAT pastes two tokens; the wrapper macros add one expansion level so that macro arguments are expanded before pasting */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+
+/* FSE_buildCTable_wksp() :
+ * Builds the compression table `ct` from `normalizedCounter`, using the caller-provided scratch buffer `workSpace` instead of a local one.
+ * `wkspSize` is in bytes and must be at least `(1<<tableLog) * sizeof(FSE_FUNCTION_TYPE)`, otherwise ERROR(tableLog_tooLarge) is returned.
+ * `workSpace` must also be properly aligned with FSE_FUNCTION_TYPE requirements (it is accessed as an array of that type).
+ * @return : 0 on success, or an error code (ERROR(GENERIC) when the symbol spread does not end back on position 0). */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    U32 const tableSize = 1 << tableLog;
+    U32 const tableMask = tableSize - 1;
+    void* const ptr = ct;
+    U16* const tableU16 = ( (U16*) ptr) + 2;   /* skips the two U16 header fields, written below through [-2] and [-1] */
+    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+    U32 const step = FSE_TABLESTEP(tableSize);
+    U32 cumul[FSE_MAX_SYMBOL_VALUE+2];   /* start position of each symbol in tableU16; advanced as slots get filled */
+
+    FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)workSpace;
+    U32 highThreshold = tableSize-1;   /* low-probability symbols are stacked from the end of tableSymbol downwards */
+
+    /* CTable header */
+    if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge);
+    tableU16[-2] = (U16) tableLog;
+    tableU16[-1] = (U16) maxSymbolValue;
+    assert(tableLog < 16);   /* required for the threshold strategy to work */
+
+    /* For explanations on how to distribute symbol values over the table :
+    *  http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+    /* symbol start positions */
+    {   U32 u;
+        cumul[0] = 0;
+        for (u=1; u<=maxSymbolValue+1; u++) {
+            if (normalizedCounter[u-1]==-1) {  /* Low proba symbol : takes one slot, placed at the high end */
+                cumul[u] = cumul[u-1] + 1;
+                tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
+            } else {
+                cumul[u] = cumul[u-1] + normalizedCounter[u-1];
+        }   }
+        cumul[maxSymbolValue+1] = tableSize+1;
+    }
+
+    /* Spread symbols */
+    {   U32 position = 0;
+        U32 symbol;
+        for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+            int nbOccurences;
+            for (nbOccurences=0; nbOccurences<normalizedCounter[symbol]; nbOccurences++) {
+                tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* Low proba area */
+        }   }
+
+        if (position!=0) return ERROR(GENERIC);   /* Must have gone through all positions */
+    }
+
+    /* Build table */
+    {   U32 u; for (u=0; u<tableSize; u++) {
+        FSE_FUNCTION_TYPE s = tableSymbol[u];   /* note : static analyzer may not understand tableSymbol is properly initialized */
+        tableU16[cumul[s]++] = (U16) (tableSize+u);   /* TableU16 : sorted by symbol order; gives next state value */
+    }   }
+
+    /* Build Symbol Transformation Table */
+    {   unsigned total = 0;   /* running sum of the slots taken by the symbols processed so far */
+        unsigned s;
+        for (s=0; s<=maxSymbolValue; s++) {
+            switch (normalizedCounter[s])
+            {
+            case  0:
+                /* filling nonetheless, for compatibility with FSE_getMaxNbBits() */
+                symbolTT[s].deltaNbBits = ((tableLog+1) << 16) - (1<<tableLog);
+                break;
+
+            case -1:
+            case  1:
+                symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
+                symbolTT[s].deltaFindState = total - 1;
+                total ++;
+                break;
+            default :
+                {
+                    U32 const maxBitsOut = tableLog - BIT_highbit32 (normalizedCounter[s]-1);
+                    U32 const minStatePlus = normalizedCounter[s] << maxBitsOut;
+                    symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+                    symbolTT[s].deltaFindState = total - normalizedCounter[s];
+                    total +=  normalizedCounter[s];
+    }   }   }   }
+
+#if 0  /* debug : symbol costs */
+    DEBUGLOG(5, "\n --- table statistics : ");
+    {   U32 symbol;
+        for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+            DEBUGLOG(5, "%3u: w=%3i,   maxBits=%u, fracBits=%.2f",
+                symbol, normalizedCounter[symbol],
+                FSE_getMaxNbBits(symbolTT, symbol),
+                (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256);
+        }
+    }
+#endif
+
+    return 0;
+}
+
+
+size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{   /* Convenience wrapper : supplies a worst-case sized scratch area on the stack to FSE_buildCTable_wksp() */
+    FSE_FUNCTION_TYPE spreadScratch[FSE_MAX_TABLESIZE];   /* left uninitialized on purpose, even if static analyzers complain about it */
+    return FSE_buildCTable_wksp(ct, normalizedCounter, maxSymbolValue, tableLog, (void*)spreadScratch, sizeof(spreadScratch));
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+
+/*-**************************************************************
+*  FSE NCount encoding
+****************************************************************/
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+{   /* Size bound used by FSE_writeNCount() to decide whether writes need checking; maxSymbolValue==0 selects the default FSE_NCOUNTBOUND */
+    if (maxSymbolValue == 0) return FSE_NCOUNTBOUND;
+    return (size_t)((((maxSymbolValue+1) * tableLog) >> 3) + 3);
+}
+
+/* FSE_writeNCount_generic() :
+ * Serializes `normalizedCounter` (symbols 0..maxSymbolValue, expected to sum up to 1<<tableLog) into `header`.
+ * When `writeIsSafe` is 0, every 2-byte write is preceded by a capacity check against `headerBufferSize`.
+ * @return : nb of bytes written into `header`, or an error code :
+ *           dstSize_tooSmall if `header` is too small, GENERIC if the distribution is not consistent with `tableLog`. */
+static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+                                       const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                                       unsigned writeIsSafe)
+{
+    BYTE* const ostart = (BYTE*) header;
+    BYTE* out = ostart;
+    BYTE* const oend = ostart + headerBufferSize;
+    const int tableSize = 1 << tableLog;
+    int nbBits = tableLog+1;
+    int remaining = tableSize+1;   /* +1 for extra accuracy */
+    int threshold = tableSize;
+    U32 bitStream = 0;
+    int bitCount = 0;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    /* Table Size */
+    bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
+    bitCount  += 4;
+
+    while ((charnum <= maxSymbolValue) && (remaining>1)) {  /* stops at 1, or once all symbols are consumed */
+        if (previous0) {
+            unsigned start = charnum;
+            while ((charnum <= maxSymbolValue) && !normalizedCounter[charnum]) charnum++;   /* never scan past the last symbol */
+            if (charnum > maxSymbolValue) break;   /* incorrect distribution : only zeroes left, reported after the loop */
+            while (charnum >= start+24) {
+                start+=24;
+                bitStream += 0xFFFFU << bitCount;
+                if ((!writeIsSafe) && (out > oend-2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                out[0] = (BYTE) bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out+=2;
+                bitStream>>=16;
+            }
+            while (charnum >= start+3) {
+                start+=3;
+                bitStream += 3 << bitCount;
+                bitCount += 2;
+            }
+            bitStream += (charnum-start) << bitCount;
+            bitCount += 2;
+            if (bitCount>16) {
+                if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                out[0] = (BYTE)bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out += 2;
+                bitStream >>= 16;
+                bitCount -= 16;
+        }   }
+        {   int count = normalizedCounter[charnum++];
+            int const max = (2*threshold-1)-remaining;
+            remaining -= count < 0 ? -count : count;
+            count++;   /* +1 for extra accuracy */
+            if (count>=threshold) count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+            bitStream += count << bitCount;
+            bitCount  += nbBits;
+            bitCount  -= (count<max);
+            previous0  = (count==1);
+            if (remaining<1) return ERROR(GENERIC);
+            while (remaining<threshold) { nbBits--; threshold>>=1; }
+        }
+        if (bitCount>16) {
+            if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+            out[0] = (BYTE)bitStream;
+            out[1] = (BYTE)(bitStream>>8);
+            out += 2;
+            bitStream >>= 16;
+            bitCount -= 16;
+    }   }
+
+    /* flush remaining bitStream */
+    if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+    out[0] = (BYTE)bitStream;
+    out[1] = (BYTE)(bitStream>>8);
+    out+= (bitCount+7) /8;
+
+    if (remaining != 1) return ERROR(GENERIC);   /* incorrect normalized distribution : symbols ran out too early */
+    if (charnum > maxSymbolValue + 1) return ERROR(GENERIC);   /* defensive; unreachable with the bounded loops above */
+
+    return (out-ostart);
+}
+
+
+size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    /* Only table sizes within [FSE_MIN_TABLELOG, FSE_MAX_TABLELOG] are supported */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);
+    /* writes are bound-checked (writeIsSafe==0) only when the buffer is below the worst-case header size */
+    if (bufferSize >= FSE_NCountWriteBound(maxSymbolValue, tableLog))
+        return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
+    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
+}
+
+
+/*-**************************************************************
+*  FSE Compression Code
+****************************************************************/
+/*! FSE_sizeof_CTable() :
+    @return : size in bytes of an FSE_CTable for the given parameters, or ERROR(tableLog_tooLarge) when tableLog > FSE_MAX_TABLELOG.
+    FSE_CTable is a variable size structure which contains :
+    `U16 tableLog;`
+    `U16 maxSymbolValue;`
+    `U16 nextStateNumber[1 << tableLog];`                         // This size is variable
+    `FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];`  // This size is variable
+    Allocation is manual (C standard does not support variable-size structures). */
+size_t FSE_sizeof_CTable (unsigned maxSymbolValue, unsigned tableLog)
+{
+    return (tableLog > FSE_MAX_TABLELOG) ? ERROR(tableLog_tooLarge)
+                                         : FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
+}
+
+FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
+{   /* Allocates a CTable with malloc(); tableLog is silently capped to FSE_TABLELOG_ABSOLUTE_MAX. The malloc() result is returned unchecked. */
+    unsigned const cappedLog = (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) ? FSE_TABLELOG_ABSOLUTE_MAX : tableLog;
+    size_t const nbBytes = FSE_CTABLE_SIZE_U32 (cappedLog, maxSymbolValue) * sizeof(U32);
+    void* const table = malloc(nbBytes);
+    return (FSE_CTable*)table;
+}
+
+void FSE_freeCTable (FSE_CTable* ct) { free(ct); }   /* releases a table with free(); counterpart of FSE_createCTable(), which uses malloc() */
+
+/* FSE_minTableLog() : provides the minimum logSize to safely represent a distribution, i.e. the lower of a source-size limit and an alphabet-size limit */
+static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+{
+    U32 const srcLimit = BIT_highbit32((U32)(srcSize - 1)) + 1;
+    U32 const alphabetLimit = BIT_highbit32(maxSymbolValue) + 2;
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    if (srcLimit < alphabetLimit) return srcLimit;
+    return alphabetLimit;
+}
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
+{
+    U32 const srcCeiling = BIT_highbit32((U32)(srcSize - 1)) - minus;   /* upper limit derived from the source size */
+    U32 const floorLog = FSE_minTableLog(srcSize, maxSymbolValue);      /* minimum needed to safely represent all symbol values */
+    U32 chosen = maxTableLog ? maxTableLog : FSE_DEFAULT_TABLELOG;      /* 0 requests the default */
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    if (chosen > srcCeiling) chosen = srcCeiling;   /* Accuracy can be reduced */
+    if (chosen < floorLog) chosen = floorLog;
+    /* finally clamp into the supported [FSE_MIN_TABLELOG, FSE_MAX_TABLELOG] range */
+    if (chosen < FSE_MIN_TABLELOG) chosen = FSE_MIN_TABLELOG;
+    if (chosen > FSE_MAX_TABLELOG) chosen = FSE_MAX_TABLELOG;
+    return chosen;
+}
+
+unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{   /* forwards to FSE_optimalTableLog_internal() with `minus` fixed to 2 */
+    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
+}
+
+
+/* FSE_normalizeM2() : secondary normalization method, to be used when the primary method (FSE_normalizeCount()) fails.
+   Writes a weight for every symbol 0..maxSymbolValue into `norm`. @return : 0, or ERROR(GENERIC) when a symbol would get a weight < 1. */
+
+static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue)
+{
+    short const NOT_YET_ASSIGNED = -2;
+    U32 s;
+    U32 distributed = 0;   /* nb of table slots already handed out (one per -1 or 1 weight) */
+    U32 ToDistribute;      /* nb of table slots still available */
+
+    /* Init */
+    U32 const lowThreshold = (U32)(total >> tableLog);
+    U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
+
+    for (s=0; s<=maxSymbolValue; s++) {
+        if (count[s] == 0) {
+            norm[s]=0;
+            continue;
+        }
+        if (count[s] <= lowThreshold) {
+            norm[s] = -1;
+            distributed++;
+            total -= count[s];   /* `total` now only tracks symbols that are still unassigned */
+            continue;
+        }
+        if (count[s] <= lowOne) {
+            norm[s] = 1;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+
+        norm[s]=NOT_YET_ASSIGNED;
+    }
+    ToDistribute = (1 << tableLog) - distributed;
+
+    if (ToDistribute == 0) return 0;   /* every table slot is already assigned : nothing left to share out, and the division below must not see 0 */
+    if ((total / ToDistribute) > lowOne) {
+        /* risk of rounding to zero */
+        lowOne = (U32)((total * 3) / (ToDistribute * 2));
+        for (s=0; s<=maxSymbolValue; s++) {
+            if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
+                norm[s] = 1;
+                distributed++;
+                total -= count[s];
+        }   }
+        ToDistribute = (1 << tableLog) - distributed;
+    }
+
+    if (distributed == maxSymbolValue+1) {
+        /* all values are pretty poor;
+           probably incompressible data (should have already been detected);
+           find max, then give all remaining points to max */
+        U32 maxV = 0, maxC = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            if (count[s] > maxC) { maxV=s; maxC=count[s]; }
+        norm[maxV] += (short)ToDistribute;
+        return 0;
+    }
+
+    if (total == 0) {
+        /* all of the symbols were low enough for the lowOne or lowThreshold */
+        for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1))
+            if (norm[s] > 0) { ToDistribute--; norm[s]++; }
+        return 0;
+    }
+
+    {   U64 const vStepLog = 62 - tableLog;
+        U64 const mid = (1ULL << (vStepLog-1)) - 1;
+        U64 const rStep = ((((U64)1<<vStepLog) * ToDistribute) + mid) / total;   /* scale on remaining */
+        U64 tmpTotal = mid;
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (norm[s]==NOT_YET_ASSIGNED) {
+                U64 const end = tmpTotal + (count[s] * rStep);
+                U32 const sStart = (U32)(tmpTotal >> vStepLog);
+                U32 const sEnd = (U32)(end >> vStepLog);
+                U32 const weight = sEnd - sStart;
+                if (weight < 1)
+                    return ERROR(GENERIC);
+                norm[s] = (short)weight;
+                tmpTotal = end;
+    }   }   }
+
+    return 0;
+}
+
+
+size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
+                           const unsigned* count, size_t total,
+                           unsigned maxSymbolValue)
+{   /* Fills normalizedCounter[0..maxSymbolValue] from `count`. @return : the tableLog used, 0 when one symbol accounts for all of `total`, or an error code */
+    /* Sanity checks */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported size */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported size */
+    if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC);   /* Too small tableLog, compression potentially impossible */
+
+    {   static U32 const rtbTable[] = {     0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
+        U64 const scale = 62 - tableLog;
+        U64 const step = ((U64)1<<62) / total;   /* <== here, one division ! */
+        U64 const vStep = 1ULL<<(scale-20);
+        int stillToDistribute = 1<<tableLog;   /* table slots not yet handed out; may end up negative */
+        unsigned s;
+        unsigned largest=0;    /* symbol holding the largest weight so far */
+        short largestP=0;      /* that largest weight */
+        U32 lowThreshold = (U32)(total >> tableLog);
+
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (count[s] == total) return 0;   /* rle special case */
+            if (count[s] == 0) { normalizedCounter[s]=0; continue; }
+            if (count[s] <= lowThreshold) {
+                normalizedCounter[s] = -1;
+                stillToDistribute--;
+            } else {
+                short proba = (short)((count[s]*step) >> scale);
+                if (proba<8) {   /* small weights : round up when the remainder exceeds the per-weight limit from rtbTable */
+                    U64 restToBeat = vStep * rtbTable[proba];
+                    proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
+                }
+                if (proba > largestP) { largestP=proba; largest=s; }
+                normalizedCounter[s] = proba;
+                stillToDistribute -= proba;
+        }   }
+        if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
+            /* corner case, need another normalization method */
+            size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
+            if (FSE_isError(errorCode)) return errorCode;
+        }
+        else normalizedCounter[largest] += (short)stillToDistribute;   /* the largest symbol absorbs the rounding difference */
+    }
+
+#if 0
+    {   /* Print Table (debug) */
+        U32 s;
+        U32 nTotal = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            RAWLOG(2, "%3i: %4i \n", s, normalizedCounter[s]);
+        for (s=0; s<=maxSymbolValue; s++)
+            nTotal += abs(normalizedCounter[s]);
+        if (nTotal != (1U<<tableLog))
+            RAWLOG(2, "Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
+        getchar();
+    }
+#endif
+
+    return tableLog;
+}
+
+
+/* FSE_buildCTable_raw() : fake FSE_CTable, for raw (uncompressed) input; all (1<<nbBits) symbols receive the same deltaNbBits */
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
+{
+    const unsigned nbStates = 1 << nbBits;
+    const unsigned lastSymbol = nbStates - 1;    /* alphabet is 0 .. (1<<nbBits)-1 */
+    void* const base = ct;
+    U16* const headerU16 = (U16*) base;          /* [0] receives nbBits, [1] receives the max symbol value */
+    U16* const stateTable = headerU16 + 2;
+    void* const transforms = ((U32*)base) + 1 /* header */ + (nbStates>>1);   /* assumption : tableLog >= 1 */
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) transforms;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);             /* min size */
+
+    /* header */
+    headerU16[0] = (U16) nbBits;
+    headerU16[1] = (U16) lastSymbol;
+
+    /* Build table : entry s holds nbStates + s */
+    for (s=0; s<nbStates; s++) {
+        stateTable[s] = (U16)(nbStates + s);
+    }
+    /* Build Symbol Transformation Table */
+    {   const U32 sharedDelta = (nbBits << 16) - (1 << nbBits);
+        for (s=0; s<=lastSymbol; s++) {
+            symbolTT[s].deltaFindState = s-1;
+            symbolTT[s].deltaNbBits = sharedDelta;
+    }   }
+
+    return 0;
+}
+
+/* FSE_buildCTable_rle() : fake FSE_CTable, for rle input (always same symbol) */
+size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
+{
+    void* const base = ct;
+    U16* const u16View = (U16*) base;   /* [0] and [1] are the header fields, [2..] the state table */
+    FSE_symbolCompressionTransform* const transforms = (FSE_symbolCompressionTransform*) (void*) ((U32*)base + 2);
+    FSE_symbolCompressionTransform* const entry = transforms + symbolValue;
+
+    /* header : tableLog field is 0, max symbol field is the symbol itself */
+    u16View[0] = (U16) 0;
+    u16View[1] = (U16) symbolValue;
+
+    /* Build table */
+    u16View[2] = 0;
+    u16View[3] = 0;   /* just in case */
+
+    /* Build Symbol Transformation Table : only the entry of `symbolValue` is written */
+    entry->deltaNbBits = 0;
+    entry->deltaFindState = 0;
+
+    return 0;
+}
+
+
+static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct, const unsigned fast)
+{   /* Encodes `src` with table `ct` into `dst`. @return : result of BIT_closeCStream(), or 0 if srcSize <= 2 or the bitstream cannot be initialized */
+    const BYTE* const istart = (const BYTE*) src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip=iend;   /* input is consumed backwards, from the last byte towards istart */
+
+    BIT_CStream_t bitC;
+    FSE_CState_t CState1, CState2;   /* two states, fed alternately */
+
+    /* init */
+    if (srcSize <= 2) return 0;
+    { size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
+      if (FSE_isError(initError)) return 0; /* not enough space available to write a bitstream */ }
+
+#define FSE_FLUSHBITS(s)  (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
+
+    if (srcSize & 1) {   /* odd size : three symbols are consumed here, an even number remains */
+        FSE_initCState2(&CState1, ct, *--ip);
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    } else {
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_initCState2(&CState1, ct, *--ip);
+    }
+
+    /* join to mod 4 */
+    srcSize -= 2;
+    if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) {  /* test bit 2 */
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    /* 2 or 4 encoding per loop */
+    while ( ip>istart ) {
+
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 )   /* this test must be static */
+            FSE_FLUSHBITS(&bitC);
+
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) {  /* this test must be static */
+            FSE_encodeSymbol(&bitC, &CState2, *--ip);
+            FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        }
+
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    FSE_flushCState(&bitC, &CState2);
+    FSE_flushCState(&bitC, &CState1);
+    return BIT_closeCStream(&bitC);
+}
+
+size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct)
+{
+    /* `fast` is passed as a literal on each path, so both variants of the generic routine keep their own call site */
+    if (dstSize < FSE_BLOCKBOUND(srcSize)) {
+        /* dst is below FSE_BLOCKBOUND(srcSize) : use the variant that flushes with BIT_flushBits() */
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
+    }
+    return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
+}
+
+
+size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }   /* function form of the FSE_COMPRESSBOUND() macro */
+
+#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e   /* declares `e` = f, and returns it from the enclosing function when it is an error code */
+#define CHECK_F(f)   { CHECK_V_F(_var_err__, f); }   /* same check, for when the value of f is not needed afterwards */
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be at least `FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)` (checked below), otherwise ERROR(tableLog_tooLarge) is returned.
+ * @return : compressed size; 0 when src is judged not compressible or dst is too small for the payload; 1 when src is a single repeated symbol; or an error code. */
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+
+    U32   count[FSE_MAX_SYMBOL_VALUE+1];
+    S16   norm[FSE_MAX_SYMBOL_VALUE+1];
+    FSE_CTable* CTable = (FSE_CTable*)workSpace;   /* the CTable occupies the start of workSpace */
+    size_t const CTableSize = FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue);   /* NOTE(review): uses the caller's values, before the zero defaults below are applied -- confirm this is intended */
+    void* scratchBuffer = (void*)(CTable + CTableSize);   /* the rest of workSpace, after the CTable */
+    size_t const scratchBufferSize = wkspSize - (CTableSize * sizeof(FSE_CTable));
+
+    /* init conditions */
+    if (wkspSize < FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge);
+    if (srcSize <= 1) return 0;  /* Not compressible */
+    if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(maxCount, HIST_count_wksp(count, &maxSymbolValue, src, srcSize, (unsigned*)scratchBuffer) );
+        if (maxCount == srcSize) return 1;   /* only a single symbol in src : rle */
+        if (maxCount == 1) return 0;         /* each symbol present maximum once => not compressible */
+        if (maxCount < (srcSize >> 7)) return 0;   /* Heuristic : not compressible enough */
+    }
+
+    tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
+    CHECK_F( FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue) );
+
+    /* Write table description header */
+    {   CHECK_V_F(nc_err, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) );
+        op += nc_err;   /* on success, nc_err is the number of header bytes written */
+    }
+
+    /* Compress */
+    CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, scratchBufferSize) );
+    {   CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, src, srcSize, CTable) );
+        if (cSize == 0) return 0;   /* not enough space for compressed data */
+        op += cSize;
+    }
+
+    /* check compressibility */
+    if ( (size_t)(op-ostart) >= srcSize-1 ) return 0;
+
+    return op-ostart;
+}
+
+typedef struct {   /* workspace sized with FSE_MAX_TABLELOG and FSE_MAX_SYMBOL_VALUE; used as the stack workspace of FSE_compress2() */
+    FSE_CTable CTable_max[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
+    BYTE scratchBuffer[1 << FSE_MAX_TABLELOG];
+} fseWkspMax_t;
+
+size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
+{
+    fseWkspMax_t wksp;   /* worst-case workspace, held on the stack */
+    DEBUG_STATIC_ASSERT(sizeof(fseWkspMax_t) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE));   /* compilation failure here means the workspace type is not large enough */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* the fixed workspace is only sized for FSE_MAX_TABLELOG */
+    return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, (void*)&wksp, sizeof(wksp));
+}
+
+size_t FSE_compress (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{   /* simplest entry point : FSE_compress2() with FSE_MAX_SYMBOL_VALUE and FSE_DEFAULT_TABLELOG */
+    return FSE_compress2(dst, dstCapacity, src, srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG);
+}
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/deps/SZ/zstd/compress/hist.c b/deps/SZ/zstd/compress/hist.c
new file mode 100644
index 0000000000000000000000000000000000000000..16524756b8dc987f92883c1c1c560efa513e41f6
--- /dev/null
+++ b/deps/SZ/zstd/compress/hist.c
@@ -0,0 +1,195 @@
+/* ******************************************************************
+   hist : Histogram functions
+   part of Finite State Entropy project
+   Copyright (C) 2013-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* --- dependencies --- */
+#include "mem.h"             /* U32, BYTE, etc. */
+#include "debug.h"           /* assert, DEBUGLOG */
+#include "error_private.h"   /* ERROR */
+#include "hist.h"
+
+
+/* --- Error management --- */
+unsigned HIST_isError(size_t code) { return ERR_isError(code); }   /* forwards to ERR_isError() to tell whether `code` is an error code */
+
+/*-**************************************************************
+ *  Histogram functions
+ ****************************************************************/
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize)
+{
+    const BYTE* const base = (const BYTE*)src;
+    unsigned lastSymbol = *maxSymbolValuePtr;
+    unsigned best = 0;
+    size_t pos;
+
+    memset(count, 0, (lastSymbol+1) * sizeof(*count));   /* clear the caller-declared symbol range */
+    if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }
+
+    for (pos = 0; pos < srcSize; pos++) {
+        assert(base[pos] <= lastSymbol);   /* contract : every symbol is <= *maxSymbolValuePtr */
+        count[base[pos]]++;
+    }
+
+    while (count[lastSymbol] == 0) lastSymbol--;   /* shrink to the largest symbol actually present */
+    *maxSymbolValuePtr = lastSymbol;
+
+    {   unsigned s;
+        for (s = 0; s <= lastSymbol; s++)
+            if (best < count[s]) best = count[s];
+    }
+
+    return best;
+}
+
+
+/* HIST_count_parallel_wksp() :
+ * store histogram into 4 intermediate tables, recombined at the end.
+ * this design makes better use of OoO cpus,
+ * and is noticeably faster when some values are heavily repeated.
+ * But it needs some additional workspace for intermediate tables.
+ * `workSpace` must be a table of at least HIST_WKSP_SIZE_U32 unsigned (4 tables of 256 counters).
+ * @return : largest histogram frequency,
+ *           or an error code (when checkMax is set and a symbol > *maxSymbolValuePtr is present). */
+static size_t HIST_count_parallel_wksp(
+                                unsigned* count, unsigned* maxSymbolValuePtr,
+                                const void* source, size_t sourceSize,
+                                unsigned checkMax,
+                                unsigned* const workSpace)
+{
+    const BYTE* ip = (const BYTE*)source;
+    const BYTE* const iend = ip+sourceSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned max=0;
+    U32* const Counting1 = workSpace;
+    U32* const Counting2 = Counting1 + 256;
+    U32* const Counting3 = Counting2 + 256;
+    U32* const Counting4 = Counting3 + 256;
+
+    memset(workSpace, 0, 4*256*sizeof(unsigned));
+
+    /* safety checks */
+    if (!sourceSize) {
+        memset(count, 0, (maxSymbolValue + 1) * sizeof(*count));   /* size in bytes : clear every counter, not just the first bytes */
+        *maxSymbolValuePtr = 0;
+        return 0;
+    }
+    if (!maxSymbolValue) maxSymbolValue = 255;            /* 0 == default */
+
+    /* by stripes of 16 bytes */
+    if (sourceSize >= 4) {   U32 cached = MEM_read32(ip); ip += 4;   /* guard : never pre-load 4 bytes from a shorter input */
+        while (ip < iend-15) {
+            U32 c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+        }
+        ip-=4;   /* the last pre-loaded word was not counted : hand it over to the byte loop */
+    }
+
+    /* finish last symbols */
+    while (ip<iend) Counting1[*ip++]++;
+
+    if (checkMax) {   /* verify stats will fit into destination table */
+        U32 s; for (s=255; s>maxSymbolValue; s--) {
+            Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
+            if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall);
+    }   }
+
+    {   U32 s;
+        if (maxSymbolValue > 255) maxSymbolValue = 255;
+        for (s=0; s<=maxSymbolValue; s++) {
+            count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
+            if (count[s] > max) max = count[s];
+    }   }
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;   /* report the largest symbol actually present */
+    *maxSymbolValuePtr = maxSymbolValue;
+    return (size_t)max;
+}
+
+/* HIST_countFast_wksp() :
+ * Variant of HIST_countFast() which borrows its scratch memory from the caller.
+ * `workSpace` must hold at least HIST_WKSP_SIZE_U32 unsigned */
+size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                          const void* source, size_t sourceSize,
+                          unsigned* workSpace)
+{
+    if (sourceSize >= 1500)   /* heuristic threshold : only larger inputs take the 4-table path */
+        return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace);
+    return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize);
+}
+
+/* fast variant (unsafe : does not verify that src values stay within count[] limit) */
+size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+                     const void* source, size_t sourceSize)
+{
+    unsigned stackWksp[HIST_WKSP_SIZE_U32];   /* scratch tables live on the stack */
+    return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, stackWksp);
+}
+
+/* HIST_count_wksp() :
+ * Same as HIST_count(), with scratch memory supplied by the caller (`workSpace` >= HIST_WKSP_SIZE_U32 unsigned) */
+size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* source, size_t sourceSize, unsigned* workSpace)
+{
+    if (*maxSymbolValuePtr >= 255) {   /* limit is clamped to 255, then the unchecked fast path is used */
+        *maxSymbolValuePtr = 255;
+        return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace);
+    }
+    return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 1, workSpace);
+}
+
+size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* src, size_t srcSize)
+{
+    unsigned stackWksp[HIST_WKSP_SIZE_U32];   /* scratch tables live on the stack */
+    return HIST_count_wksp(count, maxSymbolValuePtr, src, srcSize, stackWksp);
+}
diff --git a/deps/SZ/zstd/compress/hist.h b/deps/SZ/zstd/compress/hist.h
new file mode 100644
index 0000000000000000000000000000000000000000..788470da7f732c53f6bfe1a361b4224d73a780fa
--- /dev/null
+++ b/deps/SZ/zstd/compress/hist.h
@@ -0,0 +1,92 @@
+/* ******************************************************************
+   hist : Histogram functions
+   part of Finite State Entropy project
+   Copyright (C) 2013-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* --- dependencies --- */
+#include <stddef.h>   /* size_t */
+
+
+/* --- simple histogram functions --- */
+
+/*! HIST_count():
+ *  Stores into table 'count' the number of occurrences of each byte value found in `src`.
+ *  'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1).
+ *  Updates *maxSymbolValuePtr with actual largest symbol value detected.
+ *  @return : count of the most frequent symbol (the symbol itself is not reported),
+ *            or an error code, which can be tested using HIST_isError().
+ *            note : if return == srcSize, there is only one symbol.
+ */
+size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
+                  const void* src, size_t srcSize);
+
+unsigned HIST_isError(size_t code);  /*< tells if a return value is an error code */
+
+
+/* --- advanced histogram functions --- */
+
+#define HIST_WKSP_SIZE_U32 1024   /* workspace size, expressed as a number of unsigned elements */
+/** HIST_count_wksp() :
+ *  Same as HIST_count(), but using an externally provided scratch buffer.
+ *  Benefit is this function will use very little stack space.
+ * `workSpace` must be a table of unsigned of size >= HIST_WKSP_SIZE_U32
+ */
+size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                       const void* src, size_t srcSize,
+                       unsigned* workSpace);
+
+/** HIST_countFast() :
+ *  same as HIST_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr.
+ *  This function is unsafe : a value within `src` which is `> *maxSymbolValuePtr` may be counted past the end of `count`.
+ */
+size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+                      const void* src, size_t srcSize);
+
+/** HIST_countFast_wksp() :
+ *  Same as HIST_countFast() (same lack of range checking), but using an externally provided scratch buffer.
+ * `workSpace` must be a table of unsigned of size >= HIST_WKSP_SIZE_U32
+ */
+size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize,
+                           unsigned* workSpace);
+
+/*! HIST_count_simple() :
+ *  Same as HIST_countFast(), this function is unsafe :
+ *  any value within `src` which is `> *maxSymbolValuePtr` is counted past the end of `count`.
+ *  It is also a bit slower for large inputs.
+ *  However, it does not need any additional memory (not even on stack).
+ * @return : count of the most frequent symbol.
+ *  Note this function doesn't produce any error (i.e. it must succeed).
+ */
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize);
diff --git a/deps/SZ/zstd/compress/huf_compress.c b/deps/SZ/zstd/compress/huf_compress.c
new file mode 100644
index 0000000000000000000000000000000000000000..9cdaa5d796f2e55e9cc8ed6d0556d0c8cab9adfd
--- /dev/null
+++ b/deps/SZ/zstd/compress/huf_compress.c
@@ -0,0 +1,796 @@
+/* ******************************************************************
+   Huffman encoder, part of New Generation Entropy library
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+#include "compiler.h"
+#include "bitstream.h"
+#include "hist.h"
+#define FSE_STATIC_LINKING_ONLY   /* FSE_optimalTableLog_internal */
+#include "fse.h"        /* header compression */
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#include "error_private.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_isError ERR_isError
+#define HUF_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)   /* use only *after* variable declarations */
+#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e
+#define CHECK_F(f)   { CHECK_V_F(_var_err__, f); }
+
+
+/* **************************************************************
+*  Utils
+****************************************************************/
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{   /* delegates to FSE_optimalTableLog_internal(), passing 1 as its last argument */
+    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+}
+
+
+/* *******************************************************
+*  HUF : Huffman block compression
+*********************************************************/
+/* HUF_compressWeights() :
+ * Same as FSE_compress(), but dedicated to huff0's weights compression (the use case needs much less stack memory).
+ * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
+ * @return : nb of bytes written into dst, 0 when not compressible (or dst too small), 1 when a single weight is repeated, or an error code.
+ */
+#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
+size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;   /* write cursor within dst */
+    BYTE* const oend = ostart + dstSize;
+
+    U32 maxSymbolValue = HUF_TABLELOG_MAX;   /* upper limit handed to the histogram, which lowers it to the largest weight present */
+    U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
+
+    FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];
+    BYTE scratchBuffer[1<<MAX_FSE_TABLELOG_FOR_HUFF_HEADER];   /* workspace for FSE_buildCTable_wksp() */
+
+    U32 count[HUF_TABLELOG_MAX+1];   /* histogram of weight values */
+    S16 norm[HUF_TABLELOG_MAX+1];    /* normalized counts, filled by FSE_normalizeCount() */
+
+    /* init conditions */
+    if (wtSize <= 1) return 0;  /* Not compressible */
+
+    /* Scan input and build symbol stats */
+    {   unsigned const maxCount = HIST_count_simple(count, &maxSymbolValue, weightTable, wtSize);   /* never fails */
+        if (maxCount == wtSize) return 1;   /* only a single symbol in src : rle */
+        if (maxCount == 1) return 0;        /* each symbol present maximum once => not compressible */
+    }
+
+    tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
+    CHECK_F( FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue) );
+
+    /* Write table description header */
+    {   CHECK_V_F(hSize, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) );
+        op += hSize;
+    }
+
+    /* Compress */
+    CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, sizeof(scratchBuffer)) );
+    {   CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, weightTable, wtSize, CTable) );
+        if (cSize == 0) return 0;   /* not enough space for compressed data */
+        op += cSize;
+    }
+
+    return op-ostart;   /* header + compressed payload */
+}
+
+
+struct HUF_CElt_s {
+  U16  val;      /* code value emitted for the symbol */
+  BYTE nbBits;   /* number of bits of that code */
+};   /* typedef'd to HUF_CElt within "huf.h" */
+
+/*! HUF_writeCTable() :
+    `CTable` : Huffman tree to save, using huf representation; stored in `dst` either as FSE-compressed weights or as raw 4-bit weights.
+    @return : size of saved CTable (leading header byte included), or an error code */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize,
+                        const HUF_CElt* CTable, U32 maxSymbolValue, U32 huffLog)
+{
+    BYTE bitsToWeight[HUF_TABLELOG_MAX + 1];   /* precomputed conversion table */
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX];      /* one weight per symbol in [0, maxSymbolValue) */
+    BYTE* op = (BYTE*)dst;
+    U32 n;
+
+     /* check conditions */
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+
+    /* convert to weight */
+    bitsToWeight[0] = 0;
+    for (n=1; n<huffLog+1; n++)   /* NOTE(review): huffLog is not checked against HUF_TABLELOG_MAX here - presumably guaranteed by callers */
+        bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
+    for (n=0; n<maxSymbolValue; n++)   /* the entry for maxSymbolValue itself is not converted */
+        huffWeight[n] = bitsToWeight[CTable[n].nbBits];
+
+    /* attempt weights compression by FSE */
+    {   CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, huffWeight, maxSymbolValue) );
+        if ((hSize>1) & (hSize < maxSymbolValue/2)) {   /* FSE compressed */
+            op[0] = (BYTE)hSize;
+            return hSize+1;
+    }   }
+
+    /* write raw values as 4-bits (max : 15) */
+    if (maxSymbolValue > (256-128)) return ERROR(GENERIC);   /* should not happen : likely means source cannot be compressed */
+    if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall);   /* not enough space within dst buffer */
+    op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1));
+    huffWeight[maxSymbolValue] = 0;   /* to be sure it doesn't cause msan issue in final combination */
+    for (n=0; n<maxSymbolValue; n+=2)   /* two weights packed per output byte */
+        op[(n/2)+1] = (BYTE)((huffWeight[n] << 4) + huffWeight[n+1]);
+    return ((maxSymbolValue+1)/2) + 1;
+}
+
+
+size_t HUF_readCTable (HUF_CElt* CTable, U32* maxSymbolValuePtr, const void* src, size_t srcSize)
+{   /* fills CTable from the weights decoded by HUF_readStats(); updates *maxSymbolValuePtr; returns the value reported by HUF_readStats(), or an error code */
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];   /* init not required, even though some static analyzer may complain */
+    U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+
+    /* get symbol weights */
+    CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
+
+    /* check result */
+    if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
+
+    /* Prepare base value per rank */
+    {   U32 n, nextRankStart = 0;
+        for (n=1; n<=tableLog; n++) {
+            U32 current = nextRankStart;
+            nextRankStart += (rankVal[n] << (n-1));
+            rankVal[n] = current;
+    }   }
+
+    /* fill nbBits */
+    {   U32 n; for (n=0; n<nbSymbols; n++) {
+            const U32 w = huffWeight[n];
+            CTable[n].nbBits = (BYTE)(tableLog + 1 - w);   /* weight 0 therefore maps to tableLog+1 */
+    }   }
+
+    /* fill val */
+    {   U16 nbPerRank[HUF_TABLELOG_MAX+2]  = {0};  /* support w=0=>n=tableLog+1 */
+        U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
+        { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; }
+        /* determine starting value per rank */
+        valPerRank[tableLog+1] = 0;   /* for w==0 */
+        {   U16 min = 0;
+            U32 n; for (n=tableLog; n>0; n--) {  /* start at n=tablelog <-> w=1 */
+                valPerRank[n] = min;     /* get starting value within each rank */
+                min += nbPerRank[n];
+                min >>= 1;
+        }   }
+        /* assign value within rank, symbol order */
+        { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; }
+    }
+
+    *maxSymbolValuePtr = nbSymbols - 1;
+    return readSize;
+}
+
+U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue)
+{
+    const HUF_CElt* const cTable = (const HUF_CElt*)symbolTable;   /* symbolTable is really a HUF_CElt array */
+    assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
+    return (cTable + symbolValue)->nbBits;
+}
+
+
+typedef struct nodeElt_s {   /* one node (leaf or internal) of the tree built by HUF_buildCTable_wksp() */
+    U32 count;     /* occurrences of the symbol, or sum of both children for an internal node */
+    U16 parent;    /* index of the parent node within the huffNode table */
+    BYTE byte;     /* symbol value (set by HUF_sort() for leaves) */
+    BYTE nbBits;   /* depth of the node, i.e. code length for a leaf */
+} nodeElt;
+
+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+{   /* limits nbBits of huffNode[0..lastNonNull] to maxNbBits; returns the largest nbBits when already within limit, maxNbBits otherwise */
+    const U32 largestBits = huffNode[lastNonNull].nbBits;
+    if (largestBits <= maxNbBits) return largestBits;   /* early exit : no elt > maxNbBits */
+
+    /* there are several too large elements (at least >= 2) */
+    {   int totalCost = 0;
+        const U32 baseCost = 1 << (largestBits - maxNbBits);
+        U32 n = lastNonNull;
+
+        while (huffNode[n].nbBits > maxNbBits) {
+            totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
+            huffNode[n].nbBits = (BYTE)maxNbBits;
+            n --;
+        }  /* n stops at huffNode[n].nbBits <= maxNbBits */
+        while (huffNode[n].nbBits == maxNbBits) n--;   /* n end at index of smallest symbol using < maxNbBits */
+
+        /* renorm totalCost */
+        totalCost >>= (largestBits - maxNbBits);  /* note : totalCost is necessarily a multiple of baseCost */
+
+        /* repay normalized cost */
+        {   U32 const noSymbol = 0xF0F0F0F0;   /* same byte pattern as the memset() below */
+            U32 rankLast[HUF_TABLELOG_MAX+2];
+            int pos;
+
+            /* Get pos of last (smallest) symbol per rank */
+            memset(rankLast, 0xF0, sizeof(rankLast));
+            {   U32 currentNbBits = maxNbBits;
+                for (pos=n ; pos >= 0; pos--) {
+                    if (huffNode[pos].nbBits >= currentNbBits) continue;
+                    currentNbBits = huffNode[pos].nbBits;   /* < maxNbBits */
+                    rankLast[maxNbBits-currentNbBits] = pos;
+            }   }
+
+            while (totalCost > 0) {
+                U32 nBitsToDecrease = BIT_highbit32(totalCost) + 1;
+                for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
+                    U32 highPos = rankLast[nBitsToDecrease];
+                    U32 lowPos = rankLast[nBitsToDecrease-1];
+                    if (highPos == noSymbol) continue;
+                    if (lowPos == noSymbol) break;
+                    {   U32 const highTotal = huffNode[highPos].count;
+                        U32 const lowTotal = 2 * huffNode[lowPos].count;
+                        if (highTotal <= lowTotal) break;
+                }   }
+                /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) */
+                /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */
+                while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol))
+                    nBitsToDecrease ++;
+                totalCost -= 1 << (nBitsToDecrease-1);
+                if (rankLast[nBitsToDecrease-1] == noSymbol)
+                    rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease];   /* this rank is no longer empty */
+                huffNode[rankLast[nBitsToDecrease]].nbBits ++;
+                if (rankLast[nBitsToDecrease] == 0)    /* special case, reached largest symbol */
+                    rankLast[nBitsToDecrease] = noSymbol;
+                else {
+                    rankLast[nBitsToDecrease]--;
+                    if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
+                        rankLast[nBitsToDecrease] = noSymbol;   /* this rank is now empty */
+            }   }   /* while (totalCost > 0) */
+
+            while (totalCost < 0) {  /* Sometimes, cost correction overshoot */
+                if (rankLast[1] == noSymbol) {  /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
+                    while (huffNode[n].nbBits == maxNbBits) n--;
+                    huffNode[n+1].nbBits--;
+                    rankLast[1] = n+1;
+                    totalCost++;
+                    continue;
+                }
+                huffNode[ rankLast[1] + 1 ].nbBits--;
+                rankLast[1]++;
+                totalCost ++;
+    }   }   }   /* there are several too large elements (at least >= 2) */
+
+    return maxNbBits;
+}
+
+
+typedef struct {   /* per-rank bookkeeping used by HUF_sort() */
+    U32 base;      /* first huffNode position belonging to the rank */
+    U32 current;   /* next free huffNode position within the rank */
+} rankPos;
+
+static void HUF_sort(nodeElt* huffNode, const U32* count, U32 maxSymbolValue)
+{   /* fills huffNode[0..maxSymbolValue] with (count, byte) pairs, bucketed by BIT_highbit32(count+1) and ordered by decreasing count */
+    rankPos rank[32];
+    U32 n;
+
+    memset(rank, 0, sizeof(rank));
+    for (n=0; n<=maxSymbolValue; n++) {   /* pass 1 : nb of symbols per rank */
+        U32 r = BIT_highbit32(count[n] + 1);
+        rank[r].base ++;
+    }
+    for (n=30; n>0; n--) rank[n-1].base += rank[n].base;   /* suffix sums : rank[r+1].base becomes the start position of symbols of rank r */
+    for (n=0; n<32; n++) rank[n].current = rank[n].base;
+    for (n=0; n<=maxSymbolValue; n++) {   /* pass 2 : place each symbol, insertion-sorted within its rank */
+        U32 const c = count[n];
+        U32 const r = BIT_highbit32(c+1) + 1;   /* NOTE(review): would index rank[32] if BIT_highbit32() returned 31 - presumably counts never get that large */
+        U32 pos = rank[r].current++;
+        while ((pos > rank[r].base) && (c > huffNode[pos-1].count)) {
+            huffNode[pos] = huffNode[pos-1];
+            pos--;
+        }
+        huffNode[pos].count = c;
+        huffNode[pos].byte  = (BYTE)n;
+    }
+}
+
+
+/** HUF_buildCTable_wksp() :
+ *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of HUF_CTABLE_WORKSPACE_SIZE_U32 unsigned.
+ */
+#define STARTNODE (HUF_SYMBOLVALUE_MAX+1)
+typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
+size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+{
+    nodeElt* const huffNode0 = (nodeElt*)workSpace;
+    nodeElt* const huffNode = huffNode0+1;   /* huffNode[-1] is the barrier entry huffNode0[0] */
+    U32 n, nonNullRank;
+    int lowS, lowN;
+    U16 nodeNb = STARTNODE;   /* internal nodes are created from index STARTNODE upwards */
+    U32 nodeRoot;
+
+    /* safety checks */
+    if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
+    if (wkspSize < sizeof(huffNodeTable)) return ERROR(workSpace_tooSmall);
+    if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+    memset(huffNode0, 0, sizeof(huffNodeTable));
+
+    /* sort, decreasing order */
+    HUF_sort(huffNode, count, maxSymbolValue);
+
+    /* init for parents */
+    nonNullRank = maxSymbolValue;
+    while(huffNode[nonNullRank].count == 0) nonNullRank--;   /* NOTE(review): assumes at least one non-zero count - confirm with callers */
+    lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb;
+    huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count;   /* NOTE(review): lowS-1 is -1 when a single symbol is present - presumably excluded by callers */
+    huffNode[lowS].parent = huffNode[lowS-1].parent = nodeNb;
+    nodeNb++; lowS-=2;
+    for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = (U32)(1U<<30);
+    huffNode0[0].count = (U32)(1U<<31);  /* fake entry, strong barrier */
+
+    /* create parents */
+    while (nodeNb <= nodeRoot) {
+        U32 n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+        U32 n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+        huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count;
+        huffNode[n1].parent = huffNode[n2].parent = nodeNb;
+        nodeNb++;
+    }
+
+    /* distribute weights (unlimited tree height) */
+    huffNode[nodeRoot].nbBits = 0;
+    for (n=nodeRoot-1; n>=STARTNODE; n--)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+    for (n=0; n<=nonNullRank; n++)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+
+    /* enforce maxTableLog */
+    maxNbBits = HUF_setMaxHeight(huffNode, nonNullRank, maxNbBits);
+
+    /* fill result into tree (val, nbBits) */
+    {   U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
+        U16 valPerRank[HUF_TABLELOG_MAX+1] = {0};
+        if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC);   /* check fit into table */
+        for (n=0; n<=nonNullRank; n++)
+            nbPerRank[huffNode[n].nbBits]++;
+        /* determine starting value per rank */
+        {   U16 min = 0;
+            for (n=maxNbBits; n>0; n--) {
+                valPerRank[n] = min;      /* get starting value within each rank */
+                min += nbPerRank[n];
+                min >>= 1;
+        }   }
+        for (n=0; n<=maxSymbolValue; n++)
+            tree[huffNode[n].byte].nbBits = huffNode[n].nbBits;   /* push nbBits per symbol, symbol order */
+        for (n=0; n<=maxSymbolValue; n++)
+            tree[n].val = valPerRank[tree[n].nbBits]++;   /* assign value within rank, symbol order */
+    }
+
+    return maxNbBits;
+}
+
+/** HUF_buildCTable() :
+ *  Stack-based front end of HUF_buildCTable_wksp() : the node table is a local variable.
+ * @return : maxNbBits
+ *  Note : count is used before tree is written, so they can safely overlap
+ */
+size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits)
+{   huffNodeTable stackNodes;   /* scratch node table, sizeof(huffNodeTable) bytes on the stack */
+    return HUF_buildCTable_wksp(tree, count, maxSymbolValue, maxNbBits, stackNodes, sizeof(stackNodes));
+}
+
+static size_t HUF_estimateCompressedSize(HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
+{
+    size_t totalBits = 0;   /* sum over symbols of (code length x occurrences) */
+    int sym = (int)maxSymbolValue;
+    for ( ; sym >= 0; --sym) {
+        totalBits += CTable[sym].nbBits * count[sym];
+    }
+    return totalBits >> 3;   /* bits -> whole bytes, rounded down */
+}
+
+static int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+  int ok = 1;   /* cleared as soon as a symbol present in `count` has a 0-bit code */
+  int s;
+  for (s = 0; s <= (int)maxSymbolValue; ++s) {
+    ok &= !((count[s] != 0) & (CTable[s].nbBits == 0));
+  }
+  return ok;
+}
+
+size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }   /* function form of the HUF_COMPRESSBOUND() macro */
+
+/* HUF_encodeSymbol() :
+ * Fetches the CTable entry of `symbol` and appends its (val, nbBits) pair
+ * to the bitstream through BIT_addBitsFast(). */
+FORCE_INLINE_TEMPLATE void
+HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
+{
+    const HUF_CElt* const code = CTable + symbol;
+    BIT_addBitsFast(bitCPtr, code->val, code->nbBits);
+}
+
+/* HUF_FLUSHBITS() : forwards to BIT_flushBits() */
+#define HUF_FLUSHBITS(s)  BIT_flushBits(s)
+
+/* HUF_FLUSHBITS_1() : flushes only when the bit container holds fewer than HUF_TABLELOG_MAX*2+7 bits.
+ * Expands to a bare `if` statement : use it as a full statement, never in front of an `else`. */
+#define HUF_FLUSHBITS_1(stream) \
+    if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream)
+
+/* HUF_FLUSHBITS_2() : flushes only when the bit container holds fewer than HUF_TABLELOG_MAX*4+7 bits.
+ * Expands to a bare `if` statement : use it as a full statement, never in front of an `else`. */
+#define HUF_FLUSHBITS_2(stream) \
+    if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream)
+
+/* HUF_compress1X_usingCTable_internal_body() :
+ * Encodes the `srcSize` bytes of `src` into `dst` as a single bitstream, using `CTable`.
+ * Input is consumed from its end toward its start :
+ * first the (srcSize % 4) trailing bytes, then groups of 4 bytes walking backward.
+ * @return : the value returned by BIT_closeCStream(),
+ *           or 0 when dstSize < 8 or when the bitstream cannot be initialized. */
+FORCE_INLINE_TEMPLATE size_t
+HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
+                                   const void* src, size_t srcSize,
+                                   const HUF_CElt* CTable)
+{
+    const BYTE* ip = (const BYTE*) src;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+    size_t n;
+    BIT_CStream_t bitC;
+
+    /* init */
+    if (dstSize < 8) return 0;   /* not enough space to compress */
+    { size_t const initErr = BIT_initCStream(&bitC, op, oend-op);
+      if (HUF_isError(initErr)) return 0; }
+
+    /* encode the 0-3 bytes located past the last multiple of 4 */
+    n = srcSize & ~3;  /* join to mod 4 */
+    switch (srcSize & 3)
+    {
+        case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable);
+                 HUF_FLUSHBITS_2(&bitC);
+		 /* fall-through */
+        case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable);
+                 HUF_FLUSHBITS_1(&bitC);
+		 /* fall-through */
+        case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable);
+                 HUF_FLUSHBITS(&bitC);
+		 /* fall-through */
+        case 0 : /* fall-through */
+        default: break;
+    }
+
+    /* main loop : 4 symbols per iteration, walking backward;
+     * only the last flush of each iteration is unconditional */
+    for (; n>0; n-=4) {  /* note : n&3==0 at this stage */
+        HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
+        HUF_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
+        HUF_FLUSHBITS_2(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 3], CTable);
+        HUF_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 4], CTable);
+        HUF_FLUSHBITS(&bitC);
+    }
+
+    return BIT_closeCStream(&bitC);
+}
+
+#if DYNAMIC_BMI2
+
+/* Instance of the shared body compiled with the "bmi2" target attribute */
+static TARGET_ATTRIBUTE("bmi2") size_t
+HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize,
+                                   const void* src, size_t srcSize,
+                                   const HUF_CElt* CTable)
+{
+    size_t const cSize = HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+    return cSize;
+}
+
+/* Instance of the shared body compiled without any specific target attribute */
+static size_t
+HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
+                                      const void* src, size_t srcSize,
+                                      const HUF_CElt* CTable)
+{
+    size_t const cSize = HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+    return cSize;
+}
+
+/* Runtime dispatch : `bmi2` selects which instance gets called */
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+                              const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, const int bmi2)
+{
+    return bmi2 ?
+           HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable) :
+           HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
+}
+
+#else
+
+/* No dynamic dispatch : `bmi2` is accepted but ignored */
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+                              const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, const int bmi2)
+{
+    size_t const cSize = HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+    (void)bmi2;
+    return cSize;
+}
+
+#endif
+
+/* HUF_compress1X_usingCTable() :
+ * single-stream compression with a ready-made CTable; always passes bmi2 = 0 */
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+    int const bmi2 = 0;
+    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+}
+
+
+/* HUF_compress4X_usingCTable_internal() :
+ * Splits `src` into 4 segments and compresses each one as an independent single stream.
+ * The first 3 segments are (srcSize+3)/4 bytes long, the last one takes the remainder.
+ * Output layout : 6-bytes jump table (3 little-endian 16-bit sizes, one per first 3 streams),
+ * followed by the 4 compressed streams.
+ * @return : total size written into `dst`,
+ *           or 0 when dst or src is too small, or when any stream returns 0,
+ *           or an error code forwarded by CHECK_V_F(). */
+static size_t
+HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+                              const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, int bmi2)
+{
+    size_t const segmentSize = (srcSize+3)/4;   /* first 3 segments */
+    const BYTE* ip = (const BYTE*) src;
+    const BYTE* const iend = ip + srcSize;
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+
+    if (dstSize < 6 + 1 + 1 + 1 + 8) return 0;   /* minimum space to compress successfully */
+    if (srcSize < 12) return 0;   /* no saving possible : too small input */
+    op += 6;   /* jumpTable */
+
+    /* stream 1 : its size goes into jump table bytes 0-1 */
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) );
+        if (cSize==0) return 0;
+        assert(cSize <= 65535);
+        MEM_writeLE16(ostart, (U16)cSize);
+        op += cSize;
+    }
+
+    /* stream 2 : its size goes into jump table bytes 2-3 */
+    ip += segmentSize;
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) );
+        if (cSize==0) return 0;
+        assert(cSize <= 65535);
+        MEM_writeLE16(ostart+2, (U16)cSize);
+        op += cSize;
+    }
+
+    /* stream 3 : its size goes into jump table bytes 4-5 */
+    ip += segmentSize;
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) );
+        if (cSize==0) return 0;
+        assert(cSize <= 65535);
+        MEM_writeLE16(ostart+4, (U16)cSize);
+        op += cSize;
+    }
+
+    /* stream 4 : all remaining input; its size is not stored in the jump table */
+    ip += segmentSize;
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, iend-ip, CTable, bmi2) );
+        if (cSize==0) return 0;
+        op += cSize;
+    }
+
+    return op-ostart;
+}
+
+/* HUF_compress4X_usingCTable() :
+ * 4-streams compression with a ready-made CTable; always passes bmi2 = 0 */
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+    int const bmi2 = 0;
+    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+}
+
+
+/* HUF_compressCTable_internal() :
+ * Compresses `src` at position `op` with `CTable`, as 1 stream or 4 streams depending on `singleStream`.
+ * @return : number of bytes between `ostart` and the end of the compressed data,
+ *           or 0 when the input is not compressible or the total is not smaller than srcSize-1,
+ *           or an error code. */
+static size_t HUF_compressCTable_internal(
+                BYTE* const ostart, BYTE* op, BYTE* const oend,
+                const void* src, size_t srcSize,
+                unsigned singleStream, const HUF_CElt* CTable, const int bmi2)
+{
+    size_t const dstCapacity = oend - op;
+    size_t cSize;
+    if (singleStream) {
+        cSize = HUF_compress1X_usingCTable_internal(op, dstCapacity, src, srcSize, CTable, bmi2);
+    } else {
+        cSize = HUF_compress4X_usingCTable_internal(op, dstCapacity, src, srcSize, CTable, bmi2);
+    }
+    if (HUF_isError(cSize)) return cSize;
+    if (cSize == 0) return 0;   /* uncompressible */
+    {   /* check compressibility : anything written before `op` counts too */
+        size_t const totalSize = (size_t)(op - ostart) + cSize;
+        if (totalSize >= srcSize-1) return 0;
+        return totalSize;
+    }
+}
+
+/* HUF_compress_tables_t :
+ * layout that HUF_compress_internal() imposes on its caller-provided workSpace */
+typedef struct {
+    U32 count[HUF_SYMBOLVALUE_MAX + 1];         /* symbol statistics, filled by HIST_count_wksp() */
+    HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1];   /* table built by HUF_buildCTable_wksp() */
+    huffNodeTable nodeTable;                    /* scratch area handed to HUF_buildCTable_wksp() */
+} HUF_compress_tables_t;
+
+/* HUF_compress_internal() :
+ * Shared implementation behind the HUF_compress1X_* / HUF_compress4X_* entry points.
+ * `workSpace` must be 4-bytes aligned and at least sizeof(HUF_compress_tables_t) bytes large.
+ * `oldHufTable` / `repeat` optionally describe a table kept from a previous call :
+ *  it is re-used when valid and judged beneficial, otherwise a new table is built,
+ *  written into `dst` ahead of the compressed data, and copied into `oldHufTable` when provided.
+ * @return : compressed size (table description included when a new table is written),
+ *           1 when the input is a single repeated symbol (that byte is written at dst[0]),
+ *           0 when nothing gets compressed (empty input, empty dst, or not compressible enough),
+ *           or an error code. */
+static size_t HUF_compress_internal (
+                void* dst, size_t dstSize,
+                const void* src, size_t srcSize,
+                unsigned maxSymbolValue, unsigned huffLog,
+                unsigned singleStream,
+                void* workSpace, size_t wkspSize,
+                HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
+                const int bmi2)
+{
+    HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+
+    /* checks & inits */
+    if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
+    if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall);
+    if (!srcSize) return 0;  /* Uncompressed */
+    if (!dstSize) return 0;  /* cannot fit anything within dst budget */
+    if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);   /* current block size limit */
+    if (huffLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+    if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX;   /* 0 selects the maximum */
+    if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;                /* 0 selects the default */
+
+    /* Heuristic : If old table is valid, use it for small inputs */
+    if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
+        return HUF_compressCTable_internal(ostart, op, oend,
+                                           src, srcSize,
+                                           singleStream, oldHufTable, bmi2);
+    }
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->count) );
+        if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; }   /* single symbol, rle */
+        if (largest <= (srcSize >> 7)+4) return 0;   /* heuristic : probably not compressible enough */
+    }
+
+    /* Check validity of previous table */
+    if ( repeat
+      && *repeat == HUF_repeat_check
+      && !HUF_validateCTable(oldHufTable, table->count, maxSymbolValue)) {
+        *repeat = HUF_repeat_none;
+    }
+    /* Heuristic : use existing table for small inputs */
+    if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
+        return HUF_compressCTable_internal(ostart, op, oend,
+                                           src, srcSize,
+                                           singleStream, oldHufTable, bmi2);
+    }
+
+    /* Build Huffman Tree */
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+    {   CHECK_V_F(maxBits, HUF_buildCTable_wksp(table->CTable, table->count,
+                                                maxSymbolValue, huffLog,
+                                                table->nodeTable, sizeof(table->nodeTable)) );
+        huffLog = (U32)maxBits;
+        /* Zero unused symbols in CTable, so we can check it for validity */
+        memset(table->CTable + (maxSymbolValue + 1), 0,
+               sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt)));
+    }
+
+    /* Write table description header */
+    {   CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table->CTable, maxSymbolValue, huffLog) );
+        /* Check if using previous huffman table is beneficial */
+        if (repeat && *repeat != HUF_repeat_none) {
+            size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue);
+            size_t const newSize = HUF_estimateCompressedSize(table->CTable, table->count, maxSymbolValue);
+            /* `op` was not advanced : the header just written gets overwritten by the compressed data */
+            if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
+                return HUF_compressCTable_internal(ostart, op, oend,
+                                                   src, srcSize,
+                                                   singleStream, oldHufTable, bmi2);
+        }   }
+
+        /* Use the new huffman table */
+        if (hSize + 12ul >= srcSize) { return 0; }
+        op += hSize;
+        if (repeat) { *repeat = HUF_repeat_none; }
+        if (oldHufTable)
+            memcpy(oldHufTable, table->CTable, sizeof(table->CTable));  /* Save new table */
+    }
+    return HUF_compressCTable_internal(ostart, op, oend,
+                                       src, srcSize,
+                                       singleStream, table->CTable, bmi2);
+}
+
+
+/* HUF_compress1X_wksp() :
+ * single-stream compression, using a caller-provided workspace.
+ * No previous table is supplied, and bmi2 is set to 0. */
+size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize)
+{
+    unsigned const singleStream = 1;
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, singleStream,
+                                 workSpace, wkspSize,
+                                 NULL /*oldHufTable*/, NULL /*repeat*/,
+                                 0 /*preferRepeat*/, 0 /*bmi2*/);
+}
+
+/* HUF_compress1X_repeat() :
+ * single-stream compression, using a caller-provided workspace,
+ * with the option to re-use an existing huffman compression table */
+size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize,
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+{
+    unsigned const singleStream = 1;
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, singleStream,
+                                 workSpace, wkspSize,
+                                 hufTable, repeat, preferRepeat, bmi2);
+}
+
+/* HUF_compress1X() :
+ * single-stream compression; the workspace is allocated on the stack */
+size_t HUF_compress1X (void* dst, size_t dstSize,
+                 const void* src, size_t srcSize,
+                 unsigned maxSymbolValue, unsigned huffLog)
+{
+    unsigned stackWksp[HUF_WORKSPACE_SIZE_U32];
+    return HUF_compress1X_wksp(dst, dstSize, src, srcSize,
+                               maxSymbolValue, huffLog,
+                               stackWksp, sizeof(stackWksp));
+}
+
+/* HUF_compress4X_wksp():
+ * compress input using 4 streams.
+ * provide workspace to generate compression tables.
+ * No previous table is supplied, and bmi2 is set to 0. */
+size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize)
+{
+    unsigned const singleStream = 0;   /* 4 streams */
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, singleStream,
+                                 workSpace, wkspSize,
+                                 NULL /*oldHufTable*/, NULL /*repeat*/,
+                                 0 /*preferRepeat*/, 0 /*bmi2*/);
+}
+
+/* HUF_compress4X_repeat():
+ * compress input using 4 streams.
+ * may re-use an existing huffman compression table,
+ * as directed by `repeat` and `preferRepeat` */
+size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize,
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+{
+    unsigned const singleStream = 0;   /* 4 streams */
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, singleStream,
+                                 workSpace, wkspSize,
+                                 hufTable, repeat, preferRepeat, bmi2);
+}
+
+/* HUF_compress2() :
+ * 4-streams compression; the workspace is allocated on the stack */
+size_t HUF_compress2 (void* dst, size_t dstSize,
+                const void* src, size_t srcSize,
+                unsigned maxSymbolValue, unsigned huffLog)
+{
+    unsigned stackWksp[HUF_WORKSPACE_SIZE_U32];
+    return HUF_compress4X_wksp(dst, dstSize, src, srcSize,
+                               maxSymbolValue, huffLog,
+                               stackWksp, sizeof(stackWksp));
+}
+
+/* HUF_compress() :
+ * HUF_compress2() with maxSymbolValue = 255 and the default table log */
+size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    unsigned const maxSymbolValue = 255;
+    unsigned const huffLog = HUF_TABLELOG_DEFAULT;
+    return HUF_compress2(dst, maxDstSize, src, srcSize, maxSymbolValue, huffLog);
+}
diff --git a/deps/SZ/zstd/compress/zstd_compress.c b/deps/SZ/zstd/compress/zstd_compress.c
new file mode 100644
index 0000000000000000000000000000000000000000..7592bf623d581334ae1f01c099d42370bda770ba
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstd_compress.c
@@ -0,0 +1,3900 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>          /* printf */
+#include <string.h>         /* memset */
+#include "cpu.h"
+#include "mem.h"
+#include "hist.h"           /* HIST_countFast_wksp */
+#define FSE_STATIC_LINKING_ONLY   /* FSE_encodeSymbol */
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#include "zstd_compress_internal.h"
+#include "zstd_fast.h"
+#include "zstd_double_fast.h"
+#include "zstd_lazy.h"
+#include "zstd_opt.h"
+#include "zstd_ldm.h"
+
+/* showme() :
+ * Debug helper : prints a fixed greeting on stdout (no trailing newline).
+ * Needs <stdio.h> for the printf() prototype.
+ * Defined with an explicit (void) parameter list so that it is a real prototype. */
+void showme(void)
+{
+    printf(" hello show me.");
+}
+
+/*-*************************************
+*  Helper functions
+***************************************/
+/* ZSTD_compressBound() : function form of the ZSTD_COMPRESSBOUND() macro */
+size_t ZSTD_compressBound(size_t srcSize)
+{
+    size_t const bound = ZSTD_COMPRESSBOUND(srcSize);
+    return bound;
+}
+
+
+/*-*************************************
+*  Context memory management
+***************************************/
+/* struct ZSTD_CDict_s :
+ * concrete definition of the ZSTD_CDict type.
+ * NOTE(review): the functions which fill and read these fields are not visible in this part of the file;
+ * the field roles are only suggested by their names — confirm against the ZSTD_*CDict* functions. */
+struct ZSTD_CDict_s {
+    void* dictBuffer;
+    const void* dictContent;
+    size_t dictContentSize;
+    void* workspace;
+    size_t workspaceSize;
+    ZSTD_matchState_t matchState;
+    ZSTD_compressedBlockState_t cBlockState;
+    ZSTD_compressionParameters cParams;
+    ZSTD_customMem customMem;
+    U32 dictID;
+};  /* typedef'd to ZSTD_CDict within "zstd.h" */
+
+/* ZSTD_createCCtx() :
+ * creates a compression context using the default memory functions (ZSTD_defaultCMem) */
+ZSTD_CCtx* ZSTD_createCCtx(void)
+{
+    ZSTD_CCtx* const created = ZSTD_createCCtx_advanced(ZSTD_defaultCMem);
+    return created;
+}
+
+/* ZSTD_initCCtx() :
+ * zeroes the whole context, then records the memory manager and the bmi2 capability,
+ * and finally resets the parameters (the reset is only checked through assert()) */
+static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager)
+{
+    size_t resetResult;
+    assert(cctx != NULL);
+    memset(cctx, 0, sizeof(*cctx));
+    cctx->customMem = memManager;
+    cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+    resetResult = ZSTD_CCtx_resetParameters(cctx);
+    assert(!ZSTD_isError(resetResult));
+    (void)resetResult;   /* silences the unused warning when assert() is compiled out */
+}
+
+/* ZSTD_createCCtx_advanced() :
+ * allocates and initializes a compression context through `customMem`.
+ * @return : the new context, or NULL when only one of customAlloc / customFree is provided,
+ *           or when the allocation fails */
+ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem)
+{
+    ZSTD_CCtx* cctx;
+    ZSTD_STATIC_ASSERT(zcss_init==0);
+    ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1));
+    /* both functions must be provided, or none of them */
+    if ((customMem.customAlloc == NULL) != (customMem.customFree == NULL)) return NULL;
+    cctx = (ZSTD_CCtx*)ZSTD_malloc(sizeof(ZSTD_CCtx), customMem);
+    if (cctx != NULL) ZSTD_initCCtx(cctx, customMem);
+    return cctx;
+}
+
+/* ZSTD_initStaticCCtx() :
+ * Builds a compression context inside a caller-provided buffer.
+ * The buffer is zeroed; the ZSTD_CCtx structure sits at its start,
+ * and the bytes following it become the context's workSpace.
+ * @return : the context (same address as `workspace`),
+ *           or NULL when the buffer is not 8-bytes aligned or is too small */
+ZSTD_CCtx* ZSTD_initStaticCCtx(void *workspace, size_t workspaceSize)
+{
+    ZSTD_CCtx* const cctx = (ZSTD_CCtx*) workspace;
+    if (workspaceSize <= sizeof(ZSTD_CCtx)) return NULL;  /* minimum size */
+    if ((size_t)workspace & 7) return NULL;  /* must be 8-aligned */
+    memset(workspace, 0, workspaceSize);   /* may be a bit generous, could memset be smaller ? */
+    cctx->staticSize = workspaceSize;   /* non-zero staticSize makes ZSTD_freeCCtx() refuse this context */
+    cctx->workSpace = (void*)(cctx+1);
+    cctx->workSpaceSize = workspaceSize - sizeof(ZSTD_CCtx);
+
+    /* statically sized space. entropyWorkspace never moves (but prev/next block swap places) */
+    if (cctx->workSpaceSize < HUF_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t)) return NULL;
+    assert(((size_t)cctx->workSpace & (sizeof(void*)-1)) == 0);   /* ensure correct alignment */
+    /* workSpace starts with : prevCBlock | nextCBlock | entropyWorkspace */
+    cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)cctx->workSpace;
+    cctx->blockState.nextCBlock = cctx->blockState.prevCBlock + 1;
+    {
+        void* const ptr = cctx->blockState.nextCBlock + 1;
+        cctx->entropyWorkspace = (U32*)ptr;
+    }
+    cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+    return cctx;
+}
+
+/* ZSTD_freeCCtxContent() :
+ * releases what the context owns (workSpace, local CDict, and the MT context when compiled in),
+ * clearing each pointer afterwards. The context structure itself is not freed.
+ * Must not be called on a static context. */
+static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx)
+{
+    assert(cctx != NULL);
+    assert(cctx->staticSize == 0);
+
+    ZSTD_free(cctx->workSpace, cctx->customMem);
+    cctx->workSpace = NULL;
+
+    ZSTD_freeCDict(cctx->cdictLocal);
+    cctx->cdictLocal = NULL;
+
+#ifdef ZSTD_MULTITHREAD
+    ZSTDMT_freeCCtx(cctx->mtctx);
+    cctx->mtctx = NULL;
+#endif
+}
+
+/* ZSTD_freeCCtx() :
+ * frees a context and everything it owns. NULL is accepted.
+ * @return : 0, or a memory_allocation error when the context is a static one */
+size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
+{
+    if (cctx != NULL) {
+        if (cctx->staticSize != 0) {
+            return ERROR(memory_allocation);   /* not compatible with static CCtx */
+        }
+        ZSTD_freeCCtxContent(cctx);
+        ZSTD_free(cctx, cctx->customMem);
+    }
+    return 0;
+}
+
+
+/* ZSTD_sizeof_mtctx() :
+ * size of the multi-threading context; always 0 when ZSTD_MULTITHREAD is not defined */
+static size_t ZSTD_sizeof_mtctx(const ZSTD_CCtx* cctx)
+{
+#ifndef ZSTD_MULTITHREAD
+    (void) cctx;
+    return 0;
+#else
+    return ZSTDMT_sizeof_CCtx(cctx->mtctx);
+#endif
+}
+
+
+/* ZSTD_sizeof_CCtx() :
+ * @return : size of the structure + its workSpace + its local CDict + its MT context.
+ *           0 when cctx is NULL. */
+size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx)
+{
+    size_t total;
+    if (cctx==NULL) return 0;   /* support sizeof on NULL */
+    total  = sizeof(*cctx) + cctx->workSpaceSize;
+    total += ZSTD_sizeof_CDict(cctx->cdictLocal);
+    total += ZSTD_sizeof_mtctx(cctx);
+    return total;
+}
+
+/* ZSTD_sizeof_CStream() :
+ * a CStream and a CCtx are the same object, so this forwards to ZSTD_sizeof_CCtx() */
+size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs)
+{
+    size_t const total = ZSTD_sizeof_CCtx(zcs);
+    return total;
+}
+
+/* ZSTD_getSeqStore() :
+ * private API call, for dictBuilder only.
+ * @return : address of the sequence store embedded into `ctx` */
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx)
+{
+    const seqStore_t* const store = &ctx->seqStore;
+    return store;
+}
+
+/* ZSTD_makeCCtxParamsFromCParams() :
+ * @return : a zeroed ZSTD_CCtx_params holding `cParams`,
+ *           with the default compression level and contentSizeFlag set to 1.
+ *  `cParams` is expected to be valid (only checked through assert()). */
+static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
+        ZSTD_compressionParameters cParams)
+{
+    ZSTD_CCtx_params result;
+    assert(!ZSTD_checkCParams(cParams));
+    memset(&result, 0, sizeof(result));
+    result.cParams = cParams;
+    result.fParams.contentSizeFlag = 1;
+    result.compressionLevel = ZSTD_CLEVEL_DEFAULT;  /* should not matter, as all cParams are presumed properly defined */
+    return result;
+}
+
+/* ZSTD_createCCtxParams_advanced() :
+ * allocates a zero-initialized ZSTD_CCtx_params through `customMem`,
+ * then sets the default compression level and contentSizeFlag = 1.
+ * @return : the new object, or NULL when only one of customAlloc / customFree is provided,
+ *           or when the allocation fails */
+static ZSTD_CCtx_params* ZSTD_createCCtxParams_advanced(
+        ZSTD_customMem customMem)
+{
+    ZSTD_CCtx_params* created;
+    /* both functions must be provided, or none of them */
+    if ((customMem.customAlloc == NULL) != (customMem.customFree == NULL)) return NULL;
+    created = (ZSTD_CCtx_params*)ZSTD_calloc(sizeof(ZSTD_CCtx_params), customMem);
+    if (created != NULL) {
+        created->customMem = customMem;
+        created->compressionLevel = ZSTD_CLEVEL_DEFAULT;
+        created->fParams.contentSizeFlag = 1;
+    }
+    return created;
+}
+
+/* ZSTD_createCCtxParams() :
+ * creates a parameters object using the default memory functions (ZSTD_defaultCMem) */
+ZSTD_CCtx_params* ZSTD_createCCtxParams(void)
+{
+    ZSTD_CCtx_params* const created = ZSTD_createCCtxParams_advanced(ZSTD_defaultCMem);
+    return created;
+}
+
+/* ZSTD_freeCCtxParams() :
+ * releases `params` through its own customMem. NULL is accepted.
+ * @return : always 0 */
+size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params)
+{
+    if (params != NULL) {
+        ZSTD_free(params, params->customMem);
+    }
+    return 0;
+}
+
+/* ZSTD_CCtxParams_reset() :
+ * re-initializes `params` with the default compression level */
+size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params)
+{
+    int const level = ZSTD_CLEVEL_DEFAULT;
+    return ZSTD_CCtxParams_init(params, level);
+}
+
+/* ZSTD_CCtxParams_init() :
+ * zeroes `cctxParams`, then sets the requested compression level and contentSizeFlag = 1.
+ * @return : 0, or a GENERIC error when cctxParams is NULL */
+size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel)
+{
+    if (cctxParams == NULL) return ERROR(GENERIC);
+    memset(cctxParams, 0, sizeof(*cctxParams));
+    cctxParams->fParams.contentSizeFlag = 1;
+    cctxParams->compressionLevel = compressionLevel;
+    return 0;
+}
+
+/* ZSTD_CCtxParams_init_advanced() :
+ * zeroes `cctxParams`, then copies the compression and frame parameters of `params` into it.
+ * @return : 0, or a GENERIC error when cctxParams is NULL,
+ *           or the error reported by ZSTD_checkCParams() (cctxParams is then left untouched) */
+size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
+{
+    if (cctxParams == NULL) return ERROR(GENERIC);
+    CHECK_F( ZSTD_checkCParams(params.cParams) );
+    memset(cctxParams, 0, sizeof(*cctxParams));
+    cctxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT;   /* should not matter, as all cParams are presumed properly defined */
+    cctxParams->fParams = params.fParams;
+    cctxParams->cParams = params.cParams;
+    assert(!ZSTD_checkCParams(params.cParams));
+    return 0;
+}
+
+/* ZSTD_assignParamsToCCtxParams() :
+ * @return : a copy of `cctxParams` in which cParams and fParams come from `params`,
+ *           and the compression level is the default one.
+ * params is presumed valid at this stage */
+static ZSTD_CCtx_params ZSTD_assignParamsToCCtxParams(
+        ZSTD_CCtx_params cctxParams, ZSTD_parameters params)
+{
+    assert(!ZSTD_checkCParams(params.cParams));
+    /* `cctxParams` is received by value : it is already a private copy */
+    cctxParams.cParams = params.cParams;
+    cctxParams.fParams = params.fParams;
+    cctxParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;   /* should not matter, as all cParams are presumed properly defined */
+    return cctxParams;
+}
+
+/* CLAMPCHECK() :
+ * makes the enclosing function return a parameter_outOfBound error
+ * when `val` is outside of [min, max] (both bounds included).
+ * Note : contains a `return`, hence only usable inside functions returning a size_t error code. */
+#define CLAMPCHECK(val,min,max) {            \
+    if (((val)<(min)) | ((val)>(max))) {     \
+        return ERROR(parameter_outOfBound);  \
+}   }
+
+
+/* ZSTD_isUpdateAuthorized() :
+ * @return : 1 when `param` may still be changed once the stream has left its init stage
+ *           (see ZSTD_CCtx_setParameter()), 0 otherwise */
+static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
+{
+    int authorized;
+    switch(param)
+    {
+    /* compression parameters : can be updated */
+    case ZSTD_p_compressionLevel:
+    case ZSTD_p_hashLog:
+    case ZSTD_p_chainLog:
+    case ZSTD_p_searchLog:
+    case ZSTD_p_minMatch:
+    case ZSTD_p_targetLength:
+    case ZSTD_p_compressionStrategy:
+        authorized = 1;
+        break;
+
+    /* everything else : frozen */
+    case ZSTD_p_format:
+    case ZSTD_p_windowLog:
+    case ZSTD_p_contentSizeFlag:
+    case ZSTD_p_checksumFlag:
+    case ZSTD_p_dictIDFlag:
+    case ZSTD_p_forceMaxWindow :
+    case ZSTD_p_nbWorkers:
+    case ZSTD_p_jobSize:
+    case ZSTD_p_overlapSizeLog:
+    case ZSTD_p_enableLongDistanceMatching:
+    case ZSTD_p_ldmHashLog:
+    case ZSTD_p_ldmMinMatch:
+    case ZSTD_p_ldmBucketSizeLog:
+    case ZSTD_p_ldmHashEveryLog:
+    case ZSTD_p_forceAttachDict:
+    default:
+        authorized = 0;
+        break;
+    }
+    return authorized;
+}
+
+/* ZSTD_CCtx_setParameter() :
+ * Checks that `param` may be set on `cctx` in its current state,
+ * then stores it into cctx->requestedParams through ZSTD_CCtxParam_setParameter().
+ * @return : the value returned by ZSTD_CCtxParam_setParameter(), or
+ *           stage_wrong when the stream has left its init stage and `param` cannot be updated,
+ *                       or when a cdict is referenced and `param` is a compression parameter,
+ *           parameter_unsupported for an unknown `param`,
+ *                       or for nbWorkers > 0 on a static context */
+size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned value)
+{
+    DEBUGLOG(4, "ZSTD_CCtx_setParameter (%u, %u)", (U32)param, value);
+    if (cctx->streamStage != zcss_init) {
+        if (!ZSTD_isUpdateAuthorized(param)) return ERROR(stage_wrong);
+        cctx->cParamsChanged = 1;
+    }
+
+    /* first pass : parameter-specific preconditions only */
+    switch(param)
+    {
+    /* incompatible with a referenced cdict */
+    case ZSTD_p_compressionLevel:
+    case ZSTD_p_windowLog:
+    case ZSTD_p_hashLog:
+    case ZSTD_p_chainLog:
+    case ZSTD_p_searchLog:
+    case ZSTD_p_minMatch:
+    case ZSTD_p_targetLength:
+    case ZSTD_p_compressionStrategy:
+    case ZSTD_p_enableLongDistanceMatching:
+    case ZSTD_p_ldmHashLog:
+    case ZSTD_p_ldmMinMatch:
+    case ZSTD_p_ldmBucketSizeLog:
+    case ZSTD_p_ldmHashEveryLog:
+        if (cctx->cdict) return ERROR(stage_wrong);
+        break;
+
+    case ZSTD_p_nbWorkers:
+        if ((value>0) && cctx->staticSize) {
+            return ERROR(parameter_unsupported);  /* MT not compatible with static alloc */
+        }
+        break;
+
+    /* no specific precondition */
+    case ZSTD_p_format :
+    case ZSTD_p_contentSizeFlag:
+    case ZSTD_p_checksumFlag:
+    case ZSTD_p_dictIDFlag:
+    case ZSTD_p_forceMaxWindow :  /* Force back-references to remain < windowSize,
+                                   * even when referencing into Dictionary content.
+                                   * default : 0 when using a CDict, 1 when using a Prefix */
+    case ZSTD_p_forceAttachDict:
+    case ZSTD_p_jobSize:
+    case ZSTD_p_overlapSizeLog:
+        break;
+
+    default: return ERROR(parameter_unsupported);
+    }
+
+    /* second pass : value validation and storage */
+    return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value);
+}
+
+/* ZSTD_CCtxParam_setParameter() :
+ * Validates `value` for `param`, then stores it into `CCtxParams`.
+ * `value` is unsigned : parameters which accept negative values
+ * (compression level, forceAttachDict) cast it back to int to recover the sign.
+ * @return : the value effectively stored (0 when it is negative, since size_t cannot represent it),
+ *           or an error code :
+ *           parameter_outOfBound when `value` is outside of the range accepted by `param`,
+ *           parameter_unsupported for an unknown `param`, an unknown format,
+ *                                 or a multi-threading parameter in a build without ZSTD_MULTITHREAD */
+size_t ZSTD_CCtxParam_setParameter(
+        ZSTD_CCtx_params* CCtxParams, ZSTD_cParameter param, unsigned value)
+{
+    DEBUGLOG(4, "ZSTD_CCtxParam_setParameter (%u, %u)", (U32)param, value);
+    switch(param)
+    {
+    case ZSTD_p_format :
+        if (value > (unsigned)ZSTD_f_zstd1_magicless)
+            return ERROR(parameter_unsupported);
+        CCtxParams->format = (ZSTD_format_e)value;
+        return (size_t)CCtxParams->format;
+
+    case ZSTD_p_compressionLevel : {
+        int cLevel = (int)value;  /* cast expected to restore negative sign */
+        if (cLevel > ZSTD_maxCLevel()) cLevel = ZSTD_maxCLevel();
+        if (cLevel) {  /* 0 : does not change current level */
+            CCtxParams->compressionLevel = cLevel;
+        }
+        if (CCtxParams->compressionLevel >= 0) return CCtxParams->compressionLevel;
+        return 0;  /* return type (size_t) cannot represent negative values */
+    }
+
+    case ZSTD_p_windowLog :
+        if (value>0)   /* 0 => use default */
+            CLAMPCHECK(value, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX);
+        CCtxParams->cParams.windowLog = value;
+        return CCtxParams->cParams.windowLog;
+
+    case ZSTD_p_hashLog :
+        if (value>0)   /* 0 => use default */
+            CLAMPCHECK(value, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
+        CCtxParams->cParams.hashLog = value;
+        return CCtxParams->cParams.hashLog;
+
+    case ZSTD_p_chainLog :
+        if (value>0)   /* 0 => use default */
+            CLAMPCHECK(value, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX);
+        CCtxParams->cParams.chainLog = value;
+        return CCtxParams->cParams.chainLog;
+
+    case ZSTD_p_searchLog :
+        if (value>0)   /* 0 => use default */
+            CLAMPCHECK(value, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
+        CCtxParams->cParams.searchLog = value;
+        return value;
+
+    case ZSTD_p_minMatch :
+        if (value>0)   /* 0 => use default */
+            CLAMPCHECK(value, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX);
+        CCtxParams->cParams.searchLength = value;
+        return CCtxParams->cParams.searchLength;
+
+    case ZSTD_p_targetLength :
+        /* all values are valid. 0 => use default */
+        CCtxParams->cParams.targetLength = value;
+        return CCtxParams->cParams.targetLength;
+
+    case ZSTD_p_compressionStrategy :
+        if (value>0)   /* 0 => use default */
+            CLAMPCHECK(value, (unsigned)ZSTD_fast, (unsigned)ZSTD_btultra);
+        CCtxParams->cParams.strategy = (ZSTD_strategy)value;
+        return (size_t)CCtxParams->cParams.strategy;
+
+    case ZSTD_p_contentSizeFlag :
+        /* Content size written in frame header _when known_ (default:1) */
+        DEBUGLOG(4, "set content size flag = %u", (value>0));
+        CCtxParams->fParams.contentSizeFlag = value > 0;
+        return CCtxParams->fParams.contentSizeFlag;
+
+    case ZSTD_p_checksumFlag :
+        /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */
+        CCtxParams->fParams.checksumFlag = value > 0;
+        return CCtxParams->fParams.checksumFlag;
+
+    case ZSTD_p_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */
+        DEBUGLOG(4, "set dictIDFlag = %u", (value>0));
+        CCtxParams->fParams.noDictIDFlag = !value;
+        return !CCtxParams->fParams.noDictIDFlag;
+
+    case ZSTD_p_forceMaxWindow :
+        CCtxParams->forceWindow = (value > 0);
+        return CCtxParams->forceWindow;
+
+    case ZSTD_p_forceAttachDict : {
+        /* `value` is unsigned : comparing it directly with 0 can never select ZSTD_dictForceCopy.
+         * The sign must be restored first, same as for the compression level. */
+        int const pref = (int)value;  /* cast expected to restore negative sign */
+        CCtxParams->attachDictPref = pref ?
+                                    (pref > 0 ? ZSTD_dictForceAttach : ZSTD_dictForceCopy) :
+                                     ZSTD_dictDefaultAttach;
+        /* NOTE(review): the numeric values of the attach preferences are not visible from this file;
+         * a negative one must not be returned as is, it would be read as an error code. */
+        if ((int)CCtxParams->attachDictPref >= 0) return (size_t)CCtxParams->attachDictPref;
+        return 0;  /* return type (size_t) cannot represent negative values */
+    }
+
+    case ZSTD_p_nbWorkers :
+#ifndef ZSTD_MULTITHREAD
+        if (value>0) return ERROR(parameter_unsupported);
+        return 0;
+#else
+        return ZSTDMT_CCtxParam_setNbWorkers(CCtxParams, value);
+#endif
+
+    case ZSTD_p_jobSize :
+#ifndef ZSTD_MULTITHREAD
+        return ERROR(parameter_unsupported);
+#else
+        return ZSTDMT_CCtxParam_setMTCtxParameter(CCtxParams, ZSTDMT_p_jobSize, value);
+#endif
+
+    case ZSTD_p_overlapSizeLog :
+#ifndef ZSTD_MULTITHREAD
+        return ERROR(parameter_unsupported);
+#else
+        return ZSTDMT_CCtxParam_setMTCtxParameter(CCtxParams, ZSTDMT_p_overlapSectionLog, value);
+#endif
+
+    case ZSTD_p_enableLongDistanceMatching :
+        CCtxParams->ldmParams.enableLdm = (value>0);
+        return CCtxParams->ldmParams.enableLdm;
+
+    case ZSTD_p_ldmHashLog :
+        if (value>0)   /* 0 ==> auto */
+            CLAMPCHECK(value, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
+        CCtxParams->ldmParams.hashLog = value;
+        return CCtxParams->ldmParams.hashLog;
+
+    case ZSTD_p_ldmMinMatch :
+        if (value>0)   /* 0 ==> default */
+            CLAMPCHECK(value, ZSTD_LDM_MINMATCH_MIN, ZSTD_LDM_MINMATCH_MAX);
+        CCtxParams->ldmParams.minMatchLength = value;
+        return CCtxParams->ldmParams.minMatchLength;
+
+    case ZSTD_p_ldmBucketSizeLog :
+        if (value > ZSTD_LDM_BUCKETSIZELOG_MAX)
+            return ERROR(parameter_outOfBound);
+        CCtxParams->ldmParams.bucketSizeLog = value;
+        return CCtxParams->ldmParams.bucketSizeLog;
+
+    case ZSTD_p_ldmHashEveryLog :
+        if (value > ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
+            return ERROR(parameter_outOfBound);
+        CCtxParams->ldmParams.hashEveryLog = value;
+        return CCtxParams->ldmParams.hashEveryLog;
+
+    default: return ERROR(parameter_unsupported);
+    }
+}
+
+/* ZSTD_CCtx_getParameter() :
+ * reads `param` from the requested parameters of `cctx` into `*value` */
+size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned* value)
+{
+    ZSTD_CCtx_params* const requested = &cctx->requestedParams;
+    return ZSTD_CCtxParam_getParameter(requested, param, value);
+}
+
+/* ZSTD_CCtxParam_getParameter() :
+ * Reads the value currently stored for `param` in `CCtxParams`
+ * and writes it into `*value`.
+ * `*value` is only written on success.
+ * @return : 0 on success,
+ *           ERROR(parameter_unsupported) if `param` is not handled here, or if
+ *           it is jobSize / overlapSizeLog and ZSTD_MULTITHREAD is not defined. */
+size_t ZSTD_CCtxParam_getParameter(
+        ZSTD_CCtx_params* CCtxParams, ZSTD_cParameter param, unsigned* value)
+{
+    unsigned stored;   /* committed to *value once the switch succeeded */
+
+    switch(param)
+    {
+    /* format and level */
+    case ZSTD_p_format :              stored = CCtxParams->format; break;
+    case ZSTD_p_compressionLevel :    stored = CCtxParams->compressionLevel; break;
+
+    /* compression parameters */
+    case ZSTD_p_windowLog :           stored = CCtxParams->cParams.windowLog; break;
+    case ZSTD_p_hashLog :             stored = CCtxParams->cParams.hashLog; break;
+    case ZSTD_p_chainLog :            stored = CCtxParams->cParams.chainLog; break;
+    case ZSTD_p_searchLog :           stored = CCtxParams->cParams.searchLog; break;
+    case ZSTD_p_minMatch :            stored = CCtxParams->cParams.searchLength; break;
+    case ZSTD_p_targetLength :        stored = CCtxParams->cParams.targetLength; break;
+    case ZSTD_p_compressionStrategy : stored = (unsigned)CCtxParams->cParams.strategy; break;
+
+    /* frame parameters */
+    case ZSTD_p_contentSizeFlag :     stored = CCtxParams->fParams.contentSizeFlag; break;
+    case ZSTD_p_checksumFlag :        stored = CCtxParams->fParams.checksumFlag; break;
+    case ZSTD_p_dictIDFlag :          stored = !CCtxParams->fParams.noDictIDFlag; break;   /* stored inverted */
+
+    case ZSTD_p_forceMaxWindow :      stored = CCtxParams->forceWindow; break;
+    case ZSTD_p_forceAttachDict :     stored = CCtxParams->attachDictPref; break;
+
+    /* multi-threading parameters */
+    case ZSTD_p_nbWorkers :
+#ifndef ZSTD_MULTITHREAD
+        assert(CCtxParams->nbWorkers == 0);
+#endif
+        stored = CCtxParams->nbWorkers;
+        break;
+    case ZSTD_p_jobSize :
+#ifdef ZSTD_MULTITHREAD
+        stored = CCtxParams->jobSize;
+        break;
+#else
+        return ERROR(parameter_unsupported);
+#endif
+    case ZSTD_p_overlapSizeLog :
+#ifdef ZSTD_MULTITHREAD
+        stored = CCtxParams->overlapSizeLog;
+        break;
+#else
+        return ERROR(parameter_unsupported);
+#endif
+
+    /* long distance matching parameters */
+    case ZSTD_p_enableLongDistanceMatching : stored = CCtxParams->ldmParams.enableLdm; break;
+    case ZSTD_p_ldmHashLog :          stored = CCtxParams->ldmParams.hashLog; break;
+    case ZSTD_p_ldmMinMatch :         stored = CCtxParams->ldmParams.minMatchLength; break;
+    case ZSTD_p_ldmBucketSizeLog :    stored = CCtxParams->ldmParams.bucketSizeLog; break;
+    case ZSTD_p_ldmHashEveryLog :     stored = CCtxParams->ldmParams.hashEveryLog; break;
+
+    default: return ERROR(parameter_unsupported);
+    }
+
+    *value = stored;
+    return 0;
+}
+
+/** ZSTD_CCtx_setParametersUsingCCtxParams() :
+ *  Stores a copy of `params` as the requested parameters of `cctx`.
+ *  Nothing else is done here : the values are merely recorded
+ *  into cctx->requestedParams.
+ *  @return : 0 on success,
+ *            ERROR(stage_wrong) if the stream stage is not zcss_init,
+ *            or if `cctx` currently references a cdict.
+ */
+size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+        ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params)
+{
+    int const initStage = (cctx->streamStage == zcss_init);
+    DEBUGLOG(4, "ZSTD_CCtx_setParametersUsingCCtxParams");
+    if (!initStage || cctx->cdict) return ERROR(stage_wrong);
+
+    cctx->requestedParams = *params;
+    return 0;
+}
+
+/* ZSTD_CCtx_setPledgedSrcSize() :
+ * Records the announced source size into `cctx`.
+ * The value is stored incremented by one (cctx->pledgedSrcSizePlusOne).
+ * @return : 0 on success,
+ *           ERROR(stage_wrong) if the stream stage is not zcss_init. */
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
+{
+    int const initStage = (cctx->streamStage == zcss_init);
+    DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize);
+    if (!initStage) return ERROR(stage_wrong);
+    cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+    return 0;
+}
+
+/* ZSTD_CCtx_loadDictionary_advanced() :
+ * Replaces the dictionary owned by `cctx` (cctx->cdictLocal).
+ * Any previously owned CDict is freed first.
+ * When `dict` is NULL or `dictSize` is 0, `cctx` is left without dictionary.
+ * Otherwise, a new CDict is created from `dict`, using compression parameters
+ * derived from the requested parameters, the pledged source size and `dictSize`.
+ * @return : 0 on success,
+ *           ERROR(stage_wrong) if the stream stage is not zcss_init,
+ *           ERROR(memory_allocation) for a static CCtx, or if CDict creation failed. */
+size_t ZSTD_CCtx_loadDictionary_advanced(
+        ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
+        ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+{
+    if (cctx->streamStage != zcss_init) return ERROR(stage_wrong);
+    if (cctx->staticSize) return ERROR(memory_allocation);  /* no malloc for static CCtx */
+    DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
+
+    /* drop the dictionary currently owned, if any */
+    ZSTD_freeCDict(cctx->cdictLocal);
+    cctx->cdictLocal = NULL;
+    cctx->cdict = NULL;
+
+    if (dict==NULL || dictSize==0) return 0;   /* no dictionary mode */
+
+    {   ZSTD_compressionParameters const cParams =
+                ZSTD_getCParamsFromCCtxParams(&cctx->requestedParams, cctx->pledgedSrcSizePlusOne-1, dictSize);
+        ZSTD_CDict* const created = ZSTD_createCDict_advanced(
+                                dict, dictSize,
+                                dictLoadMethod, dictContentType,
+                                cParams, cctx->customMem);
+        cctx->cdictLocal = created;
+        cctx->cdict = created;
+        if (created == NULL)
+            return ERROR(memory_allocation);
+    }
+    return 0;
+}
+
+/* ZSTD_CCtx_loadDictionary_byReference() :
+ * Shortcut for ZSTD_CCtx_loadDictionary_advanced(),
+ * using load method ZSTD_dlm_byRef and content type ZSTD_dct_auto. */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(
+      ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+{
+    size_t const result = ZSTD_CCtx_loadDictionary_advanced(
+                                cctx, dict, dictSize,
+                                ZSTD_dlm_byRef, ZSTD_dct_auto);
+    return result;
+}
+
+/* ZSTD_CCtx_loadDictionary() :
+ * Shortcut for ZSTD_CCtx_loadDictionary_advanced(),
+ * using load method ZSTD_dlm_byCopy and content type ZSTD_dct_auto. */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+{
+    size_t const result = ZSTD_CCtx_loadDictionary_advanced(
+                                cctx, dict, dictSize,
+                                ZSTD_dlm_byCopy, ZSTD_dct_auto);
+    return result;
+}
+
+
+/* ZSTD_CCtx_refCDict() :
+ * Makes `cctx` reference `cdict`, and erases any prefix previously recorded
+ * in cctx->prefixDict.
+ * @return : 0 on success,
+ *           ERROR(stage_wrong) if the stream stage is not zcss_init. */
+size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+    if (cctx->streamStage != zcss_init) {
+        return ERROR(stage_wrong);
+    }
+    memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict));  /* exclusive */
+    cctx->cdict = cdict;
+    return 0;
+}
+
+/* ZSTD_CCtx_refPrefix() :
+ * Shortcut for ZSTD_CCtx_refPrefix_advanced(),
+ * with content type ZSTD_dct_rawContent. */
+size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize)
+{
+    size_t const result = ZSTD_CCtx_refPrefix_advanced(cctx, prefix, prefixSize,
+                                                       ZSTD_dct_rawContent);
+    return result;
+}
+
+/* ZSTD_CCtx_refPrefix_advanced() :
+ * Records `prefix` (pointer, size and content type) into cctx->prefixDict,
+ * and clears the cdict reference of `cctx`.
+ * The prefix content is not copied : only the pointer is stored.
+ * @return : 0 on success,
+ *           ERROR(stage_wrong) if the stream stage is not zcss_init. */
+size_t ZSTD_CCtx_refPrefix_advanced(
+        ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType)
+{
+    if (cctx->streamStage != zcss_init) {
+        return ERROR(stage_wrong);
+    }
+    cctx->prefixDict.dictContentType = dictContentType;
+    cctx->prefixDict.dictSize = prefixSize;
+    cctx->prefixDict.dict = prefix;
+    cctx->cdict = NULL;   /* prefix discards any prior cdict */
+    return 0;
+}
+
+/*! ZSTD_CCtx_reset() :
+ *  Brings `cctx` back to the initial stream stage (zcss_init)
+ *  and forgets the pledged source size.
+ *  Note : no dictionary field is modified by this function. */
+void ZSTD_CCtx_reset(ZSTD_CCtx* cctx)
+{
+    cctx->pledgedSrcSizePlusOne = 0;
+    cctx->streamStage = zcss_init;
+}
+
+/* ZSTD_CCtx_resetParameters() :
+ * Clears the cdict reference of `cctx`,
+ * then resets its requested parameters through ZSTD_CCtxParams_reset().
+ * @return : the result of ZSTD_CCtxParams_reset(),
+ *           or ERROR(stage_wrong) if the stream stage is not zcss_init. */
+size_t ZSTD_CCtx_resetParameters(ZSTD_CCtx* cctx)
+{
+    int const initStage = (cctx->streamStage == zcss_init);
+    if (!initStage) return ERROR(stage_wrong);
+    cctx->cdict = NULL;
+    {   size_t const result = ZSTD_CCtxParams_reset(&cctx->requestedParams);
+        return result;
+    }
+}
+
+/** ZSTD_checkCParams() :
+ *  Verifies that windowLog, chainLog, hashLog, searchLog and searchLength
+ *  stay within their respective [MIN, MAX] bounds,
+ *  and that strategy does not exceed ZSTD_btultra.
+ *  @return : 0 if all fields are acceptable,
+ *            otherwise an error code (ERROR(parameter_unsupported) for strategy). */
+size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+{
+    U32 const strategyID = (U32)(cParams.strategy);
+
+    /* range checks : CLAMPCHECK exits the function on failure */
+    CLAMPCHECK(cParams.windowLog, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX);
+    CLAMPCHECK(cParams.chainLog, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX);
+    CLAMPCHECK(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
+    CLAMPCHECK(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
+    CLAMPCHECK(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX);
+
+    if (strategyID > (U32)ZSTD_btultra) {
+        return ERROR(parameter_unsupported);
+    }
+    return 0;
+}
+
+/** ZSTD_clampCParams() :
+ *  Forces windowLog, chainLog, hashLog, searchLog, searchLength and strategy
+ *  into their valid range : a value below the minimum becomes the minimum,
+ *  a value above the maximum becomes the maximum.
+ *  `cParams` is received by value : the caller's copy is not modified.
+ *  Note : targetLength is not touched.
+ *  @return : the clamped parameters */
+static ZSTD_compressionParameters
+ZSTD_clampCParams(ZSTD_compressionParameters cParams)
+{
+    /* CLAMP() : saturates `val` in place into [min, max].
+     * Note : the macro is not #undef'd, so it stays defined for the rest of the file.
+     *        Arguments are evaluated several times : pass side-effect free expressions only. */
+#   define CLAMP(val,min,max) {      \
+        if (val<min) val=min;        \
+        else if (val>max) val=max;   \
+    }
+    CLAMP(cParams.windowLog, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX);
+    CLAMP(cParams.chainLog, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX);
+    CLAMP(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
+    CLAMP(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
+    CLAMP(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX);
+    CLAMP(cParams.strategy, ZSTD_fast, ZSTD_btultra);
+    return cParams;
+}
+
+/** ZSTD_cycleLog() :
+ *  @return : `hashLog` - 1 when `strat` is ZSTD_btlazy2 or any higher strategy,
+ *            `hashLog` unchanged otherwise.
+ *  condition for correct operation : hashLog > 1 */
+static U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
+{
+    if ((U32)strat >= (U32)ZSTD_btlazy2) {
+        return hashLog - 1;
+    }
+    return hashLog;
+}
+
+/** ZSTD_adjustCParams_internal() :
+ *  Tunes `cPar` for a given input, described by `srcSize` and `dictSize`
+ *  (both optional : use 0 when unknown).
+ *  windowLog, hashLog and chainLog can only be reduced here, in order to save
+ *  memory and initialization time; the sole exception is windowLog,
+ *  which is raised back to ZSTD_WINDOWLOG_ABSOLUTEMIN if it ends up below.
+ *  Note : cPar is assumed validated. Use ZSTD_checkCParams() to ensure this condition.
+ *  @return : the adjusted parameters */
+static ZSTD_compressionParameters
+ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
+                            unsigned long long srcSize,
+                            size_t dictSize)
+{
+    static const U64 minSrcSize = 513; /* (1<<9) + 1 */
+    static const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
+    assert(ZSTD_checkCParams(cPar)==0);
+
+    /* settle on a working srcSize */
+    if (srcSize+1 < 2) {   /* srcSize is either 0 or the all-ones value : unknown */
+        if (dictSize)
+            srcSize = minSrcSize;  /* presumed small when there is a dictionary */
+        else if (srcSize == 0)
+            srcSize = ZSTD_CONTENTSIZE_UNKNOWN;  /* 0 == unknown : presumed large */
+    }
+
+    /* shrink windowLog when src + dict are small enough, to use less memory */
+    if ( (srcSize < maxWindowResize)
+      && (dictSize < maxWindowResize) )  {
+        static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN;
+        U32 const totalSize = (U32)(srcSize + dictSize);
+        U32 srcLog = ZSTD_HASHLOG_MIN;
+        if (totalSize >= hashSizeMin)
+            srcLog = ZSTD_highbit32(totalSize-1) + 1;
+        if (cPar.windowLog > srcLog) cPar.windowLog = srcLog;
+    }
+
+    /* hashLog is capped to windowLog+1 */
+    {   U32 const hashLogCap = cPar.windowLog + 1;
+        if (cPar.hashLog > hashLogCap) cPar.hashLog = hashLogCap;
+    }
+
+    /* chainLog is lowered until its cycleLog fits within windowLog */
+    {   U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy);
+        if (cycleLog > cPar.windowLog) {
+            U32 const excess = cycleLog - cPar.windowLog;
+            cPar.chainLog -= excess;
+        }
+    }
+
+    if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
+        cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN;  /* required for frame header */
+
+    return cPar;
+}
+
+/* ZSTD_adjustCParams() :
+ * Same as ZSTD_adjustCParams_internal(), except that `cPar` does not need
+ * to be validated beforehand : it is first clamped into valid range
+ * by ZSTD_clampCParams(). */
+ZSTD_compressionParameters
+ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
+                   unsigned long long srcSize,
+                   size_t dictSize)
+{
+    ZSTD_compressionParameters const clamped = ZSTD_clampCParams(cPar);
+    return ZSTD_adjustCParams_internal(clamped, srcSize, dictSize);
+}
+
+/* ZSTD_getCParamsFromCCtxParams() :
+ * Builds the compression parameters effectively selected by `CCtxParams` :
+ * - starts from ZSTD_getCParams() for the stored compression level,
+ * - sets windowLog to ZSTD_LDM_DEFAULT_WINDOW_LOG when long distance matching is enabled,
+ * - then, any non-zero field of CCtxParams->cParams overrides the corresponding value,
+ * - finally, the result goes through ZSTD_adjustCParams_internal().
+ * @return : the resulting compression parameters */
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+        const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize)
+{
+    const ZSTD_compressionParameters* const forced = &CCtxParams->cParams;
+    ZSTD_compressionParameters cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize);
+
+    if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
+
+    /* a field left at 0 means "not set" : the level-derived value is kept */
+    if (forced->windowLog)    cParams.windowLog    = forced->windowLog;
+    if (forced->hashLog)      cParams.hashLog      = forced->hashLog;
+    if (forced->chainLog)     cParams.chainLog     = forced->chainLog;
+    if (forced->searchLog)    cParams.searchLog    = forced->searchLog;
+    if (forced->searchLength) cParams.searchLength = forced->searchLength;
+    if (forced->targetLength) cParams.targetLength = forced->targetLength;
+    if (forced->strategy)     cParams.strategy     = forced->strategy;
+
+    assert(!ZSTD_checkCParams(cParams));
+    return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize);
+}
+
+/* ZSTD_sizeof_matchState() :
+ * @return : nb of bytes needed by the match-finder tables
+ *           (hash table, chain table, 3-bytes hash table) for `cParams`,
+ *           plus the optimal parser buffers when `forCCtx` is set
+ *           and strategy is ZSTD_btopt or ZSTD_btultra.
+ * Note : there is no chain table for ZSTD_fast,
+ *        and hashLog3 is 0 unless `forCCtx` is set and searchLength==3. */
+static size_t
+ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
+                       const U32 forCCtx)
+{
+    /* tables dimensions, expressed in nb of U32 cells */
+    size_t const hSize = ((size_t)1) << cParams->hashLog;
+    size_t const chainSize = (cParams->strategy != ZSTD_fast) ? ((size_t)1 << cParams->chainLog) : 0;
+    U32    const hashLog3 = (forCCtx && cParams->searchLength==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
+    size_t const h3Size = ((size_t)1) << hashLog3;
+    size_t const tableSpace = (hSize + chainSize + h3Size) * sizeof(U32);
+
+    /* optimal parser : frequency tables + match and price tables */
+    size_t const optFreqSpace = ((1<<Litbits) + (MaxLL+1) + (MaxML+1) + (MaxOff+1)) * sizeof(U32);
+    size_t const optTableSpace = (ZSTD_OPT_NUM+1) * (sizeof(ZSTD_match_t)+sizeof(ZSTD_optimal_t));
+    int    const isOptStrategy = (cParams->strategy == ZSTD_btopt)
+                              || (cParams->strategy == ZSTD_btultra);
+    size_t const optSpace = (forCCtx && isOptStrategy) ? (optFreqSpace + optTableSpace) : 0;
+
+    DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u",
+                (U32)chainSize, (U32)hSize, (U32)h3Size);
+    return tableSpace + optSpace;
+}
+
+/* ZSTD_estimateCCtxSize_usingCCtxParams() :
+ * @return : sizeof(ZSTD_CCtx) + the workspace needed to compress with `params`
+ *           (entropy workspace, 2 block states, sequences and literals storage,
+ *           match state tables, ldm table and ldm sequences),
+ *           or ERROR(GENERIC) if `params` requests worker threads.
+ * Note : compression parameters are resolved with unknown srcSize and no dictionary. */
+size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+{
+    /* Estimate CCtx size is supported for single-threaded compression only. */
+    if (params->nbWorkers > 0) { return ERROR(GENERIC); }
+
+    {   ZSTD_compressionParameters const cParams =
+                ZSTD_getCParamsFromCCtxParams(params, 0, 0);
+        size_t const windowSize = (size_t)1 << cParams.windowLog;
+        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+        U32    const divider = (cParams.searchLength==3) ? 3 : 4;
+        size_t const maxNbSeq = blockSize / divider;
+
+        /* individual budgets */
+        size_t const entropySpace = HUF_WORKSPACE_SIZE;
+        size_t const blockStateSpace = 2 * sizeof(ZSTD_compressedBlockState_t);
+        size_t const tokenSpace = blockSize + 11*maxNbSeq;
+        size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 1);
+        size_t const ldmSpace = ZSTD_ldm_getTableSize(params->ldmParams);
+        size_t const ldmSeqSpace = ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize) * sizeof(rawSeq);
+
+        size_t const neededSpace = entropySpace + blockStateSpace
+                                 + tokenSpace + matchStateSize
+                                 + ldmSpace + ldmSeqSpace;
+
+        DEBUGLOG(5, "sizeof(ZSTD_CCtx) : %u", (U32)sizeof(ZSTD_CCtx));
+        DEBUGLOG(5, "estimate workSpace : %u", (U32)neededSpace);
+        return sizeof(ZSTD_CCtx) + neededSpace;
+    }
+}
+
+/* ZSTD_estimateCCtxSize_usingCParams() :
+ * Wraps `cParams` into a ZSTD_CCtx_params (ZSTD_makeCCtxParamsFromCParams()),
+ * then defers to ZSTD_estimateCCtxSize_usingCCtxParams(). */
+size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
+{
+    ZSTD_CCtx_params const wrapped = ZSTD_makeCCtxParamsFromCParams(cParams);
+    size_t const estimate = ZSTD_estimateCCtxSize_usingCCtxParams(&wrapped);
+    return estimate;
+}
+
+/* ZSTD_estimateCCtxSize_internal() :
+ * CCtx size estimate for one specific `compressionLevel`,
+ * using the parameters ZSTD_getCParams() gives for unknown srcSize and no dictionary. */
+static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel)
+{
+    ZSTD_compressionParameters const levelParams = ZSTD_getCParams(compressionLevel, 0, 0);
+    size_t const estimate = ZSTD_estimateCCtxSize_usingCParams(levelParams);
+    return estimate;
+}
+
+/* ZSTD_estimateCCtxSize() :
+ * @return : the largest CCtx size estimate among all levels
+ *           from MIN(compressionLevel, 1) up to `compressionLevel` included.
+ * Note : the scan starts at MIN(compressionLevel, 1), so that
+ *        a `compressionLevel` <= 0 still gets evaluated.
+ *        Starting unconditionally at level 1 would skip the loop entirely
+ *        for such levels, and report a memory budget of 0. */
+size_t ZSTD_estimateCCtxSize(int compressionLevel)
+{
+    int level;
+    size_t memBudget = 0;
+    for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) {
+        size_t const newMB = ZSTD_estimateCCtxSize_internal(level);
+        if (newMB > memBudget) memBudget = newMB;
+    }
+    return memBudget;
+}
+
+/* ZSTD_estimateCStreamSize_usingCCtxParams() :
+ * @return : the CCtx size estimate for `params`, plus the streaming buffers :
+ *           input buffer (window size + block size)
+ *           and output buffer (ZSTD_compressBound(blockSize) + 1),
+ *           or ERROR(GENERIC) if `params` requests worker threads.
+ * Note : windowLog is taken from ZSTD_getCParamsFromCCtxParams(),
+ *        like ZSTD_estimateCCtxSize_usingCCtxParams() does, and not directly from
+ *        params->cParams : that field is 0 when `params` only carries
+ *        a compression level, which would size the buffers for a 1-byte window. */
+size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+{
+    if (params->nbWorkers > 0) { return ERROR(GENERIC); }
+    {   ZSTD_compressionParameters const cParams =
+                ZSTD_getCParamsFromCCtxParams(params, 0, 0);
+        size_t const CCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(params);
+        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
+        size_t const inBuffSize = ((size_t)1 << cParams.windowLog) + blockSize;
+        size_t const outBuffSize = ZSTD_compressBound(blockSize) + 1;
+        size_t const streamingSize = inBuffSize + outBuffSize;
+
+        return CCtxSize + streamingSize;
+    }
+}
+
+/* ZSTD_estimateCStreamSize_usingCParams() :
+ * Wraps `cParams` into a ZSTD_CCtx_params (ZSTD_makeCCtxParamsFromCParams()),
+ * then defers to ZSTD_estimateCStreamSize_usingCCtxParams(). */
+size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams)
+{
+    ZSTD_CCtx_params const wrapped = ZSTD_makeCCtxParamsFromCParams(cParams);
+    size_t const estimate = ZSTD_estimateCStreamSize_usingCCtxParams(&wrapped);
+    return estimate;
+}
+
+/* ZSTD_estimateCStreamSize_internal() :
+ * CStream size estimate for one specific `compressionLevel`,
+ * using the parameters ZSTD_getCParams() gives for unknown srcSize and no dictionary. */
+static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel)
+{
+    ZSTD_compressionParameters const levelParams = ZSTD_getCParams(compressionLevel, 0, 0);
+    size_t const estimate = ZSTD_estimateCStreamSize_usingCParams(levelParams);
+    return estimate;
+}
+
+/* ZSTD_estimateCStreamSize() :
+ * @return : the largest CStream size estimate among all levels
+ *           from MIN(compressionLevel, 1) up to `compressionLevel` included.
+ * Note : the scan starts at MIN(compressionLevel, 1), so that
+ *        a `compressionLevel` <= 0 still gets evaluated.
+ *        Starting unconditionally at level 1 would skip the loop entirely
+ *        for such levels, and report a memory budget of 0. */
+size_t ZSTD_estimateCStreamSize(int compressionLevel)
+{
+    int level;
+    size_t memBudget = 0;
+    for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) {
+        size_t const newMB = ZSTD_estimateCStreamSize_internal(level);
+        if (newMB > memBudget) memBudget = newMB;
+    }
+    return memBudget;
+}
+
+/* ZSTD_getFrameProgression():
+ * reports how much data the current frame has ingested and consumed (input),
+ * and how much it has produced (output).
+ * When ZSTD_MULTITHREAD is defined and workers are in use,
+ * the answer is delegated to ZSTDMT_getFrameProgression().
+ * Otherwise, `ingested` is the consumed size plus whatever sits in the input buffer
+ * and has not been compressed yet.
+ */
+ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+    if (cctx->appliedParams.nbWorkers > 0) {
+        return ZSTDMT_getFrameProgression(cctx->mtctx);
+    }
+#endif
+    {   ZSTD_frameProgression progression;
+        size_t pending = 0;   /* stays 0 when there is no input buffer */
+        if (cctx->inBuff != NULL)
+            pending = cctx->inBuffPos - cctx->inToCompress;
+        if (pending) assert(cctx->inBuffPos >= cctx->inToCompress);
+        assert(pending <= ZSTD_BLOCKSIZE_MAX);
+        progression.produced = cctx->producedCSize;
+        progression.consumed = cctx->consumedSrcSize;
+        progression.ingested = cctx->consumedSrcSize + pending;
+        return progression;
+    }
+}
+
+
+/* ZSTD_equivalentCParams() :
+ * @return : 1 if both sets share the same hashLog, chainLog and strategy,
+ *           and agree on whether searchLength is 3; 0 otherwise.
+ * Note : windowLog, searchLog and targetLength are not compared. */
+static U32 ZSTD_equivalentCParams(ZSTD_compressionParameters cParams1,
+                                  ZSTD_compressionParameters cParams2)
+{
+    U32 const sameHashLog  = (cParams1.hashLog  == cParams2.hashLog);
+    U32 const sameChainLog = (cParams1.chainLog == cParams2.chainLog);
+    U32 const sameStrategy = (cParams1.strategy == cParams2.strategy);   /* opt parser space */
+    U32 const sameHash3Use = ((cParams1.searchLength==3) == (cParams2.searchLength==3));  /* hashlog3 space */
+    return sameHashLog & sameChainLog & sameStrategy & sameHash3Use;
+}
+
+/** ZSTD_equivalentLdmParams() :
+ *  @return : 1 if ldm is disabled in both sets,
+ *            or if enableLdm, hashLog, bucketSizeLog, minMatchLength and hashEveryLog
+ *            are all identical; 0 otherwise. */
+static U32 ZSTD_equivalentLdmParams(ldmParams_t ldmParams1,
+                                    ldmParams_t ldmParams2)
+{
+    /* ldm unused on both sides : remaining fields are irrelevant */
+    if (!ldmParams1.enableLdm && !ldmParams2.enableLdm) return 1;
+
+    return (ldmParams1.enableLdm == ldmParams2.enableLdm)
+        && (ldmParams1.hashLog == ldmParams2.hashLog)
+        && (ldmParams1.bucketSizeLog == ldmParams2.bucketSizeLog)
+        && (ldmParams1.minMatchLength == ldmParams2.minMatchLength)
+        && (ldmParams1.hashEveryLog == ldmParams2.hashEveryLog);
+}
+
+/* ZSTD_buffered_policy_e :
+ * tells whether the context needs internal input / output buffers.
+ * With ZSTDb_buffered, ZSTD_sufficientBuff() and ZSTD_resetCCtx_internal()
+ * budget these buffers; with ZSTDb_not_buffered, their size is 0. */
+typedef enum { ZSTDb_not_buffered, ZSTDb_buffered } ZSTD_buffered_policy_e;
+
+/* ZSTD_sufficientBuff() :
+ * Tells if existing storage (`bufferSize1`, `blockSize1`) can serve a new
+ * compression described by `buffPol2`, `cParams2` and `pledgedSrcSize`.
+ * The new window size is MIN(1<<windowLog, pledgedSrcSize), with a minimum of 1;
+ * buffer space is only required when buffPol2 == ZSTDb_buffered.
+ * @return : 1 if the new block size and the needed buffer size both fit, 0 otherwise.
+ * Note : they are assumed to be correctly sized if ZSTD_equivalentCParams()==1 */
+static U32 ZSTD_sufficientBuff(size_t bufferSize1, size_t blockSize1,
+                            ZSTD_buffered_policy_e buffPol2,
+                            ZSTD_compressionParameters cParams2,
+                            U64 pledgedSrcSize)
+{
+    size_t const windowSize2 = MAX(1, (size_t)MIN(((U64)1 << cParams2.windowLog), pledgedSrcSize));
+    size_t const blockSize2 = MIN(ZSTD_BLOCKSIZE_MAX, windowSize2);
+    size_t const neededBufferSize2 = (buffPol2==ZSTDb_buffered) ? windowSize2 + blockSize2 : 0;
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is windowSize2=%u <= wlog1=%u",
+                (U32)windowSize2, cParams2.windowLog);
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is blockSize2=%u <= blockSize1=%u",
+                (U32)blockSize2, (U32)blockSize1);
+    {   U32 const blockFits = (blockSize2 <= blockSize1);   /* seqStore space depends on blockSize */
+        U32 const bufferFits = (neededBufferSize2 <= bufferSize1);
+        return blockFits & bufferFits;
+    }
+}
+
+/** ZSTD_equivalentParams() :
+ *  Equivalence for resetCCtx purposes.
+ *  @return : 1 when compression parameters and ldm parameters are equivalent,
+ *            and existing storage (`buffSize1`, `blockSize1`) is sufficient
+ *            for `params2`; 0 otherwise.
+ *  Checks are run in this order, and stop at the first failure. */
+static U32 ZSTD_equivalentParams(ZSTD_CCtx_params params1,
+                                 ZSTD_CCtx_params params2,
+                                 size_t buffSize1, size_t blockSize1,
+                                 ZSTD_buffered_policy_e buffPol2,
+                                 U64 pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_equivalentParams: pledgedSrcSize=%u", (U32)pledgedSrcSize);
+    if (!ZSTD_equivalentCParams(params1.cParams, params2.cParams))
+        return 0;
+    if (!ZSTD_equivalentLdmParams(params1.ldmParams, params2.ldmParams))
+        return 0;
+    return ZSTD_sufficientBuff(buffSize1, blockSize1, buffPol2, params2.cParams, pledgedSrcSize) != 0;
+}
+
+/* ZSTD_reset_compressedBlockState() :
+ * Puts `bs` back into its starting state :
+ * entropy tables are flagged as not repeatable (huffman and the 3 FSE tables),
+ * and repcodes are reloaded from repStartValue[]. */
+static void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
+{
+    int repNb;
+
+    bs->entropy.huf.repeatMode = HUF_repeat_none;
+    bs->entropy.fse.litlength_repeatMode = FSE_repeat_none;
+    bs->entropy.fse.matchlength_repeatMode = FSE_repeat_none;
+    bs->entropy.fse.offcode_repeatMode = FSE_repeat_none;
+
+    for (repNb = 0; repNb < ZSTD_REP_NUM; repNb++) {
+        bs->rep[repNb] = repStartValue[repNb];
+    }
+}
+
+/*! ZSTD_invalidateMatchState()
+ * Makes all matches currently referenced by the match finder tables unusable :
+ * the window is cleared, update positions restart right after dictLimit,
+ * and references to a loaded dictionary or a dictMatchState are dropped.
+ * Requires nextSrc and base to be set (can be NULL).
+ */
+static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms)
+{
+    ZSTD_window_clear(&ms->window);
+
+    /* dictionary references */
+    ms->dictMatchState = NULL;
+    ms->loadedDictEnd = 0;
+
+    /* restart positions, based on the freshly cleared window */
+    ms->nextToUpdate3 = ms->window.dictLimit + 1;
+    ms->nextToUpdate = ms->window.dictLimit + 1;
+
+    ms->opt.litLengthSum = 0;  /* force reset of btopt stats */
+}
+
+/*! ZSTD_continueCCtx() :
+ *  Prepares `cctx` for a new frame while keeping its existing workspace :
+ *  nothing is allocated nor re-partitioned, only state gets re-initialized
+ *  (frame counters, block size, applied parameters, ldm window when ldm is enabled,
+ *  external sequences, match state, previous block state, checksum state).
+ *  note : requires no dictionary.
+ *  @return : 0 */
+static size_t ZSTD_continueCCtx(ZSTD_CCtx* cctx, ZSTD_CCtx_params params, U64 pledgedSrcSize)
+{
+    U64    const maxWindowSize = (U64)1 << params.cParams.windowLog;
+    size_t const windowSize = MAX(1, (size_t)MIN(maxWindowSize, pledgedSrcSize));
+    size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+    DEBUGLOG(4, "ZSTD_continueCCtx: re-use context in place");
+
+    /* frame bookkeeping */
+    cctx->stage = ZSTDcs_init;
+    cctx->dictID = 0;
+    cctx->consumedSrcSize = 0;
+    cctx->producedCSize = 0;
+    cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+    cctx->blockSize = blockSize;   /* previous block size could be different even for same windowLog, due to pledgedSrcSize */
+
+    /* parameters : content size cannot be written into the frame when unknown */
+    cctx->appliedParams = params;
+    if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN) {
+        cctx->appliedParams.fParams.contentSizeFlag = 0;
+    }
+    DEBUGLOG(4, "pledged content size : %u ; flag : %u",
+        (U32)pledgedSrcSize, cctx->appliedParams.fParams.contentSizeFlag);
+
+    /* compression state */
+    if (params.ldmParams.enableLdm) {
+        ZSTD_window_clear(&cctx->ldmState.window);
+    }
+    ZSTD_referenceExternalSequences(cctx, NULL, 0);
+    ZSTD_invalidateMatchState(&cctx->blockState.matchState);
+    ZSTD_reset_compressedBlockState(cctx->blockState.prevCBlock);
+    XXH64_reset(&cctx->xxhState, 0);
+    return 0;
+}
+
+/* ZSTD_compResetPolicy_e :
+ * reset policy received by ZSTD_resetCCtx_internal() and ZSTD_reset_matchState().
+ * ZSTDcrp_continue : ZSTD_resetCCtx_internal() first tries to re-use the context in place.
+ * ZSTDcrp_noMemset : ZSTD_reset_matchState() skips zeroing the match tables. */
+typedef enum { ZSTDcrp_continue, ZSTDcrp_noMemset } ZSTD_compResetPolicy_e;
+
+/* ZSTD_reset_matchState() :
+ * Resets `ms`, and carves its tables out of the memory area starting at `ptr`.
+ * Layout, in this order :
+ *   - optimal parser buffers (litFreq, litLengthFreq, matchLengthFreq, offCodeFreq,
+ *     matchTable, priceTable), only when `forCCtx` is set
+ *     and strategy is ZSTD_btopt or ZSTD_btultra;
+ *   - hashTable (hSize cells), chainTable (chainSize cells), hashTable3 (h3Size cells).
+ * The 3 tables are zeroed, unless crp == ZSTDcrp_noMemset.
+ * `ptr` must be aligned on 4 bytes (asserted).
+ * The area is expected to provide at least ZSTD_sizeof_matchState() bytes,
+ * which mirrors the sizes computed here.
+ * @return : pointer to the first byte following the carved area. */
+static void*
+ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+                      void* ptr,
+                const ZSTD_compressionParameters* cParams,
+                      ZSTD_compResetPolicy_e const crp, U32 const forCCtx)
+{
+    /* same dimensions as in ZSTD_sizeof_matchState() : no chain table for ZSTD_fast,
+     * hashLog3 is 0 unless forCCtx is set and searchLength==3 */
+    size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog);
+    size_t const hSize = ((size_t)1) << cParams->hashLog;
+    U32    const hashLog3 = (forCCtx && cParams->searchLength==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
+    size_t const h3Size = ((size_t)1) << hashLog3;
+    size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
+
+    assert(((size_t)ptr & 3) == 0);
+
+    ms->hashLog3 = hashLog3;
+    /* window is fully zeroed first, then the match state is invalidated on top of it */
+    memset(&ms->window, 0, sizeof(ms->window));
+    ZSTD_invalidateMatchState(ms);
+
+    /* opt parser space */
+    if (forCCtx && ((cParams->strategy == ZSTD_btopt) | (cParams->strategy == ZSTD_btultra))) {
+        DEBUGLOG(4, "reserving optimal parser space");
+        /* 4 consecutive U32 frequency tables */
+        ms->opt.litFreq = (U32*)ptr;
+        ms->opt.litLengthFreq = ms->opt.litFreq + (1<<Litbits);
+        ms->opt.matchLengthFreq = ms->opt.litLengthFreq + (MaxLL+1);
+        ms->opt.offCodeFreq = ms->opt.matchLengthFreq + (MaxML+1);
+        ptr = ms->opt.offCodeFreq + (MaxOff+1);
+        /* followed by match and price tables, ZSTD_OPT_NUM+1 entries each */
+        ms->opt.matchTable = (ZSTD_match_t*)ptr;
+        ptr = ms->opt.matchTable + ZSTD_OPT_NUM+1;
+        ms->opt.priceTable = (ZSTD_optimal_t*)ptr;
+        ptr = ms->opt.priceTable + ZSTD_OPT_NUM+1;
+    }
+
+    /* table Space */
+    DEBUGLOG(4, "reset table : %u", crp!=ZSTDcrp_noMemset);
+    assert(((size_t)ptr & 3) == 0);  /* ensure ptr is properly aligned */
+    if (crp!=ZSTDcrp_noMemset) memset(ptr, 0, tableSpace);   /* reset tables only */
+    /* the 3 tables are contiguous : hashTable, then chainTable, then hashTable3 */
+    ms->hashTable = (U32*)(ptr);
+    ms->chainTable = ms->hashTable + hSize;
+    ms->hashTable3 = ms->chainTable + chainSize;
+    ptr = ms->hashTable3 + h3Size;
+
+    assert(((size_t)ptr & 3) == 0);
+    return ptr;
+}
+
+#define ZSTD_WORKSPACETOOLARGE_FACTOR 3 /* a workspace is considered "too large" when its size exceeds this number of times the needed size */
+#define ZSTD_WORKSPACETOOLARGE_MAXDURATION 128  /* when the workspace has been continuously too large
+                                         * for more than this number of consecutive resets,
+                                         * the context's memory usage is considered wasteful,
+                                         * because it's sized to handle a worst case scenario which rarely happens.
+                                         * In which case, it is resized down to free some memory */
+
+/*! ZSTD_resetCCtx_internal() :
+ *  Prepares `zc` for a new frame using `params`.
+ *  - When crp == ZSTDcrp_continue and the new request is equivalent to the applied
+ *    parameters (see ZSTD_equivalentParams()), the context is re-used in place
+ *    through ZSTD_continueCCtx(), unless the workspace has been oversized for too long.
+ *  - Otherwise, the workspace is re-allocated if it is too small or wasteful,
+ *    then partitioned in this order :
+ *      prevCBlock | nextCBlock | entropyWorkspace | [ldm hash table | ldm sequences]
+ *      | match state (see ZSTD_reset_matchState()) | sequences | llCode | mlCode | ofCode
+ *      | literals | [ldm bucketOffsets] | inBuff | outBuff
+ *    (bracketed parts only exist when ldm is enabled).
+ *  `pledgedSrcSize` bounds the window size, hence block size and buffer sizes.
+ *  `zbuff` tells if input / output buffers are needed.
+ *  @return : 0 on success (or the result of ZSTD_continueCCtx()),
+ *            ERROR(memory_allocation) if the workspace must be resized
+ *            but `zc` is static, or if allocation fails.
+ *  note : `params` are assumed fully validated at this stage */
+static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+                                      ZSTD_CCtx_params params,
+                                      U64 pledgedSrcSize,
+                                      ZSTD_compResetPolicy_e const crp,
+                                      ZSTD_buffered_policy_e const zbuff)
+{
+    DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u",
+                (U32)pledgedSrcSize, params.cParams.windowLog);
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+
+    /* fast path : re-use context in place */
+    if (crp == ZSTDcrp_continue) {
+        if (ZSTD_equivalentParams(zc->appliedParams, params,
+                                zc->inBuffSize, zc->blockSize,
+                                zbuff, pledgedSrcSize)) {
+            DEBUGLOG(4, "ZSTD_equivalentParams()==1 -> continue mode (wLog1=%u, blockSize1=%zu)",
+                        zc->appliedParams.cParams.windowLog, zc->blockSize);
+            /* the counter only keeps growing once it has been started by a full reset (below) */
+            zc->workSpaceOversizedDuration += (zc->workSpaceOversizedDuration > 0);   /* if it was too large, it still is */
+            /* oversized for too long : fall through to the full reset, which may shrink the workspace */
+            if (zc->workSpaceOversizedDuration <= ZSTD_WORKSPACETOOLARGE_MAXDURATION)
+                return ZSTD_continueCCtx(zc, params, pledgedSrcSize);
+    }   }
+    DEBUGLOG(4, "ZSTD_equivalentParams()==0 -> reset CCtx");
+
+    if (params.ldmParams.enableLdm) {
+        /* Adjust long distance matching parameters */
+        ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
+        assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
+        assert(params.ldmParams.hashEveryLog < 32);
+        zc->ldmState.hashPower = ZSTD_ldm_getHashPower(params.ldmParams.minMatchLength);
+    }
+
+    /* window size is bounded by pledgedSrcSize, with a minimum of 1 */
+    {   size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize));
+        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+        U32    const divider = (params.cParams.searchLength==3) ? 3 : 4;
+        size_t const maxNbSeq = blockSize / divider;
+        /* literals (blockSize bytes) + 11 bytes per sequence
+         * (presumably sizeof(seqDef) plus the 3 code bytes carved below -- TODO confirm) */
+        size_t const tokenSpace = blockSize + 11*maxNbSeq;
+        size_t const buffOutSize = (zbuff==ZSTDb_buffered) ? ZSTD_compressBound(blockSize)+1 : 0;
+        size_t const buffInSize = (zbuff==ZSTDb_buffered) ? windowSize + blockSize : 0;
+        size_t const matchStateSize = ZSTD_sizeof_matchState(&params.cParams, /* forCCtx */ 1);
+        size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize);
+        void* ptr;   /* used to partition workSpace */
+
+        /* Check if workSpace is large enough, alloc a new one if needed */
+        {   size_t const entropySpace = HUF_WORKSPACE_SIZE;
+            size_t const blockStateSpace = 2 * sizeof(ZSTD_compressedBlockState_t);
+            size_t const bufferSpace = buffInSize + buffOutSize;
+            size_t const ldmSpace = ZSTD_ldm_getTableSize(params.ldmParams);
+            size_t const ldmSeqSpace = maxNbLdmSeq * sizeof(rawSeq);
+
+            size_t const neededSpace = entropySpace + blockStateSpace + ldmSpace +
+                                       ldmSeqSpace + matchStateSize + tokenSpace +
+                                       bufferSpace;
+
+            int const workSpaceTooSmall = zc->workSpaceSize < neededSpace;
+            int const workSpaceTooLarge = zc->workSpaceSize > ZSTD_WORKSPACETOOLARGE_FACTOR * neededSpace;
+            int const workSpaceWasteful = workSpaceTooLarge && (zc->workSpaceOversizedDuration > ZSTD_WORKSPACETOOLARGE_MAXDURATION);
+            /* counts consecutive resets with an oversized workspace; back to 0 as soon as size is reasonable */
+            zc->workSpaceOversizedDuration = workSpaceTooLarge ? zc->workSpaceOversizedDuration+1 : 0;
+
+            DEBUGLOG(4, "Need %zuKB workspace, including %zuKB for match state, and %zuKB for buffers",
+                        neededSpace>>10, matchStateSize>>10, bufferSpace>>10);
+            DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize);
+
+            if (workSpaceTooSmall || workSpaceWasteful) {
+                DEBUGLOG(4, "Need to resize workSpaceSize from %zuKB to %zuKB",
+                            zc->workSpaceSize >> 10,
+                            neededSpace >> 10);
+                /* static cctx : no resize, error out */
+                if (zc->staticSize) return ERROR(memory_allocation);
+
+                /* size is zeroed before freeing, so it remains 0 if the allocation below fails */
+                zc->workSpaceSize = 0;
+                ZSTD_free(zc->workSpace, zc->customMem);
+                zc->workSpace = ZSTD_malloc(neededSpace, zc->customMem);
+                if (zc->workSpace == NULL) return ERROR(memory_allocation);
+                zc->workSpaceSize = neededSpace;
+                zc->workSpaceOversizedDuration = 0;
+                ptr = zc->workSpace;
+
+                /* Statically sized space.
+                 * entropyWorkspace never moves,
+                 * though prev/next block swap places */
+                assert(((size_t)zc->workSpace & 3) == 0);   /* ensure correct alignment */
+                assert(zc->workSpaceSize >= 2 * sizeof(ZSTD_compressedBlockState_t));
+                zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)zc->workSpace;
+                zc->blockState.nextCBlock = zc->blockState.prevCBlock + 1;
+                ptr = zc->blockState.nextCBlock + 1;
+                zc->entropyWorkspace = (U32*)ptr;
+        }   }
+        /* note : when the workspace is kept, prevCBlock, nextCBlock and entropyWorkspace
+         * are not re-assigned, they keep the values set at allocation time */
+
+        /* init params */
+        zc->appliedParams = params;
+        zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+        zc->consumedSrcSize = 0;
+        zc->producedCSize = 0;
+        if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)
+            zc->appliedParams.fParams.contentSizeFlag = 0;
+        DEBUGLOG(4, "pledged content size : %u ; flag : %u",
+            (U32)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag);
+        zc->blockSize = blockSize;
+
+        XXH64_reset(&zc->xxhState, 0);
+        zc->stage = ZSTDcs_init;
+        zc->dictID = 0;
+
+        ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
+
+        /* partitioning (re)starts right after the entropy workspace, in both resize and re-use cases */
+        ptr = zc->entropyWorkspace + HUF_WORKSPACE_SIZE_U32;
+
+        /* ldm hash table */
+        /* initialize bucketOffsets table later for pointer alignment */
+        if (params.ldmParams.enableLdm) {
+            size_t const ldmHSize = ((size_t)1) << params.ldmParams.hashLog;
+            memset(ptr, 0, ldmHSize * sizeof(ldmEntry_t));
+            assert(((size_t)ptr & 3) == 0); /* ensure ptr is properly aligned */
+            zc->ldmState.hashTable = (ldmEntry_t*)ptr;
+            ptr = zc->ldmState.hashTable + ldmHSize;
+            zc->ldmSequences = (rawSeq*)ptr;
+            ptr = zc->ldmSequences + maxNbLdmSeq;
+            zc->maxNbLdmSequences = maxNbLdmSeq;
+
+            memset(&zc->ldmState.window, 0, sizeof(zc->ldmState.window));
+        }
+        assert(((size_t)ptr & 3) == 0); /* ensure ptr is properly aligned */
+
+        /* match state tables; returns the first byte following them */
+        ptr = ZSTD_reset_matchState(&zc->blockState.matchState, ptr, &params.cParams, crp, /* forCCtx */ 1);
+
+        /* sequences storage */
+        zc->seqStore.sequencesStart = (seqDef*)ptr;
+        ptr = zc->seqStore.sequencesStart + maxNbSeq;
+        /* byte-sized areas from here : 3 code arrays of maxNbSeq bytes, then blockSize bytes of literals */
+        zc->seqStore.llCode = (BYTE*) ptr;
+        zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq;
+        zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq;
+        zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq;
+        ptr = zc->seqStore.litStart + blockSize;
+
+        /* ldm bucketOffsets table */
+        if (params.ldmParams.enableLdm) {
+            /* one byte per bucket : 2^(hashLog - bucketSizeLog) buckets */
+            size_t const ldmBucketSize =
+                  ((size_t)1) << (params.ldmParams.hashLog -
+                                  params.ldmParams.bucketSizeLog);
+            memset(ptr, 0, ldmBucketSize);
+            zc->ldmState.bucketOffsets = (BYTE*)ptr;
+            ptr = zc->ldmState.bucketOffsets + ldmBucketSize;
+            ZSTD_window_clear(&zc->ldmState.window);
+        }
+        ZSTD_referenceExternalSequences(zc, NULL, 0);
+
+        /* buffers */
+        /* input buffer first, output buffer right behind; both sizes are 0 when zbuff != ZSTDb_buffered */
+        zc->inBuffSize = buffInSize;
+        zc->inBuff = (char*)ptr;
+        zc->outBuffSize = buffOutSize;
+        zc->outBuff = zc->inBuff + buffInSize;
+
+        return 0;
+    }
+}
+
+/* ZSTD_invalidateRepCodes() :
+ * clears every repcode of the previous block state, so that the next
+ * compression cannot reuse a repcode inherited from the previous block.
+ * Note : regular variant only; do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) {
+    int repIdx = ZSTD_REP_NUM;
+    while (repIdx-- > 0) cctx->blockState.prevCBlock->rep[repIdx] = 0;
+    assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
+}
+
+/* ZSTD_resetCCtx_usingCDict() :
+ * resets `cctx` with the table-related parameters of `cdict` (windowLog is kept from `params`),
+ * then either attaches the dictionary match state in place or copies its tables into `cctx`.
+ * @return : 0, or the error code reported by ZSTD_resetCCtx_internal() */
+static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict,
+                            ZSTD_CCtx_params params, U64 pledgedSrcSize,
+                            ZSTD_buffered_policy_e zbuff)
+{
+    /* We have a choice between copying the dictionary context into the working
+     * context, or referencing the dictionary context from the working context
+     * in-place. We decide here which strategy to use. */
+    static const U64 attachDictSizeCutoffs[(unsigned)ZSTD_btultra+1] = {
+        8 KB, /* unused */
+        8 KB, /* ZSTD_fast */
+        16 KB, /* ZSTD_dfast */
+        32 KB, /* ZSTD_greedy */
+        32 KB, /* ZSTD_lazy */
+        32 KB, /* ZSTD_lazy2 */
+        32 KB, /* ZSTD_btlazy2 */
+        32 KB, /* ZSTD_btopt */
+        8 KB /* ZSTD_btultra */
+    };
+    const int attachDict = ( pledgedSrcSize <= attachDictSizeCutoffs[cdict->cParams.strategy]
+                          || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+                          || params.attachDictPref == ZSTD_dictForceAttach )
+                        && params.attachDictPref != ZSTD_dictForceCopy
+                        && !params.forceWindow /* dictMatchState isn't correctly
+                                                * handled in _enforceMaxDist */
+                        && ZSTD_equivalentCParams(cctx->appliedParams.cParams,
+                                                  cdict->cParams);
+
+    DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%u)", (U32)pledgedSrcSize);
+
+    {   unsigned const windowLog = params.cParams.windowLog;
+        assert(windowLog != 0);
+        /* Copy only compression parameters related to tables. */
+        params.cParams = cdict->cParams;
+        params.cParams.windowLog = windowLog;
+        CHECK_F(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
+                                attachDict ? ZSTDcrp_continue : ZSTDcrp_noMemset,
+                                zbuff));   /* a failed reset must not be followed by table copies */
+        assert(cctx->appliedParams.cParams.strategy == cdict->cParams.strategy);
+        assert(cctx->appliedParams.cParams.hashLog == cdict->cParams.hashLog);
+        assert(cctx->appliedParams.cParams.chainLog == cdict->cParams.chainLog);
+    }
+
+    if (attachDict) {
+        const U32 cdictLen = (U32)(cdict->matchState.window.nextSrc - cdict->matchState.window.base);
+        if (cdictLen == 0) {
+            /* don't even attach dictionaries with no contents */
+            DEBUGLOG(4, "skipping attaching empty dictionary");
+        } else {
+            DEBUGLOG(4, "attaching dictionary into context");
+            cctx->blockState.matchState.dictMatchState = &cdict->matchState;
+
+            /* prep working match state so dict matches never have negative indices
+             * when they are translated to the working context's index space. */
+            if (cctx->blockState.matchState.window.dictLimit < cdictLen) {
+                cctx->blockState.matchState.window.nextSrc =
+                    cctx->blockState.matchState.window.base + cdictLen;
+                ZSTD_window_clear(&cctx->blockState.matchState.window);
+            }
+            cctx->blockState.matchState.loadedDictEnd = cctx->blockState.matchState.window.dictLimit;
+        }
+    } else {
+        DEBUGLOG(4, "copying dictionary into context");
+        /* copy tables */
+        {   size_t const chainSize = (cdict->cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict->cParams.chainLog);
+            size_t const hSize =  (size_t)1 << cdict->cParams.hashLog;
+            size_t const tableSpace = (chainSize + hSize) * sizeof(U32);
+            assert((U32*)cctx->blockState.matchState.chainTable == (U32*)cctx->blockState.matchState.hashTable + hSize);  /* chainTable must follow hashTable */
+            assert((U32*)cctx->blockState.matchState.hashTable3 == (U32*)cctx->blockState.matchState.chainTable + chainSize);
+            assert((U32*)cdict->matchState.chainTable == (U32*)cdict->matchState.hashTable + hSize);  /* chainTable must follow hashTable */
+            assert((U32*)cdict->matchState.hashTable3 == (U32*)cdict->matchState.chainTable + chainSize);
+            memcpy(cctx->blockState.matchState.hashTable, cdict->matchState.hashTable, tableSpace);   /* presumes all tables follow each other */
+        }
+
+        /* Zero the hashTable3, since the cdict never fills it */
+        {   size_t const h3Size = (size_t)1 << cctx->blockState.matchState.hashLog3;
+            assert(cdict->matchState.hashLog3 == 0);
+            memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32));
+        }
+
+        /* copy dictionary offsets */
+        {
+            ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
+            ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState;
+            dstMatchState->window       = srcMatchState->window;
+            dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
+            dstMatchState->nextToUpdate3= srcMatchState->nextToUpdate3;
+            dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
+        }
+    }
+
+    cctx->dictID = cdict->dictID;
+
+    /* copy block state */
+    memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
+
+    return 0;
+}
+
+/*! ZSTD_copyCCtx_internal() :
+ *  Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
+ *  Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
+ *  The "context", in this case, refers to the hash and chain tables,
+ *  entropy tables, and dictionary references.
+ *  Compression parameters are taken from srcCCtx; `fParams`, `pledgedSrcSize` and `zbuff` are forwarded to the reset.
+ * @return : 0, or an error code (stage_wrong, or the failure reported by ZSTD_resetCCtx_internal()) */
+static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
+                            const ZSTD_CCtx* srcCCtx,
+                            ZSTD_frameParameters fParams,
+                            U64 pledgedSrcSize,
+                            ZSTD_buffered_policy_e zbuff)
+{
+    DEBUGLOG(5, "ZSTD_copyCCtx_internal");
+    if (srcCCtx->stage!=ZSTDcs_init) return ERROR(stage_wrong);
+
+    memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem));
+    {   ZSTD_CCtx_params params = dstCCtx->requestedParams;
+        /* Copy only compression parameters related to tables. */
+        params.cParams = srcCCtx->appliedParams.cParams;
+        params.fParams = fParams;
+        CHECK_F(ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize,
+                                ZSTDcrp_noMemset, zbuff));   /* never copy tables into a context whose reset failed */
+        assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog);
+        assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy);
+        assert(dstCCtx->appliedParams.cParams.hashLog == srcCCtx->appliedParams.cParams.hashLog);
+        assert(dstCCtx->appliedParams.cParams.chainLog == srcCCtx->appliedParams.cParams.chainLog);
+        assert(dstCCtx->blockState.matchState.hashLog3 == srcCCtx->blockState.matchState.hashLog3);
+    }
+
+    /* copy tables */
+    {   size_t const chainSize = (srcCCtx->appliedParams.cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog);
+        size_t const hSize =  (size_t)1 << srcCCtx->appliedParams.cParams.hashLog;
+        size_t const h3Size = (size_t)1 << srcCCtx->blockState.matchState.hashLog3;
+        size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
+        assert((U32*)dstCCtx->blockState.matchState.chainTable == (U32*)dstCCtx->blockState.matchState.hashTable + hSize);  /* chainTable must follow hashTable */
+        assert((U32*)dstCCtx->blockState.matchState.hashTable3 == (U32*)dstCCtx->blockState.matchState.chainTable + chainSize);
+        memcpy(dstCCtx->blockState.matchState.hashTable, srcCCtx->blockState.matchState.hashTable, tableSpace);   /* presumes all tables follow each other */
+    }
+
+    /* copy dictionary offsets */
+    {
+        const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState;
+        ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState;
+        dstMatchState->window       = srcMatchState->window;
+        dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
+        dstMatchState->nextToUpdate3= srcMatchState->nextToUpdate3;
+        dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
+    }
+    dstCCtx->dictID = srcCCtx->dictID;
+
+    /* copy block state */
+    memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock));
+
+    return 0;
+}
+
+/*! ZSTD_copyCCtx() :
+ *  Clone context `srcCCtx` into `dstCCtx`.
+ *  Only valid while `srcCCtx` is in stage ZSTDcs_init (i.e. created, but ZSTD_compressContinue() not called yet).
+ *  A `pledgedSrcSize` of 0 is interpreted as "unknown".
+ *  @return : 0, or an error code */
+size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize)
+{
+    unsigned long long const srcSize = (pledgedSrcSize == 0) ? ZSTD_CONTENTSIZE_UNKNOWN : pledgedSrcSize;
+    ZSTD_buffered_policy_e const zbuff = (ZSTD_buffered_policy_e)(srcCCtx->inBuffSize>0);
+    ZSTD_frameParameters fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+    ZSTD_STATIC_ASSERT((U32)ZSTDb_buffered==1);   /* lets the 0/1 test above be cast to the policy enum */
+
+    /* flag the content size only when it is actually known */
+    fParams.contentSizeFlag = (srcSize != ZSTD_CONTENTSIZE_UNKNOWN);
+
+    return ZSTD_copyCCtx_internal(dstCCtx, srcCCtx, fParams, srcSize, zbuff);
+}
+
+
+#define ZSTD_ROWSIZE 16
+/*! ZSTD_reduceTable_internal() :
+ *  reduce table indexes by `reducerValue`, or squash to zero when an index is smaller than `reducerValue`.
+ *  `preserveMark` preserves the "unsorted mark" (ZSTD_DUBT_UNSORTED_MARK) for btlazy2 strategy.
+ *  It must be set to a clear 0/1 value, to remove branch during inlining.
+ *  Presume table size is a multiple of ZSTD_ROWSIZE
+ *  to help auto-vectorization */
+FORCE_INLINE_TEMPLATE void
+ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark)
+{
+    int const nbRows = (int)size / ZSTD_ROWSIZE;
+    int cellNb = 0;   /* flat index of the current cell, advanced across all rows */
+    int rowNb;
+    assert((size & (ZSTD_ROWSIZE-1)) == 0);  /* multiple of ZSTD_ROWSIZE */
+    assert(size < (1U<<31));   /* can be casted to int */
+    for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
+        int column;
+        for (column=0; column<ZSTD_ROWSIZE; column++) {
+            if (preserveMark) {
+                U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0;   /* pre-add to a marked cell, so the subtraction below restores the mark */
+                table[cellNb] += adder;
+            }
+            if (table[cellNb] < reducerValue) table[cellNb] = 0;   /* index would go below zero : squash */
+            else table[cellNb] -= reducerValue;
+            cellNb++;
+    }   }
+}
+
+static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue)
+{   /* plain variant : calls the shared implementation with preserveMark == 0 */
+    ZSTD_reduceTable_internal(table, size, reducerValue, 0);
+}
+
+static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const reducerValue)
+{   /* btlazy2 variant : calls the shared implementation with preserveMark == 1 */
+    ZSTD_reduceTable_internal(table, size, reducerValue, 1);
+}
+
+/*! ZSTD_reduceIndex() :
+*   lower every index stored in the match-state tables by `reducerValue`, to avoid future overflow (indexes are U32) */
+static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
+{
+    ZSTD_matchState_t* const matchState = &zc->blockState.matchState;
+    ZSTD_strategy const strat = zc->appliedParams.cParams.strategy;
+    U32 const hashEntries = (U32)1 << zc->appliedParams.cParams.hashLog;
+
+    /* hash table : reduced for every strategy */
+    ZSTD_reduceTable(matchState->hashTable, hashEntries, reducerValue);
+
+    /* chain table : skipped for ZSTD_fast ; btlazy2 goes through the mark-preserving variant */
+    if (strat != ZSTD_fast) {
+        U32 const chainEntries = (U32)1 << zc->appliedParams.cParams.chainLog;
+        if (strat != ZSTD_btlazy2) ZSTD_reduceTable(matchState->chainTable, chainEntries, reducerValue);
+        else ZSTD_reduceTable_btlazy2(matchState->chainTable, chainEntries, reducerValue);
+    }
+
+    /* hashTable3 : only when hashLog3 is non-zero */
+    if (matchState->hashLog3)
+        ZSTD_reduceTable(matchState->hashTable3, (U32)1 << matchState->hashLog3, reducerValue);
+}
+
+
+/*-*******************************************************
+*  Block entropic compression
+*********************************************************/
+
+/* See doc/zstd_compression_format.md for detailed format description */
+
+size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{   size_t const rawBlockSize = srcSize + ZSTD_blockHeaderSize;   /* block header + verbatim copy of `src` */
+    if (rawBlockSize > dstCapacity) return ERROR(dstSize_tooSmall);
+    memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize);
+    MEM_writeLE24(dst, (U32)(srcSize << 2) + (U32)bt_raw);   /* header value : (srcSize << 2) + bt_raw */
+    return rawBlockSize;
+}
+
+
+/* ZSTD_noCompressLiterals() :
+ * writes `src` as an uncompressed (set_basic) literals section :
+ * a 1, 2 or 3 byte header selected by srcSize, followed by the literals verbatim.
+ * The comment beside each case gives the widths, in bits, of the header fields.
+ * @return : header size + srcSize, or an error code if `dst` is too small */
+static size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    BYTE* const out = (BYTE* const)dst;
+    U32   const headerSize = 1 + (srcSize>31) + (srcSize>4095);
+    size_t const sectionSize = srcSize + headerSize;
+
+    if (sectionSize > dstCapacity) return ERROR(dstSize_tooSmall);
+
+    if (headerSize == 1) {          /* 2 - 1 - 5 */
+        out[0] = (BYTE)((U32)set_basic + (srcSize<<3));
+    } else if (headerSize == 2) {   /* 2 - 2 - 12 */
+        MEM_writeLE16(out, (U16)((U32)set_basic + (1<<2) + (srcSize<<4)));
+    } else {                        /* 2 - 2 - 20 */
+        assert(headerSize == 3);    /* headerSize is {1,2,3} */
+        MEM_writeLE32(out, (U32)((U32)set_basic + (3<<2) + (srcSize<<4)));
+    }
+
+    memcpy(out + headerSize, src, srcSize);
+    return sectionSize;
+}
+
+/* ZSTD_compressRleLiteralsBlock() :
+ * writes a set_rle literals section : a 1, 2 or 3 byte header selected by srcSize,
+ * followed by the single byte src[0].
+ * `dstCapacity` is not checked here (see the note below).
+ * @return : header size + 1 */
+static size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    BYTE* const out = (BYTE* const)dst;
+    U32   const headerSize = 1 + (srcSize>31) + (srcSize>4095);
+
+    (void)dstCapacity;  /* dstCapacity already guaranteed to be >=4, hence large enough */
+
+    /* the comment beside each case gives the widths, in bits, of the header fields */
+    if (headerSize == 1) {          /* 2 - 1 - 5 */
+        out[0] = (BYTE)((U32)set_rle + (srcSize<<3));
+    } else if (headerSize == 2) {   /* 2 - 2 - 12 */
+        MEM_writeLE16(out, (U16)((U32)set_rle + (1<<2) + (srcSize<<4)));
+    } else {                        /* 2 - 2 - 20 */
+        assert(headerSize == 3);    /* headerSize is {1,2,3} */
+        MEM_writeLE32(out, (U32)((U32)set_rle + (3<<2) + (srcSize<<4)));
+    }
+
+    out[headerSize] = *(const BYTE*)src;
+    return headerSize+1;
+}
+
+
+/* ZSTD_minGain() :
+ * smallest saving, in bytes, that a compressed block or a compressed literals
+ * section must achieve : srcSize/128 + 2 for ZSTD_btultra, srcSize/64 + 2 otherwise.
+ * note : the same formula serves both situations */
+static size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
+{
+    if (strat == ZSTD_btultra) return (srcSize >> 7) + 2;
+    return (srcSize >> 6) + 2;
+}
+
+static size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
+                                     ZSTD_hufCTables_t* nextHuf,
+                                     ZSTD_strategy strategy, int disableLiteralCompression,
+                                     void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                                     U32* workspace, const int bmi2)
+{   /* Writes the literals section for `src` into `dst` : raw, RLE or Huffman-compressed. @return : bytes written, or an error code */
+    size_t const minGain = ZSTD_minGain(srcSize, strategy);
+    size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);   /* literals header size : 3, 4 or 5 bytes */
+    BYTE*  const ostart = (BYTE*)dst;
+    U32 singleStream = srcSize < 256;
+    symbolEncodingType_e hType = set_compressed;
+    size_t cLitSize;
+
+    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i)",
+                disableLiteralCompression);
+
+    /* Prepare nextEntropy assuming reusing the existing table */
+    memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+
+    if (disableLiteralCompression)
+        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+
+    /* small ? don't even attempt compression (speed opt) */
+#   define COMPRESS_LITERALS_SIZE_MIN 63
+    {   size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
+        if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+    }
+
+    if (dstCapacity < lhSize+1) return ERROR(dstSize_tooSmall);   /* not enough space for compression */
+    {   HUF_repeat repeat = prevHuf->repeatMode;
+        int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
+        if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
+        cLitSize = singleStream ? HUF_compress1X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11,
+                                      workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2)
+                                : HUF_compress4X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11,
+                                      workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2);
+        if (repeat != HUF_repeat_none) {
+            /* reused the existing table */
+            hType = set_repeat;
+        }
+    }
+
+    if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) {
+        memcpy(nextHuf, prevHuf, sizeof(*prevHuf));   /* compression rejected : restore the previous table state, store literals raw */
+        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+    }
+    if (cLitSize==1) {
+        memcpy(nextHuf, prevHuf, sizeof(*prevHuf));   /* 1-byte result : restore the previous table state, emit an RLE section */
+        return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
+    }
+
+    if (hType == set_compressed) {
+        /* using a newly constructed table */
+        nextHuf->repeatMode = HUF_repeat_check;
+    }
+
+    /* Build header */
+    switch(lhSize)
+    {
+    case 3: /* 2 - 2 - 10 - 10 */
+        {   U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
+            MEM_writeLE24(ostart, lhc);
+            break;
+        }
+    case 4: /* 2 - 2 - 14 - 14 */
+        {   U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18);
+            MEM_writeLE32(ostart, lhc);
+            break;
+        }
+    case 5: /* 2 - 2 - 18 - 18 */
+        {   U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22);
+            MEM_writeLE32(ostart, lhc);
+            ostart[4] = (BYTE)(cLitSize >> 10);   /* upper bits of cLitSize that do not fit in the 32-bit word */
+            break;
+        }
+    default:  /* not possible : lhSize is {3,4,5} */
+        assert(0);
+    }
+    return lhSize+cLitSize;
+}
+
+
+/* ZSTD_seqToCodes() :
+ * fills the llCode / ofCode / mlCode tables of `seqStorePtr` with one code per stored sequence,
+ * then forces MaxLL (longLengthID==1) or MaxML (longLengthID==2) at position longLengthPos. */
+void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+{
+    const seqDef* const seqStart = seqStorePtr->sequencesStart;
+    U32 const seqCount = (U32)(seqStorePtr->sequences - seqStart);
+    BYTE* const llCodes = seqStorePtr->llCode;
+    BYTE* const mlCodes = seqStorePtr->mlCode;
+    BYTE* const ofCodes = seqStorePtr->ofCode;
+    U32 n;
+    for (n=0; n<seqCount; n++) {
+        const seqDef* const seq = seqStart + n;
+        llCodes[n] = (BYTE)ZSTD_LLcode((U32)seq->litLength);
+        ofCodes[n] = (BYTE)ZSTD_highbit32(seq->offset);
+        mlCodes[n] = (BYTE)ZSTD_MLcode((U32)seq->matchLength);
+    }
+    if (seqStorePtr->longLengthID==1) llCodes[seqStorePtr->longLengthPos] = MaxLL;
+    else if (seqStorePtr->longLengthID==2) mlCodes[seqStorePtr->longLengthPos] = MaxML;
+}
+
+
+/**
+ * Lookup table of -log2(x / 256) for x in [0, 256), in fixed point with 8 fractional bits.
+ * Entry 0 holds 0 (x == 0 has no finite logarithm);
+ * every other entry holds floor(-log2(x / 256) * 256), e.g. entry 1 == 2048 and entry 128 == 256.
+ */
+static unsigned const kInverseProbabiltyLog256[256] = {
+    0,    2048, 1792, 1642, 1536, 1453, 1386, 1329, 1280, 1236, 1197, 1162,
+    1130, 1100, 1073, 1047, 1024, 1001, 980,  960,  941,  923,  906,  889,
+    874,  859,  844,  830,  817,  804,  791,  779,  768,  756,  745,  734,
+    724,  714,  704,  694,  685,  676,  667,  658,  650,  642,  633,  626,
+    618,  610,  603,  595,  588,  581,  574,  567,  561,  554,  548,  542,
+    535,  529,  523,  517,  512,  506,  500,  495,  489,  484,  478,  473,
+    468,  463,  458,  453,  448,  443,  438,  434,  429,  424,  420,  415,
+    411,  407,  402,  398,  394,  390,  386,  382,  377,  373,  370,  366,
+    362,  358,  354,  350,  347,  343,  339,  336,  332,  329,  325,  322,
+    318,  315,  311,  308,  305,  302,  298,  295,  292,  289,  286,  282,
+    279,  276,  273,  270,  267,  264,  261,  258,  256,  253,  250,  247,
+    244,  241,  239,  236,  233,  230,  228,  225,  222,  220,  217,  215,
+    212,  209,  207,  204,  202,  199,  197,  194,  192,  190,  187,  185,
+    182,  180,  178,  175,  173,  171,  168,  166,  164,  162,  159,  157,
+    155,  153,  151,  149,  146,  144,  142,  140,  138,  136,  134,  132,
+    130,  128,  126,  123,  121,  119,  117,  115,  114,  112,  110,  108,
+    106,  104,  102,  100,  98,   96,   94,   93,   91,   89,   87,   85,
+    83,   82,   80,   78,   76,   74,   73,   71,   69,   67,   66,   64,
+    62,   61,   59,   57,   55,   54,   52,   50,   49,   47,   46,   44,
+    42,   41,   39,   37,   36,   34,   33,   31,   30,   28,   26,   25,
+    23,   22,   20,   19,   17,   16,   14,   13,   11,   10,   8,    7,
+    5,    4,    2,    1,
+};
+
+
+/**
+ * Estimates, in bits, the cost of encoding the distribution described by
+ * count (symbols 0..max) using the entropy bound; total is the divisor of each count.
+ */
+static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t const total)
+{
+    unsigned costAcc = 0;   /* accumulated in 1/256th of a bit */
+    unsigned symbol;
+    for (symbol = 0; symbol <= max; ++symbol) {
+        unsigned const freq = count[symbol];
+        unsigned prob256 = (unsigned)((256 * freq) / total);
+        if (freq != 0 && prob256 == 0) prob256 = 1;   /* a present symbol must not use table entry 0, whose cost is 0 */
+        assert(freq < total);
+        costAcc += freq * kInverseProbabiltyLog256[prob256];
+    }
+    return costAcc >> 8;
+}
+
+
+/**
+ * Estimates, in bits, the cost of encoding the distribution in count with the
+ * table described by norm. norm is assumed to cover at least symbols 0..max,
+ * and must be valid for every symbol that has a non-zero count.
+ */
+static size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
+                                    unsigned const* count, unsigned const max)
+{
+    unsigned const toLog8 = 8 - accuracyLog;   /* rescales norm to an accuracy of 8 bits */
+    size_t costAcc = 0;                        /* accumulated in 1/256th of a bit */
+    unsigned symbol;
+    assert(accuracyLog <= 8);
+    for (symbol = 0; symbol <= max; ++symbol) {
+        unsigned prob = 1;                     /* a norm of -1 is counted as 1 */
+        if (norm[symbol] != -1) prob = (unsigned)norm[symbol];
+        assert((prob << toLog8) > 0);
+        assert((prob << toLog8) < 256);
+        costAcc += count[symbol] * kInverseProbabiltyLog256[prob << toLog8];
+    }
+    return costAcc >> 8;
+}
+
+
+static unsigned ZSTD_getFSEMaxSymbolValue(FSE_CTable const* ctable) {
+  /* the value is read from the second 16-bit field at the start of the table */
+  void const* const tableStart = ctable;
+  U16 const* const header16 = (U16 const*)tableStart;
+  return (U32)MEM_read16(header16 + 1);
+}
+
+
+/**
+ * Estimates, in bits, the cost of encoding the distribution in count with ctable.
+ * Fails with an error code when ctable cannot represent every symbol present in count.
+ */
+static size_t ZSTD_fseBitCost(
+    FSE_CTable const* ctable,
+    unsigned const* count,
+    unsigned const max)
+{
+    unsigned const kAccuracyLog = 8;
+    size_t totalCost = 0;
+    unsigned symbol;
+    FSE_CState_t cstate;
+    FSE_initCState(&cstate, ctable);
+    if (ZSTD_getFSEMaxSymbolValue(ctable) < max) {
+        DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u",
+                    ZSTD_getFSEMaxSymbolValue(ctable), max);
+        return ERROR(GENERIC);
+    }
+    for (symbol = 0; symbol <= max; ++symbol) {
+        unsigned const tableLog = cstate.stateLog;
+        unsigned const costLimit = (tableLog + 1) << kAccuracyLog;   /* a cost this high is treated as "symbol absent from ctable" */
+        unsigned const symbolCost = FSE_bitCost(cstate.symbolTT, tableLog, symbol, kAccuracyLog);
+        if (count[symbol] != 0) {
+            if (symbolCost >= costLimit) {
+                DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", symbol);
+                return ERROR(GENERIC);
+            }
+            totalCost += count[symbol] * symbolCost;
+        }
+    }
+    return totalCost >> kAccuracyLog;
+}
+
+/**
+ * Estimates, in bytes, the size of the normalized-count header for count.
+ * Forwards the error code of any helper that fails.
+ */
+static size_t ZSTD_NCountCost(unsigned const* count, unsigned const max,
+                              size_t const nbSeq, unsigned const FSELog)
+{
+    S16 normalized[MaxSeq + 1];
+    BYTE headerScratch[FSE_NCOUNTBOUND];   /* the header is written here only to measure its size */
+    const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
+    CHECK_F(FSE_normalizeCount(normalized, tableLog, count, nbSeq, max));
+    return FSE_writeNCount(headerScratch, sizeof(headerScratch), normalized, max, tableLog);
+}
+
+
+typedef enum {   /* tells ZSTD_selectEncodingType() whether set_basic (the default table) may be chosen */
+    ZSTD_defaultDisallowed = 0,   /* must stay 0 : the value is tested as a boolean */
+    ZSTD_defaultAllowed = 1
+} ZSTD_defaultPolicy_e;
+
+MEM_STATIC symbolEncodingType_e
+ZSTD_selectEncodingType(
+        FSE_repeat* repeatMode, unsigned const* count, unsigned const max,
+        size_t const mostFrequent, size_t nbSeq, unsigned const FSELog,
+        FSE_CTable const* prevCTable,
+        short const* defaultNorm, U32 defaultNormLog,
+        ZSTD_defaultPolicy_e const isDefaultAllowed,
+        ZSTD_strategy const strategy)
+{   /* Picks set_rle, set_basic, set_repeat or set_compressed for one symbol type, and updates *repeatMode accordingly */
+    ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0);
+    if (mostFrequent == nbSeq) {   /* every sequence uses the same symbol */
+        *repeatMode = FSE_repeat_none;
+        if (isDefaultAllowed && nbSeq <= 2) {
+            /* Prefer set_basic over set_rle when there are 2 or fewer symbols,
+             * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
+             * If basic encoding isn't possible, always choose RLE.
+             */
+            DEBUGLOG(5, "Selected set_basic");
+            return set_basic;
+        }
+        DEBUGLOG(5, "Selected set_rle");
+        return set_rle;
+    }
+    if (strategy < ZSTD_lazy) {   /* faster strategies : simple heuristics, no cost estimation */
+        if (isDefaultAllowed) {
+            size_t const staticFse_nbSeq_max = 1000;
+            size_t const mult = 10 - strategy;
+            size_t const baseLog = 3;
+            size_t const dynamicFse_nbSeq_min = (((size_t)1 << defaultNormLog) * mult) >> baseLog;  /* 28-36 for offset, 56-72 for lengths */
+            assert(defaultNormLog >= 5 && defaultNormLog <= 6);  /* xx_DEFAULTNORMLOG */
+            assert(mult <= 9 && mult >= 7);
+            if ( (*repeatMode == FSE_repeat_valid)
+              && (nbSeq < staticFse_nbSeq_max) ) {
+                DEBUGLOG(5, "Selected set_repeat");
+                return set_repeat;
+            }
+            if ( (nbSeq < dynamicFse_nbSeq_min)
+              || (mostFrequent < (nbSeq >> (defaultNormLog-1))) ) {
+                DEBUGLOG(5, "Selected set_basic");
+                /* The format allows default tables to be repeated, but it isn't useful.
+                 * When using simple heuristics to select encoding type, we don't want
+                 * to confuse these tables with dictionaries. When running more careful
+                 * analysis, we don't need to waste time checking both repeating tables
+                 * and default tables.
+                 */
+                *repeatMode = FSE_repeat_none;
+                return set_basic;
+            }
+        }
+    } else {   /* stronger strategies : estimate the cost of each candidate and keep the cheapest */
+        size_t const basicCost = isDefaultAllowed ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, count, max) : ERROR(GENERIC);
+        size_t const repeatCost = *repeatMode != FSE_repeat_none ? ZSTD_fseBitCost(prevCTable, count, max) : ERROR(GENERIC);
+        size_t const NCountCost = ZSTD_NCountCost(count, max, nbSeq, FSELog);
+        size_t const compressedCost = (NCountCost << 3) + ZSTD_entropyCost(count, max, nbSeq);   /* header bytes converted to bits, plus payload bits */
+
+        if (isDefaultAllowed) {
+            assert(!ZSTD_isError(basicCost));
+            assert(!(*repeatMode == FSE_repeat_valid && ZSTD_isError(repeatCost)));
+        }
+        assert(!ZSTD_isError(NCountCost));
+        assert(compressedCost < ERROR(maxCode));
+        DEBUGLOG(5, "Estimated bit costs: basic=%u\trepeat=%u\tcompressed=%u",
+                    (U32)basicCost, (U32)repeatCost, (U32)compressedCost);
+        if (basicCost <= repeatCost && basicCost <= compressedCost) {
+            DEBUGLOG(5, "Selected set_basic");
+            assert(isDefaultAllowed);
+            *repeatMode = FSE_repeat_none;
+            return set_basic;
+        }
+        if (repeatCost <= compressedCost) {
+            DEBUGLOG(5, "Selected set_repeat");
+            assert(!ZSTD_isError(repeatCost));
+            return set_repeat;
+        }
+        assert(compressedCost < basicCost && compressedCost < repeatCost);
+    }
+    DEBUGLOG(5, "Selected set_compressed");
+    *repeatMode = FSE_repeat_check;
+    return set_compressed;
+}
+
+/* ZSTD_buildCTable() :
+ * builds `nextCTable` for encoding `type`; the table description, if any, is written into `dst`.
+ * @return : nb of bytes written into `dst` (0 for set_repeat and set_basic), or an error code */
+MEM_STATIC size_t
+ZSTD_buildCTable(void* dst, size_t dstCapacity, FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
+                U32* count, U32 max, const BYTE* codeTable, size_t nbSeq,
+                const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+                const FSE_CTable* prevCTable, size_t prevCTableSize, void* workspace, size_t workspaceSize)
+{
+    BYTE* op = (BYTE*)dst;
+    const BYTE* const oend = op + dstCapacity;
+    switch (type) {
+    case set_rle:
+        if (dstCapacity == 0) return ERROR(dstSize_tooSmall);   /* the RLE symbol below needs 1 byte */
+        *op = codeTable[0];
+        CHECK_F(FSE_buildCTable_rle(nextCTable, (BYTE)max));
+        return 1;
+    case set_repeat:
+        memcpy(nextCTable, prevCTable, prevCTableSize);
+        return 0;
+    case set_basic:
+        CHECK_F(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, workspace, workspaceSize));  /* note : could be pre-calculated */
+        return 0;
+    case set_compressed: {
+        S16 norm[MaxSeq + 1];
+        size_t nbSeq_1 = nbSeq;
+        const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
+        if (count[codeTable[nbSeq-1]] > 1) {   /* leave the last sequence's symbol out of the statistics (modifies `count`) */
+            count[codeTable[nbSeq-1]]--;
+            nbSeq_1--;
+        }
+        assert(nbSeq_1 > 1);
+        CHECK_F(FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max));
+        {   size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog);   /* overflow protected */
+            if (FSE_isError(NCountSize)) return NCountSize;
+            CHECK_F(FSE_buildCTable_wksp(nextCTable, norm, max, tableLog, workspace, workspaceSize));
+            return NCountSize;
+        }
+    }
+    default: return assert(0), ERROR(GENERIC);
+    }
+}
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_encodeSequences_body(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{   /* Encodes `nbSeq` sequences into `dst` as a single bitstream, last sequence first. @return : bitstream size in bytes, or an error code */
+    BIT_CStream_t blockStream;
+    FSE_CState_t  stateMatchLength;
+    FSE_CState_t  stateOffsetBits;
+    FSE_CState_t  stateLitLength;
+    /* note : index [nbSeq-1] is read unconditionally below, so nbSeq must be >= 1 */
+    CHECK_E(BIT_initCStream(&blockStream, dst, dstCapacity), dstSize_tooSmall); /* not enough space remaining */
+
+    /* first symbols : the last sequence seeds the 3 FSE states, then only its extra bits are written */
+    FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]);
+    FSE_initCState2(&stateOffsetBits,  CTable_OffsetBits,  ofCodeTable[nbSeq-1]);
+    FSE_initCState2(&stateLitLength,   CTable_LitLength,   llCodeTable[nbSeq-1]);
+    BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
+    if (MEM_32bits()) BIT_flushBits(&blockStream);
+    BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]);
+    if (MEM_32bits()) BIT_flushBits(&blockStream);
+    if (longOffsets) {   /* split the offset : its low `extraBits` bits are written and flushed first, then the remaining high bits */
+        U32 const ofBits = ofCodeTable[nbSeq-1];
+        int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+        if (extraBits) {
+            BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits);
+            BIT_flushBits(&blockStream);
+        }
+        BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits,
+                    ofBits - extraBits);
+    } else {   /* the offset code is also the number of offset bits to write */
+        BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]);
+    }
+    BIT_flushBits(&blockStream);
+
+    {   size_t n;
+        for (n=nbSeq-2 ; n<nbSeq ; n--) {      /* intentional underflow : n wraps past 0 to a value >= nbSeq, which ends the loop */
+            BYTE const llCode = llCodeTable[n];
+            BYTE const ofCode = ofCodeTable[n];
+            BYTE const mlCode = mlCodeTable[n];
+            U32  const llBits = LL_bits[llCode];
+            U32  const ofBits = ofCode;
+            U32  const mlBits = ML_bits[mlCode];
+            DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u",
+                        sequences[n].litLength,
+                        sequences[n].matchLength + MINMATCH,
+                        sequences[n].offset);
+                                                                            /* 32b*/  /* 64b*/
+                                                                            /* (7)*/  /* (7)*/
+            FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode);       /* 15 */  /* 15 */
+            FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode);      /* 24 */  /* 24 */
+            if (MEM_32bits()) BIT_flushBits(&blockStream);                  /* (7)*/
+            FSE_encodeSymbol(&blockStream, &stateLitLength, llCode);        /* 16 */  /* 33 */
+            if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog)))
+                BIT_flushBits(&blockStream);                                /* (7)*/
+            BIT_addBits(&blockStream, sequences[n].litLength, llBits);
+            if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
+            BIT_addBits(&blockStream, sequences[n].matchLength, mlBits);
+            if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream);
+            if (longOffsets) {   /* same two-part offset write as for the first symbols */
+                int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+                if (extraBits) {
+                    BIT_addBits(&blockStream, sequences[n].offset, extraBits);
+                    BIT_flushBits(&blockStream);                            /* (7)*/
+                }
+                BIT_addBits(&blockStream, sequences[n].offset >> extraBits,
+                            ofBits - extraBits);                            /* 31 */
+            } else {
+                BIT_addBits(&blockStream, sequences[n].offset, ofBits);     /* 31 */
+            }
+            BIT_flushBits(&blockStream);                                    /* (7)*/
+    }   }
+
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog);
+    FSE_flushCState(&blockStream, &stateMatchLength);
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog);
+    FSE_flushCState(&blockStream, &stateOffsetBits);
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog);
+    FSE_flushCState(&blockStream, &stateLitLength);
+
+    {   size_t const streamSize = BIT_closeCStream(&blockStream);
+        if (streamSize==0) return ERROR(dstSize_tooSmall);   /* not enough space */
+        return streamSize;
+    }
+}
+
+static size_t
+ZSTD_encodeSequences_default(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{   /* Plain instantiation of ZSTD_encodeSequences_body() : this function carries no target attribute. */
+    size_t const encodedSize = ZSTD_encodeSequences_body(
+            dst, dstCapacity,
+            CTable_MatchLength, mlCodeTable, CTable_OffsetBits, ofCodeTable,
+            CTable_LitLength, llCodeTable, sequences, nbSeq, longOffsets);
+    return encodedSize;   /* size or error code, forwarded unchanged */
+}
+
+
+#if DYNAMIC_BMI2
+
+static TARGET_ATTRIBUTE("bmi2") size_t
+ZSTD_encodeSequences_bmi2(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{   /* Instantiation of ZSTD_encodeSequences_body() inside a function tagged TARGET_ATTRIBUTE("bmi2"). */
+    size_t const encodedSize = ZSTD_encodeSequences_body(
+            dst, dstCapacity,
+            CTable_MatchLength, mlCodeTable, CTable_OffsetBits, ofCodeTable,
+            CTable_LitLength, llCodeTable, sequences, nbSeq, longOffsets);
+    return encodedSize;   /* size or error code, forwarded unchanged */
+}
+
+#endif
+
+size_t ZSTD_encodeSequences(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2)
+{   /* Dispatcher : forwards to the bmi2 variant when `bmi2` is set and DYNAMIC_BMI2 is enabled, else to the default variant. */
+#if DYNAMIC_BMI2
+    /* the bmi2 variant only exists in this configuration */
+    if (bmi2 != 0)
+        return ZSTD_encodeSequences_bmi2(
+                    dst, dstCapacity, CTable_MatchLength, mlCodeTable,
+                    CTable_OffsetBits, ofCodeTable, CTable_LitLength, llCodeTable,
+                    sequences, nbSeq, longOffsets);
+#else
+    (void)bmi2;   /* flag is unused when the bmi2 variant is not compiled */
+#endif
+    /* default path : @return is the encoder's size or error code, unchanged */
+    return ZSTD_encodeSequences_default(
+                    dst, dstCapacity, CTable_MatchLength, mlCodeTable,
+                    CTable_OffsetBits, ofCodeTable, CTable_LitLength, llCodeTable,
+                    sequences, nbSeq, longOffsets);
+}
+
+MEM_STATIC size_t ZSTD_compressSequences_internal(seqStore_t* seqStorePtr,
+                              ZSTD_entropyCTables_t const* prevEntropy,
+                              ZSTD_entropyCTables_t* nextEntropy,
+                              ZSTD_CCtx_params const* cctxParams,
+                              void* dst, size_t dstCapacity, U32* workspace,
+                              const int bmi2)
+{   /* Writes the compressed literals, the sequences header, the 3 table descriptions and the sequences bitstream into dst. @return : bytes written, 0 to request an uncompressed block (see end of function), or an error code */
+    const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+    ZSTD_strategy const strategy = cctxParams->cParams.strategy;
+    U32 count[MaxSeq+1];   /* symbol histogram, refilled for each of the 3 code tables below */
+    FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable;
+    FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable;
+    FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable;
+    U32 LLtype, Offtype, MLtype;   /* compressed, raw or rle */
+    const seqDef* const sequences = seqStorePtr->sequencesStart;
+    const BYTE* const ofCodeTable = seqStorePtr->ofCode;
+    const BYTE* const llCodeTable = seqStorePtr->llCode;
+    const BYTE* const mlCodeTable = seqStorePtr->mlCode;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+    BYTE* seqHead;
+    BYTE* lastNCount = NULL;   /* start of the most recently written set_compressed table description, if any */
+
+    ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+
+    /* Compress literals */
+    {   const BYTE* const literals = seqStorePtr->litStart;
+        size_t const litSize = seqStorePtr->lit - literals;
+        int const disableLiteralCompression = (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0);
+        size_t const cSize = ZSTD_compressLiterals(
+                                    &prevEntropy->huf, &nextEntropy->huf,
+                                    cctxParams->cParams.strategy, disableLiteralCompression,
+                                    op, dstCapacity,
+                                    literals, litSize,
+                                    workspace, bmi2);
+        if (ZSTD_isError(cSize))
+          return cSize;
+        assert(cSize <= dstCapacity);
+        op += cSize;
+    }
+
+    /* Sequences Header : nbSeq is stored on 1, 2 or 3 bytes depending on its value */
+    if ((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/) return ERROR(dstSize_tooSmall);
+    if (nbSeq < 0x7F)
+        *op++ = (BYTE)nbSeq;
+    else if (nbSeq < LONGNBSEQ)
+        op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
+    else
+        op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
+    if (nbSeq==0) {
+        /* Copy the old tables over as if we repeated them */
+        memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
+        return op - ostart;
+    }
+
+    /* seqHead : flags for FSE encoding type (byte reserved here, filled once the 3 types are known) */
+    seqHead = op++;
+
+    /* convert length/distances into codes */
+    ZSTD_seqToCodes(seqStorePtr);
+    /* build CTable for Literal Lengths */
+    {   U32 max = MaxLL;
+        size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, workspace);   /* can't fail */
+        DEBUGLOG(5, "Building LL table");
+        nextEntropy->fse.litlength_repeatMode = prevEntropy->fse.litlength_repeatMode;
+        LLtype = ZSTD_selectEncodingType(&nextEntropy->fse.litlength_repeatMode, count, max, mostFrequent, nbSeq, LLFSELog, prevEntropy->fse.litlengthCTable, LL_defaultNorm, LL_defaultNormLog, ZSTD_defaultAllowed, strategy);
+        assert(set_basic < set_compressed && set_rle < set_compressed);
+        assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+        {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype,
+                                                    count, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL,
+                                                    prevEntropy->fse.litlengthCTable, sizeof(prevEntropy->fse.litlengthCTable),
+                                                    workspace, HUF_WORKSPACE_SIZE);
+            if (ZSTD_isError(countSize)) return countSize;
+            if (LLtype == set_compressed)
+                lastNCount = op;
+            op += countSize;
+    }   }
+    /* build CTable for Offsets */
+    {   U32 max = MaxOff;
+        size_t const mostFrequent = HIST_countFast_wksp(count, &max, ofCodeTable, nbSeq, workspace);  /* can't fail */
+        /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
+        ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed;
+        DEBUGLOG(5, "Building OF table");
+        nextEntropy->fse.offcode_repeatMode = prevEntropy->fse.offcode_repeatMode;
+        Offtype = ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repeatMode, count, max, mostFrequent, nbSeq, OffFSELog, prevEntropy->fse.offcodeCTable, OF_defaultNorm, OF_defaultNormLog, defaultPolicy, strategy);
+        assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+        {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype,
+                                                    count, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+                                                    prevEntropy->fse.offcodeCTable, sizeof(prevEntropy->fse.offcodeCTable),
+                                                    workspace, HUF_WORKSPACE_SIZE);
+            if (ZSTD_isError(countSize)) return countSize;
+            if (Offtype == set_compressed)
+                lastNCount = op;
+            op += countSize;
+    }   }
+    /* build CTable for MatchLengths */
+    {   U32 max = MaxML;
+        size_t const mostFrequent = HIST_countFast_wksp(count, &max, mlCodeTable, nbSeq, workspace);   /* can't fail */
+        DEBUGLOG(5, "Building ML table");
+        nextEntropy->fse.matchlength_repeatMode = prevEntropy->fse.matchlength_repeatMode;
+        MLtype = ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_repeatMode, count, max, mostFrequent, nbSeq, MLFSELog, prevEntropy->fse.matchlengthCTable, ML_defaultNorm, ML_defaultNormLog, ZSTD_defaultAllowed, strategy);
+        assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+        {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype,
+                                                    count, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML,
+                                                    prevEntropy->fse.matchlengthCTable, sizeof(prevEntropy->fse.matchlengthCTable),
+                                                    workspace, HUF_WORKSPACE_SIZE);
+            if (ZSTD_isError(countSize)) return countSize;
+            if (MLtype == set_compressed)
+                lastNCount = op;
+            op += countSize;
+    }   }
+
+    *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));   /* 2 bits per type : LL in bits 7-6, Off in bits 5-4, ML in bits 3-2 */
+
+    {   size_t const bitstreamSize = ZSTD_encodeSequences(
+                                        op, oend - op,
+                                        CTable_MatchLength, mlCodeTable,
+                                        CTable_OffsetBits, ofCodeTable,
+                                        CTable_LitLength, llCodeTable,
+                                        sequences, nbSeq,
+                                        longOffsets, bmi2);
+        if (ZSTD_isError(bitstreamSize)) return bitstreamSize;
+        op += bitstreamSize;
+        /* zstd versions <= 1.3.4 mistakenly report corruption when
+         * FSE_readNCount() receives a buffer < 4 bytes.
+         * Fixed by https://github.com/facebook/zstd/pull/1146.
+         * This can happen when the last set_compressed table present is 2
+         * bytes and the bitstream is only one byte.
+         * In this exceedingly rare case, we will simply emit an uncompressed
+         * block, since it isn't worth optimizing.
+         */
+        if (lastNCount && (op - lastNCount) < 4) {
+            /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
+            assert(op - lastNCount == 3);
+            DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
+                        "emitting an uncompressed block.");
+            return 0;
+        }
+    }
+
+    return op - ostart;   /* literals + sequences header + table descriptions + bitstream */
+}
+
+MEM_STATIC size_t ZSTD_compressSequences(seqStore_t* seqStorePtr,
+                        const ZSTD_entropyCTables_t* prevEntropy,
+                              ZSTD_entropyCTables_t* nextEntropy,
+                        const ZSTD_CCtx_params* cctxParams,
+                              void* dst, size_t dstCapacity,
+                              size_t srcSize, U32* workspace, int bmi2)
+{   /* Runs the internal encoder, then decides whether its output is kept. @return : compressed size, 0 when the block must be emitted uncompressed, or an error code */
+    size_t const encodedSize = ZSTD_compressSequences_internal(
+                                    seqStorePtr, prevEntropy, nextEntropy,
+                                    cctxParams, dst, dstCapacity, workspace, bmi2);
+    int const rawBlockFits = (srcSize <= dstCapacity);
+    if (encodedSize == 0)
+        return 0;   /* the internal stage requested an uncompressed block */
+    /* Running out of room although a raw copy of the source would fit means
+     * the block is not compressible : report "not compressed" instead of an error. */
+    if (rawBlockFits && (encodedSize == ERROR(dstSize_tooSmall)))
+        return 0;
+    if (ZSTD_isError(encodedSize))
+        return encodedSize;
+
+    /* Keep the result only when it beats the source size by the minimum gain */
+    {   size_t const sizeLimit = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
+        if (sizeLimit <= encodedSize) return 0;
+    }
+
+    /* Dictionaries are only checked to provide offset codes for the first
+     * block : afterwards a still-valid offcode table is downgraded to
+     * "must be checked", since it may lack codes for the offsets to come. */
+    if (nextEntropy->fse.offcode_repeatMode == FSE_repeat_valid)
+        nextEntropy->fse.offcode_repeatMode = FSE_repeat_check;
+    return encodedSize;
+}
+
+/* ZSTD_selectBlockCompressor() :
+ * Looks up the block compressor registered for a (dictMode, strat) pair.
+ * Not static, but internal use only (used by long distance matcher).
+ * assumption : strat is a valid strategy (only checked through assert())
+ * @return : the function pointer stored in the table for that pair */
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode)
+{
+    /* One row per dictMode value, one column per strategy value.
+     * Column 0 is not a strategy : it duplicates the `fast` entry as a default. */
+    static const ZSTD_blockCompressor compressorTable[3][(unsigned)ZSTD_btultra+1] = {
+        /* row 0 : variants without suffix */
+        {   ZSTD_compressBlock_fast,
+            ZSTD_compressBlock_fast,                ZSTD_compressBlock_doubleFast,
+            ZSTD_compressBlock_greedy,              ZSTD_compressBlock_lazy,
+            ZSTD_compressBlock_lazy2,               ZSTD_compressBlock_btlazy2,
+            ZSTD_compressBlock_btopt,               ZSTD_compressBlock_btultra },
+        /* row 1 : _extDict variants */
+        {   ZSTD_compressBlock_fast_extDict,
+            ZSTD_compressBlock_fast_extDict,        ZSTD_compressBlock_doubleFast_extDict,
+            ZSTD_compressBlock_greedy_extDict,      ZSTD_compressBlock_lazy_extDict,
+            ZSTD_compressBlock_lazy2_extDict,       ZSTD_compressBlock_btlazy2_extDict,
+            ZSTD_compressBlock_btopt_extDict,       ZSTD_compressBlock_btultra_extDict },
+        /* row 2 : _dictMatchState variants */
+        {   ZSTD_compressBlock_fast_dictMatchState,
+            ZSTD_compressBlock_fast_dictMatchState, ZSTD_compressBlock_doubleFast_dictMatchState,
+            ZSTD_compressBlock_greedy_dictMatchState, ZSTD_compressBlock_lazy_dictMatchState,
+            ZSTD_compressBlock_lazy2_dictMatchState, ZSTD_compressBlock_btlazy2_dictMatchState,
+            ZSTD_compressBlock_btopt_dictMatchState, ZSTD_compressBlock_btultra_dictMatchState }
+    };
+
+    U32 const stratIdx = (U32)strat;
+    int const modeIdx = (int)dictMode;
+    ZSTD_blockCompressor chosen;
+    ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1);
+
+    /* range checks exist only as assert() : nothing else guards the lookup */
+    assert(stratIdx >= (U32)ZSTD_fast);
+    assert(stratIdx <= (U32)ZSTD_btultra);
+    chosen = compressorTable[modeIdx][stratIdx];
+    /* sanity check on the table content (assert-enabled builds only) */
+    assert(chosen != NULL);
+    return chosen;
+}
+
+static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr,
+                                   const BYTE* anchor, size_t lastLLSize)
+{   /* Appends `lastLLSize` bytes starting at `anchor` to the literals buffer of the sequence store. */
+    memcpy(seqStorePtr->lit, anchor, lastLLSize);        /* copy at the current write position */
+    seqStorePtr->lit = seqStorePtr->lit + lastLLSize;    /* then move the write position past the copy */
+}
+
+void ZSTD_resetSeqStore(seqStore_t* ssPtr)
+{   /* Empties the store : both write cursors go back to their start pointers, longLengthID is cleared. */
+    ssPtr->longLengthID = 0;
+    ssPtr->sequences = ssPtr->sequencesStart;
+    ssPtr->lit = ssPtr->litStart;
+}
+
+static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
+                                        void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize)
+{   /* Compresses one block : selects sequences, then entropy-codes them into dst. @return : compressed size, 0 when the block is to be stored uncompressed by the caller, or an error code */
+    ZSTD_matchState_t* const ms = &zc->blockState.matchState;
+    DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%zu, dictLimit=%u, nextToUpdate=%u)",
+                dstCapacity, ms->window.dictLimit, ms->nextToUpdate);
+
+    if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
+        ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.searchLength);   /* the external sequence store is still advanced over this input */
+        return 0;   /* don't even attempt compression below a certain srcSize */
+    }
+    ZSTD_resetSeqStore(&(zc->seqStore));
+    ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy;   /* required for optimal parser to read stats from dictionary */
+
+    /* a gap between an attached dict and the current window is not safe,
+     * they must remain adjacent, and when that stops being the case, the dict
+     * must be unset */
+    assert(ms->dictMatchState == NULL || ms->loadedDictEnd == ms->window.dictLimit);
+
+    /* limited update after a very long match */
+    {   const BYTE* const base = ms->window.base;
+        const BYTE* const istart = (const BYTE*)src;
+        const U32 current = (U32)(istart-base);
+        if (sizeof(ptrdiff_t)==8) assert(istart - base < (ptrdiff_t)(U32)(-1));   /* ensure no overflow */
+        if (current > ms->nextToUpdate + 384)   /* nextToUpdate is moved to within 192 positions of current */
+            ms->nextToUpdate = current - MIN(192, (U32)(current - ms->nextToUpdate - 384));
+    }
+
+    /* select and store sequences */
+    {   ZSTD_dictMode_e const dictMode = ZSTD_matchState_dictMode(ms);
+        size_t lastLLSize;   /* number of trailing literals left after the last sequence */
+        {   int i;   /* next block starts from the previous block's repcodes */
+            for (i = 0; i < ZSTD_REP_NUM; ++i)
+                zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i];
+        }
+        if (zc->externSeqStore.pos < zc->externSeqStore.size) {   /* externally referenced sequences remain : consume them first */
+            assert(!zc->appliedParams.ldmParams.enableLdm);
+            /* Updates ldmSeqStore.pos */
+            lastLLSize =
+                ZSTD_ldm_blockCompress(&zc->externSeqStore,
+                                       ms, &zc->seqStore,
+                                       zc->blockState.nextCBlock->rep,
+                                       &zc->appliedParams.cParams,
+                                       src, srcSize);
+            assert(zc->externSeqStore.pos <= zc->externSeqStore.size);
+        } else if (zc->appliedParams.ldmParams.enableLdm) {   /* long distance matching : generate sequences, then compress around them */
+            rawSeqStore_t ldmSeqStore = {NULL, 0, 0, 0};
+
+            ldmSeqStore.seq = zc->ldmSequences;
+            ldmSeqStore.capacity = zc->maxNbLdmSequences;
+            /* Updates ldmSeqStore.size */
+            CHECK_F(ZSTD_ldm_generateSequences(&zc->ldmState, &ldmSeqStore,
+                                               &zc->appliedParams.ldmParams,
+                                               src, srcSize));
+            /* Updates ldmSeqStore.pos */
+            lastLLSize =
+                ZSTD_ldm_blockCompress(&ldmSeqStore,
+                                       ms, &zc->seqStore,
+                                       zc->blockState.nextCBlock->rep,
+                                       &zc->appliedParams.cParams,
+                                       src, srcSize);
+            assert(ldmSeqStore.pos == ldmSeqStore.size);
+        } else {   /* not long range mode */
+            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode);
+            lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, &zc->appliedParams.cParams, src, srcSize);
+        }
+        {   const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize;   /* the final lastLLSize bytes of src */
+            ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize);
+    }   }
+
+    /* encode sequences and literals */
+    {   size_t const cSize = ZSTD_compressSequences(&zc->seqStore,
+                                &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
+                                &zc->appliedParams,
+                                dst, dstCapacity,
+                                srcSize, zc->entropyWorkspace, zc->bmi2);
+        if (ZSTD_isError(cSize) || cSize == 0) return cSize;   /* on error or uncompressed outcome, prevCBlock/nextCBlock are not swapped */
+        /* confirm repcodes and entropy tables : nextCBlock becomes prevCBlock */
+        {   ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
+            zc->blockState.prevCBlock = zc->blockState.nextCBlock;
+            zc->blockState.nextCBlock = tmp;
+        }
+        return cSize;
+    }
+}
+
+
+/*! ZSTD_compress_frameChunk() :
+*   Compress a chunk of data into one or multiple blocks.
+*   All blocks will be terminated, all input will be consumed.
+*   Function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
+*   Frame is supposed already started (header already produced)
+*   @return : compressed size (block headers included), or an error code
+*/
+static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx,
+                                     void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                                     U32 lastFrameChunk)
+{   /* lastFrameChunk==1 : the block that consumes the end of src gets the last-block flag */
+    size_t blockSize = cctx->blockSize;
+    size_t remaining = srcSize;
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* op = ostart;
+    U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog;
+    assert(cctx->appliedParams.cParams.windowLog <= 31);
+
+    DEBUGLOG(5, "ZSTD_compress_frameChunk (blockSize=%u)", (U32)blockSize);
+    if (cctx->appliedParams.fParams.checksumFlag && srcSize)
+        XXH64_update(&cctx->xxhState, src, srcSize);   /* the checksum state is fed the uncompressed input */
+
+    while (remaining) {   /* one iteration per emitted block */
+        ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
+        U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
+
+        if (dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE)
+            return ERROR(dstSize_tooSmall);   /* not enough space to store compressed block */
+        if (remaining < blockSize) blockSize = remaining;
+
+        if (ZSTD_window_needOverflowCorrection(ms->window, ip + blockSize)) {
+            U32 const cycleLog = ZSTD_cycleLog(cctx->appliedParams.cParams.chainLog, cctx->appliedParams.cParams.strategy);
+            U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip);
+            ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);
+            ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30);
+            ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+
+            ZSTD_reduceIndex(cctx, correction);
+            if (ms->nextToUpdate < correction) ms->nextToUpdate = 0;   /* clamp instead of wrapping below 0 */
+            else ms->nextToUpdate -= correction;
+            ms->loadedDictEnd = 0;   /* dictionary references are dropped after an index correction */
+            ms->dictMatchState = NULL;
+        }
+        ZSTD_window_enforceMaxDist(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState);
+        if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit;
+
+        {   size_t cSize = ZSTD_compressBlock_internal(cctx,
+                                op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize,
+                                ip, blockSize);
+            if (ZSTD_isError(cSize)) return cSize;
+            /* block header (24 bits) : bit 0 = lastBlock, bits 1-2 = block type, bits 3 and up = size */
+            if (cSize == 0) {  /* block is not compressible : store it raw */
+                U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(blockSize << 3);
+                if (blockSize + ZSTD_blockHeaderSize > dstCapacity) return ERROR(dstSize_tooSmall);
+                MEM_writeLE32(op, cBlockHeader24);   /* 4th byte will be overwritten */
+                memcpy(op + ZSTD_blockHeaderSize, ip, blockSize);
+                cSize = ZSTD_blockHeaderSize + blockSize;
+            } else {
+                U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+                MEM_writeLE24(op, cBlockHeader24);
+                cSize += ZSTD_blockHeaderSize;
+            }
+
+            ip += blockSize;
+            assert(remaining >= blockSize);
+            remaining -= blockSize;
+            op += cSize;
+            assert(dstCapacity >= cSize);
+            dstCapacity -= cSize;
+            DEBUGLOG(5, "ZSTD_compress_frameChunk: adding a block of size %u",
+                        (U32)cSize);
+    }   }
+
+    if (lastFrameChunk && (op>ostart)) cctx->stage = ZSTDcs_ending;   /* only when at least one block was written */
+    return op-ostart;
+}
+
+
+static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity,   /* writes the frame header into dst. @return : header size in bytes, or an error code */
+                                    ZSTD_CCtx_params params, U64 pledgedSrcSize, U32 dictID)
+{   BYTE* const op = (BYTE*)dst;
+    U32   const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536);   /* 0-3 */
+    U32   const dictIDSizeCode = params.fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength;   /* 0-3 */
+    U32   const checksumFlag = params.fParams.checksumFlag>0;
+    U32   const windowSize = (U32)1 << params.cParams.windowLog;
+    U32   const singleSegment = params.fParams.contentSizeFlag && (windowSize >= pledgedSrcSize);
+    BYTE  const windowLogByte = (BYTE)((params.cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3);
+    U32   const fcsCode = params.fParams.contentSizeFlag ?
+                     (pledgedSrcSize>=256) + (pledgedSrcSize>=65536+256) + (pledgedSrcSize>=0xFFFFFFFFU) : 0;  /* 0-3 */
+    BYTE  const frameHeaderDecriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) );   /* bits 0-1 : dictID size code ; bit 2 : checksum ; bit 5 : single segment ; bits 6-7 : content size code */
+    size_t pos=0;   /* write offset inside dst */
+
+    assert(!(params.fParams.contentSizeFlag && pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN));
+    if (dstCapacity < ZSTD_frameHeaderSize_max) return ERROR(dstSize_tooSmall);   /* checked against ZSTD_frameHeaderSize_max, whatever the actual header size */
+    DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u",
+                !params.fParams.noDictIDFlag, dictID,  dictIDSizeCode);
+
+    if (params.format == ZSTD_f_zstd1) {   /* the magic number is only written for this format */
+        MEM_writeLE32(dst, ZSTD_MAGICNUMBER);
+        pos = 4;
+    }
+    op[pos++] = frameHeaderDecriptionByte;
+    if (!singleSegment) op[pos++] = windowLogByte;   /* window byte is omitted in single-segment mode */
+    switch(dictIDSizeCode)   /* dictID stored on 0, 1, 2 or 4 bytes */
+    {
+        default:  assert(0); /* impossible */
+        case 0 : break;
+        case 1 : op[pos] = (BYTE)(dictID); pos++; break;
+        case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break;
+        case 3 : MEM_writeLE32(op+pos, dictID); pos+=4; break;
+    }
+    switch(fcsCode)   /* content size stored on 0, 1, 2, 4 or 8 bytes */
+    {
+        default:  assert(0); /* impossible */
+        case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break;   /* 1 byte, only in single-segment mode */
+        case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break;   /* stored minus 256 */
+        case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break;
+        case 3 : MEM_writeLE64(op+pos, (U64)(pledgedSrcSize)); pos+=8; break;
+    }
+    return pos;
+}
+
+/* ZSTD_writeLastEmptyBlock() :
+ * Completes a frame by emitting a zero-size raw block flagged as last block.
+ * @return : number of bytes written into `dst` (== ZSTD_blockHeaderSize, defined in zstd_internal.h),
+ *           or an error code when `dstCapacity` is too small (< ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity)
+{
+    U32 const lastBlockBit = 1;
+    U32 const emptyRawHeader = lastBlockBit + (((U32)bt_raw)<<1);   /* size field stays 0 */
+    if (dstCapacity < ZSTD_blockHeaderSize) return ERROR(dstSize_tooSmall);
+    MEM_writeLE24(dst, emptyRawHeader);
+    return ZSTD_blockHeaderSize;
+}
+
+size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq)
+{   /* Points the context's external sequence store at the caller's array (the pointer is stored, nothing is copied). @return : 0, or an error code */
+    rawSeqStore_t* const store = &cctx->externSeqStore;
+    /* only accepted while the context is in the ZSTDcs_init stage, and never when enableLdm is set */
+    if (cctx->stage != ZSTDcs_init) return ERROR(stage_wrong);
+    if (cctx->appliedParams.ldmParams.enableLdm) return ERROR(parameter_unsupported);
+    store->seq = seq;
+    store->pos = 0;
+    store->size = nbSeq;
+    store->capacity = nbSeq;
+    return 0;
+}
+
+
+static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize,
+                               U32 frame, U32 lastFrameChunk)
+{   /* Compresses src as a frame chunk (frame != 0, frame header written first when the stage is ZSTDcs_init) or as one bare block (frame == 0). @return : bytes written into dst (frame header included), or an error code */
+    ZSTD_matchState_t* ms = &cctx->blockState.matchState;
+    size_t fhSize = 0;   /* size of the frame header written by this call, if any */
+
+    DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u",
+                cctx->stage, (U32)srcSize);
+    if (cctx->stage==ZSTDcs_created) return ERROR(stage_wrong);   /* missing init (ZSTD_compressBegin) */
+
+    if (frame && (cctx->stage==ZSTDcs_init)) {   /* first call of a frame : emit the frame header */
+        fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->appliedParams,
+                                       cctx->pledgedSrcSizePlusOne-1, cctx->dictID);
+        if (ZSTD_isError(fhSize)) return fhSize;
+        dstCapacity -= fhSize;
+        dst = (char*)dst + fhSize;
+        cctx->stage = ZSTDcs_ongoing;
+    }
+
+    if (!srcSize) return fhSize;  /* do not generate an empty block if no input */
+
+    if (!ZSTD_window_update(&ms->window, src, srcSize)) {   /* NOTE(review): a 0 return presumably means src does not continue the previous input - confirm */
+        ms->nextToUpdate = ms->window.dictLimit;
+    }
+    if (cctx->appliedParams.ldmParams.enableLdm)
+        ZSTD_window_update(&cctx->ldmState.window, src, srcSize);   /* the ldm state tracks its own window */
+
+    DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (U32)cctx->blockSize);
+    {   size_t const cSize = frame ?
+                             ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) :
+                             ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize);
+        if (ZSTD_isError(cSize)) return cSize;
+        cctx->consumedSrcSize += srcSize;
+        cctx->producedCSize += (cSize + fhSize);
+        assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0));
+        if (cctx->pledgedSrcSizePlusOne != 0) {  /* control src size */
+            ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1);
+            if (cctx->consumedSrcSize+1 > cctx->pledgedSrcSizePlusOne) {   /* more input consumed than was pledged */
+                DEBUGLOG(4, "error : pledgedSrcSize = %u, while realSrcSize >= %u",
+                    (U32)cctx->pledgedSrcSizePlusOne-1, (U32)cctx->consumedSrcSize);
+                return ERROR(srcSize_wrong);
+            }
+        }
+        return cSize + fhSize;
+    }
+}
+
+/* ZSTD_compressContinue() :
+ * Compresses one more chunk of input as part of a frame, without marking it as the last chunk.
+ * @return : nb of bytes written into dst, or an error code
+ */
+size_t ZSTD_compressContinue (ZSTD_CCtx* cctx,
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize)
+{
+    U32 const frameMode = 1;
+    U32 const lastChunk = 0;
+    DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (U32)srcSize);
+    return ZSTD_compressContinue_internal(cctx,
+                                          dst, dstCapacity,
+                                          src, srcSize,
+                                          frameMode, lastChunk);
+}
+
+
+/* ZSTD_getBlockSize() :
+ * @return : the smaller of ZSTD_BLOCKSIZE_MAX and the window size (1 << windowLog)
+ *           taken from the context's applied compression parameters.
+ */
+size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+{
+    ZSTD_compressionParameters const applied = cctx->appliedParams.cParams;
+    assert(!ZSTD_checkCParams(applied));
+    {   U32 const windowSize = (U32)1 << applied.windowLog;
+        return MIN (ZSTD_BLOCKSIZE_MAX, windowSize);
+    }
+}
+
+/* ZSTD_compressBlock() :
+ * Compresses `src` through the internal routine with frame == 0, so no frame header is written.
+ * @return : result of ZSTD_compressContinue_internal(),
+ *           or srcSize_wrong when srcSize exceeds ZSTD_getBlockSize(cctx).
+ */
+size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    U32 const blockMode = 0;      /* frame == 0 */
+    U32 const notLastChunk = 0;
+    size_t const maxBlockSize = ZSTD_getBlockSize(cctx);
+    if (srcSize > maxBlockSize) {
+        return ERROR(srcSize_wrong);
+    }
+    return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, blockMode, notLastChunk);
+}
+
+/*! ZSTD_loadDictionaryContent() :
+ *  Registers `src` as dictionary content in the match state's window, then fills
+ *  the search structure matching params->cParams.strategy.
+ *  Inputs of HASH_READ_SIZE bytes or fewer are referenced in the window but not indexed.
+ *  dtlm : forwarded to the hash-table fill functions (ZSTD_fast and ZSTD_dfast only).
+ *  @return : 0, or an error code
+ */
+static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
+                                         ZSTD_CCtx_params const* params,
+                                         const void* src, size_t srcSize,
+                                         ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const BYTE* const ip = (const BYTE*) src;
+    const BYTE* const iend = ip + srcSize;
+    ZSTD_compressionParameters const* cParams = &params->cParams;
+
+    ZSTD_window_update(&ms->window, src, srcSize);
+    /* end-of-dictionary index relative to window.base; forced to 0 when forceWindow is set */
+    ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
+
+    /* too small to index : leave tables untouched (nextToUpdate is not advanced on this path) */
+    if (srcSize <= HASH_READ_SIZE) return 0;
+
+    switch(params->cParams.strategy)
+    {
+    case ZSTD_fast:
+        ZSTD_fillHashTable(ms, cParams, iend, dtlm);
+        break;
+    case ZSTD_dfast:
+        ZSTD_fillDoubleHashTable(ms, cParams, iend, dtlm);
+        break;
+
+    case ZSTD_greedy:
+    case ZSTD_lazy:
+    case ZSTD_lazy2:
+        /* note : this condition is always true here, given the early return above */
+        if (srcSize >= HASH_READ_SIZE)
+            ZSTD_insertAndFindFirstIndex(ms, cParams, iend-HASH_READ_SIZE);
+        break;
+
+    case ZSTD_btlazy2:   /* we want the dictionary table fully sorted */
+    case ZSTD_btopt:
+    case ZSTD_btultra:
+        /* note : this condition is always true here, given the early return above */
+        if (srcSize >= HASH_READ_SIZE)
+            ZSTD_updateTree(ms, cParams, iend-HASH_READ_SIZE, iend);
+        break;
+
+    default:
+        assert(0);  /* not possible : not a valid strategy id */
+    }
+
+    /* next position to index is the end of the dictionary content */
+    ms->nextToUpdate = (U32)(iend - ms->window.base);
+    return 0;
+}
+
+
+/* ZSTD_checkDictNCount() :
+   Dictionaries that assign zero probability to symbols which later show up cause problems
+   during FSE encoding. Dictionaries are therefore refused when any symbol in
+   [0, maxSymbolValue] has a normalized count of zero, or when the dictionary's table
+   covers fewer symbols than maxSymbolValue.
+   NOTE: This behavior is not standard and could be improved in the future.
+   @return : 0 if acceptable, dictionary_corrupted error code otherwise */
+static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) {
+    U32 symbol = 0;
+    if (maxSymbolValue > dictMaxSymbolValue)
+        return ERROR(dictionary_corrupted);
+    while (symbol <= maxSymbolValue) {
+        if (!normalizedCounter[symbol])
+            return ERROR(dictionary_corrupted);
+        ++symbol;
+    }
+    return 0;
+}
+
+
+/* Dictionary format :
+ * See :
+ * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
+ */
+/*! ZSTD_loadZstdDictionary() :
+ *  Parses a full zstd dictionary : magic number, dictID, Huffman table,
+ *  offset / match-length / literal-length FSE tables, 3 repeat offsets, then raw content.
+ *  Entropy tables and repeat offsets are stored into `bs`;
+ *  the content is loaded into `ms` through ZSTD_loadDictionaryContent().
+ *  workspace : scratch area handed to FSE_buildCTable_wksp() with size HUF_WORKSPACE_SIZE.
+ * @return : dictID, or an error code
+ *  assumptions : magic number supposed already checked
+ *                dictSize supposed > 8
+ */
+static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
+                                      ZSTD_matchState_t* ms,
+                                      ZSTD_CCtx_params const* params,
+                                      const void* dict, size_t dictSize,
+                                      ZSTD_dictTableLoadMethod_e dtlm,
+                                      void* workspace)
+{
+    const BYTE* dictPtr = (const BYTE*)dict;
+    const BYTE* const dictEnd = dictPtr + dictSize;
+    short offcodeNCount[MaxOff+1];
+    unsigned offcodeMaxValue = MaxOff;
+    size_t dictID;
+
+    ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+    assert(dictSize > 8);
+    assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY);
+
+    dictPtr += 4;   /* skip magic number */
+    dictID = params->fParams.noDictIDFlag ? 0 :  MEM_readLE32(dictPtr);
+    dictPtr += 4;
+
+    /* Huffman table for literals : must cover all 256 byte values */
+    {   unsigned maxSymbolValue = 255;
+        size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, dictEnd-dictPtr);
+        if (HUF_isError(hufHeaderSize)) return ERROR(dictionary_corrupted);
+        if (maxSymbolValue < 255) return ERROR(dictionary_corrupted);
+        dictPtr += hufHeaderSize;
+    }
+
+    /* offset codes */
+    {   unsigned offcodeLog;
+        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
+        if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
+        /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
+        /* fill all offset symbols to avoid garbage at end of table */
+        CHECK_E( FSE_buildCTable_wksp(bs->entropy.fse.offcodeCTable, offcodeNCount, MaxOff, offcodeLog, workspace, HUF_WORKSPACE_SIZE),
+                 dictionary_corrupted);
+        dictPtr += offcodeHeaderSize;
+    }
+
+    /* match length codes */
+    {   short matchlengthNCount[MaxML+1];
+        unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+        size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
+        /* Every match length code must have non-zero probability */
+        CHECK_F( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML));
+        CHECK_E( FSE_buildCTable_wksp(bs->entropy.fse.matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, workspace, HUF_WORKSPACE_SIZE),
+                 dictionary_corrupted);
+        dictPtr += matchlengthHeaderSize;
+    }
+
+    /* literal length codes */
+    {   short litlengthNCount[MaxLL+1];
+        unsigned litlengthMaxValue = MaxLL, litlengthLog;
+        size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
+        /* Every literal length code must have non-zero probability */
+        CHECK_F( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL));
+        CHECK_E( FSE_buildCTable_wksp(bs->entropy.fse.litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, workspace, HUF_WORKSPACE_SIZE),
+                 dictionary_corrupted);
+        dictPtr += litlengthHeaderSize;
+    }
+
+    /* 3 repeat offsets, 4 bytes each.
+     * Compare the remaining length rather than forming `dictPtr+12`,
+     * which would be an out-of-bounds pointer when fewer than 12 bytes are left. */
+    if ((size_t)(dictEnd - dictPtr) < 12) return ERROR(dictionary_corrupted);
+    bs->rep[0] = MEM_readLE32(dictPtr+0);
+    bs->rep[1] = MEM_readLE32(dictPtr+4);
+    bs->rep[2] = MEM_readLE32(dictPtr+8);
+    dictPtr += 12;
+
+    /* everything that remains is dictionary content */
+    {   size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
+        U32 offcodeMax = MaxOff;
+        if (dictContentSize <= ((U32)-1) - 128 KB) {
+            U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
+            offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
+        }
+        /* All offset values <= dictContentSize + 128 KB must be representable */
+        CHECK_F (ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)));
+        /* All repCodes must be <= dictContentSize and != 0*/
+        {   U32 u;
+            for (u=0; u<3; u++) {
+                if (bs->rep[u] == 0) return ERROR(dictionary_corrupted);
+                if (bs->rep[u] > dictContentSize) return ERROR(dictionary_corrupted);
+        }   }
+
+        /* mark all entropy tables loaded from the dictionary as valid for reuse */
+        bs->entropy.huf.repeatMode = HUF_repeat_valid;
+        bs->entropy.fse.offcode_repeatMode = FSE_repeat_valid;
+        bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid;
+        bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid;
+        CHECK_F(ZSTD_loadDictionaryContent(ms, params, dictPtr, dictContentSize, dtlm));
+        return dictID;
+    }
+}
+
+/** ZSTD_compress_insertDictionary() :
+*   Loads `dict` either as raw content or as a full zstd dictionary, depending on
+*   `dictContentType` and on whether the buffer starts with ZSTD_MAGIC_DICTIONARY.
+*   A NULL dictionary, or one of 8 bytes or fewer, is ignored (returns 0 without touching `bs`).
+*   @return : dictID, or an error code */
+static size_t
+ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
+                               ZSTD_matchState_t* ms,
+                         const ZSTD_CCtx_params* params,
+                         const void* dict, size_t dictSize,
+                               ZSTD_dictContentType_e dictContentType,
+                               ZSTD_dictTableLoadMethod_e dtlm,
+                               void* workspace)
+{
+    DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
+    if ((dict==NULL) || (dictSize<=8)) return 0;
+
+    ZSTD_reset_compressedBlockState(bs);
+
+    if (dictContentType != ZSTD_dct_rawContent) {
+        U32 const magic = MEM_readLE32(dict);
+
+        /* dict as full zstd dictionary */
+        if (magic == ZSTD_MAGIC_DICTIONARY)
+            return ZSTD_loadZstdDictionary(bs, ms, params, dict, dictSize, dtlm, workspace);
+
+        /* no magic number : outcome depends on the requested mode */
+        if (dictContentType == ZSTD_dct_auto) {
+            DEBUGLOG(4, "raw content dictionary detected");
+            return ZSTD_loadDictionaryContent(ms, params, dict, dictSize, dtlm);
+        }
+        if (dictContentType == ZSTD_dct_fullDict)
+            return ERROR(dictionary_wrong);
+        assert(0);   /* impossible */
+        /* with assert() disabled, an unknown mode is still parsed as a full dictionary */
+        return ZSTD_loadZstdDictionary(bs, ms, params, dict, dictSize, dtlm, workspace);
+    }
+
+    /* dict restricted mode : raw content explicitly requested */
+    return ZSTD_loadDictionaryContent(ms, params, dict, dictSize, dtlm);
+}
+
+/*! ZSTD_compressBegin_internal() :
+ *  Prepares `cctx` for a new compression session.
+ *  A `cdict` with non-empty content is handled by ZSTD_resetCCtx_usingCDict();
+ *  otherwise the context is reset and `dict` (if any) is inserted.
+ *  `dict` and `cdict` must not both be provided.
+ * @return : 0, or an error code */
+size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
+                             const void* dict, size_t dictSize,
+                             ZSTD_dictContentType_e dictContentType,
+                             ZSTD_dictTableLoadMethod_e dtlm,
+                             const ZSTD_CDict* cdict,
+                             ZSTD_CCtx_params params, U64 pledgedSrcSize,
+                             ZSTD_buffered_policy_e zbuff)
+{
+    DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params.cParams.windowLog);
+    /* params are supposed to be fully validated at this point */
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
+
+    if ((cdict != NULL) && (cdict->dictContentSize != 0))
+        return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff);
+
+    {   size_t const resetResult = ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
+                                                           ZSTDcrp_continue, zbuff);
+        if (ZSTD_isError(resetResult)) return resetResult;
+    }
+
+    {   size_t const loadResult = ZSTD_compress_insertDictionary(
+                cctx->blockState.prevCBlock, &cctx->blockState.matchState,
+                &params, dict, dictSize, dictContentType, dtlm, cctx->entropyWorkspace);
+        if (ZSTD_isError(loadResult)) return loadResult;
+        /* on success, the value is a dictID and must fit in 32 bits */
+        assert(loadResult <= (size_t)(U32)-1);
+        cctx->dictID = (U32)loadResult;
+    }
+    return 0;
+}
+
+/* ZSTD_compressBegin_advanced_internal() :
+ * Checks params.cParams, then starts a session through ZSTD_compressBegin_internal()
+ * with the ZSTDb_not_buffered policy.
+ * @return : 0, or an error code */
+size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
+                                    const void* dict, size_t dictSize,
+                                    ZSTD_dictContentType_e dictContentType,
+                                    ZSTD_dictTableLoadMethod_e dtlm,
+                                    const ZSTD_CDict* cdict,
+                                    ZSTD_CCtx_params params,
+                                    unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_compressBegin_advanced_internal: wlog=%u", params.cParams.windowLog);
+    /* compression parameters verification and optimization */
+    {   size_t const paramCheck = ZSTD_checkCParams(params.cParams);
+        if (ZSTD_isError(paramCheck)) return paramCheck;
+    }
+    return ZSTD_compressBegin_internal(cctx, dict, dictSize,
+                                       dictContentType, dtlm, cdict,
+                                       params, pledgedSrcSize, ZSTDb_not_buffered);
+}
+
+/*! ZSTD_compressBegin_advanced() :
+*   Combines `params` with the context's requested parameters, then starts a session
+*   with an optional dictionary buffer (content type auto-detected, no cdict).
+*   @return : 0, or an error code */
+size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
+                             const void* dict, size_t dictSize,
+                                   ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+    ZSTD_CCtx_params const mergedParams =
+            ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params);
+    const ZSTD_CDict* const noCDict = NULL;
+    return ZSTD_compressBegin_advanced_internal(cctx, dict, dictSize,
+                                                ZSTD_dct_auto, ZSTD_dtlm_fast,
+                                                noCDict,
+                                                mergedParams, pledgedSrcSize);
+}
+
+/* ZSTD_compressBegin_usingDict() :
+ * Starts a session at `compressionLevel` with an optional dictionary buffer.
+ * The source size is reported as unknown (ZSTD_CONTENTSIZE_UNKNOWN).
+ * @return : 0, or an error code */
+size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_parameters const levelParams =
+            ZSTD_getParams(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
+    ZSTD_CCtx_params const mergedParams =
+            ZSTD_assignParamsToCCtxParams(cctx->requestedParams, levelParams);
+    DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (U32)dictSize);
+    return ZSTD_compressBegin_internal(cctx,
+                                       dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast,
+                                       NULL /*cdict*/,
+                                       mergedParams, ZSTD_CONTENTSIZE_UNKNOWN,
+                                       ZSTDb_not_buffered);
+}
+
+/* ZSTD_compressBegin() :
+ * Starts a session at `compressionLevel`, without any dictionary.
+ * @return : 0, or an error code */
+size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel)
+{
+    const void* const noDict = NULL;
+    size_t const noDictSize = 0;
+    return ZSTD_compressBegin_usingDict(cctx, noDict, noDictSize, compressionLevel);
+}
+
+
+/*! ZSTD_writeEpilogue() :
+*   Ends a frame.
+*   Writes, in order : a frame header if nothing was emitted yet (empty frame),
+*   a last empty raw block unless the stage is already ZSTDcs_ending,
+*   and the 4-byte checksum when checksumFlag is set.
+*   On success the context returns to stage ZSTDcs_created.
+*   @return : nb of bytes written into dst (or an error code) */
+static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* op = ostart;
+    size_t fhSize = 0;
+
+    DEBUGLOG(4, "ZSTD_writeEpilogue");
+    if (cctx->stage == ZSTDcs_created) return ERROR(stage_wrong);  /* init missing */
+
+    /* special case : empty frame */
+    if (cctx->stage == ZSTDcs_init) {
+        fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->appliedParams, 0, 0);
+        if (ZSTD_isError(fhSize)) return fhSize;
+        dstCapacity -= fhSize;
+        op += fhSize;
+        cctx->stage = ZSTDcs_ongoing;
+    }
+
+    if (cctx->stage != ZSTDcs_ending) {
+        /* write one last empty block, make it the "last" block */
+        U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0;
+        /* The block header is ZSTD_blockHeaderSize bytes : write exactly that many,
+         * so that no byte beyond the header is modified and no extra capacity is required
+         * (same approach as ZSTD_writeLastEmptyBlock()). */
+        if (dstCapacity < ZSTD_blockHeaderSize) return ERROR(dstSize_tooSmall);
+        MEM_writeLE24(op, cBlockHeader24);
+        op += ZSTD_blockHeaderSize;
+        dstCapacity -= ZSTD_blockHeaderSize;
+    }
+
+    if (cctx->appliedParams.fParams.checksumFlag) {
+        /* only the low 32 bits of the XXH64 digest are stored */
+        U32 const checksum = (U32) XXH64_digest(&cctx->xxhState);
+        if (dstCapacity<4) return ERROR(dstSize_tooSmall);
+        DEBUGLOG(4, "ZSTD_writeEpilogue: write checksum : %08X", checksum);
+        MEM_writeLE32(op, checksum);
+        op += 4;
+    }
+
+    cctx->stage = ZSTDcs_created;  /* return to "created but no init" status */
+    return op-ostart;
+}
+
+/* ZSTD_compressEnd() :
+ * Compresses the final chunk of input (may be empty), then writes the frame epilogue.
+ * When a source size was pledged, the total consumed size must match it exactly.
+ * @return : nb of bytes written into dst, or an error code
+ *           (srcSize_wrong when the consumed size differs from the pledged one)
+ */
+size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
+                         void* dst, size_t dstCapacity,
+                   const void* src, size_t srcSize)
+{
+    size_t const cSize = ZSTD_compressContinue_internal(cctx,
+                                dst, dstCapacity, src, srcSize,
+                                1 /* frame mode */, 1 /* last chunk */);
+    if (ZSTD_isError(cSize)) return cSize;
+
+    {   size_t const epilogueSize = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize);
+        if (ZSTD_isError(epilogueSize)) return epilogueSize;
+
+        assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0));
+        if (cctx->pledgedSrcSizePlusOne != 0) {  /* control src size */
+            ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1);
+            DEBUGLOG(4, "end of frame : controlling src size");
+            if (cctx->consumedSrcSize+1 != cctx->pledgedSrcSizePlusOne) {
+                DEBUGLOG(4, "error : pledgedSrcSize = %u, while realSrcSize = %u",
+                    (U32)cctx->pledgedSrcSizePlusOne-1, (U32)cctx->consumedSrcSize);
+                return ERROR(srcSize_wrong);
+            }
+        }
+        return cSize + epilogueSize;
+    }
+}
+
+
+/* ZSTD_compress_internal() :
+ * Combines `params` with the context's requested parameters,
+ * then delegates to ZSTD_compress_advanced_internal().
+ * @return : compressed size written into dst, or an error code */
+static size_t ZSTD_compress_internal (ZSTD_CCtx* cctx,
+                                      void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize,
+                                const void* dict,size_t dictSize,
+                                      ZSTD_parameters params)
+{
+    ZSTD_CCtx_params const mergedParams =
+            ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params);
+    DEBUGLOG(4, "ZSTD_compress_internal");
+    return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize,
+                                           dict, dictSize, mergedParams);
+}
+
+/* ZSTD_compress_advanced() :
+ * One-shot compression with explicit parameters and an optional dictionary buffer.
+ * params.cParams is checked before any work is done.
+ * @return : compressed size written into dst, or an error code */
+size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                         const void* dict,size_t dictSize,
+                               ZSTD_parameters params)
+{
+    DEBUGLOG(4, "ZSTD_compress_advanced");
+    {   size_t const paramCheck = ZSTD_checkCParams(params.cParams);
+        if (ZSTD_isError(paramCheck)) return paramCheck;
+    }
+    return ZSTD_compress_internal(cctx, dst, dstCapacity, src, srcSize,
+                                  dict, dictSize, params);
+}
+
+/* Internal */
+/* ZSTD_compress_advanced_internal() :
+ * Begins a session with `srcSize` as the pledged source size, then compresses
+ * the whole input and closes the frame through ZSTD_compressEnd().
+ * @return : compressed size written into dst, or an error code */
+size_t ZSTD_compress_advanced_internal(
+        ZSTD_CCtx* cctx,
+        void* dst, size_t dstCapacity,
+        const void* src, size_t srcSize,
+        const void* dict,size_t dictSize,
+        ZSTD_CCtx_params params)
+{
+    DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", (U32)srcSize);
+    {   size_t const beginResult = ZSTD_compressBegin_internal(cctx,
+                         dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
+                         params, srcSize, ZSTDb_not_buffered);
+        if (ZSTD_isError(beginResult)) return beginResult;
+    }
+    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+}
+
+/* ZSTD_compress_usingDict() :
+ * One-shot compression at `compressionLevel` with an optional dictionary buffer.
+ * Parameters are selected from the source size (an empty source is hinted as 1 byte)
+ * and from the dictionary size (0 when `dict` is NULL).
+ * @return : compressed size written into dst, or an error code */
+size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                         const void* dict, size_t dictSize,
+                               int compressionLevel)
+{
+    size_t const srcSizeHint = srcSize + (srcSize == 0);
+    size_t const dictSizeHint = (dict != NULL) ? dictSize : 0;
+    ZSTD_parameters const levelParams = ZSTD_getParams(compressionLevel, srcSizeHint, dictSizeHint);
+    ZSTD_CCtx_params mergedParams = ZSTD_assignParamsToCCtxParams(cctx->requestedParams, levelParams);
+    assert(levelParams.fParams.contentSizeFlag == 1);
+    return ZSTD_compress_advanced_internal(cctx,
+                                           dst, dstCapacity,
+                                           src, srcSize,
+                                           dict, dictSize,
+                                           mergedParams);
+}
+
+/* ZSTD_compressCCtx() :
+ * One-shot compression at `compressionLevel` using an existing context and no dictionary.
+ * `cctx` must not be NULL.
+ * @return : compressed size written into dst, or an error code */
+size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
+                         void* dst, size_t dstCapacity,
+                   const void* src, size_t srcSize,
+                         int compressionLevel)
+{
+    DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (U32)srcSize);
+    assert(cctx != NULL);
+    return ZSTD_compress_usingDict(cctx,
+                                   dst, dstCapacity,
+                                   src, srcSize,
+                                   NULL /*dict*/, 0 /*dictSize*/,
+                                   compressionLevel);
+}
+
+/* ZSTD_compress() :
+ * One-shot compression using a temporary context that lives on the stack.
+ * @return : compressed size written into dst, or an error code */
+size_t ZSTD_compress(void* dst, size_t dstCapacity,
+               const void* src, size_t srcSize,
+                     int compressionLevel)
+{
+    ZSTD_CCtx stackCtx;
+    size_t cSize;
+
+    ZSTD_initCCtx(&stackCtx, ZSTD_defaultCMem);
+    cSize = ZSTD_compressCCtx(&stackCtx, dst, dstCapacity, src, srcSize, compressionLevel);
+    /* the context object itself is on the stack : release only what it owns */
+    ZSTD_freeCCtxContent(&stackCtx);
+    return cSize;
+}
+
+
+/* =====  Dictionary API  ===== */
+
+/*! ZSTD_estimateCDictSize_advanced() :
+ *  Estimate amount of memory that will be needed to create a dictionary with following arguments.
+ *  The total is : sizeof(ZSTD_CDict) + HUF_WORKSPACE_SIZE + match state size,
+ *  plus `dictSize` unless the dictionary is loaded by reference. */
+size_t ZSTD_estimateCDictSize_advanced(
+        size_t dictSize, ZSTD_compressionParameters cParams,
+        ZSTD_dictLoadMethod_e dictLoadMethod)
+{
+    DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (U32)sizeof(ZSTD_CDict));
+    {   size_t const copiedDictSize = (dictLoadMethod == ZSTD_dlm_byRef) ? 0 : dictSize;
+        size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0);
+        return sizeof(ZSTD_CDict) + HUF_WORKSPACE_SIZE + matchStateSize + copiedDictSize;
+    }
+}
+
+/* ZSTD_estimateCDictSize() :
+ * Memory estimate for a dictionary of `dictSize` bytes, copied internally (ZSTD_dlm_byCopy),
+ * using the compression parameters selected for `compressionLevel`. */
+size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel)
+{
+    return ZSTD_estimateCDictSize_advanced(dictSize,
+                                           ZSTD_getCParams(compressionLevel, 0, dictSize),
+                                           ZSTD_dlm_byCopy);
+}
+
+/* ZSTD_sizeof_CDict() :
+ * @return : workspace size + sizeof(ZSTD_CDict), plus the dictionary content size
+ *           when the cdict holds its own dictBuffer; 0 when `cdict` is NULL. */
+size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict)
+{
+    if (cdict==NULL) return 0;   /* support sizeof on NULL */
+    DEBUGLOG(5, "sizeof(*cdict) : %u", (U32)sizeof(*cdict));
+    {   size_t const ownedDictSize = (cdict->dictBuffer != NULL) ? cdict->dictContentSize : 0;
+        return cdict->workspaceSize + ownedDictSize + sizeof(*cdict);
+    }
+}
+
+/* ZSTD_initCDict_internal() :
+ * Fills an already-allocated `cdict` : stores cParams, references or copies the dictionary,
+ * resets block state and match state inside cdict->workspace, then loads the dictionary.
+ * cdict->workspace, cdict->workspaceSize (and cdict->customMem, used when copying)
+ * are read here, so they must have been set by the caller.
+ * @return : 0, or an error code (memory_allocation, or an error from dictionary loading)
+ */
+static size_t ZSTD_initCDict_internal(
+                    ZSTD_CDict* cdict,
+              const void* dictBuffer, size_t dictSize,
+                    ZSTD_dictLoadMethod_e dictLoadMethod,
+                    ZSTD_dictContentType_e dictContentType,
+                    ZSTD_compressionParameters cParams)
+{
+    DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (U32)dictContentType);
+    assert(!ZSTD_checkCParams(cParams));
+    cdict->cParams = cParams;
+    /* by reference, or nothing to copy : just point to the caller's buffer */
+    if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) {
+        cdict->dictBuffer = NULL;
+        cdict->dictContent = dictBuffer;
+    } else {
+        void* const internalBuffer = ZSTD_malloc(dictSize, cdict->customMem);
+        /* both fields are assigned before the NULL check, so they hold a defined value on failure */
+        cdict->dictBuffer = internalBuffer;
+        cdict->dictContent = internalBuffer;
+        if (!internalBuffer) return ERROR(memory_allocation);
+        memcpy(internalBuffer, dictBuffer, dictSize);
+    }
+    cdict->dictContentSize = dictSize;
+
+    /* Reset the state to no dictionary */
+    ZSTD_reset_compressedBlockState(&cdict->cBlockState);
+    /* the match state starts after the first HUF_WORKSPACE_SIZE_U32 U32 slots of the workspace */
+    {   void* const end = ZSTD_reset_matchState(
+                &cdict->matchState,
+                (U32*)cdict->workspace + HUF_WORKSPACE_SIZE_U32,
+                &cParams, ZSTDcrp_continue, /* forCCtx */ 0);
+        assert(end == (char*)cdict->workspace + cdict->workspaceSize);
+        (void)end;
+    }
+    /* (Maybe) load the dictionary
+     * Skips loading the dictionary if it is <= 8 bytes.
+     */
+    {   ZSTD_CCtx_params params;
+        memset(&params, 0, sizeof(params));
+        params.compressionLevel = ZSTD_CLEVEL_DEFAULT;
+        params.fParams.contentSizeFlag = 1;
+        params.cParams = cParams;
+        /* cdict->workspace is passed as scratch space for dictionary loading */
+        {   size_t const dictID = ZSTD_compress_insertDictionary(
+                    &cdict->cBlockState, &cdict->matchState, &params,
+                    cdict->dictContent, cdict->dictContentSize,
+                    dictContentType, ZSTD_dtlm_full, cdict->workspace);
+            if (ZSTD_isError(dictID)) return dictID;
+            assert(dictID <= (size_t)(U32)-1);
+            cdict->dictID = (U32)dictID;
+        }
+    }
+
+    return 0;
+}
+
+/* ZSTD_createCDict_advanced() :
+ * Allocates a ZSTD_CDict and its workspace with `customMem`, then initializes it from `dictBuffer`.
+ * `customMem` must provide both customAlloc and customFree, or neither.
+ * @return : the new cdict, or NULL on invalid allocator, allocation failure or initialization error
+ */
+ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize,
+                                      ZSTD_dictLoadMethod_e dictLoadMethod,
+                                      ZSTD_dictContentType_e dictContentType,
+                                      ZSTD_compressionParameters cParams, ZSTD_customMem customMem)
+{
+    DEBUGLOG(3, "ZSTD_createCDict_advanced, mode %u", (U32)dictContentType);
+    /* reject a half-specified allocator (only one of the two callbacks provided) */
+    if ((customMem.customAlloc == NULL) != (customMem.customFree == NULL)) return NULL;
+
+    {   ZSTD_CDict* const newDict = (ZSTD_CDict*)ZSTD_malloc(sizeof(ZSTD_CDict), customMem);
+        size_t const wkspSize = HUF_WORKSPACE_SIZE + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0);
+        void* const wksp = ZSTD_malloc(wkspSize, customMem);
+
+        if ((newDict == NULL) || (wksp == NULL)) {
+            ZSTD_free(newDict, customMem);
+            ZSTD_free(wksp, customMem);
+            return NULL;
+        }
+
+        newDict->customMem = customMem;
+        newDict->workspace = wksp;
+        newDict->workspaceSize = wkspSize;
+
+        {   size_t const initResult = ZSTD_initCDict_internal(newDict,
+                                            dictBuffer, dictSize,
+                                            dictLoadMethod, dictContentType,
+                                            cParams);
+            if (ZSTD_isError(initResult)) {
+                ZSTD_freeCDict(newDict);
+                return NULL;
+            }
+        }
+        return newDict;
+    }
+}
+
+/* ZSTD_createCDict() :
+ * Creates a digested dictionary for `compressionLevel`; the dictionary is copied internally
+ * (ZSTD_dlm_byCopy), its content type is auto-detected, and the default allocator is used.
+ * @return : the new cdict, or NULL on error */
+ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_compressionParameters const levelCParams =
+            ZSTD_getCParams(compressionLevel, 0, dictSize);
+    return ZSTD_createCDict_advanced(dict, dictSize,
+                                     ZSTD_dlm_byCopy,
+                                     ZSTD_dct_auto,
+                                     levelCParams,
+                                     ZSTD_defaultCMem);
+}
+
+/* ZSTD_createCDict_byReference() :
+ * Same as ZSTD_createCDict(), except that the dictionary is requested by reference
+ * (ZSTD_dlm_byRef) rather than by copy.
+ * @return : the new cdict, or NULL on error */
+ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_compressionParameters const levelCParams =
+            ZSTD_getCParams(compressionLevel, 0, dictSize);
+    return ZSTD_createCDict_advanced(dict, dictSize,
+                                     ZSTD_dlm_byRef,
+                                     ZSTD_dct_auto,
+                                     levelCParams,
+                                     ZSTD_defaultCMem);
+}
+
+/* ZSTD_freeCDict() :
+ * Releases the workspace, the internal dictionary buffer and the cdict itself,
+ * using the allocator stored in the cdict. Accepts NULL.
+ * @return : 0 */
+size_t ZSTD_freeCDict(ZSTD_CDict* cdict)
+{
+    if (cdict != NULL) {
+        /* copy the allocator first : the cdict holding it is freed last */
+        ZSTD_customMem const allocator = cdict->customMem;
+        ZSTD_free(cdict->workspace, allocator);
+        ZSTD_free(cdict->dictBuffer, allocator);
+        ZSTD_free(cdict, allocator);
+    }
+    return 0;   /* support free on NULL */
+}
+
+/*! ZSTD_initStaticCDict() :
+ *  Generate a digested dictionary in provided memory area.
+ *  workspace: The memory area to emplace the dictionary into.
+ *             Provided pointer must be 8-bytes aligned.
+ *             It must outlive dictionary usage.
+ *  workspaceSize: Use ZSTD_estimateCDictSize()
+ *                 to determine how large workspace must be.
+ *  cParams : use ZSTD_getCParams() to transform a compression level
+ *            into its relevant cParams.
+ * @return : pointer to ZSTD_CDict*, or NULL if error (size too small)
+ *  Note : there is no corresponding "free" function.
+ *         Since workspace was allocated externally, it must be freed externally.
+ */
+const ZSTD_CDict* ZSTD_initStaticCDict(
+                                 void* workspace, size_t workspaceSize,
+                           const void* dict, size_t dictSize,
+                                 ZSTD_dictLoadMethod_e dictLoadMethod,
+                                 ZSTD_dictContentType_e dictContentType,
+                                 ZSTD_compressionParameters cParams)
+{
+    size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0);
+    /* layout : [ZSTD_CDict][dictionary copy, unless by reference][HUF workspace + match state] */
+    size_t const neededSize = sizeof(ZSTD_CDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize)
+                            + HUF_WORKSPACE_SIZE + matchStateSize;
+    ZSTD_CDict* const cdict = (ZSTD_CDict*) workspace;
+    void* ptr;   /* start of the cdict's internal workspace */
+    if ((size_t)workspace & 7) return NULL;  /* 8-aligned */
+    DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u",
+        (U32)workspaceSize, (U32)neededSize, (U32)(workspaceSize < neededSize));
+    if (workspaceSize < neededSize) return NULL;
+
+    if (dictLoadMethod == ZSTD_dlm_byCopy) {
+        /* copy the dictionary right after the ZSTD_CDict structure, and use that copy from now on */
+        memcpy(cdict+1, dict, dictSize);
+        dict = cdict+1;
+        ptr = (char*)workspace + sizeof(ZSTD_CDict) + dictSize;
+    } else {
+        ptr = cdict+1;
+    }
+    cdict->workspace = ptr;
+    cdict->workspaceSize = HUF_WORKSPACE_SIZE + matchStateSize;
+
+    /* always initialized by reference : any requested copy has already been made above */
+    if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
+                                              dict, dictSize,
+                                              ZSTD_dlm_byRef, dictContentType,
+                                              cParams) ))
+        return NULL;
+
+    return cdict;
+}
+
+/* ZSTD_getCParamsFromCDict() :
+ * @return : a copy of the compression parameters stored in `cdict`.
+ * `cdict` must not be NULL. */
+ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict)
+{
+    assert(cdict != NULL);
+    {   ZSTD_compressionParameters const storedCParams = cdict->cParams;
+        return storedCParams;
+    }
+}
+
+/* ZSTD_compressBegin_usingCDict_advanced() :
+ * Starts a session from a digested dictionary, with explicit frame parameters.
+ * Compression parameters come from the cdict; the window log may be raised
+ * when the source size is known.
+ * cdict must be != NULL (dictionary_wrong is returned otherwise)
+ * @return : 0, or an error code */
+size_t ZSTD_compressBegin_usingCDict_advanced(
+    ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict,
+    ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced");
+    if (cdict==NULL) return ERROR(dictionary_wrong);
+    {   ZSTD_CCtx_params sessionParams = cctx->requestedParams;
+        sessionParams.cParams = ZSTD_getCParamsFromCDict(cdict);
+        /* Increase window log to fit the entire dictionary and source if the
+         * source size is known. Limit the increase to 19, which is the
+         * window log for compression level 1 with the largest source size.
+         */
+        if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+            unsigned long long const srcSizeCap = 1U << 19;
+            U32 const cappedSrcSize = (U32)((pledgedSrcSize < srcSizeCap) ? pledgedSrcSize : srcSizeCap);
+            U32 srcLog = 1;
+            if (cappedSrcSize > 1)
+                srcLog = ZSTD_highbit32(cappedSrcSize - 1) + 1;
+            if (sessionParams.cParams.windowLog < srcLog)
+                sessionParams.cParams.windowLog = srcLog;
+        }
+        sessionParams.fParams = fParams;
+        return ZSTD_compressBegin_internal(cctx,
+                                           NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast,
+                                           cdict,
+                                           sessionParams, pledgedSrcSize,
+                                           ZSTDb_not_buffered);
+    }
+}
+
+/* ZSTD_compressBegin_usingCDict() :
+ * Starts a session from a digested dictionary, with all frame parameter flags set to 0
+ * and the source size reported as unknown (ZSTD_CONTENTSIZE_UNKNOWN).
+ * @return : 0, or an error code */
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+    ZSTD_frameParameters const fParams = {
+        0 /*content*/,
+        0 /*checksum*/,
+        0 /*noDictID*/
+    };
+    DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag);
+    return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict,
+                                                  fParams,
+                                                  ZSTD_CONTENTSIZE_UNKNOWN);
+}
+
+size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+                                void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize,
+                                const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)
+{   /* one-shot : begin a frame with srcSize as the pledged size, then hand all of src to ZSTD_compressEnd() */
+    CHECK_F (ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, srcSize));   /* will check if cdict != NULL */
+    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+}
+
+/*! ZSTD_compress_usingCDict() :
+ *  Compression using a digested Dictionary.
+ *  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
+ *  Note that compression parameters are decided at CDict creation time
+ *  while frame parameters are hardcoded (content=1, checksum=0, noDictID=0) */
+size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+                                void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize,
+                                const ZSTD_CDict* cdict)
+{
+    ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+    return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, fParams);
+}
+
+
+
+/* ******************************************************************
+*  Streaming
+********************************************************************/
+
+ZSTD_CStream* ZSTD_createCStream(void)
+{   /* same as ZSTD_createCStream_advanced(), called with ZSTD_defaultCMem */
+    DEBUGLOG(3, "ZSTD_createCStream");
+    return ZSTD_createCStream_advanced(ZSTD_defaultCMem);
+}
+
+ZSTD_CStream* ZSTD_initStaticCStream(void *workspace, size_t workspaceSize)
+{   /* thin wrapper : forwards both arguments, unchanged, to ZSTD_initStaticCCtx() */
+    return ZSTD_initStaticCCtx(workspace, workspaceSize);
+}
+
+ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem)
+{   /* CStream and CCtx are now same object : creation is delegated to ZSTD_createCCtx_advanced() */
+    return ZSTD_createCCtx_advanced(customMem);
+}
+
+size_t ZSTD_freeCStream(ZSTD_CStream* zcs)
+{   /* returns whatever ZSTD_freeCCtx() returns */
+    return ZSTD_freeCCtx(zcs);   /* same object */
+}
+
+
+
+/*======   Initialization   ======*/
+
+size_t ZSTD_CStreamInSize(void)  { return ZSTD_BLOCKSIZE_MAX; }   /* constant : always ZSTD_BLOCKSIZE_MAX */
+
+size_t ZSTD_CStreamOutSize(void)
+{   /* constant : compress bound of ZSTD_BLOCKSIZE_MAX bytes + one block header + 4 bytes for the hash */
+    return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ;
+}
+
+static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx,
+                    const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType,
+                    const ZSTD_CDict* const cdict,
+                    ZSTD_CCtx_params const params, unsigned long long const pledgedSrcSize)
+{   /* begins a new frame in buffered mode, then rewinds every streaming buffer position; @return 0 on success */
+    DEBUGLOG(4, "ZSTD_resetCStream_internal");
+    /* params are supposed to be fully validated at this point */
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
+
+    CHECK_F( ZSTD_compressBegin_internal(cctx,   /* CHECK_F presumably returns the error code on failure - defined outside this view */
+                                         dict, dictSize, dictContentType, ZSTD_dtlm_fast,
+                                         cdict,
+                                         params, pledgedSrcSize,
+                                         ZSTDb_buffered) );
+
+    cctx->inToCompress = 0;   /* nothing pending in the input buffer */
+    cctx->inBuffPos = 0;
+    cctx->inBuffTarget = cctx->blockSize
+                      + (cctx->blockSize == pledgedSrcSize);   /* for small input: avoid automatic flush on reaching end of block, since it would require to add a 3-bytes null block to end frame */
+    cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0;   /* nothing pending in the output buffer */
+    cctx->streamStage = zcss_load;   /* next ZSTD_compressStream_generic() call starts by loading input */
+    cctx->frameEnded = 0;
+    return 0;   /* ready to go */
+}
+
+/* ZSTD_resetCStream():
+ * restarts from zcs->requestedParams and zcs->cdict; pledgedSrcSize == 0 means "unknown" */
+size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize)
+{
+    ZSTD_CCtx_params params = zcs->requestedParams;   /* local copy, adjusted below */
+    DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (U32)pledgedSrcSize);
+    if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;
+    params.fParams.contentSizeFlag = 1;   /* always set, whatever was requested */
+    params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, 0);   /* last argument (0) : no dictionary size */
+    return ZSTD_resetCStream_internal(zcs, NULL, 0, ZSTD_dct_auto, zcs->cdict, params, pledgedSrcSize);
+}
+
+/*! ZSTD_initCStream_internal() :
+ *  Note : for lib/compress only. Used by zstdmt_compress.c.
+ *  Assumption 1 : params are valid
+ *  Assumption 2 : either dict, or cdict, is defined, not both */
+size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
+                    const void* dict, size_t dictSize, const ZSTD_CDict* cdict,
+                    ZSTD_CCtx_params params, unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_initCStream_internal");
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
+
+    if (dict && dictSize >= 8) {   /* a dict shorter than 8 bytes is not loaded : the else branch runs instead */
+        DEBUGLOG(4, "loading dictionary of size %u", (U32)dictSize);
+        if (zcs->staticSize) {   /* static CCtx : never uses malloc */
+            /* incompatible with internal cdict creation */
+            return ERROR(memory_allocation);
+        }
+        ZSTD_freeCDict(zcs->cdictLocal);   /* release the previous locally-owned cdict before replacing it */
+        zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize,
+                                            ZSTD_dlm_byCopy, ZSTD_dct_auto,
+                                            params.cParams, zcs->customMem);
+        zcs->cdict = zcs->cdictLocal;   /* the stream now references its own local cdict */
+        if (zcs->cdictLocal == NULL) return ERROR(memory_allocation);
+    } else {
+        if (cdict) {
+            params.cParams = ZSTD_getCParamsFromCDict(cdict);  /* cParams are enforced from cdict; it includes windowLog */
+        }
+        ZSTD_freeCDict(zcs->cdictLocal);
+        zcs->cdictLocal = NULL;
+        zcs->cdict = cdict;   /* caller-provided cdict is only referenced here; may be NULL (no dictionary) */
+    }
+
+    return ZSTD_resetCStream_internal(zcs, NULL, 0, ZSTD_dct_auto, zcs->cdict, params, pledgedSrcSize);
+}
+
+/* ZSTD_initCStream_usingCDict_advanced() :
+ * same as ZSTD_initCStream_usingCDict(), with control over frame parameters and pledged source size */
+size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+                                            const ZSTD_CDict* cdict,
+                                            ZSTD_frameParameters fParams,
+                                            unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_initCStream_usingCDict_advanced");
+    if (!cdict) return ERROR(dictionary_wrong); /* cannot handle NULL cdict (does not know what to do) */
+    {   ZSTD_CCtx_params params = zcs->requestedParams;   /* local copy : cParams and fParams are overridden below */
+        params.cParams = ZSTD_getCParamsFromCDict(cdict);
+        params.fParams = fParams;
+        return ZSTD_initCStream_internal(zcs,
+                                NULL, 0, cdict,
+                                params, pledgedSrcSize);
+    }
+}
+
+/* note : cdict must outlive compression session; all frame parameters are 0 and source size is unknown */
+size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict)
+{
+    ZSTD_frameParameters const fParams = { 0 /* contentSizeFlag */, 0 /* checksum */, 0 /* hideDictID */ };
+    DEBUGLOG(4, "ZSTD_initCStream_usingCDict");
+    return ZSTD_initCStream_usingCDict_advanced(zcs, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN);  /* note : will check that cdict != NULL */
+}
+
+
+/* ZSTD_initCStream_advanced() :
+ * pledgedSrcSize must be exact.
+ * if srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ * dict is loaded with default parameters ZSTD_dct_auto and ZSTD_dlm_byCopy. */
+size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+                                 const void* dict, size_t dictSize,
+                                 ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_initCStream_advanced: pledgedSrcSize=%u, flag=%u",
+                (U32)pledgedSrcSize, params.fParams.contentSizeFlag);
+    CHECK_F( ZSTD_checkCParams(params.cParams) );   /* caller-supplied cParams are validated before use */
+    if ((pledgedSrcSize==0) && (params.fParams.contentSizeFlag==0)) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;  /* for compatibility with older programs relying on this behavior. Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. This line will be removed in the future. */
+    {   ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
+        return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, cctxParams, pledgedSrcSize);
+    }
+}
+
+size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel)
+{   /* parameters are derived from compressionLevel and dictSize (source size hint = 0); pledged source size is unknown */
+    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize);
+    ZSTD_CCtx_params const cctxParams =
+            ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
+    return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN);
+}
+
+size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss)
+{   /* no dictionary : parameters are derived from compressionLevel and the pledged source size `pss` */
+    U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;  /* temporary : 0 interpreted as "unknown" during transition period. Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. `0` will be interpreted as "empty" in the future */
+    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0);
+    ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
+    return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, cctxParams, pledgedSrcSize);
+}
+
+size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
+{   /* same as ZSTD_initCStream_srcSize() with the source size given as ZSTD_CONTENTSIZE_UNKNOWN */
+    DEBUGLOG(4, "ZSTD_initCStream");
+    return ZSTD_initCStream_srcSize(zcs, compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN);
+}
+
+/*======   Compression   ======*/
+
+MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity,
+                           const void* src, size_t srcSize)
+{   /* copies as much of src as dst can hold; @return : number of bytes actually copied */
+    size_t const nbCopied = MIN(dstCapacity, srcSize);
+    if (nbCopied != 0) memcpy(dst, src, nbCopied);   /* memcpy() is never invoked for an empty copy */
+    return nbCopied;
+}
+
+/** ZSTD_compressStream_generic():
+ *  internal function for all *compressStream*() variants and *compress_generic()
+ *  non-static, because can be called from zstdmt_compress.c
+ * @return : 0 once the frame is ended, otherwise a hint size for next input, or an error code */
+size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+                                   ZSTD_outBuffer* output,
+                                   ZSTD_inBuffer* input,
+                                   ZSTD_EndDirective const flushMode)
+{
+    const char* const istart = (const char*)input->src;
+    const char* const iend = istart + input->size;
+    const char* ip = istart + input->pos;   /* read cursor : resumes at input->pos */
+    char* const ostart = (char*)output->dst;
+    char* const oend = ostart + output->size;
+    char* op = ostart + output->pos;   /* write cursor : resumes at output->pos */
+    U32 someMoreWork = 1;   /* loop flag : cleared when this call cannot progress any further */
+
+    /* check expectations */
+    DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (U32)flushMode);
+    assert(zcs->inBuff != NULL);
+    assert(zcs->inBuffSize > 0);
+    assert(zcs->outBuff !=  NULL);
+    assert(zcs->outBuffSize > 0);
+    assert(output->pos <= output->size);
+    assert(input->pos <= input->size);
+
+    while (someMoreWork) {
+        switch(zcs->streamStage)
+        {
+        case zcss_init:
+            /* call ZSTD_initCStream() first ! */
+            return ERROR(init_missing);
+
+        case zcss_load:
+            if ( (flushMode == ZSTD_e_end)
+              && ((size_t)(oend-op) >= ZSTD_compressBound(iend-ip))  /* enough dstCapacity */
+              && (zcs->inBuffPos == 0) ) {
+                /* shortcut to compression pass directly into output buffer */
+                size_t const cSize = ZSTD_compressEnd(zcs,
+                                                op, oend-op, ip, iend-ip);
+                DEBUGLOG(4, "ZSTD_compressEnd : %u", (U32)cSize);
+                if (ZSTD_isError(cSize)) return cSize;
+                ip = iend;   /* all remaining input has been consumed */
+                op += cSize;
+                zcs->frameEnded = 1;
+                ZSTD_CCtx_reset(zcs);
+                someMoreWork = 0; break;
+            }
+            /* complete loading into inBuffer */
+            {   size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos;
+                size_t const loaded = ZSTD_limitCopy(
+                                        zcs->inBuff + zcs->inBuffPos, toLoad,
+                                        ip, iend-ip);
+                zcs->inBuffPos += loaded;
+                ip += loaded;
+                if ( (flushMode == ZSTD_e_continue)
+                  && (zcs->inBuffPos < zcs->inBuffTarget) ) {
+                    /* not enough input to fill full block : stop here */
+                    someMoreWork = 0; break;
+                }
+                if ( (flushMode == ZSTD_e_flush)
+                  && (zcs->inBuffPos == zcs->inToCompress) ) {
+                    /* empty */
+                    someMoreWork = 0; break;
+                }
+            }
+            /* compress current block (note : this stage cannot be stopped in the middle) */
+            DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode);
+            {   void* cDst;
+                size_t cSize;
+                size_t const iSize = zcs->inBuffPos - zcs->inToCompress;   /* bytes loaded but not yet compressed */
+                size_t oSize = oend-op;
+                unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend);
+                if (oSize >= ZSTD_compressBound(iSize))
+                    cDst = op;   /* compress into output buffer, to skip flush stage */
+                else
+                    cDst = zcs->outBuff, oSize = zcs->outBuffSize;   /* otherwise compress into the internal buffer, flushed afterwards */
+                cSize = lastBlock ?
+                        ZSTD_compressEnd(zcs, cDst, oSize,
+                                    zcs->inBuff + zcs->inToCompress, iSize) :
+                        ZSTD_compressContinue(zcs, cDst, oSize,
+                                    zcs->inBuff + zcs->inToCompress, iSize);
+                if (ZSTD_isError(cSize)) return cSize;
+                zcs->frameEnded = lastBlock;
+                /* prepare next block */
+                zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize;
+                if (zcs->inBuffTarget > zcs->inBuffSize)
+                    zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize;   /* next block would not fit : restart from the beginning of inBuff */
+                DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u",
+                         (U32)zcs->inBuffTarget, (U32)zcs->inBuffSize);
+                if (!lastBlock)
+                    assert(zcs->inBuffTarget <= zcs->inBuffSize);
+                zcs->inToCompress = zcs->inBuffPos;
+                if (cDst == op) {  /* no need to flush */
+                    op += cSize;
+                    if (zcs->frameEnded) {
+                        DEBUGLOG(5, "Frame completed directly in outBuffer");
+                        someMoreWork = 0;
+                        ZSTD_CCtx_reset(zcs);
+                    }
+                    break;
+                }
+                zcs->outBuffContentSize = cSize;
+                zcs->outBuffFlushedSize = 0;
+                zcs->streamStage = zcss_flush; /* pass-through to flush stage */
+            }
+	    /* fall-through */
+        case zcss_flush:
+            DEBUGLOG(5, "flush stage");
+            {   size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize;
+                size_t const flushed = ZSTD_limitCopy(op, oend-op,
+                            zcs->outBuff + zcs->outBuffFlushedSize, toFlush);
+                DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u",
+                            (U32)toFlush, (U32)(oend-op), (U32)flushed);
+                op += flushed;
+                zcs->outBuffFlushedSize += flushed;
+                if (toFlush!=flushed) {
+                    /* flush not fully completed, presumably because dst is too small */
+                    assert(op==oend);
+                    someMoreWork = 0;
+                    break;
+                }
+                zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0;   /* internal output buffer fully drained */
+                if (zcs->frameEnded) {
+                    DEBUGLOG(5, "Frame completed on flush");
+                    someMoreWork = 0;
+                    ZSTD_CCtx_reset(zcs);
+                    break;
+                }
+                zcs->streamStage = zcss_load;   /* back to loading input on the next loop iteration */
+                break;
+            }
+
+        default: /* impossible */
+            assert(0);
+        }
+    }
+
+    input->pos = ip - istart;   /* report consumed input back to the caller (not reached on error returns above) */
+    output->pos = op - ostart;   /* report produced output back to the caller */
+    if (zcs->frameEnded) return 0;
+    {   size_t hintInSize = zcs->inBuffTarget - zcs->inBuffPos;   /* bytes still missing to reach the current load target */
+        if (hintInSize==0) hintInSize = zcs->blockSize;
+        return hintInSize;
+    }
+}
+
+size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+    /* a position beyond the end of its buffer is a caller error (output is examined first) */
+    if ( (output->pos > output->size)
+      || (input->pos  > input->size) ) return ERROR(GENERIC);
+    /* buffers are sane : hand over to the generic engine with the ZSTD_e_continue directive */
+    return ZSTD_compressStream_generic(zcs, output, input, ZSTD_e_continue);
+}
+
+
+size_t ZSTD_compress_generic (ZSTD_CCtx* cctx,
+                              ZSTD_outBuffer* output,
+                              ZSTD_inBuffer* input,
+                              ZSTD_EndDirective endOp)
+{   /* @return : amount still to flush (single-thread path), the value of ZSTDMT_compressStream_generic() (multi-thread path), or an error code */
+    DEBUGLOG(5, "ZSTD_compress_generic, endOp=%u ", (U32)endOp);
+    /* check conditions */
+    if (output->pos > output->size) return ERROR(GENERIC);
+    if (input->pos  > input->size)  return ERROR(GENERIC);
+    assert(cctx!=NULL);
+
+    /* transparent initialization stage */
+    if (cctx->streamStage == zcss_init) {
+        ZSTD_CCtx_params params = cctx->requestedParams;
+        ZSTD_prefixDict const prefixDict = cctx->prefixDict;   /* local copy : the context's prefix is cleared on the next line */
+        memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict));  /* single usage */
+        assert(prefixDict.dict==NULL || cctx->cdict==NULL);   /* only one can be set */
+        DEBUGLOG(4, "ZSTD_compress_generic : transparent init stage");
+        if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = input->size + 1;  /* auto-fix pledgedSrcSize */
+        params.cParams = ZSTD_getCParamsFromCCtxParams(
+                &cctx->requestedParams, cctx->pledgedSrcSizePlusOne-1, 0 /*dictSize*/);
+
+
+#ifdef ZSTD_MULTITHREAD
+        if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) {
+            params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */
+        }
+        if (params.nbWorkers > 0) {
+            /* mt context creation */
+            if (cctx->mtctx == NULL) {
+                DEBUGLOG(4, "ZSTD_compress_generic: creating new mtctx for nbWorkers=%u",
+                            params.nbWorkers);
+                cctx->mtctx = ZSTDMT_createCCtx_advanced(params.nbWorkers, cctx->customMem);
+                if (cctx->mtctx == NULL) return ERROR(memory_allocation);
+            }
+            /* mt compression */
+            DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbWorkers=%u", params.nbWorkers);
+            CHECK_F( ZSTDMT_initCStream_internal(
+                        cctx->mtctx,
+                        prefixDict.dict, prefixDict.dictSize, ZSTD_dct_rawContent,
+                        cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) );
+            cctx->streamStage = zcss_load;
+            cctx->appliedParams.nbWorkers = params.nbWorkers;   /* recorded so the compression stage below selects the mt path */
+        } else
+#endif
+        {   CHECK_F( ZSTD_resetCStream_internal(cctx,
+                            prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType,
+                            cctx->cdict,
+                            params, cctx->pledgedSrcSizePlusOne-1) );
+            assert(cctx->streamStage == zcss_load);
+            assert(cctx->appliedParams.nbWorkers == 0);
+    }   }
+
+    /* compression stage */
+#ifdef ZSTD_MULTITHREAD
+    if (cctx->appliedParams.nbWorkers > 0) {
+        if (cctx->cParamsChanged) {
+            ZSTDMT_updateCParams_whileCompressing(cctx->mtctx, &cctx->requestedParams);
+            cctx->cParamsChanged = 0;
+        }
+        {   size_t const flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp);
+            if ( ZSTD_isError(flushMin)
+              || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */
+                ZSTD_CCtx_reset(cctx);   /* also reached on error, not only on completion */
+            }
+            return flushMin;
+    }   }
+#endif
+    CHECK_F( ZSTD_compressStream_generic(cctx, output, input, endOp) );
+    DEBUGLOG(5, "completed ZSTD_compress_generic");
+    return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */
+}
+
+size_t ZSTD_compress_generic_simpleArgs (
+                            ZSTD_CCtx* cctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos,
+                            ZSTD_EndDirective endOp)
+{   /* same as ZSTD_compress_generic(), with the two buffer structures replaced by plain arguments */
+    ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
+    ZSTD_inBuffer  input  = { src, srcSize, *srcPos };
+    /* ZSTD_compress_generic() will check validity of dstPos and srcPos */
+    size_t const cErr = ZSTD_compress_generic(cctx, &output, &input, endOp);
+    *dstPos = output.pos;   /* positions are written back even when cErr is an error code */
+    *srcPos = input.pos;
+    return cErr;
+}
+
+
+/*======   Finalize   ======*/
+
+/*! ZSTD_flushStream() :
+ * @return : amount of data remaining to flush, or an error code */
+size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{
+    ZSTD_inBuffer input = { NULL, 0, 0 };   /* empty input : no new data is consumed */
+    if (output->pos > output->size) return ERROR(GENERIC);
+    CHECK_F( ZSTD_compressStream_generic(zcs, output, &input, ZSTD_e_flush) );
+    return zcs->outBuffContentSize - zcs->outBuffFlushedSize;  /* remaining to flush */
+}
+
+
+size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{   /* @return : estimated amount of data still to flush in order to complete the frame, or an error code */
+    ZSTD_inBuffer input = { NULL, 0, 0 };   /* empty input : no new data is consumed */
+    if (output->pos > output->size) return ERROR(GENERIC);
+    CHECK_F( ZSTD_compressStream_generic(zcs, output, &input, ZSTD_e_end) );
+    {   size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE;   /* frame not ended : a last block header is still counted */
+        size_t const checksumSize = zcs->frameEnded ? 0 : zcs->appliedParams.fParams.checksumFlag * 4;   /* frame not ended : 4 more bytes when checksumFlag is set */
+        size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize + lastBlockSize + checksumSize;
+        DEBUGLOG(4, "ZSTD_endStream : remaining to flush : %u", (U32)toFlush);
+        return toFlush;
+    }
+}
+
+
+/*-=====  Pre-defined compression levels  =====-*/
+
+#define ZSTD_MAX_CLEVEL     22   /* highest row index of each ZSTD_defaultCParameters table */
+int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }   /* public accessor for ZSTD_MAX_CLEVEL */
+
+static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {   /* indexed [tableID][row] by ZSTD_getCParams() : tableID grows as the size estimate shrinks */
+{   /* "default" - guarantees a monotonically increasing memory budget; used when the size estimate is > 256 KB or unknown */
+    /* W,  C,  H,  S,  L, TL, strat */
+    { 19, 12, 13,  1,  6,  1, ZSTD_fast    },  /* base for negative levels */
+    { 19, 13, 14,  1,  7,  0, ZSTD_fast    },  /* level  1 */
+    { 19, 15, 16,  1,  6,  0, ZSTD_fast    },  /* level  2 */
+    { 20, 16, 17,  1,  5,  1, ZSTD_dfast   },  /* level  3 */
+    { 20, 18, 18,  1,  5,  1, ZSTD_dfast   },  /* level  4 */
+    { 20, 18, 18,  2,  5,  2, ZSTD_greedy  },  /* level  5 */
+    { 21, 18, 19,  2,  5,  4, ZSTD_lazy    },  /* level  6 */
+    { 21, 18, 19,  3,  5,  8, ZSTD_lazy2   },  /* level  7 */
+    { 21, 19, 19,  3,  5, 16, ZSTD_lazy2   },  /* level  8 */
+    { 21, 19, 20,  4,  5, 16, ZSTD_lazy2   },  /* level  9 */
+    { 21, 20, 21,  4,  5, 16, ZSTD_lazy2   },  /* level 10 */
+    { 21, 21, 22,  4,  5, 16, ZSTD_lazy2   },  /* level 11 */
+    { 22, 20, 22,  5,  5, 16, ZSTD_lazy2   },  /* level 12 */
+    { 22, 21, 22,  4,  5, 32, ZSTD_btlazy2 },  /* level 13 */
+    { 22, 21, 22,  5,  5, 32, ZSTD_btlazy2 },  /* level 14 */
+    { 22, 22, 22,  6,  5, 32, ZSTD_btlazy2 },  /* level 15 */
+    { 22, 21, 22,  4,  5, 48, ZSTD_btopt   },  /* level 16 */
+    { 23, 22, 22,  4,  4, 64, ZSTD_btopt   },  /* level 17 */
+    { 23, 23, 22,  6,  3,256, ZSTD_btopt   },  /* level 18 */
+    { 23, 24, 22,  7,  3,256, ZSTD_btultra },  /* level 19 */
+    { 25, 25, 23,  7,  3,256, ZSTD_btultra },  /* level 20 */
+    { 26, 26, 24,  7,  3,512, ZSTD_btultra },  /* level 21 */
+    { 27, 27, 25,  9,  3,999, ZSTD_btultra },  /* level 22 */
+},
+{   /* for srcSize <= 256 KB */
+    /* W,  C,  H,  S,  L, TL, strat */
+    { 18, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 18, 13, 14,  1,  6,  0, ZSTD_fast    },  /* level  1 */
+    { 18, 14, 14,  1,  5,  1, ZSTD_dfast   },  /* level  2 */
+    { 18, 16, 16,  1,  4,  1, ZSTD_dfast   },  /* level  3 */
+    { 18, 16, 17,  2,  5,  2, ZSTD_greedy  },  /* level  4.*/
+    { 18, 18, 18,  3,  5,  2, ZSTD_greedy  },  /* level  5.*/
+    { 18, 18, 19,  3,  5,  4, ZSTD_lazy    },  /* level  6.*/
+    { 18, 18, 19,  4,  4,  4, ZSTD_lazy    },  /* level  7 */
+    { 18, 18, 19,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
+    { 18, 18, 19,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
+    { 18, 18, 19,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
+    { 18, 18, 19,  5,  4, 16, ZSTD_btlazy2 },  /* level 11.*/
+    { 18, 19, 19,  6,  4, 16, ZSTD_btlazy2 },  /* level 12.*/
+    { 18, 19, 19,  8,  4, 16, ZSTD_btlazy2 },  /* level 13 */
+    { 18, 18, 19,  4,  4, 24, ZSTD_btopt   },  /* level 14.*/
+    { 18, 18, 19,  4,  3, 24, ZSTD_btopt   },  /* level 15.*/
+    { 18, 19, 19,  6,  3, 64, ZSTD_btopt   },  /* level 16.*/
+    { 18, 19, 19,  8,  3,128, ZSTD_btopt   },  /* level 17.*/
+    { 18, 19, 19, 10,  3,256, ZSTD_btopt   },  /* level 18.*/
+    { 18, 19, 19, 10,  3,256, ZSTD_btultra },  /* level 19.*/
+    { 18, 19, 19, 11,  3,512, ZSTD_btultra },  /* level 20.*/
+    { 18, 19, 19, 12,  3,512, ZSTD_btultra },  /* level 21.*/
+    { 18, 19, 19, 13,  3,999, ZSTD_btultra },  /* level 22.*/
+},
+{   /* for srcSize <= 128 KB */
+    /* W,  C,  H,  S,  L, TL, strat */
+    { 17, 12, 12,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 17, 12, 13,  1,  6,  0, ZSTD_fast    },  /* level  1 */
+    { 17, 13, 15,  1,  5,  0, ZSTD_fast    },  /* level  2 */
+    { 17, 15, 16,  2,  5,  1, ZSTD_dfast   },  /* level  3 */
+    { 17, 17, 17,  2,  4,  1, ZSTD_dfast   },  /* level  4 */
+    { 17, 16, 17,  3,  4,  2, ZSTD_greedy  },  /* level  5 */
+    { 17, 17, 17,  3,  4,  4, ZSTD_lazy    },  /* level  6 */
+    { 17, 17, 17,  3,  4,  8, ZSTD_lazy2   },  /* level  7 */
+    { 17, 17, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
+    { 17, 17, 17,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
+    { 17, 17, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
+    { 17, 17, 17,  7,  4,  8, ZSTD_lazy2   },  /* level 11 */
+    { 17, 18, 17,  6,  4, 16, ZSTD_btlazy2 },  /* level 12 */
+    { 17, 18, 17,  8,  4, 16, ZSTD_btlazy2 },  /* level 13.*/
+    { 17, 18, 17,  4,  4, 32, ZSTD_btopt   },  /* level 14.*/
+    { 17, 18, 17,  6,  3, 64, ZSTD_btopt   },  /* level 15.*/
+    { 17, 18, 17,  7,  3,128, ZSTD_btopt   },  /* level 16.*/
+    { 17, 18, 17,  7,  3,256, ZSTD_btopt   },  /* level 17.*/
+    { 17, 18, 17,  8,  3,256, ZSTD_btopt   },  /* level 18.*/
+    { 17, 18, 17,  8,  3,256, ZSTD_btultra },  /* level 19.*/
+    { 17, 18, 17,  9,  3,256, ZSTD_btultra },  /* level 20.*/
+    { 17, 18, 17, 10,  3,256, ZSTD_btultra },  /* level 21.*/
+    { 17, 18, 17, 11,  3,512, ZSTD_btultra },  /* level 22.*/
+},
+{   /* for srcSize <= 16 KB */
+    /* W,  C,  H,  S,  L, TL, strat */
+    { 14, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 14, 14, 15,  1,  5,  0, ZSTD_fast    },  /* level  1 */
+    { 14, 14, 15,  1,  4,  0, ZSTD_fast    },  /* level  2 */
+    { 14, 14, 14,  2,  4,  1, ZSTD_dfast   },  /* level  3.*/
+    { 14, 14, 14,  4,  4,  2, ZSTD_greedy  },  /* level  4.*/
+    { 14, 14, 14,  3,  4,  4, ZSTD_lazy    },  /* level  5.*/
+    { 14, 14, 14,  4,  4,  8, ZSTD_lazy2   },  /* level  6 */
+    { 14, 14, 14,  6,  4,  8, ZSTD_lazy2   },  /* level  7 */
+    { 14, 14, 14,  8,  4,  8, ZSTD_lazy2   },  /* level  8.*/
+    { 14, 15, 14,  5,  4,  8, ZSTD_btlazy2 },  /* level  9.*/
+    { 14, 15, 14,  9,  4,  8, ZSTD_btlazy2 },  /* level 10.*/
+    { 14, 15, 14,  3,  4, 12, ZSTD_btopt   },  /* level 11.*/
+    { 14, 15, 14,  6,  3, 16, ZSTD_btopt   },  /* level 12.*/
+    { 14, 15, 14,  6,  3, 24, ZSTD_btopt   },  /* level 13.*/
+    { 14, 15, 15,  6,  3, 48, ZSTD_btopt   },  /* level 14.*/
+    { 14, 15, 15,  6,  3, 64, ZSTD_btopt   },  /* level 15.*/
+    { 14, 15, 15,  6,  3, 96, ZSTD_btopt   },  /* level 16.*/
+    { 14, 15, 15,  6,  3,128, ZSTD_btopt   },  /* level 17.*/
+    { 14, 15, 15,  8,  3,256, ZSTD_btopt   },  /* level 18.*/
+    { 14, 15, 15,  6,  3,256, ZSTD_btultra },  /* level 19.*/
+    { 14, 15, 15,  8,  3,256, ZSTD_btultra },  /* level 20.*/
+    { 14, 15, 15,  9,  3,256, ZSTD_btultra },  /* level 21.*/
+    { 14, 15, 15, 10,  3,512, ZSTD_btultra },  /* level 22.*/
+},
+};
+
+/*! ZSTD_getCParams() :
+*  @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
+*   Size values are optional, provide 0 if not known or unused. Level 0 selects the default level; negative levels use row 0 with targetLength = -level */
+ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize)
+{
+    size_t const addedSize = srcSizeHint ? 0 : 500;   /* extra margin when srcSizeHint is 0 (only matters when dictSize > 0) */
+    U64 const rSize = srcSizeHint+dictSize ? srcSizeHint+dictSize+addedSize : (U64)-1;   /* nothing known : largest value, which selects table 0 */
+    U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB);   /* 0 : > 256 KB or unknown, 1 : <= 256 KB, 2 : <= 128 KB, 3 : <= 16 KB */
+    int row = compressionLevel;
+    DEBUGLOG(5, "ZSTD_getCParams (cLevel=%i)", compressionLevel);
+    if (compressionLevel == 0) row = ZSTD_CLEVEL_DEFAULT;   /* 0 == default */
+    if (compressionLevel < 0) row = 0;   /* entry 0 is baseline for fast mode */
+    if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL;
+    {   ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row];
+        /* acceleration factor : negated in unsigned arithmetic, because -compressionLevel overflows (undefined behavior) for INT_MIN */
+        if (compressionLevel < 0) cp.targetLength = 0U - (unsigned)compressionLevel;
+        return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize); }
+}
+
+/*! ZSTD_getParams() :
+*   same as ZSTD_getCParams(), but @return a `ZSTD_parameters` object (instead of `ZSTD_compressionParameters`).
+*   The rest of the object is zeroed, except `fParams.contentSizeFlag`, which is set to 1 */
+ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
+    ZSTD_compressionParameters const selected = ZSTD_getCParams(compressionLevel, srcSizeHint, dictSize);
+    ZSTD_parameters result;
+    DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel);
+    memset(&result, 0, sizeof(result));   /* start from an all-zero object */
+    result.fParams.contentSizeFlag = 1;
+    result.cParams = selected;
+    return result;
+}
diff --git a/deps/SZ/zstd/compress/zstd_compress_internal.h b/deps/SZ/zstd/compress/zstd_compress_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..d31542c69b62d4dabe75aa3a3d0589a7b22281fe
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstd_compress_internal.h
@@ -0,0 +1,795 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This header contains definitions
+ * that shall **only** be used by modules within lib/compress.
+ */
+
+#ifndef ZSTD_COMPRESS_H
+#define ZSTD_COMPRESS_H
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "zstd_internal.h"
+#ifdef ZSTD_MULTITHREAD
+#  include "zstdmt_compress.h"
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define kSearchStrength      8
+#define HASH_READ_SIZE       8
+#define ZSTD_DUBT_UNSORTED_MARK 1   /* For btlazy2 strategy, index 1 now means "unsorted".
+                                       It could be confused for a real successor at index "1", if sorted as larger than its predecessor.
+                                       It's not a big deal though : candidate will just be sorted again.
+                                       Additionally, candidate position 1 will be lost.
+                                       But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
+                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy.
+                                       Constant required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */
+
+
+/*-*************************************
+*  Context memory management
+***************************************/
+typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e;   /* type of ZSTD_CCtx_s.stage */
+typedef enum { zcss_init=0, zcss_load, zcss_flush } ZSTD_cStreamStage;   /* type of ZSTD_CCtx_s.streamStage */
+
+typedef enum {   /* type of ZSTD_CCtx_params_s.attachDictPref */
+    ZSTD_dictDefaultAttach = 0,   /* value 0, i.e. what a zeroed parameter structure holds */
+    ZSTD_dictForceAttach = 1,     /* NOTE(review): presumably "always reference the dictionary in place" - confirm */
+    ZSTD_dictForceCopy = -1,      /* NOTE(review): presumably "always copy the dictionary content" - confirm */
+} ZSTD_dictAttachPref_e;
+
+typedef struct ZSTD_prefixDict_s {   /* type of ZSTD_CCtx_s.prefixDict, described there as a single-usage dictionary */
+    const void* dict;                /* read-only pointer to the dictionary */
+    size_t dictSize;                 /* NOTE(review): presumably the size of `dict` in bytes - confirm */
+    ZSTD_dictContentType_e dictContentType;   /* NOTE(review): presumably how the content of `dict` must be interpreted - confirm */
+} ZSTD_prefixDict;
+
+typedef struct {   /* Huffman compression table, with its repeat mode */
+    U32 CTable[HUF_CTABLE_SIZE_U32(255)];   /* NOTE(review): 255 is presumably the largest symbol value - confirm */
+    HUF_repeat repeatMode;
+} ZSTD_hufCTables_t;
+
+typedef struct {   /* one FSE compression table and one repeat mode for each of : offset codes, match lengths, literal lengths */
+    FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
+    FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
+    FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
+    FSE_repeat offcode_repeatMode;
+    FSE_repeat matchlength_repeatMode;
+    FSE_repeat litlength_repeatMode;
+} ZSTD_fseCTables_t;
+
+typedef struct {   /* the Huffman and FSE tables together ; referenced by optState_t.symbolCosts, embedded in ZSTD_compressedBlockState_t */
+    ZSTD_hufCTables_t huf;
+    ZSTD_fseCTables_t fse;
+} ZSTD_entropyCTables_t;
+
+typedef struct {   /* element type of optState_t.matchTable (list of found matches) */
+    U32 off;       /* NOTE(review): presumably the match offset - confirm */
+    U32 len;       /* NOTE(review): presumably the match length - confirm */
+} ZSTD_match_t;
+
+typedef struct {   /* element type of optState_t.priceTable (positions tracked by the optimal parser) */
+    int price;
+    U32 off;
+    U32 mlen;
+    U32 litlen;
+    U32 rep[ZSTD_REP_NUM];   /* ZSTD_REP_NUM values ; NOTE(review): presumably the repeat offsets at this position - confirm */
+} ZSTD_optimal_t;
+
+typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e;   /* type of optState_t.priceType */
+
+typedef struct {   /* state of the optimal parser ; embedded in ZSTD_matchState_t as `opt` */
+    /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */
+    U32* litFreq;                /* table of literals statistics, of size 256 */
+    U32* litLengthFreq;          /* table of litLength statistics, of size (MaxLL+1) */
+    U32* matchLengthFreq;        /* table of matchLength statistics, of size (MaxML+1) */
+    U32* offCodeFreq;            /* table of offCode statistics, of size (MaxOff+1) */
+    ZSTD_match_t* matchTable;    /* list of found matches, of size ZSTD_OPT_NUM+1 */
+    ZSTD_optimal_t* priceTable;  /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */
+
+    U32  litSum;                 /* nb of literals */
+    U32  litLengthSum;           /* nb of litLength codes */
+    U32  matchLengthSum;         /* nb of matchLength codes */
+    U32  offCodeSum;             /* nb of offset codes */
+    U32  litSumBasePrice;        /* to compare to log2(litfreq) */
+    U32  litLengthSumBasePrice;  /* to compare to log2(llfreq)  */
+    U32  matchLengthSumBasePrice;/* to compare to log2(mlfreq)  */
+    U32  offCodeSumBasePrice;    /* to compare to log2(offreq)  */
+    ZSTD_OptPrice_e priceType;   /* prices can be determined dynamically (zop_dynamic), or follow a pre-defined cost structure (zop_predef) */
+    const ZSTD_entropyCTables_t* symbolCosts;  /* pre-calculated dictionary statistics */
+} optState_t;
+
+typedef struct {   /* pointed to by ZSTD_blockState_t.prevCBlock and .nextCBlock */
+  ZSTD_entropyCTables_t entropy;   /* Huffman + FSE compression tables */
+  U32 rep[ZSTD_REP_NUM];           /* NOTE(review): presumably the repeat offsets carried from block to block - confirm */
+} ZSTD_compressedBlockState_t;
+
+typedef struct {   /* window over the input : positions are kept as U32 indexes relative to `base` (or `dictBase` for the extDict) */
+    BYTE const* nextSrc;    /* next block here to continue on current prefix */
+    BYTE const* base;       /* All regular indexes relative to this position */
+    BYTE const* dictBase;   /* extDict indexes relative to this position */
+    U32 dictLimit;          /* below that point, need extDict */
+    U32 lowLimit;           /* below that point, no more data */
+} ZSTD_window_t;
+
+typedef struct ZSTD_matchState_t ZSTD_matchState_t;   /* declared first, because the structure points to its own type */
+struct ZSTD_matchState_t {
+    ZSTD_window_t window;   /* State for window round buffer management */
+    U32 loadedDictEnd;      /* index of end of dictionary */
+    U32 nextToUpdate;       /* index from which to continue table update */
+    U32 nextToUpdate3;      /* index from which to continue table update */
+    U32 hashLog3;           /* dispatch table : larger == faster, more memory */
+    U32* hashTable;         /* table of U32 indexes ; ZSTD_fillDoubleHashTable() addresses it with hashLog-bit hashes of 8 bytes */
+    U32* hashTable3;        /* NOTE(review): presumably the table addressed by 3-byte hashes, sized by hashLog3 - confirm */
+    U32* chainTable;        /* table of U32 indexes ; ZSTD_fillDoubleHashTable() addresses it with chainLog-bit hashes */
+    optState_t opt;         /* optimal parser state */
+    const ZSTD_matchState_t *dictMatchState;   /* non-NULL selects ZSTD_dictMatchState in ZSTD_matchState_dictMode(), unless the window has an extDict */
+};
+
+typedef struct {   /* type of ZSTD_CCtx_s.blockState */
+    ZSTD_compressedBlockState_t* prevCBlock;   /* NOTE(review): presumably the entropy tables + repcodes left by the previous block - confirm */
+    ZSTD_compressedBlockState_t* nextCBlock;   /* NOTE(review): presumably the ones being built for the current block - confirm */
+    ZSTD_matchState_t matchState;              /* match finder state (window, hash tables, optimal parser state) */
+} ZSTD_blockState_t;
+
+typedef struct {   /* element type of ldmState_t.hashTable */
+    U32 offset;
+    U32 checksum;
+} ldmEntry_t;
+
+typedef struct {   /* long distance matching state ; type of ZSTD_CCtx_s.ldmState */
+    ZSTD_window_t window;   /* State for the window round buffer management */
+    ldmEntry_t* hashTable;
+    BYTE* bucketOffsets;    /* Next position in bucket to insert entry */
+    U64 hashPower;          /* Used to compute the rolling hash.
+                             * Depends on ldmParams.minMatchLength */
+} ldmState_t;
+
+typedef struct {   /* long distance matching parameters ; type of ZSTD_CCtx_params_s.ldmParams */
+    U32 enableLdm;          /* 1 if enable long distance matching */
+    U32 hashLog;            /* Log size of hashTable */
+    U32 bucketSizeLog;      /* Log bucket size for collision resolution, at most 8 */
+    U32 minMatchLength;     /* Minimum match length */
+    U32 hashEveryLog;       /* Log number of entries to skip */
+    U32 windowLog;          /* Window log for the LDM */
+} ldmParams_t;
+
+typedef struct {   /* element type of rawSeqStore_t.seq and of ZSTD_CCtx_s.ldmSequences */
+    U32 offset;
+    U32 litLength;
+    U32 matchLength;
+} rawSeq;
+
+typedef struct {   /* array of rawSeq with a read position ; invariant stated below : pos <= size <= capacity */
+  rawSeq* seq;     /* The start of the sequences */
+  size_t pos;      /* The position where reading stopped. <= size. */
+  size_t size;     /* The number of sequences. <= capacity. */
+  size_t capacity; /* The capacity starting from `seq` pointer */
+} rawSeqStore_t;
+
+struct ZSTD_CCtx_params_s {
+    ZSTD_format_e format;
+    ZSTD_compressionParameters cParams;   /* compression parameters, as produced by ZSTD_getCParams() */
+    ZSTD_frameParameters fParams;         /* frame parameters (contentSizeFlag, ...) */
+
+    int compressionLevel;
+    int forceWindow;           /* force back-references to respect limit of
+                                * 1<<wLog, even for dictionary */
+
+    ZSTD_dictAttachPref_e attachDictPref;   /* one of the ZSTD_dict*Attach / ZSTD_dictForceCopy values */
+
+    /* Multithreading: used to pass parameters to mtctx */
+    unsigned nbWorkers;
+    unsigned jobSize;
+    unsigned overlapSizeLog;
+
+    /* Long distance matching parameters */
+    ldmParams_t ldmParams;
+
+    /* Internal use, for createCCtxParams() and freeCCtxParams() only */
+    ZSTD_customMem customMem;
+};  /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
+
+struct ZSTD_CCtx_s {
+    ZSTD_compressionStage_e stage;       /* one of the ZSTDcs_* values */
+    int cParamsChanged;                  /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
+    int bmi2;                            /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+    ZSTD_CCtx_params requestedParams;    /* changes made here are what cParamsChanged reports */
+    ZSTD_CCtx_params appliedParams;      /* NOTE(review): presumably the parameters in effect for the ongoing compression - confirm */
+    U32   dictID;
+
+    int workSpaceOversizedDuration;
+    void* workSpace;                     /* the optState_t tables are allocated inside it by ZSTD_resetCCtx_internal() */
+    size_t workSpaceSize;                /* NOTE(review): presumably the size of workSpace in bytes - confirm */
+    size_t blockSize;
+    unsigned long long pledgedSrcSizePlusOne;  /* this way, 0 (default) == unknown */
+    unsigned long long consumedSrcSize;
+    unsigned long long producedCSize;
+    XXH64_state_t xxhState;              /* NOTE(review): presumably the running XXH64 state of the frame checksum - confirm */
+    ZSTD_customMem customMem;
+    size_t staticSize;
+
+    seqStore_t seqStore;      /* sequences storage ptrs */
+    ldmState_t ldmState;      /* long distance matching state */
+    rawSeq* ldmSequences;     /* Storage for the ldm output sequences */
+    size_t maxNbLdmSequences;
+    rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */
+    ZSTD_blockState_t blockState;
+    U32* entropyWorkspace;  /* entropy workspace of HUF_WORKSPACE_SIZE bytes */
+
+    /* streaming */
+    char*  inBuff;
+    size_t inBuffSize;
+    size_t inToCompress;
+    size_t inBuffPos;
+    size_t inBuffTarget;
+    char*  outBuff;
+    size_t outBuffSize;
+    size_t outBuffContentSize;
+    size_t outBuffFlushedSize;
+    ZSTD_cStreamStage streamStage;   /* one of the zcss_* values */
+    U32    frameEnded;
+
+    /* Dictionary */
+    ZSTD_CDict* cdictLocal;
+    const ZSTD_CDict* cdict;
+    ZSTD_prefixDict prefixDict;   /* single-usage dictionary */
+
+    /* Multi-threading */
+#ifdef ZSTD_MULTITHREAD
+    ZSTDMT_CCtx* mtctx;   /* member only exists when ZSTD_MULTITHREAD is defined */
+#endif
+};
+
+typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;   /* with ZSTD_dtlm_fast, ZSTD_fillDoubleHashTable() inserts only one position out of every three */
+
+typedef enum { ZSTD_noDict = 0, ZSTD_extDict = 1, ZSTD_dictMatchState = 2 } ZSTD_dictMode_e;   /* as returned by ZSTD_matchState_dictMode() */
+
+
+typedef size_t (*ZSTD_blockCompressor) (   /* signature shared by the block compressors ; ZSTD_selectBlockCompressor() returns one for a (strategy, dictMode) pair */
+        ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode);
+
+
+MEM_STATIC U32 ZSTD_LLcode(U32 litLength)
+{   /* lengths 0..63 are looked up in a table ; larger ones give ZSTD_highbit32(litLength) + 19 */
+    static const U32 highbitBias = 19;
+    static const BYTE smallLengthCodes[64] = {
+         0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+        16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
+        22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 };
+    if (litLength < 64) {
+        return smallLengthCodes[litLength];
+    }
+    return ZSTD_highbit32(litLength) + highbitBias;
+}
+
+/* ZSTD_MLcode() :
+ * note : mlBase = matchLength - MINMATCH;
+ *        because it's the format it's stored in seqStore->sequences */
+MEM_STATIC U32 ZSTD_MLcode(U32 mlBase)
+{   /* values 0..127 are looked up in a table ; larger ones give ZSTD_highbit32(mlBase) + 36 */
+    static const U32 highbitBias = 36;
+    static const BYTE smallLengthCodes[128] = {
+         0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+        32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39,
+        40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
+        42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 };
+    if (mlBase < 128) {
+        return smallLengthCodes[mlBase];
+    }
+    return ZSTD_highbit32(mlBase) + highbitBias;
+}
+
+/*! ZSTD_storeSeq() :
+ *  Store a sequence (literal length, literals, offset code and match length code) into seqStore_t.
+ *  `offsetCode` : distance to match + 3 (values 1-3 are repCodes).
+ *  `mlBase` : matchLength - MINMATCH
+*/
+MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const void* literals, U32 offsetCode, size_t mlBase)
+{
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL >= 6)
+    static const BYTE* firstLiterals = NULL;   /* `literals` pointer of the very first call ; positions are traced relative to it */
+    if (firstLiterals==NULL) firstLiterals = (const BYTE*)literals;  /* note : index only works for compression within a single segment */
+    {   U32 const relPos = (U32)((const BYTE*)literals - firstLiterals);
+        DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u",
+               relPos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offsetCode);
+    }
+#endif
+    /* literals : appended to the literals buffer */
+    assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + 128 KB);
+    ZSTD_wildcopy(seqStorePtr->lit, literals, litLength);
+    seqStorePtr->lit += litLength;
+
+    /* offset : stored as offsetCode + 1 */
+    seqStorePtr->sequences->offset = offsetCode + 1;
+
+    /* literal length : stored as 16 bits ; a larger value is flagged with longLengthID 1 */
+    if (litLength > 0xFFFF) {
+        assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */
+        seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+        seqStorePtr->longLengthID = 1;
+    }
+    seqStorePtr->sequences->litLength = (U16)litLength;
+
+    /* match length : stored as 16 bits ; a larger value is flagged with longLengthID 2 */
+    if (mlBase > 0xFFFF) {
+        assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */
+        seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+        seqStorePtr->longLengthID = 2;
+    }
+    seqStorePtr->sequences->matchLength = (U16)mlBase;
+
+    seqStorePtr->sequences += 1;   /* this slot is filled : advance to the next one */
+}
+
+
+/*-*************************************
+*  Match length counter
+***************************************/
+static unsigned ZSTD_NbCommonBytes (size_t val)
+{   /* val : non-zero word ; ZSTD_count() passes the XOR of two size_t reads. Returns a count of whole zero bytes : trailing ones on little endian, leading ones on big endian. */
+    if (MEM_isLittleEndian()) {   /* little endian : number of trailing zero bits, divided by 8 */
+        if (MEM_64bits()) {
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanForward64( &r, (U64)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 4)
+            return (__builtin_ctzll((U64)val) >> 3);   /* the builtin is undefined for 0, hence the non-zero requirement */
+#       else
+            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2,
+                                                     0, 3, 1, 3, 1, 4, 2, 7,
+                                                     0, 2, 3, 6, 1, 5, 3, 5,
+                                                     1, 3, 4, 4, 2, 5, 6, 7,
+                                                     7, 0, 1, 2, 3, 3, 4, 6,
+                                                     2, 6, 5, 5, 3, 4, 5, 6,
+                                                     7, 1, 2, 4, 6, 4, 4, 5,
+                                                     7, 2, 6, 5, 7, 6, 7, 7 };
+            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];   /* (val & -val) keeps only the lowest set bit ; the top 6 bits of the product index the table */
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r=0;
+            _BitScanForward( &r, (U32)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_ctz((U32)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0,
+                                                     3, 2, 2, 1, 3, 2, 0, 1,
+                                                     3, 3, 1, 2, 2, 2, 2, 0,
+                                                     3, 1, 2, 0, 1, 0, 1, 1 };
+            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];   /* same scheme, with the top 5 bits of a 32-bit product */
+#       endif
+        }
+    } else {  /* Big Endian CPU : number of leading zero bits, divided by 8 */
+        if (MEM_64bits()) {
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanReverse64( &r, val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 4)
+            return (__builtin_clzll(val) >> 3);
+#       else
+            unsigned r;
+            const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
+            if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
+            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r = 0;
+            _BitScanReverse( &r, (unsigned long)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_clz((U32)val) >> 3);
+#       else
+            unsigned r;
+            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+    }   }
+}
+
+
+MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)
+{   /* Returns how many consecutive bytes are equal at pIn and pMatch. No read on the pIn side reaches pInLimit. */
+    const BYTE* const pStart = pIn;
+    const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t)-1);   /* below this, a whole size_t can be read at pIn without reaching pInLimit */
+
+    if (pIn < pInLoopLimit) {
+        { size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);   /* first word handled apart from the loop */
+          if (diff) return ZSTD_NbCommonBytes(diff); }                /* pIn == pStart here, so the byte count is the result */
+        pIn+=sizeof(size_t); pMatch+=sizeof(size_t);
+        while (pIn < pInLoopLimit) {
+            size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+            if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }   /* whole word equal : keep going */
+            pIn += ZSTD_NbCommonBytes(diff);   /* add the equal bytes of the first differing word */
+            return (size_t)(pIn - pStart);
+    }   }
+    if (MEM_64bits() && (pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }   /* tail : 4 bytes (64-bit targets only), */
+    if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }                   /* then 2 bytes, */
+    if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;                                                             /* then 1 byte */
+    return (size_t)(pIn - pStart);
+}
+
+/** ZSTD_count_2segments() :
+ *  can count match length with `ip` & `match` in 2 different segments.
+ *  convention : on reaching mEnd, match count continue starting from iStart
+ */
+MEM_STATIC size_t
+ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
+                     const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart)
+{
+    const BYTE* const firstSegEnd = MIN( ip + (mEnd - match), iEnd);   /* input-side limit of the first segment : stop where the match side reaches mEnd, or at iEnd */
+    size_t const firstLen = ZSTD_count(ip, match, firstSegEnd);
+    if (match + firstLen != mEnd) return firstLen;   /* stopped before mEnd : nothing to continue */
+    DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", firstLen);
+    DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match);
+    DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip);
+    DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[firstLen], *iStart);
+    DEBUGLOG(7, "final match length = %zu", firstLen + ZSTD_count(ip+firstLen, iStart, iEnd));
+    return firstLen + ZSTD_count(ip+firstLen, iStart, iEnd);   /* second part : the match side restarts at iStart */
+}
+
+
+/*-*************************************
+ *  Hashes
+ ***************************************/
+static const U32 prime3bytes = 506832829U;   /* multiplier of ZSTD_hash3() ; each ZSTD_hashN() below returns the top h bits of a product */
+static U32    ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes)  >> (32-h) ; }   /* the left shift drops the high-order byte of u : 3 bytes take part */
+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
+
+static const U32 prime4bytes = 2654435761U;
+static U32    ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }   /* all 4 bytes of u take part */
+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }   /* uses MEM_read32(), where the other widths use a MEM_readLE*() read */
+
+static const U64 prime5bytes = 889523592379ULL;
+static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u  << (64-40)) * prime5bytes) >> (64-h)) ; }   /* the left shift drops the 3 high-order bytes of u */
+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
+
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }   /* the left shift drops the 2 high-order bytes of u */
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+
+static const U64 prime7bytes = 58295818150454627ULL;
+static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u  << (64-56)) * prime7bytes) >> (64-h)) ; }   /* the left shift drops the high-order byte of u */
+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }   /* all 8 bytes of u take part */
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+
+MEM_STATIC size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
+{
+    /* mls selects which ZSTD_hashNPtr() variant is applied to p ;
+     * hBits is handed to it unchanged.
+     * Any mls outside 5..8 (4 included) selects the 4-byte variant. */
+    if (mls == 5) return ZSTD_hash5Ptr(p, hBits);
+    if (mls == 6) return ZSTD_hash6Ptr(p, hBits);
+    if (mls == 7) return ZSTD_hash7Ptr(p, hBits);
+    if (mls == 8) return ZSTD_hash8Ptr(p, hBits);
+    /* mls == 4, and every other value */
+    return ZSTD_hash4Ptr(p, hBits);
+}
+
+/*-*************************************
+*  Round buffer management
+***************************************/
+/* Largest index (position - window.base) tolerated : ZSTD_window_needOverflowCorrection() returns non-zero above it */
+#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX))
+/* Maximum chunk size before overflow correction needs to be called again : the distance left between ZSTD_CURRENT_MAX and the largest U32 */
+#define ZSTD_CHUNKSIZE_MAX                                                     \
+    ( ((U32)-1)                  /* Maximum ending current index */            \
+    - ZSTD_CURRENT_MAX)          /* Maximum beginning lowLimit */
+
+/**
+ * ZSTD_window_clear():
+ * Clears the window containing the history by simply setting it to empty.
+ */
+MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window)
+{
+    /* index of nextSrc relative to base, narrowed to 32 bits */
+    U32 const endIdx = (U32)(size_t)(window->nextSrc - window->base);
+    /* both limits are moved to that index, base and dictBase are left untouched */
+    window->dictLimit = endIdx;
+    window->lowLimit = endIdx;
+}
+
+/**
+ * ZSTD_window_hasExtDict():
+ * Returns non-zero if the window has a non-empty extDict.
+ */
+MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window)
+{   /* non-empty extDict <=> dictLimit lies strictly above lowLimit */
+    return window.dictLimit > window.lowLimit;
+}
+
+/**
+ * ZSTD_matchState_dictMode():
+ * Inspects the provided matchState and figures out what dictMode should be
+ * passed to the compressor.
+ */
+MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms)
+{
+    /* an extDict in the window is tested first, so it wins over an attached dictMatchState */
+    if (ZSTD_window_hasExtDict(ms->window)) return ZSTD_extDict;
+    if (ms->dictMatchState != NULL) return ZSTD_dictMatchState;
+    /* neither of the two */
+    return ZSTD_noDict;
+}
+
+/**
+ * ZSTD_window_needOverflowCorrection():
+ * Returns non-zero if the indices are getting too large and need overflow
+ * protection.
+ */
+MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,
+                                                  void const* srcEnd)
+{
+    BYTE const* const endPtr = (BYTE const*)srcEnd;   /* the index of srcEnd, as a U32, is what gets compared */
+    return (U32)(endPtr - window.base) > ZSTD_CURRENT_MAX;
+}
+
+/**
+ * ZSTD_window_correctOverflow():
+ * Reduces the indices to protect from index overflow.
+ * Returns the correction made to the indices, which must be applied to every
+ * stored index.
+ *
+ * The least significant cycleLog bits of the indices must remain the same,
+ * which may be 0. Every index up to maxDist in the past must be valid.
+ * NOTE: (maxDist & cycleMask) must be zero.
+ */
+MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
+                                           U32 maxDist, void const* src)
+{
+    /* preemptive overflow correction:
+     * 1. correction is large enough:
+     *    lowLimit > (3<<29) ==> current > 3<<29 + 1<<windowLog
+     *    1<<windowLog <= newCurrent < 1<<chainLog + 1<<windowLog
+     *
+     *    current - newCurrent
+     *    > (3<<29 + 1<<windowLog) - (1<<windowLog + 1<<chainLog)
+     *    > (3<<29) - (1<<chainLog)
+     *    > (3<<29) - (1<<30)             (NOTE: chainLog <= 30)
+     *    > 1<<29
+     *
+     * 2. (ip+ZSTD_CHUNKSIZE_MAX - cctx->base) doesn't overflow:
+     *    After correction, current is less than (1<<chainLog + 1<<windowLog).
+     *    In 64-bit mode we are safe, because we have 64-bit ptrdiff_t.
+     *    In 32-bit mode we are safe, because (chainLog <= 29), so
+     *    ip+ZSTD_CHUNKSIZE_MAX - cctx->base < 1<<32.
+     * 3. (cctx->lowLimit + 1<<windowLog) < 1<<32:
+     *    windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32.
+     */
+    U32 const cycleMask = (1U << cycleLog) - 1;                   /* selects the low cycleLog bits of an index */
+    U32 const current = (U32)((BYTE const*)src - window->base);   /* index of src before the correction */
+    U32 const newCurrent = (current & cycleMask) + maxDist;       /* index of src after it : same low cycleLog bits, plus maxDist */
+    U32 const correction = current - newCurrent;
+    assert((maxDist & cycleMask) == 0);
+    assert(current > newCurrent);
+    /* Loose bound, should be around 1<<29 (see above) */
+    assert(correction > 1<<28);
+
+    window->base += correction;        /* raising both bases lowers every index (ptr - base) by `correction` */
+    window->dictBase += correction;
+    window->lowLimit -= correction;    /* the two stored limits are indexes : lowered by the same amount */
+    window->dictLimit -= correction;
+
+    DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction,
+             window->lowLimit);
+    return correction;                 /* the caller must apply it to every other stored index */
+}
+
+/**
+ * ZSTD_window_enforceMaxDist():
+ * Updates lowLimit so that:
+ *    (srcEnd - base) - lowLimit == maxDist + loadedDictEnd
+ *
+ * This allows a simple check that index >= lowLimit to see if index is valid.
+ * This must be called before a block compression call, with srcEnd as the block
+ * source end.
+ *
+ * If loadedDictEndPtr is not NULL, we set it to zero once we update lowLimit.
+ * This is because dictionaries are allowed to be referenced as long as the last
+ * byte of the dictionary is in the window, but once they are out of range,
+ * they cannot be referenced. If loadedDictEndPtr is NULL, we use
+ * loadedDictEnd == 0.
+ *
+ * In normal dict mode, the dict is between lowLimit and dictLimit. In
+ * dictMatchState mode, lowLimit and dictLimit are the same, and the dictionary
+ * is below them. forceWindow and dictMatchState are therefore incompatible.
+ */
+MEM_STATIC void ZSTD_window_enforceMaxDist(ZSTD_window_t* window,
+                                           void const* srcEnd, U32 maxDist,
+                                           U32* loadedDictEndPtr,
+                                           const ZSTD_matchState_t** dictMatchStatePtr)
+{
+    U32 const endIdx = (U32)((BYTE const*)srcEnd - window->base);
+    U32 const dictEnd = (loadedDictEndPtr == NULL) ? 0 : *loadedDictEndPtr;
+    DEBUGLOG(5, "ZSTD_window_enforceMaxDist: current=%u, maxDist=%u", endIdx, maxDist);
+    if (endIdx <= maxDist + dictEnd) return;   /* threshold not exceeded : nothing to update */
+
+    /* lowLimit is raised to (endIdx - maxDist), never lowered */
+    if (window->lowLimit < endIdx - maxDist) window->lowLimit = endIdx - maxDist;
+    /* dictLimit is not allowed to stay below lowLimit */
+    if (window->dictLimit < window->lowLimit) {
+        DEBUGLOG(5, "Update dictLimit to match lowLimit, from %u to %u",
+                    window->dictLimit, window->lowLimit);
+        window->dictLimit = window->lowLimit;
+    }
+    /* reset the caller's dictionary bookkeeping, where provided */
+    if (loadedDictEndPtr != NULL) *loadedDictEndPtr = 0;
+    if (dictMatchStatePtr != NULL) *dictMatchStatePtr = NULL;
+}
+
+/**
+ * ZSTD_window_update():
+ * Updates the window by appending [src, src + srcSize) to the window.
+ * If it is not contiguous, the current prefix becomes the extDict, and we
+ * forget about the extDict. Handles overlap of the prefix and extDict.
+ * Returns non-zero if the segment is contiguous.
+ */
+MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
+                                  void const* src, size_t srcSize)
+{
+    BYTE const* const ip = (BYTE const*)src;
+    U32 contiguous = 1;   /* return value ; cleared when src does not start at window->nextSrc */
+    DEBUGLOG(5, "ZSTD_window_update");
+    /* Check if blocks follow each other */
+    if (src != window->nextSrc) {
+        /* not contiguous */
+        size_t const distanceFromBase = (size_t)(window->nextSrc - window->base);   /* index reached by the previous segment */
+        DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit);
+        window->lowLimit = window->dictLimit;        /* the previous prefix [dictLimit, distanceFromBase) becomes the extDict */
+        assert(distanceFromBase == (size_t)(U32)distanceFromBase);  /* should never overflow */
+        window->dictLimit = (U32)distanceFromBase;
+        window->dictBase = window->base;
+        window->base = ip - distanceFromBase;        /* rebased so that (ip - base) == distanceFromBase : indexes continue from the previous segment */
+        /* ms->nextToUpdate = window->dictLimit;   (disabled statement : no `ms` is in scope in this function) */
+        if (window->dictLimit - window->lowLimit < HASH_READ_SIZE) window->lowLimit = window->dictLimit;   /* too small extDict */
+        contiguous = 0;
+    }
+    window->nextSrc = ip + srcSize;
+    /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */
+    if ( (ip+srcSize > window->dictBase + window->lowLimit)
+       & (ip < window->dictBase + window->dictLimit)) {   /* bitwise & : both comparisons are always evaluated */
+        ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase;   /* end of the input, as an extDict index */
+        U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx;   /* capped at dictLimit */
+        window->lowLimit = lowLimitMax;
+        DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit);
+    }
+    return contiguous;
+}
+
+
+/* debug functions */
+
+MEM_STATIC double ZSTD_fWeight(U32 rawStat)
+{   /* computed on (rawStat + 1) in fixed point with 8 fractional bits, then converted to double */
+    U32 const fracBits = 8;
+    U32 const scale = (1 << fracBits);
+    U32 const shifted = rawStat + 1;
+    U32 const topBit = ZSTD_highbit32(shifted);
+    U32 const bitWeight = topBit * scale;
+    U32 const fineWeight = (shifted << fracBits) >> topBit;
+    assert(topBit + fracBits < 31);
+    /* sum of both parts, scaled back down */
+    return (double)(bitWeight + fineWeight) / scale;
+}
+
+MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
+{   /* logs the sum of table[0..max], then one line per entry */
+    unsigned idx, total = 0;
+    for (idx=0; idx<=max; idx++) total += table[idx];
+    DEBUGLOG(2, "total nb elts: %u", total);
+    for (idx=0; idx<=max; idx++) {
+        DEBUGLOG(2, "%2u: %5u  (%.2f)",
+                idx, table[idx], ZSTD_fWeight(total) - ZSTD_fWeight(table[idx]) );
+    }
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+
+/* ==============================================================
+ * Private declarations
+ * These prototypes shall only be called from within lib/compress
+ * ============================================================== */
+
+/* ZSTD_getCParamsFromCCtxParams() :
+ * cParams are built depending on compressionLevel, src size hints,
+ * LDM and manually set compression parameters.
+ */
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+        const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize);
+
+/*! ZSTD_initCStream_internal() :
+ *  Private use only. Init streaming operation.
+ *  expects params to be valid.
+ *  must receive dict, or cdict, or none, but not both.
+ *  @return : 0, or an error code */
+size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
+                     const void* dict, size_t dictSize,
+                     const ZSTD_CDict* cdict,
+                     ZSTD_CCtx_params  params, unsigned long long pledgedSrcSize);
+
+void ZSTD_resetSeqStore(seqStore_t* ssPtr);   /* NOTE(review): presumably empties the sequence store so that ZSTD_storeSeq() starts again from its beginning - confirm */
+
+/*! ZSTD_compressStream_generic() :
+ *  Private use only. To be called from zstdmt_compress.c in single-thread mode. */
+size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+                                   ZSTD_outBuffer* output,
+                                   ZSTD_inBuffer* input,
+                                   ZSTD_EndDirective const flushMode);
+
+/*! ZSTD_getCParamsFromCDict() :
+ *  as the name implies */
+ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict);
+
+/* ZSTD_compressBegin_advanced_internal() :
+ * Private use only. To be called from zstdmt_compress.c. */
+size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
+                                    const void* dict, size_t dictSize,
+                                    ZSTD_dictContentType_e dictContentType,
+                                    ZSTD_dictTableLoadMethod_e dtlm,
+                                    const ZSTD_CDict* cdict,
+                                    ZSTD_CCtx_params params,
+                                    unsigned long long pledgedSrcSize);
+
+/* ZSTD_compress_advanced_internal() :
+ * Private use only. To be called from zstdmt_compress.c. */
+size_t ZSTD_compress_advanced_internal(ZSTD_CCtx* cctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                 const void* dict,size_t dictSize,
+                                 ZSTD_CCtx_params params);
+
+
+/* ZSTD_writeLastEmptyBlock() :
+ * output an empty Block with end-of-frame mark to complete a frame
+ * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
+ *           or an error code if `dstCapacity` is too small (<ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity);
+
+
+/* ZSTD_referenceExternalSequences() :
+ * Must be called before starting a compression operation.
+ * seqs must parse a prefix of the source.
+ * This cannot be used when long range matching is enabled.
+ * Zstd will use these sequences, and pass the literals to a secondary block
+ * compressor.
+ * @return : An error code on failure.
+ * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory
+ * access and data corruption.
+ */
+size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
+
+
+#endif /* ZSTD_COMPRESS_H */
diff --git a/deps/SZ/zstd/compress/zstd_double_fast.c b/deps/SZ/zstd/compress/zstd_double_fast.c
new file mode 100644
index 0000000000000000000000000000000000000000..7fc11eb482a6d80736dd1b2ed9b8bc12191404a9
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstd_double_fast.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_compress_internal.h"
+#include "zstd_double_fast.h"
+
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                              ZSTD_compressionParameters const* cParams,
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+{
+    /* Walks forward from ms->nextToUpdate in strides of `stride` positions,
+     * as long as a full stride still ends at or before end - HASH_READ_SIZE.
+     * - the first position of each stride is always written to both tables;
+     * - the remaining positions of the stride go to the large table only,
+     *   only into slots still equal to 0, and not at all for ZSTD_dtlm_fast.
+     */
+    const BYTE* const base = ms->window.base;
+    const BYTE* const lastPos = ((const BYTE*)end) - HASH_READ_SIZE;
+    U32* const smallTable = ms->chainTable;
+    U32* const largeTable = ms->hashTable;
+    U32  const smallLog = cParams->chainLog;
+    U32  const largeLog = cParams->hashLog;
+    U32  const mls = cParams->searchLength;
+    U32  const stride = 3;
+    U32  const nbExtra = (dtlm == ZSTD_dtlm_fast) ? 0 : stride - 1;
+    const BYTE* ip;
+
+    for (ip = base + ms->nextToUpdate; ip + stride - 1 <= lastPos; ip += stride) {
+        U32 const pos = (U32)(ip - base);
+        U32 k;
+        /* stride position : unconditional insertion, small table first */
+        smallTable[ZSTD_hashPtr(ip, smallLog, mls)] = pos;
+        largeTable[ZSTD_hashPtr(ip, largeLog, 8)] = pos;
+        /* in-between positions : only fill empty large-table slots */
+        for (k = 1; k <= nbExtra; ++k) {
+            size_t const slot = ZSTD_hashPtr(ip + k, largeLog, 8);
+            if (largeTable[slot] == 0) largeTable[slot] = pos + k;
+        }
+    }
+}
+
+
+FORCE_INLINE_TEMPLATE   /* shared body of the noDict and dictMatchState entry points defined after it */
+size_t ZSTD_compressBlock_doubleFast_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
+        U32 const mls /* template */, ZSTD_dictMode_e const dictMode)
+{
+    U32* const hashLong = ms->hashTable;     /* indexed by ZSTD_hashPtr(.., hBitsL, 8) */
+    const U32 hBitsL = cParams->hashLog;
+    U32* const hashSmall = ms->chainTable;   /* indexed by ZSTD_hashPtr(.., hBitsS, mls) */
+    const U32 hBitsS = cParams->chainLog;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;                 /* current search position */
+    const BYTE* anchor = istart;             /* first byte not yet covered by a stored sequence */
+    const U32 prefixLowestIndex = ms->window.dictLimit;
+    const BYTE* const prefixLowest = base + prefixLowestIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+    U32 offset_1=rep[0], offset_2=rep[1];
+    U32 offsetSaved = 0;                     /* keeps a rep offset that init zeroes; written back when reps are saved */
+    /* dictionary-side views : taken from dms only when dictMode == ZSTD_dictMatchState, NULL / 0 otherwise */
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const U32* const dictHashLong  = dictMode == ZSTD_dictMatchState ?
+                                     dms->hashTable : NULL;
+    const U32* const dictHashSmall = dictMode == ZSTD_dictMatchState ?
+                                     dms->chainTable : NULL;
+    const U32 dictStartIndex       = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.dictLimit : 0;
+    const BYTE* const dictBase     = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.base : NULL;
+    const BYTE* const dictStart    = dictMode == ZSTD_dictMatchState ?
+                                     dictBase + dictStartIndex : NULL;
+    const BYTE* const dictEnd      = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.nextSrc : NULL;
+    const U32 dictIndexDelta       = dictMode == ZSTD_dictMatchState ?
+                                     prefixLowestIndex - (U32)(dictEnd - dictBase) :
+                                     0;   /* dict index + dictIndexDelta is what gets subtracted from `current` to form an offset */
+    const U32 dictAndPrefixLength  = (U32)(ip - prefixLowest + dictEnd - dictStart);
+    /* only these two modes are handled here; extDict has its own function further down */
+    assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState);
+
+    /* init */
+    ip += (dictAndPrefixLength == 0);   /* start one byte later when neither prefix nor dictionary data precedes ip */
+    if (dictMode == ZSTD_noDict) {
+        U32 const maxRep = (U32)(ip - prefixLowest);
+        if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;   /* 0 makes the (offset > 0) repcode tests below fail */
+        if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+    }
+    if (dictMode == ZSTD_dictMatchState) {
+        /* dictMatchState repCode checks don't currently handle repCode == 0
+         * disabling. */
+        assert(offset_1 <= dictAndPrefixLength);
+        assert(offset_2 <= dictAndPrefixLength);
+    }
+
+    /* Main Search Loop */
+    while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
+        size_t mLength;
+        U32 offset;
+        size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
+        size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
+        U32 const current = (U32)(ip-base);
+        U32 const matchIndexL = hashLong[h2];
+        U32 matchIndexS = hashSmall[h];
+        const BYTE* matchLong = base + matchIndexL;
+        const BYTE* match = base + matchIndexS;
+        const U32 repIndex = current + 1 - offset_1;
+        const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
+                            && repIndex < prefixLowestIndex) ?
+                               dictBase + (repIndex - dictIndexDelta) :
+                               base + repIndex;
+        hashLong[h2] = hashSmall[h] = current;   /* update hash tables */
+
+        /* check dictMatchState repcode */
+        if (dictMode == ZSTD_dictMatchState
+            && ((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+            && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);   /* stored with offset code 0; offset_1/offset_2 are left unchanged */
+            goto _match_stored;
+        }
+
+        /* check noDict repcode */
+        if ( dictMode == ZSTD_noDict
+          && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
+            mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
+            goto _match_stored;
+        }
+
+        /* check prefix long match */
+        if ( (matchIndexL > prefixLowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip)) ) {
+            mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8;
+            offset = (U32)(ip-matchLong);
+            while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
+            goto _match_found;
+        }
+
+        /* check dictMatchState long match */
+        if (dictMode == ZSTD_dictMatchState) {
+            U32 const dictMatchIndexL = dictHashLong[h2];
+            const BYTE* dictMatchL = dictBase + dictMatchIndexL;
+            assert(dictMatchL < dictEnd);
+
+            if (dictMatchL > dictStart && MEM_read64(dictMatchL) == MEM_read64(ip)) {
+                mLength = ZSTD_count_2segments(ip+8, dictMatchL+8, iend, dictEnd, prefixLowest) + 8;
+                offset = (U32)(current - dictMatchIndexL - dictIndexDelta);
+                while (((ip>anchor) & (dictMatchL>dictStart)) && (ip[-1] == dictMatchL[-1])) { ip--; dictMatchL--; mLength++; } /* catch up */
+                goto _match_found;
+            }
+        }
+
+        /* check prefix short match */
+        if ( (matchIndexS > prefixLowestIndex) && (MEM_read32(match) == MEM_read32(ip)) ) {
+            goto _search_next_long;
+        }
+
+        /* check dictMatchState short match */
+        if (dictMode == ZSTD_dictMatchState) {
+            U32 const dictMatchIndexS = dictHashSmall[h];
+            match = dictBase + dictMatchIndexS;
+            matchIndexS = dictMatchIndexS + dictIndexDelta;   /* rebased, so that current - matchIndexS can be used as the offset later */
+
+            if (match > dictStart && MEM_read32(match) == MEM_read32(ip)) {
+                goto _search_next_long;
+            }
+        }
+        /* no candidate at all : the step grows with the distance from the last stored sequence */
+        ip += ((ip-anchor) >> kSearchStrength) + 1;
+        continue;
+
+_search_next_long:
+        /* a short match exists at ip (in `match`); a long match at ip+1 is tried first */
+        {
+            size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+            U32 const matchIndexL3 = hashLong[hl3];
+            const BYTE* matchL3 = base + matchIndexL3;
+            hashLong[hl3] = current + 1;
+
+            /* check prefix long +1 match */
+            if ( (matchIndexL3 > prefixLowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1)) ) {
+                mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8;
+                ip++;
+                offset = (U32)(ip-matchL3);
+                while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
+                goto _match_found;
+            }
+
+            /* check dict long +1 match */
+            if (dictMode == ZSTD_dictMatchState) {
+                U32 const dictMatchIndexL3 = dictHashLong[hl3];
+                const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
+                assert(dictMatchL3 < dictEnd);
+                if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
+                    mLength = ZSTD_count_2segments(ip+1+8, dictMatchL3+8, iend, dictEnd, prefixLowest) + 8;
+                    ip++;
+                    offset = (U32)(current + 1 - dictMatchIndexL3 - dictIndexDelta);
+                    while (((ip>anchor) & (dictMatchL3>dictStart)) && (ip[-1] == dictMatchL3[-1])) { ip--; dictMatchL3--; mLength++; } /* catch up */
+                    goto _match_found;
+                }
+            }
+        }
+
+        /* if no long +1 match, explore the short match we found */
+        if (dictMode == ZSTD_dictMatchState && matchIndexS < prefixLowestIndex) {
+            mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4;
+            offset = (U32)(current - matchIndexS);
+            while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+        } else {
+            mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+            offset = (U32)(ip - match);
+            while (((ip>anchor) & (match>prefixLowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+        }
+
+        /* fall-through */
+
+_match_found:
+        offset_2 = offset_1;   /* a new explicit offset pushes the previous one down */
+        offset_1 = offset;
+
+        ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+_match_stored:
+        /* match found */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Fill Table */
+            hashLong[ZSTD_hashPtr(base+current+2, hBitsL, 8)] =
+                hashSmall[ZSTD_hashPtr(base+current+2, hBitsS, mls)] = current+2;  /* here because current+2 could be > iend-8 */
+            hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] =
+                hashSmall[ZSTD_hashPtr(ip-2, hBitsS, mls)] = (U32)(ip-2-base);
+
+            /* check immediate repcode */
+            if (dictMode == ZSTD_dictMatchState) {
+                while (ip <= ilimit) {
+                    U32 const current2 = (U32)(ip-base);
+                    U32 const repIndex2 = current2 - offset_2;
+                    const BYTE* repMatch2 = dictMode == ZSTD_dictMatchState
+                        && repIndex2 < prefixLowestIndex ?
+                            dictBase - dictIndexDelta + repIndex2 :
+                            base + repIndex2;
+                    if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional underflow */)
+                       && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                        const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
+                        size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
+                        U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                        ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH);   /* literal length 0 : ip == anchor at this point */
+                        hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+                        hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+                        ip += repLength2;
+                        anchor = ip;
+                        continue;
+                    }
+                    break;
+                }
+            }
+
+            if (dictMode == ZSTD_noDict) {
+                while ( (ip <= ilimit)
+                     && ( (offset_2>0)
+                        & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+                    /* store sequence */
+                    size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                    U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;  /* swap offset_2 <=> offset_1 */
+                    hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
+                    hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
+                    ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH);
+                    ip += rLength;
+                    anchor = ip;
+                    continue;   /* faster when present ... (?) */
+    }   }   }   }
+
+    /* save reps for next block */
+    rep[0] = offset_1 ? offset_1 : offsetSaved;   /* an offset still at 0 is replaced by the value set aside during init */
+    rep[1] = offset_2 ? offset_2 : offsetSaved;
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_doubleFast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    /* Picks the ZSTD_noDict specialization of the generic body : searchLength
+     * 5, 6 and 7 each get their own, any other value (3 included) uses 4. */
+    U32 const minMatch = cParams->searchLength;
+
+    if (minMatch == 5)
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 5, ZSTD_noDict);
+    if (minMatch == 6)
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 6, ZSTD_noDict);
+    if (minMatch == 7)
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 7, ZSTD_noDict);
+
+    /* 4, and everything else */
+    return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 4, ZSTD_noDict);
+}
+
+
+size_t ZSTD_compressBlock_doubleFast_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    /* Picks the ZSTD_dictMatchState specialization of the generic body : searchLength
+     * 5, 6 and 7 each get their own, any other value (3 included) uses 4. */
+    U32 const minMatch = cParams->searchLength;
+
+    if (minMatch == 5)
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 5, ZSTD_dictMatchState);
+    if (minMatch == 6)
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 6, ZSTD_dictMatchState);
+    if (minMatch == 7)
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 7, ZSTD_dictMatchState);
+
+    /* 4, and everything else */
+    return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 4, ZSTD_dictMatchState);
+}
+
+
+static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
+        U32 const mls /* template */)
+{   /* double-hash search where an index below window.dictLimit is read through window.dictBase instead of window.base */
+    U32* const hashLong = ms->hashTable;     /* indexed by ZSTD_hashPtr(.., hBitsL, 8) */
+    U32  const hBitsL = cParams->hashLog;
+    U32* const hashSmall = ms->chainTable;   /* indexed by ZSTD_hashPtr(.., hBitsS, mls) */
+    U32  const hBitsS = cParams->chainLog;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;                 /* current search position */
+    const BYTE* anchor = istart;             /* first byte not yet covered by a stored sequence */
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const U32   prefixStartIndex = ms->window.dictLimit;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const prefixStart = base + prefixStartIndex;
+    const U32   dictStartIndex = ms->window.lowLimit;   /* candidates must have an index strictly above this one */
+    const BYTE* const dictBase = ms->window.dictBase;
+    const BYTE* const dictStart = dictBase + dictStartIndex;
+    const BYTE* const dictEnd = dictBase + prefixStartIndex;
+    U32 offset_1=rep[0], offset_2=rep[1];
+
+    DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_extDict_generic (srcSize=%zu)", srcSize);
+
+    /* Search Loop */
+    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
+        const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls);
+        const U32 matchIndex = hashSmall[hSmall];
+        const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
+        const BYTE* match = matchBase + matchIndex;
+
+        const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8);
+        const U32 matchLongIndex = hashLong[hLong];
+        const BYTE* const matchLongBase = matchLongIndex < prefixStartIndex ? dictBase : base;
+        const BYTE* matchLong = matchLongBase + matchLongIndex;
+
+        const U32 current = (U32)(ip-base);
+        const U32 repIndex = current + 1 - offset_1;   /* offset_1 expected <= current +1 */
+        const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+        const BYTE* const repMatch = repBase + repIndex;
+        size_t mLength;
+        hashSmall[hSmall] = hashLong[hLong] = current;   /* update hash table */
+        /* order of attempts : repcode at ip+1, long match at ip, then short match at ip (after trying a long match at ip+1) */
+        if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */
+            & (repIndex > dictStartIndex))
+          && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);   /* stored with offset code 0; offset_1/offset_2 are left unchanged */
+        } else {
+            if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
+                const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
+                const BYTE* const lowMatchPtr = matchLongIndex < prefixStartIndex ? dictStart : prefixStart;
+                U32 offset;
+                mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, prefixStart) + 8;
+                offset = current - matchLongIndex;
+                while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; }   /* catch up */
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+            } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {
+                size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+                U32 const matchIndex3 = hashLong[h3];
+                const BYTE* const match3Base = matchIndex3 < prefixStartIndex ? dictBase : base;
+                const BYTE* match3 = match3Base + matchIndex3;
+                U32 offset;
+                hashLong[h3] = current + 1;
+                if ( (matchIndex3 > dictStartIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) {   /* long match at ip+1 is used instead of the short one at ip */
+                    const BYTE* const matchEnd = matchIndex3 < prefixStartIndex ? dictEnd : iend;
+                    const BYTE* const lowMatchPtr = matchIndex3 < prefixStartIndex ? dictStart : prefixStart;
+                    mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, prefixStart) + 8;
+                    ip++;
+                    offset = current+1 - matchIndex3;
+                    while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */
+                } else {
+                    const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
+                    const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
+                    mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
+                    offset = current - matchIndex;
+                    while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
+                }
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+            } else {
+                ip += ((ip-anchor) >> kSearchStrength) + 1;   /* no candidate : the step grows with the distance from the last stored sequence */
+                continue;
+        }   }
+
+        /* found a match : store it */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Fill Table */
+            hashSmall[ZSTD_hashPtr(base+current+2, hBitsS, mls)] = current+2;
+            hashLong[ZSTD_hashPtr(base+current+2, hBitsL, 8)] = current+2;
+            hashSmall[ZSTD_hashPtr(ip-2, hBitsS, mls)] = (U32)(ip-2-base);
+            hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+            /* check immediate repcode */
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex2 = current2 - offset_2;
+                const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+                if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3)   /* intentional underflow : ensure repIndex2 doesn't overlap dict + prefix */
+                    & (repIndex2 > dictStartIndex))
+                  && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                    U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH);   /* literal length 0 : ip == anchor at this point */
+                    hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+                    hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+                    ip += repLength2;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+    }   }   }
+
+    /* save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_doubleFast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    /* Picks the specialization of the extDict generic body : searchLength
+     * 5, 6 and 7 each get their own, any other value (3 included) uses 4. */
+    U32 const minMatch = cParams->searchLength;
+
+    if (minMatch == 5)
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 5);
+    if (minMatch == 6)
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 6);
+    if (minMatch == 7)
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 7);
+
+    /* 4, and everything else */
+    return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 4);
+}
diff --git a/deps/SZ/zstd/compress/zstd_double_fast.h b/deps/SZ/zstd/compress/zstd_double_fast.h
new file mode 100644
index 0000000000000000000000000000000000000000..c475021d29daefcd7e9f826aff28e06c90479cda
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstd_double_fast.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_DOUBLE_FAST_H
+#define ZSTD_DOUBLE_FAST_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "mem.h"      /* U32 */
+#include "zstd_compress_internal.h"     /* types used by the prototypes below (ZSTD_matchState_t, seqStore_t, ...), size_t */
+/* Loads positions starting at ms->nextToUpdate into ms->hashTable and ms->chainTable (see zstd_double_fast.c). */
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                              ZSTD_compressionParameters const* cParams,
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+size_t ZSTD_compressBlock_doubleFast(   /* ZSTD_noDict variant */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_doubleFast_dictMatchState(   /* ZSTD_dictMatchState variant */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_doubleFast_extDict(   /* variant reading old data through ms->window.dictBase */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+/* Each block compressor above returns the size of the last literals (iend - anchor in zstd_double_fast.c) and updates rep[]. */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_DOUBLE_FAST_H */
diff --git a/deps/SZ/zstd/compress/zstd_fast.c b/deps/SZ/zstd/compress/zstd_fast.c
new file mode 100644
index 0000000000000000000000000000000000000000..37a715167c62732fcbf5f4215322bafe89c45ec1
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstd_fast.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_compress_internal.h"
+#include "zstd_fast.h"
+
+
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+                        ZSTD_compressionParameters const* cParams,
+                        void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+{
+    /* Indexes positions from ms->nextToUpdate up to (end - HASH_READ_SIZE)
+     * in groups of 3. The first position of a group always overwrites its
+     * hash slot; the other two only claim a slot that still holds 0, and
+     * are not visited at all when dtlm == ZSTD_dtlm_fast. */
+    U32* const table = ms->hashTable;
+    U32  const hashLog = cParams->hashLog;
+    U32  const minMatch = cParams->searchLength;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const lastPos = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 groupSize = 3;
+    U32 const perGroup = (dtlm == ZSTD_dtlm_fast) ? 1 : groupSize;
+    const BYTE* cursor;
+
+    for (cursor = base + ms->nextToUpdate; cursor + groupSize - 1 <= lastPos; cursor += groupSize) {
+        U32 const groupIndex = (U32)(cursor - base);
+        U32 k;
+        /* leading position : unconditional store */
+        table[ZSTD_hashPtr(cursor, hashLog, minMatch)] = groupIndex;
+        for (k = 1; k < perGroup; ++k) {
+            size_t const slot = ZSTD_hashPtr(cursor + k, hashLog, minMatch);
+            if (table[slot] == 0) table[slot] = groupIndex + k;
+        }
+    }
+}
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_fast_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize,
+        U32 const hlog, U32 stepSize, U32 const mls,
+        ZSTD_dictMode_e const dictMode)
+{   /* Shared body of the "fast" strategy for dictMode ZSTD_noDict / ZSTD_dictMatchState (asserted below) : stores sequences into seqStore, reads and rewrites rep[0..1], returns the size of the trailing literals (iend - anchor). */
+    U32* const hashTable = ms->hashTable;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const U32   prefixStartIndex = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + prefixStartIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+    U32 offset_1=rep[0], offset_2=rep[1];
+    U32 offsetSaved = 0;
+    /* dictionary-side views : taken from ms->dictMatchState only when dictMode == ZSTD_dictMatchState, otherwise NULL / 0 */
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const U32* const dictHashTable = dictMode == ZSTD_dictMatchState ?
+                                     dms->hashTable : NULL;
+    const U32 dictStartIndex       = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.dictLimit : 0;
+    const BYTE* const dictBase     = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.base : NULL;
+    const BYTE* const dictStart    = dictMode == ZSTD_dictMatchState ?
+                                     dictBase + dictStartIndex : NULL;
+    const BYTE* const dictEnd      = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.nextSrc : NULL;
+    const U32 dictIndexDelta       = dictMode == ZSTD_dictMatchState ?
+                                     prefixStartIndex - (U32)(dictEnd - dictBase) :
+                                     0;
+    const U32 dictAndPrefixLength  = (U32)(ip - prefixStart + dictEnd - dictStart);
+
+    assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState);
+
+    /* otherwise, we would get index underflow when translating a dict index
+     * into a local index */
+    assert(dictMode != ZSTD_dictMatchState
+        || prefixStartIndex >= (U32)(dictEnd - dictBase));
+
+    /* init */
+    stepSize += !stepSize;  /* support stepSize of 0 */
+    ip += (dictAndPrefixLength == 0);   /* nothing precedes ip (no prefix, no dictionary content) : start one byte later */
+    if (dictMode == ZSTD_noDict) {
+        U32 const maxRep = (U32)(ip - prefixStart);
+        if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;   /* offset larger than the available prefix : disable it (0), keep its value in offsetSaved */
+        if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+    }
+    if (dictMode == ZSTD_dictMatchState) {
+        /* dictMatchState repCode checks don't currently handle repCode == 0
+         * disabling. */
+        assert(offset_1 <= dictAndPrefixLength);
+        assert(offset_2 <= dictAndPrefixLength);
+    }
+
+    /* Main Search Loop */
+    while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
+        size_t mLength;
+        size_t const h = ZSTD_hashPtr(ip, hlog, mls);
+        U32 const current = (U32)(ip-base);
+        U32 const matchIndex = hashTable[h];
+        const BYTE* match = base + matchIndex;
+        const U32 repIndex = current + 1 - offset_1;
+        const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
+                            && repIndex < prefixStartIndex) ?
+                               dictBase + (repIndex - dictIndexDelta) :
+                               base + repIndex;
+        hashTable[h] = current;   /* update hash table */
+
+        if ( (dictMode == ZSTD_dictMatchState)
+          && ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
+          && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
+            ip++;   /* the repcode match was tested at ip+1 */
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
+        } else if ( dictMode == ZSTD_noDict
+                 && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
+            mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+            ip++;   /* the repcode match was tested at ip+1 */
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
+        } else if ( (matchIndex <= prefixStartIndex)
+                 || (MEM_read32(match) != MEM_read32(ip)) ) {
+            if (dictMode == ZSTD_dictMatchState) {
+                U32 const dictMatchIndex = dictHashTable[h];
+                const BYTE* dictMatch = dictBase + dictMatchIndex;
+                if (dictMatchIndex <= dictStartIndex ||
+                    MEM_read32(dictMatch) != MEM_read32(ip)) {
+                    assert(stepSize >= 1);
+                    ip += ((ip-anchor) >> kSearchStrength) + stepSize;   /* no match : the skip grows with the distance from anchor */
+                    continue;
+                } else {
+                    /* found a dict match */
+                    U32 const offset = (U32)(current-dictMatchIndex-dictIndexDelta);
+                    mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4;
+                    while (((ip>anchor) & (dictMatch>dictStart))
+                         && (ip[-1] == dictMatch[-1])) {
+                        ip--; dictMatch--; mLength++;
+                    } /* catch up */
+                    offset_2 = offset_1;
+                    offset_1 = offset;
+                    ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+                }
+            } else {
+                assert(stepSize >= 1);
+                ip += ((ip-anchor) >> kSearchStrength) + stepSize;   /* no match : the skip grows with the distance from anchor */
+                continue;
+            }
+        } else {
+            /* found a regular match */
+            U32 const offset = (U32)(ip-match);
+            mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+            while (((ip>anchor) & (match>prefixStart))
+                 && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+            offset_2 = offset_1;
+            offset_1 = offset;
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+        }
+
+        /* match found */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Fill Table */
+            hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2;  /* here because current+2 could be > iend-8 */
+            hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+
+            /* check immediate repcode */
+            if (dictMode == ZSTD_dictMatchState) {
+                while (ip <= ilimit) {
+                    U32 const current2 = (U32)(ip-base);
+                    U32 const repIndex2 = current2 - offset_2;
+                    const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
+                            dictBase - dictIndexDelta + repIndex2 :
+                            base + repIndex2;
+                    if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
+                       && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                        const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                        size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                        U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                        ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH);
+                        hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
+                        ip += repLength2;
+                        anchor = ip;
+                        continue;
+                    }
+                    break;
+                }
+            }
+
+            if (dictMode == ZSTD_noDict) {
+                while ( (ip <= ilimit)
+                     && ( (offset_2>0)
+                        & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+                    /* store sequence */
+                    size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                    U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;  /* swap offset_2 <=> offset_1 */
+                    hashTable[ZSTD_hashPtr(ip, hlog, mls)] = (U32)(ip-base);
+                    ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH);
+                    ip += rLength;
+                    anchor = ip;
+                    continue;   /* faster when present ... (?) */
+    }   }   }   }
+
+    /* save reps for next block */
+    rep[0] = offset_1 ? offset_1 : offsetSaved;   /* an offset still at 0 is replaced by offsetSaved */
+    rep[1] = offset_2 ? offset_2 : offsetSaved;
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_fast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{   /* "fast" strategy entry point for a match state without dictMatchState (asserted). */
+    U32 const hashLog = cParams->hashLog;
+    U32 const step = cParams->targetLength;
+    assert(ms->dictMatchState == NULL);
+    /* every call hands mls and dictMode to the generic body as literals */
+    switch(cParams->searchLength)
+    {
+    case 7 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hashLog, step, 7, ZSTD_noDict);
+    case 6 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hashLog, step, 6, ZSTD_noDict);
+    case 5 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hashLog, step, 5, ZSTD_noDict);
+    case 4 :
+    default: /* any other value, 3 included, is handled as 4 */
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hashLog, step, 4, ZSTD_noDict);
+    }
+}
+
+size_t ZSTD_compressBlock_fast_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{   /* "fast" strategy entry point for a match state that carries a dictMatchState (asserted). */
+    U32 const hashLog = cParams->hashLog;
+    U32 const step = cParams->targetLength;
+    assert(ms->dictMatchState != NULL);
+    /* every call hands mls and dictMode to the generic body as literals */
+    switch(cParams->searchLength)
+    {
+    case 7 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hashLog, step, 7, ZSTD_dictMatchState);
+    case 6 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hashLog, step, 6, ZSTD_dictMatchState);
+    case 5 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hashLog, step, 5, ZSTD_dictMatchState);
+    case 4 :
+    default: /* any other value, 3 included, is handled as 4 */
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hashLog, step, 4, ZSTD_dictMatchState);
+    }
+}
+
+
+static size_t ZSTD_compressBlock_fast_extDict_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize,
+        U32 const hlog, U32 stepSize, U32 const mls)
+{   /* "fast" strategy body where indexes below ms->window.dictLimit are addressed through ms->window.dictBase instead of base : stores sequences into seqStore, reads and rewrites rep[0..1], returns the size of the trailing literals (iend - anchor). */
+    U32* hashTable = ms->hashTable;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const U32   dictStartIndex = ms->window.lowLimit;
+    const BYTE* const dictStart = dictBase + dictStartIndex;
+    const U32   prefixStartIndex = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + prefixStartIndex;
+    const BYTE* const dictEnd = dictBase + prefixStartIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    U32 offset_1=rep[0], offset_2=rep[1];
+
+    stepSize += !stepSize;   /* support stepSize == 0 */
+
+    /* Search Loop */
+    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
+        const size_t h = ZSTD_hashPtr(ip, hlog, mls);
+        const U32    matchIndex = hashTable[h];
+        const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;   /* pick the segment the index belongs to */
+        const BYTE*  match = matchBase + matchIndex;
+        const U32    current = (U32)(ip-base);
+        const U32    repIndex = current + 1 - offset_1;
+        const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+        const BYTE* const repMatch = repBase + repIndex;
+        size_t mLength;
+        hashTable[h] = current;   /* update hash table */
+        assert(offset_1 <= current +1);   /* check repIndex */
+
+        if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex))
+           && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
+            ip++;   /* the repcode match was tested at ip+1 */
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
+        } else {
+            if ( (matchIndex < dictStartIndex) ||
+                 (MEM_read32(match) != MEM_read32(ip)) ) {
+                assert(stepSize >= 1);
+                ip += ((ip-anchor) >> kSearchStrength) + stepSize;   /* no match : the skip grows with the distance from anchor */
+                continue;
+            }
+            {   const BYTE* matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
+                const BYTE* lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
+                U32 offset;
+                mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
+                while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
+                offset = current - matchIndex;   /* current and matchIndex are not modified by the catch-up loop */
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+        }   }
+
+        /* found a match : store it */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Fill Table */
+            hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2;
+            hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+            /* check immediate repcode */
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex2 = current2 - offset_2;
+                const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+                if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex))  /* intentional overflow */
+                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH);
+                    hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
+                    ip += repLength2;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+    }   }   }
+
+    /* save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_fast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{   /* "fast" strategy entry point for the extDict body; forwards with a literal mls. */
+    U32 const hashLog = cParams->hashLog;
+    U32 const step = cParams->targetLength;
+    /* searchLength selects which literal mls is handed to the generic body */
+    switch(cParams->searchLength)
+    {
+    case 7 :
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hashLog, step, 7);
+    case 6 :
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hashLog, step, 6);
+    case 5 :
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hashLog, step, 5);
+    case 4 :
+    default: /* any other value, 3 included, is handled as 4 */
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hashLog, step, 4);
+    }
+}
diff --git a/deps/SZ/zstd/compress/zstd_fast.h b/deps/SZ/zstd/compress/zstd_fast.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e7435f8c60fabee248bf6b685d0f36e8077e0e3
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstd_fast.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_FAST_H
+#define ZSTD_FAST_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "mem.h"      /* U32 */
+#include "zstd_compress_internal.h"
+
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+                        ZSTD_compressionParameters const* cParams,
+                        void const* end, ZSTD_dictTableLoadMethod_e dtlm);   /* inserts positions starting at ms->nextToUpdate, up to (end - HASH_READ_SIZE), into ms->hashTable */
+size_t ZSTD_compressBlock_fast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);   /* asserts ms->dictMatchState == NULL; returns the size of the last literals */
+size_t ZSTD_compressBlock_fast_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);   /* asserts ms->dictMatchState != NULL; returns the size of the last literals */
+size_t ZSTD_compressBlock_fast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);   /* indexes below ms->window.dictLimit are read through ms->window.dictBase; returns the size of the last literals */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_FAST_H */
diff --git a/deps/SZ/zstd/compress/zstd_lazy.c b/deps/SZ/zstd/compress/zstd_lazy.c
new file mode 100644
index 0000000000000000000000000000000000000000..bfe944928202807aec1fbefc8a1493d1ecb8d097
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstd_lazy.c
@@ -0,0 +1,1090 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_compress_internal.h"
+#include "zstd_lazy.h"
+
+
+/*-*************************************
+*  Binary Tree search
+***************************************/
+
+void ZSTD_updateDUBT(
+                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                const BYTE* ip, const BYTE* iend,
+                U32 mls)
+{
+    /* Registers every index from ms->nextToUpdate up to (ip - base), excluded :
+     * each one becomes the head of its hash slot, links to the previous head,
+     * and is flagged ZSTD_DUBT_UNSORTED_MARK. ms->nextToUpdate ends at (ip - base). */
+    const BYTE* const base = ms->window.base;
+    U32 const endIndex = (U32)(ip - base);
+    U32 pos = ms->nextToUpdate;
+
+    U32* const heads = ms->hashTable;
+    U32  const headLog = cParams->hashLog;
+
+    U32* const tree = ms->chainTable;   /* two U32 cells per index */
+    U32  const treeMask = (1 << (cParams->chainLog - 1)) - 1;
+
+    if (pos != endIndex)
+        DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
+                    pos, endIndex, ms->window.dictLimit);
+    assert(ip + 8 <= iend);   /* precondition of ZSTD_hashPtr */
+    (void)iend;   /* only referenced by the assert above */
+
+    assert(pos >= ms->window.dictLimit);   /* base+pos must be a valid address */
+    while (pos < endIndex) {
+        size_t const slot = ZSTD_hashPtr(base + pos, headLog, mls);   /* relies on ip + 8 <= iend */
+        U32* const cell = tree + 2*(pos & treeMask);
+
+        DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", pos);
+        cell[0] = heads[slot];   /* former head becomes the next candidate, chain-like */
+        cell[1] = ZSTD_DUBT_UNSORTED_MARK;
+        heads[slot] = pos;   /* this index is the new head of the slot */
+        pos++;
+    }
+    ms->nextToUpdate = endIndex;
+}
+
+
+/** ZSTD_insertDUBT1() :
+ *  sort one already inserted but unsorted position (`current`) into the tree kept in ms->chainTable,
+ *  examining at most nbCompares candidates whose index is > ms->window.lowLimit.
+ *  assumption : current >= btLow (asserted); original note : btlow == (current - btmask). doesn't fail */
+static void ZSTD_insertDUBT1(
+                 ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                 U32 current, const BYTE* inputEnd,
+                 U32 nbCompares, U32 btLow, const ZSTD_dictMode_e dictMode)
+{
+    U32*   const bt = ms->chainTable;
+    U32    const btLog  = cParams->chainLog - 1;
+    U32    const btMask = (1 << btLog) - 1;
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current;
+    const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* match;
+    U32* smallerPtr = bt + 2*(current&btMask);   /* the two cells of `current` : [0] = smaller side, [1] = larger side */
+    U32* largerPtr  = smallerPtr + 1;
+    U32 matchIndex = *smallerPtr;   /* first candidate : the link stored in cell [0] of `current` */
+    U32 dummy32;   /* to be nullified at the end */
+    U32 const windowLow = ms->window.lowLimit;
+
+    DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
+                current, dictLimit, windowLow);
+    assert(current >= btLow);
+    assert(ip < iend);   /* condition for ZSTD_count */
+
+    while (nbCompares-- && (matchIndex > windowLow)) {
+        U32* const nextPtr = bt + 2*(matchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        assert(matchIndex < current);
+
+        if ( (dictMode != ZSTD_extDict)
+          || (matchIndex+matchLength >= dictLimit)  /* both in current segment*/
+          || (current < dictLimit) /* both in extDict */) {
+            const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
+                                     || (matchIndex+matchLength >= dictLimit)) ?
+                                        base : dictBase;
+            assert( (matchIndex+matchLength >= dictLimit)   /* might be wrong if extDict is incorrectly set to 0 */
+                 || (current < dictLimit) );
+            match = mBase + matchIndex;
+            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+        } else {
+            match = dictBase + matchIndex;   /* candidate starts in the dictBase segment while ip is in the base segment */
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+        }
+
+        DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
+                    current, matchIndex, (U32)matchLength);
+
+        if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+            break;   /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
+        }
+
+        if (match[matchLength] < ip[matchLength]) {  /* necessarily within buffer */
+            /* match is smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
+                        matchIndex, btLow, nextPtr[1]);
+            smallerPtr = nextPtr+1;               /* new "candidate" => larger than match, which was smaller than target */
+            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous and closer to current */
+        } else {
+            /* match is larger than current */
+            *largerPtr = matchIndex;
+            commonLengthLarger = matchLength;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
+                        matchIndex, btLow, nextPtr[0]);
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;   /* zero whatever both pointers still designate (a tree cell, or dummy32 after a btLow stop) */
+}
+
+
+static size_t ZSTD_DUBT_findBetterDictMatch (
+        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+        const BYTE* const ip, const BYTE* const iend,
+        size_t* offsetPtr,
+        size_t bestLength,
+        U32 nbCompares,
+        U32 const mls,
+        const ZSTD_dictMode_e dictMode) {   /* Walks the tree of ms->dictMatchState (at most nbCompares steps) looking for a match longer than bestLength; when one also passes the length-vs-offset test below, *offsetPtr is overwritten. Returns the resulting best length (bestLength unchanged if nothing better). */
+    const ZSTD_matchState_t * const dms = ms->dictMatchState;
+    const U32 * const dictHashTable = dms->hashTable;
+    U32         const hashLog = cParams->hashLog;
+    size_t      const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32               dictMatchIndex = dictHashTable[h];
+
+    const BYTE* const base = ms->window.base;
+    const BYTE* const prefixStart = base + ms->window.dictLimit;
+    U32         const current = (U32)(ip-base);
+    const BYTE* const dictBase = dms->window.base;
+    const BYTE* const dictEnd = dms->window.nextSrc;
+    U32         const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
+    U32         const dictLowLimit = dms->window.lowLimit;
+    U32         const dictIndexDelta = ms->window.lowLimit - dictHighLimit;   /* added to a dms index to express it in the index space of ms */
+
+    U32*        const dictBt = dms->chainTable;
+    U32         const btLog  = cParams->chainLog - 1;
+    U32         const btMask = (1 << btLog) - 1;
+    U32         const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;
+
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    U32 matchEndIdx = current+8+1;
+
+    (void)dictMode;   /* only referenced by the assert below */
+    assert(dictMode == ZSTD_dictMatchState);
+
+    while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
+        U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        const BYTE* match = dictBase + dictMatchIndex;
+        matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+        if (dictMatchIndex+matchLength >= dictHighLimit)
+            match = base + dictMatchIndex + dictIndexDelta;   /* to prepare for next usage of match[matchLength] */
+
+        if (matchLength > bestLength) {
+            U32 matchIndex = dictMatchIndex + dictIndexDelta;
+            if (matchLength > matchEndIdx - matchIndex)
+                matchEndIdx = matchIndex + (U32)matchLength;
+            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {   /* accept only if 4x the length gain exceeds the growth of the offset's highbit */
+                DEBUGLOG(9, "ZSTD_DUBT_findBestDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
+                    current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex);   /* NOTE(review): log text names findBestDictMatch, not this function */
+                bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+            }
+            if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+                break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+            }
+        }
+
+        if (match[matchLength] < ip[matchLength]) {
+            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            dictMatchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+        } else {
+            /* match is larger than current */
+            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
+            commonLengthLarger = matchLength;
+            dictMatchIndex = nextPtr[0];
+        }
+    }
+
+    if (bestLength >= MINMATCH) {
+        U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;   /* mIndex is only consumed by the DEBUGLOG below */
+        DEBUGLOG(8, "ZSTD_DUBT_findBestDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+                    current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+    }
+    return bestLength;
+
+}
+
+/* ZSTD_DUBT_findBestMatch() : sorts the unsorted candidates chained from ip's hash bucket into the tree, then searches the tree for the longest match at ip. @return : best match length */
+static size_t ZSTD_DUBT_findBestMatch (
+                            ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                            const BYTE* const ip, const BYTE* const iend,
+                            size_t* offsetPtr,   /* in/out : read by the gain heuristic before it is first written, so the caller must initialise it ; receives ZSTD_REP_MOVE + distance of the retained match */
+                            U32 const mls,
+                            const ZSTD_dictMode_e dictMode)
+{
+    U32*   const hashTable = ms->hashTable;
+    U32    const hashLog = cParams->hashLog;
+    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32          matchIndex  = hashTable[h];   /* head of the candidate list for ip's hash */
+
+    const BYTE* const base = ms->window.base;
+    U32    const current = (U32)(ip-base);
+    U32    const windowLow = ms->window.lowLimit;
+
+    U32*   const bt = ms->chainTable;   /* tree storage : every position owns 2 consecutive U32 slots */
+    U32    const btLog  = cParams->chainLog - 1;
+    U32    const btMask = (1 << btLog) - 1;
+    U32    const btLow = (btMask >= current) ? 0 : current - btMask;   /* indexes at or below this one are "beyond tree size" */
+    U32    const unsortLimit = MAX(btLow, windowLow);
+
+    U32*         nextCandidate = bt + 2*(matchIndex&btMask);       /* slot 0 : link to the next candidate */
+    U32*         unsortedMark = bt + 2*(matchIndex&btMask) + 1;    /* slot 1 : equals ZSTD_DUBT_UNSORTED_MARK while the candidate is unsorted */
+    U32          nbCompares = 1U << cParams->searchLog;            /* search budget */
+    U32          nbCandidates = nbCompares;
+    U32          previousCandidate = 0;
+
+    DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current);
+    assert(ip <= iend-8);   /* required for h calculation */
+
+    /* reach end of unsorted candidates list */
+    while ( (matchIndex > unsortLimit)
+         && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
+         && (nbCandidates > 1) ) {
+        DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
+                    matchIndex);
+        *unsortedMark = previousCandidate;   /* slot 1 is reused as a back-link : this builds a stack of visited candidates */
+        previousCandidate = matchIndex;
+        matchIndex = *nextCandidate;
+        nextCandidate = bt + 2*(matchIndex&btMask);
+        unsortedMark = bt + 2*(matchIndex&btMask) + 1;
+        nbCandidates --;
+    }
+
+    if ( (matchIndex > unsortLimit)
+      && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
+        DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
+                    matchIndex);
+        *nextCandidate = *unsortedMark = 0;   /* nullify next candidate if it's still unsorted (note : simplification, detrimental to compression ratio, beneficial for speed) */
+    }
+
+    /* batch sort stacked candidates : unwind the stack built above, last stacked candidate first */
+    matchIndex = previousCandidate;
+    while (matchIndex) {  /* will end on matchIndex == 0 */
+        U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
+        U32 const nextCandidateIdx = *nextCandidateIdxPtr;   /* back-link read first ; NOTE(review) : ZSTD_insertDUBT1() presumably rewrites this slot - confirm */
+        ZSTD_insertDUBT1(ms, cParams, matchIndex, iend,
+                         nbCandidates, unsortLimit, dictMode);
+        matchIndex = nextCandidateIdx;
+        nbCandidates++;
+    }
+
+    /* find longest match : walk the tree from the bucket head while inserting `current` as the new head */
+    {   size_t commonLengthSmaller=0, commonLengthLarger=0;
+        const BYTE* const dictBase = ms->window.dictBase;
+        const U32 dictLimit = ms->window.dictLimit;
+        const BYTE* const dictEnd = dictBase + dictLimit;
+        const BYTE* const prefixStart = base + dictLimit;
+        U32* smallerPtr = bt + 2*(current&btMask);
+        U32* largerPtr  = bt + 2*(current&btMask) + 1;
+        U32 matchEndIdx = current+8+1;   /* floor value : guarantees nextToUpdate ends up at least at current+1 */
+        U32 dummy32;   /* to be nullified at the end */
+        size_t bestLength = 0;
+
+        matchIndex  = hashTable[h];
+        hashTable[h] = current;   /* Update Hash Table */
+
+        for ( ; nbCompares && (matchIndex > windowLow) ; nbCompares--) {   /* decremented only after a completed iteration : the budget can no longer wrap below 0 */
+            U32* const nextPtr = bt + 2*(matchIndex & btMask);
+            size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+            const BYTE* match;
+
+            if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
+                match = base + matchIndex;
+                matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+            } else {
+                match = dictBase + matchIndex;
+                matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+                if (matchIndex+matchLength >= dictLimit)
+                    match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+            }
+
+            if (matchLength > bestLength) {
+                if (matchLength > matchEndIdx - matchIndex)
+                    matchEndIdx = matchIndex + (U32)matchLength;
+                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
+                    bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+                if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+                    break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+                }
+            }
+
+            if (match[matchLength] < ip[matchLength]) {
+                /* match is smaller than current */
+                *smallerPtr = matchIndex;             /* update smaller idx */
+                commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+                if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+                smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
+                matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+            } else {
+                /* match is larger than current */
+                *largerPtr = matchIndex;
+                commonLengthLarger = matchLength;
+                if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+                largerPtr = nextPtr;
+                matchIndex = nextPtr[0];
+        }   }
+
+        *smallerPtr = *largerPtr = 0;   /* close both open branches (either pointer may designate dummy32) */
+        /* nbCompares is now the unused part of the budget, 0 when it was fully consumed */
+        if (dictMode == ZSTD_dictMatchState && nbCompares) {
+            bestLength = ZSTD_DUBT_findBetterDictMatch(ms, cParams, ip, iend, offsetPtr, bestLength, nbCompares, mls, dictMode);
+        }
+
+        assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */
+        ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
+        if (bestLength >= MINMATCH) {
+            U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+            DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+                        current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+        }
+        return bestLength;
+    }
+}
+
+
+/** ZSTD_BtFindBestMatch() : Tree updater, providing best match : calls ZSTD_updateDUBT() then ZSTD_DUBT_findBestMatch().
+ *  @return : the search result, or 0 without touching the tree when ip is below ms->nextToUpdate */
+FORCE_INLINE_TEMPLATE size_t ZSTD_BtFindBestMatch (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        size_t* offsetPtr, const U32 mls /* template */, const ZSTD_dictMode_e dictMode)
+{
+    const BYTE* const firstPending = ms->window.base + ms->nextToUpdate;
+    DEBUGLOG(7, "ZSTD_BtFindBestMatch");
+    if (ip < firstPending) return 0;   /* skipped area */
+    ZSTD_updateDUBT(ms, cParams, ip, iLimit, mls);
+    return ZSTD_DUBT_findBestMatch(ms, cParams, ip, iLimit, offsetPtr, mls, dictMode);
+}
+
+
+static size_t ZSTD_BtFindBestMatch_selectMLS (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    /* Binary-tree search, ZSTD_noDict. Dispatches on cParams->searchLength so that every call
+     * passes `mls` as a literal : 5 -> 5 ; 6 and 7 -> 6 ; anything else (3 and 4 included) -> 4. */
+    unsigned const mls = cParams->searchLength;
+    if (mls == 5)
+        return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
+    if ((mls == 6) || (mls == 7))
+        return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
+    return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
+}
+
+
+static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    /* Binary-tree search, ZSTD_dictMatchState. Dispatches on cParams->searchLength so that every call
+     * passes `mls` as a literal : 5 -> 5 ; 6 and 7 -> 6 ; anything else (3 and 4 included) -> 4. */
+    unsigned const mls = cParams->searchLength;
+    if (mls == 5)
+        return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
+    if ((mls == 6) || (mls == 7))
+        return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
+    return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
+}
+
+
+static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    /* Binary-tree search, ZSTD_extDict. Dispatches on cParams->searchLength so that every call
+     * passes `mls` as a literal : 5 -> 5 ; 6 and 7 -> 6 ; anything else (3 and 4 included) -> 4. */
+    unsigned const mls = cParams->searchLength;
+    if (mls == 5)
+        return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
+    if ((mls == 6) || (mls == 7))
+        return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
+    return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
+}
+
+
+
+/* *********************************
+*  Hash Chain
+***********************************/
+#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)]   /* lvalue : chain slot of index d ; expects a `chainTable` array or pointer in the caller's scope */
+
+/* Update chains up to ip (excluded), starting from ms->nextToUpdate. @return : chain head stored for ip's hash.
+   Assumption : always within prefix (i.e. not within extDict) */
+static U32 ZSTD_insertAndFindFirstIndex_internal(
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, U32 const mls)
+{
+    const BYTE* const base = ms->window.base;
+    U32* const chainTable = ms->chainTable;   /* name is imposed by NEXT_IN_CHAIN() */
+    U32* const hashTable  = ms->hashTable;
+    const U32 hashLog = cParams->hashLog;
+    const U32 chainMask = (1 << cParams->chainLog) - 1;
+    const U32 target = (U32)(ip - base);
+    U32 pos;
+
+    /* catch up : each pending position is linked in front of the previous head of its bucket */
+    for (pos = ms->nextToUpdate; pos < target; pos++) {
+        size_t const bucket = ZSTD_hashPtr(base+pos, hashLog, mls);
+        NEXT_IN_CHAIN(pos, chainMask) = hashTable[bucket];
+        hashTable[bucket] = pos;
+    }
+
+    ms->nextToUpdate = target;
+    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
+}
+
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms,
+                        ZSTD_compressionParameters const* cParams, const BYTE* ip)
+{   /* externally visible wrapper : the hashed length is taken from cParams->searchLength */
+    U32 const mls = cParams->searchLength;
+    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+}
+
+
+/* Hash-chain match finder. inlining is important to hardwire a hot branch (template emulation) : callers in this file pass mls and dictMode as literals */
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_HcFindBestMatch_generic (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,   /* out : ZSTD_REP_MOVE + distance of the best match ; left untouched when nothing longer than 3 bytes is found */
+                        const U32 mls, const ZSTD_dictMode_e dictMode)
+{
+    U32* const chainTable = ms->chainTable;
+    const U32 chainSize = (1 << cParams->chainLog);
+    const U32 chainMask = chainSize-1;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const U32 lowLimit = ms->window.lowLimit;
+    const U32 current = (U32)(ip-base);
+    const U32 minChain = current > chainSize ? current - chainSize : 0;   /* the walk stops once an index at or below this one has been examined */
+    U32 nbAttempts = 1U << cParams->searchLog;   /* budget shared by the window search and the dictMatchState search */
+    size_t ml=4-1;   /* best length so far, also the return value : 3 means "nothing found" since only longer matches are recorded */
+
+    /* HC4 match finder */
+    U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+
+    for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
+        size_t currentMl=0;
+        if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+            const BYTE* const match = base + matchIndex;
+            if (match[ml] == ip[ml])   /* potentially better */
+                currentMl = ZSTD_count(ip, match, iLimit);
+        } else {
+            const BYTE* const match = dictBase + matchIndex;
+            assert(match+4 <= dictEnd);
+            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
+        }
+
+        /* save best solution */
+        if (currentMl > ml) {
+            ml = currentMl;
+            *offsetPtr = current - matchIndex + ZSTD_REP_MOVE;
+            if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+        }
+
+        if (matchIndex <= minChain) break;
+        matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
+    }
+    /* dictMatchState : continue in the dictionary's own hash chain with the attempts that are left */
+    if (dictMode == ZSTD_dictMatchState) {
+        const ZSTD_matchState_t* const dms = ms->dictMatchState;
+        const U32* const dmsChainTable = dms->chainTable;
+        const U32 dmsLowestIndex       = dms->window.dictLimit;
+        const BYTE* const dmsBase      = dms->window.base;
+        const BYTE* const dmsEnd       = dms->window.nextSrc;
+        const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
+        const U32 dmsIndexDelta        = dictLimit - dmsSize;   /* added to a dms index to express it in the current window's index space */
+        const U32 dmsMinChain = dmsSize > chainSize ? dmsSize - chainSize : 0;
+
+        matchIndex = dms->hashTable[ZSTD_hashPtr(ip, cParams->hashLog, mls)];
+
+        for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
+            size_t currentMl=0;
+            const BYTE* const match = dmsBase + matchIndex;
+            assert(match+4 <= dmsEnd);
+            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
+
+            /* save best solution */
+            if (currentMl > ml) {
+                ml = currentMl;
+                *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+            }
+
+            if (matchIndex <= dmsMinChain) break;
+            matchIndex = dmsChainTable[matchIndex & chainMask];   /* NOTE(review) : masks the dms chain table with the mask derived from cParams->chainLog - presumably both tables share that size, confirm */
+        }
+    }
+
+    return ml;
+}
+
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    /* Hash-chain search, ZSTD_noDict. Dispatches on cParams->searchLength so that every call
+     * passes `mls` as a literal : 5 -> 5 ; 6 and 7 -> 6 ; anything else (3 and 4 included) -> 4. */
+    unsigned const mls = cParams->searchLength;
+    if (mls == 5)
+        return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
+    if ((mls == 6) || (mls == 7))
+        return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
+    return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
+}
+
+
+static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    /* Hash-chain search, ZSTD_dictMatchState. Dispatches on cParams->searchLength so that every call
+     * passes `mls` as a literal : 5 -> 5 ; 6 and 7 -> 6 ; anything else (3 and 4 included) -> 4. */
+    unsigned const mls = cParams->searchLength;
+    if (mls == 5)
+        return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
+    if ((mls == 6) || (mls == 7))
+        return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
+    return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
+}
+
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    /* Hash-chain search, ZSTD_extDict. Dispatches on cParams->searchLength so that every call
+     * passes `mls` as a literal : 5 -> 5 ; 6 and 7 -> 6 ; anything else (3 and 4 included) -> 4. */
+    unsigned const mls = cParams->searchLength;
+    if (mls == 5)
+        return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
+    if ((mls == 6) || (mls == 7))
+        return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
+    return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
+}
+
+
+/* *******************************
+*  Common parser - lazy strategy : shared body of the greedy / lazy / lazy2 / btlazy2 block compressors (ZSTD_noDict and ZSTD_dictMatchState)
+*********************************/
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_lazy_generic(
+                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
+                        U32 rep[ZSTD_REP_NUM],   /* in/out : rep[0] and rep[1] are read at entry and rewritten before returning */
+                        ZSTD_compressionParameters const* cParams,
+                        const void* src, size_t srcSize,
+                        const U32 searchMethod, const U32 depth,   /* searchMethod : 0 = hash chain, non-0 = binary tree ; depth : number of extra lazy look-ahead levels (0, 1 or 2) */
+                        ZSTD_dictMode_e const dictMode)
+{
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;   /* start of the literals not yet stored */
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ms->window.base;
+    const U32 prefixLowestIndex = ms->window.dictLimit;
+    const BYTE* const prefixLowest = base + prefixLowestIndex;
+
+    typedef size_t (*searchMax_f)(
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
+    searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ?
+        (searchMethod ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) :
+        (searchMethod ? ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS);
+    U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
+
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const U32 dictLowestIndex      = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.dictLimit : 0;
+    const BYTE* const dictBase     = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.base : NULL;
+    const BYTE* const dictLowest   = dictMode == ZSTD_dictMatchState ?
+                                     dictBase + dictLowestIndex : NULL;
+    const BYTE* const dictEnd      = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.nextSrc : NULL;
+    const U32 dictIndexDelta       = dictMode == ZSTD_dictMatchState ?
+                                     prefixLowestIndex - (U32)(dictEnd - dictBase) :
+                                     0;
+    const U32 dictAndPrefixLength = (U32)(ip - prefixLowest) + ((dictMode == ZSTD_dictMatchState) ? (U32)(dictEnd - dictLowest) : 0);
+    /* note : the two lengths are computed separately, so no offset is ever added to dictEnd (NULL without dictionary) and NULL pointers are never subtracted */
+    (void)dictMode;
+
+    /* init */
+    ip += (dictAndPrefixLength == 0);   /* no history at all : nothing can be matched at the very first byte */
+    ms->nextToUpdate3 = ms->nextToUpdate;
+    if (dictMode == ZSTD_noDict) {
+        U32 const maxRep = (U32)(ip - prefixLowest);
+        if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;   /* repcode reaches before the prefix : disabled (0), value kept in savedOffset */
+        if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
+    }
+    if (dictMode == ZSTD_dictMatchState) {
+        /* dictMatchState repCode checks don't currently handle repCode == 0
+         * disabling. */
+        assert(offset_1 <= dictAndPrefixLength);
+        assert(offset_2 <= dictAndPrefixLength);
+    }
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        size_t matchLength=0;
+        size_t offset=0;   /* 0 = repcode match ; otherwise distance + ZSTD_REP_MOVE as written by searchMax */
+        const BYTE* start=ip+1;
+
+        /* check repCode */
+        if (dictMode == ZSTD_dictMatchState) {
+            const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
+            const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
+                                && repIndex < prefixLowestIndex) ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+            if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+                const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                if (depth==0) goto _storeSequence;
+            }
+        }
+        if ( dictMode == ZSTD_noDict
+          && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
+            matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+            if (depth==0) goto _storeSequence;
+        }
+
+        /* first search (depth 0) */
+        {   size_t offsetFound = 99999999;
+            size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound);
+            if (ml2 > matchLength)
+                matchLength = ml2, start = ip, offset=offsetFound;
+        }
+
+        if (matchLength < 4) {
+            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            continue;
+        }
+
+        /* let's try to find a better solution */
+        if (depth>=1)
+        while (ip<ilimit) {
+            ip ++;
+            if ( (dictMode == ZSTD_noDict)
+              && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+                int const gain2 = (int)(mlRep * 3);
+                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+                if ((mlRep >= 4) && (gain2 > gain1))
+                    matchLength = mlRep, offset = 0, start = ip;
+            }
+            if (dictMode == ZSTD_dictMatchState) {
+                const U32 repIndex = (U32)(ip - base) - offset_1;
+                const BYTE* repMatch = repIndex < prefixLowestIndex ?
+                               dictBase + (repIndex - dictIndexDelta) :
+                               base + repIndex;
+                if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+                    const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                    size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                    int const gain2 = (int)(mlRep * 3);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+                    if ((mlRep >= 4) && (gain2 > gain1))
+                        matchLength = mlRep, offset = 0, start = ip;
+                }
+            }
+            {   size_t offset2=99999999;
+                size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+                if ((ml2 >= 4) && (gain2 > gain1)) {
+                    matchLength = ml2, offset = offset2, start = ip;
+                    continue;   /* search a better one */
+            }   }
+
+            /* let's find an even better one */
+            if ((depth==2) && (ip<ilimit)) {
+                ip ++;
+                if ( (dictMode == ZSTD_noDict)
+                  && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                    size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+                    int const gain2 = (int)(mlRep * 4);
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+                    if ((mlRep >= 4) && (gain2 > gain1))
+                        matchLength = mlRep, offset = 0, start = ip;
+                }
+                if (dictMode == ZSTD_dictMatchState) {
+                    const U32 repIndex = (U32)(ip - base) - offset_1;
+                    const BYTE* repMatch = repIndex < prefixLowestIndex ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+                    if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                        && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+                        const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                        size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                        int const gain2 = (int)(mlRep * 4);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+                        if ((mlRep >= 4) && (gain2 > gain1))
+                            matchLength = mlRep, offset = 0, start = ip;
+                    }
+                }
+                {   size_t offset2=99999999;
+                    size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+                    if ((ml2 >= 4) && (gain2 > gain1)) {
+                        matchLength = ml2, offset = offset2, start = ip;
+                        continue;
+            }   }   }
+            break;  /* nothing found : store previous solution */
+        }
+
+        /* NOTE:
+         * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior.
+         * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which
+         * overflows the pointer, which is undefined behavior.
+         */
+        /* catch up : extend the selected (non-repcode) match backwards while the preceding bytes are equal */
+        if (offset) {
+            if (dictMode == ZSTD_noDict) {
+                while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest))
+                     && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) )  /* only search for offset within prefix */
+                    { start--; matchLength++; }
+            }
+            if (dictMode == ZSTD_dictMatchState) {
+                U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+                const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
+                const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
+                while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
+            }
+            offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+        }
+        /* store sequence */
+_storeSequence:
+        {   size_t const litLength = start - anchor;
+            ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH);
+            anchor = ip = start + matchLength;
+        }
+
+        /* check immediate repcode */
+        if (dictMode == ZSTD_dictMatchState) {
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex = current2 - offset_2;
+                const BYTE* repMatch = dictMode == ZSTD_dictMatchState
+                    && repIndex < prefixLowestIndex ?
+                        dictBase - dictIndexDelta + repIndex :
+                        base + repIndex;
+                if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
+                   && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
+                    matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
+                    offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
+                    ip += matchLength;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+            }
+        }
+
+        if (dictMode == ZSTD_noDict) {
+            while ( ((ip <= ilimit) & (offset_2>0))
+                 && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
+                /* store sequence */
+                matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
+                ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
+                ip += matchLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+    }   }   }
+
+    /* Save reps for next block : a repcode still disabled (0) is replaced by savedOffset */
+    rep[0] = offset_1 ? offset_1 : savedOffset;
+    rep[1] = offset_2 ? offset_2 : savedOffset;
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_btlazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{   /* lazy parser with searchMethod=1 (binary-tree search), depth=2, no dictionary ; returns the generic parser's result */
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{   /* lazy parser with searchMethod=0 (hash-chain search), depth=2, no dictionary ; returns the generic parser's result */
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{   /* lazy parser with searchMethod=0 (hash-chain search), depth=1, no dictionary ; returns the generic parser's result */
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_greedy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{   /* lazy parser with searchMethod=0 (hash-chain search), depth=0 (first match is stored, no look-ahead), no dictionary */
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{   /* lazy parser with searchMethod=1 (binary-tree search), depth=2, matching also against ms->dictMatchState */
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy2_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{   /* lazy parser with searchMethod=0 (hash-chain search), depth=2, matching also against ms->dictMatchState */
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{   /* lazy parser with searchMethod=0 (hash-chain search), depth=1, matching also against ms->dictMatchState */
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_greedy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{   /* lazy parser with searchMethod=0 (hash-chain search), depth=0 (no look-ahead), matching also against ms->dictMatchState */
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0, ZSTD_dictMatchState);
+}
+
+
+FORCE_INLINE_TEMPLATE   /* shared body of the four *_extDict block compressors; each wrapper passes searchMethod and depth as literal constants */
+size_t ZSTD_compressBlock_lazy_extDict_generic(
+                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
+                        U32 rep[ZSTD_REP_NUM],
+                        ZSTD_compressionParameters const* cParams,
+                        const void* src, size_t srcSize,
+                        const U32 searchMethod, const U32 depth)   /* searchMethod : 0 selects the Hc search function, non-zero the Bt one ; depth : 0, 1 or 2 look-ahead steps. @return : iend - anchor, the size of the last literals */
+{
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;   /* first byte not yet covered by a stored sequence */
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;   /* the match loop only runs while ip is below this point */
+    const BYTE* const base = ms->window.base;
+    const U32 dictLimit = ms->window.dictLimit;   /* indexes below dictLimit are read through dictBase, the others through base */
+    const U32 lowestIndex = ms->window.lowLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const BYTE* const dictEnd  = dictBase + dictLimit;
+    const BYTE* const dictStart  = dictBase + lowestIndex;
+
+    typedef size_t (*searchMax_f)(
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
+    searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
+
+    U32 offset_1 = rep[0], offset_2 = rep[1];   /* working copies of the repeat offsets, written back to rep[] at the end */
+
+    /* init */
+    ms->nextToUpdate3 = ms->nextToUpdate;
+    ip += (ip == prefixStart);   /* skip the first byte when the block starts exactly at prefixStart */
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        size_t matchLength=0;
+        size_t offset=0;   /* 0 means "repcode match" ; a searched match carries an offset code from which ZSTD_REP_MOVE is subtracted later */
+        const BYTE* start=ip+1;
+        U32 current = (U32)(ip-base);
+
+        /* check repCode */
+        {   const U32 repIndex = (U32)(current+1 - offset_1);
+            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+            const BYTE* const repMatch = repBase + repIndex;
+            if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))   /* intentional overflow : false only for repIndex in [dictLimit-3, dictLimit-1], where the 4-byte read below would run past dictEnd */
+            if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
+                /* repcode detected we should take it */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                if (depth==0) goto _storeSequence;   /* depth 0 : take the repcode immediately, offset is still 0 */
+        }   }
+
+        /* first search (depth 0) */
+        {   size_t offsetFound = 99999999;
+            size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound);
+            if (ml2 > matchLength)
+                matchLength = ml2, start = ip, offset=offsetFound;
+        }
+
+         if (matchLength < 4) {
+            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            continue;
+        }
+
+        /* let's try to find a better solution */
+        if (depth>=1)
+        while (ip<ilimit) {
+            ip ++;
+            current++;
+            /* check repCode */
+            if (offset) {   /* only when the current best match is not already a repcode */
+                const U32 repIndex = (U32)(current - offset_1);
+                const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                const BYTE* const repMatch = repBase + repIndex;
+                if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))  /* intentional overflow */
+                if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                    /* repcode detected */
+                    const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                    size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                    int const gain2 = (int)(repLength * 3);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+                    if ((repLength >= 4) && (gain2 > gain1))
+                        matchLength = repLength, offset = 0, start = ip;
+            }   }
+
+            /* search match, depth 1 */
+            {   size_t offset2=99999999;
+                size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+                if ((ml2 >= 4) && (gain2 > gain1)) {
+                    matchLength = ml2, offset = offset2, start = ip;
+                    continue;   /* search a better one */
+            }   }
+
+            /* let's find an even better one */
+            if ((depth==2) && (ip<ilimit)) {
+                ip ++;
+                current++;
+                /* check repCode */
+                if (offset) {
+                    const U32 repIndex = (U32)(current - offset_1);
+                    const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                    const BYTE* const repMatch = repBase + repIndex;
+                    if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))  /* intentional overflow */
+                    if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                        /* repcode detected */
+                        const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                        size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                        int const gain2 = (int)(repLength * 4);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+                        if ((repLength >= 4) && (gain2 > gain1))
+                            matchLength = repLength, offset = 0, start = ip;
+                }   }
+
+                /* search match, depth 2 */
+                {   size_t offset2=99999999;
+                    size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+                    if ((ml2 >= 4) && (gain2 > gain1)) {
+                        matchLength = ml2, offset = offset2, start = ip;
+                        continue;
+            }   }   }
+            break;  /* nothing found : store previous solution */
+        }
+
+        /* catch up */
+        if (offset) {   /* searched match only : extend it backwards, then push its offset into the repeat history */
+            U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+            const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
+            const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
+            while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
+            offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+        }
+
+        /* store sequence */
+_storeSequence:
+        {   size_t const litLength = start - anchor;
+            ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH);
+            anchor = ip = start + matchLength;   /* resume right after the stored match */
+        }
+
+        /* check immediate repcode */
+        while (ip <= ilimit) {
+            const U32 repIndex = (U32)((ip-base) - offset_2);
+            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+            const BYTE* const repMatch = repBase + repIndex;
+            if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))  /* intentional overflow */
+            if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                /* repcode detected we should take it */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap offset history */
+                ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);   /* stored with 0 literals and offset code 0 */
+                ip += matchLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+            }
+            break;
+    }   }
+
+    /* Save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_greedy_extDict(   /* extDict block compressor : searchMethod=0 (Hc search), depth=0 */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{   /* depth 0 : the generic body skips its look-ahead loop and stores the first match of at least 4 bytes */
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0);
+}
+
+size_t ZSTD_compressBlock_lazy_extDict(   /* extDict block compressor : searchMethod=0 (Hc search), depth=1 */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+
+{   /* depth 1 : the generic body runs its "depth 1" look-ahead search before storing a match */
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1);
+}
+
+size_t ZSTD_compressBlock_lazy2_extDict(   /* extDict block compressor : searchMethod=0 (Hc search), depth=2 */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+
+{   /* depth 2 : the generic body runs both its "depth 1" and "depth 2" look-ahead searches */
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2);
+}
+
+size_t ZSTD_compressBlock_btlazy2_extDict(   /* extDict block compressor : searchMethod=1 (Bt search), depth=2 */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+
+{   /* the only wrapper selecting ZSTD_BtFindBestMatch_extDict_selectMLS ; same depth as lazy2 */
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2);
+}
diff --git a/deps/SZ/zstd/compress/zstd_lazy.h b/deps/SZ/zstd/compress/zstd_lazy.h
new file mode 100644
index 0000000000000000000000000000000000000000..c299de6dcabe191def69731ffa24f66fb0f9c35a
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstd_lazy.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LAZY_H
+#define ZSTD_LAZY_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "zstd_compress_internal.h"
+
+U32 ZSTD_insertAndFindFirstIndex(   /* defined in zstd_lazy.c ; NOTE(review): body not visible in this chunk, semantics to be confirmed there */
+        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+        const BYTE* ip);
+
+void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue);  /*! used in ZSTD_reduceIndex() : pre-emptively increases the value of ZSTD_DUBT_UNSORTED_MARK */
+
+size_t ZSTD_compressBlock_btlazy2(   /* --- block compressors without a dictionary-mode suffix --- */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(   /* --- *_dictMatchState variants : same signature ; the lazy and greedy definitions pass ZSTD_dictMatchState to ZSTD_compressBlock_lazy_generic() --- */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_greedy_extDict(   /* --- *_extDict variants : all four are defined as calls to ZSTD_compressBlock_lazy_extDict_generic() --- */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btlazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_LAZY_H */
diff --git a/deps/SZ/zstd/compress/zstd_ldm.c b/deps/SZ/zstd/compress/zstd_ldm.c
new file mode 100644
index 0000000000000000000000000000000000000000..215f55cf451d130aed3e58147bdc96880ba81caf
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstd_ldm.c
@@ -0,0 +1,648 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ */
+
+#include "zstd_ldm.h"
+
+#include "debug.h"
+#include "zstd_fast.h"          /* ZSTD_fillHashTable() */
+#include "zstd_double_fast.h"   /* ZSTD_fillDoubleHashTable() */
+
+#define LDM_BUCKET_SIZE_LOG 3      /* default for params->bucketSizeLog when the caller left it at 0 */
+#define LDM_MIN_MATCH_LENGTH 64    /* default for params->minMatchLength when the caller left it at 0 */
+#define LDM_HASH_RLOG 7            /* default hashLog is MAX(ZSTD_HASHLOG_MIN, windowLog - LDM_HASH_RLOG) */
+#define LDM_HASH_CHAR_OFFSET 10    /* added to every byte value before it enters the rolling hash */
+
+void ZSTD_ldm_adjustParameters(ldmParams_t* params,
+                               ZSTD_compressionParameters const* cParams)
+{   /* Replaces every LDM parameter still at 0 by a default derived from cParams. */
+    ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX);
+    DEBUGLOG(4, "ZSTD_ldm_adjustParameters");
+    params->windowLog = cParams->windowLog;   /* always copied from the compression parameters */
+    if (params->bucketSizeLog == 0) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG;
+    if (params->minMatchLength == 0) params->minMatchLength = LDM_MIN_MATCH_LENGTH;
+    if (cParams->strategy >= ZSTD_btopt) {
+        /* Get out of the way of the optimal parser */
+        U32 const raisedMinMatch = MAX(cParams->targetLength, params->minMatchLength);
+        assert(raisedMinMatch >= ZSTD_LDM_MINMATCH_MIN);
+        assert(raisedMinMatch <= ZSTD_LDM_MINMATCH_MAX);
+        params->minMatchLength = raisedMinMatch;
+    }
+    if (params->hashLog == 0) {
+        params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG);
+        assert(params->hashLog <= ZSTD_HASHLOG_MAX);
+    }
+    if (params->hashEveryLog == 0) {
+        /* stays at 0 when windowLog is smaller than hashLog */
+        if (params->windowLog >= params->hashLog)
+            params->hashEveryLog = params->windowLog - params->hashLog;
+    }
+    params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog);   /* a bucket never exceeds the table */
+}
+
+size_t ZSTD_ldm_getTableSize(ldmParams_t params)
+{
+    /* size = nbBuckets (added unscaled, i.e. as a byte count) + one ldmEntry_t per hash slot */
+    size_t const bucketLog = MIN(params.bucketSizeLog, params.hashLog);
+    size_t const nbBuckets = ((size_t)1) << (params.hashLog - bucketLog);
+    size_t const nbEntries = ((size_t)1) << params.hashLog;
+    if (!params.enableLdm) return 0;   /* LDM disabled : no table at all */
+    return nbBuckets + nbEntries * sizeof(ldmEntry_t);
+}
+
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize)
+{   if (!params.enableLdm) return 0;   /* LDM disabled : no sequence is ever generated */
+    return maxChunkSize / params.minMatchLength;   /* how many minMatchLength-sized matches fit in a chunk */
+}
+
+/** ZSTD_ldm_getSmallHash() :
+ *  Extracts the top numBits bits of value; numBits must be <= 32.
+ *  numBits==0 is handled separately (a 64-bit shift by 64 is not valid C).
+ *  @return : the most significant numBits of value, or 0 when numBits==0. */
+static U32 ZSTD_ldm_getSmallHash(U64 value, U32 numBits)
+{   assert(numBits <= 32);
+    if (numBits == 0) return 0;
+    return (U32)(value >> (64 - numBits));
+}
+
+/** ZSTD_ldm_getChecksum() :
+ *  Skips the numBitsToDiscard (<= 32) most significant bits of hash.
+ *  @return : the 32 bits that immediately follow the discarded ones */
+static U32 ZSTD_ldm_getChecksum(U64 hash, U32 numBitsToDiscard)
+{   U32 const lowBitsDropped = 64 - 32 - numBitsToDiscard;   /* bits remaining below the checksum */
+    assert(numBitsToDiscard <= 32);
+    return (U32)((hash >> lowBitsDropped) & 0xFFFFFFFF);
+}
+
+/** ZSTD_ldm_getTag() :
+ *  Given the hash, returns the numTagBits bits that follow its
+ *  (32 + hbits) most significant bits.
+ *
+ *  If fewer than numTagBits bits remain after those, returns the
+ *  lowest numTagBits bits of hash instead. */
+static U32 ZSTD_ldm_getTag(U64 hash, U32 hbits, U32 numTagBits)
+{
+    assert(numTagBits < 32 && hbits <= 32);
+    {   U32 const tagMask = ((U32)1 << numTagBits) - 1;
+        /* shift==0 is the fallback case : take the tag from the very bottom of hash */
+        U32 const shift = (32 - hbits < numTagBits) ? 0 : 32 - hbits - numTagBits;
+        return (U32)(hash >> shift) & tagMask;
+    }
+}
+
+/** ZSTD_ldm_getBucket() :
+ *  Returns the address of the first entry of the bucket selected by hash. */
+static ldmEntry_t* ZSTD_ldm_getBucket(
+        ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams)
+{
+    return &ldmState->hashTable[hash << ldmParams.bucketSizeLog];   /* bucket number `hash` starts at entry (hash << bucketSizeLog) */
+}
+
+/** ZSTD_ldm_insertEntry() :
+ *  Writes entry into the bucket selected by hash, at that bucket's rotating write position. */
+static void ZSTD_ldm_insertEntry(ldmState_t* ldmState,
+                                 size_t const hash, const ldmEntry_t entry,
+                                 ldmParams_t const ldmParams)
+{
+    BYTE* const slotCursor = ldmState->bucketOffsets + hash;   /* per-bucket write position */
+    ldmEntry_t* const bucket = ZSTD_ldm_getBucket(ldmState, hash, ldmParams);
+    bucket[*slotCursor] = entry;
+    *slotCursor = (BYTE)((*slotCursor + 1) & (((U32)1 << ldmParams.bucketSizeLog) - 1));   /* advance, wrapping at bucket size */
+}
+
+/** ZSTD_ldm_makeEntryAndInsertByTag() :
+ *
+ *  Derives the tag, the small hash and the checksum from rollingHash.
+ *
+ *  When the tag equals (1 << ldmParams.hashEveryLog)-1, builds an ldmEntry
+ *  from offset and the checksum, and inserts it into the hash table.
+ *
+ *  hBits is the length of the small hash, which is the most significant hBits
+ *  of rollingHash. The checksum is the next 32 most significant bits, followed
+ *  by ldmParams.hashEveryLog bits that make up the tag. */
+static void ZSTD_ldm_makeEntryAndInsertByTag(ldmState_t* ldmState,
+                                             U64 const rollingHash,
+                                             U32 const hBits,
+                                             U32 const offset,
+                                             ldmParams_t const ldmParams)
+{
+    U32 const wantedTag = ((U32)1 << ldmParams.hashEveryLog) - 1;
+    ldmEntry_t newEntry;
+
+    /* Only positions whose tag has all hashEveryLog bits set are recorded. */
+    if (ZSTD_ldm_getTag(rollingHash, hBits, ldmParams.hashEveryLog) != wantedTag)
+        return;
+
+    newEntry.offset = offset;
+    newEntry.checksum = ZSTD_ldm_getChecksum(rollingHash, hBits);
+    ZSTD_ldm_insertEntry(ldmState, ZSTD_ldm_getSmallHash(rollingHash, hBits), newEntry, ldmParams);
+}
+
+/** ZSTD_ldm_getRollingHash() :
+ *  Computes a 64-bit hash over the first len bytes of buf.
+ *
+ *  For bytes s = s_1, s_2, ... s_k, the hash is defined as
+ *  H(s) = s_1*(a^(k-1)) + s_2*(a^(k-2)) + ... + s_k*(a^0)
+ *
+ *  where the constant a is prime8bytes.
+ *
+ *  The implementation adds an offset to each byte, so
+ *  H(s) = (s_1 + HASH_CHAR_OFFSET)*(a^(k-1)) + ... */
+static U64 ZSTD_ldm_getRollingHash(const BYTE* buf, U32 len)
+{
+    const BYTE* const bufEnd = buf + len;
+    U64 hash = 0;
+    /* Horner-style evaluation : multiply the running value, then add the next byte */
+    for ( ; buf < bufEnd; buf++) {
+        hash = (hash * prime8bytes) + (*buf + LDM_HASH_CHAR_OFFSET);
+    }
+    return hash;
+}
+
+/** ZSTD_ldm_ipow() :
+ *  Returns base^exp, computed with U64 multiplications. */
+static U64 ZSTD_ldm_ipow(U64 base, U64 exp)
+{
+    U64 result = 1;
+    /* square-and-multiply : base is squared once per bit of exp,
+     * and folded into result for every bit that is set */
+    for ( ; exp != 0; exp >>= 1, base *= base) {
+        if (exp & 1) result *= base;
+    }
+    return result;
+}
+
+U64 ZSTD_ldm_getHashPower(U32 minMatchLength) {   /* returns prime8bytes^(minMatchLength-1) */
+    DEBUGLOG(4, "ZSTD_ldm_getHashPower: mml=%u", minMatchLength);
+    assert(minMatchLength >= ZSTD_LDM_MINMATCH_MIN);
+    return ZSTD_ldm_ipow(prime8bytes, minMatchLength - 1);   /* the factor applied to the oldest byte of a minMatchLength-byte rolling hash */
+}
+
+/** ZSTD_ldm_updateHash() :
+ *  Rolls hash forward by one byte : toRemove leaves the window, toAdd enters it. */
+static U64 ZSTD_ldm_updateHash(U64 hash, BYTE toRemove, BYTE toAdd, U64 hashPower)
+{
+    U64 const oldestTerm = (toRemove + LDM_HASH_CHAR_OFFSET) * hashPower;
+    U64 const shifted = (hash - oldestTerm) * prime8bytes;
+    /* the incoming byte gets the same offset as in ZSTD_ldm_getRollingHash() */
+    return shifted + (toAdd + LDM_HASH_CHAR_OFFSET);
+}
+
+/** ZSTD_ldm_countBackwardsMatch() :
+ *  Returns how many bytes immediately before pIn and pMatch are identical.
+ *
+ *  Counting stops once pIn reaches pAnchor or pMatch reaches pBase. */
+static size_t ZSTD_ldm_countBackwardsMatch(
+            const BYTE* pIn, const BYTE* pAnchor,
+            const BYTE* pMatch, const BYTE* pBase)
+{
+    size_t nbMatched = 0;
+    /* both cursors step backwards together;
+     * the bounds are tested before the bytes are read */
+    for ( ; (pIn > pAnchor) && (pMatch > pBase); pIn--, pMatch--, nbMatched++) {
+        if (pIn[-1] != pMatch[-1]) break;
+    }
+    return nbMatched;
+}
+
+/** ZSTD_ldm_fillFastTables() :
+ *
+ *  Fills the tables used by the ZSTD_fast and ZSTD_dfast strategies, up to end.
+ *  This is similar to ZSTD_loadDictionaryContent.
+ *
+ *  Nothing is done for the other strategies : their tables are filled
+ *  within their block compressors. Always returns 0. */
+static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
+                                      ZSTD_compressionParameters const* cParams,
+                                      void const* end)
+{
+    switch(cParams->strategy)
+    {
+    case ZSTD_fast:
+        ZSTD_fillHashTable(ms, cParams, (const BYTE*)end, ZSTD_dtlm_fast);
+        return 0;
+
+    case ZSTD_dfast:
+        ZSTD_fillDoubleHashTable(ms, cParams, (const BYTE*)end, ZSTD_dtlm_fast);
+        return 0;
+
+    case ZSTD_greedy:
+    case ZSTD_lazy:
+    case ZSTD_lazy2:
+    case ZSTD_btlazy2:
+    case ZSTD_btopt:
+    case ZSTD_btultra:
+        /* valid strategies that need no table filling here */
+        return 0;
+
+    default:
+        assert(0);  /* not possible : not a valid strategy id */
+        /* with assertions disabled, fall back to the same result */
+        return 0;
+    }
+}
+
+/** ZSTD_ldm_fillLdmHashTable() :
+ *
+ *  Inserts the positions from (lastHashed + 1) up to iend (excluded) into hashTable.
+ *  lastHash is the rolling hash that corresponds to lastHashed.
+ *
+ *  Returns the rolling hash corresponding to position iend-1. */
+static U64 ZSTD_ldm_fillLdmHashTable(ldmState_t* state,
+                                     U64 lastHash, const BYTE* lastHashed,
+                                     const BYTE* iend, const BYTE* base,
+                                     U32 hBits, ldmParams_t const ldmParams)
+{
+    U64 hash = lastHash;
+    const BYTE* pos;
+    for (pos = lastHashed + 1; pos < iend; ++pos) {
+        /* slide the hashed window forward by one byte :
+         * pos[-1] leaves it, pos[minMatchLength-1] enters it */
+        hash = ZSTD_ldm_updateHash(hash, pos[-1],
+                                   pos[ldmParams.minMatchLength-1],
+                                   state->hashPower);
+        ZSTD_ldm_makeEntryAndInsertByTag(state,
+                                         hash, hBits,
+                                         (U32)(pos - base), ldmParams);
+    }
+    return hash;
+}
+
+
+/** ZSTD_ldm_limitTableUpdate() :
+ *
+ *  Moves ms->nextToUpdate closer to anchor when it is more than
+ *  1024 positions behind it
+ *  (after a long match, only update tables a limited amount). */
+static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor)
+{
+    U32 const anchorIdx = (U32)(anchor - ms->window.base);
+    if (anchorIdx <= ms->nextToUpdate + 1024) return;   /* not far behind : leave nextToUpdate alone */
+    {   U32 const excess = anchorIdx - ms->nextToUpdate - 1024;
+        ms->nextToUpdate = anchorIdx - MIN(512, excess);   /* restart at most 512 positions before anchor */
+    }
+}
+
+static size_t ZSTD_ldm_generateSequences_internal(   /* scans src for long matches and appends them to rawSeqStore ; returns iend - anchor (bytes after the last sequence) or ERROR(dstSize_tooSmall) */
+        ldmState_t* ldmState, rawSeqStore_t* rawSeqStore,
+        ldmParams_t const* params, void const* src, size_t srcSize)
+{
+    /* LDM parameters */
+    int const extDict = ZSTD_window_hasExtDict(ldmState->window);
+    U32 const minMatchLength = params->minMatchLength;
+    U64 const hashPower = ldmState->hashPower;
+    U32 const hBits = params->hashLog - params->bucketSizeLog;   /* width of the small hash that selects a bucket */
+    U32 const ldmBucketSize = 1U << params->bucketSizeLog;   /* number of entries scanned per bucket */
+    U32 const hashEveryLog = params->hashEveryLog;
+    U32 const ldmTagMask = (1U << params->hashEveryLog) - 1;   /* tag value a position must have to be looked up / inserted */
+    /* Prefix and extDict parameters */
+    U32 const dictLimit = ldmState->window.dictLimit;
+    U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit;   /* candidates at or below this index are ignored */
+    BYTE const* const base = ldmState->window.base;
+    BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL;
+    BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL;
+    BYTE const* const dictEnd = extDict ? dictBase + dictLimit : NULL;
+    BYTE const* const lowPrefixPtr = base + dictLimit;
+    /* Input bounds */
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    BYTE const* const ilimit = iend - MAX(minMatchLength, HASH_READ_SIZE);   /* last position at which a hash is computed */
+    /* Input positions */
+    BYTE const* anchor = istart;   /* first byte not yet covered by a stored sequence */
+    BYTE const* ip = istart;
+    /* Rolling hash */
+    BYTE const* lastHashed = NULL;   /* position that rollingHash currently corresponds to */
+    U64 rollingHash = 0;
+
+    while (ip <= ilimit) {
+        size_t mLength;
+        U32 const current = (U32)(ip - base);
+        size_t forwardMatchLength = 0, backwardMatchLength = 0;
+        ldmEntry_t* bestEntry = NULL;
+        if (ip != istart) {   /* roll the hash forward by one byte from lastHashed (which is ip-1 on every path that loops again) */
+            rollingHash = ZSTD_ldm_updateHash(rollingHash, lastHashed[0],
+                                              lastHashed[minMatchLength],
+                                              hashPower);
+        } else {   /* first position of the block : compute the hash from scratch */
+            rollingHash = ZSTD_ldm_getRollingHash(ip, minMatchLength);
+        }
+        lastHashed = ip;
+
+        /* Do not insert and do not look for a match */
+        if (ZSTD_ldm_getTag(rollingHash, hBits, hashEveryLog) != ldmTagMask) {
+           ip++;
+           continue;
+        }
+
+        /* Get the best entry and compute the match lengths */
+        {
+            ldmEntry_t* const bucket =
+                ZSTD_ldm_getBucket(ldmState,
+                                   ZSTD_ldm_getSmallHash(rollingHash, hBits),
+                                   *params);
+            ldmEntry_t* cur;
+            size_t bestMatchLength = 0;
+            U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits);
+
+            for (cur = bucket; cur < bucket + ldmBucketSize; ++cur) {
+                size_t curForwardMatchLength, curBackwardMatchLength,
+                       curTotalMatchLength;
+                if (cur->checksum != checksum || cur->offset <= lowestIndex) {   /* cheap rejection before touching the candidate's bytes */
+                    continue;
+                }
+                if (extDict) {
+                    BYTE const* const curMatchBase =
+                        cur->offset < dictLimit ? dictBase : base;
+                    BYTE const* const pMatch = curMatchBase + cur->offset;
+                    BYTE const* const matchEnd =
+                        cur->offset < dictLimit ? dictEnd : iend;
+                    BYTE const* const lowMatchPtr =
+                        cur->offset < dictLimit ? dictStart : lowPrefixPtr;
+
+                    curForwardMatchLength = ZSTD_count_2segments(
+                                                ip, pMatch, iend,
+                                                matchEnd, lowPrefixPtr);
+                    if (curForwardMatchLength < minMatchLength) {
+                        continue;
+                    }
+                    curBackwardMatchLength =
+                        ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch,
+                                                     lowMatchPtr);
+                    curTotalMatchLength = curForwardMatchLength +
+                                          curBackwardMatchLength;
+                } else { /* !extDict */
+                    BYTE const* const pMatch = base + cur->offset;
+                    curForwardMatchLength = ZSTD_count(ip, pMatch, iend);
+                    if (curForwardMatchLength < minMatchLength) {
+                        continue;
+                    }
+                    curBackwardMatchLength =
+                        ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch,
+                                                     lowPrefixPtr);
+                    curTotalMatchLength = curForwardMatchLength +
+                                          curBackwardMatchLength;
+                }
+
+                if (curTotalMatchLength > bestMatchLength) {   /* strict comparison : on a tie the earlier bucket entry is kept */
+                    bestMatchLength = curTotalMatchLength;
+                    forwardMatchLength = curForwardMatchLength;
+                    backwardMatchLength = curBackwardMatchLength;
+                    bestEntry = cur;
+                }
+            }
+        }
+
+        /* No match found -- continue searching */
+        if (bestEntry == NULL) {
+            ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash,
+                                             hBits, current,
+                                             *params);
+            ip++;
+            continue;
+        }
+
+        /* Match found */
+        mLength = forwardMatchLength + backwardMatchLength;
+        ip -= backwardMatchLength;   /* the sequence starts where the backward extension begins */
+
+        {
+            /* Store the sequence:
+             * ip = current - backwardMatchLength
+             * The match is at (bestEntry->offset - backwardMatchLength)
+             */
+            U32 const matchIndex = bestEntry->offset;
+            U32 const offset = current - matchIndex;
+            rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size;
+
+            /* Out of sequence storage */
+            if (rawSeqStore->size == rawSeqStore->capacity)
+                return ERROR(dstSize_tooSmall);
+            seq->litLength = (U32)(ip - anchor);
+            seq->matchLength = (U32)mLength;
+            seq->offset = offset;
+            rawSeqStore->size++;
+        }
+
+        /* Insert the current entry into the hash table */
+        ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, hBits,
+                                         (U32)(lastHashed - base),
+                                         *params);
+
+        assert(ip + backwardMatchLength == lastHashed);
+
+        /* Fill the hash table from lastHashed+1 to ip+mLength*/
+        /* Heuristic: don't need to fill the entire table at end of block */
+        if (ip + mLength <= ilimit) {   /* otherwise the loop ends right after this iteration, so the hash state is no longer needed */
+            rollingHash = ZSTD_ldm_fillLdmHashTable(
+                              ldmState, rollingHash, lastHashed,
+                              ip + mLength, base, hBits, *params);
+            lastHashed = ip + mLength - 1;
+        }
+        ip += mLength;
+        anchor = ip;
+    }
+    return iend - anchor;
+}
+
+/*! ZSTD_ldm_reduceTable() :
+ *  lowers every offset stored in table by `reducerValue`, clamping at 0 */
+static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size,
+                                 U32 const reducerValue)
+{
+    ldmEntry_t* const tableEnd = table + size;
+    ldmEntry_t* entry;
+    for (entry = table; entry < tableEnd; entry++) {
+        entry->offset = (entry->offset < reducerValue) ? 0 : entry->offset - reducerValue;
+    }
+}
+
+size_t ZSTD_ldm_generateSequences(   /* splits src into chunks of at most 1 MB and appends the long matches of each chunk to `sequences` */
+        ldmState_t* ldmState, rawSeqStore_t* sequences,
+        ldmParams_t const* params, void const* src, size_t srcSize)   /* @return : 0, or the error code reported by the per-chunk generator */
+{
+    U32 const maxDist = 1U << params->windowLog;
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    size_t const kMaxChunkSize = 1 << 20;
+    size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0);   /* rounded up */
+    size_t chunk;
+    size_t leftoverSize = 0;   /* literals seen since the last stored sequence, carried across chunks */
+
+    assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize);
+    /* Check that ZSTD_window_update() has been called for this chunk prior
+     * to passing it to this function.
+     */
+    assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize);
+    /* The input could be very large (in zstdmt), so it must be broken up into
+     * chunks to enforce the maximum distance and handle overflow correction.
+     */
+    assert(sequences->pos <= sequences->size);
+    assert(sequences->size <= sequences->capacity);
+    for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) {   /* also stops early once the sequence store is full */
+        BYTE const* const chunkStart = istart + chunk * kMaxChunkSize;
+        size_t const remaining = (size_t)(iend - chunkStart);
+        BYTE const *const chunkEnd =
+            (remaining < kMaxChunkSize) ? iend : chunkStart + kMaxChunkSize;
+        size_t const chunkSize = chunkEnd - chunkStart;
+        size_t newLeftoverSize;
+        size_t const prevSize = sequences->size;
+
+        assert(chunkStart < iend);
+        /* 1. Perform overflow correction if necessary. */
+        if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) {
+            U32 const ldmHSize = 1U << params->hashLog;
+            U32 const correction = ZSTD_window_correctOverflow(   /* measured from chunkStart, the position being processed, not from the start of the whole input */
+                &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart);
+            ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction);   /* keep stored offsets consistent with the shifted window */
+        }
+        /* 2. We enforce the maximum offset allowed.
+         *
+         * kMaxChunkSize should be small enough that we don't lose too much of
+         * the window through early invalidation.
+         * TODO: * Test the chunk size.
+         *       * Try invalidation after the sequence generation and test the
+         *         the offset against maxDist directly.
+         */
+        ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, NULL, NULL);
+        /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */
+        newLeftoverSize = ZSTD_ldm_generateSequences_internal(
+            ldmState, sequences, params, chunkStart, chunkSize);
+        if (ZSTD_isError(newLeftoverSize))
+            return newLeftoverSize;
+        /* 4. We add the leftover literals from previous iterations to the first
+         *    newly generated sequence, or add the `newLeftoverSize` if none are
+         *    generated.
+         */
+        /* Prepend the leftover literals from the last call */
+        if (prevSize < sequences->size) {   /* this chunk produced at least one sequence */
+            sequences->seq[prevSize].litLength += (U32)leftoverSize;
+            leftoverSize = newLeftoverSize;
+        } else {   /* no sequence : the whole chunk becomes pending literals */
+            assert(newLeftoverSize == chunkSize);
+            leftoverSize += chunkSize;
+        }
+    }
+    return 0;
+}
+
+void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) {
+    while (srcSize != 0 && rawSeqStore->pos < rawSeqStore->size) {
+        rawSeq* const cur = &rawSeqStore->seq[rawSeqStore->pos];
+        if (cur->litLength >= srcSize) {
+            /* Everything left to skip lies within the literals of `cur` */
+            cur->litLength -= (U32)srcSize;
+            return;
+        }
+        srcSize -= cur->litLength;
+        cur->litLength = 0;
+        if (cur->matchLength <= srcSize) {
+            /* The whole match is skipped : consume the sequence and carry on */
+            srcSize -= cur->matchLength;
+            cur->matchLength = 0;
+            rawSeqStore->pos++;
+            continue;
+        }
+        /* Only the beginning of the match is skipped : shorten the match */
+        cur->matchLength -= (U32)srcSize;
+        if (cur->matchLength >= minMatch) return;
+        /* What remains is too short to be a match : hand it over as literals of the next sequence, if any */
+        if (rawSeqStore->pos + 1 < rawSeqStore->size)
+            cur[1].litLength += cur[0].matchLength;
+        rawSeqStore->pos++;
+        return;
+    }
+}
+
+/**
+ * If the sequence length is longer than remaining then the sequence is split
+ * between this block and the next.
+ *
+ * Returns the current sequence to handle, or if the rest of the block should
+ * be literals, it returns a sequence with offset == 0.
+ */
+static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore,
+                                 U32 const remaining, U32 const minMatch)
+{
+    rawSeq seq = rawSeqStore->seq[rawSeqStore->pos];
+    U32 const seqLength = seq.litLength + seq.matchLength;
+    assert(seq.offset > 0);
+    if (seqLength <= remaining) {
+        /* Common case : the sequence fits entirely, consume it as is */
+        rawSeqStore->pos++;
+        return seq;
+    }
+    /* The sequence overruns the block : truncate the returned copy.
+     * offset == 0 tells the caller that the rest of the block is literals. */
+    if (seq.litLength >= remaining) {
+        seq.offset = 0;
+    } else {
+        seq.matchLength = remaining - seq.litLength;
+        if (seq.matchLength < minMatch) seq.offset = 0;
+    }
+    /* Advance the stored sequences by `remaining` bytes for later blocks. */
+    ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch);
+    return seq;
+}
+
+size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+    ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    unsigned const minMatch = cParams->searchLength;   /* forwarded to maybeSplitSequence() as its minimum match length */
+    ZSTD_blockCompressor const blockCompressor =
+        ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms));
+    /* Input bounds */
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    /* Input position : first byte not yet covered by a stored sequence */
+    BYTE const* ip = istart;
+
+    DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize);
+    assert(rawSeqStore->pos <= rawSeqStore->size);
+    assert(rawSeqStore->size <= rawSeqStore->capacity);
+    /* Loop through each sequence and apply the block compressor to the lits */
+    while (rawSeqStore->pos < rawSeqStore->size && ip < iend) {
+        /* maybeSplitSequence updates rawSeqStore->pos */
+        rawSeq const sequence = maybeSplitSequence(rawSeqStore,
+                                                   (U32)(iend - ip), minMatch);
+        int i;
+        /* End signal : offset == 0 means the rest of the block is literals */
+        if (sequence.offset == 0)
+            break;
+
+        assert(sequence.offset <= (1U << cParams->windowLog));
+        assert(ip + sequence.litLength + sequence.matchLength <= iend);
+
+        /* Fill tables for block compressor */
+        ZSTD_ldm_limitTableUpdate(ms, ip);
+        ZSTD_ldm_fillFastTables(ms, cParams, ip);
+        /* Run the block compressor on the literals preceding the match */
+        DEBUGLOG(5, "calling block compressor on segment of size %u", sequence.litLength);
+        {
+            size_t const newLitLength =
+                blockCompressor(ms, seqStore, rep, cParams, ip,
+                                sequence.litLength);
+            ip += sequence.litLength;   /* ip now points at the start of the match */
+            /* Update the repcodes : shift the history down, the new offset becomes the most recent */
+            for (i = ZSTD_REP_NUM - 1; i > 0; i--)
+                rep[i] = rep[i-1];
+            rep[0] = sequence.offset;
+            /* Store the sequence ; its literals are the newLitLength bytes right before ip */
+            ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength,
+                          sequence.offset + ZSTD_REP_MOVE,
+                          sequence.matchLength - MINMATCH);
+            ip += sequence.matchLength;
+        }
+    }
+    /* Fill the tables for the block compressor */
+    ZSTD_ldm_limitTableUpdate(ms, ip);
+    ZSTD_ldm_fillFastTables(ms, cParams, ip);
+    /* Compress the last literals, and return what the block compressor returns for them */
+    return blockCompressor(ms, seqStore, rep, cParams,
+                           ip, iend - ip);
+}
diff --git a/deps/SZ/zstd/compress/zstd_ldm.h b/deps/SZ/zstd/compress/zstd_ldm.h
new file mode 100644
index 0000000000000000000000000000000000000000..96588adb0680e012d33e1491836fa8572f1f8403
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstd_ldm.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ */
+
+#ifndef ZSTD_LDM_H
+#define ZSTD_LDM_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "zstd_compress_internal.h"   /* ldmParams_t, U32 */
+#include "zstd.h"   /* ZSTD_CCtx, size_t */
+
+/*-*************************************
+*  Long distance matching
+***************************************/
+
+#define ZSTD_LDM_DEFAULT_WINDOW_LOG ZSTD_WINDOWLOG_DEFAULTMAX
+
+/**
+ * ZSTD_ldm_generateSequences():
+ *
+ * Generates the sequences using the long distance match finder.
+ * Generates long range matching sequences in `sequences`, which parse a prefix
+ * of the source. `sequences` must be large enough to store every sequence,
+ * which can be checked with `ZSTD_ldm_getMaxNbSeq()`.
+ * @returns 0 or an error code.
+ *
+ * NOTE: The user must have called ZSTD_window_update() for all of the input
+ * they have, even if they pass it to ZSTD_ldm_generateSequences() in chunks.
+ * NOTE: This function returns an error if it runs out of space to store
+ *       sequences.
+ */
+size_t ZSTD_ldm_generateSequences(
+            ldmState_t* ldms, rawSeqStore_t* sequences,
+            ldmParams_t const* params, void const* src, size_t srcSize);
+
+/**
+ * ZSTD_ldm_blockCompress():
+ *
+ * Compresses a block using the predefined sequences, along with a secondary
+ * block compressor. The literals section of every sequence is passed to the
+ * secondary block compressor, and those sequences are interspersed with the
+ * predefined sequences. Returns the length of the last literals.
+ * Updates `rawSeqStore.pos` to indicate how many sequences have been consumed.
+ * `rawSeqStore.seq` may also be updated to split the last sequence between two
+ * blocks.
+ * @return The length of the last literals.
+ *
+ * NOTE: The source must be at most the maximum block size, but the predefined
+ * sequences can be any size, and may be longer than the block. In the case that
+ * they are longer than the block, the last sequences may need to be split into
+ * two. We handle that case correctly, and update `rawSeqStore` appropriately.
+ * NOTE: This function does not return any errors.
+ */
+size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+            ZSTD_compressionParameters const* cParams,
+            void const* src, size_t srcSize);
+
+/**
+ * ZSTD_ldm_skipSequences():
+ *
+ * Skip past `srcSize` bytes worth of sequences in `rawSeqStore`.
+ * Avoids emitting matches less than `minMatch` bytes.
+ * Must be called for data which is not passed to ZSTD_ldm_blockCompress().
+ */
+void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize,
+    U32 const minMatch);
+
+
+/** ZSTD_ldm_getTableSize() :
+ *  Estimate the space needed for long distance matching tables or 0 if LDM is
+ *  disabled.
+ */
+size_t ZSTD_ldm_getTableSize(ldmParams_t params);
+
+/** ZSTD_ldm_getMaxNbSeq() :
+ *  Return an upper bound on the number of sequences that can be produced by
+ *  the long distance matcher, or 0 if LDM is disabled.
+ */
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize);
+
+/** ZSTD_ldm_getHashPower() :
+ *  Return prime8bytes^(minMatchLength-1) */
+U64 ZSTD_ldm_getHashPower(U32 minMatchLength);
+
+/** ZSTD_ldm_adjustParameters() :
+ *  If the params->hashEveryLog is not set, set it to its default value based on
+ *  windowLog and params->hashLog.
+ *
+ *  Ensures that params->bucketSizeLog is <= params->hashLog (setting it to
+ *  params->hashLog if it is not).
+ *
+ *  Ensures that the minMatchLength >= targetLength during optimal parsing.
+ */
+void ZSTD_ldm_adjustParameters(ldmParams_t* params,
+                               ZSTD_compressionParameters const* cParams);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_LDM_H */
diff --git a/deps/SZ/zstd/compress/zstd_opt.c b/deps/SZ/zstd/compress/zstd_opt.c
new file mode 100644
index 0000000000000000000000000000000000000000..476cdc148920d355a7869a3929be0fc3297297a6
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstd_opt.c
@@ -0,0 +1,1126 @@
+/*
+ * Copyright (c) 2016-present, Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_compress_internal.h"
+#include "hist.h"
+#include "zstd_opt.h"
+
+
+#define ZSTD_LITFREQ_ADD    2   /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
+#define ZSTD_FREQ_DIV       4   /* log factor when using previous stats to init next stats */
+#define ZSTD_MAX_PRICE     (1<<30)
+
+
+/*-*************************************
+*  Price functions for optimal parser
+***************************************/
+
+#if 0    /* approximation at bit level (disabled variant) */
+#  define BITCOST_ACCURACY 0
+#  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+#  define WEIGHT(stat,opt)  ((void)opt, ZSTD_bitWeight(stat))   /* takes `opt` like the other variants : every call site passes 2 arguments */
+#elif 0  /* fractional bit accuracy (disabled variant) */
+#  define BITCOST_ACCURACY 8
+#  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+#  define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
+#else    /* opt==approx, ultra==accurate : opt==0 selects bit weights, any other value fractional weights */
+#  define BITCOST_ACCURACY 8   /* prices carry 8 fractional bits */
+#  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)   /* price of one full bit */
+#  define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
+#endif
+
+MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
+{   /* whole-bit weight : ZSTD_highbit32() of (stat+1), expressed in BITCOST_MULTIPLIER units */
+    return BITCOST_MULTIPLIER * ZSTD_highbit32(stat+1);
+}
+
+MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
+{
+    U32 const value = rawStat + 1;   /* offset by 1, same convention as ZSTD_bitWeight() */
+    U32 const topBit = ZSTD_highbit32(value);
+    U32 const wholeBits = topBit * BITCOST_MULTIPLIER;            /* integer part, in price units */
+    U32 const fraction = (value << BITCOST_ACCURACY) >> topBit;   /* value scaled down to its top bit, with BITCOST_ACCURACY fractional bits */
+    assert(topBit + BITCOST_ACCURACY < 31);   /* guards the left shift above against overflow */
+    /* weight = integer part + fractional part */
+    return wholeBits + fraction;
+}
+
+/* debugging function, @return price in bytes */
+MEM_STATIC double ZSTD_fCost(U32 price)
+{   /* a price unit is 1/BITCOST_MULTIPLIER of a bit ; 8 bits make a byte */
+    return (double)price / (8*BITCOST_MULTIPLIER);
+}
+
+static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel)
+{   /* cache the weight of each statistics total ; the price functions subtract per-symbol weights from it */
+    optPtr->offCodeSumBasePrice = WEIGHT(optPtr->offCodeSum, optLevel);
+    optPtr->matchLengthSumBasePrice = WEIGHT(optPtr->matchLengthSum, optLevel);
+    optPtr->litLengthSumBasePrice = WEIGHT(optPtr->litLengthSum, optLevel);
+    optPtr->litSumBasePrice = WEIGHT(optPtr->litSum, optLevel);
+}
+
+
+static U32 ZSTD_downscaleStat(U32* table, U32 lastEltIndex, int malus)
+{
+    int const shift = ZSTD_FREQ_DIV + malus;   /* every counter is divided by 2^shift */
+    U32 total = 0, idx = 0;
+    assert(shift > 0 && shift < 31);
+    for ( ; idx <= lastEltIndex; idx++) {
+        U32 const scaled = 1 + (table[idx] >> shift);   /* +1 : no counter drops to 0 */
+        total += (table[idx] = scaled);
+    }
+    return total;   /* sum of the rescaled counters */
+}
+
+static void ZSTD_rescaleFreqs(optState_t* const optPtr,
+                              const BYTE* const src, size_t const srcSize,
+                              int optLevel)
+{
+    optPtr->priceType = zop_dynamic;   /* default ; may be switched to zop_predef below for a small first block */
+
+    if (optPtr->litLengthSum == 0) {  /* first block : init */
+        if (srcSize <= 1024)   /* heuristic : small input, use predefined prices */
+            optPtr->priceType = zop_predef;
+
+        assert(optPtr->symbolCosts != NULL);
+        if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { /* huffman table presumed generated by dictionary */
+            optPtr->priceType = zop_dynamic;   /* statistics are derived from the tables below, whatever srcSize */
+
+            assert(optPtr->litFreq != NULL);
+            optPtr->litSum = 0;
+            {   unsigned lit;
+                for (lit=0; lit<=MaxLit; lit++) {
+                    U32 const scaleLog = 11;   /* scale to 2K */
+                    U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit);
+                    assert(bitCost <= scaleLog);
+                    optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->litSum += optPtr->litFreq[lit];
+            }   }
+
+            {   unsigned ll;
+                FSE_CState_t llstate;
+                FSE_initCState(&llstate, optPtr->symbolCosts->fse.litlengthCTable);
+                optPtr->litLengthSum = 0;
+                for (ll=0; ll<=MaxLL; ll++) {
+                    U32 const scaleLog = 10;   /* scale to 1K */
+                    U32 const bitCost = FSE_getMaxNbBits(llstate.symbolTT, ll);
+                    assert(bitCost < scaleLog);
+                    optPtr->litLengthFreq[ll] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->litLengthSum += optPtr->litLengthFreq[ll];
+            }   }
+
+            {   unsigned ml;
+                FSE_CState_t mlstate;
+                FSE_initCState(&mlstate, optPtr->symbolCosts->fse.matchlengthCTable);
+                optPtr->matchLengthSum = 0;
+                for (ml=0; ml<=MaxML; ml++) {
+                    U32 const scaleLog = 10;   /* scale to 1K */
+                    U32 const bitCost = FSE_getMaxNbBits(mlstate.symbolTT, ml);
+                    assert(bitCost < scaleLog);
+                    optPtr->matchLengthFreq[ml] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->matchLengthSum += optPtr->matchLengthFreq[ml];
+            }   }
+
+            {   unsigned of;
+                FSE_CState_t ofstate;
+                FSE_initCState(&ofstate, optPtr->symbolCosts->fse.offcodeCTable);
+                optPtr->offCodeSum = 0;
+                for (of=0; of<=MaxOff; of++) {
+                    U32 const scaleLog = 10;   /* scale to 1K */
+                    U32 const bitCost = FSE_getMaxNbBits(ofstate.symbolTT, of);
+                    assert(bitCost < scaleLog);
+                    optPtr->offCodeFreq[of] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->offCodeSum += optPtr->offCodeFreq[of];
+            }   }
+
+        } else {  /* not a dictionary */
+
+            assert(optPtr->litFreq != NULL);
+            {   unsigned lit = MaxLit;
+                HIST_count_simple(optPtr->litFreq, &lit, src, srcSize);   /* use raw first block to init statistics */
+            }
+            optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1);   /* scale the raw counts down, each stays >= 1 */
+
+            {   unsigned ll;
+                for (ll=0; ll<=MaxLL; ll++)
+                    optPtr->litLengthFreq[ll] = 1;   /* flat start : all codes equally likely */
+            }
+            optPtr->litLengthSum = MaxLL+1;
+
+            {   unsigned ml;
+                for (ml=0; ml<=MaxML; ml++)
+                    optPtr->matchLengthFreq[ml] = 1;   /* flat start */
+            }
+            optPtr->matchLengthSum = MaxML+1;
+
+            {   unsigned of;
+                for (of=0; of<=MaxOff; of++)
+                    optPtr->offCodeFreq[of] = 1;   /* flat start */
+            }
+            optPtr->offCodeSum = MaxOff+1;
+
+        }
+
+    } else {   /* new block : re-use previous statistics, scaled down */
+
+        optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1);   /* malus 1 : literals are scaled down one extra bit */
+        optPtr->litLengthSum = ZSTD_downscaleStat(optPtr->litLengthFreq, MaxLL, 0);
+        optPtr->matchLengthSum = ZSTD_downscaleStat(optPtr->matchLengthFreq, MaxML, 0);
+        optPtr->offCodeSum = ZSTD_downscaleStat(optPtr->offCodeFreq, MaxOff, 0);
+    }
+
+    ZSTD_setBasePrices(optPtr, optLevel);   /* refresh the cached base prices from the new sums */
+}
+
+/* ZSTD_rawLiteralsCost() :
+ * price of literals (only) in specified segment (which length can be 0).
+ * does not include price of literalLength symbol */
+static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
+                                const optState_t* const optPtr,
+                                int optLevel)
+{
+    U32 total, pos;
+    if (litLength == 0) return 0;
+    if (optPtr->priceType == zop_predef)   /* flat rate : 6 bits per literal, statistics are not consulted */
+        return (litLength*6) * BITCOST_MULTIPLIER;
+
+    /* dynamic statistics : start from the base price of every literal, then deduct the weight of each symbol */
+    total = litLength * optPtr->litSumBasePrice;
+    for (pos = 0; pos < litLength; pos++) {
+        U32 const symbolWeight = WEIGHT(optPtr->litFreq[literals[pos]], optLevel);
+        assert(symbolWeight <= optPtr->litSumBasePrice);   /* the cost of a literal is never negative */
+        total -= symbolWeight;
+    }
+    return total;
+}
+
+/* ZSTD_litLengthPrice() :
+ * cost of literalLength symbol */
+static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel)
+{
+    if (optPtr->priceType != zop_predef) {   /* dynamic statistics : extra bits of the code + price of the code itself */
+        U32 const code = ZSTD_LLcode(litLength);
+        U32 const extraBitsPrice = LL_bits[code] * BITCOST_MULTIPLIER;
+        return extraBitsPrice + (optPtr->litLengthSumBasePrice - WEIGHT(optPtr->litLengthFreq[code], optLevel));
+    }
+    return WEIGHT(litLength, optLevel);   /* predefined scheme : the price depends on the length only */
+}
+
+/* ZSTD_litLengthContribution() :
+ * @return ( cost(litlength) - cost(0) )
+ * this value can then be added to rawLiteralsCost()
+ * to provide a cost which is directly comparable to a match ending at same position */
+static int ZSTD_litLengthContribution(U32 const litLength, const optState_t* const optPtr, int optLevel)
+{
+    if (optPtr->priceType >= zop_predef) return WEIGHT(litLength, optLevel);   /* NOTE(review): sibling price functions test `== zop_predef` ; presumably equivalent, confirm against the priceType enum */
+
+    /* dynamic statistics : same terms as ZSTD_litLengthPrice(), with the weight of code 0 in place of the base price */
+    {   U32 const llCode = ZSTD_LLcode(litLength);
+        int const contribution = (LL_bits[llCode] * BITCOST_MULTIPLIER)
+                               + WEIGHT(optPtr->litLengthFreq[0], optLevel)   /* note: log2litLengthSum cancel out */
+                               - WEIGHT(optPtr->litLengthFreq[llCode], optLevel);
+#if 1   /* active variant : the contribution may be negative */
+        return contribution;
+#else   /* disabled variant : clamp the contribution at 0 */
+        return MAX(0, contribution); /* sometimes better, sometimes not ... */
+#endif
+    }
+}
+
+/* ZSTD_literalsContribution() :
+ * creates a fake cost for the literals part of a sequence
+ * which can be compared to the ending cost of a match
+ * should a new match start at this position */
+static int ZSTD_literalsContribution(const BYTE* const literals, U32 const litLength,
+                                     const optState_t* const optPtr,
+                                     int optLevel)
+{
+    U32 const rawCost = ZSTD_rawLiteralsCost(literals, litLength, optPtr, optLevel);   /* price of the literal bytes */
+    int const lengthPart = ZSTD_litLengthContribution(litLength, optPtr, optLevel);    /* may be negative */
+    return (int)(rawCost + lengthPart);   /* summed in unsigned arithmetic, then converted to int */
+}
+
+/* ZSTD_getMatchPrice() :
+ * Provides the cost of the match part (offset + matchLength) of a sequence
+ * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence.
+ * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */
+FORCE_INLINE_TEMPLATE U32
+ZSTD_getMatchPrice(U32 const offset,
+                   U32 const matchLength,
+                   const optState_t* const optPtr,
+                   int const optLevel)
+{
+    U32 const lengthBase = matchLength - MINMATCH;
+    U32 const offsetCode = ZSTD_highbit32(offset+1);
+    U32 total;
+    assert(matchLength >= MINMATCH);
+    if (optPtr->priceType == zop_predef) {
+        /* predefined scheme : statistics are not consulted */
+        return WEIGHT(lengthBase, optLevel) + ((16 + offsetCode) * BITCOST_MULTIPLIER);
+    }
+    /* dynamic statistics -- offset part : raw offset bits + statistical price of the offset code */
+    total = (offsetCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offsetCode], optLevel));
+    if ((optLevel<2) /*static*/ && offsetCode >= 20) {
+        /* handicap for long distance offsets, to favor decompression speed */
+        total += (offsetCode-19)*2 * BITCOST_MULTIPLIER;
+    }
+    /* dynamic statistics -- match length part : extra bits + statistical price of the length code */
+    {   U32 const lengthCode = ZSTD_MLcode(lengthBase);
+        total += (ML_bits[lengthCode] * BITCOST_MULTIPLIER) + (optPtr->matchLengthSumBasePrice - WEIGHT(optPtr->matchLengthFreq[lengthCode], optLevel));
+    }
+    /* heuristic : a small surcharge per match favors fewer sequences, hence faster decompression */
+    total += BITCOST_MULTIPLIER / 5;
+    DEBUGLOG(8, "ZSTD_getMatchPrice(ml:%u) = %u", matchLength, total);
+    return total;
+}
+
+/* ZSTD_updateStats() :
+ * assumption : literals + litLength <= iend */
+static void ZSTD_updateStats(optState_t* const optPtr,
+                             U32 litLength, const BYTE* literals,
+                             U32 offsetCode, U32 matchLength)
+{
+    /* literals : each byte of the run adds ZSTD_LITFREQ_ADD to its own counter */
+    {   U32 pos = 0;
+        while (pos < litLength)
+            optPtr->litFreq[literals[pos++]] += ZSTD_LITFREQ_ADD;
+        optPtr->litSum += litLength*ZSTD_LITFREQ_ADD;
+    }
+
+    /* literal length code */
+    {   U32 const code = ZSTD_LLcode(litLength);
+        optPtr->litLengthSum++;
+        optPtr->litLengthFreq[code]++;
+    }
+
+    /* offset code (offsetCode 0-2 => repCode; 3+ => offset+2) */
+    {   U32 const code = ZSTD_highbit32(offsetCode+1);
+        assert(code <= MaxOff);
+        optPtr->offCodeSum++;
+        optPtr->offCodeFreq[code]++;
+    }
+
+    /* match length code */
+    {   U32 const base = matchLength - MINMATCH;
+        U32 const code = ZSTD_MLcode(base);
+        optPtr->matchLengthSum++;
+        optPtr->matchLengthFreq[code]++;
+    }
+}
+
+
+/* ZSTD_readMINMATCH() :
+ * function safe only for comparisons
+ * assumption : memPtr must be at least 4 bytes before end of buffer */
+MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length)
+{
+    U32 const raw = MEM_read32(memPtr);   /* 4 bytes are read whatever `length` is */
+    if (length != 3)
+        return raw;   /* length 4, as well as any other value */
+    /* length == 3 : shift out the byte stored last in memory, so that only
+     * the first 3 bytes take part in comparisons. Which end of the word that
+     * byte occupies depends on endianness. */
+    if (MEM_isLittleEndian())
+        return raw << 8;
+    return raw >> 8;
+}
+
+
+/* Update hashTable3 up to ip (excluded)
+   Assumption : always within prefix (i.e. not within extDict) */
+static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, const BYTE* const ip)
+{
+    const BYTE* const base = ms->window.base;
+    U32* const table = ms->hashTable3;
+    U32 const tableLog = ms->hashLog3;
+    U32 const ipIndex = (U32)(ip - base);
+    size_t const ipHash = ZSTD_hash3Ptr(ip, tableLog);
+    U32 pos;
+    assert(tableLog > 0);
+
+    /* register every position in [nextToUpdate3, ip) ; a later position replaces an earlier one in the same cell */
+    for (pos = ms->nextToUpdate3; pos < ipIndex; pos++)
+        table[ZSTD_hash3Ptr(base+pos, tableLog)] = pos;
+    ms->nextToUpdate3 = ipIndex;
+
+    return table[ipHash];   /* content of the cell matching ip ; ip itself is not inserted */
+}
+
+
+/*-*************************************
+*  Binary Tree search
+***************************************/
+/** ZSTD_insertBt1() : add one or multiple positions to tree.
+ *  ip : assumed <= iend-8 .
+ * @return : nb of positions added */
+static U32 ZSTD_insertBt1(
+                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                const BYTE* const ip, const BYTE* const iend,
+                U32 const mls, const int extDict)
+{
+    U32*   const hashTable = ms->hashTable;
+    U32    const hashLog = cParams->hashLog;
+    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32*   const bt = ms->chainTable;   /* binary tree storage : 2 cells (smaller, larger) per position */
+    U32    const btLog  = cParams->chainLog - 1;
+    U32    const btMask = (1 << btLog) - 1;   /* NOTE(review): ZSTD_insertBtAndGetAllMatches() writes this with 1U */
+    U32 matchIndex = hashTable[h];
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* match;
+    const U32 current = (U32)(ip-base);
+    const U32 btLow = btMask >= current ? 0 : current - btMask;   /* positions at or below btLow are beyond tree size */
+    U32* smallerPtr = bt + 2*(current&btMask);
+    U32* largerPtr  = smallerPtr + 1;
+    U32 dummy32;   /* to be nullified at the end */
+    U32 const windowLow = ms->window.lowLimit;
+    U32 const matchLow = windowLow ? windowLow : 1;   /* at least 1 : a matchIndex of 0 stops the search */
+    U32 matchEndIdx = current+8+1;   /* farthest position referenced by any match found so far */
+    size_t bestLength = 8;
+    U32 nbCompares = 1U << cParams->searchLog;   /* upper bound on the number of tree nodes visited */
+#ifdef ZSTD_C_PREDICT
+    U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0);
+    U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1);
+    predictedSmall += (predictedSmall>0);
+    predictedLarge += (predictedLarge>0);
+#endif /* ZSTD_C_PREDICT */
+
+    DEBUGLOG(8, "ZSTD_insertBt1 (%u)", current);
+
+    assert(ip <= iend-8);   /* required for h calculation */
+    hashTable[h] = current;   /* Update Hash Table */
+
+    while (nbCompares-- && (matchIndex >= matchLow)) {
+        U32* const nextPtr = bt + 2*(matchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        assert(matchIndex < current);
+
+#ifdef ZSTD_C_PREDICT   /* note : can create issues when hlog small <= 11 */
+        const U32* predictPtr = bt + 2*((matchIndex-1) & btMask);   /* written this way, as bt is a roll buffer */
+        if (matchIndex == predictedSmall) {
+            /* no need to check length, result known */
+            *smallerPtr = matchIndex;
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
+            matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+            predictedSmall = predictPtr[1] + (predictPtr[1]>0);
+            continue;
+        }
+        if (matchIndex == predictedLarge) {
+            *largerPtr = matchIndex;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+            predictedLarge = predictPtr[0] + (predictPtr[0]>0);
+            continue;
+        }
+#endif
+
+        if (!extDict || (matchIndex+matchLength >= dictLimit)) {
+            assert(matchIndex+matchLength >= dictLimit);   /* might be wrong if actually extDict */
+            match = base + matchIndex;
+            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+        } else {
+            match = dictBase + matchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+        }
+
+        if (matchLength > bestLength) {
+            bestLength = matchLength;
+            if (matchLength > matchEndIdx - matchIndex)
+                matchEndIdx = matchIndex + (U32)matchLength;   /* this match reaches farther than any previous one */
+        }
+
+        if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+            break;   /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
+        }
+
+        if (match[matchLength] < ip[matchLength]) {  /* necessarily within buffer */
+            /* match is smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            smallerPtr = nextPtr+1;               /* new "candidate" => larger than match, which was smaller than target */
+            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous and closer to current */
+        } else {
+            /* match is larger than current */
+            *largerPtr = matchIndex;              /* update larger idx */
+            commonLengthLarger = matchLength;     /* mirror of the smaller branch */
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;   /* close both pending links (they may point to dummy32) */
+    if (bestLength > 384) return MIN(192, (U32)(bestLength - 384));   /* speed optimization : after a very long match, report up to 192 positions as added */
+    assert(matchEndIdx > current + 8);
+    return matchEndIdx - (current + 8);
+}
+
+FORCE_INLINE_TEMPLATE
+void ZSTD_updateTree_internal(
+                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                const BYTE* const ip, const BYTE* const iend,
+                const U32 mls, const ZSTD_dictMode_e dictMode)
+{
+    const BYTE* const base = ms->window.base;
+    U32 const ipIndex = (U32)(ip - base);
+    int const useExtDict = (dictMode == ZSTD_extDict);
+    U32 pos = ms->nextToUpdate;
+    DEBUGLOG(5, "ZSTD_updateTree_internal, from %u to %u  (dictMode:%u)",
+                pos, ipIndex, dictMode);
+    while (pos < ipIndex)   /* ZSTD_insertBt1() returns how many positions to advance */
+        pos += ZSTD_insertBt1(ms, cParams, base+pos, iend, mls, useExtDict);
+    ms->nextToUpdate = ipIndex;
+}
+
+void ZSTD_updateTree(
+                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                const BYTE* ip, const BYTE* iend)
+{   U32 const mls = cParams->searchLength;   /* match length selector taken from the compression parameters */
+    ZSTD_updateTree_internal(ms, cParams, ip, iend, mls, ZSTD_noDict);
+}
+
+FORCE_INLINE_TEMPLATE   /* inserts `ip` into the binary tree and gathers match candidates of strictly increasing length */
+U32 ZSTD_insertBtAndGetAllMatches (
+                    ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                    const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode,   /* iLimit : matches never extend past it */
+                    U32 rep[ZSTD_REP_NUM], U32 const ll0,   /* ll0 : shifts the repcodes tested to [ll0, ll0+ZSTD_REP_NUM) */
+                    ZSTD_match_t* matches, const U32 lengthToBeat, U32 const mls /* template */)
+{   /* @return : nb of candidates written into matches[] ; only lengths >= lengthToBeat are reported */
+    U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
+    const BYTE* const base = ms->window.base;
+    U32 const current = (U32)(ip-base);
+    U32 const hashLog = cParams->hashLog;
+    U32 const minMatch = (mls==3) ? 3 : 4;
+    U32* const hashTable = ms->hashTable;
+    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32 matchIndex  = hashTable[h];
+    U32* const bt   = ms->chainTable;
+    U32 const btLog = cParams->chainLog - 1;
+    U32 const btMask= (1U << btLog) - 1;
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const dictBase = ms->window.dictBase;
+    U32 const dictLimit = ms->window.dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    U32 const btLow = btMask >= current ? 0 : current - btMask;
+    U32 const windowLow = ms->window.lowLimit;
+    U32 const matchLow = windowLow ? windowLow : 1;   /* never 0 : tree links are terminated with index 0 */
+    U32* smallerPtr = bt + 2*(current&btMask);
+    U32* largerPtr  = bt + 2*(current&btMask) + 1;
+    U32 matchEndIdx = current+8+1;   /* farthest referenced position of any match => detects repetitive patterns */
+    U32 dummy32;   /* to be nullified at the end */
+    U32 mnum = 0;
+    U32 nbCompares = 1U << cParams->searchLog;   /* search budget, shared by the tree walk and the dictMatchState walk */
+    /* dictMatchState parameters : all NULL / 0 unless dictMode == ZSTD_dictMatchState */
+    const ZSTD_matchState_t* dms    = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL;
+    const BYTE* const dmsBase       = dictMode == ZSTD_dictMatchState ? dms->window.base : NULL;
+    const BYTE* const dmsEnd        = dictMode == ZSTD_dictMatchState ? dms->window.nextSrc : NULL;
+    U32         const dmsHighLimit  = dictMode == ZSTD_dictMatchState ? (U32)(dmsEnd - dmsBase) : 0;
+    U32         const dmsLowLimit   = dictMode == ZSTD_dictMatchState ? dms->window.lowLimit : 0;
+    U32         const dmsIndexDelta = dictMode == ZSTD_dictMatchState ? windowLow - dmsHighLimit : 0;
+    U32         const dmsBtLow      = dictMode == ZSTD_dictMatchState && btMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - btMask : dmsLowLimit;
+
+    size_t bestLength = lengthToBeat-1;   /* only strictly longer matches get recorded */
+    DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", current);
+
+    /* check repCode */
+    {   U32 const lastR = ZSTD_REP_NUM + ll0;
+        U32 repCode;
+        for (repCode = ll0; repCode < lastR; repCode++) {
+            U32 const repOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+            U32 const repIndex = current - repOffset;
+            U32 repLen = 0;
+            assert(current >= dictLimit);
+            if (repOffset-1 /* intentional overflow, discards 0 and -1 */ < current-dictLimit) {  /* equivalent to `current > repIndex >= dictLimit` */
+                if (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repOffset, minMatch)) {
+                    repLen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repOffset, iLimit) + minMatch;
+                }
+            } else {  /* repIndex < dictLimit || repIndex >= current */
+                const BYTE* const repMatch = dictMode == ZSTD_dictMatchState ?
+                                             dmsBase + repIndex - dmsIndexDelta :
+                                             dictBase + repIndex;
+                assert(current >= windowLow);
+                if ( dictMode == ZSTD_extDict
+                  && ( ((repOffset-1) /*intentional overflow*/ < current - windowLow)  /* equivalent to `current > repIndex >= windowLow` */
+                     & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */)
+                  && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) {
+                    repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch;
+                }
+                if (dictMode == ZSTD_dictMatchState
+                  && ( ((repOffset-1) /*intentional overflow*/ < current - (dmsLowLimit + dmsIndexDelta))  /* equivalent to `current > repIndex >= dmsLowLimit` */
+                     & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */
+                  && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) {
+                    repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch;
+            }   }
+            /* save longer solution */
+            if (repLen > bestLength) {
+                DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
+                            repCode, ll0, repOffset, repLen);
+                bestLength = repLen;
+                matches[mnum].off = repCode - ll0;
+                matches[mnum].len = (U32)repLen;
+                mnum++;
+                if ( (repLen > sufficient_len)
+                   | (ip+repLen == iLimit) ) {  /* best possible */
+                    return mnum;   /* note : exits without inserting `ip` nor touching ms->nextToUpdate */
+    }   }   }   }
+
+    /* HC3 match finder */
+    if ((mls == 3) /*static*/ && (bestLength < mls)) {
+        U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, ip);
+        if ((matchIndex3 >= matchLow)
+          & (current - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) {
+            size_t mlen;
+            if ((dictMode == ZSTD_noDict) /*static*/ || (dictMode == ZSTD_dictMatchState) /*static*/ || (matchIndex3 >= dictLimit)) {
+                const BYTE* const match = base + matchIndex3;
+                mlen = ZSTD_count(ip, match, iLimit);
+            } else {
+                const BYTE* const match = dictBase + matchIndex3;
+                mlen = ZSTD_count_2segments(ip, match, iLimit, dictEnd, prefixStart);
+            }
+
+            /* save best solution */
+            if (mlen >= mls /* == 3 > bestLength */) {
+                DEBUGLOG(8, "found small match with hlog3, of length %u",
+                            (U32)mlen);
+                bestLength = mlen;
+                assert(current > matchIndex3);
+                assert(mnum==0);  /* no prior solution */
+                matches[0].off = (current - matchIndex3) + ZSTD_REP_MOVE;
+                matches[0].len = (U32)mlen;
+                mnum = 1;
+                if ( (mlen > sufficient_len) |
+                     (ip+mlen == iLimit) ) {  /* best possible length */
+                    ms->nextToUpdate = current+1;  /* skip insertion */
+                    return 1;
+                }
+            }
+        }
+        /* no dictMatchState lookup: dicts don't have a populated HC3 table */
+    }
+
+    hashTable[h] = current;   /* Update Hash Table */
+
+    for (; nbCompares && (matchIndex >= matchLow); --nbCompares) {   /* budget must stop at 0, never wrap : the dms pass below tests it */
+        U32* const nextPtr = bt + 2*(matchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        const BYTE* match;
+        assert(current > matchIndex);
+
+        if ((dictMode == ZSTD_noDict) || (dictMode == ZSTD_dictMatchState) || (matchIndex+matchLength >= dictLimit)) {
+            assert(matchIndex+matchLength >= dictLimit);  /* ensure the condition is correct when !extDict */
+            match = base + matchIndex;
+            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit);
+        } else {
+            match = dictBase + matchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* prepare for match[matchLength] */
+        }
+
+        if (matchLength > bestLength) {
+            DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)",
+                    (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE);
+            assert(matchEndIdx > matchIndex);
+            if (matchLength > matchEndIdx - matchIndex)
+                matchEndIdx = matchIndex + (U32)matchLength;
+            bestLength = matchLength;
+            matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE;
+            matches[mnum].len = (U32)matchLength;
+            mnum++;
+            if ( (matchLength > ZSTD_OPT_NUM)
+               | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) {
+                if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */
+                break; /* drop, to preserve bt consistency (miss a little bit of compression) */
+            }
+        }
+
+        if (match[matchLength] < ip[matchLength]) {
+            /* match smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            smallerPtr = nextPtr+1;               /* new candidate => larger than match, which was smaller than current */
+            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous, closer to current */
+        } else {
+            *largerPtr = matchIndex;              /* match is larger than current */
+            commonLengthLarger = matchLength;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;   /* terminate both open links (or the dummy) */
+    /* dictMatchState : pursue the search in the dictionary's tree (read-only) with the compares left over */
+    if (dictMode == ZSTD_dictMatchState && nbCompares) {
+        U32 dictMatchIndex = dms->hashTable[h];
+        const U32* const dmsBt = dms->chainTable;
+        commonLengthSmaller = commonLengthLarger = 0;
+        for (; nbCompares && (dictMatchIndex > dmsLowLimit); --nbCompares) {
+            const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & btMask);
+            size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+            const BYTE* match = dmsBase + dictMatchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dmsEnd, prefixStart);
+            if (dictMatchIndex+matchLength >= dmsHighLimit)
+                match = base + dictMatchIndex + dmsIndexDelta;   /* to prepare for next usage of match[matchLength] */
+
+            if (matchLength > bestLength) {
+                matchIndex = dictMatchIndex + dmsIndexDelta;
+                DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)",
+                        (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE);
+                if (matchLength > matchEndIdx - matchIndex)
+                    matchEndIdx = matchIndex + (U32)matchLength;
+                bestLength = matchLength;
+                matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE;
+                matches[mnum].len = (U32)matchLength;
+                mnum++;
+                if ( (matchLength > ZSTD_OPT_NUM)
+                   | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) {
+                    break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+                }
+            }
+
+            if (dictMatchIndex <= dmsBtLow) { break; }   /* beyond tree size, stop the search */
+            if (match[matchLength] < ip[matchLength]) {
+                commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+                dictMatchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+            } else {
+                /* match is larger than current */
+                commonLengthLarger = matchLength;
+                dictMatchIndex = nextPtr[0];
+            }
+        }
+    }
+
+    assert(matchEndIdx > current+8);
+    ms->nextToUpdate = matchEndIdx - 8;  /* skip repetitive patterns */
+    return mnum;
+}
+
+
+FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (   /* updates the tree up to `ip`, then dispatches to the match finder specialized for cParams->searchLength */
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode,
+                        U32 rep[ZSTD_REP_NUM], U32 const ll0,
+                        ZSTD_match_t* matches, U32 const lengthToBeat)
+{   /* @return : nb of matches stored into matches[], 0 when `ip` lies before ms->nextToUpdate */
+    U32 const matchLengthSearch = cParams->searchLength;
+    DEBUGLOG(8, "ZSTD_BtGetAllMatches");
+    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
+    ZSTD_updateTree_internal(ms, cParams, ip, iHighLimit, matchLengthSearch, dictMode);
+    switch(matchLengthSearch)
+    {
+    case 3 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 3);
+    default :   /* any value not listed here falls through to the mls==4 variant */
+    case 4 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 4);
+    case 5 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 5);
+    case 7 :   /* 7 shares the mls==6 variant */
+    case 6 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 6);
+    }
+}
+
+
+/*-*******************************
+*  Optimal parser
+*********************************/
+typedef struct repcodes_s {   /* by-value snapshot of the repeat-offset history, so it can be returned from a function */
+    U32 rep[3];               /* rep[0] is the entry ZSTD_updateRep() refreshes first */
+} repcodes_t;
+
+repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)
+{   /* returns the repcode history as it stands after a match coded as `offset` ; `rep` itself is left untouched */
+    repcodes_t updated;
+    U32 const repCode = offset + ll0;   /* only consulted when offset < ZSTD_REP_NUM */
+    if (offset >= ZSTD_REP_NUM) {
+        /* full offset : becomes the newest entry, older ones shift down */
+        updated.rep[0] = offset - ZSTD_REP_MOVE;
+        updated.rep[1] = rep[0];
+        updated.rep[2] = rep[1];
+    } else if (repCode == 0) {
+        /* repCode == 0 : no change */
+        memcpy(&updated, rep, sizeof(updated));
+    } else {
+        /* repcode : selected entry moves to front ; repCode==ZSTD_REP_NUM stands for rep[0]-1 */
+        updated.rep[0] = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+        updated.rep[1] = rep[0];
+        updated.rep[2] = (repCode >= 2) ? rep[1] : rep[2];
+    }
+    return updated;
+}
+
+
+static U32 ZSTD_totalLen(ZSTD_optimal_t sol)   /* span of one parser entry : its literal run plus its match */
+{
+    return sol.litlen + sol.mlen;
+}
+
+FORCE_INLINE_TEMPLATE size_t   /* optimal parser : prices candidate paths forward in opt[], then stores the cheapest one into seqStore */
+ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+                               seqStore_t* seqStore,   /* receives the selected sequences */
+                               U32 rep[ZSTD_REP_NUM],   /* repcode history : updated in place as sequences are stored */
+                               const ZSTD_compressionParameters* cParams,
+                               const void* src, size_t srcSize,
+                               const int optLevel, const ZSTD_dictMode_e dictMode)   /* optLevel : 0 enables the speed shortcuts below ; asserted <= 2 */
+{   /* @return : nb of trailing bytes not covered by any stored sequence (last literals) */
+    optState_t* const optStatePtr = &ms->opt;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const prefixStart = base + ms->window.dictLimit;
+
+    U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
+    U32 const minMatch = (cParams->searchLength == 3) ? 3 : 4;
+
+    ZSTD_optimal_t* const opt = optStatePtr->priceTable;
+    ZSTD_match_t* const matches = optStatePtr->matchTable;
+    ZSTD_optimal_t lastSequence;
+
+    /* init */
+    DEBUGLOG(5, "ZSTD_compressBlock_opt_generic");
+    assert(optLevel <= 2);
+    ms->nextToUpdate3 = ms->nextToUpdate;
+    ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize, optLevel);
+    ip += (ip==prefixStart);   /* skip the first position when it is the very start of the prefix */
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        U32 cur, last_pos = 0;
+
+        /* find first match */
+        {   U32 const litlen = (U32)(ip - anchor);
+            U32 const ll0 = !litlen;
+            U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, ip, iend, dictMode, rep, ll0, matches, minMatch);
+            if (!nbMatches) { ip++; continue; }
+
+            /* initialize opt[0] */
+            { U32 i ; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; }
+            opt[0].mlen = 0;  /* means is_a_literal */
+            opt[0].litlen = litlen;
+            opt[0].price = ZSTD_literalsContribution(anchor, litlen, optStatePtr, optLevel);
+
+            /* large match -> immediate encoding */
+            {   U32 const maxML = matches[nbMatches-1].len;
+                U32 const maxOffset = matches[nbMatches-1].off;
+                DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new serie",
+                            nbMatches, maxML, maxOffset, (U32)(ip-prefixStart));
+
+                if (maxML > sufficient_len) {
+                    lastSequence.litlen = litlen;
+                    lastSequence.mlen = maxML;
+                    lastSequence.off = maxOffset;
+                    DEBUGLOG(6, "large match (%u>%u), immediate encoding",
+                                maxML, sufficient_len);
+                    cur = 0;
+                    last_pos = ZSTD_totalLen(lastSequence);
+                    goto _shortestPath;
+            }   }
+
+            /* set prices for first matches starting position == 0 */
+            {   U32 const literalsPrice = opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
+                U32 pos;
+                U32 matchNb;
+                for (pos = 1; pos < minMatch; pos++) {
+                    opt[pos].price = ZSTD_MAX_PRICE;   /* mlen, litlen and price will be fixed during forward scanning */
+                }
+                for (matchNb = 0; matchNb < nbMatches; matchNb++) {
+                    U32 const offset = matches[matchNb].off;
+                    U32 const end = matches[matchNb].len;
+                    repcodes_t const repHistory = ZSTD_updateRep(rep, offset, ll0);
+                    for ( ; pos <= end ; pos++ ) {   /* pos carries over : each match only prices lengths beyond the previous one */
+                        U32 const matchPrice = ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel);
+                        U32 const sequencePrice = literalsPrice + matchPrice;
+                        DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
+                                    pos, ZSTD_fCost(sequencePrice));
+                        opt[pos].mlen = pos;
+                        opt[pos].off = offset;
+                        opt[pos].litlen = litlen;
+                        opt[pos].price = sequencePrice;
+                        ZSTD_STATIC_ASSERT(sizeof(opt[pos].rep) == sizeof(repHistory));
+                        memcpy(opt[pos].rep, &repHistory, sizeof(repHistory));
+                }   }
+                last_pos = pos-1;
+            }
+        }
+
+        /* check further positions */
+        for (cur = 1; cur <= last_pos; cur++) {
+            const BYTE* const inr = ip + cur;
+            assert(cur < ZSTD_OPT_NUM);
+            DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur)
+
+            /* Fix current position with one literal if cheaper */
+            {   U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1;
+                int const price = opt[cur-1].price
+                                + ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel)
+                                + ZSTD_litLengthPrice(litlen, optStatePtr, optLevel)
+                                - ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel);
+                assert(price < 1000000000); /* overflow check */
+                if (price <= opt[cur].price) {
+                    DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)",
+                                inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen,
+                                opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]);
+                    opt[cur].mlen = 0;
+                    opt[cur].off = 0;
+                    opt[cur].litlen = litlen;
+                    opt[cur].price = price;
+                    memcpy(opt[cur].rep, opt[cur-1].rep, sizeof(opt[cur].rep));
+                } else {
+                    DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)",
+                                inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price),
+                                opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]);
+                }
+            }
+
+            /* last match must start at a minimum distance of 8 from oend */
+            if (inr > ilimit) continue;
+
+            if (cur == last_pos) break;
+
+            if ( (optLevel==0) /*static_test*/
+              && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) {
+                DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1);
+                continue;  /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
+            }
+
+            {   U32 const ll0 = (opt[cur].mlen != 0);
+                U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
+                U32 const previousPrice = opt[cur].price;
+                U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
+                U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, inr, iend, dictMode, opt[cur].rep, ll0, matches, minMatch);
+                U32 matchNb;
+                if (!nbMatches) {
+                    DEBUGLOG(7, "rPos:%u : no match found", cur);
+                    continue;
+                }
+
+                {   U32 const maxML = matches[nbMatches-1].len;
+                    DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u",
+                                inr-istart, cur, nbMatches, maxML);
+
+                    if ( (maxML > sufficient_len)
+                      || (cur + maxML >= ZSTD_OPT_NUM) ) {
+                        lastSequence.mlen = maxML;
+                        lastSequence.off = matches[nbMatches-1].off;
+                        lastSequence.litlen = litlen;
+                        cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0;  /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */
+                        last_pos = cur + ZSTD_totalLen(lastSequence);
+                        if (cur > ZSTD_OPT_NUM) cur = 0;   /* underflow => first match */
+                        goto _shortestPath;
+                }   }
+
+                /* set prices using matches found at position == cur */
+                for (matchNb = 0; matchNb < nbMatches; matchNb++) {
+                    U32 const offset = matches[matchNb].off;
+                    repcodes_t const repHistory = ZSTD_updateRep(opt[cur].rep, offset, ll0);
+                    U32 const lastML = matches[matchNb].len;
+                    U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
+                    U32 mlen;
+
+                    DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u",
+                                matchNb, matches[matchNb].off, lastML, litlen);
+
+                    for (mlen = lastML; mlen >= startML; mlen--) {  /* scan downward */
+                        U32 const pos = cur + mlen;
+                        int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
+
+                        if ((pos > last_pos) || (price < opt[pos].price)) {
+                            DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)",
+                                        pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
+                            while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; }   /* fill empty positions */
+                            opt[pos].mlen = mlen;
+                            opt[pos].off = offset;
+                            opt[pos].litlen = litlen;
+                            opt[pos].price = price;
+                            ZSTD_STATIC_ASSERT(sizeof(opt[pos].rep) == sizeof(repHistory));
+                            memcpy(opt[pos].rep, &repHistory, sizeof(repHistory));
+                        } else {
+                            DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)",
+                                        pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
+                            if (optLevel==0) break;  /* early update abort; gets ~+10% speed for about -0.01 ratio loss */
+                        }
+            }   }   }
+        }  /* for (cur = 1; cur <= last_pos; cur++) */
+
+        lastSequence = opt[last_pos];
+        cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0;  /* single sequence, and it starts before `ip` */
+        assert(cur < ZSTD_OPT_NUM);  /* control overflow*/
+
+_shortestPath:   /* cur, last_pos and lastSequence have to be set */
+        assert(opt[0].mlen == 0);
+
+        {   U32 const storeEnd = cur + 1;
+            U32 storeStart = storeEnd;
+            U32 seqPos = cur;
+
+            DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)",
+                        last_pos, cur);
+            assert(storeEnd < ZSTD_OPT_NUM);
+            DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
+                        storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off);
+            opt[storeEnd] = lastSequence;
+            while (seqPos > 0) {   /* walk the chosen path backward, packing its entries just below storeEnd */
+                U32 const backDist = ZSTD_totalLen(opt[seqPos]);
+                storeStart--;
+                DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
+                            seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off);
+                opt[storeStart] = opt[seqPos];
+                seqPos = (seqPos > backDist) ? seqPos - backDist : 0;
+            }
+
+            /* save sequences */
+            DEBUGLOG(6, "sending selected sequences into seqStore")
+            {   U32 storePos;
+                for (storePos=storeStart; storePos <= storeEnd; storePos++) {
+                    U32 const llen = opt[storePos].litlen;
+                    U32 const mlen = opt[storePos].mlen;
+                    U32 const offCode = opt[storePos].off;
+                    U32 const advance = llen + mlen;
+                    DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u",
+                                anchor - istart, llen, mlen);
+
+                    if (mlen==0) {  /* only literals => must be last "sequence", actually starting a new stream of sequences */
+                        assert(storePos == storeEnd);   /* must be last sequence */
+                        ip = anchor + llen;     /* last "sequence" is a bunch of literals => don't progress anchor */
+                        continue;   /* will finish */
+                    }
+
+                    /* repcodes update : like ZSTD_updateRep(), but update in place */
+                    if (offCode >= ZSTD_REP_NUM) {  /* full offset */
+                        rep[2] = rep[1];
+                        rep[1] = rep[0];
+                        rep[0] = offCode - ZSTD_REP_MOVE;
+                    } else {   /* repcode */
+                        U32 const repCode = offCode + (llen==0);
+                        if (repCode) {  /* note : if repCode==0, no change */
+                            U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+                            if (repCode >= 2) rep[2] = rep[1];
+                            rep[1] = rep[0];
+                            rep[0] = currentOffset;
+                    }   }
+
+                    assert(anchor + llen <= iend);
+                    ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen);
+                    ZSTD_storeSeq(seqStore, llen, anchor, offCode, mlen-MINMATCH);
+                    anchor += advance;
+                    ip = anchor;
+            }   }
+            ZSTD_setBasePrices(optStatePtr, optLevel);
+        }
+
+    }   /* while (ip < ilimit) */
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_btopt(   /* optimal parser at optLevel 0, ZSTD_noDict mode ; returns what the generic parser returns */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const ZSTD_compressionParameters* cParams, const void* src, size_t srcSize)
+{
+    DEBUGLOG(5, "ZSTD_compressBlock_btopt");
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, ZSTD_noDict);
+}
+
+
+/* used in 2-pass strategy : rescales table[0..lastEltIndex] in place and returns the new total */
+static U32 ZSTD_upscaleStat(U32* table, U32 lastEltIndex, int bonus)
+{
+    U32 idx, total=0;
+    assert(ZSTD_FREQ_DIV+bonus > 0);
+    for (idx=0; idx<=lastEltIndex; idx++) {
+        U32 const scaled = (table[idx] << (ZSTD_FREQ_DIV+bonus)) - 1;   /* shift up, then take one off */
+        table[idx] = scaled;
+        total += scaled;
+    }
+    return total;
+}
+
+/* used in 2-pass strategy : upscales the four frequency tables of optPtr and refreshes their sums (bonus 0 for literals, 1 for the others) */
+MEM_STATIC void ZSTD_upscaleStats(optState_t* optPtr)
+{
+    optPtr->litSum = ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0);
+    optPtr->litLengthSum = ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL, 1);
+    optPtr->matchLengthSum = ZSTD_upscaleStat(optPtr->matchLengthFreq, MaxML, 1);
+    optPtr->offCodeSum = ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 1);
+}
+
+size_t ZSTD_compressBlock_btultra(   /* optimal parser at optLevel 2, ZSTD_noDict mode ; the 2-pass variant below is compiled out */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const ZSTD_compressionParameters* cParams, const void* src, size_t srcSize)
+{
+    DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize);
+#if 0
+    /* 2-pass strategy (disabled)
+     * this strategy makes a first pass over first block to collect statistics
+     * and seed next round's statistics with it.
+     * The compression ratio gain is generally small (~0.5% on first block),
+     * the cost is 2x cpu time on first block. */
+    assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+    if ( (ms->opt.litLengthSum==0)   /* first block */
+      && (seqStore->sequences == seqStore->sequencesStart)   /* no ldm */
+      && (ms->window.dictLimit == ms->window.lowLimit) ) {   /* no dictionary */
+        U32 tmpRep[ZSTD_REP_NUM];
+        DEBUGLOG(5, "ZSTD_compressBlock_btultra: first block: collecting statistics");
+        assert(ms->nextToUpdate >= ms->window.dictLimit
+            && ms->nextToUpdate <= ms->window.dictLimit + 1);
+        memcpy(tmpRep, rep, sizeof(tmpRep));   /* first pass works on a copy, so `rep` stays intact for the real pass */
+        ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, cParams, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);   /* generate stats into ms->opt*/
+        ZSTD_resetSeqStore(seqStore);
+        /* invalidate first scan from history */
+        ms->window.base -= srcSize;
+        ms->window.dictLimit += (U32)srcSize;
+        ms->window.lowLimit = ms->window.dictLimit;
+        ms->nextToUpdate = ms->window.dictLimit;
+        ms->nextToUpdate3 = ms->window.dictLimit;
+        /* re-inforce weight of collected statistics */
+        ZSTD_upscaleStats(&ms->opt);
+    }
+#endif
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_btopt_dictMatchState(   /* optimal parser at optLevel 0, ZSTD_dictMatchState mode */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const ZSTD_compressionParameters* cParams, const void* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_btultra_dictMatchState(   /* optimal parser at optLevel 2, ZSTD_dictMatchState mode */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const ZSTD_compressionParameters* cParams, const void* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_btopt_extDict(   /* optimal parser at optLevel 0, ZSTD_extDict mode */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const ZSTD_compressionParameters* cParams, const void* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, ZSTD_extDict);
+}
+
+size_t ZSTD_compressBlock_btultra_extDict(   /* optimal parser at optLevel 2, ZSTD_extDict mode */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const ZSTD_compressionParameters* cParams, const void* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, ZSTD_extDict);
+}
diff --git a/deps/SZ/zstd/compress/zstd_opt.h b/deps/SZ/zstd/compress/zstd_opt.h
new file mode 100644
index 0000000000000000000000000000000000000000..63dbe79a846d42e511e01847e6e30bae866f44cf
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstd_opt.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_OPT_H
+#define ZSTD_OPT_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "zstd_compress_internal.h"
+
+void ZSTD_updateTree(
+        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+        const BYTE* ip, const BYTE* iend);  /* used in ZSTD_loadDictionaryContent() */
+
+size_t ZSTD_compressBlock_btopt(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);   /* NOTE(review): presumably the no-dictionary btopt variant - confirm against zstd_opt.c */
+size_t ZSTD_compressBlock_btultra(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);   /* NOTE(review): presumably the no-dictionary btultra variant - confirm against zstd_opt.c */
+
+size_t ZSTD_compressBlock_btopt_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);   /* optLevel 0, ZSTD_dictMatchState mode */
+size_t ZSTD_compressBlock_btultra_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);   /* optLevel 2, ZSTD_dictMatchState mode */
+
+size_t ZSTD_compressBlock_btopt_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);   /* optLevel 0, ZSTD_extDict mode */
+size_t ZSTD_compressBlock_btultra_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);   /* optLevel 2, ZSTD_extDict mode */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_OPT_H */
diff --git a/deps/SZ/zstd/compress/zstdmt_compress.c b/deps/SZ/zstd/compress/zstdmt_compress.c
new file mode 100644
index 0000000000000000000000000000000000000000..6daedca8b3d0a57af2ad48e5fa7934a521a92e6d
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstdmt_compress.c
@@ -0,0 +1,1906 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ======   Tuning parameters   ====== */
+#define ZSTDMT_NBWORKERS_MAX 200
+#define ZSTDMT_JOBSIZE_MAX  (MEM_32bits() ? (512 MB) : (2 GB))  /* note : limited by `jobSize` type, which is `unsigned` */
+#define ZSTDMT_OVERLAPLOG_DEFAULT 6
+
+
+/* ======   Compiler specifics   ====== */
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4204)   /* disable: C4204: non-constant aggregate initializer */
+#endif
+
+
+/* ======   Dependencies   ====== */
+#include <string.h>      /* memcpy, memset */
+#include <limits.h>      /* INT_MAX */
+#include "pool.h"        /* threadpool */
+#include "threading.h"   /* mutex */
+#include "zstd_compress_internal.h"  /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */
+#include "zstd_ldm.h"
+#include "zstdmt_compress.h"
+
+/* Guards code to support resizing the SeqPool.
+ * We will want to resize the SeqPool to save memory in the future.
+ * Until then, comment the code out since it is unused.
+ */
+#define ZSTD_RESIZE_SEQPOOL 0
+
+/* ======   Debug   ====== */
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) && !defined(_MSC_VER)
+
+#  include <stdio.h>
+#  include <unistd.h>
+#  include <sys/times.h>
+
+#  define DEBUG_PRINTHEX(l,p,n) {  /* RAWLOG, at level l, the n bytes starting at p as 2-digit hex values */ \
+    unsigned debug_u;                        \
+    for (debug_u=0; debug_u<(n); debug_u++)  \
+        RAWLOG(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \
+    RAWLOG(l, " \n");                        \
+}
+
+static unsigned long long GetCurrentClockTimeMicroseconds(void)
+{   /* debug helper : return value of times(), converted from clock ticks to microseconds */
+   static clock_t ticksPerSec = 0;   /* cached sysconf(_SC_CLK_TCK), queried again while not positive */
+   struct tms unused;
+   if (ticksPerSec <= 0) ticksPerSec = sysconf(_SC_CLK_TCK);
+   {   unsigned long long const nowTicks = (unsigned long long)(clock_t) times(&unused);
+       return (nowTicks * 1000000) / ticksPerSec; }
+}
+
+#define MUTEX_WAIT_TIME_DLEVEL 6   /* DEBUGLEVEL from which mutex acquisition time is measured and logged */
+#define ZSTD_PTHREAD_MUTEX_LOCK(mutex) {  /* locks `mutex`; at high debug level, also logs slow acquisitions */ \
+    if (DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL) {   \
+        unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds(); \
+        ZSTD_pthread_mutex_lock(mutex);           \
+        {   unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \
+            unsigned long long const elapsedTime = (afterTime-beforeTime); \
+            if (elapsedTime > 1000) {  /* or whatever threshold you like; I'm using 1 millisecond here */ \
+                DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, "Thread took %llu microseconds to acquire mutex %s \n", \
+                   elapsedTime, #mutex);          \
+        }   }                                     \
+    } else {   /* below that level : plain lock, no timing */ \
+        ZSTD_pthread_mutex_lock(mutex);           \
+    }                                             \
+}
+
+#else
+
+#  define ZSTD_PTHREAD_MUTEX_LOCK(m) ZSTD_pthread_mutex_lock(m)   /* DEBUGLEVEL<2 (or MSVC) : plain lock, no timing */
+#  define DEBUG_PRINTHEX(l,p,n) {}   /* DEBUGLEVEL<2 (or MSVC) : hex dump compiled out */
+
+#endif
+
+
+/* =====   Buffer Pool   ===== */
+/* a single Buffer Pool can be invoked from multiple threads in parallel */
+
+typedef struct buffer_s {
+    void* start;       /* beginning of the memory area; NULL for an empty buffer or a failed allocation */
+    size_t capacity;   /* size of the area in bytes; 0 when start==NULL */
+} buffer_t;
+
+static const buffer_t g_nullBuffer = { NULL, 0 };   /* written into a pool slot once its buffer has been handed out */
+
+typedef struct ZSTDMT_bufferPool_s {
+    ZSTD_pthread_mutex_t poolMutex;   /* locked by get/release/setBufferSize/sizeof around pool accesses */
+    size_t bufferSize;                /* size in bytes of the buffers newly allocated by ZSTDMT_getBuffer() */
+    unsigned totalBuffers;            /* number of slots in bTable[] */
+    unsigned nbBuffers;               /* number of buffers currently stored, in bTable[0 .. nbBuffers-1] */
+    ZSTD_customMem cMem;              /* allocator used for the pool itself and for its buffers */
+    buffer_t bTable[1];   /* variable size */
+} ZSTDMT_bufferPool;
+
+static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned nbWorkers, ZSTD_customMem cMem)
+{   /* @return : a zeroed, empty pool with 2*nbWorkers+3 slots, or NULL when allocation or mutex creation fails */
+    unsigned const nbSlots = 2*nbWorkers + 3;
+    size_t const poolBytes = sizeof(ZSTDMT_bufferPool) + (nbSlots-1) * sizeof(buffer_t);
+    ZSTDMT_bufferPool* const pool = (ZSTDMT_bufferPool*)ZSTD_calloc(poolBytes, cMem);
+    if (pool == NULL) return NULL;
+    if (ZSTD_pthread_mutex_init(&pool->poolMutex, NULL)) {
+        ZSTD_free(pool, cMem);
+        return NULL;
+    }
+    pool->cMem = cMem;
+    pool->nbBuffers = 0;
+    pool->totalBuffers = nbSlots;
+    pool->bufferSize = 64 KB;   /* default size, until ZSTDMT_setBufferSize() is called */
+    return pool;
+}
+
+static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool)
+{   /* frees the buffer of every slot, then the mutex and the pool itself; NULL is accepted */
+    unsigned slot;
+    DEBUGLOG(3, "ZSTDMT_freeBufferPool (address:%08X)", (U32)(size_t)bufPool);
+    if (bufPool == NULL) return;
+    for (slot = 0; slot < bufPool->totalBuffers; slot++) {
+        DEBUGLOG(4, "free buffer %2u (address:%08X)", slot, (U32)(size_t)bufPool->bTable[slot].start);
+        ZSTD_free(bufPool->bTable[slot].start, bufPool->cMem);
+    }
+    ZSTD_pthread_mutex_destroy(&bufPool->poolMutex);
+    ZSTD_free(bufPool, bufPool->cMem);
+}
+
+/* @return : bytes used by the pool structure plus every slot's capacity. Only works at initialization, not during compression */
+static size_t ZSTDMT_sizeof_bufferPool(ZSTDMT_bufferPool* bufPool)
+{
+    size_t const tableBytes = (bufPool->totalBuffers - 1) * sizeof(buffer_t);
+    size_t total = sizeof(*bufPool) + tableBytes;   /* pool header + bTable[] */
+    unsigned slot;
+    ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
+    for (slot = 0; slot < bufPool->totalBuffers; slot++) {
+        total += bufPool->bTable[slot].capacity;
+    }
+    ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+
+    return total;
+}
+
+/* ZSTDMT_setBufferSize() :
+ * from now on, buffers handed out by this pool have a capacity of _at least_ `bSize` bytes.
+ * note : keeping all buffers at the same size is preferable,
+ * since they become freely interchangeable, which reduces malloc/free usage and memory fragmentation */
+static void ZSTDMT_setBufferSize(ZSTDMT_bufferPool* const bufPool, size_t const bSize)
+{
+    DEBUGLOG(4, "ZSTDMT_setBufferSize: bSize = %u", (U32)bSize);
+    ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
+    bufPool->bufferSize = bSize;
+    ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+}
+
+
+static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, U32 nbWorkers)
+{   /* @return : a pool with at least 2*nbWorkers+3 slots (srcBufPool itself when large enough), or NULL */
+    unsigned const wantedSlots = 2*nbWorkers + 3;
+    ZSTDMT_bufferPool* grown;
+    ZSTD_customMem cMem;
+    size_t bSize;
+    if (srcBufPool == NULL) return NULL;
+    if (wantedSlots <= srcBufPool->totalBuffers) return srcBufPool;   /* good enough */
+
+    /* too small : keep allocator and buffer size, then replace the pool (srcBufPool is freed even if the replacement fails) */
+    cMem = srcBufPool->cMem;
+    bSize = srcBufPool->bufferSize;
+    ZSTDMT_freeBufferPool(srcBufPool);
+    grown = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    if (grown != NULL) ZSTDMT_setBufferSize(grown, bSize);
+    return grown;
+}
+
+/** ZSTDMT_getBuffer() :
+ *  assumption : bufPool must be valid
+ * @return : a buffer, either taken from the pool (when its capacity suits bufPool->bufferSize) or newly allocated
+ *  note: allocation may fail, in this case, start==NULL and capacity==0 */
+static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* bufPool)
+{
+    size_t const bSize = bufPool->bufferSize;   /* note : read before poolMutex is taken */
+    DEBUGLOG(5, "ZSTDMT_getBuffer: bSize = %u", (U32)bufPool->bufferSize);
+    ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
+    if (bufPool->nbBuffers) {   /* try to use an existing buffer */
+        buffer_t const buf = bufPool->bTable[--(bufPool->nbBuffers)];   /* last stored buffer is taken first */
+        size_t const availBufferSize = buf.capacity;
+        bufPool->bTable[bufPool->nbBuffers] = g_nullBuffer;   /* the slot no longer references the buffer */
+        if ((availBufferSize >= bSize) & ((availBufferSize>>3) <= bSize)) {
+            /* large enough, but not too much : capacity >= bSize and capacity/8 <= bSize */
+            DEBUGLOG(5, "ZSTDMT_getBuffer: provide buffer %u of size %u",
+                        bufPool->nbBuffers, (U32)buf.capacity);
+            ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+            return buf;
+        }
+        /* size conditions not respected : scratch this buffer, create new one */
+        DEBUGLOG(5, "ZSTDMT_getBuffer: existing buffer does not meet size conditions => freeing");
+        ZSTD_free(buf.start, bufPool->cMem);
+    }
+    ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+    /* create new buffer (allocation happens outside of the lock) */
+    DEBUGLOG(5, "ZSTDMT_getBuffer: create a new buffer");
+    {   buffer_t buffer;
+        void* const start = ZSTD_malloc(bSize, bufPool->cMem);
+        buffer.start = start;   /* note : start can be NULL if malloc fails ! */
+        buffer.capacity = (start==NULL) ? 0 : bSize;
+        if (start==NULL) {
+            DEBUGLOG(5, "ZSTDMT_getBuffer: buffer allocation failure !!");
+        } else {
+            DEBUGLOG(5, "ZSTDMT_getBuffer: created buffer of size %u", (U32)bSize);
+        }
+        return buffer;
+    }
+}
+
+#if ZSTD_RESIZE_SEQPOOL
+/** ZSTDMT_resizeBuffer() :
+ * assumption : bufPool must be valid
+ * @return : `buffer` itself when it is already large enough, or when allocation fails;
+ *           otherwise a new buffer of the pool's buffer size, into which the input buffer's content is copied.
+ */
+static buffer_t ZSTDMT_resizeBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buffer)
+{
+    size_t const wanted = bufPool->bufferSize;
+    if (buffer.capacity >= wanted) return buffer;   /* nothing to do */
+    {   void* const fresh = ZSTD_malloc(wanted, bufPool->cMem);
+        buffer_t grown;
+        if (fresh == NULL) {
+            DEBUGLOG(5, "ZSTDMT_resizeBuffer: buffer allocation failure !!");
+            return buffer;
+        }
+        grown.start = fresh;
+        grown.capacity = wanted;
+        assert(grown.capacity >= buffer.capacity);
+        memcpy(grown.start, buffer.start, buffer.capacity);
+        DEBUGLOG(5, "ZSTDMT_resizeBuffer: created buffer of size %u", (U32)wanted);
+        return grown;
+    }
+}
+#endif
+
+/* hand a buffer back to the pool for later re-use; it is freed instead when every slot is already taken */
+static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buf)
+{
+    int stored = 0;
+    if (buf.start == NULL) return;   /* compatible with release on NULL */
+    DEBUGLOG(5, "ZSTDMT_releaseBuffer");
+    ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
+    if (bufPool->nbBuffers < bufPool->totalBuffers) {
+        unsigned const slot = bufPool->nbBuffers++;
+        bufPool->bTable[slot] = buf; stored = 1;   /* kept for later use */
+        DEBUGLOG(5, "ZSTDMT_releaseBuffer: stored buffer of size %u in slot %u",
+                    (U32)buf.capacity, (U32)slot);
+    }
+    ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+    if (stored) return;   /* otherwise : bufferPool capacity reached (should not happen), free outside of the lock */
+    DEBUGLOG(5, "ZSTDMT_releaseBuffer: pool capacity reached => freeing ");
+    ZSTD_free(buf.start, bufPool->cMem);
+}
+
+
+/* =====   Seq Pool Wrapper   ====== */
+
+static rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0};   /* returned by ZSTDMT_getSeq() when the pool's bufferSize is 0 */
+
+typedef ZSTDMT_bufferPool ZSTDMT_seqPool;   /* a seq pool is a buffer pool whose buffers are used as rawSeq arrays */
+
+static size_t ZSTDMT_sizeof_seqPool(ZSTDMT_seqPool* seqPool)
+{   size_t const poolBytes = ZSTDMT_sizeof_bufferPool(seqPool);   /* a seq pool is measured like the buffer pool it is */
+    return poolBytes;
+}
+
+static rawSeqStore_t bufferToSeq(buffer_t buffer)
+{   /* views a byte buffer as sequence storage; every field other than seq and capacity stays 0 */
+    rawSeqStore_t seq = {NULL, 0, 0, 0};
+    seq.seq = (rawSeq*)buffer.start;
+    seq.capacity = buffer.capacity / sizeof(rawSeq);   /* bytes -> number of rawSeq entries (rounded down) */
+    return seq;
+}
+
+static buffer_t seqToBuffer(rawSeqStore_t seq)
+{   /* reverse of bufferToSeq() : capacity goes back from rawSeq entries to bytes */
+    buffer_t raw;
+    raw.capacity = seq.capacity * sizeof(rawSeq);
+    raw.start = seq.seq;
+    return raw;
+}
+
+static rawSeqStore_t ZSTDMT_getSeq(ZSTDMT_seqPool* seqPool)
+{
+    /* a pool sized for 0 sequences provides no storage : no buffer is requested from it */
+    int const noStorage = (seqPool->bufferSize == 0);
+    if (noStorage) return kNullRawSeqStore;
+    return bufferToSeq(ZSTDMT_getBuffer(seqPool));
+}
+
+#if ZSTD_RESIZE_SEQPOOL
+static rawSeqStore_t ZSTDMT_resizeSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq)
+{   buffer_t const resized = ZSTDMT_resizeBuffer(seqPool, seqToBuffer(seq));   /* resize is performed on the byte view */
+    return bufferToSeq(resized);
+}
+#endif
+
+static void ZSTDMT_releaseSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq)
+{   buffer_t const storage = seqToBuffer(seq);   /* the pool stores byte buffers */
+    ZSTDMT_releaseBuffer(seqPool, storage);
+}
+
+static void ZSTDMT_setNbSeq(ZSTDMT_seqPool* const seqPool, size_t const nbSeq)
+{   size_t const nbBytes = nbSeq * sizeof(rawSeq);   /* the underlying buffer pool is sized in bytes */
+    ZSTDMT_setBufferSize(seqPool, nbBytes);
+}
+
+static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem)
+{   /* @return : a new seq pool sized for 0 sequences (ZSTDMT_getSeq() then returns kNullRawSeqStore), or NULL on failure */
+    ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem);   /* can be NULL */
+    if (seqPool != NULL) ZSTDMT_setNbSeq(seqPool, 0);   /* setNbSeq locks the pool's mutex : must not be reached with NULL */
+    return seqPool;
+}
+
+static void ZSTDMT_freeSeqPool(ZSTDMT_seqPool* seqPool)
+{   ZSTDMT_bufferPool* const asBufferPool = seqPool;   /* same underlying type : freed as a buffer pool */
+    ZSTDMT_freeBufferPool(asBufferPool);
+}
+
+static ZSTDMT_seqPool* ZSTDMT_expandSeqPool(ZSTDMT_seqPool* pool, U32 nbWorkers)
+{   ZSTDMT_seqPool* const resized = ZSTDMT_expandBufferPool(pool, nbWorkers);   /* same rules as for a buffer pool */
+    return resized;
+}
+
+
+/* =====   CCtx Pool   ===== */
+/* a single CCtx Pool can be invoked from multiple threads in parallel */
+
+typedef struct {
+    ZSTD_pthread_mutex_t poolMutex;   /* locked by get/release/sizeof around accesses to availCCtx and cctx[] */
+    unsigned totalCCtx;               /* number of slots in cctx[] */
+    unsigned availCCtx;               /* number of idle CCtx currently stored, in cctx[0 .. availCCtx-1] */
+    ZSTD_customMem cMem;              /* allocator used for the pool and for the CCtx it creates */
+    ZSTD_CCtx* cctx[1];   /* variable size */
+} ZSTDMT_CCtxPool;
+
+/* frees every CCtx slot, the mutex and the pool; accepts NULL. note : all CCtx borrowed from the pool should be released back to the pool _before_ freeing the pool */
+static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool)
+{
+    unsigned u;
+    if (pool == NULL) return;   /* compatibility with free on NULL, same contract as ZSTDMT_freeBufferPool() */
+    for (u=0; u<pool->totalCCtx; u++) ZSTD_freeCCtx(pool->cctx[u]);  /* note : compatible with free on NULL */
+    ZSTD_pthread_mutex_destroy(&pool->poolMutex);
+    ZSTD_free(pool, pool->cMem);
+}
+
+/* ZSTDMT_createCCtxPool() :
+ * requires nbWorkers >= 1 (asserted). @return : a pool of nbWorkers slots holding one ready CCtx, or NULL on failure */
+static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(unsigned nbWorkers,
+                                              ZSTD_customMem cMem)
+{
+    size_t const poolBytes = sizeof(ZSTDMT_CCtxPool) + (nbWorkers-1)*sizeof(ZSTD_CCtx*);
+    ZSTDMT_CCtxPool* const pool = (ZSTDMT_CCtxPool*) ZSTD_calloc(poolBytes, cMem);
+    assert(nbWorkers > 0);
+    if (pool == NULL) return NULL;
+    if (ZSTD_pthread_mutex_init(&pool->poolMutex, NULL)) {
+        ZSTD_free(pool, cMem);
+        return NULL;
+    }
+    pool->totalCCtx = nbWorkers;
+    pool->cMem = cMem;
+    pool->cctx[0] = ZSTD_createCCtx_advanced(cMem);
+    pool->availCCtx = 1;   /* at least one cctx for single-thread mode */
+    if (pool->cctx[0] == NULL) { ZSTDMT_freeCCtxPool(pool); return NULL; }
+    DEBUGLOG(3, "cctxPool created, with %u workers", nbWorkers);
+    return pool;
+}
+
+static ZSTDMT_CCtxPool* ZSTDMT_expandCCtxPool(ZSTDMT_CCtxPool* srcPool,
+                                              unsigned nbWorkers)
+{   /* @return : a pool with at least nbWorkers slots (srcPool itself when large enough), or NULL */
+    ZSTD_customMem cMem;
+    if (srcPool == NULL) return NULL;
+    if (srcPool->totalCCtx >= nbWorkers) return srcPool;   /* good enough */
+    /* too small : the old pool is freed, then a larger one is created with the same allocator */
+    cMem = srcPool->cMem;
+    ZSTDMT_freeCCtxPool(srcPool);
+    return ZSTDMT_createCCtxPool(nbWorkers, cMem);
+}
+
+/* @return : bytes used by the pool structure plus every CCtx slot. Only works during initialization phase, not during compression */
+static size_t ZSTDMT_sizeof_CCtxPool(ZSTDMT_CCtxPool* cctxPool)
+{
+    unsigned nbSlots;
+    unsigned slot;
+    size_t total;
+    ZSTD_pthread_mutex_lock(&cctxPool->poolMutex);
+    nbSlots = cctxPool->totalCCtx;
+    total = sizeof(*cctxPool) + (nbSlots-1) * sizeof(ZSTD_CCtx*);   /* pool header + cctx[] table */
+    for (slot = 0; slot < nbSlots; slot++) {
+        total += ZSTD_sizeof_CCtx(cctxPool->cctx[slot]);
+    }
+    ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
+
+    assert(nbSlots > 0);
+    return total;
+}
+
+static ZSTD_CCtx* ZSTDMT_getCCtx(ZSTDMT_CCtxPool* cctxPool)
+{
+    ZSTD_CCtx* pooled = NULL;
+    unsigned avail;
+    DEBUGLOG(5, "ZSTDMT_getCCtx");
+    ZSTD_pthread_mutex_lock(&cctxPool->poolMutex);
+    avail = cctxPool->availCCtx;
+    if (avail > 0) pooled = cctxPool->cctx[--cctxPool->availCCtx];   /* take the last stored CCtx */
+    ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
+    if (avail > 0) return pooled;
+    /* nothing available in the pool : create an extra CCtx, outside of the lock */
+    DEBUGLOG(5, "create one more CCtx");
+    return ZSTD_createCCtx_advanced(cctxPool->cMem);   /* note : can be NULL, when creation fails ! */
+}
+
+static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx)
+{   /* gives a CCtx back to the pool; it is freed instead when every slot is already taken */
+    if (cctx == NULL) return;   /* compatibility with release on NULL */
+    ZSTD_pthread_mutex_lock(&pool->poolMutex);
+    if (pool->availCCtx >= pool->totalCCtx) {
+        /* pool overflow : should not happen, since totalCCtx==nbWorkers */
+        DEBUGLOG(4, "CCtx pool overflow : free cctx");
+        ZSTD_freeCCtx(cctx);
+    } else {
+        pool->cctx[pool->availCCtx++] = cctx;
+    }
+    ZSTD_pthread_mutex_unlock(&pool->poolMutex);
+}
+
+/* ====   Serial State   ==== */
+
+typedef struct {
+    void const* start;   /* first byte of the (read-only) memory range */
+    size_t size;         /* length of the range, in bytes */
+} range_t;
+
+typedef struct {
+    /* All variables in the struct are protected by mutex. */
+    ZSTD_pthread_mutex_t mutex;
+    ZSTD_pthread_cond_t cond;   /* broadcast whenever nextJobID advances; jobs wait on it for their turn */
+    ZSTD_CCtx_params params;    /* parameters stored by ZSTDMT_serialState_reset(), with jobSize filled in */
+    ldmState_t ldmState;        /* long-distance-matching state, used when params.ldmParams.enableLdm is set */
+    XXH64_state_t xxhState;     /* running checksum of the sources, used when params.fParams.checksumFlag is set */
+    unsigned nextJobID;         /* ID of the job whose turn it is to perform the serial step */
+    /* Protects ldmWindow.
+     * Must be acquired after the main mutex when acquiring both.
+     */
+    ZSTD_pthread_mutex_t ldmWindowMutex;
+    ZSTD_pthread_cond_t ldmWindowCond;  /* Signaled when ldmWindow is updated */
+    ZSTD_window_t ldmWindow;  /* A thread-safe copy of ldmState.window */
+} serialState_t;
+
+static int ZSTDMT_serialState_reset(serialState_t* serialState, ZSTDMT_seqPool* seqPool, ZSTD_CCtx_params params, size_t jobSize)
+{   /* prepares serialState for a new compression; @return : 0 on success, 1 when an LDM table cannot be allocated */
+    /* Adjust parameters (params is a by-value copy : the caller's structure is not modified) */
+    if (params.ldmParams.enableLdm) {
+        DEBUGLOG(4, "LDM window size = %u KB", (1U << params.cParams.windowLog) >> 10);
+        ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
+        assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
+        assert(params.ldmParams.hashEveryLog < 32);
+        serialState->ldmState.hashPower =
+                ZSTD_ldm_getHashPower(params.ldmParams.minMatchLength);
+    } else {
+        memset(&params.ldmParams, 0, sizeof(params.ldmParams));   /* LDM disabled : all its parameters are stored as 0 */
+    }
+    serialState->nextJobID = 0;   /* the serial step starts again with job 0 */
+    if (params.fParams.checksumFlag)
+        XXH64_reset(&serialState->xxhState, 0);
+    if (params.ldmParams.enableLdm) {
+        ZSTD_customMem cMem = params.customMem;
+        unsigned const hashLog = params.ldmParams.hashLog;
+        size_t const hashSize = ((size_t)1 << hashLog) * sizeof(ldmEntry_t);
+        unsigned const bucketLog =
+            params.ldmParams.hashLog - params.ldmParams.bucketSizeLog;
+        size_t const bucketSize = (size_t)1 << bucketLog;
+        unsigned const prevBucketLog =
+            serialState->params.ldmParams.hashLog -
+            serialState->params.ldmParams.bucketSizeLog;   /* derived from the previously stored parameters */
+        /* Size the seq pool tables */
+        ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, jobSize));
+        /* Reset the window */
+        ZSTD_window_clear(&serialState->ldmState.window);
+        serialState->ldmWindow = serialState->ldmState.window;
+        /* Resize tables and output space if necessary : tables are only reallocated when missing or too small */
+        if (serialState->ldmState.hashTable == NULL || serialState->params.ldmParams.hashLog < hashLog) {
+            ZSTD_free(serialState->ldmState.hashTable, cMem);
+            serialState->ldmState.hashTable = (ldmEntry_t*)ZSTD_malloc(hashSize, cMem);
+        }
+        if (serialState->ldmState.bucketOffsets == NULL || prevBucketLog < bucketLog) {
+            ZSTD_free(serialState->ldmState.bucketOffsets, cMem);
+            serialState->ldmState.bucketOffsets = (BYTE*)ZSTD_malloc(bucketSize, cMem);
+        }
+        if (!serialState->ldmState.hashTable || !serialState->ldmState.bucketOffsets)
+            return 1;   /* allocation failure : serialState->params is left unchanged */
+        /* Zero the tables */
+        memset(serialState->ldmState.hashTable, 0, hashSize);
+        memset(serialState->ldmState.bucketOffsets, 0, bucketSize);
+    }
+    serialState->params = params;   /* stores the adjusted copy */
+    serialState->params.jobSize = (U32)jobSize;
+    return 0;
+}
+
+static int ZSTDMT_serialState_init(serialState_t* serialState)
+{   /* zeroes the state, then creates its 2 mutexes and 2 condition variables; @return : 0 when all 4 succeed */
+    int status;
+    memset(serialState, 0, sizeof(*serialState));
+    status  = ZSTD_pthread_mutex_init(&serialState->mutex, NULL);
+    status |= ZSTD_pthread_cond_init(&serialState->cond, NULL);
+    status |= ZSTD_pthread_mutex_init(&serialState->ldmWindowMutex, NULL);
+    status |= ZSTD_pthread_cond_init(&serialState->ldmWindowCond, NULL);
+    return status;
+}
+
+static void ZSTDMT_serialState_free(serialState_t* serialState)
+{   /* releases the LDM tables and the synchronization objects; the serialState_t memory itself is not freed */
+    ZSTD_customMem const allocator = serialState->params.customMem;
+    ZSTD_free(serialState->ldmState.hashTable, allocator);
+    ZSTD_free(serialState->ldmState.bucketOffsets, allocator);
+    ZSTD_pthread_cond_destroy(&serialState->ldmWindowCond);
+    ZSTD_pthread_mutex_destroy(&serialState->ldmWindowMutex);
+    ZSTD_pthread_cond_destroy(&serialState->cond);
+    ZSTD_pthread_mutex_destroy(&serialState->mutex);
+}
+
+static void ZSTDMT_serialState_update(serialState_t* serialState,
+                                      ZSTD_CCtx* jobCCtx, rawSeqStore_t seqStore,
+                                      range_t src, unsigned jobID)
+{   /* performs, in job order, the serial part of job `jobID` : LDM sequence generation and checksum update over `src` */
+    /* Wait for our turn */
+    ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex);
+    while (serialState->nextJobID < jobID) {
+        ZSTD_pthread_cond_wait(&serialState->cond, &serialState->mutex);
+    }
+    /* A future job may error and skip our job */
+    if (serialState->nextJobID == jobID) {
+        /* It is now our turn, do any processing necessary */
+        if (serialState->params.ldmParams.enableLdm) {
+            size_t error;
+            assert(seqStore.seq != NULL && seqStore.pos == 0 &&
+                   seqStore.size == 0 && seqStore.capacity > 0);
+            assert(src.size <= serialState->params.jobSize);
+            ZSTD_window_update(&serialState->ldmState.window, src.start, src.size);
+            error = ZSTD_ldm_generateSequences(
+                &serialState->ldmState, &seqStore,
+                &serialState->params.ldmParams, src.start, src.size);
+            /* We provide a large enough buffer to never fail. */
+            assert(!ZSTD_isError(error)); (void)error;
+            /* Update ldmWindow to match the ldmState.window and signal the main
+             * thread if it is waiting for a buffer.
+             */
+            ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex);   /* acquired while the main mutex is held */
+            serialState->ldmWindow = serialState->ldmState.window;
+            ZSTD_pthread_cond_signal(&serialState->ldmWindowCond);
+            ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex);
+        }
+        if (serialState->params.fParams.checksumFlag && src.size > 0)
+            XXH64_update(&serialState->xxhState, src.start, src.size);
+    }
+    /* Now it is the next jobs turn */
+    serialState->nextJobID++;
+    ZSTD_pthread_cond_broadcast(&serialState->cond);   /* wakes every job waiting in the loop above */
+    ZSTD_pthread_mutex_unlock(&serialState->mutex);
+
+    if (seqStore.size > 0) {   /* sequences were generated : attach them to the job's CCtx, outside of the lock */
+        size_t const err = ZSTD_referenceExternalSequences(
+            jobCCtx, seqStore.seq, seqStore.size);
+        assert(serialState->params.ldmParams.enableLdm);
+        assert(!ZSTD_isError(err));
+        (void)err;
+    }
+}
+
+static void ZSTDMT_serialState_ensureFinished(serialState_t* serialState,
+                                              unsigned jobID, size_t cSize)
+{   /* makes sure nextJobID has moved past jobID, so that jobs waiting for their turn are not blocked */
+    ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex);
+    if (serialState->nextJobID <= jobID) {   /* the serial step of this job was never completed */
+        assert(ZSTD_isError(cSize)); (void)cSize;   /* only expected when the job ended with an error */
+        DEBUGLOG(5, "Skipping past job %u because of error", jobID);
+        serialState->nextJobID = jobID + 1;
+        ZSTD_pthread_cond_broadcast(&serialState->cond);   /* wakes the jobs waiting in ZSTDMT_serialState_update() */
+
+        ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex);   /* acquired after the main mutex */
+        ZSTD_window_clear(&serialState->ldmWindow);
+        ZSTD_pthread_cond_signal(&serialState->ldmWindowCond);
+        ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex);
+    }
+    ZSTD_pthread_mutex_unlock(&serialState->mutex);
+
+}
+
+
+/* ------------------------------------------ */
+/* =====          Worker thread         ===== */
+/* ------------------------------------------ */
+
+static const range_t kNullRange = { NULL, 0 };   /* empty range : no start, size 0 */
+
+typedef struct {   /* one compression job; handed as `void*` to ZSTDMT_compressionJob() */
+    size_t   consumed;                   /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx */
+    size_t   cSize;                      /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx, then set0 by mtctx */
+    ZSTD_pthread_mutex_t job_mutex;      /* Thread-safe - used by mtctx and worker */
+    ZSTD_pthread_cond_t job_cond;        /* Thread-safe - used by mtctx and worker */
+    ZSTDMT_CCtxPool* cctxPool;           /* Thread-safe - used by mtctx and (all) workers */
+    ZSTDMT_bufferPool* bufPool;          /* Thread-safe - used by mtctx and (all) workers */
+    ZSTDMT_seqPool* seqPool;             /* Thread-safe - used by mtctx and (all) workers */
+    serialState_t* serial;               /* Thread-safe - used by mtctx and (all) workers */
+    buffer_t dstBuff;                    /* set by worker (or mtctx), then read by worker & mtctx, then modified by mtctx => no barrier */
+    range_t prefix;                      /* set by mtctx, then read by worker & mtctx => no barrier */
+    range_t src;                         /* set by mtctx, then read by worker & mtctx => no barrier */
+    unsigned jobID;                      /* set by mtctx, then read by worker => no barrier */
+    unsigned firstJob;                   /* set by mtctx, then read by worker => no barrier */
+    unsigned lastJob;                    /* set by mtctx, then read by worker => no barrier */
+    ZSTD_CCtx_params params;             /* set by mtctx, then read by worker => no barrier */
+    const ZSTD_CDict* cdict;             /* set by mtctx, then read by worker => no barrier */
+    unsigned long long fullFrameSize;    /* set by mtctx, then read by worker => no barrier */
+    size_t   dstFlushed;                 /* used only by mtctx */
+    unsigned frameChecksumNeeded;        /* used only by mtctx */
+} ZSTDMT_jobDescription;
+
+/* ZSTDMT_compressionJob() is a POOL_function type */
+void ZSTDMT_compressionJob(void* jobDescription)
+{
+    ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription;
+    ZSTD_CCtx_params jobParams = job->params;   /* do not modify job->params ! copy it, modify the copy */
+    ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(job->cctxPool);
+    rawSeqStore_t rawSeqStore = ZSTDMT_getSeq(job->seqPool);
+    buffer_t dstBuff = job->dstBuff;
+
+    /* ressources */
+    if (cctx==NULL) {
+        job->cSize = ERROR(memory_allocation);
+        goto _endJob;
+    }
+    if (dstBuff.start == NULL) {   /* streaming job : doesn't provide a dstBuffer */
+        dstBuff = ZSTDMT_getBuffer(job->bufPool);
+        if (dstBuff.start==NULL) {
+            job->cSize = ERROR(memory_allocation);
+            goto _endJob;
+        }
+        job->dstBuff = dstBuff;   /* this value can be read in ZSTDMT_flush, when it copies the whole job */
+    }
+    if (jobParams.ldmParams.enableLdm && rawSeqStore.seq == NULL) {
+        job->cSize = ERROR(memory_allocation);
+        goto _endJob;
+    }
+
+    /* Don't compute the checksum for chunks, since we compute it externally,
+     * but write it in the header.
+     */
+    if (job->jobID != 0) jobParams.fParams.checksumFlag = 0;
+    /* Don't run LDM for the chunks, since we handle it externally */
+    jobParams.ldmParams.enableLdm = 0;
+
+
+    /* init */
+    if (job->cdict) {
+        size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, job->cdict, jobParams, job->fullFrameSize);
+        assert(job->firstJob);  /* only allowed for first job */
+        if (ZSTD_isError(initError)) { job->cSize = initError; goto _endJob; }
+    } else {  /* srcStart points at reloaded section */
+        U64 const pledgedSrcSize = job->firstJob ? job->fullFrameSize : job->src.size;
+        {   size_t const forceWindowError = ZSTD_CCtxParam_setParameter(&jobParams, ZSTD_p_forceMaxWindow, !job->firstJob);
+            if (ZSTD_isError(forceWindowError)) {
+                job->cSize = forceWindowError;
+                goto _endJob;
+        }   }
+        {   size_t const initError = ZSTD_compressBegin_advanced_internal(cctx,
+                                        job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */
+                                        ZSTD_dtlm_fast,
+                                        NULL, /*cdict*/
+                                        jobParams, pledgedSrcSize);
+            if (ZSTD_isError(initError)) {
+                job->cSize = initError;
+                goto _endJob;
+    }   }   }
+
+    /* Perform serial step as early as possible, but after CCtx initialization */
+    ZSTDMT_serialState_update(job->serial, cctx, rawSeqStore, job->src, job->jobID);
+
+    if (!job->firstJob) {  /* flush and overwrite frame header when it's not first job */
+        size_t const hSize = ZSTD_compressContinue(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0);
+        if (ZSTD_isError(hSize)) { job->cSize = hSize; /* save error code */ goto _endJob; }
+        DEBUGLOG(5, "ZSTDMT_compressionJob: flush and overwrite %u bytes of frame header (not first job)", (U32)hSize);
+        ZSTD_invalidateRepCodes(cctx);
+    }
+
+    /* compress */
+    {   size_t const chunkSize = 4*ZSTD_BLOCKSIZE_MAX;
+        int const nbChunks = (int)((job->src.size + (chunkSize-1)) / chunkSize);
+        const BYTE* ip = (const BYTE*) job->src.start;
+        BYTE* const ostart = (BYTE*)dstBuff.start;
+        BYTE* op = ostart;
+        BYTE* oend = op + dstBuff.capacity;
+        int chunkNb;
+        if (sizeof(size_t) > sizeof(int)) assert(job->src.size < ((size_t)INT_MAX) * chunkSize);   /* check overflow */
+        DEBUGLOG(5, "ZSTDMT_compressionJob: compress %u bytes in %i blocks", (U32)job->src.size, nbChunks);
+        assert(job->cSize == 0);
+        for (chunkNb = 1; chunkNb < nbChunks; chunkNb++) {
+            size_t const cSize = ZSTD_compressContinue(cctx, op, oend-op, ip, chunkSize);
+            if (ZSTD_isError(cSize)) { job->cSize = cSize; goto _endJob; }
+            ip += chunkSize;
+            op += cSize; assert(op < oend);
+            /* stats */
+            ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
+            job->cSize += cSize;
+            job->consumed = chunkSize * chunkNb;
+            DEBUGLOG(5, "ZSTDMT_compressionJob: compress new block : cSize==%u bytes (total: %u)",
+                        (U32)cSize, (U32)job->cSize);
+            ZSTD_pthread_cond_signal(&job->job_cond);   /* warns some more data is ready to be flushed */
+            ZSTD_pthread_mutex_unlock(&job->job_mutex);
+        }
+        /* last block */
+        assert(chunkSize > 0); assert((chunkSize & (chunkSize - 1)) == 0);  /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */
+        if ((nbChunks > 0) | job->lastJob /*must output a "last block" flag*/ ) {
+            size_t const lastBlockSize1 = job->src.size & (chunkSize-1);
+            size_t const lastBlockSize = ((lastBlockSize1==0) & (job->src.size>=chunkSize)) ? chunkSize : lastBlockSize1;
+            size_t const cSize = (job->lastJob) ?
+                 ZSTD_compressEnd     (cctx, op, oend-op, ip, lastBlockSize) :
+                 ZSTD_compressContinue(cctx, op, oend-op, ip, lastBlockSize);
+            if (ZSTD_isError(cSize)) { job->cSize = cSize; goto _endJob; }
+            /* stats */
+            ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
+            job->cSize += cSize;
+            ZSTD_pthread_mutex_unlock(&job->job_mutex);
+    }   }
+
+_endJob:
+    ZSTDMT_serialState_ensureFinished(job->serial, job->jobID, job->cSize);
+    if (job->prefix.size > 0)
+        DEBUGLOG(5, "Finished with prefix: %zx", (size_t)job->prefix.start);
+    DEBUGLOG(5, "Finished with source: %zx", (size_t)job->src.start);
+    /* release resources */
+    ZSTDMT_releaseSeq(job->seqPool, rawSeqStore);
+    ZSTDMT_releaseCCtx(job->cctxPool, cctx);
+    /* report */
+    ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
+    job->consumed = job->src.size;
+    ZSTD_pthread_cond_signal(&job->job_cond);
+    ZSTD_pthread_mutex_unlock(&job->job_mutex);
+}
+
+
+/* ------------------------------------------ */
+/* =====   Multi-threaded compression   ===== */
+/* ------------------------------------------ */
+
+typedef struct {   /* input staging state of the streaming API (member `inBuff` of ZSTDMT_CCtx_s) */
+    range_t prefix;         /* read-only non-owned prefix buffer */
+    buffer_t buffer;        /* current input area ; reset to g_nullBuffer whenever a stream is (re)initialized */
+    size_t filled;          /* presumably nb of bytes already loaded into `buffer` ; added to "ingested" by ZSTDMT_getFrameProgression() */
+} inBuff_t;
+
+typedef struct {
+  BYTE* buffer;     /* The round input buffer. All jobs get references
+                     * to pieces of the buffer. ZSTDMT_tryGetInputRange()
+                     * handles handing out job input buffers, and makes
+                     * sure it doesn't overlap with any pieces still in use.
+                     */
+  size_t capacity;  /* The capacity of buffer. */
+  size_t pos;       /* The position of the current inBuff in the round
+                     * buffer. Updated past the end of the inBuff once
+                     * the inBuff is sent to the worker thread.
+                     * pos <= capacity.
+                     */
+} roundBuff_t;
+
+static const roundBuff_t kNullRoundBuff = {NULL, 0, 0};   /* {buffer, capacity, pos} : no storage attached */
+
+struct ZSTDMT_CCtx_s {   /* multi-threaded compression context */
+    POOL_ctx* factory;   /* worker thread pool ; jobs are posted to it with POOL_add() */
+    ZSTDMT_jobDescription* jobs;   /* table of (jobIDMask+1) job descriptors */
+    ZSTDMT_bufferPool* bufPool;   /* pool of buffers ; job output buffers (dstBuff) are released into it */
+    ZSTDMT_CCtxPool* cctxPool;   /* pool of compression contexts ; cctx[0] also serves the single-thread fallbacks */
+    ZSTDMT_seqPool* seqPool;   /* pool handed to the serial state and to every job */
+    ZSTD_CCtx_params params;   /* sticky parameters : the init / compress entry points start from a copy of them */
+    size_t targetSectionSize;   /* streaming : job size, from params.jobSize or ZSTDMT_computeTargetJobLog() ; never below targetPrefixSize */
+    size_t targetPrefixSize;   /* streaming : 1 << ZSTDMT_computeOverlapLog(params) */
+    int jobReady;        /* 1 => one job is already prepared, but pool has shortage of workers. Don't create a new job. */
+    inBuff_t inBuff;   /* input staging state ; reset at every stream initialization */
+    roundBuff_t roundBuff;   /* storage sized by ZSTDMT_initCStream_internal() ; freed by ZSTDMT_freeCCtx() */
+    serialState_t serial;   /* state passed to every job of a frame ; carries the checksum state (xxhState) */
+    unsigned singleBlockingThread;   /* set at stream init when pledgedSrcSize <= ZSTDMT_JOBSIZE_MIN : initialization is then delegated to cctxPool->cctx[0] */
+    unsigned jobIDMask;   /* nb of entries in `jobs` minus 1 ; the table size is a power of 2, so (jobID & jobIDMask) selects a slot */
+    unsigned doneJobID;   /* first job ID not yet waited for (see ZSTDMT_waitForAllJobsCompleted()) */
+    unsigned nextJobID;   /* presumably the ID that the next job will receive ; progression is collected over [doneJobID, nextJobID + jobReady) */
+    unsigned frameEnded;   /* reset to 0 at stream init ; NOTE(review): set elsewhere, presumably once the end of frame has been issued */
+    unsigned allJobsCompleted;   /* 0 while a streaming frame is in progress ; 1 after creation or ZSTDMT_releaseAllJobResources() */
+    unsigned long long frameContentSize;   /* pledgedSrcSize received at stream init */
+    unsigned long long consumed;   /* input total reported by ZSTDMT_getFrameProgression(), before adding the jobs still listed in the table */
+    unsigned long long produced;   /* same as `consumed`, for output */
+    ZSTD_customMem cMem;   /* allocator used for the context and for what it owns */
+    ZSTD_CDict* cdictLocal;   /* dictionary created (by copy) at stream init ; owned : freed by ZSTDMT_freeCCtx() */
+    const ZSTD_CDict* cdict;   /* dictionary in use : either cdictLocal, or the cdict received from the caller */
+};
+
+/* ZSTDMT_freeJobsTable() :
+ * tear down the mutex and the condition variable of each of the nbJobs entries, then release the table. Accepts NULL. */
+static void ZSTDMT_freeJobsTable(ZSTDMT_jobDescription* jobTable, U32 nbJobs, ZSTD_customMem cMem)
+{
+    ZSTDMT_jobDescription* entry = jobTable;
+    U32 remaining = nbJobs;
+    if (jobTable == NULL) return;
+    for ( ; remaining > 0; remaining--, entry++) { ZSTD_pthread_mutex_destroy(&entry->job_mutex); ZSTD_pthread_cond_destroy(&entry->job_cond); }
+    ZSTD_free(jobTable, cMem);
+}
+
+/* ZSTDMT_createJobsTable() :
+ * allocate and init a job table (one mutex and one condition variable per entry).
+ * update *nbJobsPtr to the size of the table : the power of 2 strictly greater than the value received.
+ * @return : the table, or NULL if allocation or mutex/cond initialization failed (the table is then already freed) */
+static ZSTDMT_jobDescription* ZSTDMT_createJobsTable(U32* nbJobsPtr, ZSTD_customMem cMem)
+{
+    U32 const nbJobsLog2 = ZSTD_highbit32(*nbJobsPtr) + 1;
+    U32 const nbJobs = (U32)1 << nbJobsLog2;   /* unsigned shift : `1 << 31` would overflow a signed int */
+    U32 jobNb;
+    ZSTDMT_jobDescription* const jobTable = (ZSTDMT_jobDescription*) ZSTD_calloc(nbJobs * sizeof(ZSTDMT_jobDescription), cMem);
+    int initError = 0;
+    if (jobTable==NULL) return NULL;
+    *nbJobsPtr = nbJobs;
+    for (jobNb=0; jobNb<nbJobs; jobNb++) {
+        initError |= ZSTD_pthread_mutex_init(&jobTable[jobNb].job_mutex, NULL);
+        initError |= ZSTD_pthread_cond_init(&jobTable[jobNb].job_cond, NULL);
+    }
+    if (initError != 0) {
+        ZSTDMT_freeJobsTable(jobTable, nbJobs, cMem);
+        return NULL;
+    }
+    return jobTable;
+}
+
+/* ZSTDMT_expandJobsTable() : replace the job table by a larger one when it cannot hold nbWorkers+2 jobs ; never shrinks it */
+static size_t ZSTDMT_expandJobsTable (ZSTDMT_CCtx* mtctx, U32 nbWorkers) {
+    U32 wantedSize = nbWorkers + 2;
+    if (wantedSize <= mtctx->jobIDMask+1) return 0;   /* enough job capacity already */
+    ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem);
+    mtctx->jobIDMask = 0;
+    mtctx->jobs = ZSTDMT_createJobsTable(&wantedSize, mtctx->cMem);   /* updates wantedSize to the real table size */
+    if (mtctx->jobs==NULL) return ERROR(memory_allocation);
+    assert((wantedSize != 0) && ((wantedSize & (wantedSize - 1)) == 0));  /* ensure the table size is a power of 2 */
+    mtctx->jobIDMask = wantedSize - 1;
+    return 0;
+}
+
+
+/* ZSTDMT_CCtxParam_setNbWorkers():
+ * Internal use only. Caps nbWorkers at ZSTDMT_NBWORKERS_MAX, resets overlapSizeLog and jobSize, returns the capped value. */
+size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers)
+{
+    unsigned const cappedWorkers = (nbWorkers > ZSTDMT_NBWORKERS_MAX) ? ZSTDMT_NBWORKERS_MAX : nbWorkers;
+    params->nbWorkers = cappedWorkers;
+    params->overlapSizeLog = ZSTDMT_OVERLAPLOG_DEFAULT;
+    params->jobSize = 0;
+    return cappedWorkers;
+}
+
+/* ZSTDMT_createCCtx_advanced() : nbWorkers gets capped at ZSTDMT_NBWORKERS_MAX.
+ * @return : NULL when nbWorkers==0, when cMem provides only one of its two functions, or when a resource cannot be created */
+ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, ZSTD_customMem cMem)
+{
+    ZSTDMT_CCtx* mtctx;
+    U32 nbJobs;
+    int initError;
+    DEBUGLOG(3, "ZSTDMT_createCCtx_advanced (nbWorkers = %u)", nbWorkers);
+    if (nbWorkers < 1) return NULL;
+    nbWorkers = MIN(nbWorkers , ZSTDMT_NBWORKERS_MAX);
+    nbJobs = nbWorkers + 2;   /* derived from the capped value : the raw request could be huge, or make the sum wrap around */
+    if ((cMem.customAlloc!=NULL) ^ (cMem.customFree!=NULL)) return NULL;   /* invalid custom allocator */
+    mtctx = (ZSTDMT_CCtx*) ZSTD_calloc(sizeof(ZSTDMT_CCtx), cMem);
+    if (!mtctx) return NULL;
+    mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, cMem);   /* created first : nothing else needs unwinding if it fails */
+    if (mtctx->jobs == NULL) { ZSTD_free(mtctx, cMem); return NULL; }   /* ZSTDMT_freeCCtx() walks the table, so it cannot be used without one */
+    assert(nbJobs > 0); assert((nbJobs & (nbJobs - 1)) == 0);  /* ensure nbJobs is a power of 2 */
+    mtctx->jobIDMask = nbJobs - 1;
+    ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers);
+    mtctx->cMem = cMem;
+    mtctx->allJobsCompleted = 1;
+    mtctx->factory = POOL_create_advanced(nbWorkers, 0, cMem);
+    mtctx->bufPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    mtctx->cctxPool = ZSTDMT_createCCtxPool(nbWorkers, cMem);
+    mtctx->seqPool = ZSTDMT_createSeqPool(nbWorkers, cMem);
+    initError = ZSTDMT_serialState_init(&mtctx->serial);
+    mtctx->roundBuff = kNullRoundBuff;
+    if (!mtctx->factory | !mtctx->bufPool | !mtctx->cctxPool | !mtctx->seqPool | initError) {
+        ZSTDMT_freeCCtx(mtctx);
+        return NULL;
+    }
+    DEBUGLOG(3, "mt_cctx created, for %u threads", nbWorkers);
+    return mtctx;
+}
+
+/* ZSTDMT_createCCtx() : same as ZSTDMT_createCCtx_advanced(), with ZSTD_defaultCMem as allocator */
+ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers)
+{
+    return ZSTDMT_createCCtx_advanced(nbWorkers, ZSTD_defaultCMem);
+}
+
+/* ZSTDMT_releaseAllJobResources() :
+ * note : ensure all workers are killed first ! Job descriptors get wiped, except their mutex/cond, which stay usable. */
+static void ZSTDMT_releaseAllJobResources(ZSTDMT_CCtx* mtctx)
+{
+    unsigned jobID;
+    DEBUGLOG(3, "ZSTDMT_releaseAllJobResources");
+    for (jobID=0; (mtctx->jobs != NULL) && (jobID <= mtctx->jobIDMask); jobID++) {   /* no table => nothing to release */
+        ZSTD_pthread_mutex_t const mutex = mtctx->jobs[jobID].job_mutex;   /* initialized once, by ZSTDMT_createJobsTable() : */
+        ZSTD_pthread_cond_t const cond = mtctx->jobs[jobID].job_cond;      /* zeroing them would leave uninitialized sync objects */
+        DEBUGLOG(4, "job%02u: release dst address %08X", jobID, (U32)(size_t)mtctx->jobs[jobID].dstBuff.start);
+        ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].dstBuff);
+        memset(&mtctx->jobs[jobID], 0, sizeof(mtctx->jobs[jobID]));   /* clears every field, dstBuff and cSize included */
+        mtctx->jobs[jobID].job_mutex = mutex; mtctx->jobs[jobID].job_cond = cond;
+    }
+    mtctx->inBuff.buffer = g_nullBuffer; mtctx->inBuff.filled = 0;
+    mtctx->allJobsCompleted = 1;
+}
+
+/* ZSTDMT_waitForAllJobsCompleted() : block until each job from doneJobID up to nextJobID has consumed its whole input */
+static void ZSTDMT_waitForAllJobsCompleted(ZSTDMT_CCtx* mtctx)
+{
+    DEBUGLOG(4, "ZSTDMT_waitForAllJobsCompleted");
+    for ( ; mtctx->doneJobID < mtctx->nextJobID ; mtctx->doneJobID++) {
+        ZSTDMT_jobDescription* const job = &mtctx->jobs[mtctx->doneJobID & mtctx->jobIDMask];
+        ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
+        while (job->consumed < job->src.size) {   /* we want to block when waiting for data to flush */
+            DEBUGLOG(5, "waiting for jobCompleted signal from job %u", mtctx->doneJobID);
+            ZSTD_pthread_cond_wait(&job->job_cond, &job->job_mutex);
+        }
+        ZSTD_pthread_mutex_unlock(&job->job_mutex);
+    }
+}
+
+size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx)   /* releases mtctx and the resources it owns ; always returns 0 */
+{
+    if (mtctx==NULL) return 0;   /* compatible with free on NULL */
+    POOL_free(mtctx->factory);   /* stop and free worker threads */
+    ZSTDMT_releaseAllJobResources(mtctx);  /* release job resources into pools first */
+    ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem);   /* the table holds jobIDMask+1 entries */
+    ZSTDMT_freeBufferPool(mtctx->bufPool);
+    ZSTDMT_freeCCtxPool(mtctx->cctxPool);
+    ZSTDMT_freeSeqPool(mtctx->seqPool);
+    ZSTDMT_serialState_free(&mtctx->serial);
+    ZSTD_freeCDict(mtctx->cdictLocal);   /* only the locally created dictionary is freed ; mtctx->cdict is left alone */
+    if (mtctx->roundBuff.buffer)
+        ZSTD_free(mtctx->roundBuff.buffer, mtctx->cMem);
+    ZSTD_free(mtctx, mtctx->cMem);   /* last : every release above still reads from mtctx */
+    return 0;
+}
+
+size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx)
+{   /* memory footprint : the context itself, plus the resources it references. Supports sizeof NULL (=> 0). */
+    size_t total = 0;
+    if (mtctx == NULL) return total;
+    total += sizeof(*mtctx) + (mtctx->jobIDMask+1) * sizeof(ZSTDMT_jobDescription);   /* context + job table */
+    total += POOL_sizeof(mtctx->factory);
+    total += ZSTDMT_sizeof_bufferPool(mtctx->bufPool);
+    total += ZSTDMT_sizeof_CCtxPool(mtctx->cctxPool);
+    total += ZSTDMT_sizeof_seqPool(mtctx->seqPool);
+    total += ZSTD_sizeof_CDict(mtctx->cdictLocal) + mtctx->roundBuff.capacity;
+    return total;
+}
+
+/* Internal only.
+ * Stores `value` into `params`, after bringing it into the range accepted for `parameter` :
+ * jobSize is either 0 (automatic) or within [ZSTDMT_JOBSIZE_MIN, ZSTDMT_JOBSIZE_MAX] ; overlapSectionLog is at most 9.
+ * @return : the value actually stored, or an error code if `parameter` is not supported */
+size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params,
+                                ZSTDMT_parameter parameter, unsigned value) {
+    DEBUGLOG(4, "ZSTDMT_CCtxParam_setMTCtxParameter");
+    if (parameter == ZSTDMT_p_jobSize) {
+        DEBUGLOG(4, "ZSTDMT_CCtxParam_setMTCtxParameter : set jobSize to %u", value);
+        if (value != 0) {   /* value==0 => automatic job size, stored as is */
+            if (value < ZSTDMT_JOBSIZE_MIN) value = ZSTDMT_JOBSIZE_MIN;
+            if (value > ZSTDMT_JOBSIZE_MAX) value = ZSTDMT_JOBSIZE_MAX;
+        }
+        params->jobSize = value;
+        return value;
+    }
+    if (parameter == ZSTDMT_p_overlapSectionLog) {
+        unsigned const overlapLog = (value > 9) ? 9 : value;
+        DEBUGLOG(4, "ZSTDMT_p_overlapSectionLog : %u", overlapLog);
+        params->overlapSizeLog = overlapLog;
+        return overlapLog;
+    }
+    return ERROR(parameter_unsupported);
+}
+
+/* ZSTDMT_setMTCtxParameter() :
+ * set one multi-threading parameter within the sticky parameters of mtctx.
+ * Supported parameters : ZSTDMT_p_jobSize and ZSTDMT_p_overlapSectionLog.
+ * @return : the value actually stored (it may have been adjusted),
+ *           or an error code if the parameter is not supported */
+size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned value)
+{
+    int const isSupported = (parameter == ZSTDMT_p_jobSize)
+                          | (parameter == ZSTDMT_p_overlapSectionLog);
+    DEBUGLOG(4, "ZSTDMT_setMTCtxParameter");
+    if (!isSupported) return ERROR(parameter_unsupported);
+    return ZSTDMT_CCtxParam_setMTCtxParameter(&mtctx->params, parameter, value);
+}
+
+/* ZSTDMT_getMTCtxParameter() : read a multi-threading parameter back from the sticky parameters of mtctx.
+ * @return : 0 with the result stored in *value, or an error code (*value untouched) for an unsupported parameter */
+size_t ZSTDMT_getMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned* value)
+{
+    if (parameter == ZSTDMT_p_jobSize) {
+        *value = mtctx->params.jobSize;
+        return 0;
+    }
+    if (parameter == ZSTDMT_p_overlapSectionLog) {
+        *value = mtctx->params.overlapSizeLog;
+        return 0;
+    }
+    return ERROR(parameter_unsupported);
+}
+
+/* ZSTDMT_initJobCCtxParams() :
+ * build the parameter set used by a compression job :
+ * compression level, cParams and fParams are inherited from `params`,
+ * every other field is zeroed (callers assert nbWorkers == 0 on the result). */
+static ZSTD_CCtx_params ZSTDMT_initJobCCtxParams(ZSTD_CCtx_params const params)
+{
+    ZSTD_CCtx_params perJob;
+    memset(&perJob, 0, sizeof(perJob));   /* default value of every field is 0 */
+    perJob.compressionLevel = params.compressionLevel;
+    perJob.fParams = params.fParams;
+    perJob.cParams = params.cParams;
+    return perJob;
+}
+
+
+/* ZSTDMT_resize() :
+ * adapt the thread pool, the job table and the three resource pools to nbWorkers, in this order,
+ * stopping at the first step that fails ; nbWorkers is recorded in mtctx->params only when all succeed.
+ * @return : error code if fails, 0 on success */
+static size_t ZSTDMT_resize(ZSTDMT_CCtx* mtctx, unsigned nbWorkers)
+{
+    size_t const allocFailed = ERROR(memory_allocation);
+    if (POOL_resize(mtctx->factory, nbWorkers)) return allocFailed;
+    CHECK_F( ZSTDMT_expandJobsTable(mtctx, nbWorkers) );
+    if ((mtctx->bufPool = ZSTDMT_expandBufferPool(mtctx->bufPool, nbWorkers)) == NULL) return allocFailed;
+    if ((mtctx->cctxPool = ZSTDMT_expandCCtxPool(mtctx->cctxPool, nbWorkers)) == NULL) return allocFailed;
+    if ((mtctx->seqPool = ZSTDMT_expandSeqPool(mtctx->seqPool, nbWorkers)) == NULL) return allocFailed;
+    ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers);   /* also resets overlapSizeLog and jobSize */
+    return 0;
+}
+
+
+/*! ZSTDMT_updateCParams_whileCompressing() :
+ *  Refreshes the compression level and the compression parameters derived from cctxParams,
+ *  with the exception of windowLog, which keeps its current value to remain compatible with current frame.
+ *  New parameters will be applied to next compression job. */
+void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams)
+{
+    ZSTD_compressionParameters refreshed = ZSTD_getCParamsFromCCtxParams(cctxParams, 0, 0);
+    DEBUGLOG(5, "ZSTDMT_updateCParams_whileCompressing (level:%i)",
+                cctxParams->compressionLevel);
+    /* carry the current windowLog over before publishing the new set */
+    refreshed.windowLog = mtctx->params.cParams.windowLog;   /* Do not modify windowLog while compressing */
+    /* only the sticky parameters are touched here */
+    mtctx->params.compressionLevel = cctxParams->compressionLevel;
+    mtctx->params.cParams = refreshed;
+}
+
+/* ZSTDMT_getFrameProgression():
+ * tells how much data has been consumed (input) and produced (output) for current frame.
+ * able to count progression inside worker threads.
+ * Note : mutex will be acquired during statistics collection. */
+ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx)
+{
+    ZSTD_frameProgression fps;
+    DEBUGLOG(6, "ZSTDMT_getFrameProgression");
+    fps.consumed = mtctx->consumed;   /* start from the totals kept in the context ... */
+    fps.produced = mtctx->produced;
+    fps.ingested = mtctx->consumed + mtctx->inBuff.filled;   /* ... input waiting in inBuff counts as ingested only */
+    {   unsigned jobNb;   /* ... then add the jobs listed in the table, a prepared-but-not-posted job (jobReady) included */
+        unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady; assert(mtctx->jobReady <= 1);
+        DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)",
+                    mtctx->doneJobID, lastJobNb, (unsigned)mtctx->jobReady);
+        for (jobNb = mtctx->doneJobID ; jobNb < lastJobNb ; jobNb++) {
+            unsigned const wJobID = jobNb & mtctx->jobIDMask;
+            ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex);   /* same locking macro as the rest of the file ; workers update these fields under this mutex */
+            {   size_t const cResult = mtctx->jobs[wJobID].cSize;
+                size_t const produced = ZSTD_isError(cResult) ? 0 : cResult;   /* cSize holds an error code when the job failed */
+                fps.consumed += mtctx->jobs[wJobID].consumed;
+                fps.ingested += mtctx->jobs[wJobID].src.size;
+                fps.produced += produced;
+            }
+            ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
+        }
+    }
+    return fps;
+}
+
+
+/* ------------------------------------------ */
+/* =====   Multi-threaded compression   ===== */
+/* ------------------------------------------ */
+
+static size_t ZSTDMT_computeTargetJobLog(ZSTD_CCtx_params const params)
+{   /* log2 of the automatic job size : max(20, windowLog+2), or max(21, chainLog+4) when long distance matching is enabled */
+    unsigned const ldmJobLog = MAX(21, params.cParams.chainLog + 4);
+    unsigned const stdJobLog = MAX(20, params.cParams.windowLog + 2);
+    return params.ldmParams.enableLdm ? ldmJobLog : stdJobLog;
+}
+
+static size_t ZSTDMT_computeOverlapLog(ZSTD_CCtx_params const params)
+{   /* log2 of the overlap size : a reference log, lowered by 1 for each step of overlapSizeLog below 9 */
+    unsigned const reductionLog = (params.overlapSizeLog>9) ? 0 : 9-params.overlapSizeLog;
+    if (!params.ldmParams.enableLdm)
+        return reductionLog >= 9 ? 0 : (params.cParams.windowLog - reductionLog);
+    return (MIN(params.cParams.windowLog, ZSTDMT_computeTargetJobLog(params) - 2) - reductionLog);
+}
+
+/* nb of jobs of a one-shot compression, from the automatic job size target and the nb of workers */
+static unsigned ZSTDMT_computeNbJobs(ZSTD_CCtx_params params, size_t srcSize, unsigned nbWorkers) {
+    assert(nbWorkers>0);
+    {   size_t const targetSize = (size_t)1 << ZSTDMT_computeTargetJobLog(params);
+        size_t const passCapacity = (targetSize << 2) * nbWorkers;   /* one pass = nbWorkers jobs of max size (4x target) */
+        unsigned const nbPasses = (unsigned)(srcSize / passCapacity) + 1;
+        unsigned const nbSections = (unsigned)(srcSize / targetSize) + 1;
+        /* several passes => whole passes ; one pass => one job per target-size section, capped by nbWorkers */
+        if (nbPasses > 1) return nbPasses * nbWorkers;
+        return MIN(nbSections, nbWorkers);
+}   }
+
+/* ZSTDMT_compress_advanced_internal() :
+ * This is a blocking function : it will only give back control to caller after finishing its compression job.
+ * @return : size of the frame written into dst, or an error code (the content of dst is then unspecified) */
+static size_t ZSTDMT_compress_advanced_internal(
+                ZSTDMT_CCtx* mtctx,
+                void* dst, size_t dstCapacity,
+          const void* src, size_t srcSize,
+          const ZSTD_CDict* cdict,
+                ZSTD_CCtx_params params)
+{
+    ZSTD_CCtx_params const jobParams = ZSTDMT_initJobCCtxParams(params);
+    size_t const overlapSize = (size_t)1 << ZSTDMT_computeOverlapLog(params);
+    unsigned const nbJobs = ZSTDMT_computeNbJobs(params, srcSize, params.nbWorkers);
+    size_t const proposedJobSize = (srcSize + (nbJobs-1)) / nbJobs;
+    size_t const avgJobSize = (((proposedJobSize-1) & 0x1FFFF) < 0x7FFF) ? proposedJobSize + 0xFFFF : proposedJobSize;   /* avoid too small last block */
+    const char* const srcStart = (const char*)src;
+    size_t remainingSrcSize = srcSize;
+    unsigned const compressWithinDst = (dstCapacity >= ZSTD_compressBound(srcSize)) ? nbJobs : (unsigned)(dstCapacity / ZSTD_compressBound(avgJobSize));  /* presumes avgJobSize >= 256 KB, which should be the case */
+    size_t frameStartPos = 0, dstBufferPos = 0;
+    assert(jobParams.nbWorkers == 0);
+    assert(mtctx->cctxPool->totalCCtx == params.nbWorkers);
+
+    params.jobSize = (U32)avgJobSize;
+    DEBUGLOG(4, "ZSTDMT_compress_advanced_internal: nbJobs=%2u (rawSize=%u bytes; fixedSize=%u) ",
+                nbJobs, (U32)proposedJobSize, (U32)avgJobSize);
+
+    if ((nbJobs==1) | (params.nbWorkers<=1)) {   /* fallback to single-thread mode : this is a blocking invocation anyway */
+        ZSTD_CCtx* const cctx = mtctx->cctxPool->cctx[0];
+        DEBUGLOG(4, "ZSTDMT_compress_advanced_internal: fallback to single-thread mode");
+        if (cdict) return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, jobParams.fParams);
+        return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, NULL, 0, jobParams);
+    }
+
+    assert(avgJobSize >= 256 KB);  /* condition for ZSTD_compressBound(A) + ZSTD_compressBound(B) <= ZSTD_compressBound(A+B), required to compress directly into Dst (no additional buffer) */
+    ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(avgJobSize) );
+    if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, avgJobSize))
+        return ERROR(memory_allocation);
+
+    CHECK_F( ZSTDMT_expandJobsTable(mtctx, nbJobs) );  /* only expands if necessary */
+
+    {   unsigned u;   /* describe and post all jobs */
+        for (u=0; u<nbJobs; u++) {
+            size_t const jobSize = MIN(remainingSrcSize, avgJobSize);
+            size_t const dstBufferCapacity = ZSTD_compressBound(jobSize);
+            buffer_t const dstAsBuffer = { (char*)dst + dstBufferPos, dstBufferCapacity };
+            buffer_t const dstBuffer = u < compressWithinDst ? dstAsBuffer : g_nullBuffer;   /* the first compressWithinDst jobs write straight into dst */
+            size_t dictSize = u ? overlapSize : 0;   /* every job but the first one uses the end of the previous section as prefix */
+
+            mtctx->jobs[u].prefix.start = srcStart + frameStartPos - dictSize;
+            mtctx->jobs[u].prefix.size = dictSize;
+            mtctx->jobs[u].src.start = srcStart + frameStartPos;
+            mtctx->jobs[u].src.size = jobSize; assert(jobSize > 0);  /* avoid job.src.size == 0 */
+            mtctx->jobs[u].consumed = 0;
+            mtctx->jobs[u].cSize = 0;
+            mtctx->jobs[u].cdict = (u==0) ? cdict : NULL;
+            mtctx->jobs[u].fullFrameSize = srcSize;
+            mtctx->jobs[u].params = jobParams;
+            /* do not calculate checksum within sections, but write it in header for first section */
+            mtctx->jobs[u].dstBuff = dstBuffer;
+            mtctx->jobs[u].cctxPool = mtctx->cctxPool;
+            mtctx->jobs[u].bufPool = mtctx->bufPool;
+            mtctx->jobs[u].seqPool = mtctx->seqPool;
+            mtctx->jobs[u].serial = &mtctx->serial;
+            mtctx->jobs[u].jobID = u;
+            mtctx->jobs[u].firstJob = (u==0);
+            mtctx->jobs[u].lastJob = (u==nbJobs-1);
+
+            DEBUGLOG(5, "ZSTDMT_compress_advanced_internal: posting job %u  (%u bytes)", u, (U32)jobSize);
+            DEBUG_PRINTHEX(6, mtctx->jobs[u].prefix.start, 12);
+            POOL_add(mtctx->factory, ZSTDMT_compressionJob, &mtctx->jobs[u]);
+
+            frameStartPos += jobSize;
+            dstBufferPos += dstBufferCapacity;
+            remainingSrcSize -= jobSize;
+    }   }
+
+    /* collect result */
+    {   size_t error = 0, dstPos = 0;
+        unsigned jobID;
+        for (jobID=0; jobID<nbJobs; jobID++) {   /* every job is waited for, even after an error, so that its buffer can be released */
+            DEBUGLOG(5, "waiting for job %u ", jobID);
+            ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[jobID].job_mutex);
+            while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].src.size) {
+                DEBUGLOG(5, "waiting for jobCompleted signal from job %u", jobID);
+                ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex);
+            }
+            ZSTD_pthread_mutex_unlock(&mtctx->jobs[jobID].job_mutex);
+            DEBUGLOG(5, "ready to write job %u ", jobID);
+
+            {   size_t const cSize = mtctx->jobs[jobID].cSize;
+                if (ZSTD_isError(cSize)) error = cSize;
+                if ((!error) && (dstPos + cSize > dstCapacity)) error = ERROR(dstSize_tooSmall);
+                if (jobID) {   /* note : job 0 is written directly at dst, which is correct position */
+                    if (!error)
+                        memmove((char*)dst + dstPos, mtctx->jobs[jobID].dstBuff.start, cSize);  /* may overlap when job compressed within dst */
+                    if (jobID >= compressWithinDst) {  /* job compressed into its own buffer, which must be released */
+                        DEBUGLOG(5, "releasing buffer %u>=%u", jobID, compressWithinDst);
+                        ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].dstBuff);
+                }   }
+                mtctx->jobs[jobID].dstBuff = g_nullBuffer;
+                mtctx->jobs[jobID].cSize = 0;
+                if (!error) dstPos += cSize;   /* cSize may be an error code : never add one to a position */
+            }
+        }  /* for (jobID=0; jobID<nbJobs; jobID++) */
+
+        DEBUGLOG(4, "checksumFlag : %u ", params.fParams.checksumFlag);
+        if ((!error) && params.fParams.checksumFlag) {   /* after an error, keep that error and leave dst alone */
+            U32 const checksum = (U32)XXH64_digest(&mtctx->serial.xxhState);
+            if (dstPos + 4 > dstCapacity) {
+                error = ERROR(dstSize_tooSmall);
+            } else {
+                DEBUGLOG(4, "writing checksum : %08X \n", checksum);
+                MEM_writeLE32((char*)dst + dstPos, checksum);
+                dstPos += 4;
+        }   }
+
+        if (!error) DEBUGLOG(4, "compressed size : %u  ", (U32)dstPos);
+        return error ? error : dstPos;
+    }
+}
+
+/* ZSTDMT_compress_advanced() :
+ * one-shot compression : starts from the sticky parameters of mtctx, overrides cParams, fParams and the
+ * overlap log with the values received, then runs the blocking ZSTDMT_compress_advanced_internal(). */
+size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                         const ZSTD_CDict* cdict,
+                               ZSTD_parameters params,
+                               unsigned overlapLog)
+{
+    ZSTD_CCtx_params effectiveParams = mtctx->params;   /* retrieve sticky params */
+    effectiveParams.overlapSizeLog = overlapLog;
+    effectiveParams.fParams = params.fParams;
+    effectiveParams.cParams = params.cParams;
+    return ZSTDMT_compress_advanced_internal(mtctx, dst, dstCapacity, src, srcSize, cdict, effectiveParams);
+}
+
+
+/* ZSTDMT_compressCCtx() : one-shot compression at `compressionLevel`, without dictionary, with contentSizeFlag set */
+size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx, void* dst, size_t dstCapacity,
+                           const void* src, size_t srcSize, int compressionLevel)
+{
+    ZSTD_parameters levelParams = ZSTD_getParams(compressionLevel, srcSize, 0);
+    U32 overlapLog = ZSTDMT_OVERLAPLOG_DEFAULT;
+    if (compressionLevel >= ZSTD_maxCLevel()) overlapLog = 9;   /* 9 is the largest overlap setting */
+    levelParams.fParams.contentSizeFlag = 1;
+    return ZSTDMT_compress_advanced(mtctx, dst, dstCapacity, src, srcSize, NULL, levelParams, overlapLog);
+}
+
+
+/* ====================================== */
+/* =======      Streaming API     ======= */
+/* ====================================== */
+
+size_t ZSTDMT_initCStream_internal(
+        ZSTDMT_CCtx* mtctx,
+        const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
+        const ZSTD_CDict* cdict, ZSTD_CCtx_params params,
+        unsigned long long pledgedSrcSize)
+{   /* Prepares mtctx for a new streaming frame. At most one of dict / cdict may be provided. Returns an error code on failure. */
+    DEBUGLOG(4, "ZSTDMT_initCStream_internal (pledgedSrcSize=%u, nbWorkers=%u, cctxPool=%u)",
+                (U32)pledgedSrcSize, params.nbWorkers, mtctx->cctxPool->totalCCtx);
+
+    /* params are supposed to be at least partially validated at this point */
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
+
+    /* init */
+    if (params.nbWorkers != mtctx->params.nbWorkers)
+        CHECK_F( ZSTDMT_resize(mtctx, params.nbWorkers) );
+
+    if (params.jobSize > 0 && params.jobSize < ZSTDMT_JOBSIZE_MIN) params.jobSize = ZSTDMT_JOBSIZE_MIN;   /* jobSize==0 (automatic) is left as is */
+    if (params.jobSize > ZSTDMT_JOBSIZE_MAX) params.jobSize = ZSTDMT_JOBSIZE_MAX;
+
+    mtctx->singleBlockingThread = (pledgedSrcSize <= ZSTDMT_JOBSIZE_MIN);  /* do not trigger multi-threading when srcSize is too small */
+    if (mtctx->singleBlockingThread) {
+        ZSTD_CCtx_params const singleThreadParams = ZSTDMT_initJobCCtxParams(params);
+        DEBUGLOG(5, "ZSTDMT_initCStream_internal: switch to single blocking thread mode");
+        assert(singleThreadParams.nbWorkers == 0);
+        return ZSTD_initCStream_internal(mtctx->cctxPool->cctx[0],   /* early exit : none of the multi-threading state below is touched */
+                                         dict, dictSize, cdict,
+                                         singleThreadParams, pledgedSrcSize);
+    }
+
+    DEBUGLOG(4, "ZSTDMT_initCStream_internal: %u workers", params.nbWorkers);
+
+    if (mtctx->allJobsCompleted == 0) {   /* previous compression not correctly finished */
+        ZSTDMT_waitForAllJobsCompleted(mtctx);
+        ZSTDMT_releaseAllJobResources(mtctx);
+        mtctx->allJobsCompleted = 1;
+    }
+
+    mtctx->params = params;   /* the parameters received become the sticky ones */
+    mtctx->frameContentSize = pledgedSrcSize;
+    if (dict) {
+        ZSTD_freeCDict(mtctx->cdictLocal);   /* drop the dictionary created by a previous init, if any */
+        mtctx->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize,
+                                                    ZSTD_dlm_byCopy, dictContentType, /* note : a loadPrefix becomes an internal CDict */
+                                                    params.cParams, mtctx->cMem);
+        mtctx->cdict = mtctx->cdictLocal;
+        if (mtctx->cdictLocal == NULL) return ERROR(memory_allocation);
+    } else {
+        ZSTD_freeCDict(mtctx->cdictLocal);
+        mtctx->cdictLocal = NULL;
+        mtctx->cdict = cdict;   /* may be NULL : no dictionary */
+    }
+
+    mtctx->targetPrefixSize = (size_t)1 << ZSTDMT_computeOverlapLog(params);
+    DEBUGLOG(4, "overlapLog=%u => %u KB", params.overlapSizeLog, (U32)(mtctx->targetPrefixSize>>10));
+    mtctx->targetSectionSize = params.jobSize;
+    if (mtctx->targetSectionSize == 0) {   /* automatic job size */
+        mtctx->targetSectionSize = 1ULL << ZSTDMT_computeTargetJobLog(params);
+    }
+    if (mtctx->targetSectionSize < mtctx->targetPrefixSize) mtctx->targetSectionSize = mtctx->targetPrefixSize;  /* job size must be >= overlap size */
+    DEBUGLOG(4, "Job Size : %u KB (note : set to %u)", (U32)(mtctx->targetSectionSize>>10), params.jobSize);
+    DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->targetSectionSize>>10));
+    ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(mtctx->targetSectionSize));
+    {
+        /* If ldm is enabled we need windowSize space. */
+        size_t const windowSize = mtctx->params.ldmParams.enableLdm ? (1U << mtctx->params.cParams.windowLog) : 0;
+        /* Two buffers of slack, plus extra space for the overlap
+         * This is the minimum slack that LDM works with. One extra because
+         * flush might waste up to targetSectionSize-1 bytes. Another extra
+         * for the overlap (if > 0), then one to fill which doesn't overlap
+         * with the LDM window.
+         */
+        size_t const nbSlackBuffers = 2 + (mtctx->targetPrefixSize > 0);
+        size_t const slackSize = mtctx->targetSectionSize * nbSlackBuffers;
+        /* Compute the total size, and always have enough slack */
+        size_t const nbWorkers = MAX(mtctx->params.nbWorkers, 1);
+        size_t const sectionsSize = mtctx->targetSectionSize * nbWorkers;
+        size_t const capacity = MAX(windowSize, sectionsSize) + slackSize;
+        if (mtctx->roundBuff.capacity < capacity) {   /* the round buffer only grows ; a large enough one is reused as is */
+            if (mtctx->roundBuff.buffer)
+                ZSTD_free(mtctx->roundBuff.buffer, mtctx->cMem);
+            mtctx->roundBuff.buffer = (BYTE*)ZSTD_malloc(capacity, mtctx->cMem);
+            if (mtctx->roundBuff.buffer == NULL) {
+                mtctx->roundBuff.capacity = 0;   /* keep capacity consistent with the missing buffer */
+                return ERROR(memory_allocation);
+            }
+            mtctx->roundBuff.capacity = capacity;
+        }
+    }
+    DEBUGLOG(4, "roundBuff capacity : %u KB", (U32)(mtctx->roundBuff.capacity>>10));
+    mtctx->roundBuff.pos = 0;   /* from here : reset of the per-frame streaming state */
+    mtctx->inBuff.buffer = g_nullBuffer;
+    mtctx->inBuff.filled = 0;
+    mtctx->inBuff.prefix = kNullRange;
+    mtctx->doneJobID = 0;
+    mtctx->nextJobID = 0;
+    mtctx->frameEnded = 0;
+    mtctx->allJobsCompleted = 0;   /* a frame is now in progress */
+    mtctx->consumed = 0;
+    mtctx->produced = 0;
+    if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, mtctx->targetSectionSize))
+        return ERROR(memory_allocation);
+    return 0;
+}
+
+/* ZSTDMT_initCStream_advanced() : start a streaming compression from explicit cParams/fParams, with an optional dictionary buffer */
+size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx,
+                                   const void* dict, size_t dictSize,
+                                   ZSTD_parameters params,
+                                   unsigned long long pledgedSrcSize)
+{
+    ZSTD_CCtx_params streamParams = mtctx->params;  /* sticky params are the starting point */
+    streamParams.fParams = params.fParams;
+    streamParams.cParams = params.cParams;
+    DEBUGLOG(4, "ZSTDMT_initCStream_advanced (pledgedSrcSize=%u)", (U32)pledgedSrcSize);
+    return ZSTDMT_initCStream_internal(mtctx, dict, dictSize, ZSTD_dct_auto, NULL, streamParams, pledgedSrcSize);
+}
+
+/* ZSTDMT_initCStream_usingCDict() : start a streaming compression with a CDict, which also provides the cParams */
+size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx,
+                                     const ZSTD_CDict* cdict,
+                                     ZSTD_frameParameters fParams,
+                                     unsigned long long pledgedSrcSize)
+{
+    ZSTD_CCtx_params streamParams = mtctx->params;   /* sticky params are the starting point */
+    if (cdict==NULL) return ERROR(dictionary_wrong);   /* method incompatible with NULL cdict */
+    streamParams.fParams = fParams;
+    streamParams.cParams = ZSTD_getCParamsFromCDict(cdict);
+    return ZSTDMT_initCStream_internal(mtctx, NULL, 0 /*dictSize*/, ZSTD_dct_auto, cdict, streamParams, pledgedSrcSize);
+}
+
+
+/* ZSTDMT_resetCStream() :
+ * Re-initializes streaming with the params already stored in mtctx, passing neither dict nor cdict.
+ * pledgedSrcSize can be zero == unknown (for the time being) ; prefer using ZSTD_CONTENTSIZE_UNKNOWN,
+ * as `0` might mean "empty" in the future */
+size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize)
+{
+    if (!pledgedSrcSize) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;   /* 0 is remapped to "unknown" before init */
+    return ZSTDMT_initCStream_internal(mtctx, NULL, 0, ZSTD_dct_auto, 0, mtctx->params,
+                                       pledgedSrcSize);
+}
+
+size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel) {   /* init a streaming session from a compression level : no dictionary, source size unknown */
+    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+    ZSTD_CCtx_params cctxParams = mtctx->params;   /* retrieve sticky params */
+    DEBUGLOG(4, "ZSTDMT_initCStream (cLevel=%i)", compressionLevel);
+    cctxParams.cParams = params.cParams;   /* level-derived cParams and fParams replace the stored ones */
+    cctxParams.fParams = params.fParams;
+    return ZSTDMT_initCStream_internal(mtctx, NULL, 0, ZSTD_dct_auto, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN);
+}
+
+
+/* ZSTDMT_writeLastEmptyBlock()
+ * Write a single empty block with an end-of-frame to finish a frame.
+ * Job must be created from streaming variant.
+ * This function is always successful if expected conditions are fulfilled,
+ * except when no output buffer can be obtained : job->cSize is then set to ERROR(memory_allocation). */
+static void ZSTDMT_writeLastEmptyBlock(ZSTDMT_jobDescription* job)
+{
+    assert(job->lastJob == 1);
+    assert(job->src.size == 0);   /* last job is empty -> will be simplified into a last empty block */
+    assert(job->firstJob == 0);   /* cannot be first job, as it also needs to create frame header */
+    assert(job->dstBuff.start == NULL);   /* invoked from streaming variant only (otherwise, dstBuff might be user's output) */
+    job->dstBuff = ZSTDMT_getBuffer(job->bufPool);
+    if (job->dstBuff.start == NULL) {
+      job->cSize = ERROR(memory_allocation);   /* failure is reported through cSize, since the function returns void */
+      return;
+    }
+    assert(job->dstBuff.capacity >= ZSTD_blockHeaderSize);   /* no buffer should ever be that small */
+    job->src = kNullRange;
+    job->cSize = ZSTD_writeLastEmptyBlock(job->dstBuff.start, job->dstBuff.capacity);   /* cSize = size of the block written into dstBuff */
+    assert(!ZSTD_isError(job->cSize));
+    assert(job->consumed == 0);
+}
+
+static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* mtctx, size_t srcSize, ZSTD_EndDirective endOp)
+{   /* Describes the next job (srcSize bytes starting at inBuff.buffer.start) and tries to post it to the worker pool. Every return in this function yields 0. */
+    unsigned const jobID = mtctx->nextJobID & mtctx->jobIDMask;   /* slot within the jobs table */
+    int const endFrame = (endOp == ZSTD_e_end);
+
+    if (mtctx->nextJobID > mtctx->doneJobID + mtctx->jobIDMask) {
+        DEBUGLOG(5, "ZSTDMT_createCompressionJob: will not create new job : table is full");
+        assert((mtctx->nextJobID & mtctx->jobIDMask) == (mtctx->doneJobID & mtctx->jobIDMask));
+        return 0;
+    }
+
+    if (!mtctx->jobReady) {   /* when jobReady is set, the descriptor was already filled by an earlier call that could not post it */
+        BYTE const* src = (BYTE const*)mtctx->inBuff.buffer.start;
+        DEBUGLOG(5, "ZSTDMT_createCompressionJob: preparing job %u to compress %u bytes with %u preload ",
+                    mtctx->nextJobID, (U32)srcSize, (U32)mtctx->inBuff.prefix.size);
+        mtctx->jobs[jobID].src.start = src;
+        mtctx->jobs[jobID].src.size = srcSize;
+        assert(mtctx->inBuff.filled >= srcSize);
+        mtctx->jobs[jobID].prefix = mtctx->inBuff.prefix;
+        mtctx->jobs[jobID].consumed = 0;
+        mtctx->jobs[jobID].cSize = 0;
+        mtctx->jobs[jobID].params = mtctx->params;
+        mtctx->jobs[jobID].cdict = mtctx->nextJobID==0 ? mtctx->cdict : NULL;   /* cdict is only handed to the first job */
+        mtctx->jobs[jobID].fullFrameSize = mtctx->frameContentSize;
+        mtctx->jobs[jobID].dstBuff = g_nullBuffer;
+        mtctx->jobs[jobID].cctxPool = mtctx->cctxPool;
+        mtctx->jobs[jobID].bufPool = mtctx->bufPool;
+        mtctx->jobs[jobID].seqPool = mtctx->seqPool;
+        mtctx->jobs[jobID].serial = &mtctx->serial;
+        mtctx->jobs[jobID].jobID = mtctx->nextJobID;
+        mtctx->jobs[jobID].firstJob = (mtctx->nextJobID==0);
+        mtctx->jobs[jobID].lastJob = endFrame;
+        mtctx->jobs[jobID].frameChecksumNeeded = endFrame && (mtctx->nextJobID>0) && mtctx->params.fParams.checksumFlag;   /* never set when the frame is made of a single job */
+        mtctx->jobs[jobID].dstFlushed = 0;
+
+        /* Update the round buffer pos and clear the input buffer to be reset */
+        mtctx->roundBuff.pos += srcSize;
+        mtctx->inBuff.buffer = g_nullBuffer;
+        mtctx->inBuff.filled = 0;
+        /* Set the prefix */
+        if (!endFrame) {
+            size_t const newPrefixSize = MIN(srcSize, mtctx->targetPrefixSize);
+            mtctx->inBuff.prefix.start = src + srcSize - newPrefixSize;   /* prefix of the following job = tail of this job's input */
+            mtctx->inBuff.prefix.size = newPrefixSize;
+        } else {   /* endFrame==1 => no need for another input buffer */
+            mtctx->inBuff.prefix = kNullRange;
+            mtctx->frameEnded = endFrame;
+            if (mtctx->nextJobID == 0) {
+                /* single job exception : checksum is already calculated directly within worker thread */
+                mtctx->params.fParams.checksumFlag = 0;
+        }   }
+
+        if ( (srcSize == 0)
+          && (mtctx->nextJobID>0)/*single job must also write frame header*/ ) {
+            DEBUGLOG(5, "ZSTDMT_createCompressionJob: creating a last empty block to end frame");
+            assert(endOp == ZSTD_e_end);  /* only possible case : need to end the frame with an empty last block */
+            ZSTDMT_writeLastEmptyBlock(mtctx->jobs + jobID);   /* done inline by the caller's thread : this job is not posted to the pool */
+            mtctx->nextJobID++;
+            return 0;
+        }
+    }
+
+    DEBUGLOG(5, "ZSTDMT_createCompressionJob: posting job %u : %u bytes  (end:%u, jobNb == %u (mod:%u))",
+                mtctx->nextJobID,
+                (U32)mtctx->jobs[jobID].src.size,
+                mtctx->jobs[jobID].lastJob,
+                mtctx->nextJobID,
+                jobID);
+    if (POOL_tryAdd(mtctx->factory, ZSTDMT_compressionJob, &mtctx->jobs[jobID])) {   /* non-zero : the pool accepted the job */
+        mtctx->nextJobID++;
+        mtctx->jobReady = 0;
+    } else {
+        DEBUGLOG(5, "ZSTDMT_createCompressionJob: no worker available for job %u", mtctx->nextJobID);
+        mtctx->jobReady = 1;   /* keep the filled descriptor : a later call will skip preparation and retry posting */
+    }
+    return 0;
+}
+
+
+/*! ZSTDMT_flushProduced() :
+ * `output` : `pos` will be updated with amount of data flushed .
+ * `blockToFlush` : if >0, the function will block and wait if there is no data available to flush . `end` : with ZSTD_e_end, once nothing else is pending, the result is non-zero as long as the frame is not ended .
+ * @return : amount of data remaining within internal buffer, 0 if no more, 1 if unknown but > 0, or an error code */
+static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, unsigned blockToFlush, ZSTD_EndDirective end)
+{
+    unsigned const wJobID = mtctx->doneJobID & mtctx->jobIDMask;   /* table slot of job doneJobID : the only job this call flushes from */
+    DEBUGLOG(5, "ZSTDMT_flushProduced (blocking:%u , job %u <= %u)",
+                blockToFlush, mtctx->doneJobID, mtctx->nextJobID);
+    assert(output->size >= output->pos);
+
+    ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex);
+    if (  blockToFlush
+      && (mtctx->doneJobID < mtctx->nextJobID) ) {
+        assert(mtctx->jobs[wJobID].dstFlushed <= mtctx->jobs[wJobID].cSize);
+        while (mtctx->jobs[wJobID].dstFlushed == mtctx->jobs[wJobID].cSize) {  /* nothing to flush */
+            if (mtctx->jobs[wJobID].consumed == mtctx->jobs[wJobID].src.size) {
+                DEBUGLOG(5, "job %u is completely consumed (%u == %u) => don't wait for cond, there will be none",
+                            mtctx->doneJobID, (U32)mtctx->jobs[wJobID].consumed, (U32)mtctx->jobs[wJobID].src.size);
+                break;
+            }
+            DEBUGLOG(5, "waiting for something to flush from job %u (currently flushed: %u bytes)",
+                        mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed);
+            ZSTD_pthread_cond_wait(&mtctx->jobs[wJobID].job_cond, &mtctx->jobs[wJobID].job_mutex);  /* block when nothing to flush but some to come */
+    }   }
+
+    /* try to flush something */
+    {   size_t cSize = mtctx->jobs[wJobID].cSize;                  /* shared */
+        size_t const srcConsumed = mtctx->jobs[wJobID].consumed;   /* shared */
+        size_t const srcSize = mtctx->jobs[wJobID].src.size;        /* read-only, could be done after mutex lock, but no-declaration-after-statement */
+        ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);   /* from here on, work on the local snapshots taken above */
+        if (ZSTD_isError(cSize)) {
+            DEBUGLOG(5, "ZSTDMT_flushProduced: job %u : compression error detected : %s",
+                        mtctx->doneJobID, ZSTD_getErrorName(cSize));
+            ZSTDMT_waitForAllJobsCompleted(mtctx);
+            ZSTDMT_releaseAllJobResources(mtctx);
+            return cSize;
+        }
+        /* add frame checksum if necessary (can only happen once) */
+        assert(srcConsumed <= srcSize);
+        if ( (srcConsumed == srcSize)   /* job completed -> worker no longer active */
+          && mtctx->jobs[wJobID].frameChecksumNeeded ) {
+            U32 const checksum = (U32)XXH64_digest(&mtctx->serial.xxhState);
+            DEBUGLOG(4, "ZSTDMT_flushProduced: writing checksum : %08X \n", checksum);
+            MEM_writeLE32((char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].cSize, checksum);   /* 4 bytes appended right after the job's compressed output */
+            cSize += 4;
+            mtctx->jobs[wJobID].cSize += 4;  /* can write this shared value, as worker is no longer active */
+            mtctx->jobs[wJobID].frameChecksumNeeded = 0;
+        }
+        if (cSize > 0) {   /* compression is ongoing or completed */
+            size_t const toFlush = MIN(cSize - mtctx->jobs[wJobID].dstFlushed, output->size - output->pos);
+            DEBUGLOG(5, "ZSTDMT_flushProduced: Flushing %u bytes from job %u (completion:%u/%u, generated:%u)",
+                        (U32)toFlush, mtctx->doneJobID, (U32)srcConsumed, (U32)srcSize, (U32)cSize);
+            assert(mtctx->doneJobID < mtctx->nextJobID);
+            assert(cSize >= mtctx->jobs[wJobID].dstFlushed);
+            assert(mtctx->jobs[wJobID].dstBuff.start != NULL);
+            memcpy((char*)output->dst + output->pos,
+                   (const char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].dstFlushed,
+                   toFlush);
+            output->pos += toFlush;
+            mtctx->jobs[wJobID].dstFlushed += toFlush;  /* can write : this value is only used by mtctx */
+
+            if ( (srcConsumed == srcSize)    /* job completed */
+              && (mtctx->jobs[wJobID].dstFlushed == cSize) ) {   /* output buffer fully flushed => free this job position */
+                DEBUGLOG(5, "Job %u completed (%u bytes), moving to next one",
+                        mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed);
+                ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[wJobID].dstBuff);
+                mtctx->jobs[wJobID].dstBuff = g_nullBuffer;
+                mtctx->jobs[wJobID].cSize = 0;   /* ensure this job slot is considered "not started" in future check */
+                mtctx->consumed += srcSize;
+                mtctx->produced += cSize;
+                mtctx->doneJobID++;
+        }   }
+
+        /* return value : how many bytes left in buffer ; fake it to 1 when unknown but >0 */
+        if (cSize > mtctx->jobs[wJobID].dstFlushed) return (cSize - mtctx->jobs[wJobID].dstFlushed);
+        if (srcSize > srcConsumed) return 1;   /* current job not completely compressed */
+    }
+    if (mtctx->doneJobID < mtctx->nextJobID) return 1;   /* some more jobs ongoing */
+    if (mtctx->jobReady) return 1;      /* one job is ready to push, just not yet in the list */
+    if (mtctx->inBuff.filled > 0) return 1;   /* input is not empty, and still needs to be converted into a job */
+    mtctx->allJobsCompleted = mtctx->frameEnded;   /* all jobs are entirely flushed => if this one is last one, frame is completed */
+    if (end == ZSTD_e_end) return !mtctx->frameEnded;  /* for ZSTD_e_end, question becomes : is frame completed ? instead of : are internal buffers fully flushed ? */
+    return 0;   /* internal buffers fully flushed */
+}
+
+/**
+ * Returns the range of data used by the earliest job that is not yet complete.
+ * If the data of the first job is broken up into two segments, we cover both
+ * sections. Returns kNullRange when no job in [doneJobID, nextJobID) still has input left to consume.
+ */
+static range_t ZSTDMT_getInputDataInUse(ZSTDMT_CCtx* mtctx)
+{
+    unsigned const firstJobID = mtctx->doneJobID;
+    unsigned const lastJobID = mtctx->nextJobID;
+    unsigned jobID;
+
+    for (jobID = firstJobID; jobID < lastJobID; ++jobID) {
+        unsigned const wJobID = jobID & mtctx->jobIDMask;   /* slot within the jobs table */
+        size_t consumed;
+
+        ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex);
+        consumed = mtctx->jobs[wJobID].consumed;   /* read under the job mutex */
+        ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
+
+        if (consumed < mtctx->jobs[wJobID].src.size) {   /* first job found whose input is not entirely consumed */
+            range_t range = mtctx->jobs[wJobID].prefix;   /* report the job's prefix range ... */
+            if (range.size == 0) {
+                /* Empty prefix */
+                range = mtctx->jobs[wJobID].src;   /* ... or its source range when there is no prefix */
+            }
+            /* Job source in multiple segments not supported yet */
+            assert(range.start <= mtctx->jobs[wJobID].src.start);
+            return range;
+        }
+    }
+    return kNullRange;
+}
+
+/**
+ * Returns non-zero iff buffer and range overlap. A NULL start or a zero size on either side yields 0.
+ */
+static int ZSTDMT_isOverlapped(buffer_t buffer, range_t range)
+{
+    BYTE const* const bufferStart = (BYTE const*)buffer.start;
+    BYTE const* const bufferEnd = bufferStart + buffer.capacity;   /* one past the last byte of buffer */
+    BYTE const* const rangeStart = (BYTE const*)range.start;
+    BYTE const* const rangeEnd = rangeStart + range.size;   /* one past the last byte of range */
+
+    if (rangeStart == NULL || bufferStart == NULL)
+        return 0;
+    /* Empty ranges cannot overlap */
+    if (bufferStart == bufferEnd || rangeStart == rangeEnd)
+        return 0;
+
+    return bufferStart < rangeEnd && rangeStart < bufferEnd;   /* the two half-open intervals intersect */
+}
+
+static int ZSTDMT_doesOverlapWindow(buffer_t buffer, ZSTD_window_t window)   /* non-zero when buffer overlaps either of the two ranges derived from window below */
+{
+    range_t extDict;
+    range_t prefix;
+
+    extDict.start = window.dictBase + window.lowLimit;   /* extDict = [dictBase+lowLimit, dictBase+dictLimit) */
+    extDict.size = window.dictLimit - window.lowLimit;
+
+    prefix.start = window.base + window.dictLimit;   /* prefix = [base+dictLimit, nextSrc) */
+    prefix.size = window.nextSrc - (window.base + window.dictLimit);
+    DEBUGLOG(5, "extDict [0x%zx, 0x%zx)",
+                (size_t)extDict.start,
+                (size_t)extDict.start + extDict.size);
+    DEBUGLOG(5, "prefix  [0x%zx, 0x%zx)",
+                (size_t)prefix.start,
+                (size_t)prefix.start + prefix.size);
+
+    return ZSTDMT_isOverlapped(buffer, extDict)
+        || ZSTDMT_isOverlapped(buffer, prefix);
+}
+
+static void ZSTDMT_waitForLdmComplete(ZSTDMT_CCtx* mtctx, buffer_t buffer)   /* blocks until buffer no longer overlaps serial.ldmWindow ; does nothing when ldmParams.enableLdm is 0 */
+{
+    if (mtctx->params.ldmParams.enableLdm) {
+        ZSTD_pthread_mutex_t* mutex = &mtctx->serial.ldmWindowMutex;
+        DEBUGLOG(5, "source  [0x%zx, 0x%zx)",
+                    (size_t)buffer.start,
+                    (size_t)buffer.start + buffer.capacity);
+        ZSTD_PTHREAD_MUTEX_LOCK(mutex);
+        while (ZSTDMT_doesOverlapWindow(buffer, mtctx->serial.ldmWindow)) {   /* condition is re-tested after every wake-up */
+            DEBUGLOG(6, "Waiting for LDM to finish...");
+            ZSTD_pthread_cond_wait(&mtctx->serial.ldmWindowCond, mutex);
+        }
+        DEBUGLOG(6, "Done waiting for LDM to finish");
+        ZSTD_pthread_mutex_unlock(mutex);
+    }
+}
+
+/**
+ * Attempts to set the inBuff to the next section to fill.
+ * If any part of the new section is still in use we give up.
+ * Returns 1 once inBuff.buffer designates the new section (inBuff.filled is reset to 0), 0 when giving up.
+ */
+static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx)
+{
+    range_t const inUse = ZSTDMT_getInputDataInUse(mtctx);
+    size_t const spaceLeft = mtctx->roundBuff.capacity - mtctx->roundBuff.pos;
+    size_t const target = mtctx->targetSectionSize;
+    buffer_t buffer;
+
+    assert(mtctx->inBuff.buffer.start == NULL);
+    assert(mtctx->roundBuff.capacity >= target);
+
+    if (spaceLeft < target) {   /* not enough room after pos : restart from the beginning of roundBuff */
+        /* ZSTD_invalidateRepCodes() doesn't work for extDict variants.
+         * Simply copy the prefix to the beginning in that case.
+         */
+        BYTE* const start = (BYTE*)mtctx->roundBuff.buffer;
+        size_t const prefixSize = mtctx->inBuff.prefix.size;
+
+        buffer.start = start;
+        buffer.capacity = prefixSize;   /* destination area of the prefix copy */
+        if (ZSTDMT_isOverlapped(buffer, inUse)) {
+            DEBUGLOG(6, "Waiting for buffer...");
+            return 0;
+        }
+        ZSTDMT_waitForLdmComplete(mtctx, buffer);
+        memmove(start, mtctx->inBuff.prefix.start, prefixSize);
+        mtctx->inBuff.prefix.start = start;
+        mtctx->roundBuff.pos = prefixSize;   /* the new section starts right after the relocated prefix */
+    }
+    buffer.start = mtctx->roundBuff.buffer + mtctx->roundBuff.pos;
+    buffer.capacity = target;
+
+    if (ZSTDMT_isOverlapped(buffer, inUse)) {
+        DEBUGLOG(6, "Waiting for buffer...");
+        return 0;
+    }
+    assert(!ZSTDMT_isOverlapped(buffer, mtctx->inBuff.prefix));
+
+    ZSTDMT_waitForLdmComplete(mtctx, buffer);
+
+    DEBUGLOG(5, "Using prefix range [%zx, %zx)",
+                (size_t)mtctx->inBuff.prefix.start,
+                (size_t)mtctx->inBuff.prefix.start + mtctx->inBuff.prefix.size);
+    DEBUGLOG(5, "Using source range [%zx, %zx)",
+                (size_t)buffer.start,
+                (size_t)buffer.start + buffer.capacity);
+
+
+    mtctx->inBuff.buffer = buffer;
+    mtctx->inBuff.filled = 0;
+    assert(mtctx->roundBuff.pos + buffer.capacity <= mtctx->roundBuff.capacity);
+    return 1;
+}
+
+
+/** ZSTDMT_compressStream_generic() :
+ *  internal use only - exposed to be invoked from zstd_compress.c
+ *  assumption : output and input are valid (pos <= size)
+ * @return : minimum amount of data remaining to flush, 0 if none, or an error code */
+size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
+                                     ZSTD_outBuffer* output,
+                                     ZSTD_inBuffer* input,
+                                     ZSTD_EndDirective endOp)
+{
+    unsigned forwardInputProgress = 0;   /* becomes 1 when this call copies some input into inBuff */
+    DEBUGLOG(5, "ZSTDMT_compressStream_generic (endOp=%u, srcSize=%u)",
+                (U32)endOp, (U32)(input->size - input->pos));
+    assert(output->pos <= output->size);
+    assert(input->pos  <= input->size);
+
+    if (mtctx->singleBlockingThread) {  /* delegate to single-thread (synchronous) */
+        return ZSTD_compressStream_generic(mtctx->cctxPool->cctx[0], output, input, endOp);
+    }
+
+    if ((mtctx->frameEnded) && (endOp==ZSTD_e_continue)) {
+        /* current frame being ended. Only flush/end are allowed */
+        return ERROR(stage_wrong);
+    }
+
+    /* single-pass shortcut (note : synchronous-mode) */
+    if ( (mtctx->nextJobID == 0)      /* just started */
+      && (mtctx->inBuff.filled == 0)  /* nothing buffered */
+      && (!mtctx->jobReady)           /* no job already created */
+      && (endOp == ZSTD_e_end)        /* end order */
+      && (output->size - output->pos >= ZSTD_compressBound(input->size - input->pos)) ) { /* enough space in dst */
+        size_t const cSize = ZSTDMT_compress_advanced_internal(mtctx,
+                (char*)output->dst + output->pos, output->size - output->pos,
+                (const char*)input->src + input->pos, input->size - input->pos,
+                mtctx->cdict, mtctx->params);
+        if (ZSTD_isError(cSize)) return cSize;
+        input->pos = input->size;   /* all remaining input is reported as consumed */
+        output->pos += cSize;
+        mtctx->allJobsCompleted = 1;
+        mtctx->frameEnded = 1;
+        return 0;
+    }
+
+    /* fill input buffer */
+    if ( (!mtctx->jobReady)
+      && (input->size > input->pos) ) {   /* support NULL input */
+        if (mtctx->inBuff.buffer.start == NULL) {
+            assert(mtctx->inBuff.filled == 0); /* Can't fill an empty buffer */
+            if (!ZSTDMT_tryGetInputRange(mtctx)) {
+                /* It is only possible for this operation to fail if there are
+                 * still compression jobs ongoing.
+                 */
+                assert(mtctx->doneJobID != mtctx->nextJobID);
+            }
+        }
+        if (mtctx->inBuff.buffer.start != NULL) {
+            size_t const toLoad = MIN(input->size - input->pos, mtctx->targetSectionSize - mtctx->inBuff.filled);
+            assert(mtctx->inBuff.buffer.capacity >= mtctx->targetSectionSize);
+            DEBUGLOG(5, "ZSTDMT_compressStream_generic: adding %u bytes on top of %u to buffer of size %u",
+                        (U32)toLoad, (U32)mtctx->inBuff.filled, (U32)mtctx->targetSectionSize);
+            memcpy((char*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled, (const char*)input->src + input->pos, toLoad);
+            input->pos += toLoad;
+            mtctx->inBuff.filled += toLoad;
+            forwardInputProgress = toLoad>0;
+        }
+        if ((input->pos < input->size) && (endOp == ZSTD_e_end))
+            endOp = ZSTD_e_flush;   /* can't end now : not all input consumed */
+    }
+
+    if ( (mtctx->jobReady)
+      || (mtctx->inBuff.filled >= mtctx->targetSectionSize)  /* filled enough : let's compress */
+      || ((endOp != ZSTD_e_continue) && (mtctx->inBuff.filled > 0))  /* something to flush : let's go */
+      || ((endOp == ZSTD_e_end) && (!mtctx->frameEnded)) ) {   /* must finish the frame with a zero-size block */
+        size_t const jobSize = mtctx->inBuff.filled;   /* the job takes everything currently buffered */
+        assert(mtctx->inBuff.filled <= mtctx->targetSectionSize);
+        CHECK_F( ZSTDMT_createCompressionJob(mtctx, jobSize, endOp) );
+    }
+
+    /* check for potential compressed data ready to be flushed */
+    {   size_t const remainingToFlush = ZSTDMT_flushProduced(mtctx, output, !forwardInputProgress, endOp); /* block if there was no forward input progress */
+        if (input->pos < input->size) return MAX(remainingToFlush, 1);  /* input not consumed : do not end flush yet */
+        return remainingToFlush;
+    }
+}
+
+
+size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{   /* streaming step with ZSTD_e_continue ; on success the result is a hint for the next input size, not an amount left to flush */
+    CHECK_F( ZSTDMT_compressStream_generic(mtctx, output, input, ZSTD_e_continue) );   /* CHECK_F is defined outside this view ; presumably returns early on error - TODO confirm */
+
+    /* recommended next input size : fill current input buffer */
+    return mtctx->targetSectionSize - mtctx->inBuff.filled;   /* note : could be zero when input buffer is fully filled and no more availability to create new job */
+}
+
+
+static size_t ZSTDMT_flushStream_internal(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_EndDirective endFrame)
+{   /* shared body of ZSTDMT_flushStream() (ZSTD_e_flush) and ZSTDMT_endStream() (ZSTD_e_end) ; result is the one of ZSTDMT_flushProduced() */
+    size_t const srcSize = mtctx->inBuff.filled;   /* bytes buffered so far, not yet turned into a job */
+    DEBUGLOG(5, "ZSTDMT_flushStream_internal");
+
+    if ( mtctx->jobReady     /* one job ready for a worker to pick up */
+      || (srcSize > 0)       /* still some data within input buffer */
+      || ((endFrame==ZSTD_e_end) && !mtctx->frameEnded)) {  /* need a last 0-size block to end frame */
+           DEBUGLOG(5, "ZSTDMT_flushStream_internal : create a new job (%u bytes, end:%u)",
+                        (U32)srcSize, (U32)endFrame);
+        CHECK_F( ZSTDMT_createCompressionJob(mtctx, srcSize, endFrame) );
+    }
+
+    /* check if there is any data available to flush */
+    return ZSTDMT_flushProduced(mtctx, output, 1 /* blockToFlush */, endFrame);
+}
+
+
+size_t ZSTDMT_flushStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output)
+{   /* flushes buffered data into output without ending the frame (ZSTD_e_flush) */
+    DEBUGLOG(5, "ZSTDMT_flushStream");
+    if (mtctx->singleBlockingThread)   /* this mode delegates to the regular ZSTD_flushStream() on the first cctx of the pool */
+        return ZSTD_flushStream(mtctx->cctxPool->cctx[0], output);
+    return ZSTDMT_flushStream_internal(mtctx, output, ZSTD_e_flush);
+}
+
+size_t ZSTDMT_endStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output)
+{   /* flushes buffered data into output and requests the end of the frame (ZSTD_e_end) */
+    DEBUGLOG(4, "ZSTDMT_endStream");
+    if (mtctx->singleBlockingThread)   /* this mode delegates to the regular ZSTD_endStream() on the first cctx of the pool */
+        return ZSTD_endStream(mtctx->cctxPool->cctx[0], output);
+    return ZSTDMT_flushStream_internal(mtctx, output, ZSTD_e_end);
+}
diff --git a/deps/SZ/zstd/compress/zstdmt_compress.h b/deps/SZ/zstd/compress/zstdmt_compress.h
new file mode 100644
index 0000000000000000000000000000000000000000..34a475a42bffc0ed216b1f33e947270930375916
--- /dev/null
+++ b/deps/SZ/zstd/compress/zstdmt_compress.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ #ifndef ZSTDMT_COMPRESS_H
+ #define ZSTDMT_COMPRESS_H
+
+ #if defined (__cplusplus)
+ extern "C" {
+ #endif
+
+
+/* Note : This is an internal API.
+ *        Some methods are still exposed (ZSTDLIB_API),
+ *        because it used to be the only way to invoke MT compression.
+ *        Now, it's recommended to use ZSTD_compress_generic() instead.
+ *        These methods will stop being exposed in a future version */
+
+/* ===   Dependencies   === */
+#include <stddef.h>                /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_parameters */
+#include "zstd.h"            /* ZSTD_inBuffer, ZSTD_outBuffer, ZSTDLIB_API */
+
+
+/* ===   Memory management   === */
+typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx;
+ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers);
+ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers,
+                                                    ZSTD_customMem cMem);
+ZSTDLIB_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx);
+
+ZSTDLIB_API size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx);
+
+
+/* ===   Simple one-pass compression function   === */
+
+ZSTDLIB_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                       int compressionLevel);
+
+
+
+/* ===   Streaming functions   === */
+
+ZSTDLIB_API size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel);
+ZSTDLIB_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize);  /**< if srcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it will change in the future to mean "empty" */
+
+ZSTDLIB_API size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTDMT_flushStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output);   /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTDMT_endStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output);     /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */
+
+
+/* ===   Advanced functions and parameters  === */
+
+#ifndef ZSTDMT_JOBSIZE_MIN
+#  define ZSTDMT_JOBSIZE_MIN (1U << 20)   /* 1 MB - Minimum size of each compression job */
+#endif
+
+ZSTDLIB_API size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     const ZSTD_CDict* cdict,
+                                           ZSTD_parameters params,
+                                           unsigned overlapLog);
+
+ZSTDLIB_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx,
+                                        const void* dict, size_t dictSize,   /* dict can be released after init, a local copy is preserved within zcs */
+                                        ZSTD_parameters params,
+                                        unsigned long long pledgedSrcSize);  /* pledgedSrcSize is optional and can be zero == unknown */
+
+ZSTDLIB_API size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx,
+                                        const ZSTD_CDict* cdict,
+                                        ZSTD_frameParameters fparams,
+                                        unsigned long long pledgedSrcSize);  /* note : zero means empty */
+
+/* ZSTDMT_parameter :
+ * List of parameters that can be set using ZSTDMT_setMTCtxParameter(), and queried using ZSTDMT_getMTCtxParameter() */
+typedef enum {
+    ZSTDMT_p_jobSize,           /* Each job is compressed in parallel. By default, this value is dynamically determined depending on compression parameters. Can be set explicitly here. */
+    ZSTDMT_p_overlapSectionLog  /* Each job may reload a part of previous job to enhance compression ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window. This is a "sticky" parameter : its value will be re-used on next compression job */
+} ZSTDMT_parameter;
+
+/* ZSTDMT_setMTCtxParameter() :
+ * allow setting individual parameters, one at a time, among a list of enums defined in ZSTDMT_parameter.
+ * The function must be called typically after ZSTDMT_createCCtx() but __before ZSTDMT_init*() !__
+ * Parameters not explicitly reset by ZSTDMT_init*() remain the same in consecutive compression sessions.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned value);
+
+/* ZSTDMT_getMTCtxParameter() :
+ * Query the ZSTDMT_CCtx for a parameter value.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTDMT_getMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned* value);
+
+
+/*! ZSTDMT_compressStream_generic() :
+ *  Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream()
+ *  depending on flush directive.
+ * @return : minimum amount of data still to be flushed
+ *           0 if fully flushed
+ *           or an error code
+ *  note : needs to be init using any ZSTDMT_initCStream*() variant */
+ZSTDLIB_API size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
+                                                ZSTD_outBuffer* output,
+                                                ZSTD_inBuffer* input,
+                                                ZSTD_EndDirective endOp);
+
+
+/* ========================================================
+ * ===  Private interface, for use by ZSTD_compress.c   ===
+ * ===  Not exposed in libzstd. Never invoke directly   ===
+ * ======================================================== */
+
+size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, unsigned value);
+
+/* ZSTDMT_CCtxParam_setNbWorkers()
+ * Set nbWorkers, and clamp it.
+ * Also reset jobSize and overlapLog */
+size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers);
+
+/*! ZSTDMT_updateCParams_whileCompressing() :
+ *  Updates only a selected set of compression parameters, to remain compatible with current frame.
+ *  New parameters will be applied to next compression job. */
+void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams);
+
+/* ZSTDMT_getFrameProgression():
+ * tells how much data has been consumed (input) and produced (output) for current frame.
+ * able to count progression inside worker threads.
+ */
+ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx);
+
+
+/*! ZSTDMT_initCStream_internal() :
+ *  Private use only. Init streaming operation.
+ *  expects params to be valid.
+ *  must receive dict, or cdict, or none, but not both.
+ *  @return : 0, or an error code */
+size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs,
+                    const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
+                    const ZSTD_CDict* cdict,
+                    ZSTD_CCtx_params params, unsigned long long pledgedSrcSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* ZSTDMT_COMPRESS_H */
diff --git a/deps/SZ/zstd/decompress/huf_decompress.c b/deps/SZ/zstd/decompress/huf_decompress.c
new file mode 100644
index 0000000000000000000000000000000000000000..a696261bd6383785195e45205fdd5168a57807fc
--- /dev/null
+++ b/deps/SZ/zstd/decompress/huf_decompress.c
@@ -0,0 +1,1096 @@
+/* ******************************************************************
+   huff0 huffman decoder,
+   part of Finite State Entropy library
+   Copyright (C) 2013-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+
+/* **************************************************************
+*  Dependencies
+****************************************************************/
+#include <string.h>     /* memcpy, memset */
+#include "compiler.h"
+#include "bitstream.h"  /* BIT_* */
+#include "fse.h"        /* to compress headers */
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#include "error_private.h"
+
+
+/* **************************************************************
+*  Error Management : CHECK_F(f) evaluates `f` once and, when the result tests as an error, returns it from the enclosing function
+****************************************************************/
+#define HUF_isError ERR_isError
+#define CHECK_F(f) { size_t const err_ = (f); if (HUF_isError(err_)) return err_; }
+
+
+/* **************************************************************
+*  Byte alignment for workSpace management : HUF_ALIGN(x, a) rounds x up to a multiple of a (a must be a power of 2)
+****************************************************************/
+#define HUF_ALIGN(x, a)         HUF_ALIGN_MASK((x), (a) - 1)
+#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
+
+
+/*-***************************/
+/*  generic DTableDesc       */
+/*-***************************/
+typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc;
+/* HUF_getDTableDesc() : returns a copy of the descriptor stored at the very start of `table` (copied with memcpy, not through a cast) */
+static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
+{
+    DTableDesc header;
+    memcpy(&header, table, sizeof(header));
+    return header;
+}
+
+
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
+typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1;   /* single-symbol decoding : one cell = decoded byte + number of bits to consume */
+/* HUF_readDTableX1_wksp() : reads the weights header from `src` (HUF_readStats) and fills the single-symbol cells stored after DTable's first cell. @return : value of HUF_readStats (callers use it as header size), or an error code */
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
+{
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+    size_t iSize;
+    void* const dtPtr = DTable + 1;
+    HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr;
+    /* scratch arrays, carved out of workSpace below; spaceUsed32 counts U32 cells */
+    U32* rankVal;
+    BYTE* huffWeight;
+    size_t spaceUsed32 = 0;
+
+    rankVal = (U32 *)workSpace + spaceUsed32;
+    spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
+    huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
+    spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+    if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);   /* << 2 : U32 cells -> bytes; workSpace is too small */
+
+    DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
+    /* memset(huffWeight, 0, sizeof(huffWeight)); */   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* Table header : record table type 0 (single-symbol) and the effective tableLog in DTable's first cell */
+    {   DTableDesc dtd = HUF_getDTableDesc(DTable);
+        if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge);   /* DTable too small, Huffman tree cannot fit in */
+        dtd.tableType = 0;
+        dtd.tableLog = (BYTE)tableLog;
+        memcpy(DTable, &dtd, sizeof(dtd));
+    }
+
+    /* Calculate starting value for each rank (on entry rankVal[n] is presumably the number of symbols of weight n, as filled by HUF_readStats) */
+    {   U32 n, nextRankStart = 0;
+        for (n=1; n<tableLog+1; n++) {
+            U32 const current = nextRankStart;
+            nextRankStart += (rankVal[n] << (n-1));   /* every entry of rank n accounts for 2^(n-1) cells */
+            rankVal[n] = current;
+    }   }
+
+    /* fill DTable : each symbol gets a run of identical cells, starting at the next free position of its weight */
+    {   U32 n;
+        for (n=0; n<nbSymbols; n++) {
+            U32 const w = huffWeight[n];
+            U32 const length = (1 << w) >> 1;   /* 2^(w-1) cells, or 0 when w==0 (symbol then gets no cell) */
+            U32 u;
+            HUF_DEltX1 D;
+            D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+            for (u = rankVal[w]; u < rankVal[w] + length; u++)
+                dt[u] = D;
+            rankVal[w] += length;
+    }   }
+
+    return iSize;
+}
+
+size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+    /* convenience entry point : same as HUF_readDTableX1_wksp(), with the scratch area taken from the stack */
+    U32 scratch[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_readDTableX1_wksp(DTable, src, srcSize, scratch, sizeof(scratch));
+}
+
+FORCE_INLINE_TEMPLATE BYTE
+HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog)
+{
+    /* look up the cell indexed by the next dtLog bits (dtLog >= 1), then skip only the bits recorded in that cell */
+    HUF_DEltX1 const cell = dt[BIT_lookBitsFast(Dstream, dtLog)];
+    BIT_skipBits(Dstream, cell.nbBits);
+    return cell.byte;
+}
+
+#define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
+    *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
+/* _0 (above) always decodes one symbol; _1 and _2 decode conditionally. All three expect `dt` and `dtLog` in scope at the expansion site */
+#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)  \
+    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+        HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
+/* _1 (above) : decodes when MEM_64bits() holds or HUF_TABLELOG_MAX <= 12 ; _2 (below) : decodes only when MEM_64bits() holds */
+#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
+
+HINT_INLINE size_t
+HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
+{   /* decodes one symbol per output byte into [p, pEnd) ; the bitstream end condition is not checked here, callers test it afterwards */
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
+        HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+    }
+
+    /* [0-3] symbols remaining : one symbol per reload, only when MEM_32bits() holds */
+    if (MEM_32bits())
+        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd))
+            HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, no need to reload */
+    while (p < pEnd)
+        HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+
+    return pEnd-pStart;   /* the last loop always runs p up to pEnd, hence the full requested size is returned */
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X1_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    /* Single-stream, single-symbol decoding : regenerates dstSize bytes from the bitstream cSrc.
+     * @return : dstSize, or an error code (bitstream init failure, or end of bitstream not reported) */
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    const void* const tablePtr = DTable + 1;   /* decoding cells start after the descriptor cell */
+    const HUF_DEltX1* const dt = (const HUF_DEltX1*)tablePtr;
+    U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
+    BIT_DStream_t bitD;
+
+    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+    HUF_decodeStreamX1(ostart, &bitD, oend, dt, dtLog);
+
+    /* the bitstream must report its end once all bytes are regenerated, otherwise input is treated as corrupted */
+    if (BIT_endOfDStream(&bitD)) return dstSize;
+    return ERROR(corruption_detected);
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X1_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    /* 4-streams, single-symbol decoding. cSrc = 6-byte jump table (3 little-endian 16-bit stream sizes) followed by 4 bitstreams. @return : dstSize, or an error code */
+    if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable + 1;
+        const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);   /* size of 4th stream is implied; wraps around when the 3 others are too large */
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;   /* streams 1-3 each regenerate segmentSize bytes, stream 4 the remainder */
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal = BIT_DStream_unfinished;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+        if (3 * segmentSize > dstSize) return ERROR(corruption_detected);   /* dst cannot hold the first 3 segments (dstSize 1, 2 or 5) : decoding would write past oend */
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+        /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) {
+            HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
+            BIT_reloadDStream(&bitD1);   /* return values ignored : endSignal is not refreshed, the loop ends on the op4 bound */
+            BIT_reloadDStream(&bitD2);
+            BIT_reloadDStream(&bitD3);
+            BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        /* note : should not be necessary : op# advance in lock step, and we control op4.
+         *        but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one : each call fills its segment up to the start of the next one */
+        HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX1(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check : all 4 bitstreams must report their end */
+        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+          if (!endCheck) return ERROR(corruption_detected); }
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
+                                               const void *cSrc,
+                                               size_t cSrcSize,
+                                               const HUF_DTable *DTable);
+#if DYNAMIC_BMI2
+/* HUF_DGEN(fn) : wraps fn##_body into static fn(..., int bmi2). Here it also emits fn##_default and a "bmi2"-targeted fn##_bmi2, and fn() selects between them from its bmi2 argument */
+#define HUF_DGEN(fn)                                                               \
+                                                                            \
+    static size_t fn##_default(                                             \
+                  void* dst,  size_t dstSize,                               \
+            const void* cSrc, size_t cSrcSize,                              \
+            const HUF_DTable* DTable)                                       \
+    {                                                                       \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }                                                                       \
+                                                                            \
+    static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2(                       \
+                  void* dst,  size_t dstSize,                               \
+            const void* cSrc, size_t cSrcSize,                              \
+            const HUF_DTable* DTable)                                       \
+    {                                                                       \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }                                                                       \
+                                                                            \
+    static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
+                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+    {                                                                       \
+        if (bmi2) {                                                         \
+            return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);         \
+        }                                                                   \
+        return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable);          \
+    }
+
+#else
+/* no dispatch in this configuration : fn() ignores its bmi2 argument and calls fn##_body directly */
+#define HUF_DGEN(fn)                                                               \
+    static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
+                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+    {                                                                       \
+        (void)bmi2;                                                         \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }
+
+#endif
+/* instantiate the `_internal` entry points of the single-symbol (X1) decoders, 1-stream and 4-streams */
+HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
+HUF_DGEN(HUF_decompress4X1_usingDTable_internal)
+
+
+
+size_t HUF_decompress1X1_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{   /* accepts only tables whose descriptor has tableType 0 (the value written by HUF_readDTableX1_wksp); anything else yields a GENERIC error */
+    if (HUF_getDTableDesc(DTable).tableType == 0)
+        return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    return ERROR(GENERIC);
+}
+
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{
+    /* step 1 : rebuild the decoding table into DCtx from the header located at the start of cSrc */
+    size_t const headerSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
+    if (HUF_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* no payload left after the header */
+    /* step 2 : decode the single bitstream which follows the header */
+    {   const BYTE* const payload = (const BYTE*)cSrc + headerSize;
+        return HUF_decompress1X1_usingDTable_internal(dst, dstSize, payload, cSrcSize - headerSize, DCtx, /* bmi2 */ 0);
+    }
+}
+
+
+size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                              const void* cSrc, size_t cSrcSize)
+{
+    /* same as HUF_decompress1X1_DCtx_wksp(), with the scratch area taken from the stack */
+    U32 scratch[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, scratch, sizeof(scratch));
+}
+
+size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{   /* one-shot variant : the decoding table is a local object created by HUF_CREATE_STATIC_DTABLEX1 */
+    HUF_CREATE_STATIC_DTABLEX1(localDTable, HUF_TABLELOG_MAX);
+    return HUF_decompress1X1_DCtx(localDTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+size_t HUF_decompress4X1_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{   /* accepts only tables whose descriptor has tableType 0 (the value written by HUF_readDTableX1_wksp); anything else yields a GENERIC error */
+    if (HUF_getDTableDesc(DTable).tableType == 0)
+        return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    return ERROR(GENERIC);
+}
+
+static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize, int bmi2)
+{
+    /* step 1 : rebuild the decoding table into dctx from the header located at the start of cSrc */
+    size_t const headerSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
+    if (HUF_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* no payload left after the header */
+
+    /* step 2 : decode the 4 bitstreams which follow the header; bmi2 is forwarded unchanged */
+    {   const BYTE* const payload = (const BYTE*)cSrc + headerSize;
+        return HUF_decompress4X1_usingDTable_internal(dst, dstSize, payload, cSrcSize - headerSize, dctx, bmi2);
+    }
+}
+
+size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{   /* public variant : forwards every argument to the static _bmi2 helper, with its bmi2 argument set to 0 */
+    return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* same as HUF_decompress4X1_DCtx_wksp(), with the scratch area taken from the stack */
+    U32 scratch[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, scratch, sizeof(scratch));
+}
+size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{   /* one-shot variant : the decoding table is a local object created by HUF_CREATE_STATIC_DTABLEX1 */
+    HUF_CREATE_STATIC_DTABLEX1(localDTable, HUF_TABLELOG_MAX);
+    return HUF_decompress4X1_DCtx(localDTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2;  /* double-symbols decoding : `sequence` holds 1 or 2 bytes (written little-endian), `length` tells how many, `nbBits` how many bits to consume */
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;   /* one entry of the symbol list sorted by weight */
+typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];   /* one column : a value per weight */
+typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];   /* table of columns, indexed by number of bits already consumed */
+
+
+/* HUF_fillDTableX2Level2() : fills the cells starting at `DTable` with sequences whose first byte is `baseSeq`, the first symbol having already used `consumed` bits.
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
+static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    HUF_DEltX2 DElt;
+    U32 rankVal[HUF_TABLELOG_MAX + 1];
+
+    /* get pre-calculated rankVal (local copy : entries are advanced while filling) */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values : the first rankVal[minWeight] cells decode `baseSeq` alone */
+    if (minWeight>1) {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable : one run of 2-symbol cells per listed symbol */
+    {   U32 s; for (s=0; s<sortedListSize; s++) {   /* note : sortedSymbols already skipped */
+            const U32 symbol = sortedSymbols[s].symbol;
+            const U32 weight = sortedSymbols[s].weight;
+            const U32 nbBits = nbBitsBaseline - weight;
+            const U32 length = 1 << (sizeLog-nbBits);
+            const U32 start = rankVal[weight];
+            U32 i = start;
+            const U32 end = start + length;
+
+            MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));   /* first symbol in the low byte, second symbol in the high byte */
+            DElt.nbBits = (BYTE)(nbBits + consumed);   /* bits of both symbols */
+            DElt.length = 2;
+            do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+            rankVal[weight] += length;
+    }   }
+}
+
+
+static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{   /* fills the whole double-symbols table : each symbol of sortedList gets either a second-level region (2-symbol cells) or a run of 1-symbol cells */
+    U32 rankVal[HUF_TABLELOG_MAX + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;
+    U32 s;
+
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));   /* working copy of the first column, rankValOrigin[0] */
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++) {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits) {   /* enough room for a second symbol */
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];   /* position in sortedList of the first symbol of weight minWeight */
+            HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        } else {
+            HUF_DEltX2 DElt;
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits = (BYTE)(nbBits);
+            DElt.length = 1;
+            {   U32 const end = start + length;
+                U32 u;
+                for (u = start; u < end; u++) DTable[u] = DElt;
+        }   }
+        rankVal[weight] += length;   /* next symbol of the same weight starts right after this run */
+    }
+}
+
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src,
+                             size_t srcSize, void* workSpace,
+                             size_t wkspSize)
+{   /* reads the weights header from `src` (HUF_readStats) and builds the double-symbols table, sized for the maxTableLog found in DTable's descriptor. @return : value of HUF_readStats, or an error code */
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    U32 const maxTableLog = dtd.maxTableLog;
+    size_t iSize;
+    void* dtPtr = DTable+1;   /* force compiler to avoid strict-aliasing */
+    HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
+    U32 *rankStart;
+    /* scratch arrays, carved out of workSpace below; spaceUsed32 counts U32 cells */
+    rankValCol_t* rankVal;
+    U32* rankStats;
+    U32* rankStart0;
+    sortedSymbol_t* sortedSymbol;
+    BYTE* weightList;
+    size_t spaceUsed32 = 0;
+
+    rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32);
+    spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
+    rankStats = (U32 *)workSpace + spaceUsed32;
+    spaceUsed32 += HUF_TABLELOG_MAX + 1;
+    rankStart0 = (U32 *)workSpace + spaceUsed32;
+    spaceUsed32 += HUF_TABLELOG_MAX + 2;
+    sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t);
+    spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
+    weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
+    spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+    if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);   /* << 2 : U32 cells -> bytes; workSpace is too small */
+
+    rankStart = rankStart0 + 1;   /* shifted view : once the sort below has advanced rankStart[w] past weight w, rankStart0[w] is the start of weight w */
+    memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));   /* clears rankStats and rankStart0 in one call : they are adjacent in workSpace */
+
+    DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable));   /* if compiler fails here, assertion is wrong */
+    if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    /* memset(weightList, 0, sizeof(weightList)); */  /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+
+    /* Get start index of each weight */
+    {   U32 w, nextRankStart = 0;
+        for (w=1; w<maxW+1; w++) {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;   /* number of symbols with weight >= 1 */
+    }
+
+    /* sort symbols by weight */
+    {   U32 s;
+        for (s=0; s<nbSymbols; s++) {
+            U32 const w = weightList[s];
+            U32 const r = rankStart[w]++;
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal : column 0 holds the start position of each weight, the other columns are derived from it by a right shift */
+    {   U32* const rankVal0 = rankVal[0];
+        {   int const rescale = (maxTableLog-tableLog) - 1;   /* tableLog <= maxTableLog */
+            U32 nextRankVal = 0;
+            U32 w;
+            for (w=1; w<maxW+1; w++) {
+                U32 current = nextRankVal;
+                nextRankVal += rankStats[w] << (w+rescale);
+                rankVal0[w] = current;
+        }   }
+        {   U32 const minBits = tableLog+1 - maxW;
+            U32 consumed;
+            for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
+                U32* const rankValPtr = rankVal[consumed];
+                U32 w;
+                for (w = 1; w < maxW+1; w++) {
+                    rankValPtr[w] = rankVal0[w] >> consumed;
+    }   }   }   }
+
+    HUF_fillDTableX2(dt, maxTableLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    dtd.tableLog = (BYTE)maxTableLog;   /* the table is always built at full maxTableLog size */
+    dtd.tableType = 1;   /* marks a double-symbols table */
+    memcpy(DTable, &dtd, sizeof(dtd));
+    return iSize;
+}
+
+size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+    /* convenience entry point : same as HUF_readDTableX2_wksp(), with the scratch area taken from the stack */
+    U32 scratch[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_readDTableX2_wksp(DTable, src, srcSize, scratch, sizeof(scratch));
+}
+
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
+{   /* always writes 2 bytes at `op`; the returned length (taken from the cell) tells the caller how far to advance */
+    const HUF_DEltX2* const cell = dt + BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, cell, 2);
+    BIT_skipBits(DStream, cell->nbBits);
+    return cell->length;
+}
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
+{   /* writes exactly one byte at `op`, even when the matched cell holds a 2-symbol sequence; always returns 1 */
+    size_t const containerBits = sizeof(DStream->bitContainer) * 8;
+    const HUF_DEltX2* const cell = dt + BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, cell, 1);
+    if (cell->length == 1) {
+        BIT_skipBits(DStream, cell->nbBits);
+    } else if (DStream->bitsConsumed < containerBits) {
+        BIT_skipBits(DStream, cell->nbBits);
+        /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+        if (DStream->bitsConsumed > containerBits) DStream->bitsConsumed = containerBits;
+    }
+    return 1;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+/* _0 (above) always decodes one cell; _1 and _2 decode conditionally. All three expect `dt` and `dtLog` in scope at the expansion site */
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+        ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+/* _1 (above) : decodes when MEM_64bits() holds or HUF_TABLELOG_MAX <= 12 ; _2 (below) : decodes only when MEM_64bits() holds */
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+
+HINT_INLINE size_t
+HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
+                const HUF_DEltX2* const dt, const U32 dtLog)
+{   /* decodes into [p, pEnd), 1 or 2 bytes per table lookup. @return : number of bytes p advanced by */
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to end : up to 2 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)   /* a single byte is left : use the variant which writes one byte only */
+        p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X2_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    /* Single-stream, double-symbols decoding :
+     * regenerates dstSize bytes from the bitstream cSrc.
+     * @return : dstSize, or an error code */
+    BYTE* const out = (BYTE*) dst;
+    BYTE* const oend = out + dstSize;
+    const void* const cells = DTable+1;   /* goes through void* so the compiler does not apply strict-aliasing */
+    BIT_DStream_t bitD;
+
+    /* open the bitstream; leave with its error code if that fails */
+    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+    /* regenerate [out, oend) */
+    {   U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
+        HUF_decodeStreamX2(out, &bitD, oend, (const HUF_DEltX2*)cells, dtLog);
+    }
+
+    /* the bitstream must report its end, otherwise input is treated as corrupted */
+    if (BIT_endOfDStream(&bitD)) return dstSize;
+    return ERROR(corruption_detected);
+}
+
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X2_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;
+        const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        size_t const segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+          if (!endCheck) return ERROR(corruption_detected); }
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+HUF_DGEN(HUF_decompress1X2_usingDTable_internal)   /* presumably generates HUF_decompress1X2_usingDTable_internal(), which the functions below call with a trailing bmi2 argument - macro body not shown here */
+HUF_DGEN(HUF_decompress4X2_usingDTable_internal)   /* same, for HUF_decompress4X2_usingDTable_internal() */
+
+size_t HUF_decompress1X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 1) return ERROR(GENERIC);
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{
+    /* Step 1 : read the table description; the call yields an error code or the number of leading bytes to skip. */
+    size_t const headerSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
+    if (HUF_isError(headerSize)) return headerSize;
+
+    /* Step 2 : at least one byte must remain after the header. */
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);
+
+    /* Step 3 : decode what follows the header. */
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, (const BYTE*)cSrc + headerSize, cSrcSize - headerSize, DCtx, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                              const void* cSrc, size_t cSrcSize)
+{
+    /* scratch memory lives in this function's frame and is handed to the _wksp variant */
+    U32 scratch[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, scratch, sizeof(scratch));
+}
+
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{   /* self-contained entry point : brings its own table, then defers to HUF_decompress1X2_DCtx() */
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);   /* presumably declares the local `DTable` used on the next line - macro body not shown here */
+    return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+size_t HUF_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 1) return ERROR(GENERIC);
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize, int bmi2)
+{
+    /* Step 1 : read the table description; the call yields an error code or the number of leading bytes to skip. */
+    size_t const headerSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
+    if (HUF_isError(headerSize)) return headerSize;
+
+    /* Step 2 : at least one byte must remain after the header. */
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);
+
+    /* Step 3 : decode what follows the header, passing the caller's bmi2 choice through unchanged. */
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, (const BYTE*)cSrc + headerSize, cSrcSize - headerSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{   /* forwards every argument to the _bmi2 variant with bmi2 fixed to 0 */
+    return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
+                              const void* cSrc, size_t cSrcSize)
+{
+    /* scratch memory lives in this function's frame and is handed to the _wksp variant */
+    U32 scratch[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, scratch, sizeof(scratch));
+}
+
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{   /* self-contained entry point : brings its own table, then defers to HUF_decompress4X2_DCtx() */
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);   /* presumably declares the local `DTable` used on the next line - macro body not shown here */
+    return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+/* ***********************************/
+/* Universal decompression selectors */
+/* ***********************************/
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
+                                    const void* cSrc, size_t cSrcSize,
+                                    const HUF_DTable* DTable)
+{   /* the table's own type tag picks the decoder : zero -> X1, anything else -> X2 */
+    DTableDesc const desc = HUF_getDTableDesc(DTable);
+    if (desc.tableType == 0) return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
+                                    const void* cSrc, size_t cSrcSize,
+                                    const HUF_DTable* DTable)
+{   /* the table's own type tag picks the decoder : zero -> X1, anything else -> X2 */
+    DTableDesc const desc = HUF_getDTableDesc(DTable);
+    if (desc.tableType == 0) return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;   /* cost model used by HUF_selectDecoder() : tableTime + decode256Time * (dstSize >> 8) */
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{   /* row index is the Q value computed in HUF_selectDecoder(); that function reads columns 0 and 1 only */
+    /* single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+/** HUF_selectDecoder() :
+ *  Estimates, from the pre-computed algoTime[] metrics,
+ *  which of the two decoders should handle this block faster.
+ * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
+ *  Assumption : 0 < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
+{
+    assert(dstSize > 0);
+    assert(dstSize <= 128*1024);
+    /* compare the modelled cost of both decoders */
+    {   U32 const ratioBucket = (cSrcSize < dstSize) ? (U32)(cSrcSize * 16 / dstSize) : 15;   /* always < 16 */
+        U32 const nbChunks256 = (U32)(dstSize >> 8);
+        U32 const costX1 = algoTime[ratioBucket][0].tableTime + (algoTime[ratioBucket][0].decode256Time * nbChunks256);
+        U32 const baseX2 = algoTime[ratioBucket][1].tableTime + (algoTime[ratioBucket][1].decode256Time * nbChunks256);
+        U32 const costX2 = baseX2 + (baseX2 >> 3);  /* one-eighth handicap : advantage to algorithm using less memory, to reduce cache eviction */
+        return costX2 < costX1;
+}   }
+
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decoders[2] = { HUF_decompress4X1, HUF_decompress4X2 };   /* indexed by HUF_selectDecoder() */
+
+    /* trivial cases, settled from the two sizes alone */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* input larger than output : invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* stored raw */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* one byte repeated (RLE) */
+
+    /* anything else : run whichever decoder the heuristic prefers */
+    {   U32 const choice = HUF_selectDecoder(dstSize, cSrcSize);
+        return decoders[choice](dst, dstSize, cSrc, cSrcSize);
+}   }
+
+size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* trivial cases, settled from the two sizes alone */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* input larger than output : invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* stored raw */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* one byte repeated (RLE) */
+
+    /* anything else : let the heuristic choose between the X1 and X2 decoders */
+    if (HUF_selectDecoder(dstSize, cSrcSize) != 0)
+        return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+    return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+}
+
+size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* scratch memory lives in this function's frame and is handed to the _wksp variant */
+    U32 scratch[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize, scratch, sizeof(scratch));
+}
+
+
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
+                                     size_t dstSize, const void* cSrc,
+                                     size_t cSrcSize, void* workSpace,
+                                     size_t wkspSize)
+{
+    /* unlike HUF_decompress4X_DCtx(), no raw-copy or RLE shortcut is taken here : only empty sizes are refused */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize == 0) return ERROR(corruption_detected);
+
+    /* let the heuristic choose between the X1 and X2 decoders */
+    if (HUF_selectDecoder(dstSize, cSrcSize) != 0)
+        return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+    return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+}
+
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                  const void* cSrc, size_t cSrcSize,
+                                  void* workSpace, size_t wkspSize)
+{
+    /* trivial cases, settled from the two sizes alone */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* input larger than output : invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* stored raw */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* one byte repeated (RLE) */
+
+    /* anything else : let the heuristic choose between the X1 and X2 decoders */
+    if (HUF_selectDecoder(dstSize, cSrcSize) != 0)
+        return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+                                           cSrcSize, workSpace, wkspSize);
+    return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
+                                       cSrcSize, workSpace, wkspSize);
+}
+
+size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
+                             const void* cSrc, size_t cSrcSize)
+{
+    /* scratch memory lives in this function's frame and is handed to the _wksp variant */
+    U32 scratch[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, scratch, sizeof(scratch));
+}
+
+
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+{   /* the table's own type tag picks the decoder : zero -> X1, anything else -> X2; bmi2 is passed through */
+    DTableDesc const desc = HUF_getDTableDesc(DTable);
+    if (desc.tableType == 0) return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+}
+
+size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+{
+    /* read the table description; the call yields an error code or the number of leading bytes to skip */
+    size_t const headerSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
+    if (HUF_isError(headerSize)) return headerSize;
+
+    /* at least one byte must remain after the header */
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);
+
+    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, (const BYTE*)cSrc + headerSize, cSrcSize - headerSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+{   /* the table's own type tag picks the decoder : zero -> X1, anything else -> X2; bmi2 is passed through */
+    DTableDesc const desc = HUF_getDTableDesc(DTable);
+    if (desc.tableType == 0) return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+}
+
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+{
+    /* only empty sizes are refused up front */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize == 0) return ERROR(corruption_detected);
+
+    /* let the heuristic choose between the X1 and X2 decoders; bmi2 is passed through */
+    if (HUF_selectDecoder(dstSize, cSrcSize) != 0)
+        return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+    return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+}
diff --git a/deps/SZ/zstd/decompress/zstd_decompress.c b/deps/SZ/zstd/decompress/zstd_decompress.c
new file mode 100644
index 0000000000000000000000000000000000000000..8f4589d13938df181170904f92c04e207b9f0ded
--- /dev/null
+++ b/deps/SZ/zstd/decompress/zstd_decompress.c
@@ -0,0 +1,3040 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Select how default decompression function ZSTD_decompress() allocates its context,
+ * on stack (0), or into heap (1, default; requires malloc()).
+ * Note that functions with explicit context such as ZSTD_decompressDCtx() are unaffected.
+ */
+#ifndef ZSTD_HEAPMODE
+#  define ZSTD_HEAPMODE 1
+#endif
+
+/*!
+*  LEGACY_SUPPORT :
+*  if set to 1+, ZSTD_decompress() can decode older formats (v0.1+)
+*/
+#ifndef ZSTD_LEGACY_SUPPORT
+#  define ZSTD_LEGACY_SUPPORT 0
+#endif
+
+/*!
+ *  MAXWINDOWSIZE_DEFAULT :
+ *  maximum window size accepted by DStream __by default__.
+ *  Frames requiring more memory will be rejected.
+ *  It's possible to set a different limit using ZSTD_DCtx_setMaxWindowSize().
+ */
+#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT
+#  define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_DEFAULTMAX) + 1)
+#endif
+
+
+/*!
+ *  NO_FORWARD_PROGRESS_MAX :
+ *  maximum allowed nb of calls to ZSTD_decompressStream() and ZSTD_decompress_generic()
+ *  without any forward progress
+ *  (defined as: no byte read from input, and no byte flushed to output)
+ *  before triggering an error.
+ */
+#ifndef ZSTD_NO_FORWARD_PROGRESS_MAX
+#  define ZSTD_NO_FORWARD_PROGRESS_MAX 16
+#endif
+
+/*-*******************************************************
+*  Dependencies
+*********************************************************/
+#include <string.h>      /* memcpy, memmove, memset */
+#include "cpu.h"
+#include "mem.h"         /* low level memory routines */
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#include "zstd_internal.h"
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+#  include "zstd_legacy.h"
+#endif
+
+
+/*-*************************************
+*  Errors
+***************************************/
+#define ZSTD_isError ERR_isError   /* for inlining */
+#define FSE_isError  ERR_isError   /* same alias for the FSE_ spelling */
+#define HUF_isError  ERR_isError   /* same alias for the HUF_ spelling */
+
+
+/*_*******************************************************
+*  Memory operations
+**********************************************************/
+static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }   /* fixed-size copy of exactly 4 bytes */
+
+
+/*-*************************************************************
+*   Context management
+***************************************************************/
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock,
+               ZSTDds_decompressLastBlock, ZSTDds_checkChecksum,
+               ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage;   /* type of the `stage` field of struct ZSTD_DCtx_s */
+
+typedef enum { zdss_init=0, zdss_loadHeader,
+               zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;   /* type of the `streamStage` field; ZSTD_initDCtx_internal() starts it at zdss_init */
+
+
+typedef struct {   /* two-field header record; NOTE(review): presumably stored in the first cell of a sequence table - confirm against the table builder */
+    U32 fastMode;
+    U32 tableLog;
+} ZSTD_seqSymbol_header;
+
+typedef struct {   /* one cell of the LL / OF / ML tables declared below */
+    U16  nextState;
+    BYTE nbAdditionalBits;
+    BYTE nbBits;
+    U32  baseValue;
+} ZSTD_seqSymbol;
+
+#define SEQSYMBOL_TABLE_SIZE(log)   (1 + (1 << (log)))   /* 2^log cells plus one extra cell */
+
+typedef struct {   /* all decoding tables of a context, embedded as the `entropy` field of struct ZSTD_DCtx_s */
+    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];
+    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];
+    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];
+    HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)];  /* can accommodate HUF_decompress4X */
+    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];   /* same element count as the on-stack scratch arrays of the HUF_*_DCtx() helpers */
+    U32 rep[ZSTD_REP_NUM];
+} ZSTD_entropyDTables_t;
+
+struct ZSTD_DCtx_s
+{   /* field order matters : ZSTD_copyDCtx() copies every byte located before `inBuff` */
+    const ZSTD_seqSymbol* LLTptr;
+    const ZSTD_seqSymbol* MLTptr;
+    const ZSTD_seqSymbol* OFTptr;
+    const HUF_DTable* HUFptr;
+    ZSTD_entropyDTables_t entropy;
+    const void* previousDstEnd;   /* detect continuity */
+    const void* prefixStart;      /* start of current segment */
+    const void* virtualStart;     /* virtual start of previous segment if it was just before current one */
+    const void* dictEnd;          /* end of previous segment */
+    size_t expected;
+    ZSTD_frameHeader fParams;
+    U64 decodedSize;
+    blockType_e bType;            /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */
+    ZSTD_dStage stage;
+    U32 litEntropy;
+    U32 fseEntropy;
+    XXH64_state_t xxhState;
+    size_t headerSize;
+    U32 dictID;
+    ZSTD_format_e format;         /* ZSTD_initDCtx_internal() sets ZSTD_f_zstd1 */
+    const BYTE* litPtr;
+    ZSTD_customMem customMem;     /* set by ZSTD_createDCtx_advanced(), reused by ZSTD_freeDCtx() */
+    size_t litSize;
+    size_t rleSize;
+    size_t staticSize;            /* 0 after ZSTD_initDCtx_internal(); ZSTD_initStaticDCtx() stores the workspace size, and ZSTD_freeDCtx() refuses non-zero values */
+    int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+
+    /* streaming */
+    ZSTD_DDict* ddictLocal;       /* released by ZSTD_freeDCtx(), counted by ZSTD_sizeof_DCtx() */
+    const ZSTD_DDict* ddict;
+    ZSTD_dStreamStage streamStage;
+    char*  inBuff;                /* released by ZSTD_freeDCtx(); ZSTD_initStaticDCtx() points it just past the struct */
+    size_t inBuffSize;
+    size_t inPos;
+    size_t maxWindowSize;         /* ZSTD_initDCtx_internal() sets ZSTD_MAXWINDOWSIZE_DEFAULT */
+    char*  outBuff;
+    size_t outBuffSize;
+    size_t outStart;
+    size_t outEnd;
+    size_t lhSize;
+    void* legacyContext;
+    U32 previousLegacyVersion;
+    U32 legacyVersion;
+    U32 hostageByte;
+    int noForwardProgress;
+
+    /* workspace */
+    BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
+    BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
+};  /* typedef'd to ZSTD_DCtx within "zstd.h" */
+
+size_t ZSTD_sizeof_DCtx (const ZSTD_DCtx* dctx)
+{
+    /* footprint = the context struct + its local dictionary + the in/out buffer sizes */
+    size_t const ownSize = sizeof(*dctx);   /* sizeof does not evaluate its operand : harmless when dctx is NULL */
+    if (dctx == NULL) return 0;   /* support sizeof NULL */
+    return ownSize + ZSTD_sizeof_DDict(dctx->ddictLocal) + dctx->inBuffSize + dctx->outBuffSize;
+}
+
+size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); }   /* struct only : unlike ZSTD_sizeof_DCtx(), adds no buffer or dictionary size */
+
+
+static size_t ZSTD_startingInputLength(ZSTD_format_e format)
+{
+    /* the magicless format needs ZSTD_frameIdSize fewer bytes up front than the regular one */
+    size_t const idBytes = (format == ZSTD_f_zstd1_magicless) ? 0 : ZSTD_frameIdSize;
+    size_t const minLength = (ZSTD_frameHeaderSize_prefix - ZSTD_frameIdSize) + idBytes;
+    ZSTD_STATIC_ASSERT(ZSTD_FRAMEHEADERSIZE_PREFIX >= ZSTD_FRAMEIDSIZE);   /* guards the subtraction above, assuming the lower-case constants mirror these macros */
+    /* no other format is supported */
+    assert( (format == ZSTD_f_zstd1_magicless) || (format == ZSTD_f_zstd1) );
+    return minLength;
+}
+
+static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
+{
+    /* defaults */
+    dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+    dctx->format = ZSTD_f_zstd1;  /* ZSTD_decompressBegin() invokes ZSTD_startingInputLength() with argument dctx->format */
+    dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
+    dctx->staticSize = 0;
+    /* nothing attached yet : no dictionary, no legacy context */
+    dctx->ddictLocal = NULL;
+    dctx->ddict = NULL;
+    dctx->legacyContext = NULL;
+    dctx->previousLegacyVersion = 0;
+    /* streaming state starts empty */
+    dctx->inBuff = NULL;   dctx->inBuffSize = dctx->outBuffSize = 0;
+    dctx->streamStage = zdss_init;   dctx->noForwardProgress = 0;
+}
+
+ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize)
+{
+    ZSTD_DCtx* const staticCtx = (ZSTD_DCtx*) workspace;
+    int const misaligned = ((size_t)workspace & 7) != 0;        /* address must be a multiple of 8 */
+    int const tooSmall = workspaceSize < sizeof(ZSTD_DCtx);     /* must at least hold the context struct */
+    if (misaligned || tooSmall) return NULL;
+
+    ZSTD_initDCtx_internal(staticCtx);
+    staticCtx->staticSize = workspaceSize;       /* non-zero from here on, which is what ZSTD_freeDCtx() tests */
+    staticCtx->inBuff = (char*)(staticCtx+1);    /* first byte past the struct, inside the caller's workspace */
+    return staticCtx;
+}
+
+ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem)
+{
+    /* custom allocator and custom deallocator must be supplied together, or not at all */
+    if ((customMem.customAlloc == NULL) != (customMem.customFree == NULL)) return NULL;
+    {   ZSTD_DCtx* const created = (ZSTD_DCtx*)ZSTD_malloc(sizeof(ZSTD_DCtx), customMem);
+        if (created != NULL) {
+            created->customMem = customMem;   /* kept so ZSTD_freeDCtx() can release through the same functions */
+            ZSTD_initDCtx_internal(created);
+        }
+        return created;
+}   }
+
+ZSTD_DCtx* ZSTD_createDCtx(void)
+{   /* convenience constructor : ZSTD_createDCtx_advanced() called with ZSTD_defaultCMem; may return NULL like it does */
+    DEBUGLOG(3, "ZSTD_createDCtx");
+    return ZSTD_createDCtx_advanced(ZSTD_defaultCMem);
+}
+
+size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+{   /* Releases a context and what it owns. @return : 0, or a memory_allocation error code when the context is static. */
+    if (dctx==NULL) return 0;   /* support free on NULL */
+    if (dctx->staticSize) return ERROR(memory_allocation);   /* not compatible with static DCtx */
+    {   ZSTD_customMem const cMem = dctx->customMem;   /* copied out first : dctx itself is released at the end */
+        ZSTD_freeDDict(dctx->ddictLocal);
+        dctx->ddictLocal = NULL;
+        ZSTD_free(dctx->inBuff, cMem);   /* inBuff is the only buffer pointer freed here; outBuff gets no separate free */
+        dctx->inBuff = NULL;
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+        if (dctx->legacyContext)
+            ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion);
+#endif
+        ZSTD_free(dctx, cMem);   /* dctx must not be touched after this line */
+        return 0;
+    }
+}
+
+/* no longer useful */
+void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
+{   /* shallow copy of the leading part of the context : every field declared before `inBuff`, pointers included */
+    size_t const toCopy = (size_t)((char*)(&dstDCtx->inBuff) - (char*)dstDCtx);   /* byte offset of `inBuff` inside the struct */
+    memcpy(dstDCtx, srcDCtx, toCopy);  /* no need to copy workspace */
+}
+
+
+/*-*************************************************************
+ *   Frame header decoding
+ ***************************************************************/
+
+/*! ZSTD_isFrame() :
+ *  Tells if the content of `buffer` starts with a valid Frame Identifier.
+ *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ *  Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ *  Note 3 : Skippable Frame Identifiers are considered valid. */
+unsigned ZSTD_isFrame(const void* buffer, size_t size)
+{
+    if (size < ZSTD_frameIdSize) return 0;   /* too short to hold an identifier */
+    {   U32 const frameId = MEM_readLE32(buffer);
+        int const isSkippable = (frameId & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START;   /* low 4 bits are ignored */
+        if ((frameId == ZSTD_MAGICNUMBER) || isSkippable) return 1;
+    }
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+    if (ZSTD_isLegacy(buffer, size)) return 1;
+#endif
+    return 0;
+}
+
+/** ZSTD_frameHeaderSize_internal() :
+ *  srcSize must be large enough to reach header size fields.
+ *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless.
+ * @return : size of the Frame Header
+ *           or an error code, which can be tested with ZSTD_isError() */
+static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format)
+{
+    size_t const prefixSize = ZSTD_startingInputLength(format);
+    if (srcSize < prefixSize) return ERROR(srcSize_wrong);
+
+    {   BYTE const descriptor = ((const BYTE*)src)[prefixSize-1];   /* last byte of the prefix */
+        U32 const didCode = descriptor & 3;
+        U32 const fcsCode = descriptor >> 6;
+        U32 const isSingleSegment = (descriptor >> 5) & 1;
+        size_t const windowBytes = isSingleSegment ? 0 : 1;             /* one more byte unless the single-segment bit is set */
+        size_t const implicitFcsByte = (isSingleSegment && !fcsCode);   /* one more byte when single-segment is set and the size code is 0 */
+        return prefixSize + windowBytes + ZSTD_did_fieldSize[didCode] + ZSTD_fcs_fieldSize[fcsCode] + implicitFcsByte;
+    }
+}
+
+/** ZSTD_frameHeaderSize() :
+ *  srcSize must be >= ZSTD_frameHeaderSize_prefix.
+ * @return : size of the Frame Header,
+ *           or an error code (if srcSize is too small) */
+size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
+{   /* fixed-format flavour : always evaluates the header as ZSTD_f_zstd1 */
+    return ZSTD_frameHeaderSize_internal(src, srcSize, ZSTD_f_zstd1);
+}
+
+
+/** ZSTD_getFrameHeader_advanced() :
+ *  decode Frame Header, or require larger `srcSize`.
+ *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ *           or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format)
+{
+    const BYTE* ip = (const BYTE*)src;
+    size_t const minInputSize = ZSTD_startingInputLength(format);
+
+    if (srcSize < minInputSize) return minInputSize;   /* not an error : reports how many bytes are wanted */
+
+    if ( (format != ZSTD_f_zstd1_magicless)
+      && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {   /* the magic number is only checked when the format carries one */
+        if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+            /* skippable frame */
+            if (srcSize < ZSTD_skippableHeaderSize)
+                return ZSTD_skippableHeaderSize; /* magic number + frame length */
+            memset(zfhPtr, 0, sizeof(*zfhPtr));   /* every other field reads as 0 for a skippable frame */
+            zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_frameIdSize);   /* 32-bit value stored right after the identifier */
+            zfhPtr->frameType = ZSTD_skippableFrame;
+            return 0;
+        }
+        return ERROR(prefix_unknown);
+    }
+
+    /* ensure there is enough `srcSize` to fully read/decode frame header */
+    {   size_t const fhsize = ZSTD_frameHeaderSize_internal(src, srcSize, format);
+        if (srcSize < fhsize) return fhsize;
+        zfhPtr->headerSize = (U32)fhsize;
+    }
+
+    {   BYTE const fhdByte = ip[minInputSize-1];   /* descriptor byte : last byte of the minimal prefix */
+        size_t pos = minInputSize;                 /* read cursor for the optional fields that follow */
+        U32 const dictIDSizeCode = fhdByte&3;      /* bits 0-1 */
+        U32 const checksumFlag = (fhdByte>>2)&1;   /* bit 2 */
+        U32 const singleSegment = (fhdByte>>5)&1;  /* bit 5 */
+        U32 const fcsID = fhdByte>>6;              /* bits 6-7 */
+        U64 windowSize = 0;
+        U32 dictID = 0;
+        U64 frameContentSize = ZSTD_CONTENTSIZE_UNKNOWN;   /* stays UNKNOWN when no size field is present */
+        if ((fhdByte & 0x08) != 0)
+            return ERROR(frameParameter_unsupported); /* reserved bits, must be zero */
+
+        if (!singleSegment) {
+            BYTE const wlByte = ip[pos++];
+            U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN;   /* high 5 bits, offset by the absolute minimum */
+            if (windowLog > ZSTD_WINDOWLOG_MAX)
+                return ERROR(frameParameter_windowTooLarge);
+            windowSize = (1ULL << windowLog);                 /* base : 2^windowLog */
+            windowSize += (windowSize >> 3) * (wlByte&7);     /* plus (low 3 bits) eighths of the base */
+        }
+        switch(dictIDSizeCode)
+        {
+            default: assert(0);  /* impossible */
+            case 0 : break;
+            case 1 : dictID = ip[pos]; pos++; break;
+            case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break;
+            case 3 : dictID = MEM_readLE32(ip+pos); pos+=4; break;
+        }
+        switch(fcsID)   /* `pos` is not advanced below : nothing is read after this field */
+        {
+            default: assert(0);  /* impossible */
+            case 0 : if (singleSegment) frameContentSize = ip[pos]; break;   /* 1-byte size, read only for single-segment frames */
+            case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break;     /* 2-byte size, stored minus 256 */
+            case 2 : frameContentSize = MEM_readLE32(ip+pos); break;
+            case 3 : frameContentSize = MEM_readLE64(ip+pos); break;
+        }
+        if (singleSegment) windowSize = frameContentSize;   /* no window byte was read : the content size stands in */
+
+        zfhPtr->frameType = ZSTD_frame;
+        zfhPtr->frameContentSize = frameContentSize;
+        zfhPtr->windowSize = windowSize;
+        zfhPtr->blockSizeMax = (unsigned) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+        zfhPtr->dictID = dictID;
+        zfhPtr->checksumFlag = checksumFlag;
+    }
+    return 0;
+}
+
+/** ZSTD_getFrameHeader() :
+ *  decode Frame Header, or require larger `srcSize`.
+ *  note : this function does not consume input, it only reads it.
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ *           or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize)
+{   /* fixed-format flavour : always decodes the header as ZSTD_f_zstd1 */
+    return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1);
+}
+
+
+/** ZSTD_getFrameContentSize() :
+ *  compatible with legacy mode
+ * @return : decompressed size of the single frame pointed to by `src` if known, otherwise
+ *         - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ *         - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */
+unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize)
+{
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+    if (ZSTD_isLegacy(src, srcSize)) {
+        unsigned long long const legacySize = ZSTD_getDecompressedSize_legacy(src, srcSize);
+        if (legacySize != 0) return legacySize;
+        return ZSTD_CONTENTSIZE_UNKNOWN;   /* a legacy result of 0 is reported as unknown */
+    }
+#endif
+    {   ZSTD_frameHeader header;
+        size_t const status = ZSTD_getFrameHeader(&header, src, srcSize);
+        /* anything but 0 is either an error code or a request for more input */
+        if (status != 0) return ZSTD_CONTENTSIZE_ERROR;
+        /* a skippable frame is reported with a content size of 0 */
+        return (header.frameType == ZSTD_skippableFrame) ? 0 : header.frameContentSize;
+    }
+}
+
+/** ZSTD_findDecompressedSize() :
+ *  compatible with legacy mode
+ *  `srcSize` must be the exact length of some number of ZSTD compressed and/or skippable frames
+ *  @return : sum of the decompressed sizes of all frames contained in `src`,
+ *            or ZSTD_CONTENTSIZE_UNKNOWN / ZSTD_CONTENTSIZE_ERROR (never a size_t error code) */
+unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
+{
+    unsigned long long totalDstSize = 0;
+
+    while (srcSize >= ZSTD_frameHeaderSize_prefix) {
+        U32 const magicNumber = MEM_readLE32(src);
+
+        if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {   /* skippable frames add nothing to the total */
+            size_t skippableSize;
+            if (srcSize < ZSTD_skippableHeaderSize)
+                return ZSTD_CONTENTSIZE_ERROR;   /* a size_t error code would compare below ZSTD_CONTENTSIZE_ERROR and be read as a size */
+            skippableSize = MEM_readLE32((const BYTE *)src + ZSTD_frameIdSize)
+                          + ZSTD_skippableHeaderSize;
+            if ((skippableSize < ZSTD_skippableHeaderSize) || (srcSize < skippableSize)) {   /* first test : the sum wrapped (possible when size_t is 32 bits) */
+                return ZSTD_CONTENTSIZE_ERROR;
+            }
+
+            src = (const BYTE *)src + skippableSize;
+            srcSize -= skippableSize;
+            continue;
+        }
+
+        {   unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
+            if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret;   /* forwards both sentinels : ERROR and UNKNOWN */
+
+            /* check for overflow */
+            if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR;
+            totalDstSize += ret;
+        }
+        {   size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize);   /* how far to advance to reach the next frame */
+            if (ZSTD_isError(frameSrcSize)) {
+                return ZSTD_CONTENTSIZE_ERROR;
+            }
+
+            src = (const BYTE *)src + frameSrcSize;
+            srcSize -= frameSrcSize;
+        }
+    }  /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */
+
+    if (srcSize) return ZSTD_CONTENTSIZE_ERROR;   /* leftover bytes too short to start another frame */
+
+    return totalDstSize;
+}
+
+/** ZSTD_getDecompressedSize() :
+*   compatible with legacy mode
+*   @return : decompressed size if known, 0 otherwise
+              note : 0 can mean any of the following :
+                   - frame content is empty
+                   - decompressed size field is not present in frame header
+                   - frame header unknown / not supported, or not complete (`srcSize` too small) */
+unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize)
+{
+    unsigned long long const contentSize = ZSTD_getFrameContentSize(src, srcSize);
+    ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_ERROR < ZSTD_CONTENTSIZE_UNKNOWN);   /* hence one `>=` test covers both sentinels */
+    if (contentSize >= ZSTD_CONTENTSIZE_ERROR) return 0;
+    return contentSize;
+}
+
+
+/** ZSTD_decodeFrameHeader() :
+*   Decodes the frame header at `src` into dctx->fParams. `headerSize` must be the size provided by ZSTD_frameHeaderSize().
+*   @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */
+static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize)
+{
+    size_t const hdrStatus = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format);
+    if (ZSTD_isError(hdrStatus)) return hdrStatus;      /* invalid header */
+    if (hdrStatus != 0) return ERROR(srcSize_wrong);    /* headerSize too small */
+    if ((dctx->fParams.dictID != 0) && (dctx->fParams.dictID != dctx->dictID))
+        return ERROR(dictionary_wrong);                 /* the frame names a dictionary other than the one held by dctx */
+    if (dctx->fParams.checksumFlag != 0) XXH64_reset(&dctx->xxhState, 0);   /* restart the checksum state, seed 0 */
+    return 0;
+}
+
+
+/*-*************************************************************
+ *   Block decoding
+ ***************************************************************/
+
+/*! ZSTD_getcBlockSize() :
+*   Fills `bpPtr` from block header `src`; @return : size of compressed block (1 for an RLE block), or an error code */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+                          blockProperties_t* bpPtr)
+{
+    U32 header, sizeField;
+    if (srcSize < ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+
+    header    = MEM_readLE24(src);
+    sizeField = header >> 3;
+    bpPtr->lastBlock = header & 1;
+    bpPtr->blockType = (blockType_e)((header >> 1) & 3);
+    bpPtr->origSize  = sizeField;   /* only useful for RLE */
+    if (bpPtr->blockType == bt_reserved) return ERROR(corruption_detected);
+    return (bpPtr->blockType == bt_rle) ? (size_t)1 : (size_t)sizeField;
+}
+
+
+static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity,
+                          const void* src, size_t srcSize)
+{   /* copies the `srcSize` bytes of `src` into `dst`. @return : nb of bytes written, or an error code */
+    if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                               size_t regenSize)
+{   /* fills `dst` with `regenSize` copies of the single byte at `src`. @return : regenSize, or an error code */
+    if (srcSize != 1) return ERROR(srcSize_wrong);   /* the source must be exactly one byte */
+    if (regenSize > dstCapacity) return ERROR(dstSize_tooSmall);
+    memset(dst, *(const BYTE*)src, regenSize);
+    return regenSize;
+}
+
+/*! ZSTD_decodeLiteralsBlock() :
+ *  Decodes the literals section at `src`; on success dctx->litPtr / dctx->litSize designate the literals.
+ * @return : nb of bytes read from src (< srcSize ), or an error code. note : symbol not declared but exposed for fullbench */
+size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+{
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    {   const BYTE* const istart = (const BYTE*) src;
+        symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);   /* 2 lowest bits of the first byte */
+
+        switch(litEncType)
+        {
+        case set_repeat:   /* decodes with the table already referenced by dctx->HUFptr */
+            if (dctx->litEntropy==0) return ERROR(dictionary_corrupted);
+            /* fall-through */
+        case set_compressed:
+            if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
+            {   size_t lhSize, litSize, litCSize;
+                U32 singleStream=0;
+                U32 const lhlCode = (istart[0] >> 2) & 3;   /* selects the header layout below */
+                U32 const lhc = MEM_readLE32(istart);
+                switch(lhlCode)
+                {
+                case 0: case 1: default:   /* note : default is impossible, since lhlCode into [0..3] */
+                    /* 2 - 2 - 10 - 10 */
+                    singleStream = !lhlCode;
+                    lhSize = 3;
+                    litSize  = (lhc >> 4) & 0x3FF;
+                    litCSize = (lhc >> 14) & 0x3FF;
+                    break;
+                case 2:
+                    /* 2 - 2 - 14 - 14 */
+                    lhSize = 4;
+                    litSize  = (lhc >> 4) & 0x3FFF;
+                    litCSize = lhc >> 18;
+                    break;
+                case 3:
+                    /* 2 - 2 - 18 - 18 */
+                    lhSize = 5;
+                    litSize  = (lhc >> 4) & 0x3FFFF;
+                    litCSize = (lhc >> 22) + (istart[4] << 10);   /* upper bits come from the 5th header byte */
+                    break;
+                }
+                if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected);
+                if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+
+                if (HUF_isError((litEncType==set_repeat) ?
+                                    ( singleStream ?
+                                        HUF_decompress1X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) :
+                                        HUF_decompress4X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) ) :
+                                    ( singleStream ?
+                                        HUF_decompress1X1_DCtx_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
+                                                                         dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2) :
+                                        HUF_decompress4X_hufOnly_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
+                                                                           dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2))))
+                    return ERROR(corruption_detected);
+
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                dctx->litEntropy = 1;
+                if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;   /* table that later set_repeat blocks decode with */
+                memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);   /* zero the WILDCOPY_OVERLENGTH bytes following the literals */
+                return litCSize + lhSize;
+            }
+
+        case set_basic:   /* literals are stored as is : copied, or referenced in place */
+            {   size_t litSize, lhSize;
+                U32 const lhlCode = ((istart[0]) >> 2) & 3;
+                switch(lhlCode)
+                {
+                case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
+                    lhSize = 1;
+                    litSize = istart[0] >> 3;
+                    break;
+                case 1:
+                    lhSize = 2;
+                    litSize = MEM_readLE16(istart) >> 4;
+                    break;
+                case 3:
+                    lhSize = 3;
+                    litSize = MEM_readLE24(istart) >> 4;
+                    break;
+                }
+
+                if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
+                    if (litSize+lhSize > srcSize) return ERROR(corruption_detected);
+                    memcpy(dctx->litBuffer, istart+lhSize, litSize);
+                    dctx->litPtr = dctx->litBuffer;
+                    dctx->litSize = litSize;
+                    memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+                    return lhSize+litSize;
+                }
+                /* direct reference into compressed stream */
+                dctx->litPtr = istart+lhSize;
+                dctx->litSize = litSize;
+                return lhSize+litSize;
+            }
+
+        case set_rle:   /* one byte, repeated to fill the literals buffer */
+            {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
+                size_t litSize, lhSize;
+                switch(lhlCode)
+                {
+                case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
+                    lhSize = 1;
+                    litSize = istart[0] >> 3;
+                    break;
+                case 1:
+                    lhSize = 2;
+                    litSize = MEM_readLE16(istart) >> 4;
+                    break;
+                case 3:
+                    lhSize = 3;
+                    litSize = MEM_readLE24(istart) >> 4;
+                    if (srcSize<4) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
+                    break;
+                }
+                if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected);
+                memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                return lhSize+1;   /* header + the repeated byte */
+            }
+        default:
+            return ERROR(corruption_detected);   /* impossible */
+        }
+    }
+}
+
+/* Default FSE distribution tables.
+ * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
+ * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
+ * They were generated programmatically with following method :
+ * - start from default distributions, present in /lib/common/zstd_internal.h
+ * - generate tables normally, using ZSTD_buildFSETable()
+ * - printout the content of tables
+ * - prettify output, report below, test with fuzzer to ensure it's correct */
+
+/* Default FSE distribution table for Literal Lengths : one header cell followed by (1<<LL_DEFAULTNORMLOG) decoding cells */
+static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {
+     {  1,  1,  1, LL_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
+     /* nextState, nbAddBits, nbBits, baseVal */
+     {  0,  0,  4,    0},  { 16,  0,  4,    0},
+     { 32,  0,  5,    1},  {  0,  0,  5,    3},
+     {  0,  0,  5,    4},  {  0,  0,  5,    6},
+     {  0,  0,  5,    7},  {  0,  0,  5,    9},
+     {  0,  0,  5,   10},  {  0,  0,  5,   12},
+     {  0,  0,  6,   14},  {  0,  1,  5,   16},
+     {  0,  1,  5,   20},  {  0,  1,  5,   22},
+     {  0,  2,  5,   28},  {  0,  3,  5,   32},
+     {  0,  4,  5,   48},  { 32,  6,  5,   64},
+     {  0,  7,  5,  128},  {  0,  8,  6,  256},
+     {  0, 10,  6, 1024},  {  0, 12,  6, 4096},
+     { 32,  0,  4,    0},  {  0,  0,  4,    1},
+     {  0,  0,  5,    2},  { 32,  0,  5,    4},
+     {  0,  0,  5,    5},  { 32,  0,  5,    7},
+     {  0,  0,  5,    8},  { 32,  0,  5,   10},
+     {  0,  0,  5,   11},  {  0,  0,  6,   13},
+     { 32,  1,  5,   16},  {  0,  1,  5,   18},
+     { 32,  1,  5,   22},  {  0,  2,  5,   24},
+     { 32,  3,  5,   32},  {  0,  3,  5,   40},
+     {  0,  6,  4,   64},  { 16,  6,  4,   64},
+     { 32,  7,  5,  128},  {  0,  9,  6,  512},
+     {  0, 11,  6, 2048},  { 48,  0,  4,    0},
+     { 16,  0,  4,    1},  { 32,  0,  5,    2},
+     { 32,  0,  5,    3},  { 32,  0,  5,    5},
+     { 32,  0,  5,    6},  { 32,  0,  5,    8},
+     { 32,  0,  5,    9},  { 32,  0,  5,   11},
+     { 32,  0,  5,   12},  {  0,  0,  6,   15},
+     { 32,  1,  5,   18},  { 32,  1,  5,   20},
+     { 32,  2,  5,   24},  { 32,  2,  5,   28},
+     { 32,  3,  5,   40},  { 32,  4,  5,   48},
+     {  0, 16,  6,65536},  {  0, 15,  6,32768},
+     {  0, 14,  6,16384},  {  0, 13,  6, 8192},
+};   /* LL_defaultDTable */
+
+/* Default FSE distribution table for Offset Codes : one header cell followed by (1<<OF_DEFAULTNORMLOG) decoding cells */
+static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {
+    {  1,  1,  1, OF_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
+    /* nextState, nbAddBits, nbBits, baseVal */
+    {  0,  0,  5,    0},     {  0,  6,  4,   61},
+    {  0,  9,  5,  509},     {  0, 15,  5,32765},
+    {  0, 21,  5,2097149},   {  0,  3,  5,    5},
+    {  0,  7,  4,  125},     {  0, 12,  5, 4093},
+    {  0, 18,  5,262141},    {  0, 23,  5,8388605},
+    {  0,  5,  5,   29},     {  0,  8,  4,  253},
+    {  0, 14,  5,16381},     {  0, 20,  5,1048573},
+    {  0,  2,  5,    1},     { 16,  7,  4,  125},
+    {  0, 11,  5, 2045},     {  0, 17,  5,131069},
+    {  0, 22,  5,4194301},   {  0,  4,  5,   13},
+    { 16,  8,  4,  253},     {  0, 13,  5, 8189},
+    {  0, 19,  5,524285},    {  0,  1,  5,    1},
+    { 16,  6,  4,   61},     {  0, 10,  5, 1021},
+    {  0, 16,  5,65533},     {  0, 28,  5,268435453},
+    {  0, 27,  5,134217725}, {  0, 26,  5,67108861},
+    {  0, 25,  5,33554429},  {  0, 24,  5,16777213},
+};   /* OF_defaultDTable */
+
+
+/* Default FSE distribution table for Match Lengths : one header cell followed by (1<<ML_DEFAULTNORMLOG) decoding cells */
+static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
+    {  1,  1,  1, ML_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
+    /* nextState, nbAddBits, nbBits, baseVal */
+    {  0,  0,  6,    3},  {  0,  0,  4,    4},
+    { 32,  0,  5,    5},  {  0,  0,  5,    6},
+    {  0,  0,  5,    8},  {  0,  0,  5,    9},
+    {  0,  0,  5,   11},  {  0,  0,  6,   13},
+    {  0,  0,  6,   16},  {  0,  0,  6,   19},
+    {  0,  0,  6,   22},  {  0,  0,  6,   25},
+    {  0,  0,  6,   28},  {  0,  0,  6,   31},
+    {  0,  0,  6,   34},  {  0,  1,  6,   37},
+    {  0,  1,  6,   41},  {  0,  2,  6,   47},
+    {  0,  3,  6,   59},  {  0,  4,  6,   83},
+    {  0,  7,  6,  131},  {  0,  9,  6,  515},
+    { 16,  0,  4,    4},  {  0,  0,  4,    5},
+    { 32,  0,  5,    6},  {  0,  0,  5,    7},
+    { 32,  0,  5,    9},  {  0,  0,  5,   10},
+    {  0,  0,  6,   12},  {  0,  0,  6,   15},
+    {  0,  0,  6,   18},  {  0,  0,  6,   21},
+    {  0,  0,  6,   24},  {  0,  0,  6,   27},
+    {  0,  0,  6,   30},  {  0,  0,  6,   33},
+    {  0,  1,  6,   35},  {  0,  1,  6,   39},
+    {  0,  2,  6,   43},  {  0,  3,  6,   51},
+    {  0,  4,  6,   67},  {  0,  5,  6,   99},
+    {  0,  8,  6,  259},  { 32,  0,  4,    4},
+    { 48,  0,  4,    4},  { 16,  0,  4,    5},
+    { 32,  0,  5,    7},  { 32,  0,  5,    8},
+    { 32,  0,  5,   10},  { 32,  0,  5,   11},
+    {  0,  0,  6,   14},  {  0,  0,  6,   17},
+    {  0,  0,  6,   20},  {  0,  0,  6,   23},
+    {  0,  0,  6,   26},  {  0,  0,  6,   29},
+    {  0,  0,  6,   32},  {  0, 16,  6,65539},
+    {  0, 15,  6,32771},  {  0, 14,  6,16387},
+    {  0, 13,  6, 8195},  {  0, 12,  6, 4099},
+    {  0, 11,  6, 2051},  {  0, 10,  6, 1027},
+};   /* ML_defaultDTable */
+
+
+static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
+{   /* table for a single symbol : header written at dt[0], the only decoding cell at dt[1] */
+    void* const headerSlot = dt;   /* go through void* to view the first cell as a header */
+    ZSTD_seqSymbol_header* const hdr = (ZSTD_seqSymbol_header*)headerSlot;
+    ZSTD_seqSymbol* const onlyCell = &dt[1];
+
+    hdr->fastMode = 0;
+    hdr->tableLog = 0;
+
+    onlyCell->nextState = 0;
+    onlyCell->nbBits = 0;
+    assert(nbAddBits < 255);
+    onlyCell->nbAdditionalBits = (BYTE)nbAddBits;
+    onlyCell->baseValue = baseValue;
+}
+
+
+/* ZSTD_buildFSETable() :
+ * generate FSE decoding table for one symbol (ll, ml or off) : header is written at dt[0], the (1<<tableLog) cells start at dt[1] */
+static void
+ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+    const short* normalizedCounter, unsigned maxSymbolValue,
+    const U32* baseValue, const U32* nbAdditionalBits,
+    unsigned tableLog)
+{
+    ZSTD_seqSymbol* const tableDecode = dt+1;
+    U16 symbolNext[MaxSeq+1];   /* per symbol : next state value to hand out while building cells */
+
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+    U32 highThreshold = tableSize-1;   /* cells above it are taken by the -1 (low probability) symbols */
+
+    /* Sanity Checks */
+    assert(maxSymbolValue <= MaxSeq);
+    assert(tableLog <= MaxFSELog);
+
+    /* Init, lay down lowprob symbols */
+    {   ZSTD_seqSymbol_header DTableH;
+        DTableH.tableLog = tableLog;
+        DTableH.fastMode = 1;
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));   /* half of the table size */
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {   /* low probability symbol : one cell, taken from the top of the table */
+                    tableDecode[highThreshold--].baseValue = s;
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;   /* a symbol covering half the table or more disables fastMode */
+                    symbolNext[s] = normalizedCounter[s];
+        }   }   }
+        memcpy(dt, &DTableH, sizeof(DTableH));   /* the header occupies the start of dt[0] */
+    }
+
+    /* Spread symbols */
+    {   U32 const tableMask = tableSize-1;
+        U32 const step = FSE_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {   /* never entered for -1 symbols, already placed above */
+                tableDecode[position].baseValue = s;   /* baseValue temporarily holds the symbol */
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+        assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            U32 const symbol = tableDecode[u].baseValue;   /* symbol stored by the two passes above */
+            U32 const nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+            tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+            assert(nbAdditionalBits[symbol] < 255);
+            tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
+            tableDecode[u].baseValue = baseValue[symbol];   /* the temporary symbol is replaced by its final base value */
+    }   }
+}
+
+
+/*! ZSTD_buildSeqTable() :
+ *  Selects (or builds into DTableSpace) the decoding table matching `type`, and publishes it through *DTablePtr.
+ * @return : nb bytes read from src,
+ *           or an error code if it fails */
+static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
+                                 symbolEncodingType_e type, U32 max, U32 maxLog,
+                                 const void* src, size_t srcSize,
+                                 const U32* baseValue, const U32* nbAdditionalBits,
+                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable)
+{
+    switch(type)
+    {
+    case set_basic :   /* predefined table : nothing is read from src */
+        *DTablePtr = defaultTable;
+        return 0;
+    case set_repeat:   /* *DTablePtr is left untouched : requires flagRepeatTable to be set */
+        if (flagRepeatTable == 0) return ERROR(corruption_detected);
+        return 0;
+    case set_rle :     /* one byte designates the only symbol */
+        if (srcSize == 0) return ERROR(srcSize_wrong);
+        {   U32 const onlySymbol = *(const BYTE*)src;
+            if (onlySymbol > max) return ERROR(corruption_detected);
+            ZSTD_buildSeqTable_rle(DTableSpace, baseValue[onlySymbol], nbAdditionalBits[onlySymbol]);
+        }
+        *DTablePtr = DTableSpace;
+        return 1;
+    case set_compressed :   /* the normalized counts are read from src */
+        {   S16 normCount[MaxSeq+1];
+            U32 log;
+            size_t const nCountSize = FSE_readNCount(normCount, &max, &log, src, srcSize);
+            /* `log` is only inspected when FSE_readNCount() succeeded (short-circuit evaluation) */
+            if (FSE_isError(nCountSize) || (log > maxLog))
+                return ERROR(corruption_detected);
+            ZSTD_buildFSETable(DTableSpace, normCount, max, baseValue, nbAdditionalBits, log);
+            *DTablePtr = DTableSpace;
+            return nCountSize;
+        }
+    default :   /* impossible */
+        assert(0);
+        return ERROR(GENERIC);
+    }
+}
+
+static const U32 LL_base[MaxLL+1] = {   /* indexed by symbol; handed as `baseValue` to ZSTD_buildSeqTable() for literal lengths */
+                 0,    1,    2,     3,     4,     5,     6,      7,
+                 8,    9,   10,    11,    12,    13,    14,     15,
+                16,   18,   20,    22,    24,    28,    32,     40,
+                48,   64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                0x2000, 0x4000, 0x8000, 0x10000 };
+
+static const U32 OF_base[MaxOff+1] = {   /* indexed by symbol; handed as `baseValue` to ZSTD_buildSeqTable() for offsets */
+                 0,        1,       1,       5,     0xD,     0x1D,     0x3D,     0x7D,
+                 0xFD,   0x1FD,   0x3FD,   0x7FD,   0xFFD,   0x1FFD,   0x3FFD,   0x7FFD,
+                 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+                 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD };
+
+static const U32 OF_bits[MaxOff+1] = {   /* indexed by symbol; handed as `nbAdditionalBits` to ZSTD_buildSeqTable() for offsets (entry i is i) */
+                     0,  1,  2,  3,  4,  5,  6,  7,
+                     8,  9, 10, 11, 12, 13, 14, 15,
+                    16, 17, 18, 19, 20, 21, 22, 23,
+                    24, 25, 26, 27, 28, 29, 30, 31 };
+
+static const U32 ML_base[MaxML+1] = {   /* indexed by symbol; handed as `baseValue` to ZSTD_buildSeqTable() for match lengths */
+                     3,  4,  5,    6,     7,     8,     9,    10,
+                    11, 12, 13,   14,    15,    16,    17,    18,
+                    19, 20, 21,   22,    23,    24,    25,    26,
+                    27, 28, 29,   30,    31,    32,    33,    34,
+                    35, 37, 39,   41,    43,    47,    51,    59,
+                    67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+                    0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
+
+
+/*! ZSTD_decodeSeqHeaders() :
+ *  Reads the nb of sequences into *nbSeqPtr, then sets up the LL, OF and ML decoding tables of dctx
+ * @return : nb of bytes read from src, or an error code */
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+                             const void* src, size_t srcSize)
+{
+    const BYTE* const first = (const BYTE*)src;
+    const BYTE* const limit = first + srcSize;
+    const BYTE* cursor = first;
+    DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
+
+    /* check */
+    if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);
+
+    /* nb of sequences : stored on 1, 2 or 3 bytes, selected by the first byte */
+    {   int count = *cursor++;
+        if (count == 0) { *nbSeqPtr=0; return 1; }
+        if (count == 0xFF) {
+            if (cursor+2 > limit) return ERROR(srcSize_wrong);
+            count = MEM_readLE16(cursor) + LONGNBSEQ;
+            cursor += 2;
+        } else if (count > 0x7F) {
+            if (cursor >= limit) return ERROR(srcSize_wrong);
+            count = ((count-0x80)<<8) + *cursor++;
+        }
+        *nbSeqPtr = count;
+    }
+
+    /* one descriptor byte carries the three 2-bit encoding types */
+    if (cursor+4 > limit) return ERROR(srcSize_wrong); /* minimum possible size */
+    {   BYTE const descriptor = *cursor++;
+        symbolEncodingType_e const llMode = (symbolEncodingType_e)(descriptor >> 6);
+        symbolEncodingType_e const ofMode = (symbolEncodingType_e)((descriptor >> 4) & 3);
+        symbolEncodingType_e const mlMode = (symbolEncodingType_e)((descriptor >> 2) & 3);
+        size_t consumed;
+
+        /* Build DTables : literal lengths, then offsets, then match lengths */
+        consumed = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
+                                      llMode, MaxLL, LLFSELog,
+                                      cursor, (size_t)(limit-cursor),
+                                      LL_base, LL_bits,
+                                      LL_defaultDTable, dctx->fseEntropy);
+        if (ZSTD_isError(consumed)) return ERROR(corruption_detected);
+        cursor += consumed;
+
+        consumed = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
+                                      ofMode, MaxOff, OffFSELog,
+                                      cursor, (size_t)(limit-cursor),
+                                      OF_base, OF_bits,
+                                      OF_defaultDTable, dctx->fseEntropy);
+        if (ZSTD_isError(consumed)) return ERROR(corruption_detected);
+        cursor += consumed;
+
+        consumed = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
+                                      mlMode, MaxML, MLFSELog,
+                                      cursor, (size_t)(limit-cursor),
+                                      ML_base, ML_bits,
+                                      ML_defaultDTable, dctx->fseEntropy);
+        if (ZSTD_isError(consumed)) return ERROR(corruption_detected);
+        cursor += consumed;
+    }
+
+    return (size_t)(cursor - first);
+}
+
+
+typedef struct {   /* one sequence to execute : copy litLength literals, then matchLength bytes located `offset` bytes back */
+    size_t litLength;
+    size_t matchLength;
+    size_t offset;
+    const BYTE* match;   /* match source position; ZSTD_execSequenceLong() reads it instead of recomputing it from offset */
+} seq_t;
+
+typedef struct {   /* NOTE(review): not used in this part of the file; roles below are inferred from names and types -- confirm at the use sites */
+    size_t state;                  /* presumably the current FSE state value for `table` */
+    const ZSTD_seqSymbol* table;   /* presumably the decoding table the state belongs to */
+} ZSTD_fseState;
+
+typedef struct {   /* NOTE(review): not used in this part of the file; roles below are inferred from names and types -- confirm at the use sites */
+    BIT_DStream_t DStream;
+    ZSTD_fseState stateLL;     /* one ZSTD_fseState per symbol type; names suggest literal lengths, ... */
+    ZSTD_fseState stateOffb;   /* ... offsets, ... */
+    ZSTD_fseState stateML;     /* ... and match lengths */
+    size_t prevOffset[ZSTD_REP_NUM];
+    const BYTE* prefixStart;
+    const BYTE* dictEnd;
+    size_t pos;
+} seqState_t;
+
+
+FORCE_NOINLINE
+size_t ZSTD_execSequenceLast7(BYTE* op,
+                              BYTE* const oend, seq_t sequence,
+                              const BYTE** litPtr, const BYTE* const litLimit,
+                              const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{   /* variant for sequences whose literals end past oend_w : finishes copies byte by byte. @return : sequenceLength, or an error code */
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check */
+    if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* the whole sequence (literals + match) must end at or before oend */
+    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+    if (oLitEnd <= oend_w) return ERROR(GENERIC);   /* Precondition */
+
+    /* copy literals */
+    if (op < oend_w) {   /* wildcopy is only used up to oend_w; the remainder is copied one byte at a time */
+        ZSTD_wildcopy(op, *litPtr, oend_w - op);
+        *litPtr += oend_w - op;
+        op = oend_w;
+    }
+    while (op < oLitEnd) *op++ = *(*litPtr)++;
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected);
+        match = dictEnd - (base-match);   /* same distance below `base`, measured back from dictEnd */
+        if (match + sequence.matchLength <= dictEnd) {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;   /* the rest of the match continues at `base` */
+    }   }
+    while (op < oMatchEnd) *op++ = *match++;
+    return sequenceLength;
+}
+
+
+HINT_INLINE
+size_t ZSTD_execSequence(BYTE* op,
+                         BYTE* const oend, seq_t sequence,
+                         const BYTE** litPtr, const BYTE* const litLimit,
+                         const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+{   /* writes one sequence at `op` : literals taken from *litPtr (advanced on the regular path), then the match. @return : sequenceLength, or an error code */
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check */
+    if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* the whole sequence (literals + match) must end at or before oend */
+    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+    if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
+
+    /* copy Literals */
+    ZSTD_copy8(op, *litPtr);
+    if (sequence.litLength > 8)
+        ZSTD_wildcopy(op+8, (*litPtr)+8, sequence.litLength - 8);   /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* update for next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+        /* offset beyond prefix -> go into extDict */
+        if (sequence.offset > (size_t)(oLitEnd - virtualStart))
+            return ERROR(corruption_detected);
+        match = dictEnd + (match - prefixStart);   /* same distance below prefixStart, measured back from dictEnd */
+        if (match + sequence.matchLength <= dictEnd) {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = prefixStart;   /* the rest of the match continues at prefixStart */
+            if (op > oend_w || sequence.matchLength < MINMATCH) {   /* conditions required by the code below are not met : plain byte copy */
+              U32 i;
+              for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i];
+              return sequenceLength;
+            }
+    }   }
+    /* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
+        int const sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTD_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTD_copy8(op, match);
+    }
+    op += 8; match += 8;   /* 8 bytes were written by either branch above */
+
+    if (oMatchEnd > oend-(16-MINMATCH)) {
+        if (op < oend_w) {   /* wildcopy is only used up to oend_w; the remainder is copied one byte at a time */
+            ZSTD_wildcopy(op, match, oend_w - op);
+            match += oend_w - op;
+            op = oend_w;
+        }
+        while (op < oMatchEnd) *op++ = *match++;
+    } else {
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
+}
+
+/* ZSTD_execSequenceLong() : executes one sequence : copies sequence.litLength literals from *litPtr, then sequence.matchLength match bytes read from sequence.match, into op. @return : litLength + matchLength, or an error code */
+HINT_INLINE
+size_t ZSTD_execSequenceLong(BYTE* op,
+                             BYTE* const oend, seq_t sequence,
+                             const BYTE** litPtr, const BYTE* const litLimit,
+                             const BYTE* const prefixStart, const BYTE* const dictStart, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = sequence.match;
+
+    /* check */
+    if (oMatchEnd > oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+    if (oLitEnd > oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, dictStart, dictEnd);   /* literals end within the last WILDCOPY_OVERLENGTH bytes : delegate */
+
+    /* copy Literals */
+    ZSTD_copy8(op, *litPtr);  /* note : op <= oLitEnd <= oend_w == oend - 8 */
+    if (sequence.litLength > 8)
+        ZSTD_wildcopy(op+8, (*litPtr)+8, sequence.litLength - 8);   /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* update for next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - dictStart)) return ERROR(corruption_detected);
+        if (match + sequence.matchLength <= dictEnd) {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;   /* part of the match located before dictEnd */
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = prefixStart;
+            if (op > oend_w || sequence.matchLength < MINMATCH) {   /* remainder is copied byte by byte */
+              U32 i;
+              for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i];
+              return sequenceLength;
+            }
+    }   }
+    assert(op <= oend_w);
+    assert(sequence.matchLength >= MINMATCH);
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
+        int const sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTD_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTD_copy8(op, match);
+    }
+    op += 8; match += 8;   /* either branch above has written 8 bytes at op */
+
+    if (oMatchEnd > oend-(16-MINMATCH)) {
+        if (op < oend_w) {
+            ZSTD_wildcopy(op, match, oend_w - op);
+            match += oend_w - op;
+            op = oend_w;
+        }
+        while (op < oMatchEnd) *op++ = *match++;   /* finish byte by byte, stopping exactly at oMatchEnd */
+    } else {
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
+}
+/* ZSTD_initFseState() : reads the initial state (tableLog bits) from bitD, and points DStatePtr at the decoding cells which follow the table header. */
+static void
+ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
+{
+    const void* const header = dt;   /* dt[0] is read as a ZSTD_seqSymbol_header */
+    U32 const tableLog = ((const ZSTD_seqSymbol_header*)header)->tableLog;
+    DStatePtr->table = dt + 1;
+    DStatePtr->state = BIT_readBits(bitD, tableLog);
+    DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits",
+                (U32)DStatePtr->state, tableLog);
+    BIT_reloadDStream(bitD);
+}
+/* ZSTD_updateFseState() : moves DStatePtr to its next state, consuming the current cell's nbBits from bitD. */
+FORCE_INLINE_TEMPLATE void
+ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
+{
+    ZSTD_seqSymbol const cell = DStatePtr->table[DStatePtr->state];
+    size_t const extra = BIT_readBits(bitD, (U32)cell.nbBits);
+    /* next state = base value stored in the cell + the bits just read */
+    DStatePtr->state = cell.nextState + extra;
+}
+
+/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
+ * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
+ * bits before reloading. This value is the maximum number of bits we read
+ * after reloading when we are decoding long offsets.
+ */
+#define LONG_OFFSETS_MAX_EXTRA_BITS_32                       \
+    (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32       \
+        ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32  \
+        : 0)
+
+typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;   /* when set, 32-bit decoders split offset reads around a bitstream reload */
+/* ZSTD_decodeSequence() : reads one (offset, matchLength, litLength) triple from seqState->DStream, resolving repeat offsets through seqState->prevOffset, then updates the three FSE states. */
+FORCE_INLINE_TEMPLATE seq_t
+ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+{
+    seq_t seq;
+    U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
+    U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
+    U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
+    U32 const totalBits = llBits+mlBits+ofBits;
+    U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
+    U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
+    U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
+
+    /* sequence */
+    {   size_t offset;
+        if (!ofBits)
+            offset = 0;
+        else {
+            ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
+            ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
+            assert(ofBits <= MaxOff);
+            if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
+                U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
+                offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+                BIT_reloadDStream(&seqState->DStream);
+                if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+                assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32);   /* to avoid another reload */
+            } else {
+                offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
+                if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+            }
+        }
+
+        if (ofBits <= 1) {   /* offset value selects an entry of the prevOffset history */
+            offset += (llBase==0);   /* selection is shifted by one when the literal length base is 0 */
+            if (offset) {
+                size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+                temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
+                if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+                seqState->prevOffset[1] = seqState->prevOffset[0];
+                seqState->prevOffset[0] = offset = temp;
+            } else {  /* offset == 0 */
+                offset = seqState->prevOffset[0];
+            }
+        } else {   /* new offset : pushed at the front of the prevOffset history */
+            seqState->prevOffset[2] = seqState->prevOffset[1];
+            seqState->prevOffset[1] = seqState->prevOffset[0];
+            seqState->prevOffset[0] = offset;
+        }
+        seq.offset = offset;
+    }
+
+    seq.matchLength = mlBase
+                    + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0);  /* <=  16 bits */
+    if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+        BIT_reloadDStream(&seqState->DStream);
+    if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+        BIT_reloadDStream(&seqState->DStream);
+    /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
+    ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+    seq.litLength = llBase
+                  + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0);    /* <=  16 bits */
+    if (MEM_32bits())
+        BIT_reloadDStream(&seqState->DStream);
+
+    DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+                (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+
+    /* ANS state update */
+    ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream);    /* <=  9 bits */
+    ZSTD_updateFseState(&seqState->stateML, &seqState->DStream);    /* <=  9 bits */
+    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
+    ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream);  /* <=  8 bits */
+
+    return seq;
+}
+/* ZSTD_decompressSequences_body() : decodes then executes up to nbSeq sequences, one at a time, into dst, and finally appends the remaining literals. @return : nb of bytes written into dst, or an error code */
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize, int nbSeq,
+                         const ZSTD_longOffset_e isLongOffset)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + maxDstSize;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+    const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+    DEBUGLOG(5, "ZSTD_decompressSequences_body");
+
+    /* Regen sequences */
+    if (nbSeq) {
+        seqState_t seqState;
+        dctx->fseEntropy = 1;
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+        CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected);
+        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) {
+            nbSeq--;
+            {   seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+                size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
+                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+                op += oneSeqSize;
+        }   }
+
+        /* check if reached exact end */
+        DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
+        if (nbSeq) return ERROR(corruption_detected);   /* the bitstream status stopped the loop before all sequences were decoded */
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+    }
+
+    /* last literal segment */
+    {   size_t const lastLLSize = litEnd - litPtr;
+        if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall);
+        memcpy(op, litPtr, lastLLSize);
+        op += lastLLSize;
+    }
+
+    return op-ostart;
+}
+/* Instantiation of ZSTD_decompressSequences_body() compiled without any specific target attribute. */
+static size_t
+ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+                                 const void* seqStart, size_t seqSize, int nbSeq,
+                                 const ZSTD_longOffset_e isLongOffset)
+{
+    size_t const decodedSize = ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+    return decodedSize;
+}
+
+
+/* ZSTD_decodeSequenceLong() : reads one sequence from seqState->DStream and also precomputes seq.match (the match source address) from seqState->pos, then updates the three FSE states. */
+FORCE_INLINE_TEMPLATE seq_t
+ZSTD_decodeSequenceLong(seqState_t* seqState, ZSTD_longOffset_e const longOffsets)
+{
+    seq_t seq;
+    U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
+    U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
+    U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
+    U32 const totalBits = llBits+mlBits+ofBits;
+    U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
+    U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
+    U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
+
+    /* sequence */
+    {   size_t offset;
+        if (!ofBits)
+            offset = 0;
+        else {
+            ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
+            ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
+            assert(ofBits <= MaxOff);
+            if (MEM_32bits() && longOffsets) {
+                U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1);
+                offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+                if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream);   /* NOTE(review): condition is always true inside this MEM_32bits() branch */
+                if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+            } else {
+                offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
+                if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+            }
+        }
+
+        if (ofBits <= 1) {   /* offset value selects an entry of the prevOffset history */
+            offset += (llBase==0);   /* selection is shifted by one when the literal length base is 0 */
+            if (offset) {
+                size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+                temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
+                if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+                seqState->prevOffset[1] = seqState->prevOffset[0];
+                seqState->prevOffset[0] = offset = temp;
+            } else {
+                offset = seqState->prevOffset[0];
+            }
+        } else {   /* new offset : pushed at the front of the prevOffset history */
+            seqState->prevOffset[2] = seqState->prevOffset[1];
+            seqState->prevOffset[1] = seqState->prevOffset[0];
+            seqState->prevOffset[0] = offset;
+        }
+        seq.offset = offset;
+    }
+
+    seq.matchLength = mlBase + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0);  /* <=  16 bits */
+    if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+        BIT_reloadDStream(&seqState->DStream);
+    if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+        BIT_reloadDStream(&seqState->DStream);
+    /* Verify that there is enough bits to read the rest of the data in 64-bit mode. */
+    ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+    seq.litLength = llBase + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0);    /* <=  16 bits */
+    if (MEM_32bits())
+        BIT_reloadDStream(&seqState->DStream);
+
+    {   size_t const pos = seqState->pos + seq.litLength;   /* running position once this sequence's literals are accounted for */
+        const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
+        seq.match = matchBase + pos - seq.offset;  /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
+                                                    * No consequence though : no memory access will occur, overly large offset will be detected in ZSTD_execSequenceLong() */
+        seqState->pos = pos + seq.matchLength;
+    }
+
+    /* ANS state update */
+    ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream);    /* <=  9 bits */
+    ZSTD_updateFseState(&seqState->stateML, &seqState->DStream);    /* <=  9 bits */
+    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
+    ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream);  /* <=  8 bits */
+
+    return seq;
+}
+/* ZSTD_decompressSequencesLong_body() : decodes sequences up to ADVANCED_SEQS ahead of their execution, prefetching each decoded match address in between. @return : nb of bytes written into dst, or an error code */
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_decompressSequencesLong_body(
+                               ZSTD_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize, int nbSeq,
+                         const ZSTD_longOffset_e isLongOffset)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + maxDstSize;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+    const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+
+    /* Regen sequences */
+    if (nbSeq) {
+#define STORED_SEQS 4
+#define STOSEQ_MASK (STORED_SEQS-1)
+#define ADVANCED_SEQS 4
+        seq_t sequences[STORED_SEQS];   /* circular buffer of sequences decoded but not yet executed */
+        int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
+        seqState_t seqState;
+        int seqNb;
+        dctx->fseEntropy = 1;
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+        seqState.prefixStart = prefixStart;
+        seqState.pos = (size_t)(op-prefixStart);
+        seqState.dictEnd = dictEnd;
+        CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected);
+        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+        /* prepare in advance */
+        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
+            sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
+        }
+        if (seqNb<seqAdvance) return ERROR(corruption_detected);   /* the bitstream status stopped the loop too early */
+
+        /* decode and decompress */
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
+            seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
+            size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STOSEQ_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+            PREFETCH(sequence.match);  /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+            sequences[seqNb&STOSEQ_MASK] = sequence;
+            op += oneSeqSize;
+        }
+        if (seqNb<nbSeq) return ERROR(corruption_detected);
+
+        /* finish queue */
+        seqNb -= seqAdvance;   /* rewind to the first sequence which has not been executed yet */
+        for ( ; seqNb<nbSeq ; seqNb++) {
+            size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[seqNb&STOSEQ_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
+        }
+
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+#undef STORED_SEQS
+#undef STOSEQ_MASK
+#undef ADVANCED_SEQS
+    }
+
+    /* last literal segment */
+    {   size_t const lastLLSize = litEnd - litPtr;
+        if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall);
+        memcpy(op, litPtr, lastLLSize);
+        op += lastLLSize;
+    }
+
+    return op-ostart;
+}
+/* Instantiation of ZSTD_decompressSequencesLong_body() compiled without any specific target attribute. */
+static size_t
+ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+                                     const void* seqStart, size_t seqSize, int nbSeq,
+                                     const ZSTD_longOffset_e isLongOffset)
+{
+    size_t const decodedSize = ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+    return decodedSize;
+}
+
+
+/* Instantiations of both sequence decoder bodies carrying the "bmi2" target attribute; only built when DYNAMIC_BMI2 is enabled. */
+#if DYNAMIC_BMI2
+
+static TARGET_ATTRIBUTE("bmi2") size_t
+ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+                              const void* seqStart, size_t seqSize, int nbSeq,
+                              const ZSTD_longOffset_e isLongOffset)
+{
+    size_t const decodedSize = ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+    return decodedSize;
+}
+
+static TARGET_ATTRIBUTE("bmi2") size_t
+ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+                                  const void* seqStart, size_t seqSize, int nbSeq,
+                                  const ZSTD_longOffset_e isLongOffset)
+{
+    size_t const decodedSize = ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+    return decodedSize;
+}
+
+#endif
+/* Function pointer type matching the signature shared by the ZSTD_decompressSequences*() functions of this file. */
+typedef size_t (*ZSTD_decompressSequences_t)(
+    ZSTD_DCtx *dctx, void *dst, size_t maxDstSize,
+    const void *seqStart, size_t seqSize, int nbSeq,
+    const ZSTD_longOffset_e isLongOffset);
+/* ZSTD_decompressSequences() : runs the regular sequence decoder, using its bmi2 instantiation when DYNAMIC_BMI2 is enabled and dctx->bmi2 is set. */
+static size_t ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+                                const void* seqStart, size_t seqSize, int nbSeq,
+                                const ZSTD_longOffset_e isLongOffset)
+{
+    DEBUGLOG(5, "ZSTD_decompressSequences");
+#if DYNAMIC_BMI2
+    if (dctx->bmi2)
+        return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+#endif
+    /* otherwise : instantiation without target attribute */
+    return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+}
+/* ZSTD_decompressSequencesLong() : runs the prefetching sequence decoder, using its bmi2 instantiation when DYNAMIC_BMI2 is enabled and dctx->bmi2 is set. */
+static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
+                                void* dst, size_t maxDstSize,
+                                const void* seqStart, size_t seqSize, int nbSeq,
+                                const ZSTD_longOffset_e isLongOffset)
+{
+    DEBUGLOG(5, "ZSTD_decompressSequencesLong");
+#if DYNAMIC_BMI2
+    if (dctx->bmi2)
+        return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+#endif
+    /* otherwise : instantiation without target attribute */
+    return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+}
+
+/* ZSTD_getLongOffsetsShare() :
+ * condition : offTable must be valid (header cell followed by (1<<tableLog) decoding cells)
+ * @return : nb of cells requiring more than 22 additional bits (offsets > (1<<23)),
+ *           rescaled as if the table contained (1<<OffFSELog) cells */
+static unsigned
+ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
+{
+    const void* const header = offTable;   /* first cell is read as the table header */
+    U32 const tableLog = ((const ZSTD_seqSymbol_header*)header)->tableLog;
+    const ZSTD_seqSymbol* const cells = offTable + 1;
+    U32 const nbCells = 1 << tableLog;
+    U32 nbLong = 0;
+    U32 n;
+    DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
+
+    assert(nbCells <= (1 << OffFSELog));  /* table not too large */
+    assert(tableLog <= OffFSELog);
+
+    /* count the cells which describe a long offset */
+    for (n=0; n<nbCells; n++)
+        nbLong += (cells[n].nbAdditionalBits > 22);
+
+    return nbLong << (OffFSELog - tableLog);  /* scale to OffFSELog */
+}
+
+
+static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize, const int frame)
+{   /* blockType == blockCompressed */
+    const BYTE* ip = (const BYTE*)src;
+    /* isLongOffset must be true if there are long offsets.
+     * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
+     * We don't expect that to be the case in 64-bit mode.
+     * In block mode, window size is not known, so we have to be conservative. (note: but it could be evaluated from current-lowLimit)
+     */
+    ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)));
+    DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
+
+    if (srcSize >= ZSTD_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);
+
+    /* Decode literals section */
+    {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+        DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
+        if (ZSTD_isError(litCSize)) return litCSize;
+        ip += litCSize;
+        srcSize -= litCSize;
+    }
+
+    /* Build Decoding Tables */
+    {   int nbSeq;
+        size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
+        if (ZSTD_isError(seqHSize)) return seqHSize;
+        ip += seqHSize;
+        srcSize -= seqHSize;
+
+        if ( (!frame || dctx->fParams.windowSize > (1<<24))
+          && (nbSeq>0) ) {  /* could probably use a larger nbSeq limit */
+            U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
+            U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+            if (shareLongOffsets >= minShare)
+                return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+        }
+
+        return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+    }
+}
+
+/* ZSTD_checkContinuity() : when dst does not directly follow the previous output, the previous output becomes the dictionary segment and a new prefix starts at dst. */
+static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
+{
+    if (dst == dctx->previousDstEnd) return;   /* contiguous : nothing to update */
+    {   ptrdiff_t const prefixSize = (const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart);
+        dctx->dictEnd = dctx->previousDstEnd;
+        dctx->virtualStart = (const char*)dst - prefixSize;
+        dctx->prefixStart = dctx->previousDstEnd = dst;
+    }
+}
+
+size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    size_t dSize;
+    ZSTD_checkContinuity(dctx, dst);
+    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
+    dctx->previousDstEnd = (char*)dst + dSize;
+    return dSize;
+}
+
+/** ZSTD_insertBlock() :
+    registers the block starting at `blockStart` into `dctx` history. Useful to track uncompressed blocks. */
+ZSTDLIB_API size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize)
+{
+    const char* const blockEnd = (const char*)blockStart + blockSize;
+    ZSTD_checkContinuity(dctx, blockStart);
+    dctx->previousDstEnd = blockEnd;
+    return blockSize;
+}
+
+/* ZSTD_generateNxBytes() : fills dst with `length` copies of `byte`. @return : length, or dstSize_tooSmall when dstCapacity is insufficient */
+static size_t ZSTD_generateNxBytes(void* dst, size_t dstCapacity, BYTE byte, size_t length)
+{
+    if (dstCapacity < length) return ERROR(dstSize_tooSmall);
+    memset(dst, byte, length);
+    return length;
+}
+
+/** ZSTD_findFrameCompressedSize() :
+ *  compatible with legacy mode
+ *  `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame
+ *  `srcSize` must be at least as large as the frame contained
+ *  @return : the compressed size of the frame starting at `src`, or an error code (see ZSTD_isError()) if the frame is invalid or does not fit within `srcSize` */
+size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
+{
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+    if (ZSTD_isLegacy(src, srcSize))
+        return ZSTD_findFrameCompressedSizeLegacy(src, srcSize);
+#endif
+    if ( (srcSize >= ZSTD_skippableHeaderSize)
+      && (MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START ) {
+        U32 const contentSize = MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize);
+        /* the declared content must fit within the input : this also prevents header+content from wrapping a 32-bit size_t */
+        if (contentSize > srcSize - ZSTD_skippableHeaderSize) return ERROR(srcSize_wrong);
+        return ZSTD_skippableHeaderSize + contentSize;
+    } else {
+        const BYTE* ip = (const BYTE*)src;
+        const BYTE* const ipstart = ip;
+        size_t remainingSize = srcSize;
+        ZSTD_frameHeader zfh;
+
+        /* Extract Frame Header */
+        {   size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize);
+            if (ZSTD_isError(ret)) return ret;
+            if (ret > 0) return ERROR(srcSize_wrong);
+        }
+
+        ip += zfh.headerSize;
+        remainingSize -= zfh.headerSize;
+
+        /* Loop on each block */
+        while (1) {
+            blockProperties_t blockProperties;
+            size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+            if (ZSTD_isError(cBlockSize)) return cBlockSize;
+            if (ZSTD_blockHeaderSize + cBlockSize > remainingSize)
+                return ERROR(srcSize_wrong);
+            ip += ZSTD_blockHeaderSize + cBlockSize;
+            remainingSize -= ZSTD_blockHeaderSize + cBlockSize;
+            if (blockProperties.lastBlock) break;
+        }
+
+        if (zfh.checksumFlag) {   /* Final frame content checksum */
+            if (remainingSize < 4) return ERROR(srcSize_wrong);
+            ip += 4;
+            remainingSize -= 4;
+        }
+
+        return ip - ipstart;
+    }
+}
+
+/*! ZSTD_decompressFrame() : decodes one frame (header, then every block up to the last one, then the optional checksum) into `dst`.
+*   @dctx must be properly initialized. On success, *srcPtr and *srcSizePtr are moved past the frame. @return : nb of bytes written into `dst`, or an error code */
+static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
+                                   void* dst, size_t dstCapacity,
+                             const void** srcPtr, size_t *srcSizePtr)
+{
+    const BYTE* ip = (const BYTE*)(*srcPtr);
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    size_t remainingSize = *srcSizePtr;
+
+    /* check */
+    if (remainingSize < ZSTD_frameHeaderSize_min+ZSTD_blockHeaderSize)
+        return ERROR(srcSize_wrong);
+
+    /* Frame Header */
+    {   size_t const frameHeaderSize = ZSTD_frameHeaderSize(ip, ZSTD_frameHeaderSize_prefix);
+        if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
+        if (remainingSize < frameHeaderSize+ZSTD_blockHeaderSize)
+            return ERROR(srcSize_wrong);
+        CHECK_F( ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize) );
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1) {
+        size_t decodedSize;
+        blockProperties_t blockProperties;
+        size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize, /* frame */ 1);
+            break;
+        case bt_raw :
+            decodedSize = ZSTD_copyRawBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            decodedSize = ZSTD_generateNxBytes(op, oend-op, *ip, blockProperties.origSize);   /* the byte at *ip is repeated origSize times */
+            break;
+        case bt_reserved :
+        default:
+            return ERROR(corruption_detected);
+        }
+
+        if (ZSTD_isError(decodedSize)) return decodedSize;
+        if (dctx->fParams.checksumFlag)
+            XXH64_update(&dctx->xxhState, op, decodedSize);
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+        if (blockProperties.lastBlock) break;
+    }
+
+    if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) {   /* when the header declares a content size, the regenerated size must match it */
+        if ((U64)(op-ostart) != dctx->fParams.frameContentSize) {
+            return ERROR(corruption_detected);
+    }   }
+    if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */
+        U32 const checkCalc = (U32)XXH64_digest(&dctx->xxhState);
+        U32 checkRead;
+        if (remainingSize<4) return ERROR(checksum_wrong);
+        checkRead = MEM_readLE32(ip);
+        if (checkRead != checkCalc) return ERROR(checksum_wrong);
+        ip += 4;
+        remainingSize -= 4;
+    }
+
+    /* Allow caller to get size read */
+    *srcPtr = ip;
+    *srcSizePtr = remainingSize;
+    return op-ostart;
+}
+
+static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict);
+static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict);
+
+static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
+                                        void* dst, size_t dstCapacity,
+                                  const void* src, size_t srcSize,
+                                  const void* dict, size_t dictSize,
+                                  const ZSTD_DDict* ddict)
+{
+    void* const dststart = dst;
+    int moreThan1Frame = 0;
+    assert(dict==NULL || ddict==NULL);  /* either dict or ddict set, not both */
+
+    if (ddict) {
+        dict = ZSTD_DDictDictContent(ddict);
+        dictSize = ZSTD_DDictDictSize(ddict);
+    }
+
+    while (srcSize >= ZSTD_frameHeaderSize_prefix) {
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+        if (ZSTD_isLegacy(src, srcSize)) {
+            size_t decodedSize;
+            size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize);
+            if (ZSTD_isError(frameSize)) return frameSize;
+            /* legacy support is not compatible with static dctx */
+            if (dctx->staticSize) return ERROR(memory_allocation);
+
+            decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize);
+
+            dst = (BYTE*)dst + decodedSize;
+            dstCapacity -= decodedSize;
+
+            src = (const BYTE*)src + frameSize;
+            srcSize -= frameSize;
+
+            continue;
+        }
+#endif
+
+        {   U32 const magicNumber = MEM_readLE32(src);
+            DEBUGLOG(4, "reading magic number %08X (expecting %08X)",
+                        (U32)magicNumber, (U32)ZSTD_MAGICNUMBER);
+            if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+                size_t skippableSize;
+                if (srcSize < ZSTD_skippableHeaderSize)
+                    return ERROR(srcSize_wrong);
+                skippableSize = MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize)
+                              + ZSTD_skippableHeaderSize;
+                if (srcSize < skippableSize) return ERROR(srcSize_wrong);
+
+                src = (const BYTE *)src + skippableSize;
+                srcSize -= skippableSize;
+                continue;
+        }   }
+
+        if (ddict) {
+            /* we were called from ZSTD_decompress_usingDDict */
+            CHECK_F(ZSTD_decompressBegin_usingDDict(dctx, ddict));
+        } else {
+            /* this will initialize correctly with no dict if dict == NULL, so
+             * use this in all cases but ddict */
+            CHECK_F(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize));
+        }
+        ZSTD_checkContinuity(dctx, dst);
+
+        {   const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity,
+                                                    &src, &srcSize);
+            if ( (ZSTD_getErrorCode(res) == ZSTD_error_prefix_unknown)
+              && (moreThan1Frame==1) ) {
+                /* at least one frame successfully completed,
+                 * but following bytes are garbage :
+                 * it's more likely to be a srcSize error,
+                 * specifying more bytes than compressed size of frame(s).
+                 * This error message replaces ERROR(prefix_unknown),
+                 * which would be confusing, as the first header is actually correct.
+                 * Note that one could be unlucky, it might be a corruption error instead,
+                 * happening right at the place where we expect zstd magic bytes.
+                 * But this is _much_ less likely than a srcSize field error. */
+                return ERROR(srcSize_wrong);
+            }
+            if (ZSTD_isError(res)) return res;
+            /* no need to bound check, ZSTD_decompressFrame already has */
+            dst = (BYTE*)dst + res;
+            dstCapacity -= res;
+        }
+        moreThan1Frame = 1;
+    }  /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */
+
+    if (srcSize) return ERROR(srcSize_wrong); /* input not entirely consumed */
+
+    return (BYTE*)dst - (BYTE*)dststart;
+}
+
+/* One-shot decompression of `src` using an optional raw dictionary buffer (no digested ZSTD_DDict). */
+size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                 const void* dict, size_t dictSize)
+{
+    return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, /* ddict */ NULL);
+}
+
+
+size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{   const void* const noDict = NULL;   /* no dictionary : delegate to the dictionary variant with an empty one */
+    return ZSTD_decompress_usingDict(dctx, dst, dstCapacity, src, srcSize, noDict, 0);
+}
+
+
+size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{   /* simple one-shot entry point : the decompression context is set up and discarded internally */
+#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1)
+    ZSTD_DCtx* const heapCtx = ZSTD_createDCtx();   /* heap mode : context is allocated dynamically */
+    if (heapCtx==NULL) return ERROR(memory_allocation);
+    {   size_t const result = ZSTD_decompressDCtx(heapCtx, dst, dstCapacity, src, srcSize);
+        ZSTD_freeDCtx(heapCtx);   /* released whether decompression succeeded or not */
+        return result;
+    }
+#else   /* stack mode */
+    ZSTD_DCtx stackCtx;
+    ZSTD_initDCtx_internal(&stackCtx);
+    return ZSTD_decompressDCtx(&stackCtx, dst, dstCapacity, src, srcSize);
+#endif
+}
+
+
+/*-**************************************
+*   Advanced Streaming Decompression API
+*   Bufferless and synchronous
+****************************************/
+size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { size_t const nextSrcSize = dctx->expected; return nextSrcSize; }  /* exact srcSize the next ZSTD_decompressContinue() call must be given */
+
+ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) {
+    switch(dctx->stage)   /* translate the internal decoding stage into the public "next input" category */
+    {
+    case ZSTDds_decodeSkippableHeader:
+    case ZSTDds_skipFrame:
+        return ZSTDnit_skippableFrame;
+    case ZSTDds_checkChecksum:
+        return ZSTDnit_checksum;
+    case ZSTDds_decompressLastBlock:
+        return ZSTDnit_lastBlock;
+    case ZSTDds_decompressBlock:
+        return ZSTDnit_block;
+    case ZSTDds_decodeBlockHeader:
+        return ZSTDnit_blockHeader;
+    default:   /* should not happen */
+        assert(0);   /* if the assert does not abort, fall through to the frame header answer */
+    case ZSTDds_getFrameHeaderSize:
+    case ZSTDds_decodeFrameHeader:
+        return ZSTDnit_frameHeader;
+    }
+}
+
+static int ZSTD_isSkipFrame(ZSTD_DCtx* dctx) { return (ZSTDds_skipFrame == dctx->stage); }   /* 1 when the context is in the ZSTDds_skipFrame stage, 0 otherwise */
+
+/** ZSTD_decompressContinue() :
+ *  srcSize : must be the exact nb of bytes expected (see ZSTD_nextSrcSizeToDecompress())
+ *  @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity`), 0 for stages which only consume input,
+ *            or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", (U32)srcSize);
+    /* Sanity check */
+    if (srcSize != dctx->expected) return ERROR(srcSize_wrong);  /* not allowed */
+    if (dstCapacity) ZSTD_checkContinuity(dctx, dst);
+
+    switch (dctx->stage)   /* on success, each case selects the next stage and the nb of bytes it expects */
+    {
+    case ZSTDds_getFrameHeaderSize :   /* first bytes of a frame : tell skippable from regular, learn the header size */
+        assert(src != NULL);
+        if (dctx->format == ZSTD_f_zstd1) {  /* allows header */
+            assert(srcSize >= ZSTD_frameIdSize);  /* to read skippable magic number */
+            if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {        /* skippable frame */
+                memcpy(dctx->headerBuffer, src, srcSize);
+                dctx->expected = ZSTD_skippableHeaderSize - srcSize;  /* remaining to load to get full skippable frame header */
+                dctx->stage = ZSTDds_decodeSkippableHeader;
+                return 0;
+        }   }
+        dctx->headerSize = ZSTD_frameHeaderSize_internal(src, srcSize, dctx->format);
+        if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize;
+        memcpy(dctx->headerBuffer, src, srcSize);   /* keep the bytes already received ; the rest is appended at next stage */
+        dctx->expected = dctx->headerSize - srcSize;
+        dctx->stage = ZSTDds_decodeFrameHeader;
+        return 0;
+
+    case ZSTDds_decodeFrameHeader:   /* remainder of the frame header : complete headerBuffer, then decode it */
+        assert(src != NULL);
+        memcpy(dctx->headerBuffer + (dctx->headerSize - srcSize), src, srcSize);
+        CHECK_F(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize));
+        dctx->expected = ZSTD_blockHeaderSize;
+        dctx->stage = ZSTDds_decodeBlockHeader;
+        return 0;
+
+    case ZSTDds_decodeBlockHeader:   /* read one block header and schedule the block body */
+        {   blockProperties_t bp;
+            size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+            if (ZSTD_isError(cBlockSize)) return cBlockSize;
+            dctx->expected = cBlockSize;
+            dctx->bType = bp.blockType;
+            dctx->rleSize = bp.origSize;
+            if (cBlockSize) {
+                dctx->stage = bp.lastBlock ? ZSTDds_decompressLastBlock : ZSTDds_decompressBlock;
+                return 0;
+            }
+            /* empty block */
+            if (bp.lastBlock) {
+                if (dctx->fParams.checksumFlag) {
+                    dctx->expected = 4;
+                    dctx->stage = ZSTDds_checkChecksum;
+                } else {
+                    dctx->expected = 0; /* end of frame */
+                    dctx->stage = ZSTDds_getFrameHeaderSize;
+                }
+            } else {
+                dctx->expected = ZSTD_blockHeaderSize;  /* jump to next header */
+                dctx->stage = ZSTDds_decodeBlockHeader;
+            }
+            return 0;
+        }
+
+    case ZSTDds_decompressLastBlock:   /* block body : regenerate content according to the block type */
+    case ZSTDds_decompressBlock:
+        DEBUGLOG(5, "ZSTD_decompressContinue: case ZSTDds_decompressBlock");
+        {   size_t rSize;
+            switch(dctx->bType)
+            {
+            case bt_compressed:
+                DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed");
+                rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1);
+                break;
+            case bt_raw :
+                rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize);
+                break;
+            case bt_rle :
+                rSize = ZSTD_setRleBlock(dst, dstCapacity, src, srcSize, dctx->rleSize);
+                break;
+            case bt_reserved :   /* should never happen */
+            default:
+                return ERROR(corruption_detected);
+            }
+            if (ZSTD_isError(rSize)) return rSize;
+            DEBUGLOG(5, "ZSTD_decompressContinue: decoded size from block : %u", (U32)rSize);
+            dctx->decodedSize += rSize;
+            if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, dst, rSize);   /* running checksum over regenerated data */
+
+            if (dctx->stage == ZSTDds_decompressLastBlock) {   /* end of frame */
+                DEBUGLOG(4, "ZSTD_decompressContinue: decoded size from frame : %u", (U32)dctx->decodedSize);
+                if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+                    if (dctx->decodedSize != dctx->fParams.frameContentSize) {
+                        return ERROR(corruption_detected);
+                }   }
+                if (dctx->fParams.checksumFlag) {  /* another round for frame checksum */
+                    dctx->expected = 4;
+                    dctx->stage = ZSTDds_checkChecksum;
+                } else {
+                    dctx->expected = 0;   /* ends here */
+                    dctx->stage = ZSTDds_getFrameHeaderSize;
+                }
+            } else {
+                dctx->stage = ZSTDds_decodeBlockHeader;
+                dctx->expected = ZSTD_blockHeaderSize;
+                dctx->previousDstEnd = (char*)dst + rSize;   /* remember where the output of this block ends */
+            }
+            return rSize;
+        }
+
+    case ZSTDds_checkChecksum:   /* compare the 4 bytes read with the XXH64 digest truncated to 32 bits */
+        assert(srcSize == 4);  /* guaranteed by dctx->expected */
+        {   U32 const h32 = (U32)XXH64_digest(&dctx->xxhState);
+            U32 const check32 = MEM_readLE32(src);
+            DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", h32, check32);
+            if (check32 != h32) return ERROR(checksum_wrong);
+            dctx->expected = 0;
+            dctx->stage = ZSTDds_getFrameHeaderSize;
+            return 0;
+        }
+
+    case ZSTDds_decodeSkippableHeader:   /* rest of the skippable header : its size field gives the nb of bytes to skip */
+        assert(src != NULL);
+        assert(srcSize <= ZSTD_skippableHeaderSize);
+        memcpy(dctx->headerBuffer + (ZSTD_skippableHeaderSize - srcSize), src, srcSize);   /* complete skippable header */
+        dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_frameIdSize);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
+        dctx->stage = ZSTDds_skipFrame;
+        return 0;
+
+    case ZSTDds_skipFrame:   /* skippable content received and ignored (src is not read) : back to start-of-frame */
+        dctx->expected = 0;
+        dctx->stage = ZSTDds_getFrameHeaderSize;
+        return 0;
+
+    default:
+        return ERROR(GENERIC);   /* impossible */
+    }
+}
+
+
+static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{   ptrdiff_t const prevSegmentSize = (const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart);   /* measured before the fields are overwritten */
+    dctx->dictEnd = dctx->previousDstEnd;   /* end of the previous segment becomes the end of the "dictionary" area */
+    dctx->virtualStart = (const char*)dict - prevSegmentSize;
+    dctx->prefixStart = dict;   /* `dict` is the new prefix */
+    dctx->previousDstEnd = (const char*)dict + dictSize;
+    return 0;   /* always 0 */
+}
+
+/* ZSTD_loadEntropy() :
+ * dict : must point at beginning of a valid zstd dictionary (8-byte header, then Huffman, offset, match-length and literal-length tables, then 3 repcodes)
+ * @return : nb of bytes consumed from `dict` (header + entropy tables + repcodes), or an error code (dictionary_corrupted) */
+static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const dict, size_t const dictSize)
+{
+    const BYTE* dictPtr = (const BYTE*)dict;
+    const BYTE* const dictEnd = dictPtr + dictSize;
+
+    if (dictSize <= 8) return ERROR(dictionary_corrupted);   /* nothing beyond the header */
+    dictPtr += 8;   /* skip header = magic + dictID */
+
+
+    {   size_t const hSize = HUF_readDTableX2_wksp(   /* Huffman table, read into entropy->hufTable */
+            entropy->hufTable, dictPtr, dictEnd - dictPtr,
+            entropy->workspace, sizeof(entropy->workspace));
+        if (HUF_isError(hSize)) return ERROR(dictionary_corrupted);
+        dictPtr += hSize;
+    }
+
+    {   short offcodeNCount[MaxOff+1];   /* offset codes : normalized counts, then FSE decoding table */
+        U32 offcodeMaxValue = MaxOff, offcodeLog;
+        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
+        if (offcodeMaxValue > MaxOff) return ERROR(dictionary_corrupted);
+        if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
+        ZSTD_buildFSETable(entropy->OFTable,
+                            offcodeNCount, offcodeMaxValue,
+                            OF_base, OF_bits,
+                            offcodeLog);
+        dictPtr += offcodeHeaderSize;
+    }
+
+    {   short matchlengthNCount[MaxML+1];   /* match lengths : same procedure */
+        unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+        size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (matchlengthMaxValue > MaxML) return ERROR(dictionary_corrupted);
+        if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
+        ZSTD_buildFSETable(entropy->MLTable,
+                            matchlengthNCount, matchlengthMaxValue,
+                            ML_base, ML_bits,
+                            matchlengthLog);
+        dictPtr += matchlengthHeaderSize;
+    }
+
+    {   short litlengthNCount[MaxLL+1];   /* literal lengths : same procedure */
+        unsigned litlengthMaxValue = MaxLL, litlengthLog;
+        size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (litlengthMaxValue > MaxLL) return ERROR(dictionary_corrupted);
+        if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
+        ZSTD_buildFSETable(entropy->LLTable,
+                            litlengthNCount, litlengthMaxValue,
+                            LL_base, LL_bits,
+                            litlengthLog);
+        dictPtr += litlengthHeaderSize;
+    }
+
+    if (dictPtr+12 > dictEnd) return ERROR(dictionary_corrupted);   /* 3 repcodes of 4 bytes each must still be present */
+    {   int i;
+        size_t const dictContentSize = (size_t)(dictEnd - (dictPtr+12));   /* bytes remaining after the repcodes */
+        for (i=0; i<3; i++) {
+            U32 const rep = MEM_readLE32(dictPtr); dictPtr += 4;
+            if (rep==0 || rep >= dictContentSize) return ERROR(dictionary_corrupted);   /* repcode must be non-zero and smaller than the content size */
+            entropy->rep[i] = rep;
+    }   }
+
+    return dictPtr - (const BYTE*)dict;
+}
+
+static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    size_t entropySize;
+
+    /* anything shorter than the 8-byte header, or lacking the dictionary magic number,
+     * is used as-is : pure content mode */
+    if ( (dictSize < 8) || (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) )
+        return ZSTD_refDictContent(dctx, dict, dictSize);
+
+    /* structured dictionary : the dictID is stored right after the magic number */
+    dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_frameIdSize);
+
+    /* entropy tables come next ; any failure is reported as a corrupted dictionary */
+    entropySize = ZSTD_loadEntropy(&dctx->entropy, dict, dictSize);
+    if (ZSTD_isError(entropySize)) return ERROR(dictionary_corrupted);
+    dctx->litEntropy = dctx->fseEntropy = 1;
+
+    /* whatever follows the entropy section is referenced as dictionary content */
+    return ZSTD_refDictContent(dctx, (const char*)dict + entropySize, dictSize - entropySize);
+}
+
+/* ZSTD_decompressBegin() : puts `dctx` back into its "start of frame, no dictionary" state. Note : this function cannot fail (always returns 0) */
+size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
+{
+    assert(dctx != NULL);
+    dctx->stage = ZSTDds_getFrameHeaderSize;   /* state machine : wait for the first bytes of a frame */
+    dctx->expected = ZSTD_startingInputLength(dctx->format);  /* dctx->format must be properly set */
+    dctx->decodedSize = 0;
+    dctx->dictID = 0;   /* no dictionary and no history referenced yet */
+    dctx->previousDstEnd = NULL;
+    dctx->prefixStart = NULL;
+    dctx->virtualStart = NULL;
+    dctx->dictEnd = NULL;
+    dctx->fseEntropy = dctx->litEntropy = 0;   /* no entropy tables inherited */
+    dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+    dctx->HUFptr = dctx->entropy.hufTable;   /* table pointers target the context's own storage */
+    dctx->OFTptr = dctx->entropy.OFTable;
+    dctx->MLTptr = dctx->entropy.MLTable;
+    dctx->LLTptr = dctx->entropy.LLTable;
+    ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue));
+    memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue));  /* initial repcodes */
+    return 0;
+}
+
+size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{   CHECK_F( ZSTD_decompressBegin(dctx) );   /* start from a clean, dictionary-less state */
+    if ((dict != NULL) && (dictSize != 0)) {   /* a dictionary is only inserted when there is actual content */
+        CHECK_E(ZSTD_decompress_insertDictionary(dctx, dict, dictSize), dictionary_corrupted);
+    }
+    return 0;
+}
+
+
+/* ======   ZSTD_DDict   ====== */
+
+struct ZSTD_DDict_s {
+    void* dictBuffer;   /* private copy of the dictionary, owned by the DDict ; NULL when the content is only referenced */
+    const void* dictContent;   /* start of the dictionary bytes : either dictBuffer or the caller's buffer */
+    size_t dictSize;   /* size of dictContent, in bytes */
+    ZSTD_entropyDTables_t entropy;   /* decoding tables and repcodes, filled by ZSTD_loadEntropy() for structured dictionaries */
+    U32 dictID;   /* ID read from the dictionary header ; 0 for content-only dictionaries */
+    U32 entropyPresent;   /* 1 once entropy tables were loaded from the dictionary, 0 otherwise */
+    ZSTD_customMem cMem;   /* allocator used by ZSTD_freeDDict() to release dictBuffer and the DDict itself */
+};  /* typedef'd to ZSTD_DDict within "zstd.h" */
+
+static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict)
+{   const void* const content = ddict->dictContent;   /* read-only accessor : start of the dictionary bytes */
+    return content;
+}
+
+static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict)
+{   size_t const contentSize = ddict->dictSize;   /* read-only accessor : size of the dictionary bytes */
+    return contentSize;
+}
+
+size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dstDCtx, const ZSTD_DDict* ddict)
+{
+    CHECK_F( ZSTD_decompressBegin(dstDCtx) );
+    if (ddict == NULL) return 0;   /* support begin on NULL */
+    /* the dictionary content plays the role of already-decoded history */
+    {   const void* const dictStart = ddict->dictContent;
+        dstDCtx->dictID = ddict->dictID;
+        dstDCtx->prefixStart = dictStart;
+        dstDCtx->virtualStart = dictStart;
+        dstDCtx->dictEnd = (const BYTE*)dictStart + ddict->dictSize;
+        dstDCtx->previousDstEnd = dstDCtx->dictEnd;
+    }
+    if (!ddict->entropyPresent) {   /* content-only dictionary : no entropy tables to inherit */
+        dstDCtx->litEntropy = 0;
+        dstDCtx->fseEntropy = 0;
+        return 0;
+    }
+    /* structured dictionary : use its entropy tables (by pointer) and its starting repcodes (by value) */
+    dstDCtx->litEntropy = dstDCtx->fseEntropy = 1;
+    dstDCtx->LLTptr = ddict->entropy.LLTable;
+    dstDCtx->MLTptr = ddict->entropy.MLTable;
+    dstDCtx->OFTptr = ddict->entropy.OFTable;
+    dstDCtx->HUFptr = ddict->entropy.hufTable;
+    {   int r; for (r=0; r<3; r++) dstDCtx->entropy.rep[r] = ddict->entropy.rep[r]; }
+    return 0;
+}
+
+/* ZSTD_loadEntropy_inDDict() :
+ * decides whether the content of `ddict` is raw content or a structured dictionary,
+ * and, for a structured dictionary, reads its dictID and entropy tables.
+ * ddict->dictContent and ddict->dictSize must already be set.
+ * @return : 0, or an error code (dictionary_corrupted) */
+static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict, ZSTD_dictContentType_e dictContentType)
+{
+    ddict->entropyPresent = 0;
+    ddict->dictID = 0;
+    if (dictContentType == ZSTD_dct_rawContent) return 0;
+
+    /* a structured dictionary needs at least 8 bytes and starts with the dictionary magic number */
+    if ( (ddict->dictSize < 8)
+      || (MEM_readLE32(ddict->dictContent) != ZSTD_MAGIC_DICTIONARY) ) {
+        if (dictContentType != ZSTD_dct_fullDict) return 0;   /* pure content mode */
+        return ERROR(dictionary_corrupted);   /* only accept specified dictionaries */
+    }
+
+    ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_frameIdSize);
+
+    /* load entropy tables */
+    CHECK_E( ZSTD_loadEntropy(&ddict->entropy, ddict->dictContent, ddict->dictSize), dictionary_corrupted );
+    ddict->entropyPresent = 1;
+    return 0;
+}
+
+
+static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
+                                      const void* dict, size_t dictSize,
+                                      ZSTD_dictLoadMethod_e dictLoadMethod,
+                                      ZSTD_dictContentType_e dictContentType)
+{
+    /* decide where the dictionary bytes live : a private copy, or the caller's buffer (reference) */
+    if ((dictLoadMethod != ZSTD_dlm_byRef) && (dict != NULL) && (dictSize != 0)) {
+        void* const ownedCopy = ZSTD_malloc(dictSize, ddict->cMem);
+        ddict->dictBuffer = ownedCopy;
+        ddict->dictContent = ownedCopy;
+        if (ownedCopy == NULL) return ERROR(memory_allocation);
+        memcpy(ownedCopy, dict, dictSize);
+    } else {
+        ddict->dictBuffer = NULL;   /* nothing owned */
+        ddict->dictContent = dict;
+    }
+    ddict->dictSize = dictSize;
+    ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+
+    /* parse dictionary content */
+    CHECK_F( ZSTD_loadEntropy_inDDict(ddict, dictContentType) );
+    return 0;
+}
+
+ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+                                      ZSTD_dictLoadMethod_e dictLoadMethod,
+                                      ZSTD_dictContentType_e dictContentType,
+                                      ZSTD_customMem customMem)
+{
+    ZSTD_DDict* newDDict;
+
+    /* a custom allocator must provide both its alloc and free functions, or neither */
+    if ((customMem.customAlloc == NULL) != (customMem.customFree == NULL)) return NULL;
+
+    newDDict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem);
+    if (newDDict == NULL) return NULL;
+    newDDict->cMem = customMem;   /* recorded first : used by the init routine and by ZSTD_freeDDict() */
+
+    if (!ZSTD_isError( ZSTD_initDDict_internal(newDDict, dict, dictSize, dictLoadMethod, dictContentType) ))
+        return newDDict;
+    ZSTD_freeDDict(newDDict);   /* initialization failed : release what was built */
+    return NULL;
+}
+
+/*! ZSTD_createDDict() :
+*   Builds a digested dictionary, so that decompression can start without startup delay.
+*   The bytes of `dict` are copied inside the DDict (ZSTD_dlm_byCopy),
+*   hence `dict` can be released as soon as the `ZSTD_DDict` is created */
+ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize)
+{
+    ZSTD_customMem const noCustomMem = { NULL, NULL, NULL };
+    return ZSTD_createDDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, noCustomMem);
+}
+
+/*! ZSTD_createDDict_byReference() :
+ *  Builds a digested dictionary, so that decompression can start without startup delay.
+ *  The dictionary bytes are not copied (ZSTD_dlm_byRef) : they are read from `dictBuffer` during decompression.
+ *  Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer) */
+ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize)
+{
+    ZSTD_customMem const noCustomMem = { NULL, NULL, NULL };
+    return ZSTD_createDDict_advanced(dictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, noCustomMem);
+}
+
+
+const ZSTD_DDict* ZSTD_initStaticDDict(
+                                void* workspace, size_t workspaceSize,
+                                const void* dict, size_t dictSize,
+                                ZSTD_dictLoadMethod_e dictLoadMethod,
+                                ZSTD_dictContentType_e dictContentType)
+{
+    ZSTD_DDict* const staticDDict = (ZSTD_DDict*)workspace;
+    size_t const requiredSize = sizeof(ZSTD_DDict) + ((dictLoadMethod == ZSTD_dlm_byRef) ? 0 : dictSize);
+    assert(workspace != NULL);
+    assert(dict != NULL);
+    if ((size_t)workspace & 7) return NULL;  /* 8-aligned */
+    if (workspaceSize < requiredSize) return NULL;
+    if (dictLoadMethod == ZSTD_dlm_byCopy) {
+        void* const localCopy = staticDDict + 1;   /* the copy sits right behind the DDict structure */
+        memcpy(localCopy, dict, dictSize);
+        dict = localCopy;
+    }
+    /* the content now to be used is never allocated here : initialization is always done by reference */
+    if (ZSTD_isError( ZSTD_initDDict_internal(staticDDict, dict, dictSize, ZSTD_dlm_byRef, dictContentType) )) return NULL;
+    return staticDDict;
+}
+
+
+size_t ZSTD_freeDDict(ZSTD_DDict* ddict)
+{   /* releases the DDict together with its private dictionary buffer ; always returns 0 */
+    if (ddict != NULL) {   /* support free on NULL */
+        ZSTD_customMem const allocator = ddict->cMem;   /* saved first : `ddict` itself is about to be released */
+        ZSTD_free(ddict->dictBuffer, allocator);
+        ZSTD_free(ddict, allocator);
+    }
+    return 0;
+}
+
+/*! ZSTD_estimateDDictSize() :
+ *  Estimates the amount of memory needed to create a decompression dictionary from `dictSize` bytes.
+ *  Note : with ZSTD_dlm_byRef the content is not counted, so the estimation is smaller */
+size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod)
+{   size_t const copiedBytes = (dictLoadMethod == ZSTD_dlm_byRef) ? 0 : dictSize;
+    return sizeof(ZSTD_DDict) + copiedBytes;
+}
+
+size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
+{   /* structure size, plus the dictionary bytes only when the DDict holds its own buffer */
+    if (ddict==NULL) return 0;   /* support sizeof on NULL */
+    return (ddict->dictBuffer != NULL) ? sizeof(*ddict) + ddict->dictSize : sizeof(*ddict);
+}
+
+/*! ZSTD_getDictID_fromDict() :
+ *  Reads the dictID stored in the header of a dictionary.
+ *  @return 0 when `dict` is shorter than 8 bytes or does not start with the dictionary magic number :
+ *  such a dictionary is not conformant with Zstandard specification, though it can still be loaded as content-only. */
+unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
+{
+    int const hasHeader = (dictSize >= 8) && (MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY);
+    if (!hasHeader) return 0;
+    return MEM_readLE32((const char*)dict + ZSTD_frameIdSize);
+}
+
+/*! ZSTD_getDictID_fromDDict() :
+ *  Provides the dictID of the dictionary loaded into `ddict`,
+ *  re-read from the header of its content.
+ *  If @return == 0, `ddict` is NULL, or the dictionary is not conformant to Zstandard specification, or empty.
+ *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
+{
+    return (ddict != NULL) ? ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize) : 0;
+}
+
+/*! ZSTD_getDictID_fromFrame() :
+ *  Provides the dictID required to decompress the frame stored within `src`.
+ *  If @return == 0, the dictID could not be decoded.
+ *  This can happen for one of the following reasons :
+ *  - The frame does not require a dictionary (most common case).
+ *  - The frame was built with dictID intentionally removed.
+ *    The needed dictionary is then hidden information.
+ *    Note : this use case also happens when using a non-conformant dictionary.
+ *  - `srcSize` is too small, and as a result, the frame header could not be decoded.
+ *    Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
+ *  - This is not a Zstandard frame.
+ *  To identify the exact failure cause, it's possible to use
+ *  ZSTD_getFrameHeader(), which will provide a more precise error code. */
+unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
+{
+    ZSTD_frameHeader header = { 0, 0, 0, ZSTD_frame, 0, 0, 0 };
+    size_t const status = ZSTD_getFrameHeader(&header, src, srcSize);
+    /* any decoding error is folded into the "no dictID" answer */
+    return ZSTD_isError(status) ? 0 : header.dictID;
+}
+
+
+/*! ZSTD_decompress_usingDDict() :
+*   One-shot decompression using a pre-digested dictionary.
+*   Uses the dictionary without significant overhead. */
+size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+                                  void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                            const ZSTD_DDict* ddict)
+{
+    /* no raw dictionary is passed : the multi-frame decoder extracts content and size from `ddict` itself */
+    const void* const rawDict = NULL;
+    size_t const rawDictSize = 0;
+    return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, rawDict, rawDictSize, ddict);
+}
+
+
+/*=====================================
+*   Streaming decompression
+*====================================*/
+
+ZSTD_DStream* ZSTD_createDStream(void)
+{   ZSTD_customMem const allocator = ZSTD_defaultCMem;   /* stream created with ZSTD_defaultCMem */
+    DEBUGLOG(3, "ZSTD_createDStream");
+    return ZSTD_createDStream_advanced(allocator);
+}
+
+ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize)
+{   ZSTD_DStream* const staticStream = ZSTD_initStaticDCtx(workspace, workspaceSize);   /* delegates entirely to the DCtx static initializer */
+    return staticStream;
+}
+
+ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem)
+{   ZSTD_DStream* const newStream = ZSTD_createDCtx_advanced(customMem);   /* delegates entirely to the DCtx constructor */
+    return newStream;
+}
+
+size_t ZSTD_freeDStream(ZSTD_DStream* zds)
+{   size_t const freeResult = ZSTD_freeDCtx(zds);   /* delegates entirely to the DCtx destructor */
+    return freeResult;
+}
+
+
+/* *** Initialization *** */
+
+size_t ZSTD_DStreamInSize(void)  { return ZSTD_blockHeaderSize + ZSTD_BLOCKSIZE_MAX; }   /* one block header plus one block of maximum size */
+size_t ZSTD_DStreamOutSize(void) { size_t const outSize = ZSTD_BLOCKSIZE_MAX; return outSize; }   /* one block of maximum size */
+
+size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+{
+    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
+    ZSTD_freeDDict(dctx->ddictLocal);
+    if (dict && dictSize >= 8) {
+        dctx->ddictLocal = ZSTD_createDDict_advanced(dict, dictSize, dictLoadMethod, dictContentType, dctx->customMem);
+        if (dctx->ddictLocal == NULL) return ERROR(memory_allocation);
+    } else {
+        dctx->ddictLocal = NULL;
+    }
+    dctx->ddict = dctx->ddictLocal;
+    return 0;
+}
+
+size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{   ZSTD_dictLoadMethod_e const loadMethod = ZSTD_dlm_byRef;   /* `dict` is referenced, not copied */
+    return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, loadMethod, ZSTD_dct_auto);
+}
+
+size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{   ZSTD_dictLoadMethod_e const loadMethod = ZSTD_dlm_byCopy;   /* `dict` is copied into the context's own DDict */
+    return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, loadMethod, ZSTD_dct_auto);
+}
+
+size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType)
+{   ZSTD_dictLoadMethod_e const loadMethod = ZSTD_dlm_byRef;   /* a prefix is loaded like a dictionary, by reference */
+    return ZSTD_DCtx_loadDictionary_advanced(dctx, prefix, prefixSize, loadMethod, dictContentType);
+}
+
+size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize)
+{   ZSTD_dictContentType_e const contentType = ZSTD_dct_rawContent;   /* the prefix is passed on as raw content */
+    return ZSTD_DCtx_refPrefix_advanced(dctx, prefix, prefixSize, contentType);
+}
+
+
+/* ZSTD_initDStream_usingDict() :
+ * @return : expected size, aka ZSTD_frameHeaderSize_prefix,
+ *           or the error code reported by ZSTD_DCtx_loadDictionary() */
+size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize)
+{
+    DEBUGLOG(4, "ZSTD_initDStream_usingDict");
+    zds->noForwardProgress = 0;
+    zds->streamStage = zdss_init;   /* set before loading : the dictionary loader refuses any other stage */
+    CHECK_F( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) );
+    return ZSTD_frameHeaderSize_prefix;
+}
+
+/* ZSTD_initDStream() : starts a streaming session without dictionary. note : this variant can't fail */
+size_t ZSTD_initDStream(ZSTD_DStream* zds)
+{
+    DEBUGLOG(4, "ZSTD_initDStream");
+    return ZSTD_initDStream_usingDict(zds, /* dict */ NULL, /* dictSize */ 0);
+}
+
+size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{   int const inInitStage = (dctx->streamStage == zdss_init);   /* the dictionary can only be changed in init stage */
+    if (!inInitStage) return ERROR(stage_wrong);
+    dctx->ddict = ddict;   /* referenced only, not copied */
+    return 0;
+}
+
+/* ZSTD_initDStream_usingDDict() :
+ * starts a streaming session with `ddict`, which is just referenced
+ * and therefore must outlive the decompression session */
+size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
+{
+    size_t const startHint = ZSTD_initDStream(dctx);   /* dictionary-less init first */
+    dctx->ddict = ddict;
+    return startHint;
+}
+
+/* ZSTD_resetDStream() :
+ * rewinds the streaming state to "load header" ; the `ddict` field is not touched here.
+ * @return : expected size, aka ZSTD_frameHeaderSize_prefix. This function cannot fail */
+size_t ZSTD_resetDStream(ZSTD_DStream* dctx)
+{
+    DEBUGLOG(4, "ZSTD_resetDStream");
+    dctx->hostageByte = 0;
+    dctx->legacyVersion = 0;
+    dctx->outEnd = dctx->outStart = dctx->inPos = dctx->lhSize = 0;   /* positions and loaded header size restart from 0 */
+    dctx->streamStage = zdss_loadHeader;
+    return ZSTD_frameHeaderSize_prefix;
+}
+
+size_t ZSTD_setDStreamParameter(ZSTD_DStream* dctx,
+                                ZSTD_DStreamParameter_e paramType, unsigned paramValue)
+{
+    /* parameters can only be changed while the stream is in its init stage */
+    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
+    /* DStream_p_maxWindowSize is the only parameter handled here */
+    if (paramType != DStream_p_maxWindowSize) return ERROR(parameter_unsupported);
+    DEBUGLOG(4, "setting maxWindowSize = %u KB", paramValue >> 10);
+    if (paramValue != 0)
+        dctx->maxWindowSize = paramValue;
+    else
+        dctx->maxWindowSize = (U32)(-1);   /* 0 selects the largest U32 value as limit */
+    return 0;
+}
+
+size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize)
+{   int const inInitStage = (dctx->streamStage == zdss_init);   /* the limit can only be changed in init stage */
+    if (!inInitStage) return ERROR(stage_wrong);
+    dctx->maxWindowSize = maxWindowSize;
+    return 0;
+}
+
+size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format)
+{
+    DEBUGLOG(4, "ZSTD_DCtx_setFormat : %u", (unsigned)format);
+    if (dctx->streamStage == zdss_init) { dctx->format = format; return 0; }
+    /* the format can only be selected while the stream is in its init stage */
+    return ERROR(stage_wrong);
+}
+
+
+size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx)
+{   size_t const ctxSize = ZSTD_sizeof_DCtx(dctx);   /* delegates entirely to the DCtx size query */
+    return ctxSize;
+}
+
+size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)
+{
+    size_t const maxBlock = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+    unsigned long long const ringSize = windowSize + maxBlock + (WILDCOPY_OVERLENGTH * 2);
+    unsigned long long const wanted = MIN(frameContentSize, ringSize);
+    size_t const wantedAsSizeT = (size_t) wanted;
+    /* when the requirement does not survive the conversion to size_t, it is reported as a too-large window */
+    return ((unsigned long long)wantedAsSizeT == wanted) ? wantedAsSizeT : ERROR(frameParameter_windowTooLarge);
+}
+
+size_t ZSTD_estimateDStreamSize(size_t windowSize)
+{   /* Estimate = decompression context + input buffer + output buffer, for the given window size */
+    size_t const inputBudget = MIN(windowSize, ZSTD_BLOCKSIZE_MAX);   /* no block can be larger */
+    size_t const outputBudget = ZSTD_decodingBufferSize_min(windowSize, ZSTD_CONTENTSIZE_UNKNOWN);
+    size_t const contextBudget = ZSTD_estimateDCtxSize();
+    return contextBudget + inputBudget + outputBudget;
+}
+
+size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize)
+{   /* Parses the frame header found at `src` and estimates the DStream size for its window */
+    U32 const largestWindow = 1U << ZSTD_WINDOWLOG_MAX;   /* note : should be user-selectable */
+    ZSTD_frameHeader header;
+    size_t const hResult = ZSTD_getFrameHeader(&header, src, srcSize);
+    if (ZSTD_isError(hResult)) return hResult;         /* parsing failed : forward the error */
+    if (hResult != 0) return ERROR(srcSize_wrong);     /* non-zero, non-error result is reported as a wrong srcSize */
+    if (zfh_windowFits(header.windowSize, largestWindow))
+        return ZSTD_estimateDStreamSize((size_t)header.windowSize);
+    return ERROR(frameParameter_windowTooLarge);
+}
+
+
+/* *****   Decompression   ***** */
+
+MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{   /* Copies min(dstCapacity, srcSize) bytes from `src` to `dst`. @return : nb of bytes copied */
+    size_t const length = MIN(dstCapacity, srcSize);
+    if (length > 0) memcpy(dst, src, length);   /* skip empty copies : memcpy() on a NULL pointer is undefined even for 0 bytes */
+    return length;
+}
+
+
+size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{   /* Streaming decoder : reads input->src from input->pos, writes output->dst from output->pos, updates both pos on the normal exit path */
+    const char* const istart = (const char*)(input->src) + input->pos;
+    const char* const iend = (const char*)(input->src) + input->size;
+    const char* ip = istart;
+    char* const ostart = (char*)(output->dst) + output->pos;
+    char* const oend = (char*)(output->dst) + output->size;
+    char* op = ostart;
+    U32 someMoreWork = 1;   /* set to 0 to leave the stage loop below */
+    /* @return : 0 when a frame is fully decoded and flushed, 1 when it is decoded but not yet fully flushed/released, an error code, else a next-input-size hint */
+    DEBUGLOG(5, "ZSTD_decompressStream");
+    if (input->pos > input->size) {  /* forbidden */
+        DEBUGLOG(5, "in: pos: %u   vs size: %u",
+                    (U32)input->pos, (U32)input->size);
+        return ERROR(srcSize_wrong);
+    }
+    if (output->pos > output->size) {  /* forbidden */
+        DEBUGLOG(5, "out: pos: %u   vs size: %u",
+                    (U32)output->pos, (U32)output->size);
+        return ERROR(dstSize_tooSmall);
+    }
+    DEBUGLOG(5, "input size : %u", (U32)(input->size - input->pos));
+
+    while (someMoreWork) {   /* state machine driven by zds->streamStage */
+        switch(zds->streamStage)
+        {
+        case zdss_init :
+            DEBUGLOG(5, "stage zdss_init => transparent reset ");
+            ZSTD_resetDStream(zds);   /* transparent reset on starting decoding a new frame */
+            /* fall-through */
+
+        case zdss_loadHeader :   /* accumulate header bytes into zds->headerBuffer until the header parses */
+            DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip));
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+            if (zds->legacyVersion) {   /* a legacy frame is already in progress : keep delegating */
+                /* legacy support is incompatible with static dctx */
+                if (zds->staticSize) return ERROR(memory_allocation);
+                {   size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input);
+                    if (hint==0) zds->streamStage = zdss_init;
+                    return hint;
+            }   }
+#endif
+            {   size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format);
+                DEBUGLOG(5, "header size : %u", (U32)hSize);
+                if (ZSTD_isError(hSize)) {
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+                    U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart);
+                    if (legacyVersion) {
+                        const void* const dict = zds->ddict ? zds->ddict->dictContent : NULL;
+                        size_t const dictSize = zds->ddict ? zds->ddict->dictSize : 0;
+                        DEBUGLOG(5, "ZSTD_decompressStream: detected legacy version v0.%u", legacyVersion);
+                        /* legacy support is incompatible with static dctx */
+                        if (zds->staticSize) return ERROR(memory_allocation);
+                        CHECK_F(ZSTD_initLegacyStream(&zds->legacyContext,
+                                    zds->previousLegacyVersion, legacyVersion,
+                                    dict, dictSize));
+                        zds->legacyVersion = zds->previousLegacyVersion = legacyVersion;
+                        {   size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, legacyVersion, output, input);
+                            if (hint==0) zds->streamStage = zdss_init;   /* or stay in stage zdss_loadHeader */
+                            return hint;
+                    }   }
+#endif
+                    return hSize;   /* error */
+                }
+                if (hSize != 0) {   /* need more input : hSize is the header size to reach */
+                    size_t const toLoad = hSize - zds->lhSize;   /* if hSize!=0, hSize > zds->lhSize */
+                    size_t const remainingInput = (size_t)(iend-ip);
+                    assert(iend >= ip);
+                    if (toLoad > remainingInput) {   /* not enough input to load full header */
+                        if (remainingInput > 0) {
+                            memcpy(zds->headerBuffer + zds->lhSize, ip, remainingInput);
+                            zds->lhSize += remainingInput;
+                        }
+                        input->pos = input->size;
+                        return (MAX(ZSTD_frameHeaderSize_min, hSize) - zds->lhSize) + ZSTD_blockHeaderSize;   /* remaining header bytes + next block header */
+                    }
+                    assert(ip != NULL);
+                    memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); zds->lhSize = hSize; ip += toLoad;
+                    break;   /* back to the top of the loop : the header is parsed again with the extra bytes */
+            }   }
+
+            /* check for single-pass mode opportunity */
+            if (zds->fParams.frameContentSize && zds->fParams.windowSize /* skippable frame if == 0 */
+                && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) {
+                size_t const cSize = ZSTD_findFrameCompressedSize(istart, iend-istart);
+                if (cSize <= (size_t)(iend-istart)) {
+                    /* shortcut : using single-pass mode */
+                    size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, oend-op, istart, cSize, zds->ddict);
+                    if (ZSTD_isError(decompressedSize)) return decompressedSize;
+                    DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()");
+                    ip = istart + cSize;
+                    op += decompressedSize;
+                    zds->expected = 0;
+                    zds->streamStage = zdss_init;
+                    someMoreWork = 0;
+                    break;
+            }   }
+
+            /* Consume header (see ZSTDds_decodeFrameHeader) */
+            DEBUGLOG(4, "Consume header");
+            CHECK_F(ZSTD_decompressBegin_usingDDict(zds, zds->ddict));
+
+            if ((MEM_readLE32(zds->headerBuffer) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
+                zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_frameIdSize);
+                zds->stage = ZSTDds_skipFrame;
+            } else {
+                CHECK_F(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize));
+                zds->expected = ZSTD_blockHeaderSize;
+                zds->stage = ZSTDds_decodeBlockHeader;
+            }
+
+            /* control buffer memory usage */
+            DEBUGLOG(4, "Control max memory usage (%u KB <= max %u KB)",
+                        (U32)(zds->fParams.windowSize >>10),
+                        (U32)(zds->maxWindowSize >> 10) );
+            zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN);   /* enforce a minimum window size */
+            if (zds->fParams.windowSize > zds->maxWindowSize) return ERROR(frameParameter_windowTooLarge);
+
+            /* Adapt buffer sizes to frame header instructions */
+            {   size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */);
+                size_t const neededOutBuffSize = ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize);
+                if ((zds->inBuffSize < neededInBuffSize) || (zds->outBuffSize < neededOutBuffSize)) {   /* existing buffers are kept when both are large enough */
+                    size_t const bufferSize = neededInBuffSize + neededOutBuffSize;
+                    DEBUGLOG(4, "inBuff  : from %u to %u",
+                                (U32)zds->inBuffSize, (U32)neededInBuffSize);
+                    DEBUGLOG(4, "outBuff : from %u to %u",
+                                (U32)zds->outBuffSize, (U32)neededOutBuffSize);
+                    if (zds->staticSize) {  /* static DCtx */
+                        DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize);
+                        assert(zds->staticSize >= sizeof(ZSTD_DCtx));  /* controlled at init */
+                        if (bufferSize > zds->staticSize - sizeof(ZSTD_DCtx))
+                            return ERROR(memory_allocation);
+                    } else {
+                        ZSTD_free(zds->inBuff, zds->customMem);   /* inBuff and outBuff live in one allocation (see outBuff below) */
+                        zds->inBuffSize = 0;
+                        zds->outBuffSize = 0;
+                        zds->inBuff = (char*)ZSTD_malloc(bufferSize, zds->customMem);
+                        if (zds->inBuff == NULL) return ERROR(memory_allocation);
+                    }
+                    zds->inBuffSize = neededInBuffSize;
+                    zds->outBuff = zds->inBuff + zds->inBuffSize;   /* output area starts right after the input area */
+                    zds->outBuffSize = neededOutBuffSize;
+            }   }
+            zds->streamStage = zdss_read;
+            /* fall-through */
+
+        case zdss_read:   /* decode straight from the caller's input when a whole chunk is available */
+            DEBUGLOG(5, "stage zdss_read");
+            {   size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds);
+                DEBUGLOG(5, "neededInSize = %u", (U32)neededInSize);
+                if (neededInSize==0) {  /* end of frame */
+                    zds->streamStage = zdss_init;
+                    someMoreWork = 0;
+                    break;
+                }
+                if ((size_t)(iend-ip) >= neededInSize) {  /* decode directly from src */
+                    int const isSkipFrame = ZSTD_isSkipFrame(zds);
+                    size_t const decodedSize = ZSTD_decompressContinue(zds,
+                        zds->outBuff + zds->outStart, (isSkipFrame ? 0 : zds->outBuffSize - zds->outStart),
+                        ip, neededInSize);
+                    if (ZSTD_isError(decodedSize)) return decodedSize;
+                    ip += neededInSize;
+                    if (!decodedSize && !isSkipFrame) break;   /* this was just a header */
+                    zds->outEnd = zds->outStart + decodedSize;
+                    zds->streamStage = zdss_flush;
+                    break;
+            }   }
+            if (ip==iend) { someMoreWork = 0; break; }   /* no more input */
+            zds->streamStage = zdss_load;
+            /* fall-through */
+
+        case zdss_load:   /* otherwise gather the chunk into zds->inBuff, possibly across several calls */
+            {   size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds);
+                size_t const toLoad = neededInSize - zds->inPos;
+                int const isSkipFrame = ZSTD_isSkipFrame(zds);
+                size_t loadedSize;
+                if (isSkipFrame) {
+                    loadedSize = MIN(toLoad, (size_t)(iend-ip));   /* skipped bytes are only counted, never copied */
+                } else {
+                    if (toLoad > zds->inBuffSize - zds->inPos) return ERROR(corruption_detected);   /* should never happen */
+                    loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, iend-ip);
+                }
+                ip += loadedSize;
+                zds->inPos += loadedSize;
+                if (loadedSize < toLoad) { someMoreWork = 0; break; }   /* not enough input, wait for more */
+
+                /* decode loaded input */
+                {   size_t const decodedSize = ZSTD_decompressContinue(zds,
+                        zds->outBuff + zds->outStart, zds->outBuffSize - zds->outStart,
+                        zds->inBuff, neededInSize);
+                    if (ZSTD_isError(decodedSize)) return decodedSize;
+                    zds->inPos = 0;   /* input is consumed */
+                    if (!decodedSize && !isSkipFrame) { zds->streamStage = zdss_read; break; }   /* this was just a header */
+                    zds->outEnd = zds->outStart +  decodedSize;
+            }   }
+            zds->streamStage = zdss_flush;
+            /* fall-through */
+
+        case zdss_flush:   /* hand decoded bytes from zds->outBuff over to the caller */
+            {   size_t const toFlushSize = zds->outEnd - zds->outStart;
+                size_t const flushedSize = ZSTD_limitCopy(op, oend-op, zds->outBuff + zds->outStart, toFlushSize);
+                op += flushedSize;
+                zds->outStart += flushedSize;
+                if (flushedSize == toFlushSize) {  /* flush completed */
+                    zds->streamStage = zdss_read;
+                    if ( (zds->outBuffSize < zds->fParams.frameContentSize)
+                      && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
+                        DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)",
+                                (int)(zds->outBuffSize - zds->outStart),
+                                (U32)zds->fParams.blockSizeMax);
+                        zds->outStart = zds->outEnd = 0;
+                    }
+                    break;
+            }   }
+            /* cannot complete flush */
+            someMoreWork = 0;
+            break;
+
+        default: return ERROR(GENERIC);   /* impossible */
+    }   }
+
+    /* result */
+    input->pos = (size_t)(ip - (const char*)(input->src));
+    output->pos = (size_t)(op - (char*)(output->dst));
+    if ((ip==istart) && (op==ostart)) {  /* no forward progress */
+        zds->noForwardProgress ++;
+        if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {
+            if (op==oend) return ERROR(dstSize_tooSmall);
+            if (ip==iend) return ERROR(srcSize_wrong);
+            assert(0);   /* not expected : stalled although input remains and output has room */
+        }
+    } else {
+        zds->noForwardProgress = 0;
+    }
+    {   size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds);
+        if (!nextSrcSizeHint) {   /* frame fully decoded */
+            if (zds->outEnd == zds->outStart) {  /* output fully flushed */
+                if (zds->hostageByte) {
+                    if (input->pos >= input->size) {
+                        /* can't release hostage (not present) */
+                        zds->streamStage = zdss_read;
+                        return 1;
+                    }
+                    input->pos++;  /* release hostage */
+                }   /* zds->hostageByte */
+                return 0;
+            }  /* zds->outEnd == zds->outStart */
+            if (!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */
+                input->pos--;   /* note : pos > 0, otherwise, impossible to finish reading last block */
+                zds->hostageByte=1;
+            }
+            return 1;
+        }  /* nextSrcSizeHint==0 */
+        nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds) == ZSTDnit_block);   /* preload header of next block */
+        assert(zds->inPos <= nextSrcSizeHint);
+        nextSrcSizeHint -= zds->inPos;   /* part already loaded*/
+        return nextSrcSizeHint;
+    }
+}
+
+
+size_t ZSTD_decompress_generic(ZSTD_DCtx* dctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{   /* Thin forward : same arguments and same return value as ZSTD_decompressStream() */
+    return ZSTD_decompressStream(dctx, output, input);
+}
+
+size_t ZSTD_decompress_generic_simpleArgs (
+                            ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos)
+{   /* Same as ZSTD_decompress_generic(), with both buffers described by plain arguments */
+    ZSTD_inBuffer  in  = { src, srcSize, *srcPos };
+    ZSTD_outBuffer out = { dst, dstCapacity, *dstPos };
+    /* *dstPos and *srcPos are passed as-is : their validity is checked by ZSTD_decompressStream() */
+    size_t const status = ZSTD_decompress_generic(dctx, &out, &in);
+    *dstPos = out.pos;   /* positions are written back even when `status` is an error code */
+    *srcPos = in.pos;
+    return status;
+}
+
+void ZSTD_DCtx_reset(ZSTD_DCtx* dctx)
+{   /* Re-initializes the stream, then restores the default window limit and frame format */
+    (void)ZSTD_initDStream(dctx);                        /* return value deliberately ignored */
+    dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
+    dctx->format = ZSTD_f_zstd1;
+}
diff --git a/deps/SZ/zstd/deprecated/zbuff.h b/deps/SZ/zstd/deprecated/zbuff.h
new file mode 100644
index 0000000000000000000000000000000000000000..a93115da4a1c6d134d5eca36429bf4b211745951
--- /dev/null
+++ b/deps/SZ/zstd/deprecated/zbuff.h
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* ***************************************************************
+*  NOTES/WARNINGS
+******************************************************************/
+/* The streaming API defined here is deprecated.
+ * Consider migrating towards ZSTD_compressStream() API in `zstd.h`
+ * See 'lib/README.md'.
+ *****************************************************************/
+
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef ZSTD_BUFFERED_H_23987
+#define ZSTD_BUFFERED_H_23987
+
+/* *************************************
+*  Dependencies
+***************************************/
+#include <stddef.h>      /* size_t */
+#include "zstd.h"        /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */
+
+
+/* ***************************************************************
+*  Compiler specifics
+*****************************************************************/
+/* Deprecation warnings */
+/* Should these warnings be a problem,
+   it is generally possible to disable them,
+   typically with -Wno-deprecated-declarations for gcc
+   or _CRT_SECURE_NO_WARNINGS in Visual.
+   Otherwise, it's also possible to define ZBUFF_DISABLE_DEPRECATE_WARNINGS */
+#ifdef ZBUFF_DISABLE_DEPRECATE_WARNINGS
+#  define ZBUFF_DEPRECATED(message) ZSTDLIB_API  /* disable deprecation warnings */
+#else
+#  if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
+#    define ZBUFF_DEPRECATED(message) [[deprecated(message)]] ZSTDLIB_API
+#  elif (defined(__GNUC__) && (__GNUC__ >= 5)) || defined(__clang__)   /* attribute form carrying the message */
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated(message)))
+#  elif defined(__GNUC__) && (__GNUC__ >= 3)   /* attribute form, message is dropped */
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated))
+#  elif defined(_MSC_VER)   /* Visual : __declspec form carrying the message */
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API __declspec(deprecated(message))
+#  else   /* unrecognized compiler : emit a build-time notice and fall back to a plain export */
+#    pragma message("WARNING: You need to implement ZBUFF_DEPRECATED for this compiler")
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API
+#  endif
+#endif /* ZBUFF_DISABLE_DEPRECATE_WARNINGS */
+
+
+/* *************************************
+*  Streaming functions
+***************************************/
+/* This is the easier "buffered" streaming API,
+*  using an internal buffer to lift all restrictions on user-provided buffers
+*  which can be any size, any place, for both input and output.
+*  ZBUFF and ZSTD are 100% interoperable,
+*  frames created by one can be decoded by the other one */
+
+typedef ZSTD_CStream ZBUFF_CCtx;
+ZBUFF_DEPRECATED("use ZSTD_createCStream") ZBUFF_CCtx* ZBUFF_createCCtx(void);
+ZBUFF_DEPRECATED("use ZSTD_freeCStream")   size_t      ZBUFF_freeCCtx(ZBUFF_CCtx* cctx);
+
+ZBUFF_DEPRECATED("use ZSTD_initCStream")           size_t ZBUFF_compressInit(ZBUFF_CCtx* cctx, int compressionLevel);
+ZBUFF_DEPRECATED("use ZSTD_initCStream_usingDict") size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+
+ZBUFF_DEPRECATED("use ZSTD_compressStream") size_t ZBUFF_compressContinue(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr, const void* src, size_t* srcSizePtr);
+ZBUFF_DEPRECATED("use ZSTD_flushStream")    size_t ZBUFF_compressFlush(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr);
+ZBUFF_DEPRECATED("use ZSTD_endStream")      size_t ZBUFF_compressEnd(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr);
+
+/*-*************************************************
+*  Streaming compression - howto
+*
+*  A ZBUFF_CCtx object is required to track streaming operation.
+*  Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources.
+*  ZBUFF_CCtx objects can be reused multiple times.
+*
+*  Start by initializing ZBUFF_CCtx.
+*  Use ZBUFF_compressInit() to start a new compression operation.
+*  Use ZBUFF_compressInitDictionary() for a compression which requires a dictionary.
+*
+*  Use ZBUFF_compressContinue() repetitively to consume input stream.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written within *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present again remaining data.
+*  The content of `dst` will be overwritten (up to *dstCapacityPtr) at each call, so save its content if it matters or change @dst .
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's just a hint, to improve latency)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  At any moment, it's possible to flush whatever data remains within buffer, using ZBUFF_compressFlush().
+*  The nb of bytes written into `dst` will be reported into *dstCapacityPtr.
+*  Note that the function cannot output more than *dstCapacityPtr,
+*  therefore, some content might still be left into internal buffer if *dstCapacityPtr is too small.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  ZBUFF_compressEnd() instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  The epilogue is required for decoders to consider a frame completed.
+*  Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small.
+*  In which case, call again ZBUFF_compressFlush() to complete the flush.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : _recommended buffer_ sizes (not compulsory) : ZBUFF_recommendedCInSize() / ZBUFF_recommendedCOutSize()
+*  input : ZBUFF_recommendedCInSize==128 KB block size is the internal unit, use this value to reduce intermediate stages (better latency)
+*  output : ZBUFF_recommendedCOutSize==ZSTD_compressBound(128 KB) + 3 + 3 : ensures it's always possible to write/flush/end a full block. Skip some buffering.
+*  By using both, it ensures that input will be entirely consumed, and output will always contain the result, reducing intermediate buffering.
+* **************************************************/
+
+
+typedef ZSTD_DStream ZBUFF_DCtx;
+ZBUFF_DEPRECATED("use ZSTD_createDStream") ZBUFF_DCtx* ZBUFF_createDCtx(void);
+ZBUFF_DEPRECATED("use ZSTD_freeDStream")   size_t      ZBUFF_freeDCtx(ZBUFF_DCtx* dctx);
+
+ZBUFF_DEPRECATED("use ZSTD_initDStream")           size_t ZBUFF_decompressInit(ZBUFF_DCtx* dctx);
+ZBUFF_DEPRECATED("use ZSTD_initDStream_usingDict") size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* dctx, const void* dict, size_t dictSize);
+
+ZBUFF_DEPRECATED("use ZSTD_decompressStream") size_t ZBUFF_decompressContinue(ZBUFF_DCtx* dctx,
+                                            void* dst, size_t* dstCapacityPtr,
+                                      const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+*  Streaming decompression howto
+*
+*  A ZBUFF_DCtx object is required to track streaming operations.
+*  Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources.
+*  Use ZBUFF_decompressInit() to start a new decompression operation,
+*   or ZBUFF_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFF_DCtx objects can be re-initialized multiple times.
+*
+*  Use ZBUFF_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`.
+*  @return : 0 when a frame is completely decoded and fully flushed,
+*            1 when there is still some data left within internal buffer to flush,
+*            >1 when more data is expected, with value being a suggested next input size (it's just a hint, which helps latency),
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize() and ZBUFF_recommendedDOutSize()
+*  output : ZBUFF_recommendedDOutSize== 128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFF_recommendedDInSize == 128KB + 3;
+*           just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+ZBUFF_DEPRECATED("use ZSTD_isError")      unsigned ZBUFF_isError(size_t errorCode);
+ZBUFF_DEPRECATED("use ZSTD_getErrorName") const char* ZBUFF_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are just hints, they tend to offer better latency */
+ZBUFF_DEPRECATED("use ZSTD_CStreamInSize")  size_t ZBUFF_recommendedCInSize(void);
+ZBUFF_DEPRECATED("use ZSTD_CStreamOutSize") size_t ZBUFF_recommendedCOutSize(void);
+ZBUFF_DEPRECATED("use ZSTD_DStreamInSize")  size_t ZBUFF_recommendedDInSize(void);
+ZBUFF_DEPRECATED("use ZSTD_DStreamOutSize") size_t ZBUFF_recommendedDOutSize(void);
+
+#endif  /* ZSTD_BUFFERED_H_23987 */
+
+
+#ifdef ZBUFF_STATIC_LINKING_ONLY
+#ifndef ZBUFF_STATIC_H_30298098432
+#define ZBUFF_STATIC_H_30298098432
+
+/* ====================================================================================
+ * The definitions in this section are considered experimental.
+ * They should never be used in association with a dynamic library, as they may change in the future.
+ * They are provided for advanced usages.
+ * Use them only in association with static linking.
+ * ==================================================================================== */
+
+/*--- Dependency ---*/
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_parameters, ZSTD_customMem */
+#include "zstd.h"
+
+
+/*--- Custom memory allocator ---*/
+/*! ZBUFF_createCCtx_advanced() :
+ *  Create a ZBUFF compression context using external alloc and free functions */
+ZBUFF_DEPRECATED("use ZSTD_createCStream_advanced") ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem);
+
+/*! ZBUFF_createDCtx_advanced() :
+ *  Create a ZBUFF decompression context using external alloc and free functions */
+ZBUFF_DEPRECATED("use ZSTD_createDStream_advanced") ZBUFF_DCtx* ZBUFF_createDCtx_advanced(ZSTD_customMem customMem);
+
+
+/*--- Advanced Streaming Initialization ---*/
+ZBUFF_DEPRECATED("use ZSTD_initCStream_advanced") size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc,
+                                               const void* dict, size_t dictSize,
+                                               ZSTD_parameters params, unsigned long long pledgedSrcSize);   /* compression-side init : the hint names the CStream replacement */
+
+
+#endif    /* ZBUFF_STATIC_H_30298098432 */
+#endif    /* ZBUFF_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/deps/SZ/zstd/deprecated/zbuff_common.c b/deps/SZ/zstd/deprecated/zbuff_common.c
new file mode 100644
index 0000000000000000000000000000000000000000..661b9b0e18c5541fbd0f172d3da11c52bdc4ece8
--- /dev/null
+++ b/deps/SZ/zstd/deprecated/zbuff_common.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "error_private.h"
+#include "zbuff.h"
+
+/*-****************************************
+*  ZBUFF Error Management  (deprecated)
+******************************************/
+
+/*! ZBUFF_isError() :
+*   tells if a function result is an error code; thin forward to ERR_isError() */
+unsigned ZBUFF_isError(size_t errorCode) { return ERR_isError(errorCode); }
+/*! ZBUFF_getErrorName() :
+*   provides the error string for a function result (useful for debugging); thin forward to ERR_getErrorName() */
+const char* ZBUFF_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
diff --git a/deps/SZ/zstd/deprecated/zbuff_compress.c b/deps/SZ/zstd/deprecated/zbuff_compress.c
new file mode 100644
index 0000000000000000000000000000000000000000..f39c60d89f60412ed959020b6c4d7cdb1c469698
--- /dev/null
+++ b/deps/SZ/zstd/deprecated/zbuff_compress.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/* *************************************
+*  Dependencies
+***************************************/
+#define ZBUFF_STATIC_LINKING_ONLY
+#include "zbuff.h"
+
+
+/*-***********************************************************
+*  Streaming compression
+*
+*  A ZBUFF_CCtx object is required to track streaming operation.
+*  Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources.
+*  Use ZBUFF_compressInit() to start a new compression operation.
+*  ZBUFF_CCtx objects can be reused multiple times.
+*
+*  Use ZBUFF_compressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to call again the function with remaining input.
+*  The content of dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change dst .
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  ZBUFF_compressFlush() can be used to instruct ZBUFF to compress and output whatever remains within its buffer.
+*  Note that it will not output more than *dstCapacityPtr.
+*  Therefore, some content might still be left into its internal buffer if dst buffer is too small.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  ZBUFF_compressEnd() instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory)
+*  input : ZSTD_BLOCKSIZE_MAX (128 KB), internal unit size, it improves latency to use this value.
+*  output : ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + ZBUFF_endFrameSize : ensures it's always possible to write/flush/end a full block at best speed.
+* ***********************************************************/
+
+ZBUFF_CCtx* ZBUFF_createCCtx(void) {
+    ZBUFF_CCtx* const cctx = ZSTD_createCStream();   /* deprecated shim: plain forward */
+    return cctx;
+}
+
+ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem) {
+    ZBUFF_CCtx* const cctx = ZSTD_createCStream_advanced(customMem);   /* customMem is passed through as-is */
+    return cctx;
+}
+
+size_t ZBUFF_freeCCtx(ZBUFF_CCtx* zbc) {
+    size_t const status = ZSTD_freeCStream(zbc);   /* deprecated shim: plain forward */
+    return status;
+}
+
+
+/* ======   Initialization   ====== */
+
+size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc,
+                                   const void* dict, size_t dictSize,
+                                   ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{   /* ZBUFF callers pass 0 for "size unknown" : translate it to the ZSTD sentinel before forwarding */
+    unsigned long long const srcSize = (pledgedSrcSize != 0) ? pledgedSrcSize : ZSTD_CONTENTSIZE_UNKNOWN;
+    return ZSTD_initCStream_advanced(zbc, dict, dictSize, params, srcSize);
+}
+
+
+size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* zbc, const void* dict, size_t dictSize, int compressionLevel) {
+    size_t const status = ZSTD_initCStream_usingDict(zbc, dict, dictSize, compressionLevel);   /* plain forward */
+    return status;
+}
+
+size_t ZBUFF_compressInit(ZBUFF_CCtx* zbc, int compressionLevel) {
+    size_t const status = ZSTD_initCStream(zbc, compressionLevel);   /* deprecated shim: plain forward */
+    return status;
+}
+
+/* ======   Compression   ====== */
+
+
+size_t ZBUFF_compressContinue(ZBUFF_CCtx* zbc,
+                              void* dst, size_t* dstCapacityPtr,
+                        const void* src, size_t* srcSizePtr)
+{   /* Wrap the caller's raw buffers into ZSTD stream descriptors, both starting at position 0. */
+    ZSTD_outBuffer output;
+    ZSTD_inBuffer input;
+    output.dst  = dst;
+    output.size = *dstCapacityPtr;
+    output.pos  = 0;
+    input.src   = src;
+    input.size  = *srcSizePtr;
+    input.pos   = 0;
+    {   size_t const hint = ZSTD_compressStream(zbc, &output, &input);
+        *dstCapacityPtr = output.pos;   /* report how many bytes were written into dst */
+        *srcSizePtr     = input.pos;    /* report how many bytes were read from src */
+        return hint;
+    }
+}
+
+
+
+/* ======   Finalize   ====== */
+
+size_t ZBUFF_compressFlush(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr)
+{   /* Describe dst as a ZSTD output buffer of the advertised capacity, starting at position 0. */
+    ZSTD_outBuffer output;
+    output.dst  = dst;
+    output.size = *dstCapacityPtr;
+    output.pos  = 0;
+    {   size_t const flushResult = ZSTD_flushStream(zbc, &output);
+        *dstCapacityPtr = output.pos;   /* report how many bytes were written into dst */
+        return flushResult;
+    }
+}
+
+
+size_t ZBUFF_compressEnd(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr)
+{   /* Describe dst as a ZSTD output buffer of the advertised capacity, starting at position 0. */
+    ZSTD_outBuffer output;
+    output.dst  = dst;
+    output.size = *dstCapacityPtr;
+    output.pos  = 0;
+    {   size_t const endResult = ZSTD_endStream(zbc, &output);
+        *dstCapacityPtr = output.pos;   /* report how many bytes were written into dst */
+        return endResult;
+    }
+}
+
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+size_t ZBUFF_recommendedCInSize(void)  { size_t const inSize = ZSTD_CStreamInSize(); return inSize; }   /* plain forward */
+size_t ZBUFF_recommendedCOutSize(void) { size_t const outSize = ZSTD_CStreamOutSize(); return outSize; }   /* plain forward */
diff --git a/deps/SZ/zstd/deprecated/zbuff_decompress.c b/deps/SZ/zstd/deprecated/zbuff_decompress.c
new file mode 100644
index 0000000000000000000000000000000000000000..923c22b73c5767cea42514d15c4c1a41775a6c40
--- /dev/null
+++ b/deps/SZ/zstd/deprecated/zbuff_decompress.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/* *************************************
+*  Dependencies
+***************************************/
+#define ZBUFF_STATIC_LINKING_ONLY
+#include "zbuff.h"
+
+
+ZBUFF_DCtx* ZBUFF_createDCtx(void) {
+    ZBUFF_DCtx* const dctx = ZSTD_createDStream();   /* deprecated shim: plain forward */
+    return dctx;
+}
+
+ZBUFF_DCtx* ZBUFF_createDCtx_advanced(ZSTD_customMem customMem) {
+    ZBUFF_DCtx* const dctx = ZSTD_createDStream_advanced(customMem);   /* customMem is passed through as-is */
+    return dctx;
+}
+
+size_t ZBUFF_freeDCtx(ZBUFF_DCtx* zbd) {
+    size_t const status = ZSTD_freeDStream(zbd);   /* deprecated shim: plain forward */
+    return status;
+}
+
+
+/* *** Initialization *** */
+
+size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* zbd, const void* dict, size_t dictSize) {
+    size_t const status = ZSTD_initDStream_usingDict(zbd, dict, dictSize);   /* deprecated shim: plain forward */
+    return status;
+}
+
+size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbd) {
+    size_t const status = ZSTD_initDStream(zbd);   /* deprecated shim: plain forward */
+    return status;
+}
+
+
+/* *** Decompression *** */
+
+size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbd,
+                                void* dst, size_t* dstCapacityPtr,
+                          const void* src, size_t* srcSizePtr)
+{   /* Wrap the caller's raw buffers into ZSTD stream descriptors, both starting at position 0. */
+    ZSTD_outBuffer output;
+    ZSTD_inBuffer input;
+    output.dst  = dst;
+    output.size = *dstCapacityPtr;
+    output.pos  = 0;
+    input.src   = src;
+    input.size  = *srcSizePtr;
+    input.pos   = 0;
+    {   size_t const streamResult = ZSTD_decompressStream(zbd, &output, &input);
+        *dstCapacityPtr = output.pos;   /* report how many bytes were written into dst */
+        *srcSizePtr     = input.pos;    /* report how many bytes were read from src */
+        return streamResult;
+    }
+}
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+size_t ZBUFF_recommendedDInSize(void)  { size_t const inSize = ZSTD_DStreamInSize(); return inSize; }   /* plain forward */
+size_t ZBUFF_recommendedDOutSize(void) { size_t const outSize = ZSTD_DStreamOutSize(); return outSize; }   /* plain forward */
diff --git a/deps/SZ/zstd/dictBuilder/cover.c b/deps/SZ/zstd/dictBuilder/cover.c
new file mode 100644
index 0000000000000000000000000000000000000000..448f713720fa04fa8c0454759b5f9196b5727843
--- /dev/null
+++ b/deps/SZ/zstd/dictBuilder/cover.c
@@ -0,0 +1,1055 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* *****************************************************************************
+ * Constructs a dictionary using a heuristic based on the following paper:
+ *
+ * Liao, Petri, Moffat, Wirth
+ * Effective Construction of Relative Lempel-Ziv Dictionaries
+ * Published in WWW 2016.
+ *
+ * Adapted from code originally written by @ot (Giuseppe Ottaviano).
+ ******************************************************************************/
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+/*-*************************************
+*  Constants : COVER_MAX_SAMPLES_SIZE is the exclusive cap on the total samples size (U32 max when size_t is 8 bytes, else 1 GB)
+***************************************/
+#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+
+/*-*************************************
+*  Console display
+***************************************/
+static int g_displayLevel = 2;
+#define DISPLAY(...)                                                           \
+  {                                                                            \
+    fprintf(stderr, __VA_ARGS__);                                              \
+    fflush(stderr);                                                            \
+  }
+#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
+  if (displayLevel >= l) {                                                     \
+    DISPLAY(__VA_ARGS__);                                                      \
+  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
+
+/*-*************************************
+* Hash table
+***************************************
+* A small specialized hash map for storing activeDmers.
+* The map does not resize, so if it becomes full it will loop forever.
+* Thus, the map must be large enough to store every value.
+* The map implements linear probing and keeps its load less than 0.5.
+*/
+
+#define MAP_EMPTY_VALUE ((U32)-1)   /* `value` sentinel marking an unused slot */
+typedef struct COVER_map_pair_t_s {
+  U32 key;     /* only meaningful when value != MAP_EMPTY_VALUE */
+  U32 value;   /* MAP_EMPTY_VALUE when the slot is free */
+} COVER_map_pair_t;
+
+typedef struct COVER_map_s {
+  COVER_map_pair_t *data;   /* slot array holding `size` entries */
+  U32 sizeLog;              /* log2 of `size` */
+  U32 size;                 /* number of slots, set to 1 << sizeLog */
+  U32 sizeMask;             /* size - 1, used to wrap probe indices */
+} COVER_map_t;
+
+/** Empties the map: fills the whole slot array with the MAP_EMPTY_VALUE byte pattern. */
+static void COVER_map_clear(COVER_map_t *map) {
+  size_t const slotCount = map->size;
+  size_t const nbBytes = slotCount * sizeof(*map->data);
+  memset(map->data, MAP_EMPTY_VALUE, nbBytes);
+}
+
+/**
+ * Sets up `map` with 2^(ZSTD_highbit32(size) + 2) empty slots, sized to hold `size` elements.
+ * Returns 1 on success (then release with COVER_map_destroy()) and 0 if allocation fails.
+ */
+static int COVER_map_init(COVER_map_t *map, U32 size) {
+  U32 const sizeLog = ZSTD_highbit32(size) + 2;
+  U32 const nbSlots = (U32)1 << sizeLog;
+  map->sizeLog = sizeLog;
+  map->size = nbSlots;
+  map->sizeMask = nbSlots - 1;
+  map->data = (COVER_map_pair_t *)malloc(nbSlots * sizeof(*map->data));
+  if (map->data == NULL) {
+    map->sizeLog = 0;
+    map->size = 0;
+    return 0;
+  }
+  COVER_map_clear(map);
+  return 1;
+}
+
+/** Internal hash: multiplies `key` by a constant and keeps the top `sizeLog` bits. */
+static const U32 prime4bytes = 2654435761U;
+static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
+  U32 const product = key * prime4bytes; /* unsigned product, wraps on overflow */
+  U32 const shift = 32 - map->sizeLog;
+  return product >> shift;
+}
+
+/**
+ * Helper that locates the slot for `key`: starting at the key's hash, it probes
+ * consecutive slots (wrapping around) and returns the index of the first one
+ * that is empty or already holds `key`.
+ * It never returns if the map is full and `key` is absent.
+ */
+static U32 COVER_map_index(COVER_map_t *map, U32 key) {
+  U32 idx = COVER_map_hash(map, key);
+  for (;;) {
+    const COVER_map_pair_t *const slot = &map->data[idx];
+    if (slot->value == MAP_EMPTY_VALUE || slot->key == key) {
+      return idx;
+    }
+    idx = (idx + 1) & map->sizeMask;
+  }
+}
+
+/**
+ * Returns the address of the value stored for `key`; an absent key is inserted
+ * first with value 0.  The map must not be full.
+ */
+static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
+  COVER_map_pair_t *const slot = map->data + COVER_map_index(map, key);
+  if (slot->value != MAP_EMPTY_VALUE) {
+    return &slot->value; /* key already present */
+  }
+  slot->key = key;
+  slot->value = 0;
+  return &slot->value;
+}
+
+/**
+ * Deletes key from the map if present, moving later entries back so linear-probe lookups still reach them.
+ */
+static void COVER_map_remove(COVER_map_t *map, U32 key) {
+  U32 i = COVER_map_index(map, key);
+  COVER_map_pair_t *del = &map->data[i];  /* the hole: either refilled or finally marked empty */
+  U32 shift = 1;                          /* number of slots between `del` and slot i */
+  if (del->value == MAP_EMPTY_VALUE) {
+    return;  /* key is not in the map */
+  }
+  for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
+    COVER_map_pair_t *const pos = &map->data[i];
+    /* If the position is empty we are done */
+    if (pos->value == MAP_EMPTY_VALUE) {
+      del->value = MAP_EMPTY_VALUE;
+      return;
+    }
+    /* If pos can be moved to del do so (pos sits at least `shift` slots past its hash slot) */
+    if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
+      del->key = pos->key;
+      del->value = pos->value;
+      del = pos;   /* the hole is now where pos was */
+      shift = 1;
+    } else {
+      ++shift;
+    }
+  }
+}
+
+/**
+ * Frees the slot array of a map set up by COVER_map_init(); data becomes NULL and size 0.
+ */
+static void COVER_map_destroy(COVER_map_t *map) {
+  if (map->data != NULL) {
+    free(map->data);
+    map->data = NULL;
+  }
+  map->size = 0;
+}
+
+/*-*************************************
+* Context
+***************************************/
+
+typedef struct {
+  const BYTE *samples;          /* caller's samples buffer (not owned) */
+  size_t *offsets;              /* nbSamples + 1 cumulative sums of samplesSizes, starting at 0 */
+  const size_t *samplesSizes;   /* caller's array of sample sizes (not owned) */
+  size_t nbSamples;             /* number of entries in samplesSizes */
+  U32 *suffix;                  /* partial suffix array; NULL once COVER_ctx_init() hands it over to freqs */
+  size_t suffixSize;            /* number of entries in suffix and in dmerAt */
+  U32 *freqs;                   /* after init: the former suffix buffer, holding one frequency per dmerId */
+  U32 *dmerAt;                  /* maps a position to its dmerId */
+  unsigned d;                   /* dmer length in bytes */
+} COVER_ctx_t;
+
+/* We need a global context for qsort... (read by COVER_strict_cmp / COVER_strict_cmp8) */
+static COVER_ctx_t *g_ctx = NULL;
+
+/*-*************************************
+*  Helper functions
+***************************************/
+
+/**
+ * Returns the sum of the first `nbSamples` entries of `samplesSizes` (0 if nbSamples is 0).
+ */
+static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
+  size_t total = 0;
+  unsigned remaining = nbSamples;
+  while (remaining-- > 0) {
+    total += samplesSizes[remaining];
+  }
+  return total;
+}
+
+/**
+ * Compares the dmers that start at the positions stored in *lp and *rp.
+ * Returns the memcmp() result over ctx->d bytes: negative if the left dmer
+ * is smaller, 0 if both are equal, positive if it is greater.
+ */
+static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
+  const BYTE *const left = ctx->samples + *(const U32 *)lp;
+  const BYTE *const right = ctx->samples + *(const U32 *)rp;
+  return memcmp(left, right, ctx->d);
+}
+/**
+ * Variant of COVER_cmp() for d <= 8.  Loads 8 bytes at each position with
+ * MEM_readLE64(), keeps the low d bytes and compares the two words.
+ * Returns -1, 0 or 1.
+ */
+static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
+  /* d == 8 is special-cased: shifting a 64-bit value by 64 is undefined */
+  U64 const mask = (ctx->d == 8) ? ~(U64)0 : (((U64)1 << (8 * ctx->d)) - 1);
+  U64 const left = MEM_readLE64(ctx->samples + *(const U32 *)lp) & mask;
+  U64 const right = MEM_readLE64(ctx->samples + *(const U32 *)rp) & mask;
+  return (left > right) - (left < right);
+}
+
+/**
+ * qsort()-style comparator on top of COVER_cmp(); ties are broken by the
+ * addresses lp and rp, so 0 is never returned.
+ * NOTE: g_ctx must be set first, since qsort() cannot pass an opaque pointer.
+ */
+static int COVER_strict_cmp(const void *lp, const void *rp) {
+  int const order = COVER_cmp(g_ctx, lp, rp);
+  if (order != 0) {
+    return order;
+  }
+  return (lp < rp) ? -1 : 1;
+}
+/**
+ * COVER_strict_cmp() counterpart for d <= 8, built on COVER_cmp8(); reads g_ctx.
+ */
+static int COVER_strict_cmp8(const void *lp, const void *rp) {
+  int const order = COVER_cmp8(g_ctx, lp, rp);
+  if (order != 0) {
+    return order;
+  }
+  return (lp < rp) ? -1 : 1;
+}
+
+/**
+ * Binary search over the sorted range [first, last).
+ * Returns a pointer to the first element that does not compare less than
+ * `value`, or `last` when no such element exists.
+ */
+static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
+                                       size_t value) {
+  const size_t *lo = first;
+  const size_t *hi = last;
+  while (lo != hi) {
+    const size_t *const mid = lo + (hi - lo) / 2;
+    if (*mid < value) {
+      /* everything up to and including mid is too small */
+      lo = mid + 1;
+    } else {
+      hi = mid;
+    }
+  }
+  return lo;
+}
+
+/**
+ * Generic groupBy function.
+ * Groups an array of `count` elements of `size` bytes, sorted by cmp, into runs of
+ * equivalent values.  Calls grp(ctx, runBegin, runEnd) once per run, in array order.
+ */
+static void
+COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
+              int (*cmp)(COVER_ctx_t *, const void *, const void *),
+              void (*grp)(COVER_ctx_t *, const void *, const void *)) {
+  const BYTE *ptr = (const BYTE *)data; /* first element of the current run */
+  size_t num = 0;                       /* number of elements consumed so far */
+  while (num < count) {
+    const BYTE *grpEnd = ptr + size;    /* one past the last element of the run */
+    ++num;
+    while (num < count && cmp(ctx, ptr, grpEnd) == 0) { /* next element equals the run's first */
+      grpEnd += size;
+      ++num;
+    }
+    grp(ctx, ptr, grpEnd);
+    ptr = grpEnd;
+  }
+}
+
+/*-*************************************
+*  Cover functions
+***************************************/
+
+/**
+ * Called by COVER_groupBy() on each run [group, groupEnd) of suffix entries that share a dmer.
+ * Counts how many samples the dmer appears in and stores that count in ctx->suffix[dmerId].
+ * Fills `ctx->dmerAt` with dmerId for every position of the group.
+ */
+static void COVER_group(COVER_ctx_t *ctx, const void *group,
+                        const void *groupEnd) {
+  /* The group consists of all the positions with the same first d bytes. */
+  const U32 *grpPtr = (const U32 *)group;
+  const U32 *grpEnd = (const U32 *)groupEnd;
+  /* The dmerId is how we will reference this dmer.
+   * This allows us to map the whole dmer space to a much smaller space, the
+   * size of the suffix array.
+   */
+  const U32 dmerId = (U32)(grpPtr - ctx->suffix);
+  /* Count the number of samples this dmer shows up in */
+  U32 freq = 0;
+  /* Details */
+  const size_t *curOffsetPtr = ctx->offsets;
+  const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
+  /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
+   * different sample than the last.
+   */
+  size_t curSampleEnd = ctx->offsets[0];
+  for (; grpPtr != grpEnd; ++grpPtr) {
+    /* Save the dmerId for this position so we can get back to it. */
+    ctx->dmerAt[*grpPtr] = dmerId;
+    /* Dictionaries only help for the first reference to the dmer.
+     * After that zstd can reference the match from the previous reference.
+     * So only count each dmer once for each sample it is in.
+     */
+    if (*grpPtr < curSampleEnd) {
+      continue;
+    }
+    freq += 1;
+    /* Binary search to find the end of the sample *grpPtr is in.
+     * In the common case that grpPtr + 1 == grpEnd we can skip the binary
+     * search because the loop is over.
+     */
+    if (grpPtr + 1 != grpEnd) {
+      const size_t *sampleEndPtr =
+          COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
+      curSampleEnd = *sampleEndPtr;
+      curOffsetPtr = sampleEndPtr + 1;
+    }
+  }
+  /* At this point we are never going to look at this segment of the suffix
+   * array again.  We take advantage of this fact to save memory.
+   * We store the frequency of the dmer in the first position of the group,
+   * which is dmerId.
+   */
+  ctx->suffix[dmerId] = freq;
+}
+
+/**
+ * A segment is a range of positions in the source as well as the score of the segment.
+ */
+typedef struct {
+  U32 begin;   /* first position of the range (inclusive) */
+  U32 end;     /* one past the last position of the range */
+  U32 score;   /* sum of the frequencies of the distinct dmers in the range */
+} COVER_segment_t;
+
+/**
+ * Selects the best segment in the epoch [begin, end) and zeroes the freqs of the dmers it covers.
+ * Segments are scored according to the function:
+ *
+ * Let F(d) be the frequency of dmer d.
+ * Let S_i be the dmer at position i of segment S which has length k.
+ *
+ *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
+ *
+ * Once the dmer d is in the dictionary we set F(d) = 0.
+ */
+static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
+                                           COVER_map_t *activeDmers, U32 begin,
+                                           U32 end,
+                                           ZDICT_cover_params_t parameters) {
+  /* Constants */
+  const U32 k = parameters.k;
+  const U32 d = parameters.d;
+  const U32 dmersInK = k - d + 1;
+  /* Try each segment (activeSegment) and save the best (bestSegment) */
+  COVER_segment_t bestSegment = {0, 0, 0};
+  COVER_segment_t activeSegment;
+  /* Reset the activeDmers in the segment */
+  COVER_map_clear(activeDmers);
+  /* The activeSegment starts at the beginning of the epoch. */
+  activeSegment.begin = begin;
+  activeSegment.end = begin;
+  activeSegment.score = 0;
+  /* Slide the activeSegment through the whole epoch.
+   * Save the best segment in bestSegment.
+   */
+  while (activeSegment.end < end) {
+    /* The dmerId for the dmer at the next position */
+    U32 newDmer = ctx->dmerAt[activeSegment.end];
+    /* The entry in activeDmers for this dmerId */
+    U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
+    /* If the dmer isn't already present in the segment add its score. */
+    if (*newDmerOcc == 0) {
+      /* The paper suggests using the L-0.5 norm, but experiments show that it
+       * doesn't help.
+       */
+      activeSegment.score += freqs[newDmer];
+    }
+    /* Add the dmer to the segment */
+    activeSegment.end += 1;
+    *newDmerOcc += 1;
+
+    /* If the window is now too large, drop the first position */
+    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+      U32 delDmer = ctx->dmerAt[activeSegment.begin];
+      U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
+      activeSegment.begin += 1;
+      *delDmerOcc -= 1;
+      /* If this is the last occurrence of the dmer, subtract its score */
+      if (*delDmerOcc == 0) {
+        COVER_map_remove(activeDmers, delDmer);
+        activeSegment.score -= freqs[delDmer];
+      }
+    }
+
+    /* If this segment is the best so far save it */
+    if (activeSegment.score > bestSegment.score) {
+      bestSegment = activeSegment;
+    }
+  }
+  {
+    /* Trim off the zero frequency head and tail from the segment. */
+    U32 newBegin = bestSegment.end;
+    U32 newEnd = bestSegment.begin;
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      U32 freq = freqs[ctx->dmerAt[pos]];
+      if (freq != 0) {
+        newBegin = MIN(newBegin, pos);
+        newEnd = pos + 1;
+      }
+    }
+    bestSegment.begin = newBegin;
+    bestSegment.end = newEnd;
+  }
+  {
+    /* Zero out the frequency of each dmer covered by the chosen segment. */
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      freqs[ctx->dmerAt[pos]] = 0;
+    }
+  }
+  return bestSegment;
+}
+
+/**
+ * Validates the cover parameters against the dictionary capacity.
+ * Returns 1 when d and k are non-zero, k <= maxDictSize and d <= k; 0 otherwise.
+ */
+static int COVER_checkParameters(ZDICT_cover_params_t parameters,
+                                 size_t maxDictSize) {
+  /* k and d are both required */
+  int const haveBoth = (parameters.d != 0) && (parameters.k != 0);
+  /* k must not exceed the dictionary capacity */
+  int const kFitsDict = (parameters.k <= maxDictSize);
+  /* d must not exceed k */
+  int const dFitsK = (parameters.d <= parameters.k);
+  if (!haveBoth) {
+    return 0;
+  }
+  if (!kFitsDict || !dFitsK) {
+    return 0;
+  }
+  return 1;
+}
+
+/**
+ * Clean up a context initialized with `COVER_ctx_init()`.
+ * Frees the suffix, freqs, dmerAt and offsets buffers and resets each pointer
+ * to NULL; `samples` and `samplesSizes` are left untouched.
+ * A NULL `ctx` is ignored.
+ */
+static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
+  if (ctx == NULL) {
+    return;
+  }
+  /* free(NULL) is a no-op, so no pointer needs to be tested beforehand. */
+  /* partial suffix array */
+  free(ctx->suffix);
+  ctx->suffix = NULL;
+  /* dmer frequencies */
+  free(ctx->freqs);
+  ctx->freqs = NULL;
+  /* position -> dmerId table */
+  free(ctx->dmerAt);
+  ctx->dmerAt = NULL;
+  /* sample offsets */
+  free(ctx->offsets);
+  ctx->offsets = NULL;
+}
+
+/**
+ * Prepare a context for dictionary building.
+ * The context is only dependent on the parameter `d` and can be used multiple
+ * times.
+ * Returns 1 on success or zero on error (total samples size out of range, or
+ * allocation failure); on error the context holds no buffer.
+ * On success the context must be destroyed with `COVER_ctx_destroy()`.
+ */
+static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
+                          const size_t *samplesSizes, unsigned nbSamples,
+                          unsigned d) {
+  const BYTE *const samples = (const BYTE *)samplesBuffer;
+  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+  /* COVER_cmp8() loads 8 bytes per position, hence the sizeof(U64) floor. */
+  const size_t minSamplesSize = MAX(d, sizeof(U64));
+  /* Zero the context first so it is in a known state on every exit path */
+  memset(ctx, 0, sizeof(*ctx));
+  /* Checks : "too small" and "too large" each get their own diagnostic */
+  if (totalSamplesSize < minSamplesSize) {
+    DISPLAYLEVEL(1, "Total samples size is too small (%u bytes), minimum size is %u bytes\n",
+                 (U32)totalSamplesSize, (U32)minSamplesSize);
+    return 0;
+  }
+  if (totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
+    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                 (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
+    return 0;
+  }
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples,
+               (U32)totalSamplesSize);
+  ctx->samples = samples;
+  ctx->samplesSizes = samplesSizes;
+  ctx->nbSamples = nbSamples;
+  /* Partial suffix array */
+  ctx->suffixSize = totalSamplesSize - minSamplesSize + 1;
+  ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  /* Maps index to the dmerID */
+  ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  /* The offsets of each file */
+  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
+  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
+    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
+    COVER_ctx_destroy(ctx);
+    return 0;
+  }
+  ctx->freqs = NULL;
+  ctx->d = d;
+
+  /* Fill offsets from the samplesSizes */
+  {
+    U32 i;
+    ctx->offsets[0] = 0;
+    for (i = 1; i <= nbSamples; ++i) {
+      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+    }
+  }
+  DISPLAYLEVEL(2, "Constructing partial suffix array\n");
+  {
+    /* suffix is a partial suffix array: it only sorts suffixes by their first d bytes.
+     * The sort is stable, so each dmer group is sorted by position in input. */
+    U32 i;
+    for (i = 0; i < ctx->suffixSize; ++i) {
+      ctx->suffix[i] = i;
+    }
+    /* qsort doesn't take an opaque pointer, so pass as a global.
+     * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is. */
+    g_ctx = ctx;
+#if defined(__OpenBSD__)
+    mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
+          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+#else
+    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
+          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+#endif
+  }
+  DISPLAYLEVEL(2, "Computing frequencies\n");
+  /* For each dmer group (positions sharing their first d bytes), COVER_group():
+   * 1. sets dmerAt[position] = dmerID, where dmerID = (groupBeginPtr - suffix);
+   * 2. counts how many samples the dmer occurs in and saves it in freqs[dmerId]. */
+  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
+                (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
+  ctx->freqs = ctx->suffix;
+  ctx->suffix = NULL;
+  return 1;
+}
+
+/**
+ * Given the prepared context build the dictionary: content is written at the end of dictBuffer and the returned `tail` is the offset where it begins.
+ */
+static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
+                                    COVER_map_t *activeDmers, void *dictBuffer,
+                                    size_t dictBufferCapacity,
+                                    ZDICT_cover_params_t parameters) {
+  BYTE *const dict = (BYTE *)dictBuffer;
+  size_t tail = dictBufferCapacity; /* offset of the first used byte; decreases as segments are added */
+  /* Divide the data up into epochs of equal size.
+   * We will select at least one segment from each epoch.
+   */
+  const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k / 4));
+  const U32 epochSize = (U32)(ctx->suffixSize / epochs);
+  size_t epoch;
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
+               epochSize);
+  /* Loop through the epochs until there are no more segments or the dictionary
+   * is full.
+   */
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) { /* epochs are visited round-robin */
+    const U32 epochBegin = (U32)(epoch * epochSize);
+    const U32 epochEnd = epochBegin + epochSize;
+    size_t segmentSize;
+    /* Select a segment */
+    COVER_segment_t segment = COVER_selectSegment(
+        ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
+    /* If the segment covers no dmers, then we are out of content */
+    if (segment.score == 0) {
+      break;
+    }
+    /* Trim the segment if necessary and if it is too small then we are done */
+    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+    if (segmentSize < parameters.d) {
+      break;
+    }
+    /* We fill the dictionary from the back to allow the best segments to be
+     * referenced with the smallest offsets.
+     */
+    tail -= segmentSize;
+    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+    DISPLAYUPDATE(
+        2, "\r%u%%       ",
+        (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+  }
+  DISPLAYLEVEL(2, "\r%79s\r", ""); /* blanks out the progress line */
+  return tail;
+}
+
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t parameters)
+{
+  BYTE* const dict = (BYTE*)dictBuffer;
+  COVER_ctx_t ctx;
+  COVER_map_t activeDmers;
+  /* Returns ZDICT_finalizeDictionary()'s result, or an error code when a check or an allocation fails. */
+  /* Initialize global data */
+  g_displayLevel = parameters.zParams.notificationLevel;
+  /* Checks */
+  if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
+    DISPLAYLEVEL(1, "Cover parameters incorrect\n");
+    return ERROR(GENERIC);
+  }
+  if (nbSamples == 0) {
+    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
+    return ERROR(GENERIC);
+  }
+  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                 ZDICT_DICTSIZE_MIN);
+    return ERROR(dstSize_tooSmall);
+  }
+  /* Initialize context and activeDmers */
+  if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+                      parameters.d)) {
+    return ERROR(GENERIC);
+  }
+  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
+    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
+    COVER_ctx_destroy(&ctx);
+    return ERROR(GENERIC);
+  }
+  /* The selected content occupies [dict + tail, dict + dictBufferCapacity) and is passed on for finalization. */
+  DISPLAYLEVEL(2, "Building dictionary\n");
+  {
+    const size_t tail =
+        COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
+                              dictBufferCapacity, parameters);
+    const size_t dictionarySize = ZDICT_finalizeDictionary(
+        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+        samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
+    if (!ZSTD_isError(dictionarySize)) {
+      DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+                   (U32)dictionarySize);
+    }
+    COVER_ctx_destroy(&ctx);
+    COVER_map_destroy(&activeDmers);
+    return dictionarySize;
+  }
+}
+
+/**
+ * COVER_best_t is used for two purposes:
+ * 1. Synchronizing threads.
+ * 2. Saving the best parameters and dictionary.
+ *
+ * All of the methods except COVER_best_init() are thread safe if zstd is
+ * compiled with multithreaded support.
+ */
+typedef struct COVER_best_s {
+  ZSTD_pthread_mutex_t mutex;        /* held while liveJobs is read or modified */
+  ZSTD_pthread_cond_t cond;          /* waited on by COVER_best_wait() */
+  size_t liveJobs;                   /* incremented by COVER_best_start() */
+  void *dict;                        /* saved dictionary, freed by COVER_best_destroy(); NULL after init */
+  size_t dictSize;                   /* 0 after init */
+  ZDICT_cover_params_t parameters;   /* zeroed by COVER_best_init() */
+  size_t compressedSize;             /* (size_t)-1 after init */
+} COVER_best_t;
+
+/**
+ * Put a `COVER_best_t` into its starting state: fresh mutex and condition
+ * variable, no live jobs, no saved dictionary, zeroed parameters, and the
+ * largest possible compressedSize so that any real result compares smaller.
+ * A NULL `best` is accepted and ignored.
+ */
+static void COVER_best_init(COVER_best_t *best) {
+  if (!best) {
+    /* compatible with init on NULL */
+    return;
+  }
+  /* Synchronization primitives; their return codes are deliberately unused */
+  (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
+  (void)ZSTD_pthread_cond_init(&best->cond, NULL);
+  /* Nothing recorded yet */
+  memset(&best->parameters, 0, sizeof(best->parameters));
+  best->compressedSize = (size_t)-1;
+  best->dictSize = 0;
+  best->dict = NULL;
+  best->liveJobs = 0;
+}
+
+/**
+ * Block the caller until no job is live any more (liveJobs == 0).
+ * A NULL `best` is accepted and ignored.
+ */
+static void COVER_best_wait(COVER_best_t *best) {
+  if (best != NULL) {
+    ZSTD_pthread_mutex_lock(&best->mutex);
+    /* Re-check the counter after every wake-up */
+    while (best->liveJobs > 0) {
+      ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
+    }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
+  }
+}
+
+/**
+ * Wait for all live jobs via COVER_best_wait(), then release the saved
+ * dictionary and tear down the mutex and condition variable.
+ * A NULL `best` is accepted and ignored.
+ */
+static void COVER_best_destroy(COVER_best_t *best) {
+  if (best != NULL) {
+    COVER_best_wait(best);
+    /* free() ignores NULL, so no separate check is needed */
+    free(best->dict);
+    ZSTD_pthread_mutex_destroy(&best->mutex);
+    ZSTD_pthread_cond_destroy(&best->cond);
+  }
+}
+
+/**
+ * Register one more live job. Call this right before a job is launched.
+ * A NULL `best` is accepted and ignored.
+ */
+static void COVER_best_start(COVER_best_t *best) {
+  if (best != NULL) {
+    ZSTD_pthread_mutex_lock(&best->mutex);
+    best->liveJobs += 1;
+    ZSTD_pthread_mutex_unlock(&best->mutex);
+  }
+}
+
+/**
+ * Called when a thread finishes executing, both on error or success.
+ * Decrements liveJobs and wakes any waiting threads if liveJobs == 0.
+ * If this dictionary is the best so far (strictly smaller compressedSize),
+ * a private copy of it and its parameters is saved in `best`.
+ *
+ * best           : shared tracker; NULL makes the call a no-op.
+ * compressedSize : score of this attempt; smaller is better.
+ * parameters     : parameters that produced `dict`.
+ * dict, dictSize : candidate dictionary; copied only when it becomes the best.
+ *
+ * If the copy cannot be allocated, best->compressedSize is set to
+ * ERROR(GENERIC) and best->dictSize to 0. That path shares the common exit,
+ * so the mutex is always released and waiters are always notified.
+ */
+static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
+                              ZDICT_cover_params_t parameters, void *dict,
+                              size_t dictSize) {
+  if (!best) {
+    return;
+  }
+  ZSTD_pthread_mutex_lock(&best->mutex);
+  --best->liveJobs;
+  /* If the new dictionary is better */
+  if (compressedSize < best->compressedSize) {
+    /* Allocate space if necessary */
+    if (!best->dict || best->dictSize < dictSize) {
+      if (best->dict) {
+        free(best->dict);
+      }
+      best->dict = malloc(dictSize);
+      if (!best->dict) {
+        /* Record the failure and fall through to the common exit below */
+        best->compressedSize = ERROR(GENERIC);
+        best->dictSize = 0;
+      }
+    }
+    /* Save the dictionary, parameters, and size */
+    if (best->dict) {
+      memcpy(best->dict, dict, dictSize);
+      best->dictSize = dictSize;
+      best->parameters = parameters;
+      best->compressedSize = compressedSize;
+    }
+  }
+  /* Broadcast while still holding the mutex: a waiter cannot observe
+   * liveJobs == 0 and destroy the condition variable before this call. */
+  if (best->liveJobs == 0) {
+    ZSTD_pthread_cond_broadcast(&best->cond);
+  }
+  ZSTD_pthread_mutex_unlock(&best->mutex);
+}
+
+/**
+ * Parameters for COVER_tryParameters().
+ * One instance is malloc'ed per job; COVER_tryParameters() frees it.
+ */
+typedef struct COVER_tryParameters_data_s {
+  /* Context for the current value of d; only read by the job */
+  const COVER_ctx_t *ctx;
+  /* Tracker that receives the job's result via COVER_best_finish() */
+  COVER_best_t *best;
+  /* Size of the dictionary buffer the job allocates for itself */
+  size_t dictBufferCapacity;
+  /* The (k, d, ...) combination this job evaluates */
+  ZDICT_cover_params_t parameters;
+} COVER_tryParameters_data_t;
+
+/**
+ * Tries a set of parameters and updates the COVER_best_t with the results.
+ * This function is thread safe if zstd is compiled with multithreaded support.
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
+ *
+ * Steps: build a dictionary into a private buffer, finalize it, compress every
+ * sample with it, and report dictionary size + sum of compressed sizes to
+ * COVER_best_finish(). On any failure the reported size stays ERROR(GENERIC).
+ * COVER_best_finish() is called exactly once on every path.
+ */
+static void COVER_tryParameters(void *opaque) {
+  /* Save parameters as local variables */
+  COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
+  const COVER_ctx_t *const ctx = data->ctx;
+  const ZDICT_cover_params_t parameters = data->parameters;
+  size_t dictBufferCapacity = data->dictBufferCapacity;
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Allocate space for hash table, dict, and freqs */
+  COVER_map_t activeDmers;
+  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
+  U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  /* The map is initialized before the malloc results are checked, so that
+   * _cleanup can call COVER_map_destroy() on every path */
+  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
+    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
+    goto _cleanup;
+  }
+  if (!dict || !freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+    goto _cleanup;
+  }
+  /* Copy the frequencies because we need to modify them */
+  memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
+  /* Build the dictionary */
+  {
+    const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
+                                              dictBufferCapacity, parameters);
+    /* From here on dictBufferCapacity holds the finalized dictionary size
+     * (or an error code) rather than the buffer capacity */
+    dictBufferCapacity = ZDICT_finalizeDictionary(
+        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples,
+        parameters.zParams);
+    if (ZDICT_isError(dictBufferCapacity)) {
+      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
+      goto _cleanup;
+    }
+  }
+  /* Check total compressed size */
+  {
+    /* Pointers */
+    ZSTD_CCtx *cctx;
+    ZSTD_CDict *cdict;
+    void *dst;
+    /* Local variables */
+    size_t dstCapacity;
+    size_t i;
+    /* Allocate dst with enough space to compress the maximum sized sample */
+    {
+      size_t maxSampleSize = 0;
+      for (i = 0; i < ctx->nbSamples; ++i) {
+        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
+      }
+      dstCapacity = ZSTD_compressBound(maxSampleSize);
+      dst = malloc(dstCapacity);
+    }
+    /* Create the cctx and cdict */
+    cctx = ZSTD_createCCtx();
+    cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                             parameters.zParams.compressionLevel);
+    /* All three resources are checked together; whichever ones were created
+     * are released at _compressCleanup */
+    if (!dst || !cctx || !cdict) {
+      goto _compressCleanup;
+    }
+    /* Compress each sample and sum their sizes (or error) */
+    /* The dictionary's own size counts towards the score */
+    totalCompressedSize = dictBufferCapacity;
+    for (i = 0; i < ctx->nbSamples; ++i) {
+      const size_t size = ZSTD_compress_usingCDict(
+          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
+          ctx->samplesSizes[i], cdict);
+      if (ZSTD_isError(size)) {
+        totalCompressedSize = ERROR(GENERIC);
+        goto _compressCleanup;
+      }
+      totalCompressedSize += size;
+    }
+  _compressCleanup:
+    ZSTD_freeCCtx(cctx);
+    ZSTD_freeCDict(cdict);
+    if (dst) {
+      free(dst);
+    }
+  }
+
+_cleanup:
+  /* Report the result (and mark this job finished) before releasing the
+   * buffers; `dict` is still needed here because the tracker copies it */
+  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
+                    dictBufferCapacity);
+  free(data);
+  COVER_map_destroy(&activeDmers);
+  if (dict) {
+    free(dict);
+  }
+  if (freqs) {
+    free(freqs);
+  }
+}
+
+/**
+ * Searches over (d, k) combinations, trains one dictionary per combination
+ * with COVER_tryParameters(), and keeps the one with the smallest score.
+ *
+ * dictBuffer, dictBufferCapacity : output buffer; capacity must be at least
+ *                                  ZDICT_DICTSIZE_MIN.
+ * samplesBuffer, samplesSizes, nbSamples : the training samples; nbSamples
+ *                                  must be non-zero.
+ * parameters : in/out. A zero field selects a search range:
+ *                d == 0     -> d in {6, 8}
+ *                k == 0     -> k from 50 to 2000
+ *                steps == 0 -> 40 steps across the k range
+ *              A non-zero d or k pins that value. If nbThreads > 1 the
+ *              candidates are evaluated on a thread pool. On success the
+ *              struct is overwritten with the best parameters found.
+ *
+ * Returns the size of the dictionary written to dictBuffer, or an error code
+ * (testable with ZSTD_isError()).
+ *
+ * All validation messages printed before g_displayLevel is assigned use the
+ * caller's notificationLevel, so they do not depend on whatever value the
+ * global held before this call.
+ */
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t *parameters) {
+  /* constants */
+  const unsigned nbThreads = parameters->nbThreads;
+  const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
+  const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+  const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+  const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+  const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
+  const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
+  const unsigned kIterations =
+      (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+  /* Local variables */
+  const int displayLevel = parameters->zParams.notificationLevel;
+  unsigned iteration = 1;
+  unsigned d;
+  unsigned k;
+  COVER_best_t best;
+  POOL_ctx *pool = NULL;
+
+  /* Checks */
+  if (kMinK < kMaxD || kMaxK < kMinK) {
+    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+    return ERROR(GENERIC);
+  }
+  if (nbSamples == 0) {
+    LOCALDISPLAYLEVEL(displayLevel, 1,
+                      "Cover must have at least one input file\n");
+    return ERROR(GENERIC);
+  }
+  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+    LOCALDISPLAYLEVEL(displayLevel, 1,
+                      "dictBufferCapacity must be at least %u\n",
+                      ZDICT_DICTSIZE_MIN);
+    return ERROR(dstSize_tooSmall);
+  }
+  if (nbThreads > 1) {
+    pool = POOL_create(nbThreads, 1);
+    if (!pool) {
+      return ERROR(memory_allocation);
+    }
+  }
+  /* Initialization */
+  COVER_best_init(&best);
+  /* Turn down global display level to clean up display at level 2 and below */
+  g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
+  /* Loop through d first because each new value needs a new context */
+  LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
+                    kIterations);
+  for (d = kMinD; d <= kMaxD; d += 2) {
+    /* Initialize the context for this value of d */
+    COVER_ctx_t ctx;
+    LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+      COVER_best_destroy(&best);
+      POOL_free(pool);
+      return ERROR(GENERIC);
+    }
+    /* Loop through k reusing the same context */
+    for (k = kMinK; k <= kMaxK; k += kStepSize) {
+      /* Prepare the arguments */
+      COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
+          sizeof(COVER_tryParameters_data_t));
+      LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
+      if (!data) {
+        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
+        /* COVER_best_destroy() waits for jobs that still read ctx, so it
+         * must run before the context is destroyed */
+        COVER_best_destroy(&best);
+        COVER_ctx_destroy(&ctx);
+        POOL_free(pool);
+        return ERROR(GENERIC);
+      }
+      data->ctx = &ctx;
+      data->best = &best;
+      data->dictBufferCapacity = dictBufferCapacity;
+      data->parameters = *parameters;
+      data->parameters.k = k;
+      data->parameters.d = d;
+      data->parameters.steps = kSteps;
+      data->parameters.zParams.notificationLevel = g_displayLevel;
+      /* Check the parameters */
+      if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
+        DISPLAYLEVEL(1, "Cover parameters incorrect\n");
+        free(data);
+        continue;
+      }
+      /* Call the function and pass ownership of data to it */
+      COVER_best_start(&best);
+      if (pool) {
+        POOL_add(pool, &COVER_tryParameters, data);
+      } else {
+        COVER_tryParameters(data);
+      }
+      /* Print status */
+      LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
+                         (U32)((iteration * 100) / kIterations));
+      ++iteration;
+    }
+    /* Every job for this d must finish before its context goes away */
+    COVER_best_wait(&best);
+    COVER_ctx_destroy(&ctx);
+  }
+  LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+  /* Fill the output buffer and parameters with output of the best parameters */
+  {
+    const size_t dictSize = best.dictSize;
+    /* Also taken when no candidate succeeded: the initial compressedSize of
+     * (size_t)-1 is itself reported as an error by ZSTD_isError() */
+    if (ZSTD_isError(best.compressedSize)) {
+      const size_t compressedSize = best.compressedSize;
+      COVER_best_destroy(&best);
+      POOL_free(pool);
+      return compressedSize;
+    }
+    *parameters = best.parameters;
+    memcpy(dictBuffer, best.dict, dictSize);
+    COVER_best_destroy(&best);
+    POOL_free(pool);
+    return dictSize;
+  }
+}
diff --git a/deps/SZ/zstd/dictBuilder/divsufsort.c b/deps/SZ/zstd/dictBuilder/divsufsort.c
new file mode 100644
index 0000000000000000000000000000000000000000..60cceb088321ce5b62c297a6e714bafff5ae8e75
--- /dev/null
+++ b/deps/SZ/zstd/dictBuilder/divsufsort.c
@@ -0,0 +1,1913 @@
+/*
+ * divsufsort.c for libdivsufsort-lite
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*- Compiler specifics -*/
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wshorten-64-to-32"
+#endif
+
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4244)
+#  pragma warning(disable : 4127)    /* C4127 : Condition expression is constant */
+#endif
+
+
+/*- Dependencies -*/
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "divsufsort.h"
+
+/*- Constants -*/
+/* INLINE is always redefined to __inline, whatever it was before */
+#if defined(INLINE)
+# undef INLINE
+#endif
+#if !defined(INLINE)
+# define INLINE __inline
+#endif
+/* ALPHABET_SIZE defaults to 256; a user value below 1 is discarded */
+#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1)
+# undef ALPHABET_SIZE
+#endif
+#if !defined(ALPHABET_SIZE)
+# define ALPHABET_SIZE (256)
+#endif
+/* One bucket per symbol, and one per ordered pair of symbols */
+#define BUCKET_A_SIZE (ALPHABET_SIZE)
+#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE)
+/* Group size at or below which ss_mintrosort hands over to insertion sort;
+ * default 8, user values below 1 are raised to 1 */
+#if defined(SS_INSERTIONSORT_THRESHOLD)
+# if SS_INSERTIONSORT_THRESHOLD < 1
+#  undef SS_INSERTIONSORT_THRESHOLD
+#  define SS_INSERTIONSORT_THRESHOLD (1)
+# endif
+#else
+# define SS_INSERTIONSORT_THRESHOLD (8)
+#endif
+/* SS_BLOCKSIZE: default 1024, user values are clamped to 0..32767.
+ * The value 0 is a special case that selects different code paths below. */
+#if defined(SS_BLOCKSIZE)
+# if SS_BLOCKSIZE < 0
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (0)
+# elif 32768 <= SS_BLOCKSIZE
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (32767)
+# endif
+#else
+# define SS_BLOCKSIZE (1024)
+#endif
+/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */
+#if SS_BLOCKSIZE == 0
+# define SS_MISORT_STACKSIZE (96)
+#elif SS_BLOCKSIZE <= 4096
+# define SS_MISORT_STACKSIZE (16)
+#else
+# define SS_MISORT_STACKSIZE (24)
+#endif
+/* Capacity of the explicit stack used by ss_swapmerge */
+#define SS_SMERGE_STACKSIZE (32)
+/* NOTE(review): the TR_* constants are not used in this part of the file;
+ * presumably they are the counterparts for the tr_* routines - confirm */
+#define TR_INSERTIONSORT_THRESHOLD (8)
+#define TR_STACKSIZE (64)
+
+/*- Macros -*/
+/* SWAP exchanges two lvalues through a variable named `t`, which the
+ * enclosing scope must declare with a compatible type. */
+#ifndef SWAP
+# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0)
+#endif /* SWAP */
+#ifndef MIN
+# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b))
+#endif /* MIN */
+#ifndef MAX
+# define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b))
+#endif /* MAX */
+/* The STACK_* macros operate on locals of the enclosing function: an array
+ * `stack` with fields a, b, c, d (and e for the *5 variants), its fill count
+ * `ssize`, and the macro STACK_SIZE giving its capacity.
+ * STACK_PUSH asserts that there is room, stores one entry and bumps ssize. */
+#define STACK_PUSH(_a, _b, _c, _d)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize++].d = (_d);\
+  } while(0)
+#define STACK_PUSH5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\
+  } while(0)
+/* STACK_POP loads the top entry into its arguments. When the stack is empty
+ * it executes `return;`, i.e. it ends the enclosing (void) function. */
+#define STACK_POP(_a, _b, _c, _d)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d;\
+  } while(0)
+#define STACK_POP5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\
+  } while(0)
+/* Bucket accessors; they expect arrays named bucket_A / bucket_B in scope.
+ * BUCKET_B and BUCKET_BSTAR share bucket_B but use transposed indices:
+ * BUCKET_B puts _c1 in the high part, BUCKET_BSTAR puts _c0 there. */
+#define BUCKET_A(_c0) bucket_A[(_c0)]
+#if ALPHABET_SIZE == 256
+#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)])
+#else
+#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)])
+#endif
+
+
+/*- Private Functions -*/
+
+/* lg_table[i] is the index of the highest set bit of the byte value i
+ * (floor(log2(i))) for i in 1..255, and -1 for i == 0. */
+static const int lg_table[256]= {
+ -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+};
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
+/* Integer base-2 logarithm of n, looked up in lg_table using the highest
+ * non-zero byte of n. How many bytes are inspected depends on SS_BLOCKSIZE:
+ * four when it is 0, one when it is below 256, otherwise two. */
+static INLINE
+int
+ss_ilg(int n) {
+#if SS_BLOCKSIZE == 0
+  if(n & 0xffff0000) {
+    if(n & 0xff000000) { return 24 + lg_table[(n >> 24) & 0xff]; }
+    return 16 + lg_table[(n >> 16) & 0xff];
+  }
+  if(n & 0x0000ff00) { return 8 + lg_table[(n >> 8) & 0xff]; }
+  return lg_table[n & 0xff];
+#elif SS_BLOCKSIZE < 256
+  return lg_table[n];
+#else
+  if(n & 0xff00) { return 8 + lg_table[(n >> 8) & 0xff]; }
+  return lg_table[n & 0xff];
+#endif
+}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+#if SS_BLOCKSIZE != 0
+
+/* Square-root lookup table used by ss_isqrt.
+ * NOTE(review): the entries match floor(sqrt(256 * i)) for the values spot
+ * checked (i = 1, 2, 3, 4, 255) - confirm for the full table. */
+static const int sqq_table[256] = {
+  0,  16,  22,  27,  32,  35,  39,  42,  45,  48,  50,  53,  55,  57,  59,  61,
+ 64,  65,  67,  69,  71,  73,  75,  76,  78,  80,  81,  83,  84,  86,  87,  89,
+ 90,  91,  93,  94,  96,  97,  98,  99, 101, 102, 103, 104, 106, 107, 108, 109,
+110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
+128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155,
+156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168,
+169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180,
+181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191,
+192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201,
+202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211,
+212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221,
+221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230,
+230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238,
+239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247,
+247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255
+};
+
+/* Integer square root of x, capped at SS_BLOCKSIZE.
+ * NOTE(review): assumes x >= 0; negative input is not handled here. */
+static INLINE
+int
+ss_isqrt(int x) {
+  int y, e;
+
+  /* Anything at or above SS_BLOCKSIZE^2 saturates */
+  if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; }
+  /* e = index of the highest set bit of x, via lg_table on its top byte */
+  e = (x & 0xffff0000) ?
+        ((x & 0xff000000) ?
+          24 + lg_table[(x >> 24) & 0xff] :
+          16 + lg_table[(x >> 16) & 0xff]) :
+        ((x & 0x0000ff00) ?
+           8 + lg_table[(x >>  8) & 0xff] :
+           0 + lg_table[(x >>  0) & 0xff]);
+
+  if(e >= 16) {
+    /* Table estimate, then one or two refinement steps y = (y + 1 + x/y) / 2 */
+    y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7);
+    if(e >= 24) { y = (y + 1 + x / y) >> 1; }
+    y = (y + 1 + x / y) >> 1;
+  } else if(e >= 8) {
+    /* Table estimate only; it may be one too large, corrected below */
+    y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1;
+  } else {
+    /* x < 256: direct table lookup */
+    return sqq_table[x] >> 4;
+  }
+
+  /* Step down by one if the estimate overshoots */
+  return (x < (y * y)) ? y - 1 : y;
+}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Compares two suffixes.
+ * The first runs from T + depth + p1[0] up to (not including) T + p1[1] + 2,
+ * the second likewise for p2. Returns a negative value, zero or a positive
+ * value; when one range is exhausted first, that side compares as smaller. */
+static INLINE
+int
+ss_compare(const unsigned char *T,
+           const int *p1, const int *p2,
+           int depth) {
+  const unsigned char *lhs = T + depth + *p1;
+  const unsigned char *rhs = T + depth + *p2;
+  const unsigned char *const lhsEnd = T + *(p1 + 1) + 2;
+  const unsigned char *const rhsEnd = T + *(p2 + 1) + 2;
+
+  /* Skip the common prefix */
+  while((lhs < lhsEnd) && (rhs < rhsEnd) && (*lhs == *rhs)) {
+    ++lhs;
+    ++rhs;
+  }
+
+  if(lhs >= lhsEnd) {
+    /* Left side ran out: equal only if the right side ran out too */
+    return (rhs < rhsEnd) ? -1 : 0;
+  }
+  if(rhs >= rhsEnd) { return 1; }
+  return *lhs - *rhs;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1)
+
+/* Insertionsort for small size groups */
+/* Sorts the indices in [first, last) by ss_compare() on PA + index at the
+ * given depth. An entry that compares equal to the one inserted before it is
+ * stored bit-complemented (negative), and such negative entries are moved
+ * together with the entry that precedes them. */
+static
+void
+ss_insertionsort(const unsigned char *T, const int *PA,
+                 int *first, int *last, int depth) {
+  int *i, *j;
+  int t;
+  int r;
+
+  /* Grow the sorted tail leftwards, one element per outer iteration */
+  for(i = last - 2; first <= i; --i) {
+    for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) {
+      /* Shift *j, plus any negative entries following it, one slot left */
+      do { *(j - 1) = *j; } while((++j < last) && (*j < 0));
+      if(last <= j) { break; }
+    }
+    /* t equals *j: flag *j by complementing it */
+    if(r == 0) { *j = ~*j; }
+    *(j - 1) = t;
+  }
+}
+
+#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
+/* Sift-down step of a max-heap stored in SA[0..size), keyed on
+ * Td[PA[SA[x]]]. Moves the element at index i down until neither child has
+ * a larger key. Both children SA[2i+1] and SA[2i+2] are read whenever
+ * 2i+1 < size, so SA[size] must be readable when 2i+2 == size. */
+static INLINE
+void
+ss_fixdown(const unsigned char *Td, const int *PA,
+           int *SA, int i, int size) {
+  int j, k;
+  int v;
+  int c, d, e;
+
+  /* v/c: the element being sifted and its key; k: the larger child */
+  for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
+    d = Td[PA[SA[k = j++]]];
+    if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; }
+    if(d <= c) { break; }
+  }
+  SA[i] = v;
+}
+
+/* Simple top-down heapsort. */
+/* Sorts SA[0..size) in ascending order of the key Td[PA[SA[x]]]. */
+static
+void
+ss_heapsort(const unsigned char *Td, const int *PA, int *SA, int size) {
+  int i, m;
+  int t;
+
+  /* Build the heap over an odd number of elements m: for an even size the
+   * last element is set aside after ordering it against its parent.
+   * NOTE(review): presumably so that every parent has two children while
+   * the heap is built - confirm. */
+  m = size;
+  if((size % 2) == 0) {
+    m--;
+    if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); }
+  }
+
+  /* Heapify, then for an even size move the maximum into the spare slot */
+  for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); }
+  if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); }
+  /* Repeatedly move the current maximum to the end of the shrinking heap */
+  for(i = m - 1; 0 < i; --i) {
+    t = SA[0], SA[0] = SA[i];
+    ss_fixdown(Td, PA, SA, 0, i);
+    SA[i] = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns the median of three elements. */
+/* Each of v1, v2, v3 points at an index x whose key is Td[PA[x]]; the
+ * result is whichever of the three pointers has the middle key. */
+static INLINE
+int *
+ss_median3(const unsigned char *Td, const int *PA,
+           int *v1, int *v2, int *v3) {
+  int *lo = v1;
+  int *hi = v2;
+
+  /* Order the first two so that key(lo) <= key(hi) */
+  if(Td[PA[*lo]] > Td[PA[*hi]]) { lo = v2; hi = v1; }
+  /* hi is the median unless v3 is smaller than it */
+  if(Td[PA[*hi]] <= Td[PA[*v3]]) { return hi; }
+  /* v3 is below hi: the median is the larger of lo and v3 */
+  return (Td[PA[*lo]] > Td[PA[*v3]]) ? lo : v3;
+}
+
+/* Returns the median of five elements. */
+/* Each argument points at an index x whose key is Td[PA[x]]. Only the local
+ * pointer variables are exchanged; the pointed-to array is not modified. */
+static INLINE
+int *
+ss_median5(const unsigned char *Td, const int *PA,
+           int *v1, int *v2, int *v3, int *v4, int *v5) {
+  int *t;
+  /* Order the pairs (v2, v3) and (v4, v5), then the pairs by their minimum */
+  if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); }
+  if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); }
+  if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); }
+  /* Bring v1 into the ordering */
+  if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); }
+  if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); }
+  /* The median is the smaller of v3 and v4 */
+  if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; }
+  return v3;
+}
+
+/* Returns the pivot element. */
+/* Selection depends on the number of elements in [first, last):
+ *   up to 32  : median of first, middle and last element
+ *   up to 512 : median of five evenly spread elements
+ *   otherwise : median of three medians-of-three (nine samples) */
+static INLINE
+int *
+ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) {
+  const int count = last - first;
+  int *const mid = first + count / 2;
+  int *const tail = last - 1;
+  int step;
+  int *lo, *md, *hi;
+
+  if(count <= 32) {
+    return ss_median3(Td, PA, first, mid, tail);
+  }
+  if(count <= 512) {
+    step = count >> 2;
+    return ss_median5(Td, PA, first, first + step, mid, tail - step, tail);
+  }
+
+  step = count >> 3;
+  lo = ss_median3(Td, PA, first, first + step, first + (step << 1));
+  md = ss_median3(Td, PA, mid - step, mid, mid + step);
+  hi = ss_median3(Td, PA, tail - (step << 1), tail - step, tail);
+  return ss_median3(Td, PA, lo, md, hi);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Binary partition for substrings. */
+/* Splits [first, last) in two: entries x with PA[x] + depth >= PA[x + 1] + 1
+ * are gathered at the front, the remaining entries at the back. Front entries
+ * are stored bit-complemented, except that the very first one is complemented
+ * back before returning. Returns a pointer to the start of the back part. */
+static INLINE
+int *
+ss_partition(const int *PA,
+                    int *first, int *last, int depth) {
+  int *a, *b;
+  int t;
+  for(a = first - 1, b = last;;) {
+    /* Advance a over front-type entries, complementing each one */
+    for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; }
+    /* Retreat b over back-type entries */
+    for(; (a < --b) && ((PA[*b] + depth) <  (PA[*b + 1] + 1));) { }
+    if(b <= a) { break; }
+    /* *a belongs at the back and *b at the front: exchange them, storing
+     * the front one complemented */
+    t = ~*b;
+    *b = *a;
+    *a = t;
+  }
+  /* Front part is non-empty: undo the complement on its first entry */
+  if(first < a) { *first = ~*first; }
+  return a;
+}
+
+/* Multikey introsort for medium size groups. */
+/* Sorts the indices in [first, last), comparing on the byte Td[PA[x]] with
+ * Td = T + depth and moving to depth + 1 for runs whose byte is equal.
+ * Recursion is replaced by an explicit stack of (first, last, depth, limit)
+ * entries; the function returns from inside STACK_POP once that stack is
+ * empty. `limit` is a depth budget initialised from ss_ilg(): when it hits
+ * zero the range is heapsorted on the current byte, and a negative limit
+ * marks a range already ordered on that byte. Ranges of at most
+ * SS_INSERTIONSORT_THRESHOLD elements go to ss_insertionsort(). */
+static
+void
+ss_mintrosort(const unsigned char *T, const int *PA,
+              int *first, int *last,
+              int depth) {
+#define STACK_SIZE SS_MISORT_STACKSIZE
+  struct { int *a, *b, c; int d; } stack[STACK_SIZE];
+  const unsigned char *Td;
+  int *a, *b, *c, *d, *e, *f;
+  int s, t;
+  int ssize;
+  int limit;
+  int v, x = 0;
+
+  for(ssize = 0, limit = ss_ilg(last - first);;) {
+
+    /* Small range: finish it off and fetch the next pending range */
+    if((last - first) <= SS_INSERTIONSORT_THRESHOLD) {
+#if 1 < SS_INSERTIONSORT_THRESHOLD
+      if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); }
+#endif
+      STACK_POP(first, last, depth, limit);
+      continue;
+    }
+
+    Td = T + depth;
+    /* Budget exhausted: order the whole range on the current byte */
+    if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); }
+    if(limit < 0) {
+      /* Range is ordered on the current byte: find the first run [first, a)
+       * of more than one element sharing the same byte */
+      for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) {
+        if((x = Td[PA[*a]]) != v) {
+          if(1 < (a - first)) { break; }
+          v = x;
+          first = a;
+        }
+      }
+      if(Td[PA[*first] - 1] < v) {
+        first = ss_partition(PA, first, a, depth);
+      }
+      /* Continue with the smaller side, push the larger one */
+      if((a - first) <= (last - a)) {
+        if(1 < (a - first)) {
+          STACK_PUSH(a, last, depth, -1);
+          last = a, depth += 1, limit = ss_ilg(a - first);
+        } else {
+          first = a, limit = -1;
+        }
+      } else {
+        if(1 < (last - a)) {
+          STACK_PUSH(first, a, depth + 1, ss_ilg(a - first));
+          first = a, limit = -1;
+        } else {
+          last = a, depth += 1, limit = ss_ilg(a - first);
+        }
+      }
+      continue;
+    }
+
+    /* choose pivot */
+    a = ss_pivot(Td, PA, first, last);
+    v = Td[PA[*a]];
+    SWAP(*first, *a);
+
+    /* partition */
+    /* Three-way split on the pivot byte v: entries equal to v are parked at
+     * both ends ([first, a) and (d, last)) while b and c close in */
+    for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { }
+    if(((a = b) < last) && (x < v)) {
+      for(; (++b < last) && ((x = Td[PA[*b]]) <= v);) {
+        if(x == v) { SWAP(*b, *a); ++a; }
+      }
+    }
+    for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { }
+    if((b < (d = c)) && (x > v)) {
+      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+        if(x == v) { SWAP(*c, *d); --d; }
+      }
+    }
+    for(; b < c;) {
+      SWAP(*b, *c);
+      for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) {
+        if(x == v) { SWAP(*b, *a); ++a; }
+      }
+      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+        if(x == v) { SWAP(*c, *d); --d; }
+      }
+    }
+
+    if(a <= d) {
+      c = b - 1;
+
+      /* Swap the parked equal entries from both ends into the middle */
+      if((s = a - first) > (t = b - a)) { s = t; }
+      for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+      if((s = d - c) > (t = last - d - 1)) { s = t; }
+      for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+
+      /* Now [first, a) < v, [a, c) == v, [c, last) > v */
+      a = first + (b - a), c = last - (d - c);
+      b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth);
+
+      /* Three sub-ranges remain: [first, a) and [c, last) at this depth and
+       * [b, c) at depth + 1. Push the two larger ones (largest first) and
+       * continue with the smallest. */
+      if((a - first) <= (last - c)) {
+        if((last - c) <= (c - b)) {
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          STACK_PUSH(c, last, depth, limit);
+          last = a;
+        } else if((a - first) <= (c - b)) {
+          STACK_PUSH(c, last, depth, limit);
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          last = a;
+        } else {
+          STACK_PUSH(c, last, depth, limit);
+          STACK_PUSH(first, a, depth, limit);
+          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+        }
+      } else {
+        if((a - first) <= (c - b)) {
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          STACK_PUSH(first, a, depth, limit);
+          first = c;
+        } else if((last - c) <= (c - b)) {
+          STACK_PUSH(first, a, depth, limit);
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          first = c;
+        } else {
+          STACK_PUSH(first, a, depth, limit);
+          STACK_PUSH(c, last, depth, limit);
+          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+        }
+      }
+    } else {
+      /* Every entry had the pivot byte: give the budget back and retry the
+       * same range one byte deeper */
+      limit += 1;
+      if(Td[PA[*first] - 1] < v) {
+        first = ss_partition(PA, first, last, depth);
+        limit = ss_ilg(last - first);
+      }
+      depth += 1;
+    }
+  }
+#undef STACK_SIZE
+}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if SS_BLOCKSIZE != 0
+
+/* Exchanges the n ints starting at a with the n ints starting at b,
+ * element by element from the front. Does nothing when n <= 0. */
+static INLINE
+void
+ss_blockswap(int *a, int *b, int n) {
+  int i;
+  for(i = 0; i < n; ++i) {
+    const int tmp = a[i];
+    a[i] = b[i];
+    b[i] = tmp;
+  }
+}
+
+/* In-place rotation of [first, last) around middle.
+ * NOTE(review): inferred from the name, the l == r block-swap case and the
+ * caller in ss_inplacemerge - the elements of [middle, last) end up in front
+ * of those of [first, middle); confirm. */
+static INLINE
+void
+ss_rotate(int *first, int *middle, int *last) {
+  int *a, *b, t;
+  int l, r;
+  /* l, r: lengths of the left and right parts still to be exchanged */
+  l = middle - first, r = last - middle;
+  for(; (0 < l) && (0 < r);) {
+    /* Equal lengths: a plain block swap finishes the job */
+    if(l == r) { ss_blockswap(first, middle, l); break; }
+    if(l < r) {
+      /* Left part is shorter: work backwards from the end of the range */
+      a = last - 1, b = middle - 1;
+      t = *a;
+      do {
+        *a-- = *b, *b-- = *a;
+        if(b < first) {
+          *a = t;
+          last = a;
+          if((r -= l + 1) <= l) { break; }
+          a -= 1, b = middle - 1;
+          t = *a;
+        }
+      } while(1);
+    } else {
+      /* Right part is shorter: work forwards from the start of the range */
+      a = first, b = middle;
+      t = *a;
+      do {
+        *a++ = *b, *b++ = *a;
+        if(last <= b) {
+          *a = t;
+          first = a + 1;
+          if((l -= r + 1) <= r) { break; }
+          a += 1, b = middle;
+          t = *a;
+        }
+      } while(1);
+    }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Merges the two adjacent runs [first, middle) and [middle, last) without
+ * an auxiliary buffer. Each round takes the last entry of the right run,
+ * binary-searches its position in the left run with ss_compare(), and
+ * rotates the tail of the left run behind it. Entries may be stored
+ * bit-complemented (negative); they are decoded with ~ before being used as
+ * an index into PA. */
+static
+void
+ss_inplacemerge(const unsigned char *T, const int *PA,
+                int *first, int *middle, int *last,
+                int depth) {
+  const int *p;
+  int *a, *b;
+  int len, half;
+  int q, r;
+  int x;
+
+  for(;;) {
+    /* p: suffix of the last entry; x records whether it was complemented */
+    if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); }
+    else                { x = 0; p = PA +  *(last - 1); }
+    /* Binary search in [first, middle); r keeps the last non-negative
+     * comparison result (0 means an equal entry was found) */
+    for(a = first, len = middle - first, half = len >> 1, r = -1;
+        0 < len;
+        len = half, half >>= 1) {
+      b = a + half;
+      q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth);
+      if(q < 0) {
+        a = b + 1;
+        half -= (len & 1) ^ 1;
+      } else {
+        r = q;
+      }
+    }
+    if(a < middle) {
+      /* Flag equality on *a, then move [a, middle) behind the right run */
+      if(r == 0) { *a = ~*a; }
+      ss_rotate(a, middle, last);
+      last -= middle - a;
+      middle = a;
+      if(first == middle) { break; }
+    }
+    /* Drop the entry just placed; if it was complemented, also step back
+     * over the negative entries before it */
+    --last;
+    if(x != 0) { while(*--last < 0) { } }
+    if(middle == last) { break; }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Merge-forward with internal buffer. */
+/* Merges the runs [first, middle) and [middle, last). The left run is first
+ * swapped into buf (which must hold at least middle - first ints); the merge
+ * then writes from `first` upwards. Values are exchanged rather than
+ * overwritten, so buf ends up holding whatever was displaced. Negative
+ * entries are copied along with the entry that precedes them, and when two
+ * heads compare equal the right-hand one is stored bit-complemented. */
+static
+void
+ss_mergeforward(const unsigned char *T, const int *PA,
+                int *first, int *middle, int *last,
+                int *buf, int depth) {
+  int *a, *b, *c, *bufend;
+  int t;
+  int r;
+
+  bufend = buf + (middle - first) - 1;
+  ss_blockswap(buf, first, middle - first);
+
+  /* a: output cursor, b: cursor in buf (left run), c: cursor in right run;
+   * t holds the value displaced from the first output slot */
+  for(t = *(a = first), b = buf, c = middle;;) {
+    r = ss_compare(T, PA + *b, PA + *c, depth);
+    if(r < 0) {
+      /* Left head is smaller: emit it and its negative followers */
+      do {
+        *a++ = *b;
+        if(bufend <= b) { *bufend = t; return; }
+        *b++ = *a;
+      } while(*b < 0);
+    } else if(r > 0) {
+      /* Right head is smaller: emit it and its negative followers */
+      do {
+        *a++ = *c, *c++ = *a;
+        if(last <= c) {
+          /* Right run exhausted: flush the rest of the buffer */
+          while(b < bufend) { *a++ = *b, *b++ = *a; }
+          *a = *b, *b = t;
+          return;
+        }
+      } while(*c < 0);
+    } else {
+      /* Equal heads: flag the right one, then emit left followed by right */
+      *c = ~*c;
+      do {
+        *a++ = *b;
+        if(bufend <= b) { *bufend = t; return; }
+        *b++ = *a;
+      } while(*b < 0);
+
+      do {
+        *a++ = *c, *c++ = *a;
+        if(last <= c) {
+          while(b < bufend) { *a++ = *b, *b++ = *a; }
+          *a = *b, *b = t;
+          return;
+        }
+      } while(*c < 0);
+    }
+  }
+}
+
+/* Merge-backward with internal buffer. */
+/* Mirror image of ss_mergeforward: the right run [middle, last) is swapped
+ * into buf (which must hold at least last - middle ints) and the merge
+ * writes from last - 1 downwards. p1 / p2 point at the PA entries of the
+ * current heads of the buffered run and the left run; bit 0 / bit 1 of x
+ * remember that the respective head was stored bit-complemented. */
+static
+void
+ss_mergebackward(const unsigned char *T, const int *PA,
+                 int *first, int *middle, int *last,
+                 int *buf, int depth) {
+  const int *p1, *p2;
+  int *a, *b, *c, *bufend;
+  int t;
+  int r;
+  int x;
+
+  bufend = buf + (last - middle) - 1;
+  ss_blockswap(buf, middle, last - middle);
+
+  /* Decode both heads, noting in x which of them were complemented */
+  x = 0;
+  if(*bufend < 0)       { p1 = PA + ~*bufend; x |= 1; }
+  else                  { p1 = PA +  *bufend; }
+  if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; }
+  else                  { p2 = PA +  *(middle - 1); }
+  /* a: output cursor, b: cursor in buf, c: cursor in the left run;
+   * t holds the value displaced from the first output slot */
+  for(t = *(a = last - 1), b = bufend, c = middle - 1;;) {
+    r = ss_compare(T, p1, p2, depth);
+    if(0 < r) {
+      /* Buffered head is larger: emit it (after its negative entries) */
+      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+      *a-- = *b;
+      if(b <= buf) { *buf = t; break; }
+      *b-- = *a;
+      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+      else       { p1 = PA +  *b; }
+    } else if(r < 0) {
+      /* Left-run head is larger: emit it (after its negative entries) */
+      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+      *a-- = *c, *c-- = *a;
+      if(c < first) {
+        /* Left run exhausted: flush the rest of the buffer */
+        while(buf < b) { *a-- = *b, *b-- = *a; }
+        *a = *b, *b = t;
+        break;
+      }
+      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+      else       { p2 = PA +  *c; }
+    } else {
+      /* Equal heads: emit the buffered one complemented, then the left one */
+      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+      *a-- = ~*b;
+      if(b <= buf) { *buf = t; break; }
+      *b-- = *a;
+      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+      *a-- = *c, *c-- = *a;
+      if(c < first) {
+        while(buf < b) { *a-- = *b, *b-- = *a; }
+        *a = *b, *b = t;
+        break;
+      }
+      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+      else       { p1 = PA +  *b; }
+      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+      else       { p2 = PA +  *c; }
+    }
+  }
+}
+
+/* D&C based merge.
+ *
+ * Merges [first, middle) and [middle, last) with a scratch buffer of
+ * `bufsize` ints. Work items are kept on a local explicit stack
+ * (STACK_PUSH / STACK_POP, defined elsewhere in this file) instead of
+ * recursing:
+ *   - if the right part fits in the buffer, ss_mergebackward is used;
+ *   - else if the left part fits, ss_mergeforward is used;
+ *   - otherwise a binary search finds a length m, the m entries on each
+ *     side of `middle` are exchanged with ss_blockswap, and the two halves
+ *     become separate work items.
+ *
+ * `check` / `next` are bit sets consumed by MERGE_CHECK(a, b, c):
+ *   bit 1: complement *a unconditionally;
+ *   bit 2: complement *a if it compares equal to the entry before it;
+ *   bit 4: complement *b if it compares equal to the entry before it.
+ *
+ * NOTE(review): the loop has no visible exit; presumably STACK_POP returns
+ * from the function when the stack is empty -- confirm at its definition.
+ * Note also that GETIDX and MERGE_CHECK are not #undef'd here. */
+static
+void
+ss_swapmerge(const unsigned char *T, const int *PA,
+             int *first, int *middle, int *last,
+             int *buf, int bufsize, int depth) {
+#define STACK_SIZE SS_SMERGE_STACKSIZE
+#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a)))
+#define MERGE_CHECK(a, b, c)\
+  do {\
+    if(((c) & 1) ||\
+       (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\
+      *(a) = ~*(a);\
+    }\
+    if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\
+      *(b) = ~*(b);\
+    }\
+  } while(0)
+  struct { int *a, *b, *c; int d; } stack[STACK_SIZE];
+  int *l, *r, *lm, *rm;
+  int m, len, half;
+  int ssize;
+  int check, next;
+
+  for(check = 0, ssize = 0;;) {
+    /* Right part fits in the buffer: merge backward. */
+    if((last - middle) <= bufsize) {
+      if((first < middle) && (middle < last)) {
+        ss_mergebackward(T, PA, first, middle, last, buf, depth);
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+      continue;
+    }
+
+    /* Left part fits in the buffer: merge forward. */
+    if((middle - first) <= bufsize) {
+      if(first < middle) {
+        ss_mergeforward(T, PA, first, middle, last, buf, depth);
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+      continue;
+    }
+
+    /* Binary search for the number of entries m to exchange around middle. */
+    for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1;
+        0 < len;
+        len = half, half >>= 1) {
+      if(ss_compare(T, PA + GETIDX(*(middle + m + half)),
+                       PA + GETIDX(*(middle - m - half - 1)), depth) < 0) {
+        m += half + 1;
+        half -= (len & 1) ^ 1;
+      }
+    }
+
+    if(0 < m) {
+      lm = middle - m, rm = middle + m;
+      ss_blockswap(lm, middle, m);
+      l = r = middle, next = 0;
+      if(rm < last) {
+        if(*rm < 0) {
+          *rm = ~*rm;
+          if(first < lm) { for(; *--l < 0;) { } next |= 4; }
+          next |= 1;
+        } else if(first < lm) {
+          for(; *r < 0; ++r) { }
+          next |= 2;
+        }
+      }
+
+      /* Continue with the smaller half; push the larger one. */
+      if((l - first) <= (last - r)) {
+        STACK_PUSH(r, rm, last, (next & 3) | (check & 4));
+        middle = lm, last = l, check = (check & 3) | (next & 4);
+      } else {
+        if((next & 2) && (r == middle)) { next ^= 6; }
+        STACK_PUSH(first, lm, l, (check & 3) | (next & 4));
+        first = r, middle = rm, check = (next & 3) | (check & 4);
+      }
+    } else {
+      /* Nothing to exchange: only the boundary entry may need marking. */
+      if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) {
+        *middle = ~*middle;
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+    }
+  }
+#undef STACK_SIZE
+}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Substring sort.
+ *
+ * Sorts the entries in [first, last) using ss_compare on (T, PA, depth).
+ *
+ * With SS_BLOCKSIZE == 0 the whole range is handed to ss_mintrosort.
+ * Otherwise the range is sorted in chunks of SS_BLOCKSIZE entries which are
+ * combined with ss_swapmerge, using `buf` / `bufsize` as scratch space. If
+ * the supplied buffer is smaller than SS_BLOCKSIZE, than the range and than
+ * ss_isqrt(last - first), the tail of the range itself (`limit` entries,
+ * capped at SS_BLOCKSIZE) is used as the buffer instead; that tail is then
+ * sorted separately and merged in with ss_inplacemerge.
+ *
+ * If lastsuffix is non-zero, the entry at `first` is excluded from the sort
+ * and inserted afterwards by a linear scan that compares the pair
+ * { PA[*first], n - 2 } against the sorted entries (negative entries are
+ * skipped over).
+ */
+static
+void
+sssort(const unsigned char *T, const int *PA,
+       int *first, int *last,
+       int *buf, int bufsize,
+       int depth, int n, int lastsuffix) {
+  int *a;
+#if SS_BLOCKSIZE != 0
+  int *b, *middle, *curbuf;
+  int j, k, curbufsize, limit;
+#endif
+  int i;
+
+  if(lastsuffix != 0) { ++first; }
+
+#if SS_BLOCKSIZE == 0
+  ss_mintrosort(T, PA, first, last, depth);
+#else
+  /* Decide whether to carve the merge buffer out of the tail of the range. */
+  if((bufsize < SS_BLOCKSIZE) &&
+      (bufsize < (last - first)) &&
+      (bufsize < (limit = ss_isqrt(last - first)))) {
+    if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; }
+    buf = middle = last - limit, bufsize = limit;
+  } else {
+    middle = last, limit = 0;
+  }
+  /* Sort each full block; the set bits of the block counter i decide how
+     many already-sorted neighbours are merged in after each block. */
+  for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) {
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+    ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#elif 1 < SS_BLOCKSIZE
+    ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#endif
+    /* Use the not-yet-sorted remainder as scratch if it is the larger one. */
+    curbufsize = last - (a + SS_BLOCKSIZE);
+    curbuf = a + SS_BLOCKSIZE;
+    if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; }
+    for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= k, k <<= 1, j >>= 1) {
+      ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth);
+    }
+  }
+  /* Sort the final (possibly short) block and merge the pending runs. */
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+  ss_mintrosort(T, PA, a, middle, depth);
+#elif 1 < SS_BLOCKSIZE
+  ss_insertionsort(T, PA, a, middle, depth);
+#endif
+  for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) {
+    if(i & 1) {
+      ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth);
+      a -= k;
+    }
+  }
+  /* The tail that served as the buffer still has to be sorted and merged. */
+  if(limit != 0) {
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+    ss_mintrosort(T, PA, middle, last, depth);
+#elif 1 < SS_BLOCKSIZE
+    ss_insertionsort(T, PA, middle, last, depth);
+#endif
+    ss_inplacemerge(T, PA, first, middle, last, depth);
+  }
+#endif
+
+  if(lastsuffix != 0) {
+    /* Insert last type B* suffix. */
+    int PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2;
+    for(a = first, i = *(first - 1);
+        (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth)));
+        ++a) {
+      *(a - 1) = *a;
+    }
+    *(a - 1) = i;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns an offset (0, 8, 16 or 24) plus the lg_table entry of the most
+   significant non-zero byte of n; if n is 0 the result is lg_table[0]. */
+static INLINE
+int
+tr_ilg(int n) {
+  if(n & 0xffff0000) {
+    /* Something is set in the upper half: pick byte 3 or byte 2. */
+    if(n & 0xff000000) {
+      return 24 + lg_table[(n >> 24) & 0xff];
+    }
+    return 16 + lg_table[(n >> 16) & 0xff];
+  }
+  /* Only the lower half can be set: pick byte 1 or byte 0. */
+  if(n & 0x0000ff00) {
+    return 8 + lg_table[(n >>  8) & 0xff];
+  }
+  return 0 + lg_table[(n >>  0) & 0xff];
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Simple insertionsort for small size groups.
+ *
+ * Sorts the entries of [first, last) in ascending order of ISAd[entry].
+ * While an element is shifted left past a larger one, any negative entries
+ * immediately before that larger one are shifted together with it. If the
+ * inserted element's key equals the key of the element it ends up behind
+ * (r == 0), that preceding element is complemented (~). */
+static
+void
+tr_insertionsort(const int *ISAd, int *first, int *last) {
+  int *a, *b;
+  int t, r;
+
+  for(a = first + 1; a < last; ++a) {
+    for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) {
+      /* Shift *b (and the negative entries before it) one slot right. */
+      do { *(b + 1) = *b; } while((first <= --b) && (*b < 0));
+      if(b < first) { break; }
+    }
+    if(r == 0) { *b = ~*b; }
+    *(b + 1) = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Sift-down step of a max-heap keyed by ISAd[SA[.]]: moves the entry at
+   index i down until neither child has a larger key, or until the first
+   child index (2 * i + 1) is no longer below size. Note that the second
+   child at 2 * i + 2 is read without a separate bounds check. */
+static INLINE
+void
+tr_fixdown(const int *ISAd, int *SA, int i, int size) {
+  int child, pick;
+  int saved;
+  int key, best, other;
+
+  saved = SA[i];
+  key = ISAd[saved];
+  while((child = 2 * i + 1) < size) {
+    /* Start with the first child, then see whether the second one wins. */
+    pick = child;
+    best = ISAd[SA[child]];
+    other = ISAd[SA[child + 1]];
+    if(best < other) {
+      pick = child + 1;
+      best = other;
+    }
+    if(best <= key) { break; }
+    /* Pull the larger child up and keep descending. */
+    SA[i] = SA[pick];
+    i = pick;
+  }
+  SA[i] = saved;
+}
+
+/* Simple top-down heapsort.
+   Sorts SA[0 .. size) in ascending order of ISAd[SA[.]]. The heap is built
+   over an odd number of entries (m); for an even size the last entry is
+   handled separately before and after heap construction. */
+static
+void
+tr_heapsort(const int *ISAd, int *SA, int size) {
+  int i, m;
+  int t;  /* scratch for the swaps below */
+  const int even = ((size % 2) == 0);
+
+  m = size;
+  if(even) {
+    --m;
+    if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); }
+  }
+
+  /* Build the heap over SA[0 .. m). */
+  i = m / 2 - 1;
+  while(0 <= i) {
+    tr_fixdown(ISAd, SA, i, m);
+    --i;
+  }
+
+  if(even) {
+    SWAP(SA[0], SA[m]);
+    tr_fixdown(ISAd, SA, 0, m);
+  }
+
+  /* Repeatedly move the root behind the shrinking heap. */
+  i = m - 1;
+  while(0 < i) {
+    t = SA[0];
+    SA[0] = SA[i];
+    tr_fixdown(ISAd, SA, 0, i);
+    SA[i] = t;
+    --i;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns whichever of v1, v2, v3 points at the entry whose ISAd key is the
+   median of the three. */
+static INLINE
+int *
+tr_median3(const int *ISAd, int *v1, int *v2, int *v3) {
+  int *t;
+
+  /* Order the first pair so that key(v1) <= key(v2). */
+  if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); }
+
+  /* v2 is the median unless v3 is smaller than it. */
+  if(ISAd[*v2] <= ISAd[*v3]) { return v2; }
+
+  return (ISAd[*v1] > ISAd[*v3]) ? v1 : v3;
+}
+
+/* Returns whichever of v1 .. v5 points at the entry whose ISAd key is the
+   median of the five, found with a fixed sequence of compare-and-swap steps
+   on the local pointer copies. */
+static INLINE
+int *
+tr_median5(const int *ISAd,
+           int *v1, int *v2, int *v3, int *v4, int *v5) {
+  int *t;
+
+  /* Order the pairs (v2, v3) and (v4, v5). */
+  if(ISAd[*v2] > ISAd[*v3]) {
+    SWAP(v2, v3);
+  }
+  if(ISAd[*v4] > ISAd[*v5]) {
+    SWAP(v4, v5);
+  }
+  /* Make (v2, v3) the pair with the smaller low element. */
+  if(ISAd[*v2] > ISAd[*v4]) {
+    SWAP(v2, v4);
+    SWAP(v3, v5);
+  }
+  /* Bring v1 into the picture. */
+  if(ISAd[*v1] > ISAd[*v3]) {
+    SWAP(v1, v3);
+  }
+  if(ISAd[*v1] > ISAd[*v4]) {
+    SWAP(v1, v4);
+    SWAP(v3, v5);
+  }
+  return (ISAd[*v3] > ISAd[*v4]) ? v4 : v3;
+}
+
+/* Returns a pointer to the pivot entry for [first, last).
+   Up to 32 entries: median of 3; up to 512: median of 5; larger ranges:
+   median of three medians-of-3 taken near the start, middle and end. */
+static INLINE
+int *
+tr_pivot(const int *ISAd, int *first, int *last) {
+  int *lo, *mid, *hi;
+  int span, step;
+
+  span = last - first;
+  mid = first + span / 2;
+
+  if(span <= 32) {
+    return tr_median3(ISAd, first, mid, last - 1);
+  }
+  if(span <= 512) {
+    step = span >> 2;
+    return tr_median5(ISAd, first, first + step, mid, last - 1 - step, last - 1);
+  }
+
+  step = span >> 3;
+  lo  = tr_median3(ISAd, first, first + step, first + (step << 1));
+  mid = tr_median3(ISAd, mid - step, mid, mid + step);
+  hi  = tr_median3(ISAd, last - 1 - (step << 1), last - 1 - step, last - 1);
+  return tr_median3(ISAd, lo, mid, hi);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Work budget consulted by tr_introsort through trbudget_check(). */
+typedef struct _trbudget_t trbudget_t;
+struct _trbudget_t {
+  int chance;  /* number of refills still available */
+  int remain;  /* units left before the next refill is needed */
+  int incval;  /* amount added to remain on each refill */
+  int count;   /* accumulated size of requests that were refused */
+};
+
+/* Initializes a budget: `chance` refills are available, and both the
+   current allowance (remain) and the refill amount (incval) are set to
+   `incval`. The refusal counter is cleared as well, so that
+   trbudget_check() never accumulates onto an indeterminate value even if a
+   caller forgets to reset it. */
+static INLINE
+void
+trbudget_init(trbudget_t *budget, int chance, int incval) {
+  budget->chance = chance;
+  budget->remain = budget->incval = incval;
+  budget->count = 0;
+}
+
+/* Asks the budget for `size` units of work.
+   Returns 1 if the request is granted, either from the current allowance or
+   by spending one refill; returns 0 if no refill is left, in which case
+   `size` is added to budget->count. */
+static INLINE
+int
+trbudget_check(trbudget_t *budget, int size) {
+  int granted = 1;
+
+  if(size <= budget->remain) {
+    /* Covered by what is left of the current allowance. */
+    budget->remain -= size;
+  } else if(budget->chance != 0) {
+    /* Spend one refill. */
+    budget->remain += budget->incval - size;
+    budget->chance -= 1;
+  } else {
+    /* Out of refills: record the refused amount. */
+    budget->count += size;
+    granted = 0;
+  }
+  return granted;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Three-way partition of [first, last) around the key value v, where the
+ * key of an entry e is ISAd[e]. Scanning starts at `middle`; entries in
+ * [first, middle) are not examined.
+ * NOTE(review): presumably callers guarantee those entries have key v --
+ * confirm against the call sites.
+ *
+ * During the scan, entries equal to v are parked at both ends of the range
+ * and are swapped into the centre afterwards. On return *pa and *pb are set
+ * to the updated first / last; when at least one non-equal entry was found
+ * (a <= d), these bound the block of entries whose key equals v. */
+static INLINE
+void
+tr_partition(const int *ISAd,
+             int *first, int *middle, int *last,
+             int **pa, int **pb, int v) {
+  int *a, *b, *c, *d, *e, *f;
+  int t, s;
+  int x = 0;
+
+  /* Skip the leading run of keys equal to v. */
+  for(b = middle - 1; (++b < last) && ((x = ISAd[*b]) == v);) { }
+  if(((a = b) < last) && (x < v)) {
+    for(; (++b < last) && ((x = ISAd[*b]) <= v);) {
+      if(x == v) { SWAP(*b, *a); ++a; }
+    }
+  }
+  /* Same from the right-hand end. */
+  for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { }
+  if((b < (d = c)) && (x > v)) {
+    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+      if(x == v) { SWAP(*c, *d); --d; }
+    }
+  }
+  /* Main loop: exchange misplaced entries, parking equal keys at a / d. */
+  for(; b < c;) {
+    SWAP(*b, *c);
+    for(; (++b < c) && ((x = ISAd[*b]) <= v);) {
+      if(x == v) { SWAP(*b, *a); ++a; }
+    }
+    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+      if(x == v) { SWAP(*c, *d); --d; }
+    }
+  }
+
+  if(a <= d) {
+    /* Move the parked equal-key entries from both ends into the centre. */
+    c = b - 1;
+    if((s = a - first) > (t = b - a)) { s = t; }
+    for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+    if((s = d - c) > (t = last - d - 1)) { s = t; }
+    for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+    first += (b - a), last -= (d - c);
+  }
+  *pa = first, *pb = last;
+}
+
+/* Fills the middle partition [a, b) of [first, last) from both sides.
+ *
+ * v = b - SA - 1 is the rank value identifying entries of the middle
+ * partition. A left-to-right scan starting at `first` takes each entry *c,
+ * forms s = *c - depth, and if s >= 0 and ISA[s] == v, appends s after the
+ * current write position d (starting at a - 1) and sets ISA[s] to its new
+ * index. A right-to-left scan starting at last - 1 does the same, writing
+ * downwards from b, until the two write positions meet. */
+static
+void
+tr_copy(int *ISA, const int *SA,
+        int *first, int *a, int *b, int *last,
+        int depth) {
+  /* sort suffixes of middle partition
+     by using sorted order of suffixes of left and right partition. */
+  int *c, *d, *e;
+  int s, v;
+
+  v = b - SA - 1;
+  /* The scan also walks over entries it has just appended (c <= d). */
+  for(c = first, d = a - 1; c <= d; ++c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *++d = s;
+      ISA[s] = d - SA;
+    }
+  }
+  /* e marks the first slot not filled by the forward scan. */
+  for(c = last - 1, e = d + 1, d = b; e < d; --c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *--d = s;
+      ISA[s] = d - SA;
+    }
+  }
+}
+
+/* Variant of tr_copy that assigns shared ranks instead of unique indices.
+ *
+ * Entries are placed into the middle partition [a, b) by the same two scans
+ * as in tr_copy, but ISA[s] is set to `newrank`, which only changes when
+ * ISA[s + depth] differs from that of the previously placed entry. Between
+ * the two scans, the entries in [first, d] are revisited from right to left
+ * and each run of equal ISA values is rewritten to the index of the run's
+ * right-most entry. */
+static
+void
+tr_partialcopy(int *ISA, const int *SA,
+               int *first, int *a, int *b, int *last,
+               int depth) {
+  int *c, *d, *e;
+  int s, v;
+  int rank, lastrank, newrank = -1;
+
+  v = b - SA - 1;
+  lastrank = -1;
+  /* Forward scan: fill upwards from a. */
+  for(c = first, d = a - 1; c <= d; ++c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *++d = s;
+      rank = ISA[s + depth];
+      if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
+      ISA[s] = newrank;
+    }
+  }
+
+  /* Re-rank [first, d]: each run of equal ranks gets its last index. */
+  lastrank = -1;
+  for(e = d; first <= e; --e) {
+    rank = ISA[*e];
+    if(lastrank != rank) { lastrank = rank; newrank = e - SA; }
+    if(newrank != rank) { ISA[*e] = newrank; }
+  }
+
+  /* Backward scan: fill downwards from b until the two fills meet. */
+  lastrank = -1;
+  for(c = last - 1, e = d + 1, d = b; e < d; --c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *--d = s;
+      rank = ISA[s + depth];
+      if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
+      ISA[s] = newrank;
+    }
+  }
+}
+
+/* Sorts the group [first, last) of SA by the keys ISAd[entry], updating ISA
+ * with the resulting ranks. Implemented iteratively with a local explicit
+ * stack (STACK_PUSH5 / STACK_POP5, defined elsewhere in this file).
+ *
+ * `limit` is seeded with tr_ilg(last - first) and decremented on every
+ * partitioning round; when it reaches 0 the range is sorted with
+ * tr_heapsort. Ranges of at most TR_INSERTIONSORT_THRESHOLD entries go to
+ * tr_insertionsort. Negative values of limit select special phases:
+ *   -1     tandem repeat partition,
+ *   -2     tandem repeat copy (tr_copy / tr_partialcopy),
+ *   other  post-processing of a sorted partition (the code uses -3).
+ *
+ * incr = ISAd - ISA is the step by which ISAd advances when a group of equal
+ * keys is sorted at the next level. trbudget_check() decides whether such a
+ * descent is allowed; if it is refused, the stack entry referenced by trlink
+ * (when trlink >= 0) gets its d field set to -1, which makes the later
+ * tandem-repeat copy use tr_partialcopy instead of tr_copy.
+ *
+ * NOTE(review): the loop has no visible exit; presumably STACK_POP5 returns
+ * from the function when the stack is empty -- confirm at its definition. */
+static
+void
+tr_introsort(int *ISA, const int *ISAd,
+             int *SA, int *first, int *last,
+             trbudget_t *budget) {
+#define STACK_SIZE TR_STACKSIZE
+  struct { const int *a; int *b, *c; int d, e; }stack[STACK_SIZE];
+  int *a, *b, *c;
+  int t;
+  int v, x = 0;
+  int incr = ISAd - ISA;
+  int limit, next;
+  int ssize, trlink = -1;
+
+  for(ssize = 0, limit = tr_ilg(last - first);;) {
+
+    if(limit < 0) {
+      if(limit == -1) {
+        /* tandem repeat partition */
+        tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1);
+
+        /* update ranks */
+        if(a < last) {
+          for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+        }
+        if(b < last) {
+          for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; }
+        }
+
+        /* push */
+        if(1 < (b - a)) {
+          STACK_PUSH5(NULL, a, b, 0, 0);
+          STACK_PUSH5(ISAd - incr, first, last, -2, trlink);
+          trlink = ssize - 2;
+        }
+        if((a - first) <= (last - b)) {
+          if(1 < (a - first)) {
+            STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink);
+            last = a, limit = tr_ilg(a - first);
+          } else if(1 < (last - b)) {
+            first = b, limit = tr_ilg(last - b);
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        } else {
+          if(1 < (last - b)) {
+            STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink);
+            first = b, limit = tr_ilg(last - b);
+          } else if(1 < (a - first)) {
+            last = a, limit = tr_ilg(a - first);
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        }
+      } else if(limit == -2) {
+        /* tandem repeat copy */
+        a = stack[--ssize].b, b = stack[ssize].c;
+        if(stack[ssize].d == 0) {
+          tr_copy(ISA, SA, first, a, b, last, ISAd - ISA);
+        } else {
+          if(0 <= trlink) { stack[trlink].d = -1; }
+          tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA);
+        }
+        STACK_POP5(ISAd, first, last, limit, trlink);
+      } else {
+        /* sorted partition */
+        if(0 <= *first) {
+          /* Leading non-negative entries get their own index as rank. */
+          a = first;
+          do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a));
+          first = a;
+        }
+        if(first < last) {
+          /* Decode the run of negative entries; it forms one group. */
+          a = first; do { *a = ~*a; } while(*++a < 0);
+          next = (ISA[*a] != ISAd[*a]) ? tr_ilg(a - first + 1) : -1;
+          if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } }
+
+          /* push */
+          if(trbudget_check(budget, a - first)) {
+            if((a - first) <= (last - a)) {
+              STACK_PUSH5(ISAd, a, last, -3, trlink);
+              ISAd += incr, last = a, limit = next;
+            } else {
+              if(1 < (last - a)) {
+                STACK_PUSH5(ISAd + incr, first, a, next, trlink);
+                first = a, limit = -3;
+              } else {
+                ISAd += incr, last = a, limit = next;
+              }
+            }
+          } else {
+            if(0 <= trlink) { stack[trlink].d = -1; }
+            if(1 < (last - a)) {
+              first = a, limit = -3;
+            } else {
+              STACK_POP5(ISAd, first, last, limit, trlink);
+            }
+          }
+        } else {
+          STACK_POP5(ISAd, first, last, limit, trlink);
+        }
+      }
+      continue;
+    }
+
+    if((last - first) <= TR_INSERTIONSORT_THRESHOLD) {
+      tr_insertionsort(ISAd, first, last);
+      limit = -3;
+      continue;
+    }
+
+    if(limit-- == 0) {
+      tr_heapsort(ISAd, first, last - first);
+      /* Complement every entry whose key equals that of its successor. */
+      for(a = last - 1; first < a; a = b) {
+        for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; }
+      }
+      limit = -3;
+      continue;
+    }
+
+    /* choose pivot */
+    a = tr_pivot(ISAd, first, last);
+    SWAP(*first, *a);
+    v = ISAd[*first];
+
+    /* partition */
+    tr_partition(ISAd, first, first + 1, last, &a, &b, v);
+    if((last - first) != (b - a)) {
+      next = (ISA[*a] != v) ? tr_ilg(b - a) : -1;
+
+      /* update ranks */
+      for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+      if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } }
+
+      /* push */
+      if((1 < (b - a)) && (trbudget_check(budget, b - a))) {
+        if((a - first) <= (last - b)) {
+          if((last - b) <= (b - a)) {
+            if(1 < (a - first)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              last = a;
+            } else if(1 < (last - b)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              first = b;
+            } else {
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else if((a - first) <= (b - a)) {
+            if(1 < (a - first)) {
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              last = a;
+            } else {
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else {
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            ISAd += incr, first = a, last = b, limit = next;
+          }
+        } else {
+          if((a - first) <= (b - a)) {
+            if(1 < (last - b)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              first = b;
+            } else if(1 < (a - first)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              last = a;
+            } else {
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else if((last - b) <= (b - a)) {
+            if(1 < (last - b)) {
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              first = b;
+            } else {
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else {
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            ISAd += incr, first = a, last = b, limit = next;
+          }
+        }
+      } else {
+        /* Middle group is trivial or the budget refused the descent. */
+        if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; }
+        if((a - first) <= (last - b)) {
+          if(1 < (a - first)) {
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            last = a;
+          } else if(1 < (last - b)) {
+            first = b;
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        } else {
+          if(1 < (last - b)) {
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            first = b;
+          } else if(1 < (a - first)) {
+            last = a;
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        }
+      }
+    } else {
+      /* Every key in the range equals the pivot: descend or give up. */
+      if(trbudget_check(budget, last - first)) {
+        limit = tr_ilg(last - first), ISAd += incr;
+      } else {
+        if(0 <= trlink) { stack[trlink].d = -1; }
+        STACK_POP5(ISAd, first, last, limit, trlink);
+      }
+    }
+  }
+#undef STACK_SIZE
+}
+
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Tandem repeat sort.
+ *
+ * Repeatedly walks SA[0 .. n) and sorts every group that is not yet fully
+ * sorted with tr_introsort, using ISAd = ISA + offset as the key array. The
+ * offset starts at `depth` and doubles after each pass
+ * (ISAd += ISAd - ISA).
+ *
+ * A negative entry t in SA encodes a stretch of -t already-sorted entries
+ * that can be skipped; `skip` accumulates the (negative) length of the
+ * current sorted stretch so that it can be written back at its start. The
+ * outer loop ends when a pass leaves no unsorted entries (budget.count
+ * stayed 0 for every group) or when *SA <= -n.
+ */
+static
+void
+trsort(int *ISA, int *SA, int n, int depth) {
+  int *ISAd;
+  int *first, *last;
+  trbudget_t budget;
+  int t, skip, unsorted;
+
+  trbudget_init(&budget, tr_ilg(n) * 2 / 3, n);
+/*  trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */
+  for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) {
+    first = SA;
+    skip = 0;
+    unsorted = 0;
+    do {
+      if((t = *first) < 0) { first -= t; skip += t; }
+      else {
+        /* Flush the pending sorted stretch before handling this group. */
+        if(skip != 0) { *(first + skip) = skip; skip = 0; }
+        last = SA + ISA[t] + 1;
+        if(1 < (last - first)) {
+          budget.count = 0;
+          tr_introsort(ISA, ISAd, SA, first, last, &budget);
+          if(budget.count != 0) { unsorted += budget.count; }
+          else { skip = first - last; }
+        } else if((last - first) == 1) {
+          skip = -1;
+        }
+        first = last;
+      }
+    } while(first < (SA + n));
+    if(skip != 0) { *(first + skip) = skip; }
+    if(unsorted == 0) { break; }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Sorts suffixes of type B*.
+ *
+ * T is the input text of length n and SA the work array. bucket_A and
+ * bucket_B are accessed through the BUCKET_A / BUCKET_B / BUCKET_BSTAR
+ * macros (defined elsewhere in this file) and are zeroed here first.
+ * Returns m, the number of type B* suffixes found.
+ *
+ * openMP only has an effect when LIBBSC_OPENMP is defined; in that case a
+ * non-zero value selects the parallel sssort driver.
+ *
+ * NOTE(review): T[n - 1] is read unconditionally, so n >= 1 is assumed --
+ * confirm that callers filter out smaller inputs. */
+static
+int
+sort_typeBstar(const unsigned char *T, int *SA,
+               int *bucket_A, int *bucket_B,
+               int n, int openMP) {
+  int *PAb, *ISAb, *buf;
+#ifdef LIBBSC_OPENMP
+  int *curbuf;
+  int l;
+#endif
+  int i, j, k, t, m, bufsize;
+  int c0, c1;
+#ifdef LIBBSC_OPENMP
+  int d0, d1;
+#endif
+  (void)openMP;
+
+  /* Initialize bucket arrays. */
+  for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
+  for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }
+
+  /* Count the number of occurrences of the first one or two characters of each
+     type A, B and B* suffix. Moreover, store the beginning position of all
+     type B* suffixes into the array SA. */
+  for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
+    /* type A suffix. */
+    do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
+    if(0 <= i) {
+      /* type B* suffix. */
+      ++BUCKET_BSTAR(c0, c1);
+      SA[--m] = i;
+      /* type B suffix. */
+      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
+        ++BUCKET_B(c0, c1);
+      }
+    }
+  }
+  m = n - m;
+/*
+note:
+  A type B* suffix is lexicographically smaller than a type B suffix that
+  begins with the same first two characters.
+*/
+
+  /* Calculate the index of start/end point of each bucket. */
+  for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
+    t = i + BUCKET_A(c0);
+    BUCKET_A(c0) = i + j; /* start point */
+    i = t + BUCKET_B(c0, c0);
+    for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
+      j += BUCKET_BSTAR(c0, c1);
+      BUCKET_BSTAR(c0, c1) = j; /* end point */
+      i += BUCKET_B(c0, c1);
+    }
+  }
+
+  if(0 < m) {
+    /* Sort the type B* suffixes by their first two characters. */
+    /* PAb: the B* positions stored above (last m slots of SA).
+       ISAb: rank array placed at SA + m. */
+    PAb = SA + n - m; ISAb = SA + m;
+    for(i = m - 2; 0 <= i; --i) {
+      t = PAb[i], c0 = T[t], c1 = T[t + 1];
+      SA[--BUCKET_BSTAR(c0, c1)] = i;
+    }
+    t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
+    SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
+
+    /* Sort the type B* substrings using sssort. */
+#ifdef LIBBSC_OPENMP
+    if (openMP)
+    {
+        buf = SA + m;
+        c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;
+#pragma omp parallel default(shared) private(bufsize, curbuf, k, l, d0, d1)
+        {
+          /* Each thread gets an equal slice of the free space as buffer. */
+          bufsize = (n - (2 * m)) / omp_get_num_threads();
+          curbuf = buf + omp_get_thread_num() * bufsize;
+          k = 0;
+          for(;;) {
+            /* Pick the next bucket [k, l) with more than one entry. */
+            #pragma omp critical(sssort_lock)
+            {
+              if(0 < (l = j)) {
+                d0 = c0, d1 = c1;
+                do {
+                  k = BUCKET_BSTAR(d0, d1);
+                  if(--d1 <= d0) {
+                    d1 = ALPHABET_SIZE - 1;
+                    if(--d0 < 0) { break; }
+                  }
+                } while(((l - k) <= 1) && (0 < (l = k)));
+                c0 = d0, c1 = d1, j = k;
+              }
+            }
+            if(l == 0) { break; }
+            sssort(T, PAb, SA + k, SA + l,
+                   curbuf, bufsize, 2, n, *(SA + k) == (m - 1));
+          }
+        }
+    }
+    else
+    {
+        buf = SA + m, bufsize = n - (2 * m);
+        for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
+          for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
+            i = BUCKET_BSTAR(c0, c1);
+            if(1 < (j - i)) {
+              sssort(T, PAb, SA + i, SA + j,
+                     buf, bufsize, 2, n, *(SA + i) == (m - 1));
+            }
+          }
+        }
+    }
+#else
+    buf = SA + m, bufsize = n - (2 * m);
+    for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
+      for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
+        i = BUCKET_BSTAR(c0, c1);
+        if(1 < (j - i)) {
+          sssort(T, PAb, SA + i, SA + j,
+                 buf, bufsize, 2, n, *(SA + i) == (m - 1));
+        }
+      }
+    }
+#endif
+
+    /* Compute ranks of type B* substrings. */
+    for(i = m - 1; 0 <= i; --i) {
+      if(0 <= SA[i]) {
+        /* Run of non-negative entries: each gets its own index as rank and
+           the run length is stored (negated) at the start of the run. */
+        j = i;
+        do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
+        SA[i + 1] = i - j;
+        if(i <= 0) { break; }
+      }
+      /* Run of negative entries: decode them and give all the rank j. */
+      j = i;
+      do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
+      ISAb[SA[i]] = j;
+    }
+
+    /* Construct the inverse suffix array of type B* suffixes using trsort. */
+    trsort(ISAb, SA, m, 1);
+
+    /* Set the sorted order of type B* suffixes. */
+    for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
+      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
+      if(0 <= i) {
+        t = i;
+        for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
+        SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
+      }
+    }
+
+    /* Calculate the index of start/end point of each bucket. */
+    BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
+    for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
+      i = BUCKET_A(c0 + 1) - 1;
+      for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
+        t = i - BUCKET_B(c0, c1);
+        BUCKET_B(c0, c1) = i; /* end point */
+
+        /* Move all type B* suffixes to the correct position. */
+        for(i = t, j = BUCKET_BSTAR(c0, c1);
+            j <= k;
+            --i, --k) { SA[i] = SA[k]; }
+      }
+      BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
+      BUCKET_B(c0, c0) = i; /* end point */
+    }
+  }
+
+  return m;
+}
+
+/* Constructs the suffix array by using the sorted order of type B* suffixes.
+ *
+ * T is the text of length n, SA holds the sorted type B* suffixes on entry
+ * and m is their count (the first pass is skipped when m == 0). bucket_A /
+ * bucket_B are accessed through the BUCKET_* macros and are modified.
+ *
+ * Entries stored complemented (~s) are ones that the current pass must not
+ * expand; they are decoded again when the scan reaches them.
+ *
+ * NOTE(review): T[n - 2] is read unconditionally, so n >= 2 is assumed --
+ * confirm that callers filter out smaller inputs. */
+static
+void
+construct_SA(const unsigned char *T, int *SA,
+             int *bucket_A, int *bucket_B,
+             int n, int m) {
+  int *i, *j, *k;
+  int s;
+  int c0, c1, c2;
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+          *j = ~s;
+          /* Induce the preceding suffix s - 1 into bucket (c0, c1). */
+          c0 = T[--s];
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            /* Switching bucket: save the old write position, load the new. */
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else {
+          assert(((s == 0) && (T[s] == c1)) || (s < 0));
+          *j = ~s;
+        }
+      }
+    }
+  }
+
+  /* Construct the suffix array by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+      /* Induce the preceding suffix s - 1 into bucket A(c0). */
+      c0 = T[--s];
+      if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      *k++ = s;
+    } else {
+      assert(s < 0);
+      *i = ~s;
+    }
+  }
+}
+
+/* Constructs the burrows-wheeler transformed string directly
+   by using the sorted order of type B* suffixes.
+
+   Has the same two-pass structure as construct_SA, but each processed slot
+   is overwritten with the preceding character T[s - 1] instead of the
+   suffix position (stored complemented while a pass is still running).
+   Returns orig - SA, where orig is the last slot in the left-to-right scan
+   found to hold 0 (SA itself if there is none).
+   NOTE(review): presumably this is the primary index of the transform --
+   confirm against the caller.
+   NOTE(review): T[n - 2] is read unconditionally, so n >= 2 is assumed. */
+static
+int
+construct_BWT(const unsigned char *T, int *SA,
+              int *bucket_A, int *bucket_B,
+              int n, int m) {
+  int *i, *j, *k, *orig;
+  int s;
+  int c0, c1, c2;
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+          c0 = T[--s];
+          /* Replace the position by the (complemented) preceding char. */
+          *j = ~((int)c0);
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else if(s != 0) {
+          *j = ~s;
+#ifndef NDEBUG
+        } else {
+          assert(T[s] == c1);
+#endif
+        }
+      }
+    }
+  }
+
+  /* Construct the BWTed string by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  *k++ = (T[n - 2] < c2) ? ~((int)T[n - 2]) : (n - 1);
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+      c0 = T[--s];
+      *i = c0;
+      /* If s - 1 will not be expanded, store its output char right away. */
+      if((0 < s) && (T[s - 1] < c0)) { s = ~((int)T[s - 1]); }
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      *k++ = s;
+    } else if(s != 0) {
+      *i = ~s;
+    } else {
+      orig = i;
+    }
+  }
+
+  return orig - SA;
+}
+
+/* Constructs the burrows-wheeler transformed string directly
+   by using the sorted order of type B* suffixes.
+
+   Same as construct_BWT, but additionally records sampled positions:
+   `mod` is derived from n / 8 by setting every bit below its highest set
+   bit and then shifting right by one, so mod + 1 is a power of two. For
+   every suffix position s with (s & mod) == 0 that is encountered, the
+   slot index where it was seen or written is stored in
+   indexes[s / (mod + 1) - 1]. *num_indexes receives (n - 1) / (mod + 1),
+   truncated to unsigned char.
+   Returns orig - SA as construct_BWT does.
+   NOTE(review): indexes presumably must hold at least *num_indexes ints --
+   confirm against the caller.
+   NOTE(review): T[n - 2] is read unconditionally, so n >= 2 is assumed. */
+static
+int
+construct_BWT_indexes(const unsigned char *T, int *SA,
+                      int *bucket_A, int *bucket_B,
+                      int n, int m,
+                      unsigned char * num_indexes, int * indexes) {
+  int *i, *j, *k, *orig;
+  int s;
+  int c0, c1, c2;
+
+  int mod = n / 8;
+  {
+      /* Smear the highest set bit downwards, then drop one bit. */
+      mod |= mod >> 1;  mod |= mod >> 2;
+      mod |= mod >> 4;  mod |= mod >> 8;
+      mod |= mod >> 16; mod >>= 1;
+
+      *num_indexes = (unsigned char)((n - 1) / (mod + 1));
+  }
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+
+          /* Sampled position: remember the slot it occupies. */
+          if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = j - SA;
+
+          c0 = T[--s];
+          *j = ~((int)c0);
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else if(s != 0) {
+          *j = ~s;
+#ifndef NDEBUG
+        } else {
+          assert(T[s] == c1);
+#endif
+        }
+      }
+    }
+  }
+
+  /* Construct the BWTed string by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  if (T[n - 2] < c2) {
+    /* Position n - 1 is replaced by its character here, so sample it now. */
+    if (((n - 1) & mod) == 0) indexes[(n - 1) / (mod + 1) - 1] = k - SA;
+    *k++ = ~((int)T[n - 2]);
+  }
+  else {
+    *k++ = n - 1;
+  }
+
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+
+      if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = i - SA;
+
+      c0 = T[--s];
+      *i = c0;
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      if((0 < s) && (T[s - 1] < c0)) {
+          /* s is stored as a character, so sample its slot before that. */
+          if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = k - SA;
+          *k++ = ~((int)T[s - 1]);
+      } else
+        *k++ = s;
+    } else if(s != 0) {
+      *i = ~s;
+    } else {
+      orig = i;
+    }
+  }
+
+  return orig - SA;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/*- Function -*/
+
+int
+divsufsort(const unsigned char *T, int *SA, int n, int openMP) {
+  int *bucket_A = NULL, *bucket_B = NULL;
+  int status = -2; /* stays -2 unless both bucket tables are allocated */
+
+  /* Reject invalid arguments (-1), then settle lengths 0, 1 and 2 directly; for n == 2 equal bytes put suffix 1 first. */
+  if(T == NULL || SA == NULL || n < 0) { return -1; }
+  if(n == 0) { return 0; }
+  if(n == 1) { SA[0] = 0; return 0; }
+  if(n == 2) {
+    int const ascending = (T[0] < T[1]);
+    SA[ascending ^ 1] = 0;
+    SA[ascending] = 1;
+    return 0;
+  }
+
+  /* General case: sort the type B* suffixes, then hand the result to construct_SA(). */
+  bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
+  bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
+  if(bucket_A != NULL && bucket_B != NULL) {
+    int const m = sort_typeBstar(T, SA, bucket_A, bucket_B, n, openMP);
+    construct_SA(T, SA, bucket_A, bucket_B, n, m);
+    status = 0;
+  }
+  free(bucket_B);
+  free(bucket_A);
+  return status;
+}
+
+int
+divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP) {
+  int *work;               /* caller's A, or a private array of n + 1 ints when A is NULL */
+  int *bucket_A, *bucket_B;
+  int primary = -2;        /* returned unchanged if any allocation fails */
+
+  /* Validate the arguments (-1 on failure) and handle inputs of length 0 or 1 without sorting. */
+  if(T == NULL || U == NULL || n < 0) { return -1; }
+  if(n <= 1) {
+    if(n == 1) { U[0] = T[0]; }
+    return n;
+  }
+
+  work = (A != NULL) ? A : (int *)malloc((size_t)(n + 1) * sizeof(int));
+  bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
+  bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
+
+  if(work != NULL && bucket_A != NULL && bucket_B != NULL) {
+    int pos;
+    int const m = sort_typeBstar(T, work, bucket_A, bucket_B, n, openMP);
+
+    /* Secondary indexes are only produced when both output pointers are supplied. */
+    int const want_indexes = (num_indexes != NULL) && (indexes != NULL);
+    primary = want_indexes
+            ? construct_BWT_indexes(T, work, bucket_A, bucket_B, n, m, num_indexes, indexes)
+            : construct_BWT(T, work, bucket_A, bucket_B, n, m);
+
+    /* Emit U: last input byte first, then work[] shifted by one up to the primary index, unshifted after it. */
+    U[0] = T[n - 1];
+    for(pos = 0; pos < primary; ++pos) { U[pos + 1] = (unsigned char)work[pos]; }
+    for(pos += 1; pos < n; ++pos) { U[pos] = (unsigned char)work[pos]; }
+    primary += 1;
+  }
+
+  free(bucket_B);
+  free(bucket_A);
+  if(A == NULL) { free(work); }
+  return primary;
+}
diff --git a/deps/SZ/zstd/dictBuilder/divsufsort.h b/deps/SZ/zstd/dictBuilder/divsufsort.h
new file mode 100644
index 0000000000000000000000000000000000000000..5440994af15c1bf054207f0dca90dd285bb95aa6
--- /dev/null
+++ b/deps/SZ/zstd/dictBuilder/divsufsort.h
@@ -0,0 +1,67 @@
+/*
+ * divsufsort.h for libdivsufsort-lite
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DIVSUFSORT_H
+#define _DIVSUFSORT_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+
+/*- Prototypes -*/
+
+/**
+ * Constructs the suffix array of a given string.
+ * @param T [0..n-1] The input string.
+ * @param SA [0..n-1] The output array of suffixes.
+ * @param n The length of the given string.
+ * @param openMP enables OpenMP optimization.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int
+divsufsort(const unsigned char *T, int *SA, int n, int openMP);
+
+/**
+ * Constructs the Burrows-Wheeler transformed string of a given string.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string. (can be T)
+ * @param A [0..n-1] The temporary array. (can be NULL; a temporary array is then allocated and freed internally)
+ * @param n The length of the given string.
+ * @param num_indexes The length of secondary indexes array. (can be NULL)
+ * @param indexes The secondary indexes array. (can be NULL)
+ * @param openMP enables OpenMP optimization.
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+int
+divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* _DIVSUFSORT_H */
diff --git a/deps/SZ/zstd/dictBuilder/zdict.c b/deps/SZ/zstd/dictBuilder/zdict.c
new file mode 100644
index 0000000000000000000000000000000000000000..2024e0bbbd498b728ceeeb087f761f4f42fed3e7
--- /dev/null
+++ b/deps/SZ/zstd/dictBuilder/zdict.c
@@ -0,0 +1,1108 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/*-**************************************
+*  Tuning parameters
+****************************************/
+#define MINRATIO 4   /* minimum nb of apparition to be selected in dictionary */
+#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
+#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
+
+
+/*-**************************************
+*  Compiler Options
+****************************************/
+/* Unix Large Files support (>4GB) */
+#define _FILE_OFFSET_BITS 64
+#if (defined(__sun__) && (!defined(__LP64__)))   /* Sun Solaris 32-bits requires specific definitions */
+#  define _LARGEFILE_SOURCE
+#elif ! defined(__LP64__)                        /* No point defining Large file for 64 bit */
+#  define _LARGEFILE64_SOURCE
+#endif
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdlib.h>        /* malloc, free */
+#include <string.h>        /* memset */
+#include <stdio.h>         /* fprintf, fopen, ftello64 */
+#include <time.h>          /* clock */
+
+#include "mem.h"           /* read */
+#include "fse.h"           /* FSE_normalizeCount, FSE_writeNCount */
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"           /* HUF_buildCTable, HUF_writeCTable */
+#include "zstd_internal.h" /* includes zstd.h */
+#include "xxhash.h"        /* XXH64 */
+#include "divsufsort.h"
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#  define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define DICTLISTSIZE_DEFAULT 10000
+
+#define NOISELENGTH 32
+
+static const int g_compressionLevel_default = 3;
+static const U32 g_selectivity_default = 9;
+
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
+#define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); }    /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+
+static clock_t ZDICT_clockSpan(clock_t nPrevious) { clock_t const now = clock(); return now - nPrevious; }   /* clock() ticks elapsed since nPrevious */
+
+static void ZDICT_printHex(const void* ptr, size_t length)
+{   /* Sends `length` bytes starting at `ptr` to DISPLAY one character at a time, substituting '.' for bytes outside 32..126. */
+    const BYTE* cursor = (const BYTE*)ptr;
+    size_t remaining = length;
+    while (remaining-- > 0) {
+        BYTE const raw = *cursor++;
+        BYTE const shown = (raw >= 32 && raw <= 126) ? raw : (BYTE)'.';   /* non-printable char */
+        DISPLAY("%c", shown);
+    }
+}
+
+
+/*-********************************************************
+*  Helper functions
+**********************************************************/
+unsigned ZDICT_isError(size_t errorCode) { unsigned const failed = ERR_isError(errorCode); return failed; }   /* forwards to ERR_isError() */
+
+const char* ZDICT_getErrorName(size_t errorCode) { const char* const label = ERR_getErrorName(errorCode); return label; }   /* forwards to ERR_getErrorName() */
+
+unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
+{   /* Returns the little-endian 32-bit value at byte offset 4, or 0 when the buffer is under 8 bytes or does not start with ZSTD_MAGIC_DICTIONARY. */
+    const char* const raw = (const char*)dictBuffer;
+    int const hasHeader = (dictSize >= 8) && (MEM_readLE32(raw) == ZSTD_MAGIC_DICTIONARY);
+    return hasHeader ? MEM_readLE32(raw + 4) : 0;
+}
+
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+static unsigned ZDICT_NbCommonBytes (size_t val)
+{   /* Intrinsic paths return (trailing zero bits of val) / 8 on little-endian targets and (leading zero bits) / 8 on big-endian ones; the table/shift fallbacks presumably match. The caller in this file only passes a non-zero val. */
+    if (MEM_isLittleEndian()) {
+        if (MEM_64bits()) {
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanForward64( &r, (U64)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_ctzll((U64)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
+            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];   /* val & -val keeps only the lowest set bit; the multiply-and-shift turns it into a table index */
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r=0;
+            _BitScanForward( &r, (U32)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_ctz((U32)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
+            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];   /* same lowest-set-bit lookup, 32-bit variant */
+#       endif
+        }
+    } else {  /* Big Endian CPU */
+        if (MEM_64bits()) {
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanReverse64( &r, val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_clzll(val) >> 3);
+#       else
+            unsigned r;
+            const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
+            if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }   /* upper half all zero: count 4 bytes and keep examining the lower half */
+            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r = 0;
+            _BitScanReverse( &r, (unsigned long)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_clz((U32)val) >> 3);
+#       else
+            unsigned r;
+            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+    }   }
+}
+
+
+/*! ZDICT_count() :
+    Returns how many leading bytes of pIn and pMatch are identical, comparing one size_t word at a time.
+    Note : there is no length bound; the scan only stops at a mismatch, hence end of buffer must be followed by a noisy guard band.
+*/
+static size_t ZDICT_count(const void* pIn, const void* pMatch)
+{
+    const char* left = (const char*)pIn;
+    const char* right = (const char*)pMatch;
+    size_t delta = MEM_readST(right) ^ MEM_readST(left);
+    /* Step over whole words for as long as they are identical. */
+    while (delta == 0) {
+        left += sizeof(size_t);
+        right += sizeof(size_t);
+        delta = MEM_readST(right) ^ MEM_readST(left);
+    }
+    /* The first differing word tells how many of its bytes still matched. */
+    return (size_t)((left + ZDICT_NbCommonBytes(delta)) - (const char*)pIn);
+}
+
+
+typedef struct {
+    U32 pos;       /* byte offset into the sample buffer; in slot 0 of a table it holds the element count instead */
+    U32 length;    /* number of bytes covered by the entry */
+    U32 savings;   /* estimated gain; tables are kept ordered by decreasing savings */
+} dictItem;
+
+static void ZDICT_initDictItem(dictItem* d)
+{   /* Resets *d to pos 1, length 0 and the largest U32 savings value. */
+    dictItem fresh;
+    fresh.pos = 1; fresh.length = 0; fresh.savings = (U32)(-1);
+    *d = fresh;
+}
+
+
+#define LLIMIT 64          /* heuristic determined experimentally */
+#define MINMATCHLENGTH 7   /* heuristic determined experimentally */
+static dictItem ZDICT_analyzePos(
+                       BYTE* doneMarks,
+                       const int* suffix, U32 start,
+                       const void* buffer, U32 minRatio, U32 notificationLevel)
+{   /* Examines the suffix-array neighbours of suffix[start], marks visited positions in doneMarks, and returns the selected segment; every early exit returns an all-zero dictItem. */
+    U32 lengthList[LLIMIT] = {0};
+    U32 cumulLength[LLIMIT] = {0};
+    U32 savings[LLIMIT] = {0};
+    const BYTE* b = (const BYTE*)buffer;
+    size_t maxLength = LLIMIT;
+    size_t pos = suffix[start];
+    U32 end = start;
+    dictItem solution;
+
+    /* init */
+    memset(&solution, 0, sizeof(solution));
+    doneMarks[pos] = 1;
+
+    /* trivial repetition cases */
+    if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
+       ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
+       ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
+        /* skip and mark segment */
+        U16 const pattern16 = MEM_read16(b+pos+4);
+        U32 u, patternEnd = 6;
+        while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;   /* no explicit bound: stops at the first 16-bit mismatch */
+        if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
+        for (u=1; u<patternEnd; u++)
+            doneMarks[pos+u] = 1;
+        return solution;
+    }
+
+    /* look forward */
+    {   size_t length;
+        do {
+            end++;
+            length = ZDICT_count(b + pos, b + suffix[end]);
+        } while (length >= MINMATCHLENGTH);   /* end finishes on the first following suffix sharing fewer than MINMATCHLENGTH bytes */
+    }
+
+    /* look backward */
+    {   size_t length;
+        do {
+            length = ZDICT_count(b + pos, b + *(suffix+start-1));   /* NOTE(review): no start > 0 check; reads suffix[-1] when start is 0, which the caller in this file fills with a sentinel */
+            if (length >=MINMATCHLENGTH) start--;
+        } while(length >= MINMATCHLENGTH);
+    }
+
+    /* exit if not found a minimum nb of repetitions */
+    if (end-start < minRatio) {
+        U32 idx;
+        for(idx=start; idx<end; idx++)
+            doneMarks[suffix[idx]] = 1;
+        return solution;
+    }
+
+    {   int i;
+        U32 searchLength;
+        U32 refinedStart = start;
+        U32 refinedEnd = end;
+
+        DISPLAYLEVEL(4, "\n");
+        DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u  ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
+        DISPLAYLEVEL(4, "\n");
+
+        for (searchLength = MINMATCHLENGTH ; ; searchLength++) {   /* at each extra offset keep the longest run of suffixes agreeing on the byte there */
+            BYTE currentChar = 0;
+            U32 currentCount = 0;
+            U32 currentID = refinedStart;
+            U32 id;
+            U32 selectedCount = 0;
+            U32 selectedID = currentID;
+            for (id =refinedStart; id < refinedEnd; id++) {
+                if (b[suffix[id] + searchLength] != currentChar) {
+                    if (currentCount > selectedCount) {
+                        selectedCount = currentCount;
+                        selectedID = currentID;
+                    }
+                    currentID = id;
+                    currentChar = b[ suffix[id] + searchLength];
+                    currentCount = 0;
+                }
+                currentCount ++;
+            }
+            if (currentCount > selectedCount) {  /* for last */
+                selectedCount = currentCount;
+                selectedID = currentID;
+            }
+
+            if (selectedCount < minRatio)
+                break;
+            refinedStart = selectedID;
+            refinedEnd = refinedStart + selectedCount;
+        }
+
+        /* evaluate gain based on new ref */
+        start = refinedStart;
+        pos = suffix[refinedStart];
+        end = start;
+        memset(lengthList, 0, sizeof(lengthList));
+
+        /* look forward */
+        {   size_t length;
+            do {
+                end++;
+                length = ZDICT_count(b + pos, b + suffix[end]);
+                if (length >= LLIMIT) length = LLIMIT-1;   /* clamp so the histogram index stays inside lengthList[] */
+                lengthList[length]++;
+            } while (length >=MINMATCHLENGTH);
+        }
+
+        /* look backward */
+        {   size_t length = MINMATCHLENGTH;
+            while ((length >= MINMATCHLENGTH) & (start > 0)) {
+                length = ZDICT_count(b + pos, b + suffix[start - 1]);
+                if (length >= LLIMIT) length = LLIMIT - 1;
+                lengthList[length]++;
+                if (length >= MINMATCHLENGTH) start--;
+            }
+        }
+
+        /* largest useful length */
+        memset(cumulLength, 0, sizeof(cumulLength));
+        cumulLength[maxLength-1] = lengthList[maxLength-1];
+        for (i=(int)(maxLength-2); i>=0; i--)
+            cumulLength[i] = cumulLength[i+1] + lengthList[i];   /* cumulLength[i] = number of neighbours matching at least i bytes */
+
+        for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
+        maxLength = i;
+
+        /* reduce maxLength in case of final into repetitive data */
+        {   U32 l = (U32)maxLength;
+            BYTE const c = b[pos + maxLength-1];
+            while (b[pos+l-2]==c) l--;
+            maxLength = l;
+        }
+        if (maxLength < MINMATCHLENGTH) return solution;   /* skip : no long-enough solution */
+
+        /* calculate savings */
+        savings[5] = 0;   /* the loop below starts from savings[MINMATCHLENGTH-1], which is still 0 from the initializer */
+        for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
+            savings[i] = savings[i-1] + (lengthList[i] * (i-3));
+
+        DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f)  \n",
+                     (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
+
+        solution.pos = (U32)pos;
+        solution.length = (U32)maxLength;
+        solution.savings = savings[maxLength];
+
+        /* mark positions done */
+        {   U32 id;
+            for (id=start; id<end; id++) {
+                U32 p, pEnd, length;
+                U32 const testedPos = suffix[id];
+                if (testedPos == pos)
+                    length = solution.length;
+                else {
+                    length = (U32)ZDICT_count(b+pos, b+testedPos);
+                    if (length > solution.length) length = solution.length;   /* never mark more than the selected segment length */
+                }
+                pEnd = (U32)(testedPos + length);
+                for (p=testedPos; p<pEnd; p++)
+                    doneMarks[p] = 1;
+    }   }   }
+
+    return solution;
+}
+
+
+static int isIncluded(const void* in, const void* container, size_t length)
+{
+    /* Returns 1 when the first `length` bytes of `in` equal those of `container`, else 0. */
+    const char* lhs = (const char*) in;
+    const char* rhs = (const char*) container;
+    size_t remaining = length;
+
+    /* Bytes are compared in order and the scan stops at the first difference (end of buffer is a noisy guard band). */
+    while (remaining != 0 && *lhs == *rhs) { lhs++; rhs++; remaining--; }
+
+    return remaining == 0;
+}
+
+/*! ZDICT_tryMerge() :
+    check if dictItem `elt` overlaps an entry of `table` (entry `eltNbToSkip` is ignored); if so, merge it into that entry
+    @return : id of destination elt, 0 if not merged
+*/
+static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
+{
+    const U32 tableSize = table->pos;   /* slot 0 stores the element count in .pos */
+    const U32 eltEnd = elt.pos + elt.length;
+    const char* const buf = (const char*) buffer;
+
+    /* tail overlap */
+    U32 u; for (u=1; u<tableSize; u++) {
+        if (u==eltNbToSkip) continue;
+        if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) {  /* overlap, existing > new */
+            /* append */
+            U32 const addedLength = table[u].pos - elt.pos;   /* bytes of elt that precede the existing entry */
+            table[u].length += addedLength;
+            table[u].pos = elt.pos;
+            table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
+            table[u].savings += elt.length / 8;    /* rough approx bonus */
+            elt = table[u];
+            /* sort : improve rank */
+            while ((u>1) && (table[u-1].savings < elt.savings))
+            table[u] = table[u-1], u--;
+            table[u] = elt;
+            return u;
+    }   }
+
+    /* front overlap */
+    for (u=1; u<tableSize; u++) {
+        if (u==eltNbToSkip) continue;
+
+        if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) {  /* overlap, existing < new */
+            /* append */
+            int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);   /* bytes of elt extending past the existing entry; <= 0 when none do */
+            table[u].savings += elt.length / 8;    /* rough approx bonus */
+            if (addedLength > 0) {   /* otherwise, elt fully included into existing */
+                table[u].length += addedLength;
+                table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
+            }
+            /* sort : improve rank */
+            elt = table[u];
+            while ((u>1) && (table[u-1].savings < elt.savings))
+                table[u] = table[u-1], u--;
+            table[u] = elt;
+            return u;
+        }
+
+        if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {   /* 8-byte comparison first, full comparison only if it matches */
+            if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {   /* existing entry has the same bytes as elt shifted by one */
+                size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
+                table[u].pos = elt.pos;
+                table[u].savings += (U32)(elt.savings * addedLength / elt.length);
+                table[u].length = MIN(elt.length, table[u].length + 1);
+                return u;   /* note: this path returns without re-sorting the entry */
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static void ZDICT_removeDictItem(dictItem* table, U32 id)
+{
+    /* convention : table[0].pos holds the element count; slot 0 itself is never removed */
+    U32 const count = table[0].pos;
+    U32 slot = id;
+    if (id == 0) return;   /* protection, should never happen */
+    /* close the gap by shifting every later element down one slot */
+    for ( ; slot < count-1; slot++) table[slot] = table[slot+1];
+    table[0].pos--;
+}
+
+
+static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
+{
+    U32 slot, count;
+    U32 merged = ZDICT_tryMerge(table, elt, 0, buffer);
+
+    if (merged != 0) {
+        /* elt was absorbed by table[merged]; keep re-merging that entry with the others,
+           dropping the old slot after each success, until no further merge happens */
+        for (;;) {
+            U32 const again = ZDICT_tryMerge(table, table[merged], merged, buffer);
+            if (again == 0) break;
+            ZDICT_removeDictItem(table, merged);
+            merged = again;
+        }
+        return;
+    }
+
+    /* no merge : insert elt, keeping entries ordered by decreasing savings */
+    count = table->pos;
+    if (count >= maxSize) count = maxSize-1;   /* table full : the last entry gets overwritten */
+
+    /* shift lower-saving entries up by one; the loop has no explicit lower bound on slot */
+    for (slot = count-1; table[slot].savings < elt.savings; slot--)
+        table[slot+1] = table[slot];
+    table[slot+1] = elt;
+    table->pos = count+1;
+}
+
+
+static U32 ZDICT_dictSize(const dictItem* dictList)
+{   /* Sums the .length of entries 1 .. dictList[0].pos-1 (slot 0 carries the element count in .pos). */
+    U32 const count = dictList[0].pos;
+    U32 total = 0, idx = 1;
+    while (idx < count) total += dictList[idx++].length;
+    return total;
+}
+
+
+static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
+                            const void* const buffer, size_t bufferSize,   /* buffer must end with noisy guard band */
+                            const size_t* fileSizes, unsigned nbFiles,
+                            U32 minRatio, U32 notificationLevel)
+{   /* Suffix-sorts `buffer`, then walks it and inserts each segment found by ZDICT_analyzePos() into dictList. Returns 0, or ERROR(memory_allocation) / ERROR(GENERIC) on failure; all work arrays are freed on every path. */
+    int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
+    int* const suffix = suffix0+1;   /* suffix[-1] is the extra leading slot suffix0[0] */
+    U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
+    BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks));   /* +16 for overflow security */
+    U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
+    size_t result = 0;
+    clock_t displayClock = 0;
+    clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
+
+#   define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
+            if (ZDICT_clockSpan(displayClock) > refreshRate)  \
+            { displayClock = clock(); DISPLAY(__VA_ARGS__); \
+            if (notificationLevel>=4) fflush(stderr); } }
+
+    /* init */
+    DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+    if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
+        result = ERROR(memory_allocation);
+        goto _cleanup;
+    }
+    if (minRatio < MINRATIO) minRatio = MINRATIO;   /* enforce the MINRATIO floor */
+    memset(doneMarks, 0, bufferSize+16);
+
+    /* limit sample set size (divsufsort limitation)*/
+    if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
+    while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];   /* drop trailing samples until under the cap; the arrays above keep their original size */
+
+    /* sort */
+    DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
+    {   int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
+        if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
+    }
+    suffix[bufferSize] = (int)bufferSize;   /* leads into noise */
+    suffix0[0] = (int)bufferSize;           /* leads into noise */
+    /* build reverse suffix sort */
+    {   size_t pos;
+        for (pos=0; pos < bufferSize; pos++)
+            reverseSuffix[suffix[pos]] = (U32)pos;   /* maps a buffer position to its index in the suffix array */
+        /* note filePos tracks borders between samples.
+           It's not used at this stage, but planned to become useful in a later update */
+        filePos[0] = 0;
+        for (pos=1; pos<nbFiles; pos++)
+            filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
+    }
+
+    DISPLAYLEVEL(2, "finding patterns ... \n");
+    DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
+
+    {   U32 cursor; for (cursor=0; cursor < bufferSize; ) {
+            dictItem solution;
+            if (doneMarks[cursor]) { cursor++; continue; }   /* position already marked by an earlier analysis */
+            solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
+            if (solution.length==0) { cursor++; continue; }   /* nothing selected at this position */
+            ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
+            cursor += solution.length;
+            DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
+    }   }
+
+_cleanup:
+    free(suffix0);
+    free(reverseSuffix);
+    free(doneMarks);
+    free(filePos);
+    return result;
+}
+
+
+static void ZDICT_fillNoise(void* buffer, size_t length)
+{
+    /* Fills buffer[0..length-1] with a fixed pseudo-random byte sequence :
+       an unsigned accumulator seeded with prime1 is multiplied by prime2 before each byte is taken from (state >> 21). */
+    unsigned const prime1 = 2654435761U;
+    unsigned const prime2 = 2246822519U;
+    unsigned char* out = (unsigned char*)buffer;
+    size_t remaining = length;
+    unsigned state = prime1;
+    while (remaining-- > 0) { state *= prime2; *out++ = (unsigned char)(state >> 21); }
+}
+
+
+typedef struct
+{
+    ZSTD_CCtx* ref;    /* contains reference to dictionary; copied into zc before each block is compressed */
+    ZSTD_CCtx* zc;     /* working context */
+    void* workPlace;   /* destination buffer for ZSTD_compressBlock(); must be ZSTD_BLOCKSIZE_MAX allocated */
+} EStats_ress_t;
+
+#define MAXREPOFFSET 1024
+
+static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
+                              U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
+                              const void* src, size_t srcSize,
+                              U32 notificationLevel)
+{   /* Compresses at most one block of `src` with a copy of esr.ref and adds the literals, sequence codes and first two offsets to the caller's counters; returns without counting if the copy or compression fails. */
+    size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
+    size_t cSize;
+
+    if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
+    {   size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
+        if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
+    }
+    cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
+    if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
+
+    if (cSize) {  /* if == 0; block is not compressible */
+        const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
+
+        /* literals stats */
+        {   const BYTE* bytePtr;
+            for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
+                countLit[*bytePtr]++;
+        }
+
+        /* seqStats */
+        {   U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+            ZSTD_seqToCodes(seqStorePtr);   /* presumably fills the ofCode / mlCode / llCode arrays read below; confirm against zstd internals */
+
+            {   const BYTE* codePtr = seqStorePtr->ofCode;
+                U32 u;
+                for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
+            }
+
+            {   const BYTE* codePtr = seqStorePtr->mlCode;
+                U32 u;
+                for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
+            }
+
+            {   const BYTE* codePtr = seqStorePtr->llCode;
+                U32 u;
+                for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
+            }
+
+            if (nbSeq >= 2) { /* rep offsets */
+                const seqDef* const seq = seqStorePtr->sequencesStart;
+                U32 offset1 = seq[0].offset - 3;
+                U32 offset2 = seq[1].offset - 3;
+                if (offset1 >= MAXREPOFFSET) offset1 = 0;   /* out-of-range values (including unsigned wrap-around) are counted in bucket 0 */
+                if (offset2 >= MAXREPOFFSET) offset2 = 0;
+                repOffsets[offset1] += 3;   /* the first sequence's offset is weighted 3, the second 1 */
+                repOffsets[offset2] += 1;
+    }   }   }
+}
+
+static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
+{   /* Adds up the first nbFiles entries of fileSizes; yields 0 when nbFiles is 0. */
+    size_t sum = 0;
+    unsigned remaining = nbFiles;
+    while (remaining > 0) sum += fileSizes[--remaining];   /* summation order does not affect a wrapping unsigned total */
+    return sum;
+}
+
+typedef struct { U32 offset; U32 count; } offsetCount_t;   /* one (offset value, count) pair; tables of these are kept sorted by decreasing count */
+
+static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count)
+{
+    /* Stores (val, count) in the last slot, then swaps it toward index 0 past every entry with a strictly smaller count. */
+    U32 slot = ZSTD_REP_NUM;
+    table[slot].offset = val;
+    table[slot].count = count;
+    while ((slot > 0) && (table[slot-1].count < table[slot].count)) {
+        offsetCount_t const moved = table[slot];
+        table[slot] = table[slot-1];
+        table[slot-1] = moved;
+        slot--;
+    }
+}
+
+/* ZDICT_flatLit() :
+ * overwrites all 256 entries of `countLit` with a nearly flat histogram : 4 for symbol 0, 1 for symbols 253 and 254, 2 elsewhere.
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
+ */
+static void ZDICT_flatLit(U32* countLit)
+{
+    int sym = 255;
+    countLit[0] = 4;
+    /* symbols 1..255, highest first */
+    for ( ; sym >= 1; sym--)
+        countLit[sym] = ((sym == 253) || (sym == 254)) ? 1 : 2;
+}
+
+#define OFFCODE_MAX 30  /* only applicable to first block */
+static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,   /* out : serialized entropy tables + 3 repeat offsets ; at most maxDstSize bytes */
+                                   unsigned compressionLevel,   /* 0 is replaced by g_compressionLevel_default */
+                             const void*  srcBuffer, const size_t* fileSizes, unsigned nbFiles,   /* samples, stored back to back in srcBuffer ; fileSizes[u] is the size of sample u */
+                             const void* dictBuffer, size_t  dictBufferSize,   /* dictionary content, handed to ZSTD_compressBegin_advanced() */
+                                   unsigned notificationLevel)   /* forwarded to ZDICT_countEStats() ; @return : nb of bytes written into dstBuffer, or an error code */
+{
+    U32 countLit[256];
+    HUF_CREATE_STATIC_CTABLE(hufTable, 255);
+    U32 offcodeCount[OFFCODE_MAX+1];
+    short offcodeNCount[OFFCODE_MAX+1];
+    U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));   /* highest offset code counted ; must not exceed OFFCODE_MAX (checked below) */
+    U32 matchLengthCount[MaxML+1];
+    short matchLengthNCount[MaxML+1];
+    U32 litLengthCount[MaxLL+1];
+    short litLengthNCount[MaxLL+1];
+    U32 repOffset[MAXREPOFFSET];
+    offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
+    EStats_ress_t esr;
+    ZSTD_parameters params;
+    U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
+    size_t pos = 0, errorCode;
+    size_t eSize = 0;   /* running total of bytes written, or the error code to return */
+    size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
+    size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);   /* (nbFiles + !nbFiles) : divides by 1 instead of 0 when nbFiles==0 */
+    BYTE* dstPtr = (BYTE*)dstBuffer;
+
+    /* init */
+    DEBUGLOG(4, "ZDICT_analyzeEntropy");
+    esr.ref = ZSTD_createCCtx();
+    esr.zc = ZSTD_createCCtx();
+    esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
+    if (!esr.ref || !esr.zc || !esr.workPlace) {
+        eSize = ERROR(memory_allocation);
+        DISPLAYLEVEL(1, "Not enough memory \n");
+        goto _cleanup;
+    }
+    if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; }   /* too large dictionary */
+    for (u=0; u<256; u++) countLit[u] = 1;   /* any character must be described */
+    for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
+    for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
+    for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
+    memset(repOffset, 0, sizeof(repOffset));
+    repOffset[1] = repOffset[4] = repOffset[8] = 1;   /* NOTE(review): presumably mirrors the default repeat offsets (repStartValue) - confirm */
+    memset(bestRepOffset, 0, sizeof(bestRepOffset));
+    if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
+    params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
+    {   size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
+        if (ZSTD_isError(beginResult)) {
+            DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
+            eSize = ERROR(GENERIC);
+            goto _cleanup;
+    }   }
+
+    /* collect stats on all samples */
+    for (u=0; u<nbFiles; u++) {
+        ZDICT_countEStats(esr, params,
+                          countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
+                         (const char*)srcBuffer + pos, fileSizes[u],
+                          notificationLevel);
+        pos += fileSizes[u];   /* next sample starts right after the current one */
+    }
+
+    /* analyze, build stats, starting with literals */
+    {   size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+        if (HUF_isError(maxNbBits)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, " HUF_buildCTable error \n");
+            goto _cleanup;
+        }
+        if (maxNbBits==8) {  /* not compressible : will fail on HUF_writeCTable() */
+            DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
+            ZDICT_flatLit(countLit);  /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
+            maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+            assert(maxNbBits==9);   /* the replacement distribution is expected to need 9 bits at most */
+        }
+        huffLog = (U32)maxNbBits;
+    }
+
+    /* looking for most common first offsets */
+    {   U32 offset;
+        for (offset=1; offset<MAXREPOFFSET; offset++)
+            ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
+    }
+    /* note : the result of this phase should be used to better appreciate the impact on statistics */
+
+    total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
+    errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
+        goto _cleanup;
+    }
+    Offlog = (U32)errorCode;   /* on success, the returned value becomes the table log passed to FSE_writeNCount() */
+
+    total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
+    errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
+        goto _cleanup;
+    }
+    mlLog = (U32)errorCode;
+
+    total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
+    errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
+        goto _cleanup;
+    }
+    llLog = (U32)errorCode;
+
+    /* write result to buffer */
+    {   size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
+        if (HUF_isError(hhSize)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, "HUF_writeCTable error \n");
+            goto _cleanup;
+        }
+        dstPtr += hhSize;
+        maxDstSize -= hhSize;   /* maxDstSize tracks the space left after each table */
+        eSize += hhSize;
+    }
+
+    {   size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
+        if (FSE_isError(ohSize)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
+            goto _cleanup;
+        }
+        dstPtr += ohSize;
+        maxDstSize -= ohSize;
+        eSize += ohSize;
+    }
+
+    {   size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
+        if (FSE_isError(mhSize)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
+            goto _cleanup;
+        }
+        dstPtr += mhSize;
+        maxDstSize -= mhSize;
+        eSize += mhSize;
+    }
+
+    {   size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
+        if (FSE_isError(lhSize)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
+            goto _cleanup;
+        }
+        dstPtr += lhSize;
+        maxDstSize -= lhSize;
+        eSize += lhSize;
+    }
+
+    if (maxDstSize<12) {   /* 3 repeat offsets x 4 bytes each */
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
+        goto _cleanup;
+    }
+# if 0
+    MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset);
+    MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset);
+    MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
+#else
+    /* at this stage, we don't use the result of "most common first offset",
+       as the impact of statistics is not properly evaluated */
+    MEM_writeLE32(dstPtr+0, repStartValue[0]);
+    MEM_writeLE32(dstPtr+4, repStartValue[1]);
+    MEM_writeLE32(dstPtr+8, repStartValue[2]);
+#endif
+    eSize += 12;
+
+_cleanup:   /* shared exit path : eSize holds either the total size written or an error code */
+    ZSTD_freeCCtx(esr.ref);
+    ZSTD_freeCCtx(esr.zc);
+    free(esr.workPlace);
+
+    return eSize;
+}
+
+
+
+size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
+                          const void* customDictContent, size_t dictContentSize,
+                          const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                          ZDICT_params_t params)
+{   /* Builds magic number + dictID + entropy tables in a local header, then stores header followed by content into dictBuffer. */
+#define HBUFFSIZE 256   /* should prove large enough for all entropy headers */
+    BYTE header[HBUFFSIZE];
+    size_t hSize = 8;   /* magic number (4 bytes) + dictID (4 bytes) */
+    U32 const notificationLevel = params.notificationLevel;
+    int compressionLevel = params.compressionLevel;
+    if (compressionLevel == 0) compressionLevel = g_compressionLevel_default;
+
+    /* validate sizes */
+    DEBUGLOG(4, "ZDICT_finalizeDictionary");
+    if (dictContentSize > dictBufferCapacity) return ERROR(dstSize_tooSmall);
+    if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
+
+    /* the header starts with the magic number, followed by the dictionary ID */
+    MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
+    {   U64 const contentHash = XXH64(customDictContent, dictContentSize, 0);
+        U32 dictID = params.dictID;
+        if (dictID == 0) dictID = (U32)((contentHash % ((1U<<31)-32768)) + 32768);   /* auto mode : ID in [32768, 2^31) */
+        MEM_writeLE32(header+4, dictID);
+    }
+
+    /* entropy tables are appended to the header */
+    DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+    DISPLAYLEVEL(2, "statistics ... \n");
+    {   size_t const entropySize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
+                                  compressionLevel,
+                                  samplesBuffer, samplesSizes, nbSamples,
+                                  customDictContent, dictContentSize,
+                                  notificationLevel);
+        if (ZDICT_isError(entropySize)) return entropySize;
+        hSize += entropySize;
+    }
+
+    /* assemble the result ; the content is truncated when header + content exceed the capacity */
+    if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
+    {   char* const dst = (char*)dictBuffer;
+        /* content first (memmove : src and dst buffer can overlap), header last */
+        memmove(dst + hSize, customDictContent, dictContentSize);
+        memcpy(dst, header, hSize);
+        return hSize + dictContentSize;
+    }
+}
+
+
+size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
+                                                 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                                                 ZDICT_params_t params)
+{   /* On entry, the dictionary content occupies the last dictContentSize bytes of dictBuffer ; the header is built at the front. */
+    char* const dictStart = (char*)dictBuffer;
+    char* const content = dictStart + dictBufferCapacity - dictContentSize;
+    int const compressionLevel = params.compressionLevel ? params.compressionLevel : g_compressionLevel_default;
+    U32 const notificationLevel = params.notificationLevel;
+    size_t hSize = 8;   /* magic number (4 bytes) + dictID (4 bytes) */
+
+    /* entropy tables are written at offset 8 ; the first 8 bytes get filled afterwards */
+    DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+    DISPLAYLEVEL(2, "statistics ... \n");
+    {   size_t const entropySize = ZDICT_analyzeEntropy(dictStart+hSize, dictBufferCapacity-hSize,
+                                  compressionLevel,
+                                  samplesBuffer, samplesSizes, nbSamples,
+                                  content, dictContentSize,
+                                  notificationLevel);
+        if (ZDICT_isError(entropySize)) return entropySize;
+        hSize += entropySize;
+    }
+
+    MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);   /* magic number first, dictID right after */
+    {   U64 const contentHash = XXH64(content, dictContentSize, 0);
+        U32 dictID = params.dictID;
+        if (dictID == 0) dictID = (U32)((contentHash % ((1U<<31)-32768)) + 32768);   /* auto mode : ID in [32768, 2^31) */
+        MEM_writeLE32(dictStart+4, dictID);
+    }
+
+    if (hSize + dictContentSize < dictBufferCapacity) memmove(dictStart + hSize, content, dictContentSize);   /* close the gap between header and content */
+    return MIN(dictBufferCapacity, hSize+dictContentSize);
+}
+
+
+/*! ZDICT_trainFromBuffer_unsafe_legacy() :
+*   Warning : `samplesBuffer` must be followed by noisy guard band.
+*   Selects segments from the samples, copies them at the end of `dictBuffer`, then adds header and entropy tables in front of them.
+*   @return : size of dictionary, or an error code which can be tested with ZDICT_isError() */
+size_t ZDICT_trainFromBuffer_unsafe_legacy(
+                            void* dictBuffer, size_t maxDictSize,
+                            const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                            ZDICT_legacy_params_t params)
+{
+    U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
+    dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));   /* owned by this function : must be freed on every exit path */
+    unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
+    unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;   /* shifting by more than 30 is avoided */
+    size_t const targetDictSize = maxDictSize;
+    size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
+    size_t dictSize = 0;
+    U32 const notificationLevel = params.zParams.notificationLevel;
+
+    /* checks */
+    if (!dictList) return ERROR(memory_allocation);
+    if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); }   /* requested dictionary size is too small */
+    if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* not enough source to create dictionary */
+
+    /* init */
+    ZDICT_initDictItem(dictList);
+
+    /* build dictionary */
+    ZDICT_trainBuffer_legacy(dictList, dictListSize,
+                       samplesBuffer, samplesBuffSize,
+                       samplesSizes, nbSamples,
+                       minRep, notificationLevel);
+
+    /* display best matches */
+    if (params.zParams.notificationLevel>= 3) {
+        U32 const nb = MIN(25, dictList[0].pos);
+        U32 const dictContentSize = ZDICT_dictSize(dictList);
+        U32 u;
+        DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize);
+        DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
+        for (u=1; u<nb; u++) {
+            U32 const pos = dictList[u].pos;
+            U32 const length = dictList[u].length;
+            U32 const printedLength = MIN(40, length);
+            if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize))
+                { free(dictList); return ERROR(GENERIC); }   /* should never happen ; dictList is released, like on every other error path */
+            DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
+                         u, length, pos, dictList[u].savings);
+            ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
+            DISPLAYLEVEL(3, "| \n");
+    }   }
+
+
+    /* create dictionary */
+    {   U32 dictContentSize = ZDICT_dictSize(dictList);
+        if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* dictionary content too small */
+        if (dictContentSize < targetDictSize/4) {
+            DISPLAYLEVEL(2, "!  warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
+            if (samplesBuffSize < 10 * targetDictSize)
+                DISPLAYLEVEL(2, "!  consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
+            if (minRep > MINRATIO) {
+                DISPLAYLEVEL(2, "!  consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
+                DISPLAYLEVEL(2, "!  note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
+            }
+        }
+
+        if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
+            U32 proposedSelectivity = selectivity-1;
+            while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
+            DISPLAYLEVEL(2, "!  note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
+            DISPLAYLEVEL(2, "!  consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
+            DISPLAYLEVEL(2, "!  always test dictionary efficiency on real samples \n");
+        }
+
+        /* limit dictionary size */
+        {   U32 const max = dictList->pos;   /* convention : nb of useful elts within dictList */
+            U32 currentSize = 0;
+            U32 n; for (n=1; n<max; n++) {
+                currentSize += dictList[n].length;
+                if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }   /* segment n does not fit : drop it and all following ones */
+            }
+            dictList->pos = n;
+            dictContentSize = currentSize;
+        }
+
+        /* build dict content */
+        {   U32 u;
+            BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;   /* segments are stored backwards, starting from the end of dictBuffer */
+            for (u=1; u<dictList->pos; u++) {
+                U32 l = dictList[u].length;
+                ptr -= l;
+                if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); }   /* should not happen */
+                memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
+        }   }
+
+        dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
+                                                             samplesBuffer, samplesSizes, nbSamples,
+                                                             params.zParams);
+    }
+
+    /* clean up */
+    free(dictList);
+    return dictSize;
+}
+
+
+/* ZDICT_trainFromBuffer_legacy() :
+ * The underlying trainer needs a noisy guard band right after the samples.
+ * This wrapper therefore trains on a private copy of `samplesBuffer`, extended with NOISELENGTH bytes of noise. */
+size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
+                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                              ZDICT_legacy_params_t params)
+{
+    size_t const totalSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
+    char* paddedCopy;
+    size_t dictSize;
+
+    if (totalSize < ZDICT_MIN_SAMPLES_SIZE) return 0;   /* too little content : no dictionary is produced */
+
+    paddedCopy = (char*)malloc(totalSize + NOISELENGTH);
+    if (!paddedCopy) return ERROR(memory_allocation);
+
+    memcpy(paddedCopy, samplesBuffer, totalSize);   /* samples first ... */
+    ZDICT_fillNoise(paddedCopy + totalSize, NOISELENGTH);   /* ... then the guard band, for end of buffer condition */
+
+    dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity,
+                                                   paddedCopy, samplesSizes, nbSamples, params);
+    free(paddedCopy);
+    return dictSize;
+}
+
+
+size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
+                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
+{   /* Thin wrapper : zeroed cover parameters, except d=8, steps=4 and compression level 6, then runs the cover optimizer. */
+    ZDICT_cover_params_t coverParams;
+    ZDICT_cover_params_t* const cp = &coverParams;
+    DEBUGLOG(3, "ZDICT_trainFromBuffer");
+    memset(cp, 0, sizeof(*cp));
+    cp->zParams.compressionLevel = 6;   /* no compression level information is available : default to level 6 */
+    cp->steps = 4;
+    cp->d = 8;
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
+    cp->zParams.notificationLevel = DEBUGLEVEL;   /* debug builds : notifications follow the build-time debug level */
+#endif
+    {   size_t const dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity, samplesBuffer, samplesSizes, nbSamples, cp);
+        return dictSize;
+    }
+}
+
+size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
+                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
+{   /* Same as ZDICT_addEntropyTablesFromBuffer_advanced(), invoked with an all-zero ZDICT_params_t. */
+    ZDICT_params_t zeroParams;
+    size_t result;
+    memset(&zeroParams, 0, sizeof(zeroParams));
+    result = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity, samplesBuffer, samplesSizes, nbSamples, zeroParams);
+    return result;
+}
diff --git a/deps/SZ/zstd/dictBuilder/zdict.h b/deps/SZ/zstd/dictBuilder/zdict.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad459c2d7d532b6c7bf0dc924a2bd5146a26c344
--- /dev/null
+++ b/deps/SZ/zstd/dictBuilder/zdict.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef DICTBUILDER_H_001
+#define DICTBUILDER_H_001
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*======  Dependencies  ======*/
+#include <stddef.h>  /* size_t */
+
+
+/* =====   ZDICTLIB_API : control library symbols visibility   ===== */
+#ifndef ZDICTLIB_VISIBILITY   /* a definition provided before this point takes precedence */
+#  if defined(__GNUC__) && (__GNUC__ >= 4)
+#    define ZDICTLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#  else
+#    define ZDICTLIB_VISIBILITY   /* expands to nothing */
+#  endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+#  define ZDICTLIB_API ZDICTLIB_VISIBILITY   /* neither ZSTD_DLL_EXPORT nor ZSTD_DLL_IMPORT is set to 1 */
+#endif
+
+
+/*! ZDICT_trainFromBuffer():
+ *  Train a dictionary from an array of samples.
+ *  Redirect towards ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte.
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousand samples, though this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
+                                    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
+
+
+/*======   Helper functions   ======*/
+ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize);  /**< extracts dictID; @return zero if error (not a valid dictionary) */
+ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode);
+ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
+
+
+
+#ifdef ZDICT_STATIC_LINKING_ONLY
+
+/* ====================================================================================
+ * The definitions in this section are considered experimental.
+ * They should never be used with a dynamic library, as they may change in the future.
+ * They are provided for advanced usages.
+ * Use them only in association with static linking.
+ * ==================================================================================== */
+
+typedef struct {
+    int      compressionLevel;   /* optimize for a specific zstd compression level; 0 means default */
+    unsigned notificationLevel;  /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
+    unsigned dictID;             /* force dictID value; 0 means auto mode (ID derived from a hash of the dictionary content, in range [32768, 2^31) ) */
+} ZDICT_params_t;
+
+/*! ZDICT_cover_params_t:
+ *  k and d are the only required parameters.
+ *  For others, value 0 means default.
+ */
+typedef struct {
+    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
+    unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
+    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
+    unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined. NOTE(review): ZDICT_trainFromBuffer() leaves this field at 0 - confirm that 0 is accepted */
+    ZDICT_params_t zParams;      /* compression level, notification level and dictID settings */
+} ZDICT_cover_params_t;
+
+
+/*! ZDICT_trainFromBuffer_cover():
+ *  Train a dictionary from an array of samples using the COVER algorithm.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousand samples, though this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
+          void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+          ZDICT_cover_params_t parameters);
+
+/*! ZDICT_optimizeTrainFromBuffer_cover():
+ * The same requirements as above hold for all the parameters except `parameters`.
+ * This function tries many parameter combinations and picks the best parameters.
+ * `*parameters` is filled with the best parameters found,
+ * dictionary constructed with those parameters is stored in `dictBuffer`.
+ *
+ * All of the parameters d, k, steps are optional.
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
+ * if steps is zero it defaults to its default value.
+ * If k is non-zero then we don't check multiple values of k, otherwise we check `steps` values of k in [16, 2048].
+ *
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ *           On success `*parameters` contains the parameters selected.
+ * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte, plus another 5 bytes of memory per input byte for each thread.
+ */
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
+          void* dictBuffer, size_t dictBufferCapacity,
+    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+          ZDICT_cover_params_t* parameters);
+
+/*! ZDICT_finalizeDictionary():
+ * Given a custom content as a basis for dictionary, and a set of samples,
+ * finalize dictionary by adding headers and statistics.
+ *
+ * Samples must be stored concatenated in a flat buffer `samplesBuffer`,
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample in order.
+ *
+ * dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes.
+ * dictBufferCapacity must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
+ *
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
+ *           or an error code, which can be tested by ZDICT_isError().
+ * Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
+ * Note 2: dictBuffer and dictContent can overlap
+ */
+#define ZDICT_CONTENTSIZE_MIN 128
+#define ZDICT_DICTSIZE_MIN    256
+ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
+                                const void* dictContent, size_t dictContentSize,
+                                const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                                ZDICT_params_t parameters);
+
+typedef struct {
+    unsigned selectivityLevel;   /* 0 means default; larger => select more => larger dictionary */
+    ZDICT_params_t zParams;      /* compression level, notification level and dictID settings */
+} ZDICT_legacy_params_t;
+
+/*! ZDICT_trainFromBuffer_legacy():
+ *  Train a dictionary from an array of samples.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * `parameters` is optional and can be provided with values set to 0 to mean "default".
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousand samples, though this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ *  Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_legacy_params_t parameters);
+
+/* Deprecation warnings */
+/* It is generally possible to disable deprecation warnings from compiler,
+   for example with -Wno-deprecated-declarations for gcc
+   or _CRT_SECURE_NO_WARNINGS in Visual.
+   Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */
+#ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS
+#  define ZDICT_DEPRECATED(message) ZDICTLIB_API   /* disable deprecation warnings */
+#else
+#  define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)   /* major*100 + minor, e.g. 405 for gcc 4.5 */
+#  if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
+#    define ZDICT_DEPRECATED(message) [[deprecated(message)]] ZDICTLIB_API
+#  elif (ZDICT_GCC_VERSION >= 405) || defined(__clang__)
+#    define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message)))
+#  elif (ZDICT_GCC_VERSION >= 301)
+#    define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated))   /* attribute used without the message here */
+#  elif defined(_MSC_VER)
+#    define ZDICT_DEPRECATED(message) ZDICTLIB_API __declspec(deprecated(message))
+#  else
+#    pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler")
+#    define ZDICT_DEPRECATED(message) ZDICTLIB_API   /* fallback : plain export, no deprecation marker */
+#  endif
+#endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */
+
+ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead")
+size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
+                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
+
+
+#endif   /* ZDICT_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* DICTBUILDER_H_001 */
diff --git a/deps/SZ/zstd/legacy/zstd_legacy.h b/deps/SZ/zstd/legacy/zstd_legacy.h
new file mode 100644
index 0000000000000000000000000000000000000000..5893cb9657e62be90e8dcbda5176197dc253159f
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_legacy.h
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LEGACY_H
+#define ZSTD_LEGACY_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include "mem.h"            /* MEM_STATIC */
+#include "error_private.h"  /* ERROR */
+#include "zstd.h"           /* ZSTD_inBuffer, ZSTD_outBuffer */
+
+#if !defined (ZSTD_LEGACY_SUPPORT) || (ZSTD_LEGACY_SUPPORT == 0)   /* unset, or explicitly 0 */
+#  undef ZSTD_LEGACY_SUPPORT
+#  define ZSTD_LEGACY_SUPPORT 8   /* 8 fails every (ZSTD_LEGACY_SUPPORT <= N) test below : no legacy header is included */
+#endif
+
+#if (ZSTD_LEGACY_SUPPORT <= 1)   /* a value N selects the headers for v0.N up to v0.7 */
+#  include "zstd_v01.h"
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+#  include "zstd_v02.h"
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+#  include "zstd_v03.h"
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+#  include "zstd_v04.h"
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+#  include "zstd_v05.h"
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+#  include "zstd_v06.h"
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+#  include "zstd_v07.h"
+#endif
+
+/** ZSTD_isLegacy() :
+    Reads the first 4 bytes of `src` as a little-endian magic number.
+    @return : the legacy format version (1-7) when that magic number belongs to a version
+              enabled through ZSTD_LEGACY_SUPPORT; 0 otherwise, including when srcSize < 4. */
+MEM_STATIC unsigned ZSTD_isLegacy(const void* src, size_t srcSize)
+{
+    U32 magicNumberLE;
+    if (srcSize<4) return 0;   /* too short to hold a magic number */
+    magicNumberLE = MEM_readLE32(src);
+    switch(magicNumberLE)
+    {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+        case ZSTDv01_magicNumberLE:return 1;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+        case ZSTDv02_magicNumber : return 2;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+        case ZSTDv03_magicNumber : return 3;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case ZSTDv04_magicNumber : return 4;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case ZSTDv05_MAGICNUMBER : return 5;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case ZSTDv06_MAGICNUMBER : return 6;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case ZSTDv07_MAGICNUMBER : return 7;
+#endif
+        default : return 0;   /* unknown magic number, or a version that was compiled out */
+    }
+}
+
+/* ZSTD_getDecompressedSize_legacy() : size field read from a v0.5-v0.7 frame header; 0 when the version is below 5 or the header parser does not return 0 */
+MEM_STATIC unsigned long long ZSTD_getDecompressedSize_legacy(const void* src, size_t srcSize)
+{
+    U32 const version = ZSTD_isLegacy(src, srcSize);
+    if (version < 5) return 0;  /* no decompressed size in frame header, or not a legacy format */
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+    if (version==5) {
+        ZSTDv05_parameters fParams;
+        size_t const frResult = ZSTDv05_getFrameParams(&fParams, src, srcSize);
+        if (frResult != 0) return 0;   /* any non-zero result is treated as "size unavailable" */
+        return fParams.srcSize;
+    }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+    if (version==6) {
+        ZSTDv06_frameParams fParams;
+        size_t const frResult = ZSTDv06_getFrameParams(&fParams, src, srcSize);
+        if (frResult != 0) return 0;
+        return fParams.frameContentSize;
+    }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+    if (version==7) {
+        ZSTDv07_frameParams fParams;
+        size_t const frResult = ZSTDv07_getFrameParams(&fParams, src, srcSize);
+        if (frResult != 0) return 0;
+        return fParams.frameContentSize;
+    }
+#endif
+    return 0;   /* should not be possible */
+}
+
+/* ZSTD_decompressLegacy() : one-shot decompression, dispatched on the version reported by ZSTD_isLegacy(); `dict` is forwarded to the v0.5-v0.7 decoders only */
+MEM_STATIC size_t ZSTD_decompressLegacy(
+                     void* dst, size_t dstCapacity,
+               const void* src, size_t compressedSize,
+               const void* dict,size_t dictSize)
+{
+    U32 const version = ZSTD_isLegacy(src, compressedSize);
+    (void)dst; (void)dstCapacity; (void)dict; (void)dictSize;  /* unused when ZSTD_LEGACY_SUPPORT >= 8 */
+    switch(version)
+    {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+        case 1 :
+            return ZSTDv01_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+        case 2 :
+            return ZSTDv02_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+        case 3 :
+            return ZSTDv03_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case 4 :
+            return ZSTDv04_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case 5 :
+            {   size_t result;
+                ZSTDv05_DCtx* const zd = ZSTDv05_createDCtx();   /* temporary context, released before returning */
+                if (zd==NULL) return ERROR(memory_allocation);
+                result = ZSTDv05_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize);
+                ZSTDv05_freeDCtx(zd);
+                return result;
+            }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case 6 :
+            {   size_t result;
+                ZSTDv06_DCtx* const zd = ZSTDv06_createDCtx();   /* temporary context, released before returning */
+                if (zd==NULL) return ERROR(memory_allocation);
+                result = ZSTDv06_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize);
+                ZSTDv06_freeDCtx(zd);
+                return result;
+            }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case 7 :
+            {   size_t result;
+                ZSTDv07_DCtx* const zd = ZSTDv07_createDCtx();   /* temporary context, released before returning */
+                if (zd==NULL) return ERROR(memory_allocation);
+                result = ZSTDv07_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize);
+                ZSTDv07_freeDCtx(zd);
+                return result;
+            }
+#endif
+        default :   /* version 0 (not legacy) or a version that was compiled out */
+            return ERROR(prefix_unknown);
+    }
+}
+
+MEM_STATIC size_t ZSTD_findFrameCompressedSizeLegacy(const void *src,
+                                             size_t compressedSize)
+{   /* forwards to the ZSTDv0N_findFrameCompressedSize() matching the version detected by ZSTD_isLegacy() */
+    U32 const version = ZSTD_isLegacy(src, compressedSize);
+    switch(version)
+    {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+        case 1 :
+            return ZSTDv01_findFrameCompressedSize(src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+        case 2 :
+            return ZSTDv02_findFrameCompressedSize(src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+        case 3 :
+            return ZSTDv03_findFrameCompressedSize(src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case 4 :
+            return ZSTDv04_findFrameCompressedSize(src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case 5 :
+            return ZSTDv05_findFrameCompressedSize(src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case 6 :
+            return ZSTDv06_findFrameCompressedSize(src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case 7 :
+            return ZSTDv07_findFrameCompressedSize(src, compressedSize);
+#endif
+        default :   /* version 0 (not legacy) or a version that was compiled out */
+            return ERROR(prefix_unknown);
+    }
+}
+
+MEM_STATIC size_t ZSTD_freeLegacyStreamContext(void* legacyContext, U32 version)
+{   /* releases a streaming context through the ZBUFFv0N_freeDCtx() of the version it was created for */
+    switch(version)
+    {
+        default :   /* placed first so that unknown and compiled-out versions share the v0.1-v0.3 path */
+        case 1 :
+        case 2 :
+        case 3 :
+            (void)legacyContext;   /* nothing is freed on this path */
+            return ERROR(version_unsupported);
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case 4 : return ZBUFFv04_freeDCtx((ZBUFFv04_DCtx*)legacyContext);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case 5 : return ZBUFFv05_freeDCtx((ZBUFFv05_DCtx*)legacyContext);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case 6 : return ZBUFFv06_freeDCtx((ZBUFFv06_DCtx*)legacyContext);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case 7 : return ZBUFFv07_freeDCtx((ZBUFFv07_DCtx*)legacyContext);
+#endif
+    }
+}
+
+
+MEM_STATIC size_t ZSTD_initLegacyStream(void** legacyContext, U32 prevVersion, U32 newVersion,
+                                        const void* dict, size_t dictSize)
+{
+    DEBUGLOG(5, "ZSTD_initLegacyStream for v0.%u", newVersion);
+    if (prevVersion != newVersion) ZSTD_freeLegacyStreamContext(*legacyContext, prevVersion);
+    switch(newVersion)
+    {
+        default :
+        case 1 :
+        case 2 :
+        case 3 :
+            (void)dict; (void)dictSize;
+            return 0;
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case 4 :
+        {
+            ZBUFFv04_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv04_createDCtx() : (ZBUFFv04_DCtx*)*legacyContext;
+            if (dctx==NULL) return ERROR(memory_allocation);
+            ZBUFFv04_decompressInit(dctx);
+            ZBUFFv04_decompressWithDictionary(dctx, dict, dictSize);
+            *legacyContext = dctx;
+            return 0;
+        }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case 5 :
+        {
+            ZBUFFv05_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv05_createDCtx() : (ZBUFFv05_DCtx*)*legacyContext;
+            if (dctx==NULL) return ERROR(memory_allocation);
+            ZBUFFv05_decompressInitDictionary(dctx, dict, dictSize);
+            *legacyContext = dctx;
+            return 0;
+        }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case 6 :
+        {
+            ZBUFFv06_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv06_createDCtx() : (ZBUFFv06_DCtx*)*legacyContext;
+            if (dctx==NULL) return ERROR(memory_allocation);
+            ZBUFFv06_decompressInitDictionary(dctx, dict, dictSize);
+            *legacyContext = dctx;
+            return 0;
+        }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case 7 :
+        {
+            ZBUFFv07_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv07_createDCtx() : (ZBUFFv07_DCtx*)*legacyContext;
+            if (dctx==NULL) return ERROR(memory_allocation);
+            ZBUFFv07_decompressInitDictionary(dctx, dict, dictSize);
+            *legacyContext = dctx;
+            return 0;
+        }
+#endif
+    }
+}
+
+
+/* ZSTD_decompressLegacyStream() : runs one ZBUFFv0N_decompressContinue() step on the unread part of `input`, writing after output->pos; returns that call's result */
+MEM_STATIC size_t ZSTD_decompressLegacyStream(void* legacyContext, U32 version,
+                                              ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+    DEBUGLOG(5, "ZSTD_decompressLegacyStream for v0.%u", version);
+    switch(version)
+    {
+        default :
+        case 1 :
+        case 2 :
+        case 3 :
+            (void)legacyContext; (void)output; (void)input;
+            return ERROR(version_unsupported);
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case 4 :
+            {
+                ZBUFFv04_DCtx* dctx = (ZBUFFv04_DCtx*) legacyContext;
+                const void* src = (const char*)input->src + input->pos;   /* first unread input byte */
+                size_t readSize = input->size - input->pos;               /* in : bytes still available */
+                void* dst = (char*)output->dst + output->pos;             /* first free output byte */
+                size_t decodedSize = output->size - output->pos;          /* in : room left in the output */
+                size_t const hintSize = ZBUFFv04_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+                output->pos += decodedSize;   /* both sizes are passed by address and added to the positions afterwards */
+                input->pos += readSize;
+                return hintSize;
+            }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case 5 :   /* same sequence as case 4, using the v0.5 buffered decoder */
+            {
+                ZBUFFv05_DCtx* dctx = (ZBUFFv05_DCtx*) legacyContext;
+                const void* src = (const char*)input->src + input->pos;
+                size_t readSize = input->size - input->pos;
+                void* dst = (char*)output->dst + output->pos;
+                size_t decodedSize = output->size - output->pos;
+                size_t const hintSize = ZBUFFv05_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+                output->pos += decodedSize;
+                input->pos += readSize;
+                return hintSize;
+            }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case 6 :   /* same sequence as case 4, using the v0.6 buffered decoder */
+            {
+                ZBUFFv06_DCtx* dctx = (ZBUFFv06_DCtx*) legacyContext;
+                const void* src = (const char*)input->src + input->pos;
+                size_t readSize = input->size - input->pos;
+                void* dst = (char*)output->dst + output->pos;
+                size_t decodedSize = output->size - output->pos;
+                size_t const hintSize = ZBUFFv06_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+                output->pos += decodedSize;
+                input->pos += readSize;
+                return hintSize;
+            }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case 7 :   /* same sequence as case 4, using the v0.7 buffered decoder */
+            {
+                ZBUFFv07_DCtx* dctx = (ZBUFFv07_DCtx*) legacyContext;
+                const void* src = (const char*)input->src + input->pos;
+                size_t readSize = input->size - input->pos;
+                void* dst = (char*)output->dst + output->pos;
+                size_t decodedSize = output->size - output->pos;
+                size_t const hintSize = ZBUFFv07_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+                output->pos += decodedSize;
+                input->pos += readSize;
+                return hintSize;
+            }
+#endif
+    }
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* ZSTD_LEGACY_H */
diff --git a/deps/SZ/zstd/legacy/zstd_v01.c b/deps/SZ/zstd/legacy/zstd_v01.c
new file mode 100644
index 0000000000000000000000000000000000000000..ae1cb2ce5aa068c70db9f0d20a37b761e9716e84
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v01.c
@@ -0,0 +1,2127 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/******************************************
+*  Includes
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include "zstd_v01.h"
+#include "error_private.h"
+
+
+/******************************************
+*  Static allocation
+******************************************/
+/* Number of `unsigned` cells needed to statically allocate an FSE CTable/DTable : 1 header cell + 2^maxTableLog */
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<(maxTableLog)))
+
+/* Same sizing, in `unsigned short` cells, for a Huff0 DTable; the argument is parenthesised so any expression is safe */
+#define HUF_DTABLE_SIZE_U16(maxTableLog)   (1 + (1<<(maxTableLog)))
+#define HUF_CREATE_STATIC_DTABLE(DTable, maxTableLog) \
+        unsigned short DTable[HUF_DTABLE_SIZE_U16(maxTableLog)] = { (maxTableLog) }
+
+
+/******************************************
+*  Error Management
+******************************************/
+#define FSE_LIST_ERRORS(ITEM) \
+        ITEM(FSE_OK_NoError) ITEM(FSE_ERROR_GENERIC) \
+        ITEM(FSE_ERROR_tableLog_tooLarge) ITEM(FSE_ERROR_maxSymbolValue_tooLarge) ITEM(FSE_ERROR_maxSymbolValue_tooSmall) \
+        ITEM(FSE_ERROR_dstSize_tooSmall) ITEM(FSE_ERROR_srcSize_wrong)\
+        ITEM(FSE_ERROR_corruptionDetected) \
+        ITEM(FSE_ERROR_maxCode)
+
+#define FSE_GENERATE_ENUM(ENUM) ENUM,   /* turns each listed name into one enumerator */
+typedef enum { FSE_LIST_ERRORS(FSE_GENERATE_ENUM) } FSE_errorCodes;  /* enum is exposed, to detect & handle specific errors; compare function result to -enum value */
+
+
+/******************************************
+*  FSE symbol compression API
+******************************************/
+/*
+   This API consists of small unitary functions, which highly benefit from being inlined.
+   You will want to enable link-time-optimization to ensure these functions are properly inlined in your binary.
+   Visual seems to do it automatically.
+   For gcc or clang, you'll need to add -flto flag at compilation and linking stages.
+   If none of these solutions is applicable, include "fse.c" directly.
+*/
+
+typedef unsigned FSE_CTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+
+typedef struct
+{
+    size_t bitContainer;
+    int    bitPos;
+    char*  startPtr;
+    char*  ptr;
+    char*  endPtr;
+} FSE_CStream_t;   /* NOTE(review): by its name a compression-side stream; not referenced in the visible part of this file */
+
+typedef struct
+{
+    ptrdiff_t   value;
+    const void* stateTable;
+    const void* symbolTT;
+    unsigned    stateLog;
+} FSE_CState_t;   /* NOTE(review): by its name a compression-side state; not referenced in the visible part of this file */
+
+typedef struct
+{
+    size_t   bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+} FSE_DStream_t;   /* presumably the bit-reader state of the decoder -- TODO confirm against the functions using it */
+
+typedef struct
+{
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+typedef enum { FSE_DStream_unfinished = 0,
+               FSE_DStream_endOfBuffer = 1,
+               FSE_DStream_completed = 2,
+               FSE_DStream_tooFar = 3 } FSE_DStream_status;  /* result of FSE_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... ?! */
+
+
+/****************************************************************
+*  Tuning parameters
+****************************************************************/
+/* MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#define FSE_MAX_MEMORY_USAGE 14
+#define FSE_DEFAULT_MEMORY_USAGE 13
+
+/* FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#define FSE_MAX_SYMBOL_VALUE 255
+
+
+/****************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+
+
+/****************************************************************
+*  Byte symbol type
+****************************************************************/
+typedef struct
+{
+    unsigned short newState;   /* FSE_buildDTable() stores (nextState << nbBits) - tableSize here */
+    unsigned char  symbol;     /* symbol assigned to this table cell */
+    unsigned char  nbBits;     /* FSE_buildDTable() stores tableLog - FSE_highbit32(nextState) here */
+} FSE_decode_t;   /* size == U32 */
+
+
+
+/****************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+#  define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)   /* major*100 + minor; tested by FSE_highbit32() */
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else   /* neither C++ nor C99 : no `inline` keyword is used, the function stays plain static */
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/****************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+
+#ifndef MEM_ACCESS_MODULE   /* guard : the fixed-width typedefs below are declared only once per translation unit */
+#define MEM_ACCESS_MODULE
+/****************************************************************
+*  Basic Types
+*****************************************************************/
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+# include <stdint.h>
+typedef  uint8_t BYTE;
+typedef uint16_t U16;
+typedef  int16_t S16;
+typedef uint32_t U32;
+typedef  int32_t S32;
+typedef uint64_t U64;
+typedef  int64_t S64;
+#else   /* no <stdint.h> : assumes char/short/int/long long are 8/16/32/64 bits wide */
+typedef unsigned char       BYTE;
+typedef unsigned short      U16;
+typedef   signed short      S16;
+typedef unsigned int        U32;
+typedef   signed int        S32;
+typedef unsigned long long  U64;
+typedef   signed long long  S64;
+#endif
+
+#endif   /* MEM_ACCESS_MODULE */
+
+/****************************************************************
+*  Memory I/O
+*****************************************************************/
+/* FSE_FORCE_MEMORY_ACCESS
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below selects a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on a compiler extension (i.e., not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ *            It can generate buggy code on targets generating assembly depending on alignment.
+ *            But in some circumstances, it's the only known way to get the most performance (i.e. GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef FSE_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define FSE_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define FSE_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+
+static unsigned FSE_32bits(void)
+{   /* 1 when a data pointer occupies 4 bytes, 0 otherwise */
+    return (unsigned)(4 == sizeof(void*));
+}
+
+static unsigned FSE_isLittleEndian(void)
+{   /* non-zero when the lowest-addressed byte of a U32 holding 1 is itself 1 */
+    const union { U32 word; BYTE bytes[4]; } probe = { 1 };   /* deliberately not static : the original notes static is detrimental to performance */
+    return probe.bytes[0];
+}
+
+#if defined(FSE_FORCE_MEMORY_ACCESS) && (FSE_FORCE_MEMORY_ACCESS==2)
+
+static U16 FSE_read16(const void* memPtr) { return *(const U16*) memPtr; }   /* method 2 : direct dereference through a cast pointer */
+static U32 FSE_read32(const void* memPtr) { return *(const U32*) memPtr; }
+static U64 FSE_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+#elif defined(FSE_FORCE_MEMORY_ACCESS) && (FSE_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+static U16 FSE_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }   /* method 1 : read through the packed union */
+static U32 FSE_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+static U64 FSE_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+#else
+
+static U16 FSE_read16(const void* memPtr)   /* method 0 (default) : copy the bytes out with memcpy() */
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+static U32 FSE_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+static U64 FSE_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+#endif /* FSE_FORCE_MEMORY_ACCESS */
+
+static U16 FSE_readLE16(const void* memPtr)
+{
+    /* Read 2 bytes at memPtr as a little-endian U16, whatever the host byte order. */
+    const BYTE* const bytes = (const BYTE*)memPtr;
+
+    if (!FSE_isLittleEndian()) {
+        return (U16)(bytes[0] + (bytes[1]<<8));   /* assemble by hand, low byte first */
+    }
+    return FSE_read16(memPtr);   /* native layout already matches */
+}
+
+static U32 FSE_readLE32(const void* memPtr)
+{
+    /* Read 4 bytes at memPtr as a little-endian U32, whatever the host byte order. */
+    const BYTE* const bytes = (const BYTE*)memPtr;
+
+    if (!FSE_isLittleEndian()) {
+        return (U32)((U32)bytes[0] + ((U32)bytes[1]<<8) + ((U32)bytes[2]<<16) + ((U32)bytes[3]<<24));
+    }
+    return FSE_read32(memPtr);   /* native layout already matches */
+}
+
+
+static U64 FSE_readLE64(const void* memPtr)
+{
+    /* Read 8 bytes at memPtr as a little-endian U64, whatever the host byte order. */
+    if (FSE_isLittleEndian()) return FSE_read64(memPtr);   /* native layout already matches */
+    {
+        const BYTE* const bytes = (const BYTE*)memPtr;
+        U64 value = 0; int idx;
+        /* fold from the highest-addressed byte down, so that bytes[0] ends in the low-order position */
+        for (idx = 7; idx >= 0; idx--) value = (value << 8) + (U64)bytes[idx];
+        return value;
+    }
+}
+
+static size_t FSE_readLEST(const void* memPtr)
+{
+    /* Little-endian read widened to size_t : */
+    /* 4 bytes are read when FSE_32bits() is true, 8 bytes otherwise. */
+    return FSE_32bits() ? (size_t)FSE_readLE32(memPtr)
+                        : (size_t)FSE_readLE64(memPtr);
+}
+
+
+
+/****************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)   /* 12 with FSE_MAX_MEMORY_USAGE at 14 */
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)   /* 11 with FSE_DEFAULT_MEMORY_USAGE at 13 */
+#define FSE_MIN_TABLELOG 5   /* added to the 4-bit field read by FSE_readNCount() to obtain the table log */
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15   /* FSE_readNCount() rejects headers announcing a larger table log */
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+
+/****************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations; a false `c` divides by zero in a constant expression */
+
+
+/****************************************************************
+*  Complex types
+****************************************************************/
+typedef struct
+{
+    int deltaFindState;
+    U32 deltaNbBits;
+} FSE_symbolCompressionTransform; /* total 8 bytes; NOTE(review): not referenced in the visible part of this file */
+
+typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];   /* array type sized for a table of FSE_MAX_TABLELOG */
+
+/****************************************************************
+*  Internal functions
+****************************************************************/
+FORCE_INLINE unsigned FSE_highbit32 (U32 val)   /* index of the highest set bit of val; val is expected to be non-zero */
+{
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r;
+    _BitScanReverse ( &r, val );
+    return (unsigned) r;
+#   elif defined(__GNUC__) && (GCC_VERSION >= 304)   /* GCC Intrinsic */
+    return 31 - __builtin_clz (val);   /* __builtin_clz() is documented as undefined for 0 */
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    unsigned r;
+    v |= v >> 1;   /* smear the highest set bit into every lower position ... */
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    r = DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];   /* ... then use the top 5 bits of the product as the lookup index */
+    return r;
+#   endif
+}
+
+
+/****************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+
+static U32 FSE_tableStep(U32 tableSize) { U32 const half = tableSize>>1; U32 const eighth = tableSize>>3; return half + eighth + 3; }   /* stride FSE_buildDTable() uses to spread symbols across the table */
+
+#define FSE_DECODE_TYPE FSE_decode_t   /* cell type of the decoding table */
+
+
+typedef struct {
+    U16 tableLog;   /* written first by FSE_buildDTable() */
+    U16 fastMode;   /* set to 1 by FSE_buildDTable() when no count reaches 1 << (tableLog-1), else 0 */
+} FSE_DTableHeader;   /* sizeof U32 */
+
+static size_t FSE_buildDTable
+(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{   /* fills `dt` (header cell + 2^tableLog decode cells) from the normalized counts; returns 0, or (size_t)-FSE_ERROR_xxx */
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*)(ptr) + 1;   /* because dt is unsigned, 32-bits aligned on 32-bits */
+    const U32 tableSize = 1 << tableLog;
+    const U32 tableMask = tableSize-1;
+    const U32 step = FSE_tableStep(tableSize);
+    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];   /* per symbol : next state value to hand out in the last pass */
+    U32 position = 0;
+    U32 highThreshold = tableSize-1;   /* cells above this index are taken by the -1 (low probability) symbols */
+    const S16 largeLimit= (S16)(1 << (tableLog-1));
+    U32 noLarge = 1;   /* cleared as soon as one count reaches largeLimit */
+    U32 s;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return (size_t)-FSE_ERROR_maxSymbolValue_tooLarge;
+    if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_tableLog_tooLarge;
+
+    /* Init, lay down lowprob symbols */
+    DTableH[0].tableLog = (U16)tableLog;
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        if (normalizedCounter[s]==-1)
+        {
+            tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;   /* one cell, taken from the top of the table */
+            symbolNext[s] = 1;
+        }
+        else
+        {
+            if (normalizedCounter[s] >= largeLimit) noLarge=0;
+            symbolNext[s] = normalizedCounter[s];
+        }
+    }
+
+    /* Spread symbols */
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        int i;
+        for (i=0; i<normalizedCounter[s]; i++)   /* runs zero times for counts of 0 and -1 */
+        {
+            tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+            position = (position + step) & tableMask;
+            while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }
+    }
+
+    if (position!=0) return (size_t)-FSE_ERROR_GENERIC;   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+
+    /* Build Decoding table */
+    {
+        U32 i;
+        for (i=0; i<tableSize; i++)
+        {
+            FSE_FUNCTION_TYPE symbol = (FSE_FUNCTION_TYPE)(tableDecode[i].symbol);
+            U16 nextState = symbolNext[symbol]++;   /* each cell of a symbol receives the next consecutive state value */
+            tableDecode[i].nbBits = (BYTE) (tableLog - FSE_highbit32 ((U32)nextState) );
+            tableDecode[i].newState = (U16) ( (nextState << tableDecode[i].nbBits) - tableSize);
+        }
+    }
+
+    DTableH->fastMode = (U16)noLarge;
+    return 0;
+}
+
+
+/******************************************
+*  FSE byte symbol
+******************************************/
+#ifndef FSE_COMMONDEFS_ONLY
+
+static unsigned FSE_isError(size_t code) { return (code > (size_t)(-FSE_ERROR_maxCode)); }
+
+static short FSE_abs(short a)
+{   /* magnitude of `a`; the negation is evaluated as int and narrowed back to short by the return */
+    if (a >= 0) return a; else return -a;
+}
+
+
+/****************************************************************
+*  Header bitstream management
+****************************************************************/
+static size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{   /* decodes the table log and per-symbol counts from headerBuffer; returns the bytes consumed, or (size_t)-FSE_ERROR_xxx */
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;   /* index of the next symbol whose count gets written */
+    int previous0 = 0;      /* set when the count just decoded was 0 : a zero-run field follows */
+
+    if (hbSize < 4) return (size_t)-FSE_ERROR_srcSize_wrong;   /* the first read below takes 4 bytes */
+    bitStream = FSE_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return (size_t)-FSE_ERROR_tableLog_tooLarge;
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;   /* the loop ends once this drops to 1, and success requires exactly 1 */
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr))
+    {
+        if (previous0)
+        {
+            unsigned n0 = charnum;   /* symbol index reached once the zero run is applied */
+            while ((bitStream & 0xFFFF) == 0xFFFF)   /* 16 set bits : 24 more zero-count symbols */
+            {
+                n0+=24;
+                if (ip < iend-5)
+                {
+                    ip+=2;
+                    bitStream = FSE_readLE32(ip) >> bitCount;
+                }
+                else
+                {
+                    bitStream >>= 16;
+                    bitCount+=16;
+                }
+            }
+            while ((bitStream & 3) == 3)   /* 2-bit value 3 : 3 more zero-count symbols, and the run continues */
+            {
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            n0 += bitStream & 3;   /* final 2-bit value (0-2) ends the run */
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return (size_t)-FSE_ERROR_maxSymbolValue_tooSmall;
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))   /* advance only if a 4-byte read stays inside the buffer */
+            {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = FSE_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;
+        }
+        {
+            const short max = (short)((2*threshold-1)-remaining);   /* values below max are coded on nbBits-1 bits */
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max)
+            {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            }
+            else
+            {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSE_abs(count);   /* a count of -1 consumes one unit, like a count of 1 */
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            while (remaining < threshold)   /* fewer units left : later values need fewer bits */
+            {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            {
+                if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+                {
+                    ip += bitCount>>3;
+                    bitCount &= 7;
+                }
+                else
+                {
+                    bitCount -= (int)(8 * (iend - 4 - ip));   /* pin ip to the last 4 bytes and keep the bit offset relative to it */
+                    ip = iend - 4;
+                }
+                bitStream = FSE_readLE32(ip) >> (bitCount & 31);
+            }
+        }
+    }
+    if (remaining != 1) return (size_t)-FSE_ERROR_GENERIC;
+    *maxSVPtr = charnum-1;   /* report the last symbol index actually written */
+
+    ip += (bitCount+7)>>3;   /* round the consumed bits up to whole bytes */
+    if ((size_t)(ip-istart) > hbSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+    return ip-istart;
+}
+
+
+/*********************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+/* FSE_buildDTable_rle :
+ * Fills dt with a one-cell table : tableLog 0, fastMode 0, and a single cell
+ * that decodes symbolValue while reading 0 bits.
+ * return : always 0.
+ */
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+    void* const base = dt;
+    FSE_DTableHeader* const header = (FSE_DTableHeader*)base;
+    FSE_decode_t* const onlyCell = ((FSE_decode_t*)base) + 1;   /* cells start one FSE_decode_t after the table start */
+
+    header->fastMode = 0;
+    header->tableLog = 0;
+
+    onlyCell->nbBits = 0;
+    onlyCell->symbol = symbolValue;
+    onlyCell->newState = 0;
+
+    return 0;
+}
+
+
+/* FSE_buildDTable_raw :
+ * Fills dt with an identity table of 1<<nbBits cells : cell s decodes symbol s,
+ * reads nbBits bits and always goes back to state 0. fastMode is set to 1.
+ * return : 0, or an error code when nbBits < 1.
+ */
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+    void* const base = dt;
+    FSE_DTableHeader* const header = (FSE_DTableHeader*)base;
+    FSE_decode_t* const cells = ((FSE_decode_t*)base) + 1;   /* cells start one FSE_decode_t after the table start */
+    const unsigned cellCount = 1 << nbBits;
+    const unsigned lastSymbol = cellCount - 1;
+    unsigned symbol = 0;
+
+    /* a table needs at least one bit per symbol */
+    if (nbBits < 1) return (size_t)-FSE_ERROR_GENERIC;
+
+    header->tableLog = (U16)nbBits;
+    header->fastMode = 1;
+
+    while (symbol <= lastSymbol)
+    {
+        FSE_decode_t* const cell = cells + symbol;
+        cell->newState = 0;
+        cell->symbol = (BYTE)symbol;
+        cell->nbBits = (BYTE)nbBits;
+        symbol++;
+    }
+
+    return 0;
+}
+
+
+/* FSE_initDStream
+ * Prepares bitD for reading the bitstream stored in srcBuffer[0 .. srcSize).
+ * The stream is read from its end : the last byte must be non-zero, its
+ * highest set bit being the stop bit that marks where the payload starts.
+ * return : srcSize on success,
+ *          an error code when srcSize is 0 or when the last byte is 0 (no stop bit).
+ */
+static size_t FSE_initDStream(FSE_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    const BYTE* const bytes = (const BYTE*)srcBuffer;
+    U32 lastByte;
+
+    if (srcSize < 1) return (size_t)-FSE_ERROR_srcSize_wrong;
+
+    bitD->start = (const char*)srcBuffer;
+
+    if (srcSize >= sizeof(size_t))
+    {
+        /* enough input to fill the whole container from the last bytes of the buffer */
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);
+        bitD->bitContainer = FSE_readLEST(bitD->ptr);
+        lastByte = bytes[srcSize-1];
+        if (lastByte == 0) return (size_t)-FSE_ERROR_GENERIC;   /* stop bit not present */
+        bitD->bitsConsumed = 8 - FSE_highbit32(lastByte);
+        return srcSize;
+    }
+
+    /* short input : assemble the container byte by byte (cases fall through on purpose) */
+    bitD->ptr   = bitD->start;
+    bitD->bitContainer = bytes[0];
+    switch(srcSize)
+    {
+        case 7: bitD->bitContainer += (size_t)(bytes[6]) << (sizeof(size_t)*8 - 16);
+        case 6: bitD->bitContainer += (size_t)(bytes[5]) << (sizeof(size_t)*8 - 24);
+        case 5: bitD->bitContainer += (size_t)(bytes[4]) << (sizeof(size_t)*8 - 32);
+        case 4: bitD->bitContainer += (size_t)(bytes[3]) << 24;
+        case 3: bitD->bitContainer += (size_t)(bytes[2]) << 16;
+        case 2: bitD->bitContainer += (size_t)(bytes[1]) <<  8;
+        default:;
+    }
+    lastByte = bytes[srcSize-1];
+    if (lastByte == 0) return (size_t)-FSE_ERROR_GENERIC;   /* stop bit not present */
+    bitD->bitsConsumed = 8 - FSE_highbit32(lastByte);
+    bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8;   /* the container bytes that the input could not fill count as consumed */
+
+    return srcSize;
+}
+
+
+/*!FSE_lookBits
+ * Peeks at the next nbBits bits of the bitContainer.
+ * Neither the container nor bitsConsumed is modified, so the same bits are
+ * still there for the next look/read.
+ * On 32-bits, maxNbBits==25
+ * On 64-bits, maxNbBits==57
+ * return : value extracted.
+ */
+static size_t FSE_lookBits(FSE_DStream_t* bitD, U32 nbBits)
+{
+    const U32 topBit = sizeof(bitD->bitContainer)*8 - 1;
+    const U32 skipped = bitD->bitsConsumed & topBit;    /* already-read bits, pushed out by the left shift */
+    const U32 dropped = (topBit - nbBits) & topBit;     /* low bits that are not part of the request */
+
+    /* the right shift is split in two (1, then dropped) so that nbBits==0 yields 0 */
+    return ((bitD->bitContainer << skipped) >> 1) >> dropped;
+}
+
+static size_t FSE_lookBitsFast(FSE_DStream_t* bitD, U32 nbBits)   /* only if nbBits >= 1 !! */
+{
+    /* same as FSE_lookBits(), with a single right shift */
+    const U32 topBit = sizeof(bitD->bitContainer)*8 - 1;
+    const U32 skipped = bitD->bitsConsumed & topBit;
+    const U32 dropped = ((topBit+1) - nbBits) & topBit;
+
+    return (bitD->bitContainer << skipped) >> dropped;
+}
+
+/* FSE_skipBits : marks nbBits more bits of the container as consumed (no reload, no bound check) */
+static void FSE_skipBits(FSE_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed = bitD->bitsConsumed + nbBits;
+}
+
+
+/*!FSE_readBits
+ * Extracts the next nbBits bits from the bitContainer and marks them as consumed.
+ * On 32-bits, don't read more than maxNbBits==25
+ * On 64-bits, don't read more than maxNbBits==57
+ * Use the fast variant *only* if n >= 1.
+ * return : value extracted.
+ */
+static size_t FSE_readBits(FSE_DStream_t* bitD, U32 nbBits)
+{
+    const size_t bits = FSE_lookBits(bitD, nbBits);
+    bitD->bitsConsumed += nbBits;   /* same effect as FSE_skipBits() */
+    return bits;
+}
+
+static size_t FSE_readBitsFast(FSE_DStream_t* bitD, U32 nbBits)   /* only if nbBits >= 1 !! */
+{
+    /* look, then consume : identical to FSE_readBits() but built on the single-shift look */
+    const size_t bits = FSE_lookBitsFast(bitD, nbBits);
+    bitD->bitsConsumed += nbBits;   /* same effect as FSE_skipBits() */
+    return bits;
+}
+
+/* FSE_reloadDStream :
+ * Refills the bitContainer by moving bitD->ptr back towards bitD->start by the
+ * number of whole bytes already consumed.
+ * return : FSE_DStream_tooFar      when more bits were consumed than the container holds,
+ *          FSE_DStream_unfinished  when the refill could move back by all consumed bytes,
+ *          FSE_DStream_endOfBuffer when ptr has reached start but unread bits remain,
+ *          FSE_DStream_completed   when ptr is at start and every bit is consumed.
+ */
+static unsigned FSE_reloadDStream(FSE_DStream_t* bitD)
+{
+    const size_t containerBytes = sizeof(bitD->bitContainer);
+    const size_t containerBits = containerBytes * 8;
+    U32 stepBack;
+    U32 status;
+
+    if (bitD->bitsConsumed > containerBits)  /* should never happen */
+        return FSE_DStream_tooFar;
+
+    /* common case : at least one full container still lies between start and ptr */
+    if (bitD->ptr >= bitD->start + containerBytes)
+    {
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;
+        bitD->bitContainer = FSE_readLEST(bitD->ptr);
+        return FSE_DStream_unfinished;
+    }
+
+    /* already at the start of the buffer : nothing left to load */
+    if (bitD->ptr == bitD->start)
+    {
+        if (bitD->bitsConsumed < containerBits) return FSE_DStream_endOfBuffer;
+        return FSE_DStream_completed;
+    }
+
+    /* close to the start : step back as far as possible without going before start */
+    stepBack = bitD->bitsConsumed >> 3;
+    status = FSE_DStream_unfinished;
+    if (bitD->ptr - stepBack < bitD->start)
+    {
+        stepBack = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+        status = FSE_DStream_endOfBuffer;
+    }
+    bitD->ptr -= stepBack;
+    bitD->bitsConsumed -= stepBack*8;
+    bitD->bitContainer = FSE_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+    return status;
+}
+
+
+/* FSE_initDState :
+ * Binds DStatePtr to the cells of dt (which follow the table header) and reads
+ * the initial state, tableLog bits wide, from bitD. The stream is reloaded afterwards.
+ */
+static void FSE_initDState(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD, const FSE_DTable* dt)
+{
+    const void* const base = dt;
+    const FSE_DTableHeader* const header = (const FSE_DTableHeader*)base;
+
+    DStatePtr->table = dt + 1;
+    DStatePtr->state = FSE_readBits(bitD, header->tableLog);
+    FSE_reloadDStream(bitD);
+}
+
+/* FSE_decodeSymbol :
+ * Returns the symbol attached to the current state, then moves to the next
+ * state : newState of the current cell plus nbBits bits read from bitD.
+ */
+static BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD)
+{
+    const FSE_decode_t* const cell = ((const FSE_decode_t*)(DStatePtr->table)) + DStatePtr->state;
+    const BYTE decoded = cell->symbol;
+    const size_t extraBits = FSE_readBits(bitD, cell->nbBits);
+
+    DStatePtr->state = cell->newState + extraBits;
+    return decoded;
+}
+
+/* FSE_decodeSymbolFast :
+ * Same as FSE_decodeSymbol(), but reads the bits with FSE_readBitsFast(),
+ * which is only valid when the cell's nbBits is >= 1.
+ */
+static BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD)
+{
+    const FSE_decode_t* const cell = ((const FSE_decode_t*)(DStatePtr->table)) + DStatePtr->state;
+    const BYTE decoded = cell->symbol;
+    const size_t extraBits = FSE_readBitsFast(bitD, cell->nbBits);
+
+    DStatePtr->state = cell->newState + extraBits;
+    return decoded;
+}
+
+/* FSE_endOfDStream
+   Returns 1 when bitD is fully read : ptr is back at start and every bit of
+   the container has been consumed. Returns 0 otherwise. */
+
+static unsigned FSE_endOfDStream(const FSE_DStream_t* bitD)
+{
+    if (bitD->ptr != bitD->start) return 0;
+    return (bitD->bitsConsumed == sizeof(bitD->bitContainer)*8);
+}
+
+/* FSE_endOfDState : returns 1 when the decoding state is back to 0, else 0 */
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return !(DStatePtr->state != 0);
+}
+
+
+FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{   /* FSE_decompress_usingDTable_generic :
+     * Decodes the FSE bitstream cSrc[0 .. cSrcSize) into dst using the prebuilt table dt.
+     * Two decoding states (state1, state2) are read from the same bitstream and
+     * emit symbols alternately.
+     *  fast   : non-zero selects FSE_decodeSymbolFast() for every symbol
+     *  return : number of bytes written into dst, or a negated FSE_ERROR_* code cast to size_t
+     *           (dstSize_tooSmall when dst is full before the stream ends, corruptionDetected otherwise)
+     */
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;   /* the main loop writes 4 bytes per iteration */
+
+    FSE_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+    size_t errorCode;
+
+    /* Init */
+    errorCode = FSE_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+    if (FSE_isError(errorCode)) return errorCode;
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (FSE_reloadDStream(&bitD)==FSE_DStream_unfinished) && (op<olimit) ; op+=4)
+    {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            FSE_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (FSE_reloadDStream(&bitD) > FSE_DStream_unfinished) { op+=2; break; } }   /* keep the 2 symbols already written and finish in the tail loop */
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            FSE_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : FSE_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly FSE_DStream_completed */
+    while (1)   /* one symbol at a time, alternating state1 / state2 */
+    {
+        if ( (FSE_reloadDStream(&bitD)>FSE_DStream_completed) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state1);
+
+        if ( (FSE_reloadDStream(&bitD)>FSE_DStream_completed) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state2);
+    }
+
+    /* end ? */
+    if (FSE_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2))   /* success requires the stream fully read and both states back to 0 */
+        return op-ostart;
+
+    if (op==omax) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* dst buffer is full, but cSrc unfinished */
+
+    return (size_t)-FSE_ERROR_corruptionDetected;
+}
+
+
+/* FSE_decompress_usingDTable :
+ * Decodes cSrc into dst (at most originalSize bytes) with the prebuilt table dt.
+ * The fastMode flag stored in the table header picks the decoding variant; it is
+ * passed as a literal so that each call can be specialized by the compiler.
+ * return : whatever FSE_decompress_usingDTable_generic() returns.
+ */
+static size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    FSE_DTableHeader header;
+
+    /* copy the header out with memcpy() rather than a cast, to avoid strict aliasing warnings */
+    memcpy(&header, dt, sizeof(header));
+
+    if (!header.fastMode)
+        return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+}
+
+
+/* FSE_decompress :
+ * Decodes a complete FSE block : reads the normalized counts at the start of
+ * cSrc, builds the decoding table from them, then decodes the rest into dst.
+ * return : number of bytes written into dst, or an error code
+ *          (srcSize_wrong when cSrcSize < 2 or when the header uses up the whole input).
+ */
+static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    short counting[FSE_MAX_SYMBOL_VALUE+1];
+    DTable_max_t dt;   /* filled by FSE_buildDTable() below */
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    unsigned tableLog;
+    size_t headerSize;
+    size_t buildResult;
+
+    if (cSrcSize<2) return (size_t)-FSE_ERROR_srcSize_wrong;   /* too small input size */
+
+    /* step 1 : header */
+    headerSize = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+    if (FSE_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong;   /* too small input size */
+
+    /* step 2 : decoding table */
+    buildResult = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog);
+    if (FSE_isError(buildResult)) return buildResult;
+
+    /* step 3 : payload; the result is returned as is, even if it is an error code */
+    return FSE_decompress_usingDTable (dst, maxDstSize, istart + headerSize, cSrcSize - headerSize, dt);
+}
+
+
+
+/* *******************************************************
+*  Huff0 : Huffman block compression
+*********************************************************/
+#define HUF_MAX_SYMBOL_VALUE 255       /* highest symbol index; sizes huffWeight[] in HUF_readDTable() */
+#define HUF_DEFAULT_TABLELOG  12       /* used by default, when not specified */
+#define HUF_MAX_TABLELOG  12           /* max possible tableLog; for allocation purpose; can be modified */
+#define HUF_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#if (HUF_MAX_TABLELOG > HUF_ABSOLUTEMAX_TABLELOG)
+#  error "HUF_MAX_TABLELOG is too large !"
+#endif
+
+typedef struct HUF_CElt_s {   /* NOTE(review): compression-side type, not referenced by the decoding functions right below -- confirm it is still needed */
+  U16  val;
+  BYTE nbBits;
+} HUF_CElt ;
+
+typedef struct nodeElt_s {   /* NOTE(review): looks like a tree-building node for the compressor; not referenced by the decoding functions right below -- confirm */
+    U32 count;
+    U16 parent;
+    BYTE byte;
+    BYTE nbBits;
+} nodeElt;
+
+
+/* *******************************************************
+*  Huff0 : Huffman block decompression
+*********************************************************/
+typedef struct {   /* one cell of the Huffman decoding table; HUF_readDTable() asserts it is as large as a U16 */
+    BYTE byte;     /* symbol returned by HUF_decodeSymbol() for this cell */
+    BYTE nbBits;   /* number of bits HUF_decodeSymbol() skips after decoding this cell */
+} HUF_DElt;
+
+static size_t HUF_readDTable (U16* DTable, const void* src, size_t srcSize)
+{   /* HUF_readDTable :
+     * Reads a Huffman table description from src and builds the decoding table in DTable.
+     * The first byte of src selects how the symbol weights are stored :
+     *   < 128    : weights are FSE-compressed, on that many bytes
+     *   128..241 : weights are stored raw, two 4-bit weights per byte
+     *   >= 242   : every weight is 1, only their number is coded
+     * The weight of the last symbol is not stored : it is deduced so that the total is a power of 2.
+     *  DTable : on entry, DTable[0] is the largest table log the table can hold;
+     *           on success it is overwritten with the table log in use and the
+     *           following cells receive the HUF_DElt entries.
+     *  return : number of bytes read from src, or a negated FSE_ERROR_* code cast to size_t
+     */
+    BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];  /* large enough for values from 0 to 16 */
+    U32 weightTotal;
+    U32 maxBits;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+    U32 n;
+    U32 nextRankStart;
+    void* ptr = DTable+1;   /* cells start right after the U16 that holds the table log */
+    HUF_DElt* const dt = (HUF_DElt*)ptr;
+
+    if (!srcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+    iSize = ip[0];
+
+    FSE_STATIC_ASSERT(sizeof(HUF_DElt) == sizeof(U16));   /* if compilation fails here, assertion is false */
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* should not be necessary, but some analyzer complain ... */
+    if (iSize >= 128)  /* special header */
+    {
+        if (iSize >= (242))   /* RLE */
+        {
+            static int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };   /* number of stored weights for header bytes 242..255 */
+            oSize = l[iSize-242];
+            memset(huffWeight, 1, sizeof(huffWeight));   /* all weights are 1 */
+            iSize = 0;   /* nothing follows the header byte */
+        }
+        else   /* Incompressible */
+        {
+            oSize = iSize - 127;       /* number of stored weights */
+            iSize = ((oSize+1)/2);     /* two 4-bit weights per byte */
+            if (iSize+1 > srcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+            ip += 1;
+            for (n=0; n<oSize; n+=2)
+            {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+            }
+        }
+    }
+    else  /* header compressed with FSE (normal case) */
+    {
+        if (iSize+1 > srcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+        oSize = FSE_decompress(huffWeight, HUF_MAX_SYMBOL_VALUE, ip+1, iSize);   /* max 255 values decoded, last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankVal, 0, sizeof(rankVal));
+    weightTotal = 0;
+    for (n=0; n<oSize; n++)
+    {
+        if (huffWeight[n] >= HUF_ABSOLUTEMAX_TABLELOG) return (size_t)-FSE_ERROR_corruptionDetected;
+        rankVal[huffWeight[n]]++;
+        weightTotal += (1 << huffWeight[n]) >> 1;   /* weight w adds 2^(w-1); weight 0 adds nothing */
+    }
+    if (weightTotal == 0) return (size_t)-FSE_ERROR_corruptionDetected;
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    maxBits = FSE_highbit32(weightTotal) + 1;
+    if (maxBits > DTable[0]) return (size_t)-FSE_ERROR_tableLog_tooLarge;   /* DTable is too small */
+    DTable[0] = (U16)maxBits;
+    {
+        U32 total = 1 << maxBits;
+        U32 rest = total - weightTotal;
+        U32 verif = 1 << FSE_highbit32(rest);
+        U32 lastWeight = FSE_highbit32(rest) + 1;
+        if (verif != rest) return (size_t)-FSE_ERROR_corruptionDetected;    /* last value must be a clean power of 2 */
+        huffWeight[oSize] = (BYTE)lastWeight;
+        rankVal[lastWeight]++;
+    }
+
+    /* check tree construction validity */
+    if ((rankVal[1] < 2) || (rankVal[1] & 1)) return (size_t)-FSE_ERROR_corruptionDetected;   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* Prepare ranks */
+    nextRankStart = 0;
+    for (n=1; n<=maxBits; n++)   /* turn each per-weight count into the index of the first cell for that weight */
+    {
+        U32 current = nextRankStart;
+        nextRankStart += (rankVal[n] << (n-1));
+        rankVal[n] = current;
+    }
+
+    /* fill DTable */
+    for (n=0; n<=oSize; n++)   /* <= : includes the implied last symbol */
+    {
+        const U32 w = huffWeight[n];
+        const U32 length = (1 << w) >> 1;   /* number of cells for this symbol; 0 when w is 0 */
+        U32 i;
+        HUF_DElt D;
+        D.byte = (BYTE)n; D.nbBits = (BYTE)(maxBits + 1 - w);
+        for (i = rankVal[w]; i < rankVal[w] + length; i++)
+            dt[i] = D;
+        rankVal[w] += length;
+    }
+
+    return iSize+1;   /* header byte + stored weights */
+}
+
+
+/* HUF_decodeSymbol :
+ * Looks at the next dtLog bits of Dstream, uses them as an index into dt,
+ * consumes the number of bits stored in that cell and returns its symbol.
+ */
+static BYTE HUF_decodeSymbol(FSE_DStream_t* Dstream, const HUF_DElt* dt, const U32 dtLog)
+{
+        const HUF_DElt* const cell = dt + FSE_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+        const BYTE decoded = cell->byte;
+
+        FSE_skipBits(Dstream, cell->nbBits);
+        return decoded;
+}
+
+/* HUF_decompress_usingDTable :
+ * Decodes a Huffman-compressed payload made of 4 interleaved bitstreams.
+ * cSrc starts with a 6-byte jump table (three little-endian 16-bit lengths for
+ * streams 1..3); the 4th stream takes whatever remains of cSrcSize.
+ *  DTable : table built by HUF_readDTable(); DTable[0] is the table log
+ *  return : number of bytes written into dst, or a negated FSE_ERROR_* code cast to size_t
+ *           (srcSize_wrong when cSrc cannot hold the jump table plus 4 non-empty streams,
+ *            dstSize_tooSmall when dst is full before stream 1 ends,
+ *            corruptionDetected otherwise)
+ */
+static size_t HUF_decompress_usingDTable(   /* -3% slower when non static */
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-15;   /* the main loop writes 16 bytes per iteration */
+
+    const void* ptr = DTable;
+    const HUF_DElt* const dt = (const HUF_DElt*)(ptr)+1;
+    const U32 dtLog = DTable[0];
+    size_t errorCode;
+    U32 reloadStatus;
+
+    const U16* jumpTable = (const U16*)cSrc;
+    size_t length1;
+    size_t length2;
+    size_t length3;
+    size_t length4;
+    const char* start1;
+    const char* start2;
+    const char* start3;
+    const char* start4;
+    FSE_DStream_t bitD1, bitD2, bitD3, bitD4;
+
+    /* Init */
+
+    /* the jump table must be entirely inside cSrc before any of it is read */
+    if (cSrcSize < 6) return (size_t)-FSE_ERROR_srcSize_wrong;
+
+    length1 = FSE_readLE16(jumpTable);
+    length2 = FSE_readLE16(jumpTable+1);
+    length3 = FSE_readLE16(jumpTable+2);
+
+    /* coherency check : the first 3 streams must leave at least 1 byte for the 4th one */
+    if (length1+length2+length3+6 >= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+
+    length4 = cSrcSize - 6 - length1 - length2 - length3;
+    start1 = (const char*)(cSrc) + 6;
+    start2 = start1 + length1;
+    start3 = start2 + length2;
+    start4 = start3 + length3;
+
+    errorCode = FSE_initDStream(&bitD1, start1, length1);
+    if (FSE_isError(errorCode)) return errorCode;
+    errorCode = FSE_initDStream(&bitD2, start2, length2);
+    if (FSE_isError(errorCode)) return errorCode;
+    errorCode = FSE_initDStream(&bitD3, start3, length3);
+    if (FSE_isError(errorCode)) return errorCode;
+    errorCode = FSE_initDStream(&bitD4, start4, length4);
+    if (FSE_isError(errorCode)) return errorCode;
+
+    reloadStatus=FSE_reloadDStream(&bitD2);
+
+    /* 16 symbols per loop */
+    for ( ; (reloadStatus<FSE_DStream_completed) && (op<olimit);  /* D2-3-4 are supposed to be synchronized and finish together */
+        op+=16, reloadStatus = FSE_reloadDStream(&bitD2) | FSE_reloadDStream(&bitD3) | FSE_reloadDStream(&bitD4), FSE_reloadDStream(&bitD1))
+    {
+        /* _0 : decode only; _1 : also reload on 32-bit targets when HUF_MAX_TABLELOG > 12; _2 : also reload on every 32-bit target */
+#define HUF_DECODE_SYMBOL_0(n, Dstream) \
+        op[n] = HUF_decodeSymbol(&Dstream, dt, dtLog);
+
+#define HUF_DECODE_SYMBOL_1(n, Dstream) \
+        op[n] = HUF_decodeSymbol(&Dstream, dt, dtLog); \
+        if (FSE_32bits() && (HUF_MAX_TABLELOG>12)) FSE_reloadDStream(&Dstream)
+
+#define HUF_DECODE_SYMBOL_2(n, Dstream) \
+        op[n] = HUF_decodeSymbol(&Dstream, dt, dtLog); \
+        if (FSE_32bits()) FSE_reloadDStream(&Dstream)
+
+        HUF_DECODE_SYMBOL_1( 0, bitD1);
+        HUF_DECODE_SYMBOL_1( 1, bitD2);
+        HUF_DECODE_SYMBOL_1( 2, bitD3);
+        HUF_DECODE_SYMBOL_1( 3, bitD4);
+        HUF_DECODE_SYMBOL_2( 4, bitD1);
+        HUF_DECODE_SYMBOL_2( 5, bitD2);
+        HUF_DECODE_SYMBOL_2( 6, bitD3);
+        HUF_DECODE_SYMBOL_2( 7, bitD4);
+        HUF_DECODE_SYMBOL_1( 8, bitD1);
+        HUF_DECODE_SYMBOL_1( 9, bitD2);
+        HUF_DECODE_SYMBOL_1(10, bitD3);
+        HUF_DECODE_SYMBOL_1(11, bitD4);
+        HUF_DECODE_SYMBOL_0(12, bitD1);
+        HUF_DECODE_SYMBOL_0(13, bitD2);
+        HUF_DECODE_SYMBOL_0(14, bitD3);
+        HUF_DECODE_SYMBOL_0(15, bitD4);
+    }
+
+    if (reloadStatus!=FSE_DStream_completed)   /* not complete : some bitStream might be FSE_DStream_unfinished */
+        return (size_t)-FSE_ERROR_corruptionDetected;
+
+    /* tail : the remaining symbols all come from stream 1 */
+    {
+        /* bitTail = bitD1;   // *much* slower : -20% !??! */
+        FSE_DStream_t bitTail;
+        bitTail.ptr = bitD1.ptr;
+        bitTail.bitsConsumed = bitD1.bitsConsumed;
+        bitTail.bitContainer = bitD1.bitContainer;   /* required in case of FSE_DStream_endOfBuffer */
+        bitTail.start = start1;
+        for ( ; (FSE_reloadDStream(&bitTail) < FSE_DStream_completed) && (op<omax) ; op++)
+        {
+            HUF_DECODE_SYMBOL_0(0, bitTail);
+        }
+
+        if (FSE_endOfDStream(&bitTail))
+            return op-ostart;
+    }
+
+    if (op==omax) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* dst buffer is full, but cSrc unfinished */
+
+    return (size_t)-FSE_ERROR_corruptionDetected;
+}
+
+
+/* HUF_decompress :
+ * Decodes a complete Huffman block : reads the table description at the start
+ * of cSrc, then decodes the bytes that follow it into dst.
+ * return : number of bytes written into dst, or an error code
+ *          (srcSize_wrong when the table description uses up the whole input).
+ */
+static size_t HUF_decompress (void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLE(DTable, HUF_MAX_TABLELOG);
+    const BYTE* const istart = (const BYTE*) cSrc;
+    size_t tableSize;
+
+    tableSize = HUF_readDTable (DTable, cSrc, cSrcSize);
+    if (FSE_isError(tableSize)) return tableSize;
+    if (tableSize >= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+
+    /* the payload is everything after the table description */
+    return HUF_decompress_usingDTable (dst, maxDstSize, istart + tableSize, cSrcSize - tableSize, DTable);
+}
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
+
+/*
+    zstd - standard compression library
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/****************************************************************
+*  Tuning parameters
+*****************************************************************/
+/* MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 17 -> 128KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  HASH_LOG, defined further down, is derived from this value (ZSTD_MEMORY_USAGE - 2). */
+#define ZSTD_MEMORY_USAGE 17
+
+
+/**************************************
+   CPU Feature Detection
+**************************************/
+/*
+ * Compile-time detection of efficient unaligned memory access.
+ * ZSTD_UNALIGNED_ACCESS is set to 1 when one of the architecture macros listed
+ * below is defined, or when the build defines
+ * CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS itself; it is set to 0 otherwise.
+ * The list is based on known hardware architectures and is meant to be
+ * extended as feedback comes in.
+ */
+#if defined(CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS) \
+    || defined(__ARM_FEATURE_UNALIGNED) \
+    || defined(__i386__) || defined(__x86_64__) \
+    || defined(_M_IX86) || defined(_M_X64) \
+    || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_8__) \
+    || (defined(_M_ARM) && (_M_ARM >= 7))
+#  define ZSTD_UNALIGNED_ACCESS 1
+#else
+#  define ZSTD_UNALIGNED_ACCESS 0
+#endif
+
+
+/********************************************************
+*  Includes
+*********************************************************/
+#include <stdlib.h>      /* calloc */
+#include <string.h>      /* memcpy, memmove */
+#include <stdio.h>       /* debug : printf */
+
+
+/********************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef __AVX2__
+#  include <immintrin.h>   /* AVX2 intrinsics */
+#endif
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+
+#ifndef MEM_ACCESS_MODULE   /* guard : the basic types below are declared only once per translation unit */
+#define MEM_ACCESS_MODULE
+/********************************************************
+*  Basic Types
+*********************************************************/
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+# include <stdint.h>
+typedef  uint8_t BYTE;
+typedef uint16_t U16;
+typedef  int16_t S16;
+typedef uint32_t U32;
+typedef  int32_t S32;
+typedef uint64_t U64;
+#else   /* before C99 : no <stdint.h>; assumes char/short/int/long long are 8/16/32/64 bits wide */
+typedef unsigned char       BYTE;
+typedef unsigned short      U16;
+typedef   signed short      S16;
+typedef unsigned int        U32;
+typedef   signed int        S32;
+typedef unsigned long long  U64;
+#endif
+
+#endif   /* MEM_ACCESS_MODULE */
+
+
+/********************************************************
+*  Constants
+*********************************************************/
+static const U32 ZSTD_magicNumber = 0xFD2FB51E;   /* 3rd version : seqNb header */
+
+#define HASH_LOG (ZSTD_MEMORY_USAGE - 2)
+#define HASH_TABLESIZE (1 << HASH_LOG)   /* number of U32 entries of hashTable[] in the compression context */
+#define HASH_MASK (HASH_TABLESIZE - 1)
+
+#define KNUTH 2654435761   /* NOTE(review): presumably the multiplier of Knuth's multiplicative hash; not used in the nearby decoding code -- confirm */
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+
+#define KB *(1 <<10)   /* postfix multipliers : "128 KB" expands to "128 *(1 <<10)" */
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BLOCKSIZE (128 KB)                 /* define, for static allocation */
+
+#define WORKPLACESIZE (BLOCKSIZE*3)        /* size of buffer[] in the compression context */
+#define MINMATCH 4
+#define MLbits   7
+#define LLbits   6
+#define Offbits  5
+#define MaxML  ((1<<MLbits )-1)            /* largest value that fits on MLbits bits */
+#define MaxLL  ((1<<LLbits )-1)            /* largest value that fits on LLbits bits */
+#define MaxOff ((1<<Offbits)-1)            /* largest value that fits on Offbits bits */
+#define LitFSELog  11
+#define MLFSELog   10
+#define LLFSELog   10
+#define OffFSELog   9
+#define MAX(a,b) ((a)<(b)?(b):(a))         /* beware : evaluates its arguments more than once */
+#define MaxSeq MAX(MaxLL, MaxML)
+
+#define LITERAL_NOENTROPY 63
+#define COMMAND_NOENTROPY 7   /* to remove */
+
+static const size_t ZSTD_blockHeaderSize = 3;   /* ZSTDv01_getcBlockSize() reads exactly 3 header bytes */
+static const size_t ZSTD_frameHeaderSize = 4;
+
+
+/********************************************************
+*  Memory operations
+*********************************************************/
+/* ZSTD_32bits : returns 1 when pointers are 4 bytes wide, 0 otherwise */
+static unsigned ZSTD_32bits(void)
+{
+    if (sizeof(void*) == 4) return 1;
+    return 0;
+}
+
+/* ZSTD_isLittleEndian : returns 1 when the lowest-addressed byte of a U32 holds its least significant byte, 0 otherwise */
+static unsigned ZSTD_isLittleEndian(void)
+{
+    /* deliberately an automatic variable : the original note says a static one hurts performance */
+    const U32 probe = 1;
+    return ((const BYTE*)&probe)[0];
+}
+
+/* ZSTD_read16 : reads 2 bytes at p in native byte order; p may be unaligned */
+static U16    ZSTD_read16(const void* p)
+{
+    U16 value;
+    memcpy(&value, p, sizeof(U16));
+    return value;
+}
+
+/* ZSTD_read32 : reads 4 bytes at p in native byte order; p may be unaligned */
+static U32    ZSTD_read32(const void* p)
+{
+    U32 value;
+    memcpy(&value, p, sizeof(U32));
+    return value;
+}
+
+/* ZSTD_copy4 : copies exactly 4 bytes from src to dst */
+static void   ZSTD_copy4(void* dst, const void* src)
+{
+    memcpy(dst, src, 4);
+}
+
+/* ZSTD_copy8 : copies exactly 8 bytes from src to dst */
+static void   ZSTD_copy8(void* dst, const void* src)
+{
+    memcpy(dst, src, 8);
+}
+
+/* COPY8 : copies 8 bytes from s to d, then advances both pointers by 8 */
+#define COPY8(d,s)    { ZSTD_copy8(d,s); d+=8; s+=8; }
+
+/* ZSTD_wildcopy :
+ * Copies length bytes from src to dst in chunks of 8, so up to 7 bytes past
+ * dst+length can be written (and past src+length read) : both buffers need
+ * that much slack. Nothing is copied when length <= 0.
+ */
+static void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* from = (const BYTE*)src;
+    BYTE* to = (BYTE*)dst;
+    BYTE* const stop = to + length;
+
+    for ( ; to < stop; to += 8, from += 8)
+        ZSTD_copy8(to, from);
+}
+
+/* ZSTD_readLE16 : reads a 16-bit little-endian value at memPtr, whatever the host byte order */
+static U16 ZSTD_readLE16(const void* memPtr)
+{
+    const BYTE* bytes;
+
+    if (ZSTD_isLittleEndian()) return ZSTD_read16(memPtr);
+
+    /* big-endian host : assemble the value byte by byte */
+    bytes = (const BYTE*)memPtr;
+    return (U16)((U16)bytes[0] + ((U16)bytes[1]<<8));
+}
+
+
+/* ZSTD_readLE32 : reads a 32-bit little-endian value at memPtr, whatever the host byte order */
+static U32 ZSTD_readLE32(const void* memPtr)
+{
+    const BYTE* bytes;
+
+    if (ZSTD_isLittleEndian()) return ZSTD_read32(memPtr);
+
+    /* big-endian host : assemble the value byte by byte */
+    bytes = (const BYTE*)memPtr;
+    return (U32)((U32)bytes[0] + ((U32)bytes[1]<<8) + ((U32)bytes[2]<<16) + ((U32)bytes[3]<<24));
+}
+
+/* ZSTD_readBE32 : reads a 32-bit big-endian value at memPtr (first byte is the most significant one) */
+static U32 ZSTD_readBE32(const void* memPtr)
+{
+    const BYTE* const bytes = (const BYTE*)memPtr;
+    U32 value = 0;
+    int i;
+
+    for (i = 0; i < 4; i++)
+        value = (value << 8) + (U32)bytes[i];
+
+    return value;
+}
+
+
+/**************************************
+*  Local structures
+***************************************/
+typedef struct ZSTD_Cctx_s ZSTD_Cctx;
+
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;   /* order matters : ZSTDv01_getcBlockSize() casts the top 2 bits of the block header to this type */
+
+typedef struct
+{
+    blockType_t blockType;
+    U32 origSize;   /* set by ZSTDv01_getcBlockSize() : size field of the header for bt_rle blocks, 0 for the other types */
+} blockProperties_t;
+
+typedef struct {   /* NOTE(review): each *Start / cursor pair looks like the base and the write position of one stream; not used by the nearby decoding code -- confirm */
+    void* buffer;
+    U32*  offsetStart;
+    U32*  offset;
+    BYTE* offCodeStart;
+    BYTE* offCode;
+    BYTE* litStart;
+    BYTE* lit;
+    BYTE* litLengthStart;
+    BYTE* litLength;
+    BYTE* matchLengthStart;
+    BYTE* matchLength;
+    BYTE* dumpsStart;
+    BYTE* dumps;
+} seqStore_t;
+
+
+typedef struct ZSTD_Cctx_s
+{
+    const BYTE* base;
+    U32 current;
+    U32 nextUpdate;
+    seqStore_t seqStore;
+#ifdef __AVX2__
+    __m256i hashTable[HASH_TABLESIZE>>3];   /* 8 x 32-bit lanes per __m256i : same total size as the U32 variant */
+#else
+    U32 hashTable[HASH_TABLESIZE];
+#endif
+    BYTE buffer[WORKPLACESIZE];
+} cctxi_t;
+
+
+
+
+/**************************************
+*  Error Management
+**************************************/
+/* published entry point :
+ * tells whether a size_t result is an error code, by forwarding to ERR_isError() */
+unsigned ZSTDv01_isError(size_t code)
+{
+    return ERR_isError(code);
+}
+
+
+/**************************************
+*  Tool functions
+**************************************/
+#define ZSTD_VERSION_MAJOR    0    /* for breaking interface changes  */
+#define ZSTD_VERSION_MINOR    1    /* for new (non-breaking) interface capabilities */
+#define ZSTD_VERSION_RELEASE  3    /* for tweaks, bug-fixes, or development */
+#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)   /* single comparable integer : 0.1.3 gives 103 */
+
+/**************************************************************
+*   Decompression code
+**************************************************************/
+
+/* ZSTDv01_getcBlockSize :
+ * Parses the 3-byte block header at src.
+ * Header layout : the top 2 bits of byte 0 are the block type, the low 3 bits
+ * of byte 0 followed by bytes 1 and 2 form a 19-bit big-endian size field.
+ *  bpPtr  : receives the block type, and the size field as origSize for bt_rle blocks (0 otherwise)
+ *  return : 0 for bt_end, 1 for bt_rle, the size field for the other types,
+ *           or an error code when srcSize < 3.
+ */
+size_t ZSTDv01_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const header = (const BYTE* const)src;
+    blockType_t type;
+    U32 sizeField;
+
+    if (srcSize < 3) return ERROR(srcSize_wrong);
+
+    sizeField = header[2] + (header[1]<<8) + ((header[0] & 7)<<16);
+    type = (blockType_t)(header[0] >> 6);
+
+    bpPtr->blockType = type;
+    if (type == bt_rle)
+        bpPtr->origSize = sizeField;
+    else
+        bpPtr->origSize = 0;
+
+    switch (type)
+    {
+    case bt_end: return 0;
+    case bt_rle: return 1;
+    default:     return sizeField;
+    }
+}
+
+
+static size_t ZSTD_copyUncompressedBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{   /* Copies a raw (stored) block verbatim into dst ; returns srcSize, or dstSize_tooSmall when it does not fit in maxDstSize. */
+    if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall);
+    if (srcSize > 0) memcpy(dst, src, srcSize);   /* skip empty copies : passing a NULL pointer to memcpy() is undefined even for 0 bytes */
+    return srcSize;
+}
+
+
+static size_t ZSTD_decompressLiterals(void* ctx,
+                                      void* dst, size_t maxDstSize,
+                                const void* src, size_t srcSize)
+{   /* Decodes a compressed literals sub-block with HUF_decompress() into the LAST litSize bytes of dst ; returns litSize or an error code. */
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + maxDstSize;
+    const BYTE* ip = (const BYTE*)src;
+    size_t errorCode;
+    size_t litSize;
+
+    /* check : minimum 2, for litSize, +1, for content */
+    if (srcSize <= 3) return ERROR(corruption_detected);
+
+    litSize = ip[1] + (ip[0]<<8);   /* low 16 bits of the regenerated size, stored big-endian */
+    litSize += ((ip[-3] >> 3) & 7) << 16;   /* bits 16-18 are read 3 bytes BEFORE src : presumably the first byte of the sub-block header the caller just skipped - confirm ZSTD_blockHeaderSize is 3 */
+    (void)ctx;   /* unused */
+
+    if (litSize > maxDstSize) return ERROR(dstSize_tooSmall);   /* validated before deriving op, so no pointer below dst is ever formed */
+    op = oend - litSize;
+    errorCode = HUF_decompress(op, litSize, ip+2, srcSize-2);
+    if (FSE_isError(errorCode)) return ERROR(GENERIC);
+    return litSize;
+}
+
+
+size_t ZSTDv01_decodeLiteralsBlock(void* ctx,
+                                void* dst, size_t maxDstSize,
+                          const BYTE** litStart, size_t* litSize,
+                          const void* src, size_t srcSize)
+{   /* Regenerates the literals of a block ; sets *litStart / *litSize and returns the number of src bytes consumed, or an error code. */
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* ip = istart;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + maxDstSize;
+    blockProperties_t litbp;
+
+    size_t litcSize = ZSTDv01_getcBlockSize(src, srcSize, &litbp);   /* also rejects srcSize < 3 */
+    if (ZSTDv01_isError(litcSize)) return litcSize;
+    if (litcSize > srcSize - ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    ip += ZSTD_blockHeaderSize;
+
+    switch(litbp.blockType)
+    {
+    case bt_raw:   /* stored literals : referenced in place inside src, nothing is copied */
+        *litStart = ip;
+        ip += litcSize;
+        *litSize = litcSize;
+        break;
+    case bt_rle:   /* one byte repeated origSize times, materialised at the end of dst */
+        {
+            size_t rleSize = litbp.origSize;
+            if (rleSize>maxDstSize) return ERROR(dstSize_tooSmall);
+            if (!srcSize) return ERROR(srcSize_wrong);
+            if (rleSize > 0) memset(oend - rleSize, *ip, rleSize);   /* guard : memset() on a NULL dst is undefined even for 0 bytes */
+            *litStart = oend - rleSize;
+            *litSize = rleSize;
+            ip++;
+            break;
+        }
+    case bt_compressed:   /* entropy-coded literals, decoded into the last decodedLitSize bytes of dst */
+        {
+            size_t decodedLitSize = ZSTD_decompressLiterals(ctx, dst, maxDstSize, ip, litcSize);
+            if (ZSTDv01_isError(decodedLitSize)) return decodedLitSize;
+            *litStart = oend - decodedLitSize;
+            *litSize = decodedLitSize;
+            ip += litcSize;
+            break;
+        }
+    case bt_end:   /* an end marker is not a valid literals sub-block */
+    default:
+        return ERROR(GENERIC);
+    }
+
+    return ip-istart;
+}
+
+
+size_t ZSTDv01_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
+                         FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb,
+                         const void* src, size_t srcSize)
+{   /* Parses the header of the sequences section : sequence count, "dumps" area, and the 3 decoding tables. Returns the bytes consumed, or an error code. */
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* ip = istart;
+    const BYTE* const iend = istart + srcSize;
+    U32 LLtype, Offtype, MLtype;
+    U32 LLlog, Offlog, MLlog;
+    size_t dumpsLength;
+
+    /* check */
+    if (srcSize < 5) return ERROR(srcSize_wrong);
+
+    /* SeqHead */
+    *nbSeq = ZSTD_readLE16(ip); ip+=2;
+    LLtype  = *ip >> 6;          /* flag byte, bits 7-6 : literal-length table type */
+    Offtype = (*ip >> 4) & 3;    /* bits 5-4 : offset table type */
+    MLtype  = (*ip >> 2) & 3;    /* bits 3-2 : match-length table type */
+    if (*ip & 2)                 /* bit 1 set : dumpsLength is the next 2 bytes, high byte first */
+    {
+        dumpsLength  = ip[2];
+        dumpsLength += ip[1] << 8;
+        ip += 3;
+    }
+    else                         /* bit 1 clear : 9-bit dumpsLength, bit 0 of the flag byte is its high bit */
+    {
+        dumpsLength  = ip[1];
+        dumpsLength += (ip[0] & 1) << 8;
+        ip += 2;
+    }
+    *dumpsPtr = ip;
+    ip += dumpsLength;
+    *dumpsLengthPtr = dumpsLength;
+
+    /* check */
+    if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */
+
+    /* sequences */
+    {
+        S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL and MaxOff */
+        size_t headerSize;
+
+        /* Build DTables */
+        switch(LLtype)   /* the type codes are compared against the blockType_t enumerators ; 0 and 3 fall to the FSE-coded default */
+        {
+        case bt_rle :
+            LLlog = 0;
+            FSE_buildDTable_rle(DTableLL, *ip++); break;   /* no bound check of its own : the (ip > iend-3) test above covers this read */
+        case bt_raw :
+            LLlog = LLbits;
+            FSE_buildDTable_raw(DTableLL, LLbits); break;
+        default :
+            {   U32 max = MaxLL;
+                headerSize = FSE_readNCount(norm, &max, &LLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (LLlog > LLFSELog) return ERROR(corruption_detected);   /* DTableLL must not be asked for a larger log than this */
+                ip += headerSize;
+                FSE_buildDTable(DTableLL, norm, max, LLlog);
+        }   }
+
+        switch(Offtype)   /* same scheme as above, for the offset table */
+        {
+        case bt_rle :
+            Offlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong); /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableOffb, *ip++); break;
+        case bt_raw :
+            Offlog = Offbits;
+            FSE_buildDTable_raw(DTableOffb, Offbits); break;
+        default :
+            {   U32 max = MaxOff;
+                headerSize = FSE_readNCount(norm, &max, &Offlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (Offlog > OffFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableOffb, norm, max, Offlog);
+        }   }
+
+        switch(MLtype)   /* same scheme as above, for the match-length table */
+        {
+        case bt_rle :
+            MLlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong); /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableML, *ip++); break;
+        case bt_raw :
+            MLlog = MLbits;
+            FSE_buildDTable_raw(DTableML, MLbits); break;
+        default :
+            {   U32 max = MaxML;
+                headerSize = FSE_readNCount(norm, &max, &MLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (MLlog > MLFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableML, norm, max, MLlog);
+    }   }   }
+
+    return ip-istart;
+}
+
+
+typedef struct {   /* one decoded sequence, produced by ZSTD_decodeSequence() and consumed by ZSTD_execSequence() */
+    size_t litLength;     /* number of literal bytes copied before the match */
+    size_t offset;        /* match distance : the match source starts at (output position - offset) */
+    size_t matchLength;   /* number of match bytes, MINMATCH already added */
+} seq_t;
+
+typedef struct {   /* decoder state carried from one ZSTD_decodeSequence() call to the next */
+    FSE_DStream_t DStream;     /* bitstream shared by the three FSE states below */
+    FSE_DState_t stateLL;      /* literal-length decoding state */
+    FSE_DState_t stateOffb;    /* offset-code decoding state */
+    FSE_DState_t stateML;      /* match-length decoding state */
+    size_t prevOffset;         /* older offset, used as the repeat offset when the literal length is 0 */
+    const BYTE* dumps;         /* read cursor in the "dumps" area (extra bytes for long lengths) */
+    const BYTE* dumpsEnd;      /* end of that area */
+} seqState_t;
+
+
+static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
+{   /* Decodes the next (litLength, offset, matchLength) triple. *seq is in/out : on entry it must still hold the previous sequence. */
+    size_t litLength;
+    size_t prevOffset;
+    size_t offset;
+    size_t matchLength;
+    const BYTE* dumps = seqState->dumps;
+    const BYTE* const de = seqState->dumpsEnd;
+
+    /* Literal length */
+    litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));
+    prevOffset = litLength ? seq->offset : seqState->prevOffset;   /* repeat offset : the previous sequence's offset, or the one before it when there are no literals */
+    seqState->prevOffset = seq->offset;
+    if (litLength == MaxLL)   /* escape value : the real length continues in the dumps area */
+    {
+        U32 add = dumps<de ? *dumps++ : 0;
+        if (add < 255) litLength += add;
+        else   /* 255 : the length is a full 24-bit little-endian value, if 3 more bytes are available */
+        {
+            if (dumps<=(de-3))
+            {
+                litLength = ZSTD_readLE32(dumps) & 0xFFFFFF;  /* no pb : dumps is always followed by seq tables > 1 byte */
+                dumps += 3;
+            }
+        }
+    }
+
+    /* Offset */
+    {
+        U32 offsetCode, nbBits;
+        offsetCode = FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));
+        if (ZSTD_32bits()) FSE_reloadDStream(&(seqState->DStream));
+        nbBits = offsetCode - 1;   /* code N >= 1 means : offset = 2^(N-1) + (N-1) extra bits */
+        if (offsetCode==0) nbBits = 0;   /* cmove */
+        offset = ((size_t)1 << (nbBits & ((sizeof(offset)*8)-1))) + FSE_readBits(&(seqState->DStream), nbBits);   /* the mask keeps the shift count below the width of size_t */
+        if (ZSTD_32bits()) FSE_reloadDStream(&(seqState->DStream));
+        if (offsetCode==0) offset = prevOffset;   /* code 0 : reuse the repeat offset selected above */
+    }
+
+    /* MatchLength */
+    matchLength = FSE_decodeSymbol(&(seqState->stateML), &(seqState->DStream));
+    if (matchLength == MaxML)   /* same escape scheme as for the literal length */
+    {
+        U32 add = dumps<de ? *dumps++ : 0;
+        if (add < 255) matchLength += add;
+        else
+        {
+            if (dumps<=(de-3))
+            {
+                matchLength = ZSTD_readLE32(dumps) & 0xFFFFFF;  /* no pb : dumps is always followed by seq tables > 1 byte */
+                dumps += 3;
+            }
+        }
+    }
+    matchLength += MINMATCH;   /* stored lengths are relative to the minimum match length */
+
+    /* save result */
+    seq->litLength = litLength;
+    seq->offset = offset;
+    seq->matchLength = matchLength;
+    seqState->dumps = dumps;
+}
+
+
+static size_t ZSTD_execSequence(BYTE* op,
+                                seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                BYTE* const base, BYTE* const oend)
+{   /* Writes one sequence at op : litLength literals taken from *litPtr, then matchLength bytes copied from op-offset. Returns the bytes written, or an error code. */
+    static const int dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};   /* added */
+    static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11};   /* subtracted */
+    const BYTE* const ostart = op;
+    const size_t litLength = sequence.litLength;
+    BYTE* const endMatch = op + litLength + sequence.matchLength;    /* risk : address space overflow (32-bits) */
+    const BYTE* const litEnd = *litPtr + litLength;
+
+    /* check */
+    if (endMatch > oend) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
+    if (litEnd > litLimit) return ERROR(corruption_detected);
+    if (sequence.matchLength > (size_t)(*litPtr-op))  return ERROR(dstSize_tooSmall);    /* overwrite literal segment */
+
+    /* copy Literals */
+    if (((size_t)(*litPtr - op) < 8) || ((size_t)(oend-litEnd) < 8) || (op+litLength > oend-8))
+        memmove(op, *litPtr, litLength);   /* overwrite risk */
+    else
+        ZSTD_wildcopy(op, *litPtr, litLength);   /* presumably copies in 8-byte steps and may write past litLength, hence the distance tests above - confirm */
+    op += litLength;
+    *litPtr = litEnd;   /* update for next sequence */
+
+    /* check : last match must be at a minimum distance of 8 from end of dest buffer */
+    if (oend-op < 8) return ERROR(dstSize_tooSmall);
+
+    /* copy Match */
+    {
+        const U32 overlapRisk = (((size_t)(litEnd - endMatch)) < 12);   /* remaining literals start fewer than 12 bytes after the end of this match */
+        const BYTE* match = op - sequence.offset;            /* possible underflow at op - offset ? */
+        size_t qutt = 12;   /* number of bytes saved / restored around endMatch */
+        U64 saved[2];       /* 16 bytes of storage for at most 12 saved bytes */
+
+        /* check */
+        if (match < base) return ERROR(corruption_detected);
+        if (sequence.offset > (size_t)base) return ERROR(corruption_detected);
+
+        /* save beginning of literal sequence, in case of write overlap */
+        if (overlapRisk)
+        {
+            if ((endMatch + qutt) > oend) qutt = oend-endMatch;   /* never read past the end of dst */
+            memcpy(saved, endMatch, qutt);
+        }
+
+        if (sequence.offset < 8)   /* source and destination overlap within the first 8 bytes */
+        {
+            const int dec64 = dec64table[sequence.offset];
+            op[0] = match[0];   /* first 4 bytes go one at a time, so each read sees the bytes just written */
+            op[1] = match[1];
+            op[2] = match[2];
+            op[3] = match[3];
+            match += dec32table[sequence.offset];
+            ZSTD_copy4(op+4, match);
+            match -= dec64;   /* re-position match for the wider copies that follow */
+        } else { ZSTD_copy8(op, match); }
+        op += 8; match += 8;
+
+        if (endMatch > oend-(16-MINMATCH))   /* match ends close to the end of dst */
+        {
+            if (op < oend-8)
+            {
+                ZSTD_wildcopy(op, match, (oend-8) - op);
+                match += (oend-8) - op;
+                op = oend-8;
+            }
+            while (op<endMatch) *op++ = *match++;   /* finish byte by byte */
+        }
+        else
+            ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+
+        /* restore, in case of overlap */
+        if (overlapRisk) memcpy(endMatch, saved, qutt);
+    }
+
+    return endMatch-ostart;
+}
+
+typedef struct ZSTDv01_Dctx_s   /* decompression context ; the public header exposes it as the opaque type ZSTDv01_Dctx */
+{
+    U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];     /* literal-length decoding table, rebuilt by ZSTDv01_decodeSeqHeaders() */
+    U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];   /* offset decoding table */
+    U32 MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];     /* match-length decoding table */
+    void* previousDstEnd;   /* streaming : end of the output written by the previous block */
+    void* base;             /* lowest address a match is allowed to start from */
+    size_t expected;        /* streaming : exact srcSize the next ZSTDv01_decompressContinue() call must receive */
+    blockType_t bType;      /* streaming : type of the block whose content is awaited */
+    U32 phase;              /* streaming : 0 = frame header, 1 = block header, 2 = block content */
+} dctx_t;
+
+
+static size_t ZSTD_decompressSequences(
+                               void* ctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize,
+                         const BYTE* litStart, size_t litSize)
+{   /* Decodes the sequences section and executes every sequence into dst, then appends the leftover literals. Returns the bytes produced, or an error code. */
+    dctx_t* dctx = (dctx_t*)ctx;
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t errorCode, dumpsLength;
+    const BYTE* litPtr = litStart;
+    const BYTE* const litEnd = litStart + litSize;
+    int nbSeq;
+    const BYTE* dumps;
+    U32* DTableLL = dctx->LLTable;
+    U32* DTableML = dctx->MLTable;
+    U32* DTableOffb = dctx->OffTable;
+    BYTE* const base = (BYTE*) (dctx->base);
+
+    /* Build Decoding Tables */
+    errorCode = ZSTDv01_decodeSeqHeaders(&nbSeq, &dumps, &dumpsLength,
+                                      DTableLL, DTableML, DTableOffb,
+                                      ip, iend-ip);
+    if (ZSTDv01_isError(errorCode)) return errorCode;
+    ip += errorCode;   /* on success the value is the header size in bytes */
+
+    /* Regen sequences */
+    {
+        seq_t sequence;
+        seqState_t seqState;
+
+        memset(&sequence, 0, sizeof(sequence));   /* ZSTD_decodeSequence() reads sequence.offset before writing it */
+        seqState.dumps = dumps;
+        seqState.dumpsEnd = dumps + dumpsLength;
+        seqState.prevOffset = 1;
+        errorCode = FSE_initDStream(&(seqState.DStream), ip, iend-ip);
+        if (FSE_isError(errorCode)) return ERROR(corruption_detected);
+        FSE_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSE_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSE_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        for ( ; (FSE_reloadDStream(&(seqState.DStream)) <= FSE_DStream_completed) && (nbSeq>0) ; )   /* stop when the bitstream overruns or all sequences are decoded */
+        {
+            size_t oneSeqSize;
+            nbSeq--;
+            ZSTD_decodeSequence(&sequence, &seqState);
+            oneSeqSize = ZSTD_execSequence(op, sequence, &litPtr, litEnd, base, oend);
+            if (ZSTDv01_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
+        }
+
+        /* check if reached exact end */
+        if ( !FSE_endOfDStream(&(seqState.DStream)) ) return ERROR(corruption_detected);   /* requested too much : data is corrupted */
+        if (nbSeq<0) return ERROR(corruption_detected);   /* requested too many sequences : data is corrupted */
+
+        /* last literal segment */
+        {
+            size_t lastLLSize = litEnd - litPtr;
+            if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
+            if (op != litPtr) memmove(op, litPtr, lastLLSize);   /* memmove : the literals may sit inside dst itself */
+            op += lastLLSize;
+        }
+    }
+
+    return op-ostart;
+}
+
+
+static size_t ZSTD_decompressBlock(
+                            void* ctx,
+                            void* dst, size_t maxDstSize,
+                      const void* src, size_t srcSize)
+{
+    /* Precondition : the block is of the compressed type and srcSize is its exact size. */
+    const BYTE* const blockStart = (const BYTE*)src;
+    const BYTE* literals = NULL;
+    size_t literalsSize = 0;
+    size_t litBlockSize;
+
+    /* Step 1 : regenerate the literals ; on success the result is the number of input bytes consumed. */
+    litBlockSize = ZSTDv01_decodeLiteralsBlock(ctx, dst, maxDstSize, &literals, &literalsSize, src, srcSize);
+    if (ZSTDv01_isError(litBlockSize)) return litBlockSize;
+
+    /* Step 2 : everything after the literals sub-block is the sequences section. */
+    return ZSTD_decompressSequences(ctx, dst, maxDstSize,
+                                    blockStart + litBlockSize, srcSize - litBlockSize, literals, literalsSize);
+}
+
+
+size_t ZSTDv01_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{   /* Decompresses one complete frame from src into dst using the caller's context. Returns the bytes written, or an error code. */
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t remainingSize = srcSize;
+    U32 magicNumber;
+    size_t errorCode=0;
+    blockProperties_t blockProperties;
+
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    magicNumber = ZSTD_readBE32(src);
+    if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+    ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize;
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t blockSize = ZSTDv01_getcBlockSize(ip, iend-ip, &blockProperties);   /* size of the block content that follows the header */
+        if (ZSTDv01_isError(blockSize)) return blockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (blockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            errorCode = ZSTD_decompressBlock(ctx, op, oend-op, ip, blockSize);
+            break;
+        case bt_raw :
+            errorCode = ZSTD_copyUncompressedBlock(op, oend-op, ip, blockSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet supported */
+            break;
+        case bt_end :
+            /* end of frame */
+            if (remainingSize) return ERROR(srcSize_wrong);   /* nothing may follow the end marker */
+            break;
+        default:
+            return ERROR(GENERIC);
+        }
+        if (blockSize == 0) break;   /* bt_end */   /* NOTE(review): also taken by a raw or compressed block whose size field is 0 ; its errorCode is then never examined */
+
+        if (ZSTDv01_isError(errorCode)) return errorCode;
+        op += errorCode;   /* on success the value is the number of bytes produced */
+        ip += blockSize;
+        remainingSize -= blockSize;
+    }
+
+    return op-ostart;
+}
+
+size_t ZSTDv01_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{   /* One-shot entry point : decodes a whole frame with a context that lives on this function's stack. */
+    dctx_t ctx;   /* base is the only field assigned here before the call below */
+    ctx.base = dst;   /* ZSTD_decompressSequences() hands base to ZSTD_execSequence() as the lowest address a match may start from */
+    return ZSTDv01_decompressDCtx(&ctx, dst, maxDstSize, src, srcSize);
+}
+
+size_t ZSTDv01_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    /* Walks the block headers of the frame at src without decoding any content. */
+    /* Returns the total frame size in bytes (headers included), or an error code. */
+    const BYTE* const frame = (const BYTE*)src;
+    size_t consumed;   /* bytes of the frame walked so far */
+    U32 magic;
+    blockProperties_t bp;
+
+    /* The buffer must hold at least the frame header and one block header. */
+    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    magic = ZSTD_readBE32(src);
+    if (magic != ZSTD_magicNumber) return ERROR(prefix_unknown);
+    consumed = ZSTD_frameHeaderSize;
+
+    /* Hop from block header to block header until a block with no content is met. */
+    for (;;)
+    {
+        size_t const bodySize = ZSTDv01_getcBlockSize(frame + consumed, srcSize - consumed, &bp);
+        if (ZSTDv01_isError(bodySize)) return bodySize;
+
+        consumed += ZSTD_blockHeaderSize;
+        if (bodySize > srcSize - consumed) return ERROR(srcSize_wrong);
+
+        if (bodySize == 0) break;   /* bt_end */
+
+        consumed += bodySize;
+    }
+
+    return consumed;
+}
+
+/*******************************
+*  Streaming Decompression API
+*******************************/
+
+size_t ZSTDv01_resetDCtx(ZSTDv01_Dctx* dctx)
+{   /* Puts the streaming state back to "waiting for a frame header" ; always returns 0. */
+    dctx->expected = ZSTD_frameHeaderSize;   /* the next ZSTDv01_decompressContinue() call must supply exactly the frame header */
+    dctx->phase = 0;                         /* phase 0 = frame header awaited */
+    dctx->previousDstEnd = NULL;             /* no previous output, so the first dst becomes the new base */
+    dctx->base = NULL;
+    return 0;
+}
+
+ZSTDv01_Dctx* ZSTDv01_createDCtx(void)
+{
+    /* Allocates a context on the heap and puts it into its initial (frame header awaited) state. */
+    ZSTDv01_Dctx* const fresh = (ZSTDv01_Dctx*)malloc(sizeof(*fresh));
+    if (fresh != NULL) ZSTDv01_resetDCtx(fresh);
+    return fresh;   /* NULL when malloc() failed */
+}
+
+size_t ZSTDv01_freeDCtx(ZSTDv01_Dctx* dctx)
+{   /* Releases a context with free() ; always returns 0. */
+    free(dctx);   /* free(NULL) is a no-op, so a NULL dctx is accepted */
+    return 0;
+}
+
+size_t ZSTDv01_nextSrcSizeToDecompress(ZSTDv01_Dctx* dctx)
+{   /* Returns the exact srcSize that the next ZSTDv01_decompressContinue() call will accept (0 once the end-of-frame block has been read). */
+    return ((dctx_t*)dctx)->expected;
+}
+
+size_t ZSTDv01_decompressContinue(ZSTDv01_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{   /* Streaming step : consumes exactly ctx->expected bytes (frame header, block header or block content). Returns the bytes written to dst (0 for headers), or an error code. */
+    dctx_t* ctx = (dctx_t*)dctx;
+
+    /* Sanity check */
+    if (srcSize != ctx->expected) return ERROR(srcSize_wrong);
+    if (dst != ctx->previousDstEnd)  /* not contiguous */
+        ctx->base = dst;   /* matches can then no longer reach into earlier output */
+
+    /* Decompress : frame header */
+    if (ctx->phase == 0)
+    {
+        /* Check frame magic header */
+        U32 magicNumber = ZSTD_readBE32(src);
+        if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+        ctx->phase = 1;
+        ctx->expected = ZSTD_blockHeaderSize;
+        return 0;
+    }
+
+    /* Decompress : block header */
+    if (ctx->phase == 1)
+    {
+        blockProperties_t bp;
+        size_t blockSize = ZSTDv01_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+        if (ZSTDv01_isError(blockSize)) return blockSize;
+        if (bp.blockType == bt_end)   /* end of frame : nothing more expected, ready for a new frame header */
+        {
+            ctx->expected = 0;
+            ctx->phase = 0;
+        }
+        else   /* remember what the next call has to deliver */
+        {
+            ctx->expected = blockSize;
+            ctx->bType = bp.blockType;
+            ctx->phase = 2;
+        }
+
+        return 0;
+    }
+
+    /* Decompress : block content */
+    {
+        size_t rSize;
+        switch(ctx->bType)
+        {
+        case bt_compressed:
+            rSize = ZSTD_decompressBlock(ctx, dst, maxDstSize, src, srcSize);
+            break;
+        case bt_raw :
+            rSize = ZSTD_copyUncompressedBlock(dst, maxDstSize, src, srcSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet handled */
+            break;
+        case bt_end :   /* should never happen (filtered at phase 1) */
+            rSize = 0;
+            break;
+        default:
+            return ERROR(GENERIC);
+        }
+        ctx->phase = 1;
+        ctx->expected = ZSTD_blockHeaderSize;
+        if (ZSTDv01_isError(rSize)) return rSize;   /* an error code is not a length : do not fold it into previousDstEnd */
+        ctx->previousDstEnd = (void*)( ((char*)dst) + rSize);
+        return rSize;
+    }
+}
diff --git a/deps/SZ/zstd/legacy/zstd_v01.h b/deps/SZ/zstd/legacy/zstd_v01.h
new file mode 100644
index 0000000000000000000000000000000000000000..42f0897c7d2337cc8505d761b5b082423fc9d1eb
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v01.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V01_H_28739879432
+#define ZSTD_V01_H_28739879432
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stddef.h>   /* size_t */
+
+
+/* *************************************
+*  Simple one-step function
+***************************************/
+/**
+ZSTDv01_decompress() : decompress ZSTD frames compliant with v0.1.x format
+    compressedSize : is the exact source size
+    maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+                      It must be equal or larger than originalSize, otherwise decompression will fail.
+    return : the number of bytes decompressed into destination buffer (originalSize)
+             or an errorCode if it fails (which can be tested using ZSTDv01_isError())
+*/
+size_t ZSTDv01_decompress( void* dst, size_t maxOriginalSize,
+                     const void* src, size_t compressedSize);
+
+/**
+ZSTDv01_findFrameCompressedSize() : get the source length of a ZSTD frame compliant with v0.1.x format
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv01_isError())
+*/
+size_t ZSTDv01_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+/**
+ZSTDv01_isError() : tells if the result of ZSTDv01_decompress() is an error
+*/
+unsigned ZSTDv01_isError(size_t code);
+
+
+/* *************************************
+*  Advanced functions
+***************************************/
+typedef struct ZSTDv01_Dctx_s ZSTDv01_Dctx;
+ZSTDv01_Dctx* ZSTDv01_createDCtx(void);
+size_t ZSTDv01_freeDCtx(ZSTDv01_Dctx* dctx);
+
+size_t ZSTDv01_decompressDCtx(void* ctx,
+                              void* dst, size_t maxOriginalSize,
+                        const void* src, size_t compressedSize);
+
+/* *************************************
+*  Streaming functions
+***************************************/
+size_t ZSTDv01_resetDCtx(ZSTDv01_Dctx* dctx);
+
+size_t ZSTDv01_nextSrcSizeToDecompress(ZSTDv01_Dctx* dctx);
+size_t ZSTDv01_decompressContinue(ZSTDv01_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+  Call the two functions above alternately.
+  ZSTDv01_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTDv01_decompressContinue().
+  ZSTDv01_decompressContinue() can reference previously decoded blocks, provided 'dst' directly follows the output of the previous call.
+  Result is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+*  Prefix - version detection
+***************************************/
+#define ZSTDv01_magicNumber   0xFD2FB51E   /* Big Endian version */
+#define ZSTDv01_magicNumberLE 0x1EB52FFD   /* Little Endian version */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V01_H_28739879432 */
diff --git a/deps/SZ/zstd/legacy/zstd_v02.c b/deps/SZ/zstd/legacy/zstd_v02.c
new file mode 100644
index 0000000000000000000000000000000000000000..8bc0eceeda8fab391f378f51b0005eb3c9d66247
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v02.c
@@ -0,0 +1,3483 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include "zstd_v02.h"
+#include "error_private.h"
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+
+
+/* ******************************************************************
+   mem.h
+   low-level memory access routines
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/******************************************
+*  Includes
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include <string.h>    /* memcpy */
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined(__GNUC__)
+#  define MEM_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/****************************************************************
+*  Basic Types
+*****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/****************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on a compiler extension (i.e., not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ *            It can generate buggy code on targets whose generated assembly depends on alignment.
+ *            But in some circumstances, it's the only known way to get the most performance (e.g. GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(void*)==4; }   /* 1 when pointers are 4 bytes wide, 0 otherwise */
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(void*)==8; }   /* 1 when pointers are 8 bytes wide, 0 otherwise */
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)   /* run-time byte-order probe : non-zero when the host stores the least significant byte first */
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];   /* first byte in memory of the 32-bit value 1 : it is 1 only if the low-order byte comes first */
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* Method 2 : the pointer is cast and dereferenced directly. This violates the C standard when memPtr is not suitably aligned.
+Only use if there is no other choice to achieve best performance on the target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* Method 1 : access goes through a packed union. Safer than method 2, but compiler specific, hence potentially problematic for some compilers */
+/* relies on the `__attribute__((packed))` extension; currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
+
+#else
+
+/* Method 0 (default) : bytes are copied with memcpy(). Safe and standard,
+   but can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)   /* native-endian 16-bit read from a possibly unaligned address */
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)   /* native-endian 32-bit read from a possibly unaligned address */
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)   /* native-endian 64-bit read from a possibly unaligned address */
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)   /* native-endian 16-bit write to a possibly unaligned address */
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)   /* reads a 16-bit value stored in little-endian order, whatever the host byte order */
+{
+    if (MEM_isLittleEndian())
+        return MEM_read16(memPtr);   /* host order already matches : plain native read */
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));   /* assemble byte by byte : p[0] is the least significant byte */
+    }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)   /* stores val at memPtr in little-endian order, whatever the host byte order */
+{
+    if (MEM_isLittleEndian())
+    {
+        MEM_write16(memPtr, val);   /* host order already matches : plain native write */
+    }
+    else
+    {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;        /* least significant byte goes first */
+        p[1] = (BYTE)(val>>8);   /* most significant byte goes second */
+    }
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)   /* reads a 32-bit value stored in little-endian order, whatever the host byte order */
+{
+    if (MEM_isLittleEndian())
+        return MEM_read32(memPtr);   /* host order already matches : plain native read */
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U32)((U32)p[0] + ((U32)p[1]<<8) + ((U32)p[2]<<16) + ((U32)p[3]<<24));   /* each byte is widened to U32 before being shifted into place */
+    }
+}
+
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)   /* reads a 64-bit value stored in little-endian order, whatever the host byte order */
+{
+    if (MEM_isLittleEndian())
+        return MEM_read64(memPtr);   /* host order already matches : plain native read */
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U64)((U64)p[0] + ((U64)p[1]<<8) + ((U64)p[2]<<16) + ((U64)p[3]<<24)   /* each byte is widened to U64 before being shifted into place */
+                     + ((U64)p[4]<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56));
+    }
+}
+
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)   /* reads a little-endian value as wide as a pointer and returns it as size_t */
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readLE32(memPtr);   /* 4-byte pointers : consume 4 bytes */
+    else
+        return (size_t)MEM_readLE64(memPtr);   /* otherwise : consume 8 bytes */
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+
+
+/* ******************************************************************
+   bitstream
+   Part of NewGen Entropy library
+   header file (to include)
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+*  This API consists of small unitary functions, which highly benefit from being inlined.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+
+/**********************************************
+*  bitStream decompression API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;   /* local copy of the source bytes currently being decoded */
+    unsigned bitsConsumed;   /* number of bits of bitContainer that have already been read */
+    const char* ptr;         /* address in the source buffer that bitContainer was last loaded from */
+    const char* start;       /* beginning of the source buffer; ptr moves backward towards it */
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,     /* container was moved back by every whole byte consumed */
+               BIT_DStream_endOfBuffer = 1,    /* ptr has reached start (refill truncated or impossible) but unread bits remain */
+               BIT_DStream_completed = 2,      /* ptr is at start and every bit of the container has been consumed */
+               BIT_DStream_overflow = 3 } BIT_DStream_status;  /* result of BIT_reloadDStream(); overflow : more bits consumed than the container holds */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+/******************************************
+*  unsafe API
+******************************************/
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/****************************************************************
+*  Helper functions
+****************************************************************/
+MEM_STATIC unsigned BIT_highbit32 (U32 val)   /* returns the index (0 = least significant) of the highest set bit of val */
+{
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    _BitScanReverse ( &r, val );
+    return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+    return 31 - __builtin_clz (val);   /* NOTE(review): GCC documents __builtin_clz(0) as undefined, so val must be non-zero; BIT_initDStream rejects 0 before calling */
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    unsigned r;
+    v |= v >> 1;   /* the five OR-shift steps copy the highest set bit into every lower position */
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    r = DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];   /* the top 5 bits of the product select the table entry */
+    return r;
+#   endif
+}
+
+
+
+/**********************************************************
+* bitStream decoding
+**********************************************************/
+
+/*!BIT_initDStream
+*  Initialize a BIT_DStream_t so that reading starts from the last bytes of the bitStream.
+*  @bitD : a pointer to an already allocated BIT_DStream_t structure
+*  @srcBuffer must point at the beginning of a bitStream
+*  @srcSize must be the exact size of the bitStream; the last byte must be non-zero (its highest set bit is the end mark)
+*  @result : size of stream (== srcSize) or an errorCode if a problem is detected (srcSize_wrong when srcSize==0, GENERIC when the end mark is missing)
+*/
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }   /* empty input : bitD is zeroed before reporting the error */
+
+    if (srcSize >=  sizeof(size_t))   /* normal case */
+    {
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);   /* the container is loaded from the last sizeof(size_t) bytes */
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];   /* last byte of the stream carries the end mark */
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BIT_highbit32(contain32);   /* the end mark and the zero bits above it count as consumed */
+    }
+    else   /* short input : fewer bytes than the container holds, so it is assembled byte by byte */
+    {
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)   /* every case intentionally falls through, so all bytes below the entry point are added */
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);   /* fall-through */
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);   /* fall-through */
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);   /* fall-through */
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24;   /* fall-through */
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16;   /* fall-through */
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8;   /* fall-through */
+            default:;
+        }
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];   /* last byte of the stream carries the end mark */
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BIT_highbit32(contain32);
+        bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8;   /* the container bytes that the input could not fill also count as consumed */
+    }
+
+    return srcSize;
+}
+
+MEM_STATIC size_t BIT_lookBits(BIT_DStream_t* bitD, U32 nbBits)   /* returns the next nbBits bits without consuming them */
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;   /* container width in bits, minus one */
+    return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask);   /* the right shift is split in two so that neither step reaches the container width when nbBits==0 */
+}
+
+/*! BIT_lookBitsFast :
+*   unsafe version of BIT_lookBits(); only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;   /* container width in bits, minus one */
+    return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask+1)-nbBits) & bitMask);   /* single right shift by (width - nbBits), masked to stay below the width */
+}
+
+MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)   /* marks nbBits more bits as consumed; the container itself is left unchanged */
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits)   /* returns the next nbBits bits and consumes them; does not refill the container */
+{
+    size_t value = BIT_lookBits(bitD, nbBits);   /* peek first ... */
+    BIT_skipBits(bitD, nbBits);                   /* ... then advance */
+    return value;
+}
+
+/*!BIT_readBitsFast :
+*  unsafe version of BIT_readBits(); only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t value = BIT_lookBitsFast(bitD, nbBits);   /* peek first ... */
+    BIT_skipBits(bitD, nbBits);                       /* ... then advance */
+    return value;
+}
+
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)   /* refills bitContainer from earlier source bytes and reports the stream status */
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BIT_DStream_overflow;
+
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer))   /* a full container still fits before start : unrestricted refill */
+    {
+        bitD->ptr -= bitD->bitsConsumed >> 3;   /* step back by the number of whole bytes consumed */
+        bitD->bitsConsumed &= 7;                /* keep only the bits consumed within the partial byte */
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BIT_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start)   /* nothing left to load : only report whether unread bits remain */
+    {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+        return BIT_DStream_completed;
+    }
+    {   /* ptr lies less than one container away from start : refill, without stepping back past start */
+        U32 nbBytes = bitD->bitsConsumed >> 3;
+        BIT_DStream_status result = BIT_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start)
+        {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BIT_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;   /* only the bytes actually stepped over are given back */
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+/*! BIT_endOfDStream
+*   @return non-zero only if DStream has reached its exact end : ptr is back at start and every bit of the container has been consumed
+*/
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
+/* ******************************************************************
+   Error codes and messages
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define ERR_STATIC static inline
+#elif defined(_MSC_VER)
+#  define ERR_STATIC static __inline
+#elif defined(__GNUC__)
+#  define ERR_STATIC static __attribute__((unused))
+#else
+#  define ERR_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/******************************************
+*  Error Management
+******************************************/
+#define PREFIX(name) ZSTD_error_##name
+
+#define ERROR(name) (size_t)-PREFIX(name)
+
+#define ERROR_LIST(ITEM) \
+        ITEM(PREFIX(No_Error)) ITEM(PREFIX(GENERIC)) \
+        ITEM(PREFIX(dstSize_tooSmall)) ITEM(PREFIX(srcSize_wrong)) \
+        ITEM(PREFIX(prefix_unknown)) ITEM(PREFIX(corruption_detected)) \
+        ITEM(PREFIX(tableLog_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooSmall)) \
+        ITEM(PREFIX(maxCode))
+
+#define ERROR_GENERATE_ENUM(ENUM) ENUM,
+typedef enum { ERROR_LIST(ERROR_GENERATE_ENUM) } ERR_codes;  /* enum is exposed, to detect & handle specific errors; compare function result to -enum value */
+
+#define ERROR_CONVERTTOSTRING(STRING) #STRING,
+#define ERROR_GENERATE_STRING(EXPR) ERROR_CONVERTTOSTRING(EXPR)
+static const char* ERR_strings[] = { ERROR_LIST(ERROR_GENERATE_STRING) };
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }   /* error codes are negated enum values cast to size_t, so they sit above (size_t)-maxCode */
+
+ERR_STATIC const char* ERR_getErrorName(size_t code)   /* returns the stringified enum name for an error code */
+{
+    static const char* codeError = "Unspecified error code";   /* returned for any value that ERR_isError() does not accept, including 0 */
+    if (ERR_isError(code)) return ERR_strings[-(int)(code)];   /* negating the code recovers the enum value, which indexes ERR_strings */
+    return codeError;
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_H_MODULE */
+/*
+Constructor and Destructor of type FSE_CTable
+    Note that its size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+
+
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/******************************************
+*  Static allocation
+******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size>>7))
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* You can statically allocate FSE CTable/DTable as a table of unsigned using below macro */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
+
+
+/******************************************
+*  FSE advanced API
+******************************************/
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/* build a fake FSE_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits */
+
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/* build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+
+/******************************************
+*  FSE symbol decompression API
+******************************************/
+typedef struct
+{
+    size_t      state;   /* index of the current cell in the decoding table */
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+static void     FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+
+/******************************************
+*  FSE unsafe API
+******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/******************************************
+*  Implementation of inline functions
+******************************************/
+
+/* decompression */
+
+typedef struct {
+    U16 tableLog;   /* number of bits FSE_initDState() reads to form the initial state */
+    U16 fastMode;   /* NOTE(review): not read in this part of the file; presumably selects the *Fast decoding variant - confirm against the table builders */
+} FSE_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;   /* base of the next state; the bits read from the stream are added to it */
+    unsigned char  symbol;     /* symbol produced when the decoder is in this cell */
+    unsigned char  nbBits;     /* number of bits to read from the stream to reach the next state */
+} FSE_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)   /* reads the initial state from bitD and binds DStatePtr to the table dt */
+{
+    FSE_DTableHeader DTableH;
+    memcpy(&DTableH, dt, sizeof(DTableH));   /* the first cell of dt holds the header */
+    DStatePtr->state = BIT_readBits(bitD, DTableH.tableLog);   /* the initial state is tableLog bits wide */
+    BIT_reloadDStream(bitD);   /* refill after the read; the returned status is ignored here */
+    DStatePtr->table = dt + 1;   /* decoding cells start right after the header cell */
+}
+
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)   /* returns the symbol of the current state, then moves to the next state */
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];   /* cell describing the current state */
+    const U32  nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = BIT_readBits(bitD, nbBits);   /* safe read : nbBits may be 0 */
+
+    DStatePtr->state = DInfo.newState + lowBits;   /* next state = base from the cell + bits taken from the stream */
+    return symbol;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)   /* same as FSE_decodeSymbol(), but requires every cell to have nbBits >= 1 */
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];   /* cell describing the current state */
+    const U32 nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = BIT_readBitsFast(bitD, nbBits);   /* unsafe read : result is wrong if nbBits is 0 */
+
+    DStatePtr->state = DInfo.newState + lowBits;   /* next state = base from the cell + bits taken from the stream */
+    return symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)   /* returns 1 when the decoder state is 0, otherwise 0 */
+{
+    return DStatePtr->state == 0;
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/******************************************
+*  Static allocation macros
+******************************************/
+/* Huff0 buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true if incompressible pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* static allocation of Huff0's DTable */
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<maxTableLog))  /* nb Cells; use unsigned short for X2, unsigned int for X4 */
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        unsigned short DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX6(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog) * 3 / 2] = { maxTableLog }
+
+
+/******************************************
+*  Advanced functions
+******************************************/
+static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbols decoder */
+static size_t HUF_decompress4X6 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* quad-symbols decoder */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+/*
+    zstd - standard compression library
+    Header File
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+#if defined (__cplusplus)   /* give the declarations below C linkage when compiled as C++ */
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stddef.h>   /* size_t */
+
+
+/* *************************************
+*  Version
+***************************************/
+#define ZSTD_VERSION_MAJOR    0    /* for breaking interface changes  */
+#define ZSTD_VERSION_MINOR    2    /* for new (non-breaking) interface capabilities */
+#define ZSTD_VERSION_RELEASE  2    /* for tweaks, bug-fixes, or development */
+#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)   /* major*10000 + minor*100 + release : 0.2.2 gives 202 */
+
+
+/* *************************************
+*  Advanced functions
+***************************************/
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;   /* incomplete type : only pointers to it can be used here */
+
+#if defined (__cplusplus)   /* closes the extern "C" block opened above */
+}
+#endif
+/*
+    zstd - standard compression library
+    Header File for static linking only
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* The objects defined in this file should be considered experimental.
+ * They are not labelled stable, as their prototype may change in the future.
+ * You can use them for tests, provide feedback, or if you can endure risk of future changes.
+ */
+
+#if defined (__cplusplus)   /* give the declarations below C linkage when compiled as C++ */
+extern "C" {
+#endif
+
+/* *************************************
+*  Streaming functions
+***************************************/
+
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;   /* incomplete type : the struct body is not declared at this point */
+
+/*
+  Use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately (their prototypes are not present in this section).
+  ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block.
+  Result is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+*  Prefix - version detection
+***************************************/
+#define ZSTD_magicNumber 0xFD2FB522   /* v0.2 (current)*/
+
+
+#if defined (__cplusplus)   /* closes the extern "C" block opened above */
+}
+#endif
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+#ifndef FSE_COMMONDEFS_ONLY   /* the tuning parameters below are skipped when only the common definitions are wanted */
+
+/****************************************************************
+*  Tuning parameters
+****************************************************************/
+/* MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#define FSE_MAX_MEMORY_USAGE 14   /* FSE_MAX_TABLELOG is derived from this value (minus 2) */
+#define FSE_DEFAULT_MEMORY_USAGE 13   /* FSE_DEFAULT_TABLELOG is derived from this value (minus 2) */
+
+/* FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#define FSE_MAX_SYMBOL_VALUE 255   /* sizes the on-stack symbolNext[] and counting[] arrays (FSE_MAX_SYMBOL_VALUE+1 entries) */
+
+
+/****************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE   /* symbol type the templated functions cast to */
+#define FSE_FUNCTION_EXTENSION   /* intentionally empty : no suffix */
+
+
+/****************************************************************
+*  Byte symbol type
+****************************************************************/
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
+
+/****************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline   /* MSVC spelling of forced inlining */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else   /* any compiler other than MSVC */
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))   /* GNU-compatible compilers : request inlining explicitly */
+#    else
+#      define FORCE_INLINE static inline   /* `inline` keyword available, no forcing attribute */
+#    endif
+#  else
+#    define FORCE_INLINE static   /* neither C++ nor C99 : fall back to a plain static function */
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/****************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+/****************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)   /* largest tableLog accepted by FSE_buildDTable() */
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)   /* number of cells at the maximum tableLog */
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5   /* base value added to the 4-bit tableLog field read by FSE_readNCount() */
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15   /* hard ceiling : FSE_readNCount() rejects headers announcing a larger tableLog */
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+
+/****************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations; a false `c` yields a division by zero in a constant expression, hence a compile error */
+
+
+/****************************************************************
+*  Complex types
+****************************************************************/
+typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];   /* decoding table sized for the largest supported tableLog; FSE_DTABLE_SIZE_U32 is defined outside this section */
+
+
+/****************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION   /* an empty definition is enough to pass this check */
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y   /* raw token pasting : arguments are NOT macro-expanded first */
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)   /* extra level of indirection, so that X and Y are expanded before pasting */
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)   /* same indirection, intended for type names */
+
+
+/* Function templates */
+
+#define FSE_DECODE_TYPE FSE_decode_t   /* cell type of the decoding table built by FSE_buildDTable() */
+
+static U32 FSE_tableStep(U32 tableSize) { return (tableSize>>1) + (tableSize>>3) + 3; }   /* stride used by FSE_buildDTable() to spread symbols over the table : size/2 + size/8 + 3 */
+
+static size_t FSE_buildDTable   /* fills the decoding table `dt` from normalized symbol counts; returns 0 on success, or an ERROR() code */
+(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    void* ptr = dt+1;   /* decoding cells start one FSE_DTable element after `dt`; the header is copied into `dt` itself at the end */
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*)ptr;
+    FSE_DTableHeader DTableH;
+    const U32 tableSize = 1 << tableLog;
+    const U32 tableMask = tableSize-1;
+    const U32 step = FSE_tableStep(tableSize);
+    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];   /* per symbol : next state value to hand out in the final pass */
+    U32 position = 0;
+    U32 highThreshold = tableSize-1;   /* highest cell not yet taken by a low-probability symbol */
+    const S16 largeLimit= (S16)(1 << (tableLog-1));   /* a count >= half the table size disables fastMode */
+    U32 noLarge = 1;
+    U32 s;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    DTableH.tableLog = (U16)tableLog;
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        if (normalizedCounter[s]==-1)   /* -1 marks a low-probability symbol : it gets exactly one cell, taken from the top of the table */
+        {
+            tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+            symbolNext[s] = 1;
+        }
+        else
+        {
+            if (normalizedCounter[s] >= largeLimit) noLarge=0;
+            symbolNext[s] = normalizedCounter[s];
+        }
+    }
+
+    /* Spread symbols */
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        int i;
+        for (i=0; i<normalizedCounter[s]; i++)   /* runs zero times for counts of 0 and -1 */
+        {
+            tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+            position = (position + step) & tableMask;   /* advance by `step`, wrapping around the table */
+            while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }
+    }
+
+    if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+
+    /* Build Decoding table */
+    {
+        U32 i;
+        for (i=0; i<tableSize; i++)
+        {
+            FSE_FUNCTION_TYPE symbol = (FSE_FUNCTION_TYPE)(tableDecode[i].symbol);
+            U16 nextState = symbolNext[symbol]++;   /* each cell of a given symbol receives the next consecutive value */
+            tableDecode[i].nbBits = (BYTE) (tableLog - BIT_highbit32 ((U32)nextState) );
+            tableDecode[i].newState = (U16) ( (nextState << tableDecode[i].nbBits) - tableSize);
+        }
+    }
+
+    DTableH.fastMode = (U16)noLarge;   /* 1 only when no symbol count reached largeLimit */
+    memcpy(dt, &DTableH, sizeof(DTableH));   /* memcpy(), to avoid strict aliasing warnings */
+    return 0;
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+/******************************************
+*  FSE helper functions
+******************************************/
+static unsigned FSE_isError(size_t code) { return ERR_isError(code); }   /* thin wrapper : forwards `code` to ERR_isError() and returns its result */
+
+
+/****************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+static short FSE_abs(short a)   /* absolute value of `a`, returned as a short */
+{
+    return (short)(a<0 ? -a : a);   /* the negation is performed in int (integer promotion), then converted back to short */
+}
+
+static size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,   /* decodes a normalized-count header; returns the number of bytes read from headerBuffer, or an ERROR() code */
+                 const void* headerBuffer, size_t hbSize)   /* *maxSVPtr : in = highest symbol allowed, out = last symbol decoded; *tableLogPtr : out */
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;   /* bits not yet consumed, lowest bit first */
+    int bitCount;   /* number of bits already consumed since `ip` */
+    unsigned charnum = 0;   /* next symbol index to write into normalizedCounter[] */
+    int previous0 = 0;   /* set when the last decoded count was 0 : a zero-run field follows */
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);   /* the first read below fetches 4 bytes */
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;   /* table size + 1 : the loop ends when this reaches 1 */
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr))
+    {
+        if (previous0)
+        {
+            unsigned n0 = charnum;   /* index of the first symbol after the run of zero counts */
+            while ((bitStream & 0xFFFF) == 0xFFFF)   /* each all-ones 16-bit group stands for 24 more zero counts */
+            {
+                n0+=24;
+                if (ip < iend-5)
+                {
+                    ip+=2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                }
+                else   /* too close to the end of input to reload : just drop the 16 bits */
+                {
+                    bitStream >>= 16;
+                    bitCount+=16;
+                }
+            }
+            while ((bitStream & 3) == 3)   /* each 2-bit value 3 stands for 3 more zero counts */
+            {
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            n0 += bitStream & 3;   /* final 2-bit field : 0 to 2 extra zero counts */
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))   /* enough input left to advance by whole bytes and re-read 4 bytes */
+            {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;   /* cannot reload : only discard the final 2-bit field */
+        }
+        {
+            const short max = (short)((2*threshold-1)-remaining);   /* values below `max` are coded on nbBits-1 bits */
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max)
+            {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            }
+            else
+            {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSE_abs(count);   /* a count of -1 consumes one unit, like a count of 1 */
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            while (remaining < threshold)   /* fewer units left : later counts need fewer bits */
+            {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            {
+                if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+                {
+                    ip += bitCount>>3;
+                    bitCount &= 7;
+                }
+                else   /* near the end of input : pin ip to the last 4 bytes and express the position through bitCount */
+                {
+                    bitCount -= (int)(8 * (iend - 4 - ip));
+                    ip = iend - 4;
+                }
+                bitStream = MEM_readLE32(ip) >> (bitCount & 31);   /* `& 31` keeps the shift count below 32 */
+            }
+        }
+    }
+    if (remaining != 1) return ERROR(GENERIC);   /* the decoded counts did not add up to exactly the table size */
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;   /* round the consumed bits up to whole bytes */
+    if ((size_t)(ip-istart) > hbSize) return ERROR(srcSize_wrong);
+    return ip-istart;
+}
+
+
+/*********************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)   /* builds a one-cell table that always decodes `symbolValue`; always returns 0 */
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;   /* header sits at the very start of `dt` */
+    FSE_decode_t* const cell = (FSE_decode_t*)(ptr) + 1;   /* because dt is unsigned */
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;   /* no bit is read per symbol */
+
+    return 0;
+}
+
+
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)   /* builds a table where every symbol is read as a plain nbBits-wide value; returns 0, or ERROR(GENERIC) when nbBits is 0 */
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;   /* header sits at the very start of `dt` */
+    FSE_decode_t* const dinfo = (FSE_decode_t*)(ptr) + 1;   /* because dt is unsigned */
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSymbolValue = tableMask;   /* one cell per possible nbBits-wide value */
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;   /* cell index and symbol are the same, truncated to a byte */
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+FORCE_INLINE size_t FSE_decompress_usingDTable_generic(   /* decodes cSrc into dst with two interleaved states; returns the number of bytes written, or an ERROR() code */
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)   /* `fast` selects FSE_decodeSymbolFast() over FSE_decodeSymbol() */
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;   /* the 4-symbol loop runs only while op < olimit, i.e. while at least 4 bytes remain */
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+    size_t errorCode;
+
+    /* Init */
+    errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+    if (FSE_isError(errorCode)) return errorCode;
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)   /* NOTE : expands to an unparenthesized ?: expression and is never #undef'd */
+
+    /* 4 symbols per loop */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) && (op<olimit) ; op+=4)
+    {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }   /* stream no longer "unfinished" : keep the 2 symbols already written and leave the loop */
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    while (1)   /* one symbol at a time, alternating state1 and state2 */
+    {
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state1);
+
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state2);
+    }
+
+    /* end ? */
+    if (BIT_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2))   /* success requires the bitstream and both states to be at their end */
+        return op-ostart;
+
+    if (op==omax) return ERROR(dstSize_tooSmall);   /* dst buffer is full, but cSrc unfinished */
+
+    return ERROR(corruption_detected);
+}
+
+
+static size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,   /* decodes cSrc with a prebuilt table; returns whatever the generic decoder returns */
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    FSE_DTableHeader DTableH;
+    memcpy(&DTableH, dt, sizeof(DTableH));   /* the header is stored at the start of `dt`; copied out rather than cast */
+
+    /* select fast mode (static) */
+    if (DTableH.fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);   /* `fast` is passed as a literal in each call */
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)   /* reads the count header, builds a table, then decodes the payload; returns bytes written to dst, or an error code */
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSE_MAX_SYMBOL_VALUE+1];   /* normalized counts filled by FSE_readNCount() */
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;   /* in : upper limit; updated by FSE_readNCount() to the last symbol present */
+    size_t errorCode;
+
+    if (cSrcSize<2) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* normal FSE decoding mode */
+    errorCode = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);   /* on success, errorCode holds the header size in bytes */
+    if (FSE_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size */
+    ip += errorCode;   /* skip the header : ip now points at the compressed payload */
+    cSrcSize -= errorCode;
+
+    errorCode = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog);
+    if (FSE_isError(errorCode)) return errorCode;
+
+    /* always return, even if it is an error code */
+    return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt);
+}
+
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+Huff0 source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/****************************************************************
+*  Compiler specifics
+****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+/* inline is defined */
+#elif defined(_MSC_VER)
+#  define inline __inline   /* map `inline` to the MSVC keyword */
+#else
+#  define inline /* disable inline */
+#endif
+
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/****************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+/****************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations; a false `c` yields a division by zero in a constant expression, hence a compile error */
+
+
+/******************************************
+*  Helper functions
+******************************************/
+static unsigned HUF_isError(size_t code) { return ERR_isError(code); }   /* thin wrapper : forwards `code` to ERR_isError() and returns its result */
+
+#define HUF_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#define HUF_MAX_TABLELOG  12           /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
+#define HUF_DEFAULT_TABLELOG  HUF_MAX_TABLELOG   /* tableLog by default, when not specified */
+#define HUF_MAX_SYMBOL_VALUE 255   /* sizes the huffWeight[] buffers (HUF_MAX_SYMBOL_VALUE + 1 entries) */
+#if (HUF_MAX_TABLELOG > HUF_ABSOLUTEMAX_TABLELOG)   /* compile-time guard on the configured value */
+#  error "HUF_MAX_TABLELOG is too large !"
+#endif
+
+
+
+/*********************************************************
+*  Huff0 : Huffman block decompression
+*********************************************************/
+typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding : decoded byte + number of bits to skip in the stream */
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4;  /* double-symbols decoding */
+
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;   /* pairs a symbol with its weight */
+
+/*! HUF_readStats
+    Read compact Huffman tree, saved by HUF_writeCTable
+    @huffWeight : destination buffer (hwSize bytes); @rankStats : receives the number of symbols per weight (HUF_ABSOLUTEMAX_TABLELOG+1 entries are cleared)
+    @return : size read from `src`, or an error code; on success *nbSymbolsPtr and *tableLogPtr are also written
+*/
+static size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                            U32* nbSymbolsPtr, U32* tableLogPtr,
+                            const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    U32 tableLog;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;   /* number of input bytes following the header byte */
+    size_t oSize;   /* number of weights obtained, not counting the implied last one */
+    U32 n;
+
+    if (!srcSize) return ERROR(srcSize_wrong);   /* the header byte itself must be present */
+    iSize = ip[0];
+    //memset(huffWeight, 0, hwSize);   /* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128)  /* special header */
+    {
+        if (iSize >= (242))   /* RLE */
+        {
+            static int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };   /* number of weights for header bytes 242..255 */
+            oSize = l[iSize-242];
+            memset(huffWeight, 1, hwSize);   /* every weight is 1 */
+            iSize = 0;   /* nothing follows the header byte */
+        }
+        else   /* Incompressible */
+        {
+            oSize = iSize - 127;
+            iSize = ((oSize+1)/2);   /* two 4-bit weights per input byte */
+            if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+            if (oSize >= hwSize) return ERROR(corruption_detected);   /* also leaves room for the implied last weight */
+            ip += 1;
+            for (n=0; n<oSize; n+=2)
+            {
+                huffWeight[n]   = ip[n/2] >> 4;   /* high nibble first */
+                huffWeight[n+1] = ip[n/2] & 15;
+            }
+        }
+    }
+    else  /* header compressed with FSE (normal case) */
+    {
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSE_decompress(huffWeight, hwSize-1, ip+1, iSize);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUF_ABSOLUTEMAX_TABLELOG + 1) * sizeof(U32));
+    weightTotal = 0;
+    for (n=0; n<oSize; n++)
+    {
+        if (huffWeight[n] >= HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+        rankStats[huffWeight[n]]++;
+        weightTotal += (1 << huffWeight[n]) >> 1;   /* weight w contributes 2^(w-1); weight 0 contributes nothing */
+    }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    tableLog = BIT_highbit32(weightTotal) + 1;
+    if (tableLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+    {
+        U32 total = 1 << tableLog;
+        U32 rest = total - weightTotal;   /* share left for the implied last symbol */
+        U32 verif = 1 << BIT_highbit32(rest);
+        U32 lastWeight = BIT_highbit32(rest) + 1;
+        if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+        huffWeight[oSize] = (BYTE)lastWeight;
+        rankStats[lastWeight]++;
+    }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);   /* +1 for the implied last symbol */
+    *tableLogPtr = tableLog;
+    return iSize+1;   /* header byte + payload bytes */
+}
+
+
+/**************************/
+/* single-symbol decoding */
+/**************************/
+
+static size_t HUF_readDTableX2 (U16* DTable, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize = ip[0];
+    U32 nbSymbols = 0;
+    U32 n;
+    U32 nextRankStart;
+    void* ptr = DTable+1;
+    HUF_DEltX2* const dt = (HUF_DEltX2*)ptr;
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U16));   /* if compilation fails here, assertion is false */
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(huffWeight, HUF_MAX_SYMBOL_VALUE + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > DTable[0]) return ERROR(tableLog_tooLarge);   /* DTable is too small */
+    DTable[0] = (U16)tableLog;   /* maybe should separate sizeof DTable, as allocated, from used size of DTable, in case of DTable re-use */
+
+    /* Prepare ranks */
+    nextRankStart = 0;
+    for (n=1; n<=tableLog; n++)
+    {
+        U32 current = nextRankStart;
+        nextRankStart += (rankVal[n] << (n-1));
+        rankVal[n] = current;
+    }
+
+    /* fill DTable */
+    for (n=0; n<nbSymbols; n++)
+    {
+        const U32 w = huffWeight[n];
+        const U32 length = (1 << w) >> 1;
+        U32 i;
+        HUF_DEltX2 D;
+        D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+        for (i = rankVal[w]; i < rankVal[w] + length; i++)
+            dt[i] = D;
+        rankVal[w] += length;
+    }
+
+    return iSize;
+}
+
+static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)   /* decodes one byte : peeks dtLog bits, looks up the cell, then skips only the bits that cell declares */
+{
+        const size_t val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+        const BYTE c = dt[val].byte;
+        BIT_skipBits(Dstream, dt[val].nbBits);   /* consume dt[val].nbBits bits, which may be fewer than the dtLog bits peeked */
+        return c;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)   /* unconditional decode; relies on `dt` and `dtLog` being in scope at the call site */
+
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)   /* decodes only when the condition holds; NOTE : expands to a bare `if`, so never follow it with `else` */
+
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)   /* decodes only when MEM_64bits() is true; same bare-`if` caveat */
+
+/* Decodes one bitstream with the single-symbol table, writing from p up to pEnd.
+ * Runs in three phases : batches of up to 4 symbols while the stream can still be
+ * reloaded, then one symbol per reload, then the remaining symbols without reloading.
+ * @return : pEnd minus the initial value of p. */
+static inline size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
+{
+    BYTE* const first = p;
+
+    /* up to 4 symbols at a time */
+    for (;;)
+    {
+        if (BIT_reloadDStream(bitDPtr) != BIT_DStream_unfinished) break;
+        if (!(p <= pEnd-4)) break;
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to the end : one symbol per reload */
+    for (;;)
+    {
+        if (BIT_reloadDStream(bitDPtr) != BIT_DStream_unfinished) break;
+        if (!(p < pEnd)) break;
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* no more data to retrieve from bitstream, hence no need to reload */
+    for ( ; p < pEnd ; )
+    {
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    return pEnd-first;
+}
+
+
+/*
+ * Decodes four Huffman bitstreams into dst using a single-symbol decoding table.
+ *
+ * cSrc layout, as read below : a 6-byte jump table (three little-endian 16-bit stream
+ * lengths), followed by the four streams back to back; the length of the fourth stream
+ * is whatever remains of cSrcSize.
+ * dst is split into segments of (dstSize+3)/4 bytes : streams 1-3 each regenerate one
+ * full segment, stream 4 regenerates the remainder up to dst+dstSize.
+ *
+ * DTable[0] holds the table log; decoding cells start right after it.
+ * @return : dstSize on success, or an error code (testable with HUF_isError()).
+ */
+static size_t HUF_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+
+        const void* ptr = DTable;
+        const HUF_DEltX2* const dt = ((const HUF_DEltX2*)ptr) +1;
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        /* The first three segments must fit inside dst : otherwise opStart4 lies beyond
+         * oend and streams 2-3 would be decoded past the end of the output buffer
+         * (happens for dstSize of 1, 2 or 5). */
+        if (segmentSize * 3 > dstSize) return ERROR(corruption_detected);
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )
+        {
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check : every stream must have been consumed exactly */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+/* Single-symbol, 4-stream decompression : reads the table description located at the
+ * start of cSrc, then decodes the bytes that follow it into dst.
+ * @return : result of HUF_decompress4X2_usingDTable(), or an error code. */
+static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_MAX_TABLELOG);
+    const size_t headerSize = HUF_readDTableX2 (DTable, cSrc, cSrcSize);
+
+    if (HUF_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing left after the table description */
+
+    return HUF_decompress4X2_usingDTable (dst, dstSize,
+                                          (const BYTE*)cSrc + headerSize, cSrcSize - headerSize,
+                                          DTable);
+}
+
+
+/***************************/
+/* double-symbols decoding */
+/***************************/
+
+/*
+ * Fills a sub-range of the double-symbol table, for cells whose first symbol is
+ * already fixed (baseSeq) and has used `consumed` bits.
+ *   DTable         : start of the sub-range to fill
+ *   sizeLog        : log2 of the number of cells in the sub-range
+ *   rankValOrigin  : start position of each weight inside the sub-range (copied locally)
+ *   minWeight      : when > 1, the first rankVal[minWeight] cells get a single-symbol entry
+ *   sortedSymbols / sortedListSize : candidate second symbols, with their weights
+ *   nbBitsBaseline : nbBits of a symbol is nbBitsBaseline - weight
+ */
+static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    HUF_DEltX4 DElt;
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    U32 s;
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values : cells with no room for a second symbol keep only the first one */
+    if (minWeight>1)
+    {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++)   /* note : sortedSymbols already skipped */
+    {
+        const U32 symbol = sortedSymbols[s].symbol;
+        const U32 weight = sortedSymbols[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 length = 1 << (sizeLog-nbBits);
+        const U32 start = rankVal[weight];
+        U32 i = start;
+        const U32 end = start + length;
+
+        /* second symbol goes into the high byte of the 16-bit sequence */
+        MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+        DElt.nbBits = (BYTE)(nbBits + consumed);
+        DElt.length = 2;
+        do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+        /* next symbol of the same weight starts right after this run */
+        rankVal[weight] += length;
+    }
+}
+
+/* Table of HUF_ABSOLUTEMAX_TABLELOG rows, each holding HUF_ABSOLUTEMAX_TABLELOG + 1 U32 values. */
+typedef U32 rankVal_t[HUF_ABSOLUTEMAX_TABLELOG][HUF_ABSOLUTEMAX_TABLELOG + 1];
+
+/*
+ * Fills the whole double-symbol decoding table.
+ * For every sorted symbol, the run of cells starting at rankVal[weight] is either
+ * delegated to HUF_fillDTableX4Level2() (when enough bits remain for a second symbol)
+ * or filled with a single-symbol entry.
+ *   targetLog      : log2 of the table size
+ *   rankStart      : indexed by weight, gives a position inside sortedList
+ *   rankValOrigin  : pre-computed start positions; row 0 is used for this level,
+ *                    row [nbBits] is handed to the second level
+ *   nbBitsBaseline : nbBits of a symbol is nbBitsBaseline - weight
+ */
+static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;
+    U32 s;
+
+    /* local copy of the first row : it is advanced as runs get filled */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++)
+    {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits)   /* enough room for a second symbol */
+        {
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];
+            HUF_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        }
+        else
+        {
+            /* single-symbol entry, replicated over the whole run */
+            U32 i;
+            const U32 end = start + length;
+            HUF_DEltX4 DElt;
+
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits   = (BYTE)(nbBits);
+            DElt.length   = 1;
+            for (i = start; i < end; i++)
+                DTable[i] = DElt;
+        }
+        rankVal[weight] += length;
+    }
+}
+
+/*
+ * Builds the double-symbol decoding table from the weight description stored at the
+ * start of src.
+ * On entry DTable[0] must hold the log2 capacity of the table (memLog); cells are
+ * written starting at DTable+1.
+ * @return : the value returned by HUF_readStats() (size of the description read from src),
+ *           or an error code (testable with HUF_isError()).
+ */
+static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
+{
+    BYTE weightList[HUF_MAX_SYMBOL_VALUE + 1];
+    sortedSymbol_t sortedSymbol[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankStats[HUF_ABSOLUTEMAX_TABLELOG + 1] = { 0 };
+    U32 rankStart0[HUF_ABSOLUTEMAX_TABLELOG + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;
+    rankVal_t rankVal;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    const U32 memLog = DTable[0];
+    size_t iSize;   /* set by HUF_readStats(); src is not dereferenced here before it has been validated */
+    void* ptr = DTable;
+    HUF_DEltX4* const dt = ((HUF_DEltX4*)ptr) + 1;
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(U32));   /* if compilation fails here, assertion is false */
+    if (memLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(weightList, HUF_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        {if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
+
+    /* Get start index of each weight */
+    {
+        U32 w, nextRankStart = 0;
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    {
+        U32 s;
+        for (s=0; s<nbSymbols; s++)
+        {
+            U32 w = weightList[s];
+            U32 r = rankStart[w]++;
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {
+        const U32 minBits = tableLog+1 - maxW;
+        U32 nextRankVal = 0;
+        U32 w, consumed;
+        const int rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
+        U32* rankVal0 = rankVal[0];
+        /* row 0 : start position of each weight in the full-size table */
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankVal;
+            nextRankVal += rankStats[w] << (w+rescale);
+            rankVal0[w] = current;
+        }
+        /* row [consumed] : same positions, scaled down for a sub-table reached after `consumed` bits */
+        for (consumed = minBits; consumed <= memLog - minBits; consumed++)
+        {
+            U32* rankValPtr = rankVal[consumed];
+            for (w = 1; w <= maxW; w++)
+            {
+                rankValPtr[w] = rankVal0[w] >> consumed;
+            }
+        }
+    }
+
+    HUF_fillDTableX4(dt, memLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    return iSize;
+}
+
+
+/* Decodes one table cell : copies its 2-byte sequence to op, consumes the cell's nbBits,
+ * and returns the cell's length field (the caller advances its output pointer by it). */
+static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    const HUF_DEltX4* const cell = dt + BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+
+    memcpy(op, cell, 2);   /* always writes 2 bytes, whatever the cell length */
+    BIT_skipBits(DStream, cell->nbBits);
+    return cell->length;
+}
+
+/* Decodes the final byte of a stream : writes exactly 1 byte to op and returns 1.
+ * When the looked-up cell holds more than one symbol, bitsConsumed is clamped to the
+ * bit-container width after skipping. */
+static U32 HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    const HUF_DEltX4* const cell = dt + BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+
+    memcpy(op, cell, 1);
+
+    if (cell->length==1)
+    {
+        BIT_skipBits(DStream, cell->nbBits);
+        return 1;
+    }
+
+    if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8))
+    {
+        BIT_skipBits(DStream, cell->nbBits);
+        /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+        if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+            DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
+    }
+    return 1;
+}
+
+
+/* Decoding helpers for the double-symbol table.
+ * All three expand to code that references `dt` and `dtLog`, which must be in scope
+ * at the expansion site, and that advance `ptr` by the value HUF_decodeSymbolX4() returns. */
+
+/* _0 : unconditionally decodes one table cell. */
+#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+/* _1 : decodes only when MEM_64bits() is true or HUF_MAX_TABLELOG <= 12; otherwise does nothing.
+ * Expands to an unbraced `if` : use it as a complete statement, never right before an `else`. */
+#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+/* _2 : decodes only when MEM_64bits() is true; otherwise does nothing.
+ * Same unbraced-`if` caveat as _1. */
+#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+/* Decodes one bitstream with the double-symbol table, writing from p towards pEnd.
+ * Batches of up to 4 cells while at least 8 bytes remain, then cell by cell while at
+ * least 2 bytes remain, then a final single byte if one is still missing.
+ * @return : number of bytes by which p advanced. */
+static inline size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const HUF_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const first = p;
+
+    /* up to 8 symbols at a time */
+    for (;;)
+    {
+        if (BIT_reloadDStream(bitDPtr) != BIT_DStream_unfinished) break;
+        if (!(p < pEnd-7)) break;
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    for (;;)
+    {
+        if (BIT_reloadDStream(bitDPtr) != BIT_DStream_unfinished) break;
+        if (!(p <= pEnd-2)) break;
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* no need to reload : reached the end of DStream */
+    for ( ; p <= pEnd-2 ; )
+    {
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* one byte left : a full 2-byte cell copy would overrun pEnd */
+    if (p < pEnd)
+    {
+        p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+    }
+
+    return p-first;
+}
+
+
+
+/*
+ * Decodes four Huffman bitstreams into dst using a double-symbol decoding table.
+ *
+ * cSrc layout, as read below : a 6-byte jump table (three little-endian 16-bit stream
+ * lengths), followed by the four streams back to back; the length of the fourth stream
+ * is whatever remains of cSrcSize.
+ * dst is split into segments of (dstSize+3)/4 bytes : streams 1-3 each regenerate one
+ * full segment, stream 4 regenerates the remainder up to dst+dstSize.
+ *
+ * DTable[0] holds the table log; decoding cells start right after it.
+ * @return : dstSize on success, or an error code (testable with HUF_isError()).
+ */
+static size_t HUF_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+
+        const void* ptr = DTable;
+        const HUF_DEltX4* const dt = ((const HUF_DEltX4*)ptr) +1;
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        /* The first three segments must fit inside dst : otherwise opStart4 lies beyond
+         * oend and streams 2-3 would be decoded past the end of the output buffer
+         * (happens for dstSize of 1, 2 or 5). */
+        if (segmentSize * 3 > dstSize) return ERROR(corruption_detected);
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )
+        {
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check : every stream must have been consumed exactly */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+/* Double-symbol, 4-stream decompression : reads the table description located at the
+ * start of cSrc, then decodes the bytes that follow it into dst.
+ * @return : result of HUF_decompress4X4_usingDTable(), or an error code. */
+static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_MAX_TABLELOG);
+    const size_t headerSize = HUF_readDTableX4 (DTable, cSrc, cSrcSize);
+
+    if (HUF_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing left after the table description */
+
+    return HUF_decompress4X4_usingDTable (dst, dstSize,
+                                          (const BYTE*)cSrc + headerSize, cSrcSize - headerSize,
+                                          DTable);
+}
+
+
+/**********************************/
+/* quad-symbol decoding           */
+/**********************************/
+/* Descriptor of one quad-symbol table cell : bits to consume and number of decoded bytes it yields. */
+typedef struct { BYTE nbBits; BYTE nbBytes; } HUF_DDescX6;
+/* Decoded byte sequence of one quad-symbol table cell (up to 4 bytes), also addressable as one U32. */
+typedef union { BYTE byte[4]; U32 sequence; } HUF_DSeqX6;
+
+/* recursive, up to level 3; may benefit from <template>-like strategy to nest each level inline */
+/*
+ * Fills a sub-range of the quad-symbol tables (descriptors and sequences in parallel).
+ *   DDescription / DSequence : start of the sub-range in each table
+ *   sizeLog        : log2 of the number of cells in the sub-range
+ *   rankValOrigin  : pre-computed start positions; row [consumed] is copied locally
+ *   consumed       : bits already used by the symbols stored in baseSeq
+ *   minWeight      : when > 1, the first rankVal[minWeight] cells keep baseSeq/DDesc unchanged
+ *   rankStart      : indexed by weight, gives the first sorted symbol to consider
+ *   nbBitsBaseline : nbBits of a symbol is nbBitsBaseline - weight
+ *   baseSeq, DDesc : sequence and descriptor accumulated so far (passed by value);
+ *                    DDesc.nbBytes is the current nesting level
+ */
+static void HUF_fillDTableX6LevelN(HUF_DDescX6* DDescription, HUF_DSeqX6* DSequence, int sizeLog,
+                           const rankVal_t rankValOrigin, const U32 consumed, const int minWeight, const U32 maxWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize, const U32* rankStart,
+                           const U32 nbBitsBaseline, HUF_DSeqX6 baseSeq, HUF_DDescX6 DDesc)
+{
+    const int scaleLog = nbBitsBaseline - sizeLog;   /* note : targetLog >= (nbBitsBaseline-1), hence scaleLog <= 1 */
+    const int minBits  = nbBitsBaseline - maxWeight;
+    const U32 level = DDesc.nbBytes;
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    U32 symbolStartPos, s;
+
+    /* local rankVal, will be modified */
+    memcpy(rankVal, rankValOrigin[consumed], sizeof(rankVal));
+
+    /* fill skipped values */
+    if (minWeight>1)
+    {
+        U32 i;
+        const U32 skipSize = rankVal[minWeight];
+        for (i = 0; i < skipSize; i++)
+        {
+            DSequence[i] = baseSeq;
+            DDescription[i] = DDesc;
+        }
+    }
+
+    /* fill DTable */
+    DDesc.nbBytes++;   /* every cell written below holds one more symbol than the caller's */
+    symbolStartPos = rankStart[minWeight];
+    for (s=symbolStartPos; s<sortedListSize; s++)
+    {
+        const BYTE symbol = sortedSymbols[s].symbol;
+        const U32  weight = sortedSymbols[s].weight;   /* >= 1 (sorted) */
+        const int  nbBits = nbBitsBaseline - weight;   /* >= 1 (by construction) */
+        const int  totalBits = consumed+nbBits;
+        const U32  start  = rankVal[weight];
+        const U32  length = 1 << (sizeLog-nbBits);
+        baseSeq.byte[level] = symbol;
+        DDesc.nbBits = (BYTE)totalBits;
+
+        if ((level<3) && (sizeLog-totalBits >= minBits))   /* enough room for another symbol */
+        {
+            int nextMinWeight = totalBits + scaleLog;
+            if (nextMinWeight < 1) nextMinWeight = 1;
+            HUF_fillDTableX6LevelN(DDescription+start, DSequence+start, sizeLog-nbBits,
+                           rankValOrigin, totalBits, nextMinWeight, maxWeight,
+                           sortedSymbols, sortedListSize, rankStart,
+                           nbBitsBaseline, baseSeq, DDesc);   /* recursive (max : level 3) */
+        }
+        else
+        {
+            /* no further nesting : replicate the current sequence over the whole run */
+            U32 i;
+            const U32 end = start + length;
+            for (i = start; i < end; i++)
+            {
+                DDescription[i] = DDesc;
+                DSequence[i] = baseSeq;
+            }
+        }
+        rankVal[weight] += length;
+    }
+}
+
+
+/* note : same preparation as X4 */
+/*
+ * Builds the quad-symbol decoding tables from the weight description stored at the
+ * start of src.
+ * On entry DTable[0] must hold the log2 capacity of the table (memLog). Descriptors are
+ * written starting at DTable+1, sequences starting at DTable + 1 + (1 << (memLog-1)).
+ * @return : the value returned by HUF_readStats() (size of the description read from src),
+ *           or an error code (testable with HUF_isError()).
+ */
+static size_t HUF_readDTableX6 (U32* DTable, const void* src, size_t srcSize)
+{
+    BYTE weightList[HUF_MAX_SYMBOL_VALUE + 1];
+    sortedSymbol_t sortedSymbol[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankStats[HUF_ABSOLUTEMAX_TABLELOG + 1] = { 0 };
+    U32 rankStart0[HUF_ABSOLUTEMAX_TABLELOG + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    rankVal_t rankVal;
+    const U32 memLog = DTable[0];
+    size_t iSize;   /* set by HUF_readStats(); src is not dereferenced here before it has been validated */
+
+    if (memLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(weightList, HUF_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable is too small */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        { if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
+
+
+    /* Get start index of each weight */
+    {
+        U32 w, nextRankStart = 0;
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    {
+        U32 s;
+        for (s=0; s<nbSymbols; s++)
+        {
+            U32 w = weightList[s];
+            U32 r = rankStart[w]++;
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {
+        const U32 minBits = tableLog+1 - maxW;
+        U32 nextRankVal = 0;
+        U32 w, consumed;
+        const int rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
+        U32* rankVal0 = rankVal[0];
+        /* row 0 : start position of each weight in the full-size table */
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankVal;
+            nextRankVal += rankStats[w] << (w+rescale);
+            rankVal0[w] = current;
+        }
+        /* row [consumed] : same positions, scaled down for a sub-table reached after `consumed` bits */
+        for (consumed = minBits; consumed <= memLog - minBits; consumed++)
+        {
+            U32* rankValPtr = rankVal[consumed];
+            for (w = 1; w <= maxW; w++)
+            {
+                rankValPtr[w] = rankVal0[w] >> consumed;
+            }
+        }
+    }
+
+
+    /* fill tables */
+    {
+        void* ptr = DTable+1;
+        HUF_DDescX6* DDescription = (HUF_DDescX6*)(ptr);
+        void* dSeqStart = DTable + 1 + ((size_t)1<<(memLog-1));
+        HUF_DSeqX6* DSequence = (HUF_DSeqX6*)(dSeqStart);
+        HUF_DSeqX6 DSeq;
+        HUF_DDescX6 DDesc;
+        DSeq.sequence = 0;
+        DDesc.nbBits = 0;
+        DDesc.nbBytes = 0;
+        HUF_fillDTableX6LevelN(DDescription, DSequence, memLog,
+                       (const U32 (*)[HUF_ABSOLUTEMAX_TABLELOG + 1])rankVal, 0, 1, maxW,
+                       sortedSymbol, sizeOfSort, rankStart0,
+                       tableLog+1, DSeq, DDesc);
+    }
+
+    return iSize;
+}
+
+
+/* Decodes one table cell : copies its whole sequence (sizeof(HUF_DSeqX6) bytes) to op,
+ * consumes the cell's nbBits and returns the cell's nbBytes. */
+static U32 HUF_decodeSymbolX6(void* op, BIT_DStream_t* DStream, const HUF_DDescX6* dd, const HUF_DSeqX6* ds, const U32 dtLog)
+{
+    const size_t cellIdx = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    const HUF_DDescX6* const desc = dd + cellIdx;
+
+    memcpy(op, ds + cellIdx, sizeof(HUF_DSeqX6));
+    BIT_skipBits(DStream, desc->nbBits);
+    return desc->nbBytes;
+}
+
+/* Decodes one table cell near the end of the output, writing at most maxL bytes to op.
+ * When the cell fits, it is emitted in full. Otherwise only maxL bytes are emitted and
+ * bitsConsumed is clamped to the bit-container width after skipping.
+ * @return : number of bytes written to op. */
+static U32 HUF_decodeLastSymbolsX6(void* op, const U32 maxL, BIT_DStream_t* DStream,
+                                  const HUF_DDescX6* dd, const HUF_DSeqX6* ds, const U32 dtLog)
+{
+    const size_t cellIdx = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    const U32 seqLen = dd[cellIdx].nbBytes;
+
+    if (seqLen > maxL)
+    {
+        /* truncated cell */
+        memcpy(op, ds+cellIdx, maxL);
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8))
+        {
+            BIT_skipBits(DStream, dd[cellIdx].nbBits);
+            /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
+        }
+        return maxL;
+    }
+
+    memcpy(op, ds+cellIdx, seqLen);
+    BIT_skipBits(DStream, dd[cellIdx].nbBits);
+    return seqLen;
+}
+
+
+/* Decoding helpers for the quad-symbol table.
+ * All three expand to code that references `dd`, `ds` and `dtLog`, which must be in scope
+ * at the expansion site, and that advance `ptr` by the value HUF_decodeSymbolX6() returns. */
+
+/* _0 : unconditionally decodes one table cell. */
+#define HUF_DECODE_SYMBOLX6_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX6(ptr, DStreamPtr, dd, ds, dtLog)
+
+/* _1 : decodes only when MEM_64bits() is true or HUF_MAX_TABLELOG <= 12; otherwise does nothing.
+ * Expands to an unbraced `if` : use it as a complete statement, never right before an `else`. */
+#define HUF_DECODE_SYMBOLX6_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        HUF_DECODE_SYMBOLX6_0(ptr, DStreamPtr)
+
+/* _2 : decodes only when MEM_64bits() is true; otherwise does nothing.
+ * Same unbraced-`if` caveat as _1. */
+#define HUF_DECODE_SYMBOLX6_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUF_DECODE_SYMBOLX6_0(ptr, DStreamPtr)
+
+/* Decodes one bitstream with the quad-symbol tables, writing from p towards pEnd.
+ * Batches of up to 4 cells while at least 16 bytes remain, then cell by cell while at
+ * least 4 bytes remain, then length-limited cells for the last bytes.
+ * @return : number of bytes by which p advanced. */
+static inline size_t HUF_decodeStreamX6(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const U32* DTable, const U32 dtLog)
+{
+    /* descriptors start right after DTable[0]; sequences start (1 << (dtLog-1)) U32 further */
+    const void* const descBase = DTable+1;
+    const void* const seqBase = DTable + 1 + ((size_t)1<<(dtLog-1));
+    const HUF_DDescX6* dd = (const HUF_DDescX6*)(descBase);
+    const HUF_DSeqX6* ds = (const HUF_DSeqX6*)(seqBase);
+    BYTE* const first = p;
+
+    /* up to 16 symbols at a time */
+    for (;;)
+    {
+        if (BIT_reloadDStream(bitDPtr) != BIT_DStream_unfinished) break;
+        if (!(p <= pEnd-16)) break;
+        HUF_DECODE_SYMBOLX6_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX6_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX6_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX6_0(p, bitDPtr);
+    }
+
+    /* closer to the end, up to 4 symbols at a time */
+    for (;;)
+    {
+        if (BIT_reloadDStream(bitDPtr) != BIT_DStream_unfinished) break;
+        if (!(p <= pEnd-4)) break;
+        HUF_DECODE_SYMBOLX6_0(p, bitDPtr);
+    }
+
+    /* no need to reload : reached the end of DStream */
+    for ( ; p <= pEnd-4 ; )
+    {
+        HUF_DECODE_SYMBOLX6_0(p, bitDPtr);
+    }
+
+    /* fewer than 4 bytes left : cells are truncated to the remaining room */
+    for ( ; p < pEnd ; )
+    {
+        p += HUF_decodeLastSymbolsX6(p, (U32)(pEnd-p), bitDPtr, dd, ds, dtLog);
+    }
+
+    return p-first;
+}
+
+
+
+/*
+ * Decodes four Huffman bitstreams into dst using the quad-symbol decoding tables.
+ *
+ * cSrc layout, as read below : a 6-byte jump table (three little-endian 16-bit stream
+ * lengths), followed by the four streams back to back; the length of the fourth stream
+ * is whatever remains of cSrcSize.
+ * dst is split into segments of (dstSize+3)/4 bytes : streams 1-3 each regenerate one
+ * full segment, stream 4 regenerates the remainder up to dst+dstSize.
+ *
+ * DTable[0] holds the table log; descriptors start at DTable+1 and sequences at
+ * DTable + 1 + (1 << (dtLog-1)).
+ * @return : dstSize on success, or an error code (testable with HUF_isError()).
+ */
+static size_t HUF_decompress4X6_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+
+        const U32 dtLog = DTable[0];
+        const void* ddPtr = DTable+1;
+        const HUF_DDescX6* dd = (const HUF_DDescX6*)(ddPtr);
+        const void* dsPtr = DTable + 1 + ((size_t)1<<(dtLog-1));
+        const HUF_DSeqX6* ds = (const HUF_DSeqX6*)(dsPtr);
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        /* The first three segments must fit inside dst : otherwise opStart4 lies beyond
+         * oend and streams 2-3 would be decoded past the end of the output buffer
+         * (happens for dstSize of 1, 2 or 5). */
+        if (segmentSize * 3 > dstSize) return ERROR(corruption_detected);
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-64 symbols per loop (4-16 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (op3 <= opStart4) && (endSignal==BIT_DStream_unfinished) && (op4<=(oend-16)) ; )
+        {
+            HUF_DECODE_SYMBOLX6_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX6_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX6_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX6_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX6_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX6_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX6_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX6_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX6_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX6_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX6_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX6_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX6_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX6_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX6_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX6_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX6(op1, &bitD1, opStart2, DTable, dtLog);
+        HUF_decodeStreamX6(op2, &bitD2, opStart3, DTable, dtLog);
+        HUF_decodeStreamX6(op3, &bitD3, opStart4, DTable, dtLog);
+        HUF_decodeStreamX6(op4, &bitD4, oend,     DTable, dtLog);
+
+        /* check : every stream must have been consumed exactly */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+/* HUF_decompress4X6 :
+ * reads the decoding-table description found at the start of cSrc into a table
+ * created with HUF_CREATE_STATIC_DTABLEX6, then hands the remaining input to
+ * HUF_decompress4X6_usingDTable().
+ * @return : result of HUF_decompress4X6_usingDTable(), or an error code */
+static size_t HUF_decompress4X6 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX6(DTable, HUF_MAX_TABLELOG);
+    const BYTE* const istart = (const BYTE*) cSrc;
+    const size_t headerSize = HUF_readDTableX6 (DTable, cSrc, cSrcSize);
+
+    if (HUF_isError(headerSize))
+        return headerSize;               /* table description could not be read */
+    if (headerSize >= cSrcSize)
+        return ERROR(srcSize_wrong);     /* no payload left after the table description */
+
+    return HUF_decompress4X6_usingDTable (dst, dstSize, istart + headerSize, cSrcSize - headerSize, DTable);
+}
+
+
+/**********************************/
+/* Generic decompression selector */
+/**********************************/
+
+/* One entry of the decoder cost model used by HUF_decompress(), which estimates
+ * the cost of a decoder as : tableTime + decode256Time * (dstSize >> 8). */
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+/* Indexed as algoTime[Q][algo] :
+ * - Q is the quantized ratio (cSrcSize * 16 / dstSize) computed by HUF_decompress()
+ * - algo follows the order of HUF_decompress()'s decoder table (4X2, 4X4, 4X6) */
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+/* Common signature of the decoders HUF_decompress() can dispatch to */
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+/* HUF_decompress :
+ * handles the trivial encodings itself (stored when cSrcSize == dstSize,
+ * single repeated byte when cSrcSize == 1), otherwise estimates the cost of each
+ * decoder from algoTime[] and runs the cheapest one.
+ * @return : dstSize, or an error code */
+static size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decoders[3] = { HUF_decompress4X2, HUF_decompress4X4, HUF_decompress4X6 };
+    U32 cost[3];
+    U32 ratioQ;
+    U32 nb256;
+    U32 best = 0;
+    U32 a;
+
+    /* validation checks and trivial encodings */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize)   /* not compressed */
+    {
+        memcpy(dst, cSrc, dstSize);
+        return dstSize;
+    }
+    if (cSrcSize == 1)   /* RLE */
+    {
+        memset(dst, *(const BYTE*)cSrc, dstSize);
+        return dstSize;
+    }
+
+    /* estimated cost of each decoder */
+    ratioQ = (U32)(cSrcSize * 16 / dstSize);   /* < 16 since dstSize > cSrcSize */
+    nb256  = (U32)(dstSize >> 8);
+    for (a=0; a<3; a++)
+        cost[a] = algoTime[ratioQ][a].tableTime + (algoTime[ratioQ][a].decode256Time * nb256);
+
+    /* advantage to algorithms using less memory, for cache eviction */
+    cost[1] += cost[1] >> 4;
+    cost[2] += cost[2] >> 3;
+
+    /* keep the first strictly cheaper candidate */
+    for (a=1; a<3; a++)
+        if (cost[a] < cost[best]) best = a;
+
+    return decoders[best](dst, dstSize, cSrc, cSrcSize);
+}
+/*
+    zstd - standard compression library
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+*  MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*/
+#define ZSTD_MEMORY_USAGE 17
+
+/*!
+ * HEAPMODE :
+ * Select how default compression functions will allocate memory for their hash table,
+ * in memory stack (0, fastest), or in memory heap (1, requires malloc())
+ * Note that compression context is fairly large, as a consequence heap memory is recommended.
+ */
+#ifndef ZSTD_HEAPMODE
+#  define ZSTD_HEAPMODE 1
+#endif /* ZSTD_HEAPMODE */
+
+/*!
+*  LEGACY_SUPPORT :
+*  decompressor can decode older formats (starting from Zstd 0.1+)
+*/
+#ifndef ZSTD_LEGACY_SUPPORT
+#  define ZSTD_LEGACY_SUPPORT 1
+#endif
+
+
+/* *******************************************************
+*  Includes
+*********************************************************/
+#include <stdlib.h>      /* calloc */
+#include <string.h>      /* memcpy, memmove */
+#include <stdio.h>       /* debug : printf */
+
+
+/* *******************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef __AVX2__
+#  include <immintrin.h>   /* AVX2 intrinsics */
+#endif
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+
+/* *******************************************************
+*  Constants
+*********************************************************/
+/* hash table geometry, derived from ZSTD_MEMORY_USAGE */
+#define HASH_LOG (ZSTD_MEMORY_USAGE - 2)
+#define HASH_TABLESIZE (1 << HASH_LOG)
+#define HASH_MASK (HASH_TABLESIZE - 1)
+
+/* NOTE(review): no use of KNUTH is visible in this part of the file */
+#define KNUTH 2654435761
+
+/* single-bit masks */
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+/* postfix size multipliers : `128 KB` expands to `128 *(1 <<10)` */
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BLOCKSIZE (128 KB)                 /* define, for static allocation */
+#define MIN_SEQUENCES_SIZE (2 /*seqNb*/ + 2 /*dumps*/ + 3 /*seqTables*/ + 1 /*bitStream*/)
+#define MIN_CBLOCK_SIZE (3 /*litCSize*/ + MIN_SEQUENCES_SIZE)
+/* literals section types, compared against (*istart & 3) by ZSTD_decodeLiteralsBlock() */
+#define IS_RAW BIT0
+#define IS_RLE BIT1
+
+#define WORKPLACESIZE (BLOCKSIZE*3)
+#define MINMATCH 4   /* added to every decoded match length by ZSTD_decodeSequence() */
+#define MLbits   7
+#define LLbits   6
+#define Offbits  5
+/* MaxML / MaxLL : largest length codes; ZSTD_decodeSequence() treats them as
+ * "read more from the dumps area" markers */
+#define MaxML  ((1<<MLbits )-1)
+#define MaxLL  ((1<<LLbits )-1)
+#define MaxOff   31
+/* xxFSELog : table logs used to size the decoding tables in ZSTD_DCtx_s;
+ * ZSTD_decodeSeqHeaders() rejects headers announcing larger logs */
+#define LitFSELog  11
+#define MLFSELog   10
+#define LLFSELog   10
+#define OffFSELog   9
+#define MAX(a,b) ((a)<(b)?(b):(a))   /* note : evaluates its arguments more than once */
+#define MaxSeq MAX(MaxLL, MaxML)
+
+#define LITERAL_NOENTROPY 63
+#define COMMAND_NOENTROPY 7   /* to remove */
+
+/* sizes, in bytes, of the block header and of the frame header (magic number) */
+static const size_t ZSTD_blockHeaderSize = 3;
+static const size_t ZSTD_frameHeaderSize = 4;
+
+
+/* *******************************************************
+*  Memory operations
+**********************************************************/
+/* ZSTD_copy4 : copies exactly 4 bytes from src to dst */
+static void   ZSTD_copy4(void* dst, const void* src)
+{
+    memcpy(dst, src, 4);
+}
+
+/* ZSTD_copy8 : copies exactly 8 bytes from src to dst */
+static void   ZSTD_copy8(void* dst, const void* src)
+{
+    memcpy(dst, src, 8);
+}
+
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTD_wildcopy : copies by steps of 8 bytes until `length` bytes are covered.
+*   At least one step is always performed, so up to 7-8 bytes too many can be
+*   read and written : both buffers need that much margin. */
+static void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* from = (const BYTE*)src;
+    BYTE* to = (BYTE*)dst;
+    BYTE* const limit = to + length;
+
+    do
+    {
+        ZSTD_copy8(to, from);
+        to += 8;
+        from += 8;
+    } while (to < limit);
+}
+
+
+/* **************************************
+*  Local structures
+****************************************/
+/* Block types, in the numeric order stored in the 2 top bits of a block header
+ * (ZSTD_getcBlockSize() casts `headerFlags >> 6` to this type).
+ * ZSTD_decodeSeqHeaders() also compares its table-type fields against bt_rle / bt_raw. */
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
+
+/* Block description filled by ZSTD_getcBlockSize() */
+typedef struct
+{
+    blockType_t blockType;
+    U32 origSize;   /* size field of the header for bt_rle blocks, 0 otherwise */
+} blockProperties_t;
+
+/* NOTE(review): seqStore_t is not referenced by the decoding code visible in this
+ * part of the file; presumably inherited from the compressor side - confirm. */
+typedef struct {
+    void* buffer;
+    U32*  offsetStart;
+    U32*  offset;
+    BYTE* offCodeStart;
+    BYTE* offCode;
+    BYTE* litStart;
+    BYTE* lit;
+    BYTE* litLengthStart;
+    BYTE* litLength;
+    BYTE* matchLengthStart;
+    BYTE* matchLength;
+    BYTE* dumpsStart;
+    BYTE* dumps;
+} seqStore_t;
+
+
+/* *************************************
+*  Error Management
+***************************************/
+/*! ZSTD_isError
+*   forwards `code` to ERR_isError() : non-zero result means `code` is an error code */
+static unsigned ZSTD_isError(size_t code)
+{
+    return ERR_isError(code);
+}
+
+
+
+/* *************************************************************
+*   Decompression section
+***************************************************************/
+/* Decompression context : decoding tables, streaming state and literals buffer */
+struct ZSTD_DCtx_s
+{
+    U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];     /* literal-length decoding table */
+    U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];   /* offset-code decoding table */
+    U32 MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];     /* match-length decoding table */
+    void* previousDstEnd;   /* end of the output written by the previous ZSTD_decompressContinue() block */
+    void* base;             /* lowest address a match may reference (checked by ZSTD_execSequence()) */
+    size_t expected;        /* exact srcSize the next ZSTD_decompressContinue() call must provide */
+    blockType_t bType;      /* type of the block whose content is expected next (streaming) */
+    U32 phase;              /* streaming step : 0 = frame header, 1 = block header, 2 = block content */
+    const BYTE* litPtr;     /* start of the current block's literals (litBuffer, or inside the input) */
+    size_t litSize;         /* number of literals available at litPtr */
+    BYTE litBuffer[BLOCKSIZE + 8 /* margin for wildcopy */];
+};   /* typedef'd to ZSTD_DCtx within "zstd_static.h" */
+
+
+/* ZSTD_getcBlockSize :
+ * parses a 3-byte block header : type in the 2 top bits of the first byte,
+ * size field in its 3 low bits followed by the next 2 bytes (most significant first).
+ * Fills *bpPtr with the type (and the size field as origSize for bt_rle blocks).
+ * @return : nb of bytes the block occupies after its header
+ *           (0 for bt_end, 1 for bt_rle, the size field otherwise), or an error code */
+static size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const hdr = (const BYTE*)src;
+    blockType_t type;
+    U32 sizeField;
+
+    if (srcSize < 3) return ERROR(srcSize_wrong);
+
+    type = (blockType_t)(hdr[0] >> 6);
+    sizeField = ((hdr[0] & 7)<<16) + (hdr[1]<<8) + hdr[2];
+
+    bpPtr->blockType = type;
+    bpPtr->origSize = (type == bt_rle) ? sizeField : 0;
+
+    switch(type)
+    {
+    case bt_end : return 0;
+    case bt_rle : return 1;
+    default     : return sizeField;
+    }
+}
+
+/* ZSTD_copyUncompressedBlock :
+ * copies a stored block as-is into dst.
+ * @return : srcSize, or dstSize_tooSmall when dst cannot hold the block */
+static size_t ZSTD_copyUncompressedBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    if (srcSize <= maxDstSize)
+    {
+        memcpy(dst, src, srcSize);
+        return srcSize;
+    }
+    return ERROR(dstSize_tooSmall);
+}
+
+
+/** ZSTD_decompressLiterals
+    Decodes a Huffman-compressed literals section made of a 5-byte header
+    (regenerated size, then compressed size) followed by the compressed payload.
+    On entry *maxDstSizePtr is the capacity of dst; on success it receives the
+    number of literals regenerated.
+    @return : nb of bytes read from src, or an error code*/
+static size_t ZSTD_decompressLiterals(void* dst, size_t* maxDstSizePtr,
+                                const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE*)src;
+
+    /* no buffer issue for both reads : srcSize >= MIN_CBLOCK_SIZE */
+    const size_t litSize  = (MEM_readLE32(src) & 0x1FFFFF) >> 2;
+    const size_t litCSize = (MEM_readLE32(istart+2) & 0xFFFFFF) >> 5;
+
+    if (*maxDstSizePtr < litSize) return ERROR(corruption_detected);
+    if (srcSize < litCSize + 5) return ERROR(corruption_detected);
+
+    {
+        const size_t hufResult = HUF_decompress(dst, litSize, istart+5, litCSize);
+        if (HUF_isError(hufResult)) return ERROR(corruption_detected);
+    }
+
+    *maxDstSizePtr = litSize;
+    return litCSize + 5;
+}
+
+
+/** ZSTD_decodeLiteralsBlock
+    Decodes the literals section found at the start of a compressed block and
+    records where the literals live (dctx->litPtr / dctx->litSize) for the
+    sequence decoder. The section type is selected by the 2 low bits of the first byte :
+    - IS_RAW : literals are stored as-is after a 3-byte header
+    - IS_RLE : a single byte (4th byte of the section) repeated litSize times
+    - any other value : Huffman-compressed, see ZSTD_decompressLiterals()
+    Whenever literals end up in dctx->litBuffer, 8 extra bytes are set after them
+    as margin for wildcopy.
+    @return : nb of bytes read from src (< srcSize ), or an error code */
+static size_t ZSTD_decodeLiteralsBlock(void* ctx,
+                          const void* src, size_t srcSize)
+{
+    ZSTD_DCtx* dctx = (ZSTD_DCtx*)ctx;
+    const BYTE* const istart = (const BYTE* const)src;
+
+    /* any compressed block with literals segment must be at least this size */
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    switch(*istart & 3)
+    {
+    default:
+    case 0:
+        {
+            size_t litSize = BLOCKSIZE;   /* in : capacity of litBuffer; out : nb of literals decoded */
+            const size_t readSize = ZSTD_decompressLiterals(dctx->litBuffer, &litSize, src, srcSize);
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, 8);
+            return readSize;   /* works if it's an error too */
+        }
+    case IS_RAW:
+        {
+            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+            if (litSize > srcSize-11)   /* risk of reading too far with wildcopy */
+            {
+                /* literals get copied into litBuffer : they must fit into it */
+                if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
+                if (litSize > srcSize-3) return ERROR(corruption_detected);
+                /* literals start right after the 3-byte header, same as the direct reference below */
+                memcpy(dctx->litBuffer, istart+3, litSize);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, 8);
+                return litSize+3;
+            }
+            /* direct reference into compressed stream */
+            dctx->litPtr = istart+3;
+            dctx->litSize = litSize;
+            return litSize+3;
+        }
+    case IS_RLE:
+        {
+            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+            if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
+            memset(dctx->litBuffer, istart[3], litSize + 8);   /* + 8 : wildcopy margin */
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            return 4;
+        }
+    }
+}
+
+
+/** ZSTD_decodeSeqHeaders
+    Parses the header of the sequences section :
+    - number of sequences (16-bit little-endian) -> *nbSeq
+    - one flag byte : table types for literal lengths (bits 7-6), offsets (bits 5-4)
+      and match lengths (bits 3-2); bit 1 selects the long form of the dumps length
+    - location and length of the "dumps" area -> *dumpsPtr, *dumpsLengthPtr
+    - the 3 decoding tables, each built as RLE, raw, or from a normalized count
+      read with FSE_readNCount()
+    @return : nb of bytes read from src, or an error code */
+static size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
+                         FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb,
+                         const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* ip = istart;
+    const BYTE* const iend = istart + srcSize;
+    U32 LLtype, Offtype, MLtype;
+    U32 LLlog, Offlog, MLlog;
+    size_t dumpsLength;
+
+    /* check : the fixed-size fields read below span at most 5 bytes */
+    if (srcSize < 5) return ERROR(srcSize_wrong);
+
+    /* SeqHead */
+    *nbSeq = MEM_readLE16(ip); ip+=2;
+    LLtype  = *ip >> 6;
+    Offtype = (*ip >> 4) & 3;
+    MLtype  = (*ip >> 2) & 3;
+    if (*ip & 2)
+    {
+        /* long form : dumps length stored on the 2 following bytes, most significant first */
+        dumpsLength  = ip[2];
+        dumpsLength += ip[1] << 8;
+        ip += 3;
+    }
+    else
+    {
+        /* short form : bit 0 of the flag byte is bit 8 of the length, next byte holds the low 8 bits */
+        dumpsLength  = ip[1];
+        dumpsLength += (ip[0] & 1) << 8;
+        ip += 2;
+    }
+    *dumpsPtr = ip;
+    ip += dumpsLength;
+    *dumpsLengthPtr = dumpsLength;
+
+    /* check */
+    if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */
+
+    /* sequences */
+    {
+        S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL and MaxOff */
+        size_t headerSize;
+
+        /* Build DTables */
+        /* literal lengths */
+        switch(LLtype)
+        {
+        case bt_rle :
+            LLlog = 0;
+            FSE_buildDTable_rle(DTableLL, *ip++); break;
+        case bt_raw :
+            LLlog = LLbits;
+            FSE_buildDTable_raw(DTableLL, LLbits); break;
+        default :
+            {   U32 max = MaxLL;
+                headerSize = FSE_readNCount(norm, &max, &LLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (LLlog > LLFSELog) return ERROR(corruption_detected);   /* table would not fit the context */
+                ip += headerSize;
+                FSE_buildDTable(DTableLL, norm, max, LLlog);
+        }   }
+
+        /* offset codes */
+        switch(Offtype)
+        {
+        case bt_rle :
+            Offlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong);   /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableOffb, *ip++ & MaxOff); /* if *ip > MaxOff, data is corrupted */
+            break;
+        case bt_raw :
+            Offlog = Offbits;
+            FSE_buildDTable_raw(DTableOffb, Offbits); break;
+        default :
+            {   U32 max = MaxOff;
+                headerSize = FSE_readNCount(norm, &max, &Offlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (Offlog > OffFSELog) return ERROR(corruption_detected);   /* table would not fit the context */
+                ip += headerSize;
+                FSE_buildDTable(DTableOffb, norm, max, Offlog);
+        }   }
+
+        /* match lengths */
+        switch(MLtype)
+        {
+        case bt_rle :
+            MLlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong); /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableML, *ip++); break;
+        case bt_raw :
+            MLlog = MLbits;
+            FSE_buildDTable_raw(DTableML, MLbits); break;
+        default :
+            {   U32 max = MaxML;
+                headerSize = FSE_readNCount(norm, &max, &MLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (MLlog > MLFSELog) return ERROR(corruption_detected);   /* table would not fit the context */
+                ip += headerSize;
+                FSE_buildDTable(DTableML, norm, max, MLlog);
+    }   }   }
+
+    /* ip now points at the start of the sequences bitstream */
+    return ip-istart;
+}
+
+
+/* One decoded sequence : copy litLength literals, then matchLength bytes
+ * located `offset` bytes behind the write position (see ZSTD_execSequence()) */
+typedef struct {
+    size_t litLength;
+    size_t offset;
+    size_t matchLength;
+} seq_t;
+
+/* Running state of the sequence decoder, updated by ZSTD_decodeSequence() */
+typedef struct {
+    BIT_DStream_t DStream;     /* bitstream shared by the 3 FSE states and the raw offset bits */
+    FSE_DState_t stateLL;      /* literal-length state */
+    FSE_DState_t stateOffb;    /* offset-code state */
+    FSE_DState_t stateML;      /* match-length state */
+    size_t prevOffset;         /* offset history, used when an offset code of 0 is decoded */
+    const BYTE* dumps;         /* next unread byte of the dumps area (long lengths) */
+    const BYTE* dumpsEnd;      /* end of the dumps area */
+} seqState_t;
+
+
+/* ZSTD_decodeSequence :
+ * decodes the next sequence (literal length, offset, match length) from
+ * seqState into *seq, and advances seqState.
+ * On entry, *seq must still hold the previously decoded sequence : its offset
+ * feeds the repeat-offset logic (offset code 0).
+ * Lengths equal to MaxLL / MaxML are escape codes : one extra byte is read from
+ * the dumps area and added, unless that byte is 255, in which case the length
+ * is the 24-bit little-endian value that follows.
+ * Every read from the dumps area is bounded by seqState->dumpsEnd, so corrupted
+ * data cannot make the decoder read past it; in that case the escape length is
+ * left at its base value and decoding carries on (data is corrupted anyway). */
+static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
+{
+    size_t litLength;
+    size_t prevOffset;
+    size_t offset;
+    size_t matchLength;
+    const BYTE* dumps = seqState->dumps;
+    const BYTE* const de = seqState->dumpsEnd;
+
+    /* Literal length */
+    litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));
+    prevOffset = litLength ? seq->offset : seqState->prevOffset;
+    seqState->prevOffset = seq->offset;
+    if (litLength == MaxLL)
+    {
+        const U32 add = (dumps < de) ? *dumps++ : 0;   /* never read beyond the dumps area */
+        if (add < 255) litLength += add;
+        else if (dumps + 3 <= de)
+        {
+            litLength = MEM_readLE32(dumps) & 0xFFFFFF;  /* no pb : dumps is always followed by seq tables > 1 byte */
+            dumps += 3;
+        }
+        if (dumps >= de) dumps = de-1;   /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+
+    /* Offset */
+    {
+        static const size_t offsetPrefix[MaxOff+1] = {  /* note : size_t faster than U32 */
+                1 /*fake*/, 1, 2, 4, 8, 16, 32, 64, 128, 256,
+                512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144,
+                524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, /*fake*/ 1, 1, 1, 1, 1 };
+        U32 offsetCode, nbBits;
+        offsetCode = FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));   /* <= maxOff, by table construction */
+        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+        nbBits = offsetCode - 1;
+        if (offsetCode==0) nbBits = 0;   /* cmove */
+        offset = offsetPrefix[offsetCode] + BIT_readBits(&(seqState->DStream), nbBits);
+        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+        if (offsetCode==0) offset = prevOffset;   /* cmove */
+    }
+
+    /* MatchLength */
+    matchLength = FSE_decodeSymbol(&(seqState->stateML), &(seqState->DStream));
+    if (matchLength == MaxML)
+    {
+        const U32 add = (dumps < de) ? *dumps++ : 0;   /* never read beyond the dumps area */
+        if (add < 255) matchLength += add;
+        else if (dumps + 3 <= de)
+        {
+            matchLength = MEM_readLE32(dumps) & 0xFFFFFF;  /* no pb : dumps is always followed by seq tables > 1 byte */
+            dumps += 3;
+        }
+        if (dumps >= de) dumps = de-1;   /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+    matchLength += MINMATCH;
+
+    /* save result */
+    seq->litLength = litLength;
+    seq->offset = offset;
+    seq->matchLength = matchLength;
+    seqState->dumps = dumps;
+}
+
+
+/* ZSTD_execSequence :
+ * applies one sequence at `op` : copies sequence.litLength literals from *litPtr,
+ * then sequence.matchLength bytes located sequence.offset bytes behind the
+ * resulting write position. *litPtr is advanced past the consumed literals.
+ * Requires op <= oend and *litPtr <= litLimit on entry.
+ * Bounds are validated on lengths rather than on pointer sums, so that oversized
+ * lengths coming from corrupted data cannot wrap around the address space and
+ * slip through the checks.
+ * @return : nb of bytes written at op (litLength + matchLength), or an error code */
+static size_t ZSTD_execSequence(BYTE* op,
+                                seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                BYTE* const base, BYTE* const oend)
+{
+    static const int dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};   /* added */
+    static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11};   /* subtracted */
+    const BYTE* const ostart = op;
+    BYTE* const oLitEnd = op + sequence.litLength;
+    BYTE* const oMatchEnd = op + sequence.litLength + sequence.matchLength;   /* only trusted once the length checks below have passed */
+    BYTE* const oend_8 = oend-8;
+    const BYTE* const litEnd = *litPtr + sequence.litLength;
+
+    /* checks */
+    if (sequence.litLength + sequence.matchLength > (size_t)(oend - op)) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
+    if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall);   /* last match must start at a minimum distance of 8 from oend */
+    if (sequence.litLength > (size_t)(litLimit - *litPtr)) return ERROR(corruption_detected);   /* overRead beyond lit buffer */
+
+    /* copy Literals */
+    ZSTD_wildcopy(op, *litPtr, sequence.litLength);   /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = litEnd;   /* update for next sequence */
+
+    /* copy Match */
+    {
+        const BYTE* match = op - sequence.offset;
+
+        /* check */
+        if (sequence.offset > (size_t)op) return ERROR(corruption_detected);   /* address space overflow test (this test seems kept by clang optimizer) */
+        //if (match > op) return ERROR(corruption_detected);   /* address space overflow test (is clang optimizer removing this test ?) */
+        if (match < base) return ERROR(corruption_detected);
+
+        /* close range match, overlap */
+        if (sequence.offset < 8)
+        {
+            /* source and destination overlap within 8 bytes : copy the first 4 bytes one by one,
+             * then realign match with the tables so the following 8-byte copies stay valid */
+            const int dec64 = dec64table[sequence.offset];
+            op[0] = match[0];
+            op[1] = match[1];
+            op[2] = match[2];
+            op[3] = match[3];
+            match += dec32table[sequence.offset];
+            ZSTD_copy4(op+4, match);
+            match -= dec64;
+        }
+        else
+        {
+            ZSTD_copy8(op, match);
+        }
+        op += 8; match += 8;
+
+        if (oMatchEnd > oend-(16-MINMATCH))
+        {
+            /* too close to the end of dst for a plain wildcopy : finish byte by byte */
+            if (op < oend_8)
+            {
+                ZSTD_wildcopy(op, match, oend_8 - op);
+                match += oend_8 - op;
+                op = oend_8;
+            }
+            while (op < oMatchEnd) *op++ = *match++;
+        }
+        else
+        {
+            ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+        }
+    }
+
+    return oMatchEnd - ostart;
+}
+
+/* ZSTD_decompressSequences :
+ * decodes the sequences section [seqStart, seqStart+seqSize) of a block and
+ * regenerates the block into dst, taking literals from dctx->litPtr / dctx->litSize
+ * (set beforehand by ZSTD_decodeLiteralsBlock()). Literals left over after the last
+ * sequence are appended at the end.
+ * @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompressSequences(
+                               void* ctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    ZSTD_DCtx* dctx = (ZSTD_DCtx*)ctx;
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t errorCode, dumpsLength;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    int nbSeq;
+    const BYTE* dumps;
+    U32* DTableLL = dctx->LLTable;
+    U32* DTableML = dctx->MLTable;
+    U32* DTableOffb = dctx->OffTable;
+    BYTE* const base = (BYTE*) (dctx->base);
+
+    /* Build Decoding Tables */
+    errorCode = ZSTD_decodeSeqHeaders(&nbSeq, &dumps, &dumpsLength,
+                                      DTableLL, DTableML, DTableOffb,
+                                      ip, iend-ip);
+    if (ZSTD_isError(errorCode)) return errorCode;
+    ip += errorCode;   /* not an error : nb of header bytes consumed */
+
+    /* Regen sequences */
+    {
+        seq_t sequence;
+        seqState_t seqState;
+
+        memset(&sequence, 0, sizeof(sequence));   /* ZSTD_decodeSequence() reads the previous sequence's offset */
+        seqState.dumps = dumps;
+        seqState.dumpsEnd = dumps + dumpsLength;
+        seqState.prevOffset = 1;
+        errorCode = BIT_initDStream(&(seqState.DStream), ip, iend-ip);
+        if (ERR_isError(errorCode)) return ERROR(corruption_detected);
+        FSE_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSE_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSE_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        /* decode and execute sequences one by one, reloading the bitstream before each */
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (nbSeq>0) ; )
+        {
+            size_t oneSeqSize;
+            nbSeq--;
+            ZSTD_decodeSequence(&sequence, &seqState);
+            oneSeqSize = ZSTD_execSequence(op, sequence, &litPtr, litEnd, base, oend);
+            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
+        }
+
+        /* check if reached exact end */
+        if ( !BIT_endOfDStream(&(seqState.DStream)) ) return ERROR(corruption_detected);   /* requested too much : data is corrupted */
+        if (nbSeq<0) return ERROR(corruption_detected);   /* requested too many sequences : data is corrupted */
+
+        /* last literal segment */
+        {
+            size_t lastLLSize = litEnd - litPtr;
+            if (litPtr > litEnd) return ERROR(corruption_detected);
+            if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
+            if (op != litPtr) memmove(op, litPtr, lastLLSize);   /* skipped when literals already sit at op */
+            op += lastLLSize;
+        }
+    }
+
+    return op-ostart;
+}
+
+
+/* ZSTD_decompressBlock :
+ * decodes one block of type bt_compressed : first its literals section,
+ * then its sequences section, which regenerates the block into dst.
+ * @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompressBlock(
+                            void* ctx,
+                            void* dst, size_t maxDstSize,
+                      const void* src, size_t srcSize)
+{
+    const BYTE* const blockStart = (const BYTE*)src;
+    const size_t litSectionSize = ZSTD_decodeLiteralsBlock(ctx, src, srcSize);
+
+    if (ZSTD_isError(litSectionSize)) return litSectionSize;
+
+    /* whatever follows the literals section is the sequences section */
+    return ZSTD_decompressSequences(ctx, dst, maxDstSize,
+                                    blockStart + litSectionSize, srcSize - litSectionSize);
+}
+
+
+/* ZSTD_decompressDCtx :
+ * decompresses a complete frame held in [src, src+srcSize) into dst, using ctx
+ * as working context : checks the magic number, then decodes blocks one after
+ * another until the end-of-frame block is met.
+ * @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t remainingSize = srcSize;
+    U32 magicNumber;
+    blockProperties_t blockProperties;
+
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    magicNumber = MEM_readLE32(src);
+    if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+    ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize;
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t decodedSize=0;
+        size_t cBlockSize = ZSTD_getcBlockSize(ip, iend-ip, &blockProperties);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTD_decompressBlock(ctx, op, oend-op, ip, cBlockSize);
+            break;
+        case bt_raw :
+            decodedSize = ZSTD_copyUncompressedBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet supported */
+            break;
+        case bt_end :
+            /* end of frame */
+            if (remainingSize) return ERROR(srcSize_wrong);
+            break;
+        default:
+            return ERROR(GENERIC);   /* impossible */
+        }
+        /* note : tested before decodedSize, so any block announcing a size of 0 ends the loop here */
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        if (ZSTD_isError(decodedSize)) return decodedSize;
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return op-ostart;
+}
+
+/* ZSTD_decompress :
+ * one-shot frame decompression, using a context that lives on the stack for
+ * the duration of the call; dst is the lowest address matches may reference.
+ * @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    ZSTD_DCtx localCtx;
+
+    localCtx.base = dst;
+    return ZSTD_decompressDCtx(&localCtx, dst, maxDstSize, src, srcSize);
+}
+
+/* ZSTD_findFrameCompressedSize :
+ * walks the block headers of the frame starting at src, without decoding any
+ * block content, up to and including the end-of-frame block header.
+ * @return : nb of bytes occupied by the frame, or an error code */
+static size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
+{
+    const BYTE* const frameStart = (const BYTE*)src;
+    const BYTE* cursor = frameStart;
+    size_t bytesLeft = srcSize;
+    blockProperties_t props;
+
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    if (MEM_readLE32(src) != ZSTD_magicNumber) return ERROR(prefix_unknown);
+    cursor += ZSTD_frameHeaderSize;
+    bytesLeft -= ZSTD_frameHeaderSize;
+
+    /* Loop on each block */
+    for (;;)
+    {
+        const size_t blockSize = ZSTD_getcBlockSize(cursor, bytesLeft, &props);
+        if (ZSTD_isError(blockSize)) return blockSize;
+
+        /* step over the block header */
+        cursor += ZSTD_blockHeaderSize;
+        bytesLeft -= ZSTD_blockHeaderSize;
+        if (blockSize > bytesLeft) return ERROR(srcSize_wrong);
+
+        if (blockSize == 0) break;   /* bt_end */
+
+        /* step over the block content */
+        cursor += blockSize;
+        bytesLeft -= blockSize;
+    }
+
+    return cursor - frameStart;
+}
+
+/*******************************
+*  Streaming Decompression API
+*******************************/
+
+/* ZSTD_resetDCtx :
+ * puts the context back at the start of a frame : next input expected is a
+ * frame header, and no previous output is remembered.
+ * @return : 0 */
+static size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx)
+{
+    /* streaming state */
+    dctx->phase = 0;
+    dctx->expected = ZSTD_frameHeaderSize;
+
+    /* output history */
+    dctx->base = NULL;
+    dctx->previousDstEnd = NULL;
+
+    return 0;
+}
+
+/* ZSTD_createDCtx :
+ * allocates a decompression context with malloc() and resets it.
+ * @return : the new context (to be released with ZSTD_freeDCtx()),
+ *           or NULL when allocation fails */
+static ZSTD_DCtx* ZSTD_createDCtx(void)
+{
+    ZSTD_DCtx* const newCtx = (ZSTD_DCtx*)malloc(sizeof(*newCtx));
+
+    if (newCtx != NULL)
+        ZSTD_resetDCtx(newCtx);
+
+    return newCtx;
+}
+
+/* ZSTD_freeDCtx :
+ * releases a context with free() (a NULL pointer is therefore accepted).
+ * @return : 0 */
+static size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+{
+    void* const block = dctx;
+
+    free(block);
+    return 0;
+}
+
+/* ZSTD_nextSrcSizeToDecompress :
+ * @return : the exact srcSize the next ZSTD_decompressContinue() call must be given */
+static size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx)
+{
+    const size_t nextSize = dctx->expected;
+
+    return nextSize;
+}
+
+/* ZSTD_decompressContinue :
+ * streaming decompression, one step per call. srcSize must be exactly ctx->expected
+ * (see ZSTD_nextSrcSizeToDecompress()). Steps follow ctx->phase :
+ *   0 : frame header (magic number)  -> returns 0
+ *   1 : block header                 -> returns 0
+ *   2 : block content                -> returns nb of bytes written into dst
+ * After an end-of-frame block header, ctx->expected becomes 0 and the phase goes back to 0.
+ * @return : nb of bytes written into dst (can be 0), or an error code */
+static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    /* Sanity check */
+    if (srcSize != ctx->expected) return ERROR(srcSize_wrong);
+    if (dst != ctx->previousDstEnd)  /* not contiguous */
+        ctx->base = dst;   /* matches can no longer reference earlier output */
+
+    /* Decompress : frame header */
+    if (ctx->phase == 0)
+    {
+        /* Check frame magic header */
+        U32 magicNumber = MEM_readLE32(src);
+        if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+        ctx->phase = 1;
+        ctx->expected = ZSTD_blockHeaderSize;
+        return 0;
+    }
+
+    /* Decompress : block header */
+    if (ctx->phase == 1)
+    {
+        blockProperties_t bp;
+        size_t blockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+        if (ZSTD_isError(blockSize)) return blockSize;
+        if (bp.blockType == bt_end)
+        {
+            /* end of frame : nothing more to read */
+            ctx->expected = 0;
+            ctx->phase = 0;
+        }
+        else
+        {
+            /* next call must bring the whole block content */
+            ctx->expected = blockSize;
+            ctx->bType = bp.blockType;
+            ctx->phase = 2;
+        }
+
+        return 0;
+    }
+
+    /* Decompress : block content */
+    {
+        size_t rSize;
+        switch(ctx->bType)
+        {
+        case bt_compressed:
+            rSize = ZSTD_decompressBlock(ctx, dst, maxDstSize, src, srcSize);
+            break;
+        case bt_raw :
+            rSize = ZSTD_copyUncompressedBlock(dst, maxDstSize, src, srcSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet handled */
+            break;
+        case bt_end :   /* should never happen (filtered at phase 1) */
+            rSize = 0;
+            break;
+        default:
+            return ERROR(GENERIC);
+        }
+        /* note : the state below is updated even when rSize is an error code */
+        ctx->phase = 1;
+        ctx->expected = ZSTD_blockHeaderSize;
+        ctx->previousDstEnd = (void*)( ((char*)dst) + rSize);
+        return rSize;
+    }
+
+}
+
+
+/* wrapper layer */
+
+/* ZSTDv02_isError : public name of ZSTD_isError() */
+unsigned ZSTDv02_isError(size_t code)
+{
+    return ZSTD_isError(code);
+}
+
+/* ZSTDv02_decompress : public name of ZSTD_decompress() (one-shot frame decompression) */
+size_t ZSTDv02_decompress( void* dst, size_t maxOriginalSize,
+                     const void* src, size_t compressedSize)
+{
+    return ZSTD_decompress(dst, maxOriginalSize, src, compressedSize);
+}
+
+/* ZSTDv02_findFrameCompressedSize : public name of ZSTD_findFrameCompressedSize() */
+size_t ZSTDv02_findFrameCompressedSize(const void *src, size_t compressedSize)
+{
+    return ZSTD_findFrameCompressedSize(src, compressedSize);
+}
+
+/* ZSTDv02_createDCtx : public name of ZSTD_createDCtx(); the context is returned as ZSTDv02_Dctx* */
+ZSTDv02_Dctx* ZSTDv02_createDCtx(void)
+{
+    return (ZSTDv02_Dctx*)ZSTD_createDCtx();
+}
+
+/* ZSTDv02_freeDCtx : public name of ZSTD_freeDCtx() */
+size_t ZSTDv02_freeDCtx(ZSTDv02_Dctx* dctx)
+{
+    return ZSTD_freeDCtx((ZSTD_DCtx*)dctx);
+}
+
+/* ZSTDv02_resetDCtx : public name of ZSTD_resetDCtx() */
+size_t ZSTDv02_resetDCtx(ZSTDv02_Dctx* dctx)
+{
+    return ZSTD_resetDCtx((ZSTD_DCtx*)dctx);
+}
+
+/* ZSTDv02_nextSrcSizeToDecompress : public name of ZSTD_nextSrcSizeToDecompress() */
+size_t ZSTDv02_nextSrcSizeToDecompress(ZSTDv02_Dctx* dctx)
+{
+    return ZSTD_nextSrcSizeToDecompress((ZSTD_DCtx*)dctx);
+}
+
+/* ZSTDv02_decompressContinue : public name of ZSTD_decompressContinue() (streaming decompression) */
+size_t ZSTDv02_decompressContinue(ZSTDv02_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return ZSTD_decompressContinue((ZSTD_DCtx*)dctx, dst, maxDstSize, src, srcSize);
+}
diff --git a/deps/SZ/zstd/legacy/zstd_v02.h b/deps/SZ/zstd/legacy/zstd_v02.h
new file mode 100644
index 0000000000000000000000000000000000000000..0dde7a63773c5213233fd7740c0e73b1ca204100
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v02.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V02_H_4174539423
+#define ZSTD_V02_H_4174539423
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stddef.h>   /* size_t */
+
+
+/* *************************************
+*  Simple one-step function
+***************************************/
+/**
+ZSTDv02_decompress() : decompress ZSTD frames compliant with v0.2.x format
+    compressedSize : is the exact source size
+    maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+                      It must be equal or larger than originalSize, otherwise decompression will fail.
+    return : the number of bytes decompressed into destination buffer (originalSize)
+             or an errorCode if it fails (which can be tested using ZSTDv02_isError())
+*/
+size_t ZSTDv02_decompress( void* dst, size_t maxOriginalSize,
+                     const void* src, size_t compressedSize);
+
+/**
+ZSTDv02_findFrameCompressedSize() : get the source length of a ZSTD frame compliant with v0.2.x format
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv02_isError())
+*/
+size_t ZSTDv02_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+/**
+ZSTDv02_isError() : tells if the result of ZSTDv02_decompress() is an error
+*/
+unsigned ZSTDv02_isError(size_t code);
+
+
+/* *************************************
+*  Advanced functions
+***************************************/
+typedef struct ZSTDv02_Dctx_s ZSTDv02_Dctx;
+ZSTDv02_Dctx* ZSTDv02_createDCtx(void);
+size_t ZSTDv02_freeDCtx(ZSTDv02_Dctx* dctx);
+
+size_t ZSTDv02_decompressDCtx(void* ctx,
+                              void* dst, size_t maxOriginalSize,
+                        const void* src, size_t compressedSize);
+
+/* *************************************
+*  Streaming functions
+***************************************/
+size_t ZSTDv02_resetDCtx(ZSTDv02_Dctx* dctx);
+
+size_t ZSTDv02_nextSrcSizeToDecompress(ZSTDv02_Dctx* dctx);
+size_t ZSTDv02_decompressContinue(ZSTDv02_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+  Use the above functions alternately.
+  ZSTDv02_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTDv02_decompressContinue().
+  ZSTDv02_decompressContinue() will use previous data blocks while decompressing if they are located prior to the current block.
+  Result is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTDv02_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+*  Prefix - version detection
+***************************************/
+#define ZSTDv02_magicNumber 0xFD2FB522   /* v0.2 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V02_H_4174539423 */
diff --git a/deps/SZ/zstd/legacy/zstd_v03.c b/deps/SZ/zstd/legacy/zstd_v03.c
new file mode 100644
index 0000000000000000000000000000000000000000..54445af577edd91cf5b0f7fdf6cbb12a42458483
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v03.c
@@ -0,0 +1,3124 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include "zstd_v03.h"
+#include "error_private.h"
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+
+
+
+/* ******************************************************************
+   mem.h
+   low-level memory access routines
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/******************************************
+*  Includes
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include <string.h>    /* memcpy */
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined(__GNUC__)
+#  define MEM_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/****************************************************************
+*  Basic Types
+*****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/****************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ *            It can generate buggy code on targets generating assembly depending on alignment.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return (sizeof(void*) == 4) ? 1u : 0u; }   /* 1 when a pointer occupies 4 bytes, else 0 */
+MEM_STATIC unsigned MEM_64bits(void) { return (sizeof(void*) == 8) ? 1u : 0u; }   /* 1 when a pointer occupies 8 bytes, else 0 */
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+    const union { U32 word; BYTE bytes[4]; } probe = { 1 };   /* kept non-static on purpose : static was noted as detrimental to performance */
+    return probe.bytes[0];   /* non-zero only when the low-order byte of `word` is stored first */
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* violates C standard on structure alignment.
+Only use if no other choice to achieve best performance on target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }   /* variant for MEM_FORCE_MEMORY_ACCESS==2 : plain dereference through a cast pointer */
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }   /* same technique, 32-bit load */
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }   /* same technique, 64-bit load */
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }   /* plain 16-bit store through a cast pointer */
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;   /* packed overlay used by the MEM_FORCE_MEMORY_ACCESS==1 accessors below */
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }   /* 16-bit load through the packed union */
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }   /* 32-bit load through the packed union */
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }   /* 64-bit load through the packed union */
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }   /* 16-bit store through the packed union */
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 loaded; memcpy(&loaded, memPtr, sizeof(U16)); return loaded;   /* default variant : copy the bytes out with memcpy() */
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 loaded; memcpy(&loaded, memPtr, sizeof(U32)); return loaded;   /* default variant : copy the bytes out with memcpy() */
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 loaded; memcpy(&loaded, memPtr, sizeof(U64)); return loaded;   /* default variant : copy the bytes out with memcpy() */
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(U16));   /* default variant : copy the two bytes of `value` to memPtr */
+}
+
+
+#endif // MEM_FORCE_MEMORY_ACCESS
+
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    /* Reads a 16-bit value stored in little-endian byte order. */
+    const BYTE* const bytes = (const BYTE*)memPtr;
+
+    /* Host is not little-endian : rebuild the value from its bytes, low-order byte first. */
+    if (!MEM_isLittleEndian())
+        return (U16)(bytes[0] + (bytes[1]<<8));
+    return MEM_read16(memPtr);   /* host order already matches : plain native read */
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    /* Stores `val` at memPtr in little-endian byte order. */
+    BYTE* const out = (BYTE*)memPtr;
+    if (!MEM_isLittleEndian())
+    {
+        /* host is not little-endian : emit the two bytes by hand, low-order byte first */
+        out[0] = (BYTE)val;
+        out[1] = (BYTE)(val>>8);
+        return;
+    }
+    MEM_write16(memPtr, val);   /* host order already matches : plain native write */
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    /* Reads a 32-bit value stored in little-endian byte order. */
+    const BYTE* const bytes = (const BYTE*)memPtr;
+
+    if (MEM_isLittleEndian())   /* host order already matches : plain native read */
+        return MEM_read32(memPtr);
+    /* otherwise rebuild the value byte by byte, low-order byte first */
+    return (U32)((U32)bytes[0] + ((U32)bytes[1]<<8) + ((U32)bytes[2]<<16) + ((U32)bytes[3]<<24));
+}
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    /* Reads a 64-bit value stored in little-endian byte order. */
+    const BYTE* const bytes = (const BYTE*)memPtr;
+    U64 lowHalf, highHalf;
+
+    if (MEM_isLittleEndian()) return MEM_read64(memPtr);   /* host order already matches : plain native read */
+    lowHalf  = (U64)bytes[0] + ((U64)bytes[1]<<8) + ((U64)bytes[2]<<16) + ((U64)bytes[3]<<24);
+    highHalf = ((U64)bytes[4]<<32) + ((U64)bytes[5]<<40) + ((U64)bytes[6]<<48) + ((U64)bytes[7]<<56);
+    return (U64)(lowHalf + highHalf);   /* the two halves occupy disjoint bit ranges */
+}
+
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    /* Reads one little-endian value and returns it as size_t :
+     * 4 bytes are read when MEM_32bits() is true, 8 bytes otherwise. */
+    return MEM_32bits() ? (size_t)MEM_readLE32(memPtr)
+                        : (size_t)MEM_readLE64(memPtr);
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+
+
+/* ******************************************************************
+   bitstream
+   Part of NewGen Entropy library
+   header file (to include)
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+*  This API consists of small unitary functions, which highly benefit from being inlined.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+
+/**********************************************
+*  bitStream decompression API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;   /* window of stream bits, loaded from `ptr` with MEM_readLEST() (or assembled byte by byte for short inputs) */
+    unsigned bitsConsumed;   /* how many bits of bitContainer have already been used; advanced by BIT_skipBits() */
+    const char* ptr;         /* address the container was last loaded from; BIT_reloadDStream() moves it toward `start` */
+    const char* start;       /* first byte of the source buffer given to BIT_initDStream() */
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,    /* container was refilled without having to clamp at `start` */
+               BIT_DStream_endOfBuffer = 1,   /* read position is at, or was clamped to, `start` */
+               BIT_DStream_completed = 2,     /* at `start` and every bit of the container has been consumed */
+               BIT_DStream_overflow = 3 } BIT_DStream_status;  /* result of BIT_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+
+/******************************************
+*  unsafe API
+******************************************/
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/****************************************************************
+*  Helper functions
+****************************************************************/
+MEM_STATIC unsigned BIT_highbit32 (U32 val)   /* returns the index (0 = lowest) of the highest set bit of val */
+{   /* NOTE(review): val == 0 is not special-cased here; the callers visible in this file reject 0 before calling - confirm for other callers */
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    _BitScanReverse ( &r, val );
+    return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+    return 31 - __builtin_clz (val);   /* leading-zero count converted to a bit index */
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    unsigned r;
+    v |= v >> 1;    /* the five OR-shift steps copy the highest set bit into every lower position */
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    r = DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];   /* top 5 bits of the product select the table entry */
+    return r;
+#   endif
+}
+
+
+
+/**********************************************************
+* bitStream decoding
+**********************************************************/
+
+/*!BIT_initDStream
+*  Initialize a BIT_DStream_t.
+*  @bitD : a pointer to an already allocated BIT_DStream_t structure
+*  @srcBuffer must point at the beginning of a bitStream
+*  @srcSize must be the exact size of the bitStream
+*  @result : size of stream (== srcSize) or an errorCode if a problem is detected
+*/
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }   /* empty input : zero the state and report an error */
+
+    if (srcSize >=  sizeof(size_t))   /* normal case */
+    {
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);   /* the stream is read backward : begin with the last sizeof(size_t) bytes */
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];   /* last byte of the source carries the end mark */
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BIT_highbit32(contain32);   /* skip the end mark bit and the zero bits above it */
+    }
+    else
+    {
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;   /* source is shorter than one container : everything is gathered right here */
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)   /* every case deliberately falls through to the ones below it */
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);   /* fall through */
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);   /* fall through */
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);   /* fall through */
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24;   /* fall through */
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16;   /* fall through */
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8;   /* fall through */
+            default:;   /* srcSize == 1 : the first byte was already stored above */
+        }
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];   /* last byte of the source carries the end mark */
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BIT_highbit32(contain32);
+        bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8;   /* also count the container bytes the short source could not fill */
+    }
+
+    return srcSize;   /* success : the stream size is echoed back */
+}
+MEM_STATIC size_t BIT_lookBits(BIT_DStream_t* bitD, U32 nbBits)   /* returns the next nbBits unread bits; bitsConsumed is left unchanged */
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;   /* container width in bits, minus 1; also masks shift counts below the width */
+    return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask);   /* left shift drops consumed bits; the right shift is split as 1 + (width-1-nbBits) so each count stays below the width, even for nbBits == 0 */
+}
+
+/*! BIT_lookBitsFast :
+*   unsafe version; works only if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(BIT_DStream_t* bitD, U32 nbBits)   /* like BIT_lookBits() but with a single right shift */
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;   /* container width in bits, minus 1 */
+    return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask+1)-nbBits) & bitMask);   /* for nbBits == 0 the masked count becomes 0, so nothing would be shifted out : hence the nbBits >= 1 requirement */
+}
+
+MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed = bitD->bitsConsumed + nbBits;   /* mark nbBits more bits as used; the container itself is not modified */
+}
+
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    const size_t peeked = BIT_lookBits(bitD, nbBits);   /* look at the next nbBits bits ... */
+    bitD->bitsConsumed += nbBits;                       /* ... then advance past them (the same update BIT_skipBits() performs) */
+    return peeked;
+}
+
+/*!BIT_readBitsFast :
+*  unsafe version; works only if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{
+    const size_t peeked = BIT_lookBitsFast(bitD, nbBits);   /* look at the next nbBits bits (fast path) ... */
+    bitD->bitsConsumed += nbBits;                           /* ... then advance past them (the same update BIT_skipBits() performs) */
+    return peeked;
+}
+
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+    /* Refills the container from earlier source bytes; bounds are tested on the offset (ptr - start), never on an out-of-buffer pointer. */
+    const size_t offset = (size_t)(bitD->ptr - bitD->start);   /* BIT_initDStream() and this function keep ptr >= start */
+
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BIT_DStream_overflow;
+
+    if (offset >= sizeof(bitD->bitContainer))   /* a whole container still fits between start and ptr */
+    {
+        bitD->ptr -= bitD->bitsConsumed >> 3;   /* step back by the number of whole bytes consumed */
+        bitD->bitsConsumed &= 7;                /* keep only the sub-byte remainder */
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BIT_DStream_unfinished;
+    }
+    if (offset == 0)   /* already at the first byte : nothing earlier to load */
+        return (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) ? BIT_DStream_endOfBuffer : BIT_DStream_completed;
+    {
+        U32 nbBytes = bitD->bitsConsumed >> 3;
+        BIT_DStream_status result = BIT_DStream_unfinished;
+        if (nbBytes > offset)   /* cannot step back past start : clamp to the bytes that remain */
+        {
+            nbBytes = (U32)offset;
+            result = BIT_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+/*! BIT_endOfDStream
+*   @return Tells if DStream has reached its exact end
+*/
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+    if (DStream->ptr != DStream->start) return 0;   /* read position has not reached the first byte yet */
+    return (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8);   /* at the first byte : ended only if the whole container was consumed */
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
+/* ******************************************************************
+   Error codes and messages
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define ERR_STATIC static inline
+#elif defined(_MSC_VER)
+#  define ERR_STATIC static __inline
+#elif defined(__GNUC__)
+#  define ERR_STATIC static __attribute__((unused))
+#else
+#  define ERR_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/******************************************
+*  Error Management
+******************************************/
+#define PREFIX(name) ZSTD_error_##name   /* builds the enumerator identifier for an error name */
+
+#define ERROR(name) (size_t)-PREFIX(name)   /* error return value : the negated enumerator, converted to size_t */
+
+#define ERROR_LIST(ITEM) \
+        ITEM(PREFIX(No_Error)) ITEM(PREFIX(GENERIC)) \
+        ITEM(PREFIX(dstSize_tooSmall)) ITEM(PREFIX(srcSize_wrong)) \
+        ITEM(PREFIX(prefix_unknown)) ITEM(PREFIX(corruption_detected)) \
+        ITEM(PREFIX(tableLog_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooSmall)) \
+        ITEM(PREFIX(maxCode))   /* single list of error names, expanded twice below : once as enumerators, once as strings */
+
+#define ERROR_GENERATE_ENUM(ENUM) ENUM,   /* ITEM used to emit each list entry as an enumerator */
+typedef enum { ERROR_LIST(ERROR_GENERATE_ENUM) } ERR_codes;  /* enum is exposed, to detect & handle specific errors; compare function result to -enum value */
+
+#define ERROR_CONVERTTOSTRING(STRING) #STRING,   /* stringifies its argument and appends a comma */
+#define ERROR_GENERATE_STRING(EXPR) ERROR_CONVERTTOSTRING(EXPR)   /* extra level so PREFIX(...) is expanded before being stringified */
+static const char* ERR_strings[] = { ERROR_LIST(ERROR_GENERATE_STRING) };   /* enumerator names as strings, in the same order as ERR_codes */
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }   /* error values are negated enumerators cast to size_t, so they lie above ERROR(maxCode) */
+
+ERR_STATIC const char* ERR_getErrorName(size_t code)
+{
+    static const char* fallbackName = "Unspecified error code";   /* returned for any value that ERR_isError() does not accept */
+    if (!ERR_isError(code)) return fallbackName;
+    return ERR_strings[-(int)(code)];   /* negating the code recovers the enumerator value, used as the table index */
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_H_MODULE */
+/*
+Constructor and Destructor of type FSE_CTable
+    Note that its size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+
+
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/******************************************
+*  Static allocation
+******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size>>7))
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* You can statically allocate FSE CTable/DTable as a table of unsigned using below macro */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
+
+
+/******************************************
+*  FSE advanced API
+******************************************/
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/* build a fake FSE_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits */
+
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/* build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+
+/******************************************
+*  FSE symbol decompression API
+******************************************/
+typedef struct
+{
+    size_t      state;   /* current decoding state; FSE_decodeSymbol() uses it as an index into `table` */
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+static void     FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+
+/******************************************
+*  FSE unsafe API
+******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/******************************************
+*  Implementation of inline functions
+******************************************/
+
+/* decompression */
+
+typedef struct {
+    U16 tableLog;   /* number of bits FSE_initDState() reads from the stream to form the initial state */
+    U16 fastMode;   /* NOTE(review): not referenced in the visible part of this file - meaning to be confirmed */
+} FSE_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;   /* base of the next state; the bits read from the stream are added to it */
+    unsigned char  symbol;     /* byte returned by FSE_decodeSymbol() for this state */
+    unsigned char  nbBits;     /* how many bits to read from the stream when leaving this state */
+} FSE_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{
+    FSE_DTableHeader header;   /* the header is copied out of the first table cell with memcpy() */
+    memcpy(&header, dt, sizeof(FSE_DTableHeader));
+    DStatePtr->table = dt + 1;   /* decoding cells begin right after the header cell */
+    DStatePtr->state = BIT_readBits(bitD, header.tableLog);   /* initial state : tableLog bits taken from the stream */
+    BIT_reloadDStream(bitD);
+}
+
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    /* Returns the symbol attached to the current state, then moves on to the next state. */
+    const FSE_decode_t* const cells = (const FSE_decode_t*)(DStatePtr->table);
+    const FSE_decode_t cell = cells[DStatePtr->state];
+    const U32 width = cell.nbBits;
+    const size_t extra = BIT_readBits(bitD, width);   /* bits taken from the stream for the transition */
+    DStatePtr->state = cell.newState + extra;         /* next state = base from the cell + the bits just read */
+    return cell.symbol;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    /* Same steps as FSE_decodeSymbol(), but the bits are read with BIT_readBitsFast(). */
+    const FSE_decode_t* const cells = (const FSE_decode_t*)(DStatePtr->table);
+    const FSE_decode_t cell = cells[DStatePtr->state];
+    const U32 width = cell.nbBits;
+    const size_t extra = BIT_readBitsFast(bitD, width);   /* bits taken from the stream for the transition */
+    DStatePtr->state = cell.newState + extra;             /* next state = base from the cell + the bits just read */
+    return cell.symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return (0 == DStatePtr->state) ? 1u : 0u;   /* 1 exactly when the state value is 0 */
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/******************************************
+*  Static allocation macros
+******************************************/
+/* Huff0 buffer bounds.
+ * Every macro parameter is parenthesized so that an expression argument (e.g. `a|b`)
+ * is evaluated as a whole before being shifted or added. */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) ((size) + ((size)>>8) + 8)   /* only true if incompressible pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* static allocation of Huff0's DTable :
+ * each HUF_CREATE_STATIC_DTABLEXn() declares an array named `DTable` whose first cell holds maxTableLog */
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))  /* nb Cells; use unsigned short for X2, unsigned int for X4 */
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        unsigned short DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX6(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog) * 3 / 2] = { maxTableLog }
+
+
+/******************************************
+*  Advanced functions
+******************************************/
+static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbols decoder */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+/*
+    zstd - standard compression library
+    Header File
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stddef.h>   /* size_t */
+
+
+/* *************************************
+*  Version
+***************************************/
+#define ZSTD_VERSION_MAJOR    0    /* for breaking interface changes  */
+#define ZSTD_VERSION_MINOR    2    /* for new (non-breaking) interface capabilities */
+#define ZSTD_VERSION_RELEASE  2    /* for tweaks, bug-fixes, or development */
+#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+
+
+/* *************************************
+*  Advanced functions
+***************************************/
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;   /* incomplete type */
+
+#if defined (__cplusplus)
+}
+#endif
+/*
+    zstd - standard compression library
+    Header File for static linking only
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* The objects defined into this file should be considered experimental.
+ * They are not labelled stable, as their prototype may change in the future.
+ * You can use them for tests, provide feedback, or if you can endure risk of future changes.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Streaming functions
+***************************************/
+
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+
+/*
+  Use above functions alternatively.
+  ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block.
+  Result is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+*  Prefix - version detection
+***************************************/
+#define ZSTD_magicNumber 0xFD2FB523   /* v0.3 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/****************************************************************
+*  Tuning parameters
+****************************************************************/
+/* MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#define FSE_MAX_MEMORY_USAGE 14
+#define FSE_DEFAULT_MEMORY_USAGE 13
+
+/* FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#define FSE_MAX_SYMBOL_VALUE 255
+
+
+/****************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+
+
+/****************************************************************
+*  Byte symbol type
+****************************************************************/
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
+
+/****************************************************************
+*  Compiler specifics
+****************************************************************/
+/* FORCE_INLINE : prefix for functions that should be expanded at their call site.
+ * - MSVC : `static __forceinline`
+ * - C++ or C99 and later : `static inline`, plus the always_inline attribute when __GNUC__ is defined
+ * - anything else : plain `static` */
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/****************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+/****************************************************************
+*  Constants
+*****************************************************************/
+/* Table-size limits, derived from the FSE_*_MEMORY_USAGE tuning parameters.
+ * With FSE_MAX_MEMORY_USAGE 14 and FSE_DEFAULT_MEMORY_USAGE 13 they evaluate to
+ * FSE_MAX_TABLELOG 12 and FSE_DEFAULT_TABLELOG 11. */
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5   /* added to the 4-bit tableLog field read by FSE_readNCount() */
+
+/* largest tableLog FSE_readNCount() accepts from a header; a build configured beyond it is rejected below */
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+
+/****************************************************************
+*  Error Management
+****************************************************************/
+/* Compile-time assertion : when `c` is false the enum initializer becomes 1/0, which the compiler rejects.
+ * Expands to a brace-enclosed block, so it can only appear where a statement is allowed. */
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/****************************************************************
+*  Complex types
+****************************************************************/
+typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
+
+
+/****************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+
+#define FSE_DECODE_TYPE FSE_decode_t
+
+/* FSE_tableStep() :
+ * Stride used by FSE_buildDTable() when spreading symbols : tableSize/2 + tableSize/8 + 3. */
+static U32 FSE_tableStep(U32 tableSize)
+{
+    const U32 half = tableSize >> 1;
+    const U32 eighth = tableSize >> 3;
+    return half + eighth + 3;
+}
+
+/* FSE_buildDTable() :
+ * Builds the decoding table `dt` from a set of normalized counts.
+ * dt : destination; dt[0] receives the FSE_DTableHeader, the decode cells start at dt+1
+ * normalizedCounter : one count per symbol, for symbols 0..maxSymbolValue; -1 marks a low-probability symbol
+ * maxSymbolValue : must be <= FSE_MAX_SYMBOL_VALUE
+ * tableLog : must be <= FSE_MAX_TABLELOG; the table holds 1<<tableLog cells
+ * @return : 0 on success, or an error code (maxSymbolValue_tooLarge, tableLog_tooLarge, GENERIC).
+ * Every value derived from tableLog is computed only after tableLog has been validated,
+ * so an out-of-range tableLog never reaches a shift operator. */
+static size_t FSE_buildDTable
+(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    void* ptr = dt+1;
+    FSE_DTableHeader DTableH;
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*)ptr;
+    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
+    U32 tableSize;
+    U32 tableMask;
+    U32 step;
+    U32 highThreshold;
+    S16 largeLimit;
+    U32 position = 0;
+    U32 noLarge = 1;
+    U32 s;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Derived sizes (tableLog is known to be in range from here on) */
+    tableSize = (U32)1 << tableLog;
+    tableMask = tableSize-1;
+    step = FSE_tableStep(tableSize);
+    highThreshold = tableSize-1;
+    largeLimit = (S16)(tableSize >> 1);   /* == 1 << (tableLog-1) for tableLog >= 1, without shifting by a negative amount */
+
+    /* Init, lay down lowprob symbols */
+    DTableH.tableLog = (U16)tableLog;
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        if (normalizedCounter[s]==-1)
+        {
+            /* low-probability symbols take one cell each, from the top of the table downwards */
+            tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+            symbolNext[s] = 1;
+        }
+        else
+        {
+            /* a count of at least half the table disables fast mode */
+            if (normalizedCounter[s] >= largeLimit) noLarge=0;
+            symbolNext[s] = normalizedCounter[s];
+        }
+    }
+
+    /* Spread symbols */
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        int i;
+        for (i=0; i<normalizedCounter[s]; i++)
+        {
+            tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+            position = (position + step) & tableMask;
+            while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }
+    }
+
+    if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+
+    /* Build Decoding table */
+    {
+        U32 i;
+        for (i=0; i<tableSize; i++)
+        {
+            FSE_FUNCTION_TYPE symbol = (FSE_FUNCTION_TYPE)(tableDecode[i].symbol);
+            U16 nextState = symbolNext[symbol]++;
+            tableDecode[i].nbBits = (BYTE) (tableLog - BIT_highbit32 ((U32)nextState) );
+            tableDecode[i].newState = (U16) ( (nextState << tableDecode[i].nbBits) - tableSize);
+        }
+    }
+
+    DTableH.fastMode = (U16)noLarge;
+    memcpy(dt, &DTableH, sizeof(DTableH));
+    return 0;
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+/******************************************
+*  FSE helper functions
+******************************************/
+/* FSE_isError() :
+ * Tells whether `code` is an error code; forwards to ERR_isError(). */
+static unsigned FSE_isError(size_t code)
+{
+    return ERR_isError(code);
+}
+
+
+/****************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+/* FSE_abs() :
+ * @return : `a` when it is non-negative, its negation otherwise. */
+static short FSE_abs(short a)
+{
+    if (a < 0) return (short)(-a);
+    return a;
+}
+
+/* FSE_readNCount() :
+ * Decodes the normalized symbol counts stored at the start of an FSE-compressed buffer.
+ * normalizedCounter : output, one count per decoded symbol; a stored value of 0 yields a count of -1
+ * maxSVPtr : in, highest symbol value the caller accepts; out, highest symbol value actually decoded
+ * tableLogPtr : out, table log read from the header (a 4-bit field + FSE_MIN_TABLELOG)
+ * headerBuffer, hbSize : input bytes; fewer than 4 is rejected with srcSize_wrong
+ * @return : number of bytes consumed from headerBuffer, or an error code (test with FSE_isError()) */
+static size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;   /* decoding stops once this has been brought down to 1 */
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr))
+    {
+        /* the previous count was 0 : a run of additional zero counts is encoded here */
+        if (previous0)
+        {
+            unsigned n0 = charnum;
+            /* each 16-bit group of all ones stands for 24 more zero counts */
+            while ((bitStream & 0xFFFF) == 0xFFFF)
+            {
+                n0+=24;
+                if (ip < iend-5)
+                {
+                    ip+=2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                }
+                else
+                {
+                    /* too close to the end of the buffer to reload : consume from the bits already held */
+                    bitStream >>= 16;
+                    bitCount+=16;
+                }
+            }
+            /* each 2-bit group equal to 3 stands for 3 more zero counts */
+            while ((bitStream & 3) == 3)
+            {
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            /* final 2-bit group : 0 to 2 more zero counts */
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+            {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;
+        }
+        /* read one count, stored on nbBits-1 or nbBits bits depending on its value */
+        {
+            const short max = (short)((2*threshold-1)-remaining);
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max)
+            {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            }
+            else
+            {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSE_abs(count);   /* a count of -1 consumes one unit, like a count of 1 */
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            /* fewer units left : the following counts are stored on fewer bits */
+            while (remaining < threshold)
+            {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            {
+                if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+                {
+                    ip += bitCount>>3;
+                    bitCount &= 7;
+                }
+                else
+                {
+                    /* pin ip to the last 4 readable bytes and keep the consumed bits in bitCount */
+                    bitCount -= (int)(8 * (iend - 4 - ip));
+                    ip = iend - 4;
+                }
+                bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+            }
+        }
+    }
+    /* the loop may also stop because charnum went past *maxSVPtr; the counts are then incomplete */
+    if (remaining != 1) return ERROR(GENERIC);
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;   /* round the consumed bits up to whole bytes */
+    if ((size_t)(ip-istart) > hbSize) return ERROR(srcSize_wrong);
+    return ip-istart;
+}
+
+
+/*********************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+/* FSE_buildDTable_rle() :
+ * Fills `dt` with a header (tableLog 0, fastMode 0) followed by a single decode cell
+ * that yields `symbolValue`, reads no bits and leads back to state 0.
+ * @return : always 0 */
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+    void* base = dt;
+    FSE_DTableHeader* const header = (FSE_DTableHeader*)base;
+    FSE_decode_t* const onlyCell = ((FSE_decode_t*)base) + 1;
+
+    /* header */
+    header->fastMode = 0;
+    header->tableLog = 0;
+
+    /* single cell */
+    onlyCell->nbBits = 0;
+    onlyCell->symbol = symbolValue;
+    onlyCell->newState = 0;
+
+    return 0;
+}
+
+
+/* FSE_buildDTable_raw() :
+ * Fills `dt` with an identity table of 1<<nbBits cells : cell `i` yields symbol (BYTE)i,
+ * reads nbBits bits and has a base state of 0. The header is marked fastMode.
+ * @return : 0, or ERROR(GENERIC) when nbBits is 0 */
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+    void* base = dt;
+    FSE_DTableHeader* const header = (FSE_DTableHeader*)base;
+    FSE_decode_t* const cells = ((FSE_decode_t*)base) + 1;
+    const unsigned cellCount = 1 << nbBits;
+    const unsigned lastSymbol = cellCount - 1;
+    unsigned idx;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Header */
+    header->tableLog = (U16)nbBits;
+    header->fastMode = 1;
+
+    /* Cells */
+    for (idx = 0; idx <= lastSymbol; idx++)
+    {
+        FSE_decode_t* const cell = cells + idx;
+        cell->newState = 0;
+        cell->symbol = (BYTE)idx;
+        cell->nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+/* FSE_decompress_usingDTable_generic() :
+ * Decodes cSrc into dst with the decoding table dt, alternating between two FSE states.
+ * fast : selects FSE_decodeSymbolFast() (non-zero) or FSE_decodeSymbol() (zero) for every symbol;
+ *        FSE_decompress_usingDTable() passes a literal 0 or 1.
+ * @return : number of bytes written into dst, or an error code :
+ *           the one reported by BIT_initDStream(), dstSize_tooSmall when dst is full while the
+ *           end-of-stream condition is not met, corruption_detected otherwise. */
+FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;   /* the 4-symbols loop only runs while op < olimit. NOTE(review): computed even when maxDstSize < 3 - confirm callers */
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+    size_t errorCode;
+
+    /* Init */
+    errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+    if (FSE_isError(errorCode)) return errorCode;
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+/* decodes one symbol with the decoder selected by `fast`.
+ * NOTE(review): no #undef follows in this function, so the macro stays defined afterwards */
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) && (op<olimit) ; op+=4)
+    {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        /* the conditions below compare compile-time constants : an intermediate reload is only
+         * compiled in when the bit container may not hold enough bits for the symbols that follow */
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    /* one symbol at a time, alternating state1 / state2; in fast mode the end-of-state test is skipped */
+    while (1)
+    {
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state1);
+
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state2);
+    }
+
+    /* end ? */
+    /* success requires the bitstream to be fully consumed and both states to be back at zero */
+    if (BIT_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2))
+        return op-ostart;
+
+    if (op==omax) return ERROR(dstSize_tooSmall);   /* dst buffer is full, but cSrc unfinished */
+
+    return ERROR(corruption_detected);
+}
+
+
+/* FSE_decompress_usingDTable() :
+ * Reads the fastMode flag from the header stored in dt[0] and runs the generic decoder
+ * with a literal 0 or 1 as its `fast` argument.
+ * @return : whatever FSE_decompress_usingDTable_generic() returns */
+static size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    FSE_DTableHeader header;
+    memcpy(&header, dt, sizeof(header));
+
+    /* one call per mode, each with a constant last argument */
+    if (!header.fastMode)
+        return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+}
+
+
+/* FSE_decompress() :
+ * Decodes a complete FSE block : reads the normalized counts with FSE_readNCount(),
+ * builds a decoding table on the stack, then decodes the bytes that follow the header.
+ * @return : the result of FSE_decompress_usingDTable(), or an error code from an earlier step
+ *           (srcSize_wrong when cSrcSize < 2 or when the header takes up the whole input) */
+static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* const srcStart = (const BYTE*)cSrc;
+    short normCounts[FSE_MAX_SYMBOL_VALUE+1];
+    DTable_max_t table;   /* filled by FSE_buildDTable() before it is read */
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    unsigned tableLog;
+    size_t headerSize;
+    size_t buildResult;
+
+    if (cSrcSize<2) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* header : normalized counts */
+    headerSize = FSE_readNCount (normCounts, &maxSymbolValue, &tableLog, srcStart, cSrcSize);
+    if (FSE_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* decoding table */
+    buildResult = FSE_buildDTable (table, normCounts, maxSymbolValue, tableLog);
+    if (FSE_isError(buildResult)) return buildResult;
+
+    /* payload; the result is returned as is, error code included */
+    return FSE_decompress_usingDTable (dst, maxDstSize, srcStart + headerSize, cSrcSize - headerSize, table);
+}
+
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+Huff0 source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/****************************************************************
+*  Compiler specifics
+****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+/* inline is defined */
+#elif defined(_MSC_VER)
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  define inline __inline
+#else
+#  define inline /* disable inline */
+#endif
+
+
+/****************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+/****************************************************************
+*  Error Management
+****************************************************************/
+/* Compile-time assertion : when `c` is false the enum initializer becomes 1/0, which the compiler rejects.
+ * Expands to a brace-enclosed block, so it can only appear where a statement is allowed. */
+#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/******************************************
+*  Helper functions
+******************************************/
+/* HUF_isError() :
+ * Tells whether `code` is an error code; forwards to ERR_isError(). */
+static unsigned HUF_isError(size_t code)
+{
+    return ERR_isError(code);
+}
+
+#define HUF_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#define HUF_MAX_TABLELOG  12           /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
+#define HUF_DEFAULT_TABLELOG  HUF_MAX_TABLELOG   /* tableLog by default, when not specified */
+#define HUF_MAX_SYMBOL_VALUE 255
+#if (HUF_MAX_TABLELOG > HUF_ABSOLUTEMAX_TABLELOG)
+#  error "HUF_MAX_TABLELOG is too large !"
+#endif
+
+
+
+/*********************************************************
+*  Huff0 : Huffman block decompression
+*********************************************************/
+/* Decode cell of the single-symbol table; HUF_readDTableX2() asserts it is exactly the size of a U16. */
+typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding */
+
+/* Decode cell of the double-symbols table. NOTE(review): presumably `length` is the number of
+ * symbols held in `sequence` - confirm against the X4 decoder. */
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4;  /* double-symbols decoding */
+
+/* A symbol paired with its weight. */
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+
+/*! HUF_readStats
+    Read compact Huffman tree, saved by HUF_writeCTable
+    @huffWeight : destination buffer, receives one weight per symbol (the last one is deduced, not read)
+    @hwSize : capacity of `huffWeight`, in bytes
+    @rankStats : receives the number of symbols per weight; (HUF_ABSOLUTEMAX_TABLELOG + 1) entries are cleared and filled
+    @nbSymbolsPtr : receives the number of symbols, deduced last one included
+    @tableLogPtr : receives the table log deduced from the total of the weights
+    @return : size read from `src`, or an error code (test with HUF_isError())
+
+    Three header layouts are recognized from the first byte :
+    - >= 242 : RLE, every weight is 1 and the count comes from a fixed table
+    - 128..241 : weights stored raw, two 4-bit weights per byte
+    - < 128 : weights compressed with FSE, the first byte being the compressed size
+    In every layout the number of decoded weights must stay below hwSize,
+    because the deduced last weight is stored right after them.
+*/
+static size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                            U32* nbSymbolsPtr, U32* tableLogPtr,
+                            const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    U32 tableLog;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+    U32 n;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    /* pre-clearing huffWeight is not necessary, even though some analyzers complain */
+
+    if (iSize >= 128)  /* special header */
+    {
+        if (iSize >= (242))   /* RLE */
+        {
+            static const int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };
+            oSize = l[iSize-242];   /* iSize is a byte value, so the index is 0..13 */
+            if (oSize >= hwSize) return ERROR(corruption_detected);   /* huffWeight[oSize] is written below */
+            memset(huffWeight, 1, hwSize);
+            iSize = 0;
+        }
+        else   /* Incompressible */
+        {
+            oSize = iSize - 127;
+            iSize = ((oSize+1)/2);
+            if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+            if (oSize >= hwSize) return ERROR(corruption_detected);
+            ip += 1;
+            for (n=0; n<oSize; n+=2)
+            {
+                /* high nibble first, then low nibble */
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+            }
+        }
+    }
+    else  /* header compressed with FSE (normal case) */
+    {
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSE_decompress(huffWeight, hwSize-1, ip+1, iSize);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUF_ABSOLUTEMAX_TABLELOG + 1) * sizeof(U32));
+    weightTotal = 0;
+    for (n=0; n<oSize; n++)
+    {
+        if (huffWeight[n] >= HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+        rankStats[huffWeight[n]]++;
+        weightTotal += (1 << huffWeight[n]) >> 1;   /* weight w counts for 2^(w-1); weight 0 counts for nothing */
+    }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    tableLog = BIT_highbit32(weightTotal) + 1;
+    if (tableLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+    {
+        U32 total = 1 << tableLog;
+        U32 rest = total - weightTotal;
+        U32 verif = 1 << BIT_highbit32(rest);
+        U32 lastWeight = BIT_highbit32(rest) + 1;
+        if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+        huffWeight[oSize] = (BYTE)lastWeight;
+        rankStats[lastWeight]++;
+    }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    *tableLogPtr = tableLog;
+    return iSize+1;
+}
+
+
+/**************************/
+/* single-symbol decoding */
+/**************************/
+
+static size_t HUF_readDTableX2 (U16* DTable, const void* src, size_t srcSize)
+{   /* Builds the single-symbol decoding table from the weight header at `src`. DTable[0] : max accepted tableLog on entry, actual tableLog on exit. */
+    BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    size_t iSize;   /* header size (or error code), as returned by HUF_readStats() */
+    /* note : `src` is deliberately not dereferenced here : srcSize may be 0, and only HUF_readStats() validates it */
+    U32 nbSymbols = 0;
+    U32 n;
+    U32 nextRankStart;
+    void* ptr = DTable+1;   /* cells start right after the U16 holding tableLog */
+    HUF_DEltX2* const dt = (HUF_DEltX2*)(ptr);
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U16));   /* if compilation fails here, assertion is false */
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(huffWeight, HUF_MAX_SYMBOL_VALUE + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > DTable[0]) return ERROR(tableLog_tooLarge);   /* DTable is too small */
+    DTable[0] = (U16)tableLog;   /* maybe should separate sizeof DTable, as allocated, from used size of DTable, in case of DTable re-use */
+
+    /* Prepare ranks : turn the per-weight symbol counts into the first cell index of each weight */
+    nextRankStart = 0;
+    for (n=1; n<=tableLog; n++)
+    {
+        U32 current = nextRankStart;
+        nextRankStart += (rankVal[n] << (n-1));   /* each symbol of weight n owns 2^(n-1) cells */
+        rankVal[n] = current;
+    }
+
+    /* fill DTable */
+    for (n=0; n<nbSymbols; n++)
+    {
+        const U32 w = huffWeight[n];
+        const U32 length = (1 << w) >> 1;   /* nb of cells for this symbol; 0 when w == 0 */
+        U32 i;
+        HUF_DEltX2 D;
+        D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+        for (i = rankVal[w]; i < rankVal[w] + length; i++)
+            dt[i] = D;
+        rankVal[w] += length;
+    }
+
+    return iSize;   /* nb of bytes read from src */
+}
+
+static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
+{   /* Decode one symbol : peek dtLog bits, use them as index into dt, then consume only the nbBits recorded in the matched cell. */
+    const HUF_DEltX2* const cell = dt + BIT_lookBitsFast(Dstream, dtLog);   /* note : dtLog >= 1 */
+    const BYTE decoded = cell->byte;
+    BIT_skipBits(Dstream, cell->nbBits);
+    return decoded;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)
+/* _0 (above) always decodes one symbol into *ptr. _1 (below) does so only when MEM_64bits() is true or HUF_MAX_TABLELOG <= 12. All three read `dt` and `dtLog` from the calling scope. */
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+/* _2 (below) decodes only when MEM_64bits() is true. _1 and _2 expand to an unbraced `if` : never follow them with an `else`. */
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+static inline size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
+{   /* Decodes single-symbol codes from bitDPtr into [p, pEnd), one byte per symbol. @return : pEnd - (initial p) */
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4))
+    {
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);   /* _2 and _1 are conditional decodes : between 1 and 4 symbols are produced per iteration */
+        HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd))
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);   /* one symbol per reload */
+
+    /* no more data to retrieve from bitstream, hence no need to reload */
+    while (p < pEnd)
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    return pEnd-pStart;   /* the output range is always filled up to pEnd; callers check BIT_endOfDStream() afterwards */
+}
+
+
+static size_t HUF_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{   /* Decodes 4 Huffman streams with a single-symbol table. cSrc : 3 little-endian 16-bit lengths (streams 1-3), then the 4 streams back to back. @return : dstSize, or an error code */
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+    /* dst is cut in 4 segments of (dstSize+3)/4 bytes (the last one takes what remains); stream k fills segment k */
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+
+        const void* ptr = DTable;
+        const HUF_DEltX2* const dt = ((const HUF_DEltX2*)ptr) +1;   /* cells start after the U16 holding tableLog */
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        if (segmentSize * 3 > dstSize) return ERROR(corruption_detected);   /* opStart4 would lie beyond oend : the tail decoders below would write past dst */
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )
+        {
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check : all 4 streams must be exactly consumed */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{   /* Single-symbol path : reads the table header at the start of cSrc, then decodes the 4 streams located after it. */
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_MAX_TABLELOG);
+    const size_t hSize = HUF_readDTableX2 (DTable, cSrc, cSrcSize);   /* header size, or an error code */
+
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);   /* the header must leave room for the streams */
+
+    /* the 4 streams start right after the header */
+    return HUF_decompress4X2_usingDTable (dst, dstSize,
+                                          (const BYTE*)cSrc + hSize,
+                                          cSrcSize - hSize,
+                                          DTable);
+}
+
+
+/***************************/
+/* double-symbols decoding */
+/***************************/
+
+static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{   /* Fills a sub-table of DTable for a first symbol `baseSeq` that already used `consumed` bits : each cell gets either baseSeq alone (length 1) or baseSeq + a second symbol (length 2). */
+    HUF_DEltX4 DElt;
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    U32 s;
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));   /* local copy : entries are advanced while filling */
+
+    /* fill skipped values */
+    if (minWeight>1)
+    {
+        U32 i, skipSize = rankVal[minWeight];   /* cells located before the first usable second symbol */
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;   /* first symbol only */
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++)   /* note : sortedSymbols already skipped */
+    {
+        const U32 symbol = sortedSymbols[s].symbol;
+        const U32 weight = sortedSymbols[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 length = 1 << (sizeLog-nbBits);   /* nb of cells covered by this second symbol */
+        const U32 start = rankVal[weight];
+        U32 i = start;
+        const U32 end = start + length;
+
+        MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));   /* low byte : first symbol, high byte : second symbol */
+        DElt.nbBits = (BYTE)(nbBits + consumed);   /* bits used by both symbols together */
+        DElt.length = 2;
+        do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+        rankVal[weight] += length;
+    }
+}
+
+typedef U32 rankVal_t[HUF_ABSOLUTEMAX_TABLELOG][HUF_ABSOLUTEMAX_TABLELOG + 1];   /* indexed as [nb of bits already consumed][weight]; rows are filled by HUF_readDTableX4 */
+/* HUF_fillDTableX4 : fills the double-symbol table. Codes short enough to leave room for a second symbol get a sub-table (HUF_fillDTableX4Level2); the others get plain 1-symbol cells. */
+static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;   /* nbBits of the symbols having the largest weight */
+    U32 s;
+
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));   /* copies sizeof(rankVal) bytes, i.e. row 0 of rankValOrigin only */
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++)
+    {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);   /* nb of cells owned by this symbol */
+
+        if (targetLog-nbBits >= minBits)   /* enough room for a second symbol */
+        {
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];   /* second-symbol candidates start at this position of the sorted list */
+            HUF_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        }
+        else
+        {
+            U32 i;
+            const U32 end = start + length;
+            HUF_DEltX4 DElt;
+
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits   = (BYTE)(nbBits);
+            DElt.length   = 1;   /* single symbol in this cell */
+            for (i = start; i < end; i++)
+                DTable[i] = DElt;
+        }
+        rankVal[weight] += length;
+    }
+}
+
+static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
+{   /* Builds the double-symbol decoding table from the weight header at `src`. DTable[0] must hold the table log the table was sized for. @return : header size, or an error code */
+    BYTE weightList[HUF_MAX_SYMBOL_VALUE + 1];
+    sortedSymbol_t sortedSymbol[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankStats[HUF_ABSOLUTEMAX_TABLELOG + 1] = { 0 };
+    U32 rankStart0[HUF_ABSOLUTEMAX_TABLELOG + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;   /* same array shifted by one : rankStart0[w] is rankStart[w-1] */
+    rankVal_t rankVal;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    const U32 memLog = DTable[0];
+    size_t iSize;   /* header size (or error code), as returned by HUF_readStats() */
+    /* note : `src` is deliberately not dereferenced here : srcSize may be 0, and only HUF_readStats() validates it */
+    void* ptr = DTable;
+    HUF_DEltX4* const dt = ((HUF_DEltX4*)ptr) + 1;   /* cells start after the U32 holding memLog */
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(U32));   /* if compilation fails here, assertion is false */
+    if (memLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(weightList, HUF_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        { if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
+
+    /* Get start index of each weight */
+    {
+        U32 w, nextRankStart = 0;
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;   /* nb of symbols with a non-zero weight */
+    }
+
+    /* sort symbols by weight */
+    {
+        U32 s;
+        for (s=0; s<nbSymbols; s++)
+        {
+            U32 w = weightList[s];
+            U32 r = rankStart[w]++;   /* after this loop, rankStart[w] is the end of weight w, i.e. the start of weight w+1 */
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {
+        const U32 minBits = tableLog+1 - maxW;
+        U32 nextRankVal = 0;
+        U32 w, consumed;
+        const int rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
+        U32* rankVal0 = rankVal[0];
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankVal;
+            nextRankVal += rankStats[w] << (w+rescale);
+            rankVal0[w] = current;
+        }
+        for (consumed = minBits; consumed <= memLog - minBits; consumed++)
+        {
+            U32* rankValPtr = rankVal[consumed];
+            for (w = 1; w <= maxW; w++)
+            {
+                rankValPtr[w] = rankVal0[w] >> consumed;   /* same offsets, scaled down for a sub-table reached after `consumed` bits */
+            }
+        }
+    }
+
+    HUF_fillDTableX4(dt, memLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    return iSize;   /* nb of bytes read from src */
+}
+
+
+static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{   /* Decode one table cell : always writes 2 bytes at op, and returns how many of them are valid symbols (the cell's `length`). */
+    const HUF_DEltX4* const cell = dt + BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, cell, 2);   /* first 2 bytes of the cell */
+    BIT_skipBits(DStream, cell->nbBits);
+    return cell->length;
+}
+
+static U32 HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{   /* Decode the very last symbol : writes a single byte at op, even when the matched cell holds 2 symbols. Always returns 1. */
+    const HUF_DEltX4* const cell = dt + BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    const size_t containerBits = sizeof(DStream->bitContainer)*8;
+
+    memcpy(op, cell, 1);
+    if (cell->length == 1) BIT_skipBits(DStream, cell->nbBits);
+    else if (DStream->bitsConsumed < containerBits)
+    {
+        BIT_skipBits(DStream, cell->nbBits);
+        if (DStream->bitsConsumed > containerBits)
+            DStream->bitsConsumed = containerBits;   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+    }
+
+    return 1;
+}
+
+
+#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+/* _0 (above) always decodes one cell and advances ptr by the value returned. _1 (below) does so only when MEM_64bits() is true or HUF_MAX_TABLELOG <= 12. All three read `dt` and `dtLog` from the calling scope. */
+#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+/* _2 (below) decodes only when MEM_64bits() is true. _1 and _2 expand to an unbraced `if` : never follow them with an `else`. */
+#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+static inline size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const HUF_DEltX4* const dt, const U32 dtLog)
+{   /* Decodes double-symbol cells from bitDPtr into [p, pEnd). @return : nb of bytes by which p advanced */
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd-7))
+    {
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);   /* each decode writes 2 bytes at p, whatever the amount p then advances by */
+        HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-2))
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);   /* needs 2 free bytes, since 2 bytes are written */
+
+    while (p <= pEnd-2)
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)
+        p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);   /* exactly 1 byte left : write a single byte */
+
+    return p-pStart;
+}
+
+
+
+static size_t HUF_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{   /* Decodes 4 Huffman streams with a double-symbol table. cSrc : 3 little-endian 16-bit lengths (streams 1-3), then the 4 streams back to back. @return : dstSize, or an error code */
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+    /* dst is cut in 4 segments of (dstSize+3)/4 bytes (the last one takes what remains); stream k fills segment k */
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+
+        const void* ptr = DTable;
+        const HUF_DEltX4* const dt = ((const HUF_DEltX4*)ptr) +1;   /* cells start after the U32 holding the table log */
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        if (segmentSize * 3 > dstSize) return ERROR(corruption_detected);   /* opStart4 would lie beyond oend : the tail decoders below would write past dst */
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )
+        {
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check : all 4 streams must be exactly consumed */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{   /* Double-symbol path : reads the table header at the start of cSrc, then decodes the 4 streams located after it. */
+    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_MAX_TABLELOG);
+    const BYTE* const payload = (const BYTE*) cSrc;
+    const size_t headerSize = HUF_readDTableX4 (DTable, cSrc, cSrcSize);   /* header size, or an error code */
+
+    if (HUF_isError(headerSize))
+        return headerSize;
+    if (headerSize >= cSrcSize)   /* the header must leave room for the streams */
+        return ERROR(srcSize_wrong);
+
+    return HUF_decompress4X4_usingDTable (dst, dstSize, payload + headerSize, cSrcSize - headerSize, DTable);
+}
+
+
+/**********************************/
+/* Generic decompression selector */
+/**********************************/
+
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;   /* cost model, combined by HUF_decompress as : tableTime + decode256Time * (dstSize >> 8) */
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* single, double, quad   (row index Q = cSrcSize * 16 / dstSize, as computed in HUF_decompress) */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+/* HUF_decompress : settles the raw and RLE shortcuts itself, otherwise runs whichever of the two decoders has the lower estimated cost. */
+static size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decoders[3] = { HUF_decompress4X2, HUF_decompress4X4, NULL };
+    const U32 nb256 = (U32)(dstSize >> 8);   /* output size, counted in 256-byte units */
+    const algo_time_t* costs;                /* algoTime row matching the compression ratio */
+    U32 costSingle;
+    U32 costDouble;
+
+    /* validation checks, and cases needing no entropy decoding */
+    if (dstSize == 0)
+        return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize)   /* invalid */
+        return ERROR(corruption_detected);
+    if (cSrcSize == dstSize)   /* not compressed */
+    {
+        memcpy(dst, cSrc, dstSize);
+        return dstSize;
+    }
+    if (cSrcSize == 1)   /* RLE */
+    {
+        memset(dst, *(const BYTE*)cSrc, dstSize);
+        return dstSize;
+    }
+
+    /* decoder timing evaluation : table-build cost + decode cost per 256 bytes */
+    costs = algoTime[(U32)(cSrcSize * 16 / dstSize)];   /* index < 16 since dstSize > cSrcSize */
+    costSingle = costs[0].tableTime + (costs[0].decode256Time * nb256);
+    costDouble = costs[1].tableTime + (costs[1].decode256Time * nb256);
+    costDouble += costDouble >> 4;   /* advantage to algorithms using less memory, for cache eviction */
+    return decoders[(costDouble < costSingle) ? 1 : 0](dst, dstSize, cSrc, cSrcSize);
+}
+/*
+    zstd - standard compression library
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+*  MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*/
+#define ZSTD_MEMORY_USAGE 17   /* 17 -> 128KB; not overridable from the build (no #ifndef guard); HASH_LOG is derived from it */
+
+/*!
+ * HEAPMODE :
+ * Select how default compression functions will allocate memory for their hash table,
+ * in memory stack (0, fastest), or in memory heap (1, requires malloc())
+ * Note that compression context is fairly large, as a consequence heap memory is recommended.
+ */
+#ifndef ZSTD_HEAPMODE
+#  define ZSTD_HEAPMODE 1   /* default, used only when the build did not define ZSTD_HEAPMODE itself */
+#endif /* ZSTD_HEAPMODE */
+
+/*!
+*  LEGACY_SUPPORT :
+*  decompressor can decode older formats (starting from Zstd 0.1+)
+*/
+#ifndef ZSTD_LEGACY_SUPPORT
+#  define ZSTD_LEGACY_SUPPORT 1   /* default, used only when the build did not define it; presumably 0 disables legacy decoding - the code testing it is not in this part of the file */
+#endif
+
+
+/* *******************************************************
+*  Includes
+*********************************************************/
+#include <stdlib.h>      /* calloc */
+#include <string.h>      /* memcpy, memmove */
+#include <stdio.h>       /* debug : printf */
+
+
+/* *******************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef __AVX2__
+#  include <immintrin.h>   /* AVX2 intrinsics */
+#endif
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#else
+#  define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#endif
+
+
+/* *******************************************************
+*  Constants
+*********************************************************/
+#define HASH_LOG (ZSTD_MEMORY_USAGE - 2)   /* presumably "-2" because each table entry takes 4 bytes - TODO confirm against the table declaration */
+#define HASH_TABLESIZE (1 << HASH_LOG)   /* nb of entries */
+#define HASH_MASK (HASH_TABLESIZE - 1)
+
+#define KNUTH 2654435761   /* presumably Knuth's multiplicative hashing constant; not referenced in this part of the file */
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+/* KB / MB / GB are postfix multipliers : "128 KB" expands to "128 *(1 <<10)" */
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BLOCKSIZE (128 KB)                 /* define, for static allocation */
+#define MIN_SEQUENCES_SIZE (2 /*seqNb*/ + 2 /*dumps*/ + 3 /*seqTables*/ + 1 /*bitStream*/)   /* = 8 */
+#define MIN_CBLOCK_SIZE (3 /*litCSize*/ + MIN_SEQUENCES_SIZE)   /* = 11 */
+#define IS_RAW BIT0
+#define IS_RLE BIT1
+
+#define WORKPLACESIZE (BLOCKSIZE*3)
+#define MINMATCH 4
+#define MLbits   7
+#define LLbits   6
+#define Offbits  5
+#define MaxML  ((1<<MLbits )-1)   /* = 127 */
+#define MaxLL  ((1<<LLbits )-1)   /* = 63 */
+#define MaxOff   31
+#define LitFSELog  11
+#define MLFSELog   10
+#define LLFSELog   10
+#define OffFSELog   9
+#define MAX(a,b) ((a)<(b)?(b):(a))   /* beware : each argument may be evaluated twice */
+#define MaxSeq MAX(MaxLL, MaxML)   /* = 127 */
+
+#define LITERAL_NOENTROPY 63
+#define COMMAND_NOENTROPY 7   /* to remove */
+
+static const size_t ZSTD_blockHeaderSize = 3;   /* same value as the minimum srcSize accepted by ZSTD_getcBlockSize() */
+static const size_t ZSTD_frameHeaderSize = 4;
+
+
+/* *******************************************************
+*  Memory operations
+**********************************************************/
+static void   ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }   /* copies exactly 4 bytes; memcpy() semantics, so the areas must not overlap */
+
+static void   ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }   /* copies exactly 8 bytes; memcpy() semantics, so the areas must not overlap */
+
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }   /* copies 8 bytes, then advances BOTH arguments by 8 : d and s must be modifiable pointers */
+
+/*! ZSTD_wildcopy : memcpy() variant working by whole 8-byte steps; always copies at least 8 bytes, so it can write up to 7-8 bytes past dst+length */
+static void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* from = (const BYTE*)src;
+    BYTE* to = (BYTE*)dst;
+    BYTE* const stop = to + length;
+    do { ZSTD_copy8(to, from); to += 8; from += 8; } while (to < stop);
+}
+
+
+/* **************************************
+*  Local structures
+****************************************/
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;   /* numeric value = the 2 top bits of a block header's first byte, as read by ZSTD_getcBlockSize() */
+
+typedef struct
+{
+    blockType_t blockType;
+    U32 origSize;   /* as set by ZSTD_getcBlockSize() : the header's size field for bt_rle blocks, 0 for every other type */
+} blockProperties_t;
+/* seqStore_t : pairs of (xxxStart, xxx) pointers - presumably start of an area + current position inside it; not used in this part of the file, TODO confirm */
+typedef struct {
+    void* buffer;
+    U32*  offsetStart;
+    U32*  offset;
+    BYTE* offCodeStart;
+    BYTE* offCode;
+    BYTE* litStart;
+    BYTE* lit;
+    BYTE* litLengthStart;
+    BYTE* litLength;
+    BYTE* matchLengthStart;
+    BYTE* matchLength;
+    BYTE* dumpsStart;
+    BYTE* dumps;
+} seqStore_t;
+
+
+/* *************************************
+*  Error Management
+***************************************/
+/*! ZSTD_isError
+*   tells if a return value is an error code : forwards `code` to ERR_isError() and returns its result unchanged */
+static unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
+
+
+
+/* *************************************************************
+*   Decompression section
+***************************************************************/
+struct ZSTD_DCtx_s
+{
+    U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];     /* literal length decoding table, rebuilt for each block by ZSTD_decodeSeqHeaders() */
+    U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];   /* offset code decoding table */
+    U32 MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];     /* match length decoding table */
+    void* previousDstEnd;   /* streaming : end of the output written by the previous block */
+    void* base;             /* lowest address a match is allowed to reference */
+    size_t expected;        /* streaming : exact srcSize the next ZSTD_decompressContinue() call must receive */
+    blockType_t bType;      /* streaming : type of the block whose content comes next */
+    U32 phase;              /* streaming : 0 = frame header, 1 = block header, 2 = block content */
+    const BYTE* litPtr;     /* first literal of the current block (inside litBuffer or inside the source) */
+    size_t litSize;         /* nb of literals of the current block */
+    BYTE litBuffer[BLOCKSIZE + 8 /* margin for wildcopy */];
+};   /* typedef'd to ZSTD_DCtx within "zstd_static.h" */
+
+
+/* ZSTD_getcBlockSize() :
+ *  Parses a 3-byte block header : the 2 top bits of the first byte give the block type,
+ *  the 3 low bits of the first byte and the next 2 bytes form a 19-bit big-endian size field.
+ *  bpPtr receives the block type, plus the size field as origSize for bt_rle blocks (0 otherwise).
+ *  @return : nb of bytes of block content which follow the header
+ *            (0 for bt_end, 1 for bt_rle, the size field otherwise), or an error code */
+static size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const hdr = (const BYTE*)src;
+
+    if (srcSize < 3) return ERROR(srcSize_wrong);
+
+    {
+        const blockType_t type = (blockType_t)(hdr[0] >> 6);
+        const U32 sizeField = hdr[2] + (hdr[1]<<8) + ((hdr[0] & 7)<<16);
+
+        bpPtr->blockType = type;
+        switch(type)
+        {
+        case bt_end :
+            bpPtr->origSize = 0;
+            return 0;
+        case bt_rle :
+            bpPtr->origSize = sizeField;   /* for RLE the size field is the regenerated size */
+            return 1;                      /* ... and the content is the single byte to repeat */
+        default :
+            bpPtr->origSize = 0;
+            return sizeField;
+        }
+    }
+}
+
+/* ZSTD_copyUncompressedBlock() :
+ *  Copies the content of a raw (stored) block into the destination buffer.
+ *  @return : nb of bytes written into dst (== srcSize),
+ *            or dstSize_tooSmall when the block does not fit into maxDstSize */
+static size_t ZSTD_copyUncompressedBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall);
+    /* an empty block can come with a NULL dst (empty output buffer) :
+     * calling memcpy() with a NULL pointer is undefined behavior, even for a size of 0 */
+    if (srcSize > 0) memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+/** ZSTD_decompressLiterals
+    Regenerates Huffman-compressed literals into dst.
+    The sub-block starts with a 5-byte header : the regenerated size is a 19-bit field
+    starting at bit 2, the compressed size a 19-bit field starting at bit 21.
+    *maxDstSizePtr is the capacity of dst on entry and receives the nb of literals regenerated on success.
+    @return : nb of bytes read from src, or an error code*/
+static size_t ZSTD_decompressLiterals(void* dst, size_t* maxDstSizePtr,
+                                const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+
+    const size_t litSize = (MEM_readLE32(src) & 0x1FFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+    const size_t litCSize = (MEM_readLE32(ip+2) & 0xFFFFFF) >> 5;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+
+    if (litSize > *maxDstSizePtr) return ERROR(corruption_detected);   /* would not fit into dst */
+    if (litCSize + 5 > srcSize) return ERROR(corruption_detected);   /* header + compressed literals must lie within the block */
+
+    if (HUF_isError(HUF_decompress(dst, litSize, ip+5, litCSize))) return ERROR(corruption_detected);
+
+    *maxDstSizePtr = litSize;
+    return litCSize + 5;
+}
+
+
+/** ZSTD_decodeLiteralsBlock
+    Decodes the literals sub-block found at the start of a compressed block,
+    and records where the literals are through dctx->litPtr and dctx->litSize.
+    The 2 low bits of the first byte select the encoding :
+    - IS_RAW : stored literals, referenced in place when enough input follows them
+               (wildcopy may read up to 8 bytes too many), copied into dctx->litBuffer otherwise
+    - IS_RLE : one byte value, repeated litSize times into dctx->litBuffer
+    - any other value : Huffman-compressed literals, regenerated into dctx->litBuffer
+    Literals stored in dctx->litBuffer are followed by 8 initialized bytes of margin.
+    @return : nb of bytes read from src (< srcSize ), or an error code */
+static size_t ZSTD_decodeLiteralsBlock(void* ctx,
+                          const void* src, size_t srcSize)
+{
+    ZSTD_DCtx* dctx = (ZSTD_DCtx*)ctx;
+    const BYTE* const istart = (const BYTE* const)src;
+
+    /* any compressed block with literals segment must be at least this size */
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    switch(*istart & 3)
+    {
+    default:
+    case 0:
+        {
+            size_t litSize = BLOCKSIZE;
+            const size_t readSize = ZSTD_decompressLiterals(dctx->litBuffer, &litSize, src, srcSize);
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, 8);
+            return readSize;   /* works if it's an error too */
+        }
+    case IS_RAW:
+        {
+            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+            if (litSize > srcSize-11)   /* risk of reading too far with wildcopy */
+            {
+                if (litSize > BLOCKSIZE) return ERROR(corruption_detected);   /* the size field is 22 bits wide : must not overflow litBuffer */
+                if (litSize > srcSize-3) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart+3, litSize);   /* literals start right after the 3-byte header, same as the in-place reference below */
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, 8);
+                return litSize+3;
+            }
+            /* direct reference into compressed stream */
+            dctx->litPtr = istart+3;
+            dctx->litSize = litSize;
+            return litSize+3;
+        }
+    case IS_RLE:
+        {
+            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+            if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
+            memset(dctx->litBuffer, istart[3], litSize + 8);   /* + 8 : fills the wildcopy margin as well */
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            return 4;
+        }
+    }
+}
+
+/* ZSTD_decodeSeqHeaders() :
+ *  Parses the header of the sequences section of a compressed block and builds the 3 FSE decoding tables.
+ *  Layout read from src : nb of sequences (16-bit little endian), one flags byte
+ *  (2 bits of table type for each of literal lengths, offsets and match lengths, then the dumps-length format),
+ *  the length of the "dumps" area, the dumps themselves, and finally one table description per symbol type.
+ *  *nbSeq, *dumpsPtr and *dumpsLengthPtr receive the parsed values ; dumps are referenced in place, not copied.
+ *  @return : nb of bytes consumed from src, or an error code */
+static size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
+                         FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb,
+                         const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* ip = istart;
+    const BYTE* const iend = istart + srcSize;
+    U32 LLtype, Offtype, MLtype;
+    U32 LLlog, Offlog, MLlog;
+    size_t dumpsLength;
+
+    /* check */
+    if (srcSize < 5) return ERROR(srcSize_wrong);   /* 2 bytes of nbSeq + up to 3 bytes of flags and dumps length are read below */
+
+    /* SeqHead */
+    *nbSeq = MEM_readLE16(ip); ip+=2;
+    LLtype  = *ip >> 6;   /* table types are compared against the blockType_t values : bt_rle, bt_raw, anything else = FSE-described */
+    Offtype = (*ip >> 4) & 3;
+    MLtype  = (*ip >> 2) & 3;
+    if (*ip & 2)   /* long format : dumps length on 16 bits */
+    {
+        dumpsLength  = ip[2];
+        dumpsLength += ip[1] << 8;
+        ip += 3;
+    }
+    else   /* short format : dumps length on 9 bits */
+    {
+        dumpsLength  = ip[1];
+        dumpsLength += (ip[0] & 1) << 8;
+        ip += 2;
+    }
+    *dumpsPtr = ip;
+    ip += dumpsLength;
+    *dumpsLengthPtr = dumpsLength;
+
+    /* check */
+    if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */
+
+    /* sequences */
+    {
+        S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL and MaxOff */
+        size_t headerSize;
+
+        /* Build DTables */
+        switch(LLtype)
+        {
+        case bt_rle :
+            LLlog = 0;
+            FSE_buildDTable_rle(DTableLL, *ip++); break;   /* single symbol : its value is the next input byte */
+        case bt_raw :
+            LLlog = LLbits;
+            FSE_buildDTable_raw(DTableLL, LLbits); break;
+        default :
+            {   U32 max = MaxLL;
+                headerSize = FSE_readNCount(norm, &max, &LLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (LLlog > LLFSELog) return ERROR(corruption_detected);   /* table would not fit into the space reserved inside ZSTD_DCtx */
+                ip += headerSize;
+                FSE_buildDTable(DTableLL, norm, max, LLlog);
+        }   }
+
+        switch(Offtype)
+        {
+        case bt_rle :
+            Offlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong);   /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableOffb, *ip++ & MaxOff); /* if *ip > MaxOff, data is corrupted */
+            break;
+        case bt_raw :
+            Offlog = Offbits;
+            FSE_buildDTable_raw(DTableOffb, Offbits); break;
+        default :
+            {   U32 max = MaxOff;
+                headerSize = FSE_readNCount(norm, &max, &Offlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (Offlog > OffFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableOffb, norm, max, Offlog);
+        }   }
+
+        switch(MLtype)
+        {
+        case bt_rle :
+            MLlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong); /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableML, *ip++); break;
+        case bt_raw :
+            MLlog = MLbits;
+            FSE_buildDTable_raw(DTableML, MLbits); break;
+        default :
+            {   U32 max = MaxML;
+                headerSize = FSE_readNCount(norm, &max, &MLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (MLlog > MLFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableML, norm, max, MLlog);
+    }   }   }
+
+    return ip-istart;
+}
+
+
+typedef struct {   /* one decoded sequence : copy litLength literals, then matchLength bytes located `offset` bytes back in the output */
+    size_t litLength;
+    size_t offset;
+    size_t matchLength;   /* MINMATCH already included */
+} seq_t;
+
+typedef struct {   /* running state of the sequence decoder for one block */
+    BIT_DStream_t DStream;    /* bitstream shared by the 3 FSE states and the extra offset bits */
+    FSE_DState_t stateLL;     /* FSE state producing literal length codes */
+    FSE_DState_t stateOffb;   /* FSE state producing offset codes */
+    FSE_DState_t stateML;     /* FSE state producing match length codes */
+    size_t prevOffset;        /* offset of an earlier sequence, reused when the offset code is 0 */
+    const BYTE* dumps;        /* next unread byte of the "dumps" area (extra length bytes) */
+    const BYTE* dumpsEnd;     /* end of the "dumps" area */
+} seqState_t;
+
+
+/* ZSTD_decodeSequence() :
+ *  Decodes the next sequence (literal length, offset, match length) from the bitstream of seqState.
+ *  On entry, *seq must still hold the previous sequence : its offset feeds the "repeat offset" logic.
+ *  A length code equal to MaxLL / MaxML is an escape : the real length is completed from the "dumps" area,
+ *  either 1 extra byte (< 255) added to the code, or the marker 255 followed by a 24-bit little endian length.
+ *  Reads from the dumps area never go beyond seqState->dumpsEnd : a truncated area leaves the length
+ *  at its escape value (data is corrupted anyway, later consistency checks will report it). */
+static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
+{
+    size_t litLength;
+    size_t prevOffset;
+    size_t offset;
+    size_t matchLength;
+    const BYTE* dumps = seqState->dumps;
+    const BYTE* const de = seqState->dumpsEnd;
+
+    /* Literal length */
+    litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));
+    prevOffset = litLength ? seq->offset : seqState->prevOffset;
+    seqState->prevOffset = seq->offset;
+    if (litLength == MaxLL)
+    {
+        const U32 add = (dumps < de) ? *dumps++ : 0;
+        if (add < 255) litLength += add;
+        else if (de - dumps >= 3)
+        {
+            /* read exactly 3 bytes : a 32-bit load could touch 1 byte beyond the end of the block */
+            litLength = (size_t)dumps[0] + ((size_t)dumps[1] << 8) + ((size_t)dumps[2] << 16);
+            dumps += 3;
+        }
+        if (dumps >= de) dumps = de-1;   /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+
+    /* Offset */
+    {
+        static const size_t offsetPrefix[MaxOff+1] = {  /* note : size_t faster than U32 */
+                1 /*fake*/, 1, 2, 4, 8, 16, 32, 64, 128, 256,
+                512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144,
+                524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, /*fake*/ 1, 1, 1, 1, 1 };
+        U32 offsetCode, nbBits;
+        offsetCode = FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));   /* <= maxOff, by table construction */
+        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+        nbBits = offsetCode - 1;
+        if (offsetCode==0) nbBits = 0;   /* cmove */
+        offset = offsetPrefix[offsetCode] + BIT_readBits(&(seqState->DStream), nbBits);
+        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+        if (offsetCode==0) offset = prevOffset;   /* cmove */
+    }
+
+    /* MatchLength */
+    matchLength = FSE_decodeSymbol(&(seqState->stateML), &(seqState->DStream));
+    if (matchLength == MaxML)
+    {
+        const U32 add = (dumps < de) ? *dumps++ : 0;
+        if (add < 255) matchLength += add;
+        else if (de - dumps >= 3)
+        {
+            matchLength = (size_t)dumps[0] + ((size_t)dumps[1] << 8) + ((size_t)dumps[2] << 16);
+            dumps += 3;
+        }
+        if (dumps >= de) dumps = de-1;   /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+    matchLength += MINMATCH;
+
+    /* save result */
+    seq->litLength = litLength;
+    seq->offset = offset;
+    seq->matchLength = matchLength;
+    seqState->dumps = dumps;
+}
+
+
+/* ZSTD_execSequence() :
+ *  Executes one sequence at position op : copies sequence.litLength literals from *litPtr,
+ *  then sequence.matchLength bytes from sequence.offset bytes before the end of those literals.
+ *  *litPtr is advanced past the consumed literals.
+ *  litLimit is the end of the literals, base the lowest address a match may reference,
+ *  oend the end of the destination buffer.
+ *  All bounds checks are done on lengths, before any pointer is derived from the (untrusted)
+ *  sequence fields, so that corrupted data cannot wrap the address space.
+ *  @return : nb of bytes written (litLength + matchLength), or an error code */
+static size_t ZSTD_execSequence(BYTE* op,
+                                seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                BYTE* const base, BYTE* const oend)
+{
+    static const int dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};   /* added */
+    static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11};   /* subtracted */
+    const BYTE* const ostart = op;
+    const size_t dstRoom = (size_t)(oend - op);
+    const size_t seqLength = sequence.litLength + sequence.matchLength;
+    BYTE* oLitEnd;
+    BYTE* oMatchEnd;
+    BYTE* oend_8;
+    const BYTE* litEnd;
+
+    /* checks */
+    if (seqLength > dstRoom) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
+    if ((dstRoom < 8) || (sequence.litLength > dstRoom - 8)) return ERROR(dstSize_tooSmall);   /* last match must start at a minimum distance of 8 from oend */
+    if (sequence.litLength > (size_t)(litLimit - *litPtr)) return ERROR(corruption_detected);   /* overRead beyond lit buffer */
+
+    /* lengths are validated : pointers below stay within their buffers */
+    oLitEnd = op + sequence.litLength;
+    oMatchEnd = op + seqLength;
+    oend_8 = oend - 8;
+    litEnd = *litPtr + sequence.litLength;
+
+    /* copy Literals */
+    ZSTD_wildcopy(op, *litPtr, sequence.litLength);   /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = litEnd;   /* update for next sequence */
+
+    /* copy Match */
+    {
+        const BYTE* match;
+
+        /* check : the match must start at or after base */
+        if (sequence.offset > (size_t)(oLitEnd - base)) return ERROR(corruption_detected);
+        match = op - sequence.offset;
+
+        /* close range match, overlap */
+        if (sequence.offset < 8)
+        {
+            const int dec64 = dec64table[sequence.offset];
+            op[0] = match[0];
+            op[1] = match[1];
+            op[2] = match[2];
+            op[3] = match[3];
+            match += dec32table[sequence.offset];
+            ZSTD_copy4(op+4, match);
+            match -= dec64;
+        }
+        else
+        {
+            ZSTD_copy8(op, match);
+        }
+        op += 8; match += 8;
+
+        if ((size_t)(oend - oMatchEnd) < (size_t)(16-MINMATCH))
+        {
+            /* too close to oend for wildcopy's overshoot : finish byte by byte */
+            if (op < oend_8)
+            {
+                ZSTD_wildcopy(op, match, oend_8 - op);
+                match += oend_8 - op;
+                op = oend_8;
+            }
+            while (op < oMatchEnd) *op++ = *match++;
+        }
+        else
+        {
+            ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+        }
+    }
+
+    return oMatchEnd - ostart;
+}
+
+/* ZSTD_decompressSequences() :
+ *  Regenerates a compressed block into dst, from its sequences section (seqStart, seqSize)
+ *  and from the literals previously located by ZSTD_decodeLiteralsBlock() (ctx->litPtr, ctx->litSize).
+ *  Literals left over once all sequences are executed are appended at the end of the output.
+ *  @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompressSequences(
+                               void* ctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    ZSTD_DCtx* dctx = (ZSTD_DCtx*)ctx;
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t errorCode, dumpsLength;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    int nbSeq;
+    const BYTE* dumps;
+    U32* DTableLL = dctx->LLTable;
+    U32* DTableML = dctx->MLTable;
+    U32* DTableOffb = dctx->OffTable;
+    BYTE* const base = (BYTE*) (dctx->base);
+
+    /* Build Decoding Tables */
+    errorCode = ZSTD_decodeSeqHeaders(&nbSeq, &dumps, &dumpsLength,
+                                      DTableLL, DTableML, DTableOffb,
+                                      ip, iend-ip);
+    if (ZSTD_isError(errorCode)) return errorCode;
+    ip += errorCode;
+
+    /* Regen sequences */
+    {
+        seq_t sequence;
+        seqState_t seqState;
+
+        memset(&sequence, 0, sizeof(sequence));
+        seqState.dumps = dumps;
+        seqState.dumpsEnd = dumps + dumpsLength;
+        seqState.prevOffset = sequence.offset = 4;
+        errorCode = BIT_initDStream(&(seqState.DStream), ip, iend-ip);
+        if (ERR_isError(errorCode)) return ERROR(corruption_detected);
+        FSE_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSE_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSE_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (nbSeq>0) ; )
+        {
+            size_t oneSeqSize;
+            nbSeq--;
+            ZSTD_decodeSequence(&sequence, &seqState);
+            oneSeqSize = ZSTD_execSequence(op, sequence, &litPtr, litEnd, base, oend);
+            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
+        }
+
+        /* check if reached exact end */
+        if ( !BIT_endOfDStream(&(seqState.DStream)) ) return ERROR(corruption_detected);   /* requested too much : data is corrupted */
+        if (nbSeq<0) return ERROR(corruption_detected);   /* requested too many sequences : data is corrupted */
+
+        /* last literal segment */
+        {
+            size_t lastLLSize;
+            if (litPtr > litEnd) return ERROR(corruption_detected);
+            lastLLSize = litEnd - litPtr;
+            if (lastLLSize > (size_t)(oend - op)) return ERROR(dstSize_tooSmall);   /* compares lengths : op+lastLLSize could wrap */
+            if (lastLLSize > 0)   /* nothing to do for an empty tail ; also keeps memmove() away from a NULL dst */
+            {
+                if (op != litPtr) memmove(op, litPtr, lastLLSize);
+                op += lastLLSize;
+            }
+        }
+    }
+
+    return op-ostart;
+}
+
+
+/* ZSTD_decompressBlock() :
+ *  Decodes one block of type bt_compressed : first its literals sub-block,
+ *  then the sequences section which takes up the rest of the block.
+ *  @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompressBlock(
+                            void* ctx,
+                            void* dst, size_t maxDstSize,
+                      const void* src, size_t srcSize)
+{
+    /* blockType == blockCompressed */
+    const size_t litSectionSize = ZSTD_decodeLiteralsBlock(ctx, src, srcSize);
+
+    if (ZSTD_isError(litSectionSize))
+        return litSectionSize;
+
+    /* whatever follows the literals sub-block is the sequences section */
+    return ZSTD_decompressSequences(ctx, dst, maxDstSize,
+                                    (const BYTE*)src + litSectionSize,
+                                    srcSize - litSectionSize);
+}
+
+/* ZSTD_decompressDCtx() :
+ *  Decompresses one complete frame from src into dst, using ctx as working context.
+ *  The frame is a 4-byte magic number followed by blocks, each introduced by a 3-byte header,
+ *  up to a block whose announced content size is 0 (bt_end).
+ *  bt_rle blocks are rejected (not supported by this decoder).
+ *  @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t remainingSize = srcSize;
+    U32 magicNumber;
+    blockProperties_t blockProperties;
+
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    magicNumber = MEM_readLE32(src);
+    if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+    ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize;
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t decodedSize=0;
+        size_t cBlockSize = ZSTD_getcBlockSize(ip, iend-ip, &blockProperties);   /* fails when fewer than 3 bytes remain */
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);   /* block content must be entirely present */
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTD_decompressBlock(ctx, op, oend-op, ip, cBlockSize);
+            break;
+        case bt_raw :
+            decodedSize = ZSTD_copyUncompressedBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet supported */
+            break;
+        case bt_end :
+            /* end of frame */
+            if (remainingSize) return ERROR(srcSize_wrong);   /* nothing may follow the end marker */
+            break;
+        default:
+            return ERROR(GENERIC);   /* impossible */
+        }
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        if (ZSTD_isError(decodedSize)) return decodedSize;
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return op-ostart;
+}
+
+/* ZSTD_decompress() :
+ *  One-shot decompression of a complete frame, using a context which lives on the stack.
+ *  Matches may reference data from the start of dst onwards.
+ *  @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    ZSTD_DCtx stackCtx;
+
+    stackCtx.base = dst;
+    return ZSTD_decompressDCtx(&stackCtx, dst, maxDstSize, src, srcSize);
+}
+
+/* ZSTD_findFrameCompressedSize() :
+ *  Walks the block headers of the frame starting at src, without decoding any block content.
+ *  @return : nb of bytes of src taken by the frame, end-of-frame block header included,
+ *            or an error code (srcSize too small, unknown magic number, truncated block) */
+static size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    const BYTE* const frameStart = (const BYTE*)src;
+    const BYTE* cursor = frameStart;
+    size_t bytesLeft = srcSize;
+    blockProperties_t props;
+
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    if (MEM_readLE32(src) != ZSTD_magicNumber) return ERROR(prefix_unknown);
+    cursor += ZSTD_frameHeaderSize;
+    bytesLeft -= ZSTD_frameHeaderSize;
+
+    /* Skip over blocks, one header at a time */
+    for (;;)
+    {
+        const size_t cBlockSize = ZSTD_getcBlockSize(cursor, bytesLeft, &props);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        cursor += ZSTD_blockHeaderSize;
+        bytesLeft -= ZSTD_blockHeaderSize;
+        if (cBlockSize > bytesLeft) return ERROR(srcSize_wrong);
+
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        cursor += cBlockSize;
+        bytesLeft -= cBlockSize;
+    }
+
+    return cursor - frameStart;
+}
+
+
+/*******************************
+*  Streaming Decompression API
+*******************************/
+
+/* ZSTD_resetDCtx() :
+ *  Puts a context back into its initial streaming state : a frame header is expected next,
+ *  and no previous output is known.
+ *  @return : always 0 */
+static size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx)
+{
+    /* streaming state machine : wait for a frame header */
+    dctx->phase = 0;
+    dctx->expected = ZSTD_frameHeaderSize;
+
+    /* forget any previously produced output */
+    dctx->base = NULL;
+    dctx->previousDstEnd = NULL;
+
+    return 0;
+}
+
+/* ZSTD_createDCtx() :
+ *  Allocates a decompression context with malloc() and resets it.
+ *  @return : the new context, to be released with ZSTD_freeDCtx(), or NULL when allocation fails */
+static ZSTD_DCtx* ZSTD_createDCtx(void)
+{
+    ZSTD_DCtx* const fresh = (ZSTD_DCtx*)malloc(sizeof(*fresh));
+
+    if (fresh != NULL) ZSTD_resetDCtx(fresh);
+    return fresh;
+}
+/* ZSTD_freeDCtx() : releases a context obtained from ZSTD_createDCtx() ; @return : always 0 */
+static size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+{
+    free(dctx);
+    return 0;
+}
+/* ZSTD_nextSrcSizeToDecompress() : @return the exact srcSize the next ZSTD_decompressContinue() call must be given */
+static size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx)
+{
+    return dctx->expected;
+}
+/* ZSTD_decompressContinue() :
+ *  Streaming decompression, one step per call. srcSize must be exactly ctx->expected.
+ *  Steps follow ctx->phase : 0 = check the frame header, 1 = parse a block header, 2 = decode a block content.
+ *  When dst does not directly follow the output of the previous call, it becomes the new base for matches.
+ *  @return : nb of bytes written into dst (0 for the header steps), or an error code */
+static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    /* Sanity check */
+    if (srcSize != ctx->expected) return ERROR(srcSize_wrong);
+    if (dst != ctx->previousDstEnd)  /* not contiguous */
+        ctx->base = dst;
+
+    /* Decompress : frame header */
+    if (ctx->phase == 0)
+    {
+        /* Check frame magic header */
+        U32 magicNumber = MEM_readLE32(src);
+        if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+        ctx->phase = 1;
+        ctx->expected = ZSTD_blockHeaderSize;
+        return 0;
+    }
+
+    /* Decompress : block header */
+    if (ctx->phase == 1)
+    {
+        blockProperties_t bp;
+        size_t blockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+        if (ZSTD_isError(blockSize)) return blockSize;
+        if (bp.blockType == bt_end)
+        {
+            ctx->expected = 0;   /* frame is complete : nothing more to provide */
+            ctx->phase = 0;
+        }
+        else
+        {
+            ctx->expected = blockSize;   /* next call must bring the whole block content */
+            ctx->bType = bp.blockType;
+            ctx->phase = 2;
+        }
+
+        return 0;
+    }
+
+    /* Decompress : block content */
+    {
+        size_t rSize;
+        switch(ctx->bType)
+        {
+        case bt_compressed:
+            rSize = ZSTD_decompressBlock(ctx, dst, maxDstSize, src, srcSize);
+            break;
+        case bt_raw :
+            rSize = ZSTD_copyUncompressedBlock(dst, maxDstSize, src, srcSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet handled */
+            break;
+        case bt_end :   /* should never happen (filtered at phase 1) */
+            rSize = 0;
+            break;
+        default:
+            return ERROR(GENERIC);
+        }
+        ctx->phase = 1;   /* a block header is expected after each block content */
+        ctx->expected = ZSTD_blockHeaderSize;
+        ctx->previousDstEnd = (void*)( ((char*)dst) + rSize);   /* note : not meaningful when rSize is an error code */
+        return rSize;
+    }
+
+}
+
+
+/* wrapper layer :
+ * public ZSTDv03_* entry points, each one forwarding its arguments unchanged to the static ZSTD_* function
+ * of the same name. ZSTDv03_Dctx pointers are cast to ZSTD_DCtx pointers (and back for ZSTDv03_createDCtx). */
+
+unsigned ZSTDv03_isError(size_t code)
+{
+    return ZSTD_isError(code);
+}
+
+size_t ZSTDv03_decompress( void* dst, size_t maxOriginalSize,
+                     const void* src, size_t compressedSize)
+{
+    return ZSTD_decompress(dst, maxOriginalSize, src, compressedSize);
+}
+
+size_t ZSTDv03_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    return ZSTD_findFrameCompressedSize(src, srcSize);
+}
+
+ZSTDv03_Dctx* ZSTDv03_createDCtx(void)
+{
+    return (ZSTDv03_Dctx*)ZSTD_createDCtx();
+}
+
+size_t ZSTDv03_freeDCtx(ZSTDv03_Dctx* dctx)
+{
+    return ZSTD_freeDCtx((ZSTD_DCtx*)dctx);
+}
+
+size_t ZSTDv03_resetDCtx(ZSTDv03_Dctx* dctx)
+{
+    return ZSTD_resetDCtx((ZSTD_DCtx*)dctx);
+}
+
+size_t ZSTDv03_nextSrcSizeToDecompress(ZSTDv03_Dctx* dctx)
+{
+    return ZSTD_nextSrcSizeToDecompress((ZSTD_DCtx*)dctx);
+}
+
+size_t ZSTDv03_decompressContinue(ZSTDv03_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return ZSTD_decompressContinue((ZSTD_DCtx*)dctx, dst, maxDstSize, src, srcSize);
+}
diff --git a/deps/SZ/zstd/legacy/zstd_v03.h b/deps/SZ/zstd/legacy/zstd_v03.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4449e2999e0fe9c017680c694b5d25c1a81451f
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v03.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V03_H_298734209782
+#define ZSTD_V03_H_298734209782
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stddef.h>   /* size_t */
+
+
+/* *************************************
+*  Simple one-step function
+***************************************/
+/**
+ZSTDv03_decompress() : decompress ZSTD frames compliant with v0.3.x format
+    compressedSize : is the exact source size
+    maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+                      It must be equal or larger than originalSize, otherwise decompression will fail.
+    return : the number of bytes decompressed into destination buffer (originalSize)
+             or an errorCode if it fails (which can be tested using ZSTDv03_isError())
+*/
+size_t ZSTDv03_decompress( void* dst, size_t maxOriginalSize,
+                     const void* src, size_t compressedSize);
+
+/**
+ZSTDv03_findFrameCompressedSize() : get the compressed size of a ZSTD frame compliant with v0.3.x format
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv03_isError())
+*/
+size_t ZSTDv03_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+/**
+ZSTDv03_isError() : tells if the result of ZSTDv03_decompress() is an error
+*/
+unsigned ZSTDv03_isError(size_t code);
+
+
+/* *************************************
+*  Advanced functions
+***************************************/
+typedef struct ZSTDv03_Dctx_s ZSTDv03_Dctx;
+ZSTDv03_Dctx* ZSTDv03_createDCtx(void);
+size_t ZSTDv03_freeDCtx(ZSTDv03_Dctx* dctx);
+
+size_t ZSTDv03_decompressDCtx(void* ctx,
+                              void* dst, size_t maxOriginalSize,
+                        const void* src, size_t compressedSize);
+
+/* *************************************
+*  Streaming functions
+***************************************/
+size_t ZSTDv03_resetDCtx(ZSTDv03_Dctx* dctx);
+
+size_t ZSTDv03_nextSrcSizeToDecompress(ZSTDv03_Dctx* dctx);
+size_t ZSTDv03_decompressContinue(ZSTDv03_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+  Use the above functions alternately.
+  ZSTDv03_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTDv03_decompressContinue().
+  ZSTDv03_decompressContinue() will use previously decoded blocks as match references if they are located immediately before the current block in 'dst'.
+  Result is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTDv03_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+*  Prefix - version detection
+***************************************/
+#define ZSTDv03_magicNumber 0xFD2FB523   /* v0.3 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V03_H_298734209782 */
diff --git a/deps/SZ/zstd/legacy/zstd_v04.c b/deps/SZ/zstd/legacy/zstd_v04.c
new file mode 100644
index 0000000000000000000000000000000000000000..a2e2cfa80db5a65769457012869c23a1867bb27e
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v04.c
@@ -0,0 +1,3628 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+ /******************************************
+ *  Includes
+ ******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include <string.h>    /* memcpy */
+
+#include "zstd_v04.h"
+#include "error_private.h"
+
+
+/* ******************************************************************
+ *   mem.h
+ *******************************************************************/
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+#if defined(__GNUC__)
+#  define MEM_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/****************************************************************
+*  Basic Types
+*****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/*-*************************************
+*  Debug
+***************************************/
+#include "debug.h"
+#ifndef assert
+#  define assert(condition) ((void)0)
+#endif
+
+
+/****************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ *            It can generate buggy code on targets generating assembly depending on alignment.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return (sizeof(void*) == 4) ? 1U : 0U; }   /* 1 when pointers are 4 bytes wide, else 0 */
+MEM_STATIC unsigned MEM_64bits(void) { return (sizeof(void*) == 8) ? 1U : 0U; }   /* 1 when pointers are 8 bytes wide, else 0 */
+
+/* Runtime endianness probe : yields 1 when the first byte in memory of a U32 holding 1 is 1 (little-endian layout), 0 otherwise */
+MEM_STATIC unsigned MEM_isLittleEndian(void) {
+    const union { U32 word; BYTE bytes[4]; } probe = { 1 };   /* kept non-static on purpose : static is detrimental to performance */
+    return probe.bytes[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* Method 2 : direct dereference through a cast pointer.
+   Violates C standard on alignment. Only use if no other choice to achieve best performance on target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* Method 1 : access through a packed union. __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
+
+#else
+
+/* Method 0 (default) : copy through memcpy(). Safe and standard,
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+
+/* MEM_readLE16 : returns the 16-bit value stored in little-endian byte order at memPtr */
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    const BYTE* const bytes = (const BYTE*)memPtr;
+
+    /* little-endian host : the native load already has the right byte order */
+    if (MEM_isLittleEndian()) return MEM_read16(memPtr);
+    /* otherwise assemble the value byte by byte, lowest-order byte first */
+    return (U16)(bytes[0] + (bytes[1]<<8));
+}
+
+/* MEM_writeLE16 : stores val at memPtr in little-endian byte order */
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    BYTE* const out = (BYTE*)memPtr;
+
+    if (MEM_isLittleEndian()) {
+        /* little-endian host : a native store already produces the right layout */
+        MEM_write16(memPtr, val);
+        return;
+    }
+    out[0] = (BYTE)val;         /* low-order byte first */
+    out[1] = (BYTE)(val>>8);    /* then high-order byte */
+}
+
+/* MEM_readLE32 : returns the 32-bit value stored in little-endian byte order at memPtr */
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    const BYTE* const bytes = (const BYTE*)memPtr;
+
+    if (MEM_isLittleEndian()) return MEM_read32(memPtr);   /* native load is already little-endian */
+    /* otherwise rebuild the value from its 4 bytes, lowest-order byte first */
+    return (U32)((U32)bytes[0] + ((U32)bytes[1]<<8)
+               + ((U32)bytes[2]<<16) + ((U32)bytes[3]<<24));
+}
+
+
+/* MEM_readLE64 :
+ * returns the 64-bit value stored in little-endian byte order at memPtr */
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    const BYTE* const bytes = (const BYTE*)memPtr;
+
+    if (MEM_isLittleEndian()) return MEM_read64(memPtr);   /* native load is already little-endian */
+    /* otherwise rebuild the value from its 8 bytes, lowest-order byte first */
+    return (U64)((U64)bytes[0]      + ((U64)bytes[1]<<8)  + ((U64)bytes[2]<<16) + ((U64)bytes[3]<<24)
+              + ((U64)bytes[4]<<32) + ((U64)bytes[5]<<40) + ((U64)bytes[6]<<48) + ((U64)bytes[7]<<56));
+}
+
+
+/* MEM_readLEST : little-endian read returned as a size_t;
+ * performs a 32-bit read when MEM_32bits() is true, a 64-bit read otherwise */
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    return MEM_32bits() ? (size_t)MEM_readLE32(memPtr)
+                        : (size_t)MEM_readLE64(memPtr);
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+
+/*
+    zstd - standard compression library
+    Header File for static linking only
+*/
+#ifndef ZSTD_STATIC_H
+#define ZSTD_STATIC_H
+
+
+/* *************************************
+*  Types
+***************************************/
+#define ZSTD_WINDOWLOG_MAX 26
+#define ZSTD_WINDOWLOG_MIN 18
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 11
+#define ZSTD_CONTENTLOG_MAX (ZSTD_WINDOWLOG_MAX+1)
+#define ZSTD_CONTENTLOG_MIN 4
+#define ZSTD_HASHLOG_MAX 28
+#define ZSTD_HASHLOG_MIN 4
+#define ZSTD_SEARCHLOG_MAX (ZSTD_CONTENTLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN 1
+#define ZSTD_SEARCHLENGTH_MAX 7
+#define ZSTD_SEARCHLENGTH_MIN 4
+
+/** from faster to stronger */
+typedef enum { ZSTD_fast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2, ZSTD_btlazy2 } ZSTD_strategy;
+
+typedef struct
+{
+    U64 srcSize;       /* optional : tells how many bytes are present in the frame. Use 0 if not known. */
+    U32 windowLog;     /* largest match distance : larger == more compression, more memory needed during decompression */
+    U32 contentLog;    /* full search segment : larger == more compression, slower, more memory (useless for fast) */
+    U32 hashLog;       /* dispatch table : larger == more memory, faster */
+    U32 searchLog;     /* nb of searches : larger == more compression, slower */
+    U32 searchLength;  /* size of matches : larger == faster decompression, sometimes less compression */
+    ZSTD_strategy strategy;   /* one of the ZSTD_strategy values, which are ordered from faster to stronger */
+} ZSTD_parameters;
+
+typedef ZSTDv04_Dctx ZSTD_DCtx;
+
+/* *************************************
+*  Advanced functions
+***************************************/
+/** ZSTD_decompress_usingDict
+*   Same as ZSTD_decompressDCtx, using a Dictionary content as prefix
+*   Note : dict can be NULL, in which case, it's equivalent to ZSTD_decompressDCtx() */
+static size_t ZSTD_decompress_usingDict(ZSTD_DCtx* ctx,
+                                             void* dst, size_t maxDstSize,
+                                       const void* src, size_t srcSize,
+                                       const void* dict,size_t dictSize);
+
+
+/* **************************************
+*  Streaming functions (direct mode)
+****************************************/
+static size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx);
+static size_t ZSTD_getFrameParams(ZSTD_parameters* params, const void* src, size_t srcSize);
+static void   ZSTD_decompress_insertDictionary(ZSTD_DCtx* ctx, const void* src, size_t srcSize);
+
+static size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+static size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+
+/**
+  Streaming decompression, bufferless mode
+
+  A ZSTD_DCtx object is required to track streaming operations.
+  Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+  A ZSTD_DCtx object can be re-used multiple times. Use ZSTD_resetDCtx() to return to fresh status.
+
+  First operation is to retrieve frame parameters, using ZSTD_getFrameParams().
+  This function doesn't consume its input. It needs enough input data to properly decode the frame header.
+  Objective is to retrieve *params.windowlog, to know minimum amount of memory required during decoding.
+  Result : 0 when successful, it means the ZSTD_parameters structure has been filled.
+           >0 : means there is not enough data in src. Provides the expected size to successfully decode header.
+           errorCode, which can be tested using ZSTD_isError() (For example, if it's not a ZSTD header)
+
+  Then, you can optionally insert a dictionary.
+  This operation must mimic the compressor behavior, otherwise decompression will fail or be corrupted.
+
+  Then it's possible to start decompression.
+  Use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately.
+  ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() requires this exact amount of bytes, or it will fail.
+  ZSTD_decompressContinue() needs previous data blocks during decompression, up to (1 << windowlog).
+  They should preferably be located contiguously, prior to current block. Alternatively, a round buffer is also possible.
+
+  @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+
+  A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+*/
+
+
+
+
+#endif  /* ZSTD_STATIC_H */
+
+
+/*
+    zstd_internal - common functions to include
+    Header File for include
+*/
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+/* *************************************
+*  Common macros
+***************************************/
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
+
+/* *************************************
+*  Common constants
+***************************************/
+#define ZSTD_MAGICNUMBER 0xFD2FB524   /* v0.4 */
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BLOCKSIZE (128 KB)                 /* define, for static allocation */
+
+static const size_t ZSTD_blockHeaderSize = 3;
+static const size_t ZSTD_frameHeaderSize_min = 5;
+#define ZSTD_frameHeaderSize_max 5         /* define, for static allocation */
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define IS_RAW BIT0
+#define IS_RLE BIT1
+
+#define MINMATCH 4
+#define REPCODE_STARTVALUE 4
+
+#define MLbits   7
+#define LLbits   6
+#define Offbits  5
+#define MaxML  ((1<<MLbits) - 1)
+#define MaxLL  ((1<<LLbits) - 1)
+#define MaxOff ((1<<Offbits)- 1)
+#define MLFSELog   10
+#define LLFSELog   10
+#define OffFSELog   9
+#define MaxSeq MAX(MaxLL, MaxML)
+
+#define MIN_SEQUENCES_SIZE (2 /*seqNb*/ + 2 /*dumps*/ + 3 /*seqTables*/ + 1 /*bitStream*/)
+#define MIN_CBLOCK_SIZE (3 /*litCSize*/ + MIN_SEQUENCES_SIZE)
+
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
+
+
+/* ******************************************
+*  Shared functions to include for inlining
+********************************************/
+static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }   /* copies exactly 8 bytes from src to dst */
+
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTD_wildcopy : custom version of memcpy(), can copy up to 7-8 bytes too many */
+static void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* from = (const BYTE*)src;
+    BYTE* to = (BYTE*)dst;
+    BYTE* const limit = to + length;
+    /* 8-byte strides; the first stride is unconditional, so at least 8 bytes are always written */
+    do { ZSTD_copy8(to, from); to += 8; from += 8; }
+    while (to < limit);
+}
+
+
+
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   header file
+****************************************************************** */
+#ifndef FSE_H
+#define FSE_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* *****************************************
+*  Includes
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+
+
+/* *****************************************
+*  FSE simple functions
+******************************************/
+static size_t FSE_decompress(void* dst,  size_t maxDstSize,
+                const void* cSrc, size_t cSrcSize);
+/*!
+FSE_decompress():
+    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'maxDstSize'.
+    return : size of regenerated data (<= maxDstSize)
+             or an error code, which can be tested using FSE_isError()
+
+    ** Important ** : FSE_decompress() doesn't decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+
+
+/* *****************************************
+*  Tool functions
+******************************************/
+/* Error Management */
+static unsigned    FSE_isError(size_t code);        /* tells if a return value is an error code */
+
+
+
+/* *****************************************
+*  FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrence from source[] into table count[]
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+
+/* *** DECOMPRESSION *** */
+
+/*!
+FSE_readNCount():
+   Read compactly saved 'normalizedCounter' from 'rBuffer'.
+   return : size read from 'rBuffer'
+            or an errorCode, which can be tested using FSE_isError()
+            maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+static  size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
+
+/*!
+Constructor and Destructor of type FSE_DTable
+    Note that its size depends on 'tableLog' */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+
+/*!
+FSE_buildDTable():
+   Builds 'dt', which must be already allocated, using FSE_createDTable()
+   return : 0,
+            or an errorCode, which can be tested using FSE_isError() */
+static size_t FSE_buildDTable ( FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*!
+FSE_decompress_usingDTable():
+   Decompress compressed source 'cSrc' of size 'cSrcSize' using 'dt'
+   into 'dst' which must be already allocated.
+   return : size of regenerated data (necessarily <= maxDstSize)
+            or an errorCode, which can be tested using FSE_isError() */
+static  size_t FSE_decompress_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSE_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'.
+This is performed by the function FSE_buildDTable().
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+'FSE_DTable' can then be used to decompress 'cSrc', with FSE_decompress_usingDTable().
+'cSrcSize' must be strictly correct, otherwise decompression will fail.
+FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=maxDstSize).
+If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
+*/
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSE_H */
+
+
+/* ******************************************************************
+   bitstream
+   Part of NewGen Entropy library
+   header file (to include)
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+*  This API consists of small unitary functions, which highly benefit from being inlined.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+/**********************************************
+*  bitStream decompression API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;   /* local word holding the stream bits currently being read */
+    unsigned bitsConsumed;   /* number of bits of bitContainer already read */
+    const char* ptr;         /* address bitContainer was last loaded from; moves toward start on reload */
+    const char* start;       /* first byte of the source buffer */
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,
+               BIT_DStream_endOfBuffer = 1,
+               BIT_DStream_completed = 2,
+               BIT_DStream_overflow = 3 } BIT_DStream_status;  /* result of BIT_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+
+
+/******************************************
+*  unsafe API
+******************************************/
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/****************************************************************
+*  Helper functions
+****************************************************************/
+/* BIT_highbit32 : index (0..31) of the highest set bit of val. NOTE(review): val==0 looks unsupported (__builtin_clz(0) is undefined) - confirm callers check first */
+MEM_STATIC unsigned BIT_highbit32 (U32 val)
+{
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long msbIndex = 0;
+    _BitScanReverse ( &msbIndex, val );
+    return (unsigned) msbIndex;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+    return __builtin_clz (val) ^ 31;   /* identical to 31 - clz, since clz lies within 0..31 */
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 smeared = val;   /* the highest set bit gets copied into every lower position below */
+    smeared |= smeared >> 1;
+    smeared |= smeared >> 2;
+    smeared |= smeared >> 4;
+    smeared |= smeared >> 8;
+    smeared |= smeared >> 16;
+    /* multiply + shift turns the smeared pattern into a 5-bit table index */
+    return DeBruijnClz[ (U32) (smeared * 0x07C4ACDDU) >> 27];
+#   endif
+}
+
+
+/**********************************************************
+* bitStream decoding
+**********************************************************/
+
+/*!BIT_initDStream
+*  Initialize a BIT_DStream_t.
+*  @bitD : a pointer to an already allocated BIT_DStream_t structure
+*  @srcBuffer must point at the beginning of a bitStream
+*  @srcSize must be the exact size of the bitStream
+*  @result : size of stream (== srcSize) or an errorCode if a problem is detected
+*/
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    const BYTE* const src = (const BYTE*)srcBuffer;
+    const int fullWord = (srcSize >= sizeof(size_t));   /* enough input to load a whole container at once */
+    U32 lastByte;
+    /* empty input : zero the stream state and report the error */
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    bitD->start = (const char*)srcBuffer;
+    if (fullWord)   /* normal case : container is loaded from the last sizeof(size_t) bytes */
+    {
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+    }
+    else   /* short input : build the container byte by byte */
+    {
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = src[0];
+        switch(srcSize)
+        {
+            case 7: bitD->bitContainer += (size_t)(src[6]) << (sizeof(size_t)*8 - 16);   /* fall-through */
+            case 6: bitD->bitContainer += (size_t)(src[5]) << (sizeof(size_t)*8 - 24);   /* fall-through */
+            case 5: bitD->bitContainer += (size_t)(src[4]) << (sizeof(size_t)*8 - 32);   /* fall-through */
+            case 4: bitD->bitContainer += (size_t)(src[3]) << 24;   /* fall-through */
+            case 3: bitD->bitContainer += (size_t)(src[2]) << 16;   /* fall-through */
+            case 2: bitD->bitContainer += (size_t)(src[1]) <<  8;   /* fall-through */
+            default: break;
+        }
+    }
+
+    /* the last byte must be non-zero : its highest set bit is the end mark, and bits from it upward count as already consumed */
+    lastByte = src[srcSize-1];
+    if (lastByte == 0) return ERROR(GENERIC);   /* endMark not present */
+    bitD->bitsConsumed = 8 - BIT_highbit32(lastByte);
+    if (!fullWord) bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8;   /* container bytes that were never filled */
+
+    return srcSize;
+}
+
+MEM_STATIC size_t BIT_lookBits(BIT_DStream_t* bitD, U32 nbBits) {   /* peeks at the next nbBits bits without consuming them */
+    const U32 topBit = sizeof(bitD->bitContainer)*8 - 1;   /* container width in bits, minus 1 */
+    const size_t aligned = bitD->bitContainer << (bitD->bitsConsumed & topBit);   /* unread bits moved to the top */
+    return (aligned >> 1) >> ((topBit-nbBits) & topBit);   /* right shift split in two steps so nbBits==0 stays well-defined and yields 0 */
+}
+
+/*! BIT_lookBitsFast :
+*   unsafe version; only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(BIT_DStream_t* bitD, U32 nbBits) {
+    const U32 topBit = sizeof(bitD->bitContainer)*8 - 1;   /* container width in bits, minus 1 */
+    const size_t aligned = bitD->bitContainer << (bitD->bitsConsumed & topBit);   /* unread bits moved to the top */
+    return aligned >> (((topBit+1)-nbBits) & topBit);   /* single shift : with nbBits==0 the masked count is 0, so the container comes back unshifted */
+}
+
+MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+{   /* marks nbBits more bits as consumed; no bounds check happens here */
+    bitD->bitsConsumed = bitD->bitsConsumed + nbBits;
+}
+
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits)
+{   /* returns the next nbBits bits of the stream and marks them consumed */
+    size_t const peeked = BIT_lookBits(bitD, nbBits);
+    bitD->bitsConsumed += nbBits;   /* same effect as BIT_skipBits(bitD, nbBits) */
+    return peeked;
+}
+
+/*!BIT_readBitsFast :
+*  unsafe version; only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{   /* same as BIT_readBits(), but built on BIT_lookBitsFast() : caller must guarantee nbBits >= 1 */
+    size_t const peeked = BIT_lookBitsFast(bitD, nbBits);
+    bitD->bitsConsumed += nbBits;   /* same effect as BIT_skipBits(bitD, nbBits) */
+    return peeked;
+}
+
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* more bits consumed than the container holds : should never happen */
+        return BIT_DStream_overflow;
+
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer))   /* ptr is at least one container width past start : stepping back cannot cross start */
+    {
+        bitD->ptr -= bitD->bitsConsumed >> 3;   /* step back by the number of whole bytes consumed */
+        bitD->bitsConsumed &= 7;                /* keep only the sub-byte remainder */
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BIT_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start)   /* already at the first byte : nothing more can be loaded */
+    {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;   /* unread bits remain in the container */
+        return BIT_DStream_completed;
+    }
+    {   /* close to start : step back by whole consumed bytes, clamped so ptr never goes before start */
+        U32 nbBytes = bitD->bitsConsumed >> 3;
+        BIT_DStream_status result = BIT_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start)
+        {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BIT_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;   /* only the bytes actually stepped over are given back */
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+/*! BIT_endOfDStream
+*   @return Tells if DStream has reached its exact end
+*/
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{   /* exact end : read position is back at `start` AND every bit of the container has been consumed */
+    return (DStream->ptr != DStream->start) ? 0 : (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8);
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
+
+
+
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef FSE_STATIC_H
+#define FSE_STATIC_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size>>7))
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate FSE CTable/DTable as a table of unsigned using below macros */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
+
+
+/* *****************************************
+*  FSE advanced API
+*******************************************/
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/* build a fake FSE_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits.
+   @return : 0 on success, or an error code (for instance when nbBits < 1) */
+
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/* build a fake FSE_DTable, designed to always generate the same symbolValue.
+   The table built this way has a single decoding cell. */
+
+
+
+/* *****************************************
+*  FSE symbol decompression API
+*******************************************/
+/* Decoder state : one instance per interleaved FSE stream.
+   Initialized by FSE_initDState(), then advanced by FSE_decodeSymbol() / FSE_decodeSymbolFast(). */
+typedef struct
+{
+    size_t      state;   /* index of the current cell within `table` */
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+static void     FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+/* reads the initial state from `bitD` (tableLog bits, as recorded in the header of `dt`) and attaches `dt` to the state */
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* returns the symbol of the current state, then moves to the next state using bits read from `bitD` */
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+/* returns 1 when the state value is 0, 0 otherwise */
+
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+/* decompression */
+
+/* Header stored in the first cell of an FSE_DTable (copied in and out with memcpy). */
+typedef struct {
+    U16 tableLog;   /* the decoding table holds (1 << tableLog) cells; also the bit width of the initial state */
+    U16 fastMode;   /* non-zero : FSE_decompress_usingDTable() selects the FSE_decodeSymbolFast() path */
+} FSE_DTableHeader;   /* sizeof U32 */
+
+/* One decoding cell; the cells follow the header inside the FSE_DTable. */
+typedef struct
+{
+    unsigned short newState;   /* base of the next state; the bits read from the stream are added to it */
+    unsigned char  symbol;     /* symbol emitted when the decoder is in this cell */
+    unsigned char  nbBits;     /* number of bits to read from the stream to reach the next state */
+} FSE_decode_t;   /* size == U32 */
+
+/* FSE_initDState() :
+ * Prepares `DStatePtr` for decoding with table `dt` :
+ * the initial state is read from `bitD` on tableLog bits (tableLog being taken
+ * from the header held in the first cell of `dt`), the bitstream is reloaded,
+ * and the state is pointed at the decoding cells that follow the header. */
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{
+    FSE_DTableHeader header;
+
+    /* the header is copied out of the table rather than read through a cast pointer */
+    memcpy(&header, dt, sizeof(header));
+
+    DStatePtr->table = dt + 1;   /* decoding cells start right after the header cell */
+    DStatePtr->state = BIT_readBits(bitD, header.tableLog);
+    BIT_reloadDStream(bitD);
+}
+
+/* FSE_decodeSymbol() :
+ * Returns the symbol attached to the current state, and advances the state :
+ * next state = newState of the current cell + the `nbBits` bits read from `bitD`. */
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    const FSE_decode_t* const cells = (const FSE_decode_t*)(DStatePtr->table);
+    const FSE_decode_t cell = cells[DStatePtr->state];   /* local copy of the current cell */
+    const size_t extra = BIT_readBits(bitD, cell.nbBits);
+
+    DStatePtr->state = cell.newState + extra;
+    return cell.symbol;
+}
+
+/* FSE_decodeSymbolFast() :
+ * Same contract as FSE_decodeSymbol(), except that the bits are fetched with
+ * BIT_readBitsFast(). Per the declaration note of this function, it is only
+ * valid when every cell of the table has nbBits >= 1. */
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    const FSE_decode_t* const cells = (const FSE_decode_t*)(DStatePtr->table);
+    const FSE_decode_t cell = cells[DStatePtr->state];   /* local copy of the current cell */
+    const size_t extra = BIT_readBitsFast(bitD, cell.nbBits);
+
+    DStatePtr->state = cell.newState + extra;
+    return cell.symbol;
+}
+
+/* FSE_endOfDState() :
+ * @return : 1 when the state value is 0, 0 otherwise.
+ * The decompression loop uses it as part of its end-of-stream test. */
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    if (DStatePtr->state != 0) return 0;
+    return 1;
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSE_STATIC_H */
+
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
+*  Note : the table logs used by the decoder are derived from these values
+*         (FSE_MAX_TABLELOG and FSE_DEFAULT_TABLELOG are each defined as the matching value minus 2) */
+#define FSE_MAX_MEMORY_USAGE 14
+#define FSE_DEFAULT_MEMORY_USAGE 13
+
+/*!FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation
+*  (it sizes the per-symbol arrays declared on the stack, such as symbolNext[] and counting[]) */
+#define FSE_MAX_SYMBOL_VALUE 255
+
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+/* FSE_FUNCTION_TYPE is the symbol type, FSE_DECODE_TYPE the decoding cell type.
+   FSE_FUNCTION_EXTENSION is deliberately defined empty : it only has to be defined. */
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
+
+
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+/* FORCE_INLINE :
+   - Visual Studio                : static __forceinline
+   - C++ or C99 and later, GCC    : static inline + always_inline attribute
+   - C++ or C99 and later, others : static inline
+   - anything else                : plain static (no inline keyword) */
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+   /* note : && binds tighter than ||, so this reads : C++ || (C99 or later) */
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/* **************************************************************
+*  Dependencies
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+/* Table logs derived from the memory-usage tuning parameters (memory usage minus 2). */
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5   /* added to the 4-bit field read at the start of an NCount header */
+
+/* Hard upper limit : the build is rejected when the configured maximum exceeds it. */
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+/* Compile-time assertion : when `c` is false the enumerator value becomes 1/0,
+   which is not a valid constant expression, so compilation fails.
+   Expands to a braced block, hence the placement constraint below. */
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Complex types
+****************************************************************/
+/* Decoding table large enough for the biggest configured table log (header cell + cells). */
+typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
+
+
+/*-**************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks : both template parameters must be defined before this point
+   (FSE_FUNCTION_EXTENSION may be defined empty) */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+/* FSE_CAT pastes its two arguments as written; the two wrappers add one level
+   of expansion so that macro arguments are expanded before being pasted. */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+/* FSE_tableStep() :
+ * Stride used by FSE_buildDTable() when spreading symbols across the table :
+ * tableSize/2 + tableSize/8 + 3. */
+static U32 FSE_tableStep(U32 tableSize)
+{
+    const U32 half   = tableSize >> 1;
+    const U32 eighth = tableSize >> 3;
+    return half + eighth + 3;
+}
+
+
+/* FSE_buildDTable() :
+ * Builds the decoding table `dt` (header cell + (1 << tableLog) decoding cells)
+ * for symbols 0..maxSymbolValue, from their normalized counts.
+ * A count of -1 marks a low-probability symbol : it receives exactly one cell,
+ * taken from the top of the table.
+ * @return : 0 on success,
+ *           ERROR(maxSymbolValue_tooLarge) or ERROR(tableLog_tooLarge) when a parameter exceeds the compiled limits,
+ *           ERROR(GENERIC) when spreading the symbols does not end back on cell 0.
+ * NOTE(review) : tableSize and largeLimit are computed in the declarations, i.e. before the
+ *                sanity checks run; confirm that no caller passes tableLog == 0 or tableLog >= 32. */
+static size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    FSE_DTableHeader DTableH;
+    void* const tdPtr = dt+1;   /* because dt is unsigned, 32-bits aligned on 32-bits */
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
+    const U32 tableSize = 1 << tableLog;
+    const U32 tableMask = tableSize-1;
+    const U32 step = FSE_tableStep(tableSize);
+    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];   /* per symbol : next state value to hand out */
+    U32 position = 0;
+    U32 highThreshold = tableSize-1;          /* highest cell not yet taken by a low-probability symbol */
+    const S16 largeLimit= (S16)(1 << (tableLog-1));   /* half the table size */
+    U32 noLarge = 1;                          /* stays 1 as long as no count reaches largeLimit */
+    U32 s;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    DTableH.tableLog = (U16)tableLog;
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        if (normalizedCounter[s]==-1)
+        {
+            /* low-probability symbol : one cell, taken from the top of the table downwards */
+            tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+            symbolNext[s] = 1;
+        }
+        else
+        {
+            if (normalizedCounter[s] >= largeLimit) noLarge=0;
+            symbolNext[s] = normalizedCounter[s];
+        }
+    }
+
+    /* Spread symbols */
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        int i;
+        /* symbols with a count of 0 or -1 place no cell here */
+        for (i=0; i<normalizedCounter[s]; i++)
+        {
+            tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+            position = (position + step) & tableMask;
+            while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }
+    }
+
+    if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+
+    /* Build Decoding table */
+    {
+        U32 i;
+        for (i=0; i<tableSize; i++)
+        {
+            FSE_FUNCTION_TYPE symbol = (FSE_FUNCTION_TYPE)(tableDecode[i].symbol);
+            U16 nextState = symbolNext[symbol]++;
+            /* nbBits = tableLog - position of the highest set bit of nextState;
+               newState is then the base to which the decoder adds the bits it reads */
+            tableDecode[i].nbBits = (BYTE) (tableLog - BIT_highbit32 ((U32)nextState) );
+            tableDecode[i].newState = (U16) ( (nextState << tableDecode[i].nbBits) - tableSize);
+        }
+    }
+
+    /* the header is written last, through memcpy, into the first cell of dt */
+    DTableH.fastMode = (U16)noLarge;
+    memcpy(dt, &DTableH, sizeof(DTableH));
+    return 0;
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+/******************************************
+*  FSE helper functions
+******************************************/
+/* FSE_isError() :
+ * Tells whether a size_t result returned by an FSE function is an error code.
+ * Thin wrapper over ERR_isError(). */
+static unsigned FSE_isError(size_t code)
+{
+    return ERR_isError(code);
+}
+
+
+/****************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+/* FSE_abs() :
+ * @return : absolute value of `a`, as a short. */
+static short FSE_abs(short a)
+{
+    if (a < 0) return -a;
+    return a;
+}
+
+/* FSE_readNCount() :
+ * Decodes a table of normalized counts from `headerBuffer` (hbSize bytes).
+ * normalizedCounter : receives one count per symbol, starting at symbol 0
+ * maxSVPtr   : in  : largest symbol value the caller accepts
+ *              out : largest symbol value actually described by the header
+ * tableLogPtr: out : table log announced by the header
+ * @return : number of bytes read from headerBuffer,
+ *           or an error code : srcSize_wrong (hbSize < 4, or more bytes consumed than hbSize),
+ *           tableLog_tooLarge, maxSymbolValue_tooSmall (a run of zeros goes past *maxSVPtr),
+ *           GENERIC (the counts do not use up the table exactly). */
+static size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;       /* part of the table still to distribute, plus 1 : decoding stops when it reaches 1 */
+    int threshold;
+    U32 bitStream;       /* 32 bits loaded at ip, already shifted by bitCount */
+    int bitCount;        /* number of bits consumed since ip */
+    unsigned charnum = 0;   /* next symbol to receive a count */
+    int previous0 = 0;      /* set when the last decoded count was 0 : a run length of zeros follows */
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr))
+    {
+        if (previous0)
+        {
+            /* run of zero counts : each 16-bit group of ones stands for 24 zeros,
+               each 2-bit value 3 for 3 more, and the final 2-bit value (0..2) closes the run */
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF)
+            {
+                n0+=24;
+                if (ip < iend-5)
+                {
+                    ip+=2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                }
+                else
+                {
+                    /* too close to the end to load 4 more bytes : keep consuming the bits already loaded */
+                    bitStream >>= 16;
+                    bitCount+=16;
+                }
+            }
+            while ((bitStream & 3) == 3)
+            {
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+            {
+                /* enough input left : advance by whole bytes and reload */
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;
+        }
+        {
+            /* values below `max` are stored on nbBits-1 bits, the others on nbBits bits */
+            const short max = (short)((2*threshold-1)-remaining);
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max)
+            {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            }
+            else
+            {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSE_abs(count);   /* a count of -1 uses up one unit, like a count of 1 */
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            /* fewer units left to distribute : the following values need fewer bits */
+            while (remaining < threshold)
+            {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            {
+                if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+                {
+                    ip += bitCount>>3;
+                    bitCount &= 7;
+                }
+                else
+                {
+                    /* near the end of the buffer : pin ip on the last 4 readable bytes and express
+                       the position as a bit offset from there */
+                    bitCount -= (int)(8 * (iend - 4 - ip));
+                    ip = iend - 4;
+                }
+                bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+            }
+        }
+    }
+    if (remaining != 1) return ERROR(GENERIC);
+    *maxSVPtr = charnum-1;
+
+    /* round the consumed bits up to a whole number of bytes */
+    ip += (bitCount+7)>>3;
+    if ((size_t)(ip-istart) > hbSize) return ERROR(srcSize_wrong);
+    return ip-istart;
+}
+
+
+/*********************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+/* FSE_buildDTable_rle() :
+ * Writes into `dt` a table made of a header (tableLog 0, fastMode 0) followed by
+ * one decoding cell which emits `symbolValue`, reads 0 bits and leads back to state 0.
+ * @return : always 0 */
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+    void* const headerPtr = dt;
+    void* const cellPtr   = dt + 1;   /* cells start right after the header cell */
+    FSE_DTableHeader* const header = (FSE_DTableHeader*)headerPtr;
+    FSE_decode_t* const onlyCell = (FSE_decode_t*)cellPtr;
+
+    header->tableLog = 0;
+    header->fastMode = 0;
+
+    onlyCell->symbol = symbolValue;
+    onlyCell->nbBits = 0;
+    onlyCell->newState = 0;
+
+    return 0;
+}
+
+
+/* FSE_buildDTable_raw() :
+ * Writes into `dt` a pass-through decoding table of (1 << nbBits) cells :
+ * cell `s` emits symbol (BYTE)s, reads nbBits bits and has newState 0, so each
+ * group of nbBits bits read from the stream directly selects the next cell.
+ * The header records tableLog = nbBits and fastMode = 1.
+ * nbBits : must be within [1, FSE_MAX_TABLELOG]. It is validated before being used
+ *          as a shift count; the upper bound is the one FSE_buildDTable() enforces.
+ * @return : 0 on success,
+ *           ERROR(GENERIC) when nbBits < 1,
+ *           ERROR(tableLog_tooLarge) when nbBits > FSE_MAX_TABLELOG. */
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
+    unsigned tableSize;
+    unsigned s;
+
+    /* Sanity checks (done before any shift by nbBits) */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+    if (nbBits > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* shift would be undefined or the table would not fit */
+
+    tableSize = 1U << nbBits;
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<tableSize; s++)
+    {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+/* FSE_decompress_usingDTable_generic() :
+ * Decodes the bitstream `cSrc` (cSrcSize bytes) into `dst` (at most maxDstSize bytes)
+ * with decoding table `dt`. Two decoder states are used alternately on the same bitstream.
+ * fast : non-zero selects FSE_decodeSymbolFast(), zero selects FSE_decodeSymbol().
+ * @return : number of bytes written into dst when the bitstream is fully consumed and both states are 0,
+ *           the error code of BIT_initDStream() when initialization fails,
+ *           ERROR(dstSize_tooSmall) when dst is full before that point,
+ *           ERROR(corruption_detected) otherwise. */
+FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;   /* the main loop writes 4 bytes per iteration */
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+    size_t errorCode;
+
+    /* Init */
+    errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+    if (FSE_isError(errorCode)) return errorCode;
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+    /* note : this macro stays defined after the function; it expands to a bare conditional expression */
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) && (op<olimit) ; op+=4)
+    {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        /* the conditions below compare compile-time constants; presumably they add a reload
+           only when the bit container cannot hold the bits of several symbols */
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    /* one symbol at a time, alternating between the two states; stops on bitstream overflow,
+       on a full dst, or when the stream is consumed (and, in non-fast mode, the state about to be used is 0) */
+    while (1)
+    {
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state1);
+
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state2);
+    }
+
+    /* end ? */
+    if (BIT_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2))
+        return op-ostart;
+
+    if (op==omax) return ERROR(dstSize_tooSmall);   /* dst buffer is full, but cSrc unfinished */
+
+    return ERROR(corruption_detected);
+}
+
+
+/* FSE_decompress_usingDTable() :
+ * Decodes `cSrc` into `dst` with an already built table `dt`.
+ * The fastMode flag stored in the table header selects which variant of the
+ * generic decoder is called; each call passes the flag as a literal constant.
+ * @return : whatever FSE_decompress_usingDTable_generic() returns
+ *           (size written into dst, or an error code). */
+static size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    FSE_DTableHeader header;
+
+    /* the header lives in the first cell of the table */
+    memcpy(&header, dt, sizeof(header));
+
+    if (header.fastMode == 0)
+        return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+}
+
+
+/* FSE_decompress() :
+ * Decodes an FSE block : reads the normalized counts stored at the start of `cSrc`,
+ * builds the matching decoding table on the stack, then decodes the rest of the
+ * input into `dst` (at most maxDstSize bytes).
+ * @return : the result of FSE_decompress_usingDTable() (size written, or an error code),
+ *           ERROR(srcSize_wrong) when cSrcSize < 2 or when the header takes up the whole input,
+ *           or the error code reported while reading the header or building the table. */
+static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* const src = (const BYTE*)cSrc;
+    short normCount[FSE_MAX_SYMBOL_VALUE+1];
+    DTable_max_t dtable;   /* filled by FSE_buildDTable() before it is read */
+    unsigned maxSymbol = FSE_MAX_SYMBOL_VALUE;
+    unsigned tableLog;
+    size_t headerSize;
+
+    if (cSrcSize < 2) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* header : normalized counts */
+    headerSize = FSE_readNCount(normCount, &maxSymbol, &tableLog, src, cSrcSize);
+    if (FSE_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing left after the header */
+
+    /* decoding table */
+    {
+        const size_t buildResult = FSE_buildDTable(dtable, normCount, maxSymbol, tableLog);
+        if (FSE_isError(buildResult)) return buildResult;
+    }
+
+    /* payload : the result is returned as is, error code included */
+    return FSE_decompress_usingDTable(dst, maxDstSize, src + headerSize, cSrcSize - headerSize, dtable);
+}
+
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
+
+
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   header file
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef HUFF0_H
+#define HUFF0_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  Dependency
+******************************************/
+#include <stddef.h>    /* size_t */
+
+
+/* ****************************************
+*  Huff0 simple functions
+******************************************/
+static size_t HUF_decompress(void* dst,  size_t dstSize,
+                const void* cSrc, size_t cSrcSize);
+/*!
+HUF_decompress():
+    Decompress Huff0 data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstSize'.
+    'dstSize' must be the exact size of original (uncompressed) data.
+    Note : in contrast with FSE, HUF_decompress can regenerate RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+           because it knows the size to regenerate.
+    @return : size of regenerated data (== dstSize)
+              or an error code, which can be tested using HUF_isError()
+*/
+
+
+/* ****************************************
+*  Tool functions
+******************************************/
+/* Error Management */
+static unsigned    HUF_isError(size_t code);        /* tells if a return value is an error code */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* HUFF0_H */
+
+
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef HUFF0_STATIC_H
+#define HUFF0_STATIC_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/* ****************************************
+*  Static allocation macros
+******************************************/
+/* static allocation of Huff0's DTable */
+/* HUF_DTABLE_SIZE : the parameter is parenthesized so that an expression argument
+   keeps its meaning once expanded next to the shift operator. */
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))  /* nb Cells; use unsigned short for X2, unsigned int for X4 */
+/* Each HUF_CREATE_STATIC_DTABLEXn declares an array named `DTable` whose first cell is initialized to maxTableLog. */
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        unsigned short DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX6(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog) * 3 / 2] = { maxTableLog }
+
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbols decoder */
+
+
+/* ****************************************
+*  Huff0 detailed API
+******************************************/
+/*!
+HUF_decompress() does the following:
+1. select the decompression algorithm (X2, X4, X6) based on pre-computed heuristics
+2. build Huffman table from its saved description, using HUF_readDTableXn()
+3. decode 1 or 4 segments in parallel using HUF_decompressSXn_usingDTable
+
+NOTE(review): only the X2 and X4 variants are declared in this section.
+*/
+/* DTable cell type differs per variant : unsigned short for X2, unsigned for X4 */
+static size_t HUF_readDTableX2 (unsigned short* DTable, const void* src, size_t srcSize);
+static size_t HUF_readDTableX4 (unsigned* DTable, const void* src, size_t srcSize);
+
+static size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
+static size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* HUFF0_STATIC_H */
+
+
+
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+Huff0 source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+/* `inline` keyword availability :
+   - C++ or C99 and later : used as is
+   - Visual Studio        : mapped to __inline
+   - anything else        : defined to nothing */
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+/* inline is defined */
+#elif defined(_MSC_VER)
+#  define inline __inline
+#else
+#  define inline /* disable inline */
+#endif
+
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+
+/* **************************************************************
+*  Constants
+****************************************************************/
+#define HUF_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#define HUF_MAX_TABLELOG  12           /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
+#define HUF_DEFAULT_TABLELOG  HUF_MAX_TABLELOG   /* tableLog by default, when not specified */
+#define HUF_MAX_SYMBOL_VALUE 255
+/* the build is rejected when the configured maximum exceeds the absolute limit */
+#if (HUF_MAX_TABLELOG > HUF_ABSOLUTEMAX_TABLELOG)
+#  error "HUF_MAX_TABLELOG is too large !"
+#endif
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+/* HUF_isError() :
+ * Tells whether a size_t result returned by a HUF function is an error code.
+ * Thin wrapper over ERR_isError(). */
+static unsigned HUF_isError(size_t code)
+{
+    return ERR_isError(code);
+}
+/* Compile-time assertion : a false `c` turns the enumerator value into 1/0, which does not compile. */
+#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+
+/*-*******************************************************
+*  Huff0 : Huffman block decompression
+*********************************************************/
+/* Cell types used by the Huff0 decoders.
+   NOTE(review): the code using these fields is outside this section; the field roles are inferred from their names. */
+typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding */
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4;  /* double-symbols decoding */
+
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;   /* pairs a symbol with its weight */
+
+/*! HUF_readStats
+    Read compact Huffman tree, saved by HUF_writeCTable : decodes the symbol weights, counts them per weight into `rankStats` (HUF_ABSOLUTEMAX_TABLELOG+1 entries, cleared here), and reports *nbSymbolsPtr and *tableLogPtr
+    @huffWeight : destination buffer of capacity `hwSize`; receives the stored weights followed by one implied last weight
+    @return : size read from `src` (header byte included), or an error code (srcSize_wrong, corruption_detected, or whatever FSE_decompress() reports)
+*/
+static size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                            U32* nbSymbolsPtr, U32* tableLogPtr,
+                            const void* src, size_t srcSize)
+{
+    U32 weightTotal;   /* sum of 2^(weight-1) over the stored weights */
+    U32 tableLog;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;   /* first the header byte, then the size of the weight payload that follows it */
+    size_t oSize;   /* number of weights explicitly stored (the last weight is implied) */
+    U32 n;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];   /* header byte : selects how the weights are encoded */
+    /* memset(huffWeight, 0, hwSize) is not necessary here, even though some analyzers complain about its absence */
+
+    if (iSize >= 128)  /* special header */
+    {
+        if (iSize >= (242))   /* RLE : header bytes 242..255, every weight equals 1 */
+        {
+            static int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };   /* number of stored weights for each RLE header value */
+            oSize = l[iSize-242];
+            memset(huffWeight, 1, hwSize);
+            iSize = 0;   /* no payload follows the header byte */
+        }
+        else   /* Incompressible : weights stored raw, 4 bits each */
+        {
+            oSize = iSize - 127;
+            iSize = ((oSize+1)/2);   /* two weights per payload byte */
+            if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+            if (oSize >= hwSize) return ERROR(corruption_detected);   /* one slot must remain for the implied last weight */
+            ip += 1;
+            for (n=0; n<oSize; n+=2)
+            {
+                huffWeight[n]   = ip[n/2] >> 4;   /* high nibble first */
+                huffWeight[n+1] = ip[n/2] & 15;
+            }
+        }
+    }
+    else  /* header compressed with FSE (normal case) */
+    {
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSE_decompress(huffWeight, hwSize-1, ip+1, iSize);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUF_ABSOLUTEMAX_TABLELOG + 1) * sizeof(U32));
+    weightTotal = 0;
+    for (n=0; n<oSize; n++)
+    {
+        if (huffWeight[n] >= HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);   /* also keeps the rankStats index in range */
+        rankStats[huffWeight[n]]++;
+        weightTotal += (1 << huffWeight[n]) >> 1;   /* weight w contributes 2^(w-1); weight 0 contributes nothing */
+    }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    tableLog = BIT_highbit32(weightTotal) + 1;
+    if (tableLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+    {
+        U32 total = 1 << tableLog;
+        U32 rest = total - weightTotal;   /* share left for the implied last symbol */
+        U32 verif = 1 << BIT_highbit32(rest);
+        U32 lastWeight = BIT_highbit32(rest) + 1;
+        if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+        huffWeight[oSize] = (BYTE)lastWeight;
+        rankStats[lastWeight]++;
+    }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);   /* stored weights + the implied one */
+    *tableLogPtr = tableLog;
+    return iSize+1;   /* payload + header byte */
+}
+
+
+/**************************/
+/* single-symbol decoding */
+/**************************/
+
+static size_t HUF_readDTableX2 (U16* DTable, const void* src, size_t srcSize)
+{   /* Builds the single-symbol decoding table from the tree description in `src`. DTable[0] is read as the largest tableLog the table can hold and overwritten with the actual one. @return : size read from `src`, or an error code */
+    BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    size_t iSize;
+    U32 nbSymbols = 0;
+    U32 n;
+    U32 nextRankStart;
+    void* const dtPtr = DTable + 1;   /* cells start right after the header cell DTable[0] */
+    HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U16));   /* if compilation fails here, assertion is false */
+    /* memset(huffWeight, 0, sizeof(huffWeight)) is not necessary here, even though some analyzers complain about its absence */
+
+    iSize = HUF_readStats(huffWeight, HUF_MAX_SYMBOL_VALUE + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);   /* rankVal[w] = number of symbols of weight w */
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > DTable[0]) return ERROR(tableLog_tooLarge);   /* DTable is too small */
+    DTable[0] = (U16)tableLog;   /* maybe should separate sizeof DTable, as allocated, from used size of DTable, in case of DTable re-use */
+
+    /* Prepare ranks : turn the per-weight counts into the index of the first cell of each weight */
+    nextRankStart = 0;
+    for (n=1; n<=tableLog; n++)
+    {
+        U32 current = nextRankStart;
+        nextRankStart += (rankVal[n] << (n-1));   /* each symbol of weight n occupies 2^(n-1) cells */
+        rankVal[n] = current;
+    }
+
+    /* fill DTable */
+    for (n=0; n<nbSymbols; n++)
+    {
+        const U32 w = huffWeight[n];
+        const U32 length = (1 << w) >> 1;   /* number of cells for this symbol; 0 when w == 0 */
+        U32 i;
+        HUF_DEltX2 D;
+        D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+        for (i = rankVal[w]; i < rankVal[w] + length; i++)
+            dt[i] = D;
+        rankVal[w] += length;   /* next symbol of the same weight starts right after */
+    }
+
+    return iSize;
+}
+
+static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+    const HUF_DEltX2* const cell = dt + BIT_lookBitsFast(Dstream, dtLog);   /* peek the next dtLog bits (dtLog >= 1) to select a table cell */
+    const BYTE decoded = cell->byte;
+    BIT_skipBits(Dstream, cell->nbBits);   /* consume only the bits recorded in the selected cell */
+    return decoded;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)   /* always decodes one symbol; `dt` and `dtLog` must be in scope at the expansion site */
+/* _1 variant : decodes only when MEM_64bits() or HUF_MAX_TABLELOG<=12. Expands to an unbraced `if` : never follow it with `else` */
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+/* _2 variant : decodes only when MEM_64bits(). Same unbraced `if` caveat as the _1 variant */
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+static inline size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
+{   /* Decodes one bitstream into [p, pEnd) with the single-symbol table `dt`. @return : pEnd - p as received, i.e. the size of the output range */
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time, for as long as the stream reloads as `unfinished` and at least 4 output bytes remain */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4))
+    {
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);   /* the _2 and _1 variants decode conditionally (see the macros) */
+        HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to the end : one symbol per reload */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd))
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, hence no need to reload */
+    while (p < pEnd)
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    return pEnd-pStart;
+}
+
+
+static size_t HUF_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{   /* Decodes the 4 Huffman bitstreams stored in `cSrc` into exactly `dstSize` bytes of `dst`, using a single-symbol table built by HUF_readDTableX2(). @return : dstSize, or an error code */
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable;
+        const HUF_DEltX2* const dt = ((const HUF_DEltX2*)dtPtr) +1;   /* cells start after the header cell */
+        const U32 dtLog = DTable[0];   /* header cell holds tableLog */
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);     /* jump table : 3 little-endian 16-bit stream sizes */
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;                                  /* size of the 4th stream is whatever remains */
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;      /* output share of streams 1-3; stream 4 gets the remainder */
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        if (opStart4 > oend) return ERROR(corruption_detected);   /* overflow : 3 segments exceed dst when dstSize is 1, 2 or 5; streams 2-3 would otherwise be decoded past oend */
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )   /* stops as soon as any stream is no longer `unfinished`, or stream 4 nears oend */
+        {
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);   /* the 4 streams are interleaved, one symbol each per round */
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption : no stream may have run past the start of the next segment */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check : every bitstream must report end-of-stream */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_MAX_TABLELOG);
+    const size_t hSize = HUF_readDTableX2 (DTable, cSrc, cSrcSize);
+
+    /* the tree description comes first : it must decode without error ... */
+    if (HUF_isError(hSize)) return hSize;
+    /* ... and must leave at least one byte for the bitstreams */
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+
+    /* everything after the description belongs to the 4 bitstreams */
+    return HUF_decompress4X2_usingDTable (dst, dstSize,
+                                          (const BYTE*) cSrc + hSize, cSrcSize - hSize, DTable);
+}
+
+
+/***************************/
+/* double-symbols decoding */
+/***************************/
+
+static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{   /* Fills the sub-table `DTable` that follows a first symbol `baseSeq` which already used `consumed` bits : cells get either that symbol alone or that symbol plus a second one */
+    HUF_DEltX4 DElt;
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    U32 s;
+
+    /* get pre-calculated rankVal (private copy : it is advanced while filling) */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values : cells located before rankVal[minWeight] keep the first symbol only */
+    if (minWeight>1)
+    {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++)   /* note : sortedSymbols already skipped */
+    {
+        const U32 symbol = sortedSymbols[s].symbol;
+        const U32 weight = sortedSymbols[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;   /* bits used by the second symbol */
+        const U32 length = 1 << (sizeLog-nbBits);     /* number of cells this pair occupies */
+        const U32 start = rankVal[weight];
+        U32 i = start;
+        const U32 end = start + length;
+
+        MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));   /* first symbol in the low byte, second symbol in the high byte */
+        DElt.nbBits = (BYTE)(nbBits + consumed);   /* total bits used by both symbols */
+        DElt.length = 2;
+        do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+        rankVal[weight] += length;   /* next symbol of the same weight starts right after */
+    }
+}
+
+typedef U32 rankVal_t[HUF_ABSOLUTEMAX_TABLELOG][HUF_ABSOLUTEMAX_TABLELOG + 1];   /* one row of per-weight start positions for each number of already-consumed bits */
+
+static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{   /* Fills the double-symbols table of 2^targetLog cells : each symbol's range receives either a 2-symbols sub-table (via HUF_fillDTableX4Level2) or plain single-symbol cells */
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;   /* bits used by the shortest code */
+    U32 s;
+
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));   /* copies the first row only (sizeof(rankVal) is one row); it is advanced while filling */
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++)
+    {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits)   /* enough room for a second symbol */
+        {
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;   /* smallest weight a second symbol may have to fit in the remaining bits */
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];   /* position of the first symbol of weight minWeight in sortedList */
+            HUF_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        }
+        else
+        {
+            U32 i;
+            const U32 end = start + length;
+            HUF_DEltX4 DElt;
+
+            MEM_writeLE16(&(DElt.sequence), symbol);   /* single symbol : only the low byte is meaningful */
+            DElt.nbBits   = (BYTE)(nbBits);
+            DElt.length   = 1;
+            for (i = start; i < end; i++)
+                DTable[i] = DElt;
+        }
+        rankVal[weight] += length;   /* next symbol of the same weight starts right after */
+    }
+}
+
+static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
+{   /* Builds the double-symbols decoding table from the tree description in `src`. DTable[0] is read as the table's log2 size (memLog) and is not modified. @return : size read from `src`, or an error code */
+    BYTE weightList[HUF_MAX_SYMBOL_VALUE + 1];
+    sortedSymbol_t sortedSymbol[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankStats[HUF_ABSOLUTEMAX_TABLELOG + 1] = { 0 };
+    U32 rankStart0[HUF_ABSOLUTEMAX_TABLELOG + 2] = { 0 };   /* one extra leading slot, see rankStart */
+    U32* const rankStart = rankStart0+1;   /* rankStart[w] aliases rankStart0[w+1] */
+    rankVal_t rankVal;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    const U32 memLog = DTable[0];
+    size_t iSize;
+    void* dtPtr = DTable;
+    HUF_DEltX4* const dt = ((HUF_DEltX4*)dtPtr) + 1;   /* cells start right after the header cell */
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(U32));   /* if compilation fails here, assertion is false */
+    if (memLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    /* memset(weightList, 0, sizeof(weightList)) is not necessary here, even though some analyzers complain about its absence */
+
+    iSize = HUF_readStats(weightList, HUF_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight : largest weight actually used by a symbol */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        { if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
+
+    /* Get start index of each weight */
+    {
+        U32 w, nextRankStart = 0;
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;     /* number of symbols with a non-zero weight */
+    }
+
+    /* sort symbols by weight */
+    {
+        U32 s;
+        for (s=0; s<nbSymbols; s++)
+        {
+            U32 w = weightList[s];
+            U32 r = rankStart[w]++;   /* advances : once done, rankStart[w] is the end of rank w, i.e. the start of rank w+1 */
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {
+        const U32 minBits = tableLog+1 - maxW;   /* bits used by the shortest code */
+        U32 nextRankVal = 0;
+        U32 w, consumed;
+        const int rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
+        U32* rankVal0 = rankVal[0];   /* row for 0 consumed bits : start positions in the full-size table */
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankVal;
+            nextRankVal += rankStats[w] << (w+rescale);
+            rankVal0[w] = current;
+        }
+        for (consumed = minBits; consumed <= memLog - minBits; consumed++)   /* only the rows a second symbol can use */
+        {
+            U32* rankValPtr = rankVal[consumed];
+            for (w = 1; w <= maxW; w++)
+            {
+                rankValPtr[w] = rankVal0[w] >> consumed;   /* same positions, scaled down to the sub-table size */
+            }
+        }
+    }
+
+    /* rankStart0 is passed, not rankStart : after the sort above, rankStart0[w] == rankStart[w-1] is where rank w begins */
+    HUF_fillDTableX4(dt, memLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    return iSize;
+}
+
+
+static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    const HUF_DEltX4* const cell = dt + BIT_lookBitsFast(DStream, dtLog);   /* table cell selected by the next dtLog bits (dtLog >= 1) */
+    memcpy(op, cell, 2);   /* always writes the cell's first 2 bytes, whether it holds 1 or 2 symbols */
+    BIT_skipBits(DStream, cell->nbBits);
+    return cell->length;   /* number of output bytes that are valid : the caller advances by this much */
+}
+
+static U32 HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{   /* Decodes exactly one byte into `op`, for when a single output byte remains. @return : always 1 */
+    const size_t val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 1);   /* first byte of the cell only */
+    if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
+    else   /* the cell describes 2 symbols, but only the first one is wanted */
+    {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8))
+        {
+            BIT_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+        }
+    }
+    return 1;
+}
+
+
+#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)   /* always decodes; advances `ptr` by the returned length; `dt` and `dtLog` must be in scope at the expansion site */
+/* _1 variant : decodes only when MEM_64bits() or HUF_MAX_TABLELOG<=12. Expands to an unbraced `if` : never follow it with `else` */
+#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+/* _2 variant : decodes only when MEM_64bits(). Same unbraced `if` caveat as the _1 variant */
+#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+static inline size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const HUF_DEltX4* const dt, const U32 dtLog)
+{   /* Decodes one bitstream into [p, pEnd) with the double-symbols table `dt`. @return : number of bytes by which `p` advanced */
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time (4 table look-ups of 1 or 2 symbols each), hence the 7-byte margin */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd-7))
+    {
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);   /* the _2 and _1 variants decode conditionally (see the macros) */
+        HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to the end : one look-up per reload, while 2 output bytes still fit */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-2))
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)   /* a single byte remains : a 2-bytes write is no longer allowed */
+        p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+static size_t HUF_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{   /* Decodes the 4 Huffman bitstreams stored in `cSrc` into exactly `dstSize` bytes of `dst`, using a double-symbols table built by HUF_readDTableX4(). @return : dstSize, or an error code */
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable;
+        const HUF_DEltX4* const dt = ((const HUF_DEltX4*)dtPtr) +1;   /* cells start after the header cell */
+        const U32 dtLog = DTable[0];   /* header cell holds the table's log2 size */
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);     /* jump table : 3 little-endian 16-bit stream sizes */
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;                                  /* size of the 4th stream is whatever remains */
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;      /* output share of streams 1-3; stream 4 gets the remainder */
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        if (opStart4 > oend) return ERROR(corruption_detected);   /* overflow : 3 segments exceed dst when dstSize is 1, 2 or 5; streams 2-3 would otherwise be decoded past oend */
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )   /* stops as soon as any stream is no longer `unfinished`, or stream 4 nears oend */
+        {
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);   /* the 4 streams are interleaved, one table look-up each per round */
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption : no stream may have run past the start of the next segment */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check : every bitstream must report end-of-stream */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_MAX_TABLELOG);
+    const BYTE* bitStreams = (const BYTE*) cSrc;
+    size_t tableDescSize;
+
+    /* the tree description comes first; the 4 bitstreams follow it and need at least one byte */
+    tableDescSize = HUF_readDTableX4 (DTable, cSrc, cSrcSize);
+    if (HUF_isError(tableDescSize)) return tableDescSize;
+    if (cSrcSize <= tableDescSize) return ERROR(srcSize_wrong);
+    bitStreams += tableDescSize;
+    return HUF_decompress4X4_usingDTable (dst, dstSize, bitStreams, cSrcSize - tableDescSize, DTable);
+}
+
+
+/**********************************/
+/* Generic decompression selector */
+/**********************************/
+
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;   /* cost model of one decoder : fixed part + part multiplied by (dstSize >> 8) in HUF_decompress() */
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* rows : Q = cSrcSize*16/dstSize, as computed in HUF_decompress() ; columns : single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+static size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* candidate decoders, in the column order of algoTime[] ; the quad-symbols slot is left empty (NULL) */
+    static const decompressionAlgo decompress[3] = { HUF_decompress4X2, HUF_decompress4X4, NULL };
+    const U32 blocks256 = (U32)(dstSize >> 8);   /* output size, in units of 256 bytes */
+    U32 cost[3];                                  /* estimated cost of each candidate */
+    U32 quantum;                                  /* compressed/decompressed ratio, in sixteenths */
+    U32 idx;
+
+    /* cases settled from the sizes alone */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    /* estimated cost = fixed table cost + decoding cost per 256 bytes of output */
+    quantum = (U32)(cSrcSize * 16 / dstSize);   /* quantum < 16 since dstSize > cSrcSize */
+    for (idx = 0; idx < 3; idx++)
+    {
+        const algo_time_t* const timing = &algoTime[quantum][idx];
+        cost[idx] = timing->tableTime + (timing->decode256Time * blocks256);
+    }
+
+    /* advantage to algorithms using less memory, for cache eviction */
+    cost[1] += cost[1] >> 4;
+    cost[2] += cost[2] >> 3;
+
+    /* only the single-symbol and double-symbols decoders compete : the quad slot is never selected */
+    idx = (cost[1] < cost[0]) ? 1 : 0;
+    return decompress[idx](dst, dstSize, cSrc, cSrcSize);
+}
+
+
+
+#endif   /* ZSTD_CCOMMON_H_MODULE */
+
+
+/*
+    zstd - decompression module for v0.4 legacy format
+    Copyright (C) 2015-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Select how default decompression function ZSTD_decompress() will allocate memory,
+ * in memory stack (0), or in memory heap (1, requires malloc())
+ */
+#ifndef ZSTD_HEAPMODE   /* a value defined before this point (e.g. on the compiler command line) takes precedence */
+#  define ZSTD_HEAPMODE 1   /* default : heap */
+#endif
+
+
+/* *******************************************************
+*  Includes
+*********************************************************/
+#include <stdlib.h>      /* calloc */
+#include <string.h>      /* memcpy, memmove */
+#include <stdio.h>       /* debug : printf */
+
+
+/* *******************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+
+/* *************************************
+*  Local types
+***************************************/
+typedef struct   /* NOTE(review): presumably the decoded fields of a block header - its users are outside this part of the file, confirm there */
+{
+    blockType_t blockType;
+    U32 origSize;
+} blockProperties_t;
+
+
+/* *******************************************************
+*  Memory operations
+**********************************************************/
+static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }   /* copies exactly 4 bytes; going through memcpy() puts no alignment requirement on either pointer */
+
+
+/* *************************************
+*  Error Management
+***************************************/
+
+/*! ZSTD_isError
+*   tells if a return value is an error code (thin wrapper : forwards `code` to ERR_isError()) */
+static unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
+
+
+/* *************************************************************
+*   Context management
+***************************************************************/
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock } ZSTD_dStage;   /* decoding stages; ZSTD_resetDCtx() starts at ZSTDds_getFrameHeaderSize */
+
+struct ZSTDv04_Dctx_s
+{
+    U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];     /* FSE decoding table sized for LLFSELog - presumably literal lengths, TODO confirm */
+    U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];   /* FSE decoding table sized for OffFSELog - presumably offsets, TODO confirm */
+    U32 MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];     /* FSE decoding table sized for MLFSELog - presumably match lengths, TODO confirm */
+    const void* previousDstEnd;   /* the next 4 pointers are set to NULL by ZSTD_resetDCtx() */
+    const void* base;
+    const void* vBase;
+    const void* dictEnd;
+    size_t expected;              /* set to ZSTD_frameHeaderSize_min by ZSTD_resetDCtx() */
+    size_t headerSize;            /* set by ZSTD_decodeFrameHeader_Part1() */
+    ZSTD_parameters params;       /* filled by ZSTD_getFrameParams(), called from ZSTD_decodeFrameHeader_Part2() */
+    blockType_t bType;
+    ZSTD_dStage stage;            /* current decoding stage */
+    const BYTE* litPtr;
+    size_t litSize;
+    BYTE litBuffer[BLOCKSIZE + 8 /* margin for wildcopy */];
+    BYTE headerBuffer[ZSTD_frameHeaderSize_max];
+};  /* typedef'd to ZSTD_DCtx within "zstd_static.h" */
+
+static size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx)
+{
+    /* drop every buffer reference kept in the context */
+    dctx->previousDstEnd = dctx->dictEnd = NULL;
+    dctx->base = dctx->vBase = NULL;
+    /* restart at the first stage : a frame header of minimal size is expected next */
+    dctx->stage = ZSTDds_getFrameHeaderSize;
+    dctx->expected = ZSTD_frameHeaderSize_min;
+    return 0;   /* always 0 */
+}
+
+static ZSTD_DCtx* ZSTD_createDCtx(void)
+{
+    ZSTD_DCtx* const ctx = (ZSTD_DCtx*)malloc(sizeof(*ctx));
+    /* only the fields handled by ZSTD_resetDCtx() are initialised; NULL is returned when allocation fails */
+    if (ctx != NULL) ZSTD_resetDCtx(ctx);
+    return ctx;
+}
+
+/* Release a context obtained from ZSTD_createDCtx().
+ * @return : always 0 */
+static size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+{
+    free(dctx);   /* a NULL context is accepted : free(NULL) does nothing */
+    return 0;
+}
+
+
+/* *************************************************************
+*   Decompression section
+***************************************************************/
+/** ZSTD_decodeFrameHeader_Part1
+*   validate the start of a frame (magic number) and record the Frame Header size into the context.
+*   srcSize must be == ZSTD_frameHeaderSize_min
+*   @return : the full size of the Frame Header,
+*             or an error code (srcSize_wrong, prefix_unknown) */
+static size_t ZSTD_decodeFrameHeader_Part1(ZSTD_DCtx* zc, const void* src, size_t srcSize)
+{
+    if (srcSize != ZSTD_frameHeaderSize_min) return ERROR(srcSize_wrong);
+    if (MEM_readLE32(src) != ZSTD_MAGICNUMBER) return ERROR(prefix_unknown);
+
+    /* this function always reports the minimal header size */
+    zc->headerSize = ZSTD_frameHeaderSize_min;
+    return zc->headerSize;
+}
+
+
+/* Read the frame parameters stored in a frame header.
+ * @return : 0 on success (*params is filled),
+ *           ZSTD_frameHeaderSize_max when srcSize is too small to hold a header (*params is left untouched),
+ *           or an error code (prefix_unknown, frameParameter_unsupported). */
+static size_t ZSTD_getFrameParams(ZSTD_parameters* params, const void* src, size_t srcSize)
+{
+    const BYTE* const header = (const BYTE*)src;
+
+    if (srcSize < ZSTD_frameHeaderSize_min) return ZSTD_frameHeaderSize_max;
+    if (MEM_readLE32(src) != ZSTD_MAGICNUMBER) return ERROR(prefix_unknown);
+
+    memset(params, 0, sizeof(*params));
+    {   const BYTE descriptor = header[4];   /* low 4 bits : windowLog, high 4 bits : reserved */
+        params->windowLog = (descriptor & 15) + ZSTD_WINDOWLOG_ABSOLUTEMIN;
+        if ((descriptor >> 4) != 0) return ERROR(frameParameter_unsupported);   /* reserved bits */
+    }
+    return 0;
+}
+
+/** ZSTD_decodeFrameHeader_Part2
+*   decode the full Frame Header into zc->params
+*   srcSize must be the size provided by ZSTD_decodeFrameHeader_Part1
+*   @return : the result of ZSTD_getFrameParams() (0 on success), or an error code,
+*             which can be tested using ZSTD_isError() */
+static size_t ZSTD_decodeFrameHeader_Part2(ZSTD_DCtx* zc, const void* src, size_t srcSize)
+{
+    if (srcSize != zc->headerSize) return ERROR(srcSize_wrong);
+
+    {   size_t const status = ZSTD_getFrameParams(&(zc->params), src, srcSize);
+        /* 32-bit builds refuse any windowLog above 25 */
+        if (MEM_32bits() && (zc->params.windowLog > 25)) return ERROR(frameParameter_unsupported);
+        return status;
+    }
+}
+
+
+/* Decode a 3-byte block header.
+ * Byte 0 : block type in the top 2 bits, size bits 16-18 in the low 3 bits;
+ * bytes 1 and 2 : size bits 8-15 and 0-7.
+ * *bpPtr receives the block type, and the size field as origSize for bt_rle blocks (0 otherwise).
+ * @return : number of bytes that follow the header for this block
+ *           (0 for bt_end, 1 for bt_rle, the size field otherwise),
+ *           or srcSize_wrong when fewer than 3 bytes are available */
+static size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const header = (const BYTE*)src;
+
+    if (srcSize < 3) return ERROR(srcSize_wrong);
+
+    {   U32 const sizeField = header[2] + (header[1]<<8) + ((header[0] & 7)<<16);
+        blockType_t const type = (blockType_t)(header[0] >> 6);
+
+        bpPtr->blockType = type;
+        bpPtr->origSize = (type == bt_rle) ? sizeField : 0;
+
+        if (type == bt_end) return 0;
+        if (type == bt_rle) return 1;
+        return sizeField;
+    }
+}
+
+/* Copy the content of an uncompressed block into dst.
+ * @return : srcSize, or dstSize_tooSmall when dst cannot hold srcSize bytes */
+static size_t ZSTD_copyRawBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    if (maxDstSize < srcSize) return ERROR(dstSize_tooSmall);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+/** ZSTD_decompressLiterals
+    decode a compressed literals section into dst using HUF_decompress().
+    On entry, *maxDstSizePtr is the capacity of dst; on success it receives the regenerated size.
+    @return : nb of bytes read from src, or an error code*/
+static size_t ZSTD_decompressLiterals(void* dst, size_t* maxDstSizePtr,
+                                const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE*)src;
+    size_t const headerSize = 5;
+
+    /* both size fields are read from the 5-byte header; no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+    size_t const regenSize = (MEM_readLE32(istart) & 0x1FFFFF) >> 2;
+    size_t const compSize = (MEM_readLE32(istart+2) & 0xFFFFFF) >> 5;
+
+    if (regenSize > *maxDstSizePtr) return ERROR(corruption_detected);
+    if (compSize + headerSize > srcSize) return ERROR(corruption_detected);
+
+    {   size_t const hufResult = HUF_decompress(dst, regenSize, istart + headerSize, compSize);
+        if (HUF_isError(hufResult)) return ERROR(corruption_detected);
+    }
+
+    *maxDstSizePtr = regenSize;
+    return compSize + headerSize;
+}
+
+
+/** ZSTD_decodeLiteralsBlock
+    Decode the literals section found at the start of a compressed block, and record
+    where the literals live into dctx->litPtr / dctx->litSize.
+    The low 2 bits of the first byte select the encoding : 0 (compressed), IS_RAW or IS_RLE.
+    Literals that are regenerated or copied go into dctx->litBuffer, followed by 8 padding bytes,
+    so that later wildcopy reads stay inside the buffer.
+    @return : nb of bytes read from src (< srcSize ), or an error code */
+static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+{
+    const BYTE* const istart = (const BYTE*) src;
+
+    /* any compressed block with literals segment must be at least this size */
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    switch(*istart & 3)
+    {
+    /* compressed */
+    case 0:
+        {
+            size_t litSize = BLOCKSIZE;   /* in : capacity of litBuffer, out : regenerated size */
+            const size_t readSize = ZSTD_decompressLiterals(dctx->litBuffer, &litSize, src, srcSize);
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, 8);
+            return readSize;   /* works if it's an error too */
+        }
+    case IS_RAW:
+        {
+            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+            if (litSize > srcSize-11)   /* risk of reading too far with wildcopy */
+            {
+                /* litBuffer only holds BLOCKSIZE + 8 bytes : refuse anything larger before copying.
+                 * Do not rely on the caller having bounded srcSize by BLOCKSIZE. */
+                if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
+                if (litSize > srcSize-3) return ERROR(corruption_detected);
+                /* NOTE(review): the copy starts at istart, whereas the direct-reference path below
+                 * starts at istart+3 (after the 3-byte header) - confirm this is intended */
+                memcpy(dctx->litBuffer, istart, litSize);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, 8);
+                return litSize+3;
+            }
+            /* direct reference into compressed stream */
+            dctx->litPtr = istart+3;
+            dctx->litSize = litSize;
+            return litSize+3;
+        }
+    case IS_RLE:
+        {
+            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+            if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
+            memset(dctx->litBuffer, istart[3], litSize + 8);   /* repeated byte, padding included */
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            return 4;
+        }
+    default:
+        return ERROR(corruption_detected);   /* forbidden nominal case */
+    }
+}
+
+
+/* ZSTD_decodeSeqHeaders
+ * Parse the header of the sequences section :
+ * - *nbSeq receives the number of sequences (16-bit little-endian value),
+ * - *dumpsPtr / *dumpsLengthPtr receive the location and size of the "dumps" area,
+ * - DTableLL, DTableOffb and DTableML are built according to the 2-bit type stored for each of them
+ *   (bt_rle : single symbol, bt_raw : fixed number of bits, otherwise : FSE table read from src).
+ * @return : nb of bytes read from src, or an error code */
+static size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
+                         FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb,
+                         const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* ip = istart;
+    const BYTE* const iend = istart + srcSize;
+    U32 LLtype, Offtype, MLtype;
+    U32 LLlog, Offlog, MLlog;
+    size_t dumpsLength;
+
+    /* check */
+    if (srcSize < 5) return ERROR(srcSize_wrong);
+
+    /* SeqHead */
+    *nbSeq = MEM_readLE16(ip); ip+=2;
+    /* flag byte : table types in bits 7-6 (LL), 5-4 (Off), 3-2 (ML); bit 1 selects the dumpsLength format */
+    LLtype  = *ip >> 6;
+    Offtype = (*ip >> 4) & 3;
+    MLtype  = (*ip >> 2) & 3;
+    if (*ip & 2)
+    {
+        /* long format : dumpsLength stored on the 2 bytes following the flag byte */
+        dumpsLength  = ip[2];
+        dumpsLength += ip[1] << 8;
+        ip += 3;
+    }
+    else
+    {
+        /* short format : 9 bits, bit 0 of the flag byte + the following byte */
+        dumpsLength  = ip[1];
+        dumpsLength += (ip[0] & 1) << 8;
+        ip += 2;
+    }
+    *dumpsPtr = ip;
+    ip += dumpsLength;   /* skip the dumps area */
+    *dumpsLengthPtr = dumpsLength;
+
+    /* check */
+    if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */
+
+    /* sequences */
+    {
+        S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL >= MaxOff */
+        size_t headerSize;
+
+        /* Build DTables */
+        switch(LLtype)
+        {
+        case bt_rle :
+            LLlog = 0;
+            FSE_buildDTable_rle(DTableLL, *ip++); break;
+        case bt_raw :
+            LLlog = LLbits;
+            FSE_buildDTable_raw(DTableLL, LLbits); break;
+        default :
+            {   U32 max = MaxLL;
+                headerSize = FSE_readNCount(norm, &max, &LLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (LLlog > LLFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableLL, norm, max, LLlog);
+        }   }
+
+        switch(Offtype)
+        {
+        case bt_rle :
+            Offlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong);   /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableOffb, *ip++ & MaxOff); /* if *ip > MaxOff, data is corrupted */
+            break;
+        case bt_raw :
+            Offlog = Offbits;
+            FSE_buildDTable_raw(DTableOffb, Offbits); break;
+        default :
+            {   U32 max = MaxOff;
+                headerSize = FSE_readNCount(norm, &max, &Offlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (Offlog > OffFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableOffb, norm, max, Offlog);
+        }   }
+
+        switch(MLtype)
+        {
+        case bt_rle :
+            MLlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong); /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableML, *ip++); break;
+        case bt_raw :
+            MLlog = MLbits;
+            FSE_buildDTable_raw(DTableML, MLbits); break;
+        default :
+            {   U32 max = MaxML;
+                headerSize = FSE_readNCount(norm, &max, &MLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (MLlog > MLFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableML, norm, max, MLlog);
+    }   }   }
+
+    return ip-istart;
+}
+
+
+/* One decoded sequence, as produced by ZSTD_decodeSequence() and consumed by ZSTD_execSequence(). */
+typedef struct {
+    size_t litLength;     /* number of literals to copy before the match */
+    size_t offset;        /* distance of the match, counted backwards from the end of the literals */
+    size_t matchLength;   /* number of bytes to copy from the match (MINMATCH already added) */
+} seq_t;
+
+/* Decoding state shared by successive ZSTD_decodeSequence() calls within one block. */
+typedef struct {
+    BIT_DStream_t DStream;    /* bitstream the sequences are read from */
+    FSE_DState_t stateLL;     /* FSE state : literal lengths */
+    FSE_DState_t stateOffb;   /* FSE state : offset codes */
+    FSE_DState_t stateML;     /* FSE state : match lengths */
+    size_t prevOffset;        /* saved earlier offset, used when a sequence repeats an offset */
+    const BYTE* dumps;        /* read position inside the "dumps" area (extra bytes for long lengths) */
+    const BYTE* dumpsEnd;     /* end of the "dumps" area */
+} seqState_t;
+
+
+/* ZSTD_decodeSequence
+ * Decode the next sequence (literal length, offset, match length) from seqState->DStream.
+ * `seq` is both input and output : the offset it holds on entry (previous sequence) is read
+ * to handle repeated offsets, then all 3 fields are overwritten with the new sequence.
+ * Lengths equal to the maximum symbol (MaxLL / MaxML) are completed with extra bytes
+ * taken from the dumps area; seqState->dumps is updated accordingly. */
+static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
+{
+    size_t litLength;
+    size_t prevOffset;
+    size_t offset;
+    size_t matchLength;
+    const BYTE* dumps = seqState->dumps;
+    const BYTE* const de = seqState->dumpsEnd;
+
+    /* Literal length */
+    litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));
+    /* candidate for a repeated offset : the last offset when literals are present, the saved one otherwise */
+    prevOffset = litLength ? seq->offset : seqState->prevOffset;
+    if (litLength == MaxLL) {
+        /* extended length : 1 extra byte, or the marker 255 followed by a 3-byte little-endian value */
+        U32 add = *dumps++;
+        if (add < 255) litLength += add;
+        else {
+            litLength = dumps[0] + (dumps[1]<<8) + (dumps[2]<<16);
+            dumps += 3;
+        }
+        if (dumps > de) { litLength = MaxLL+255; }  /* late correction, to avoid using uninitialized memory */
+        if (dumps >= de) { dumps = de-1; }  /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+
+    /* Offset */
+    {   static const U32 offsetPrefix[MaxOff+1] = {
+                1 /*fake*/, 1, 2, 4, 8, 16, 32, 64, 128, 256,
+                512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144,
+                524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, /*fake*/ 1, 1, 1, 1, 1 };
+        U32 offsetCode, nbBits;
+        offsetCode = FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));   /* <= maxOff, by table construction */
+        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+        /* offset = offsetPrefix[code] + (code-1) extra bits; code 0 means "repeat prevOffset" */
+        nbBits = offsetCode - 1;
+        if (offsetCode==0) nbBits = 0;   /* cmove */
+        offset = offsetPrefix[offsetCode] + BIT_readBits(&(seqState->DStream), nbBits);
+        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+        if (offsetCode==0) offset = prevOffset;   /* cmove */
+        /* save the previous sequence's offset, unless this one is a repeat that comes with literals */
+        if (offsetCode | !litLength) seqState->prevOffset = seq->offset;   /* cmove */
+    }
+
+    /* MatchLength */
+    matchLength = FSE_decodeSymbol(&(seqState->stateML), &(seqState->DStream));
+    if (matchLength == MaxML) {
+        /* extended length : same encoding as for the literal length above */
+        U32 add = *dumps++;
+        if (add < 255) matchLength += add;
+        else {
+            matchLength = dumps[0] + (dumps[1]<<8) + (dumps[2]<<16);
+            dumps += 3;
+        }
+        if (dumps > de) { matchLength = MaxML+255; }  /* late correction, to avoid using uninitialized memory */
+        if (dumps >= de) { dumps = de-1; }  /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+    matchLength += MINMATCH;
+
+    /* save result */
+    seq->litLength = litLength;
+    seq->offset = offset;
+    seq->matchLength = matchLength;
+    seqState->dumps = dumps;
+}
+
+
+/* ZSTD_execSequence
+ * Write one decoded sequence at `op` : first `sequence.litLength` literals taken from *litPtr,
+ * then `sequence.matchLength` bytes copied from `sequence.offset` bytes before the end of those literals.
+ * A match that starts before `base` is read from the previous segment (which ends at dictEnd)
+ * and may continue into the current segment.
+ * *litPtr is advanced past the consumed literals.
+ * Requires : op <= oend and *litPtr <= litLimit.
+ * @return : nb of bytes written (litLength + matchLength), or an error code */
+static size_t ZSTD_execSequence(BYTE* op,
+                                BYTE* const oend, seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{
+    static const int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+    static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
+    BYTE* const oLitEnd = op + sequence.litLength;
+    const size_t sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_8 = oend-8;
+    const BYTE* const litEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check : compare sizes first, because the pointer sums computed above can wrap around
+     * the address space when the decoded lengths are huge (corrupted input), and would then
+     * wrongly pass the pointer comparisons */
+    if (sequenceLength > (size_t)(oend - op)) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
+    if (sequence.litLength > (size_t)(litLimit - *litPtr)) return ERROR(corruption_detected);   /* risk read beyond lit buffer */
+    /* both lengths are now known to fit : the pointer checks below cannot overflow */
+    if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall);   /* last match must start at a minimum distance of 8 from oend */
+    if (oMatchEnd > oend) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
+    if (litEnd > litLimit) return ERROR(corruption_detected);   /* risk read beyond lit buffer */
+
+    /* copy Literals */
+    ZSTD_wildcopy(op, *litPtr, sequence.litLength);   /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = litEnd;   /* update for next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base))
+    {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase))
+            return ERROR(corruption_detected);
+        match = dictEnd - (base-match);   /* translate into the previous segment */
+        if (match + sequence.matchLength <= dictEnd)
+        {
+            /* match entirely inside the previous segment */
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {
+            size_t length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
+            if (op > oend_8 || sequence.matchLength < MINMATCH) {
+              /* too close to the end of dst, or too short for the 8-byte copies below : finish byte by byte */
+              while (op < oMatchEnd) *op++ = *match++;
+              return sequenceLength;
+            }
+        }
+    }
+    /* Requirement: op <= oend_8 */
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        const int sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTD_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTD_copy8(op, match);
+    }
+    op += 8; match += 8;
+
+    if (oMatchEnd > oend-(16-MINMATCH))
+    {
+        /* close to the end of dst : wildcopy only up to oend_8, then finish byte by byte */
+        if (op < oend_8)
+        {
+            ZSTD_wildcopy(op, match, oend_8 - op);
+            match += oend_8 - op;
+            op = oend_8;
+        }
+        while (op < oMatchEnd) *op++ = *match++;
+    }
+    else
+    {
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8, but must be signed */
+    }
+    return sequenceLength;
+}
+
+
+/* ZSTD_decompressSequences
+ * Decode the sequences section of a compressed block and regenerate the block content into dst.
+ * Literals are taken from dctx->litPtr / dctx->litSize, which must already be set
+ * (see ZSTD_decodeLiteralsBlock()); literals left after the last sequence are appended at the end.
+ * @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompressSequences(
+                               ZSTD_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t errorCode, dumpsLength;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    int nbSeq;
+    const BYTE* dumps;
+    U32* DTableLL = dctx->LLTable;
+    U32* DTableML = dctx->MLTable;
+    U32* DTableOffb = dctx->OffTable;
+    const BYTE* const base = (const BYTE*) (dctx->base);
+    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+
+    /* Build Decoding Tables */
+    errorCode = ZSTD_decodeSeqHeaders(&nbSeq, &dumps, &dumpsLength,
+                                      DTableLL, DTableML, DTableOffb,
+                                      ip, iend-ip);
+    if (ZSTD_isError(errorCode)) return errorCode;
+    ip += errorCode;   /* on success, errorCode is the size of the sequences header */
+
+    /* Regen sequences */
+    {
+        seq_t sequence;
+        seqState_t seqState;
+
+        memset(&sequence, 0, sizeof(sequence));
+        sequence.offset = 4;   /* initial value of the repeated offset */
+        seqState.dumps = dumps;
+        seqState.dumpsEnd = dumps + dumpsLength;
+        seqState.prevOffset = 4;
+        errorCode = BIT_initDStream(&(seqState.DStream), ip, iend-ip);
+        if (ERR_isError(errorCode)) return ERROR(corruption_detected);
+        FSE_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSE_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSE_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        /* decode and execute sequences one by one, until nbSeq are done or the bitstream reports an overflow */
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; )
+        {
+            size_t oneSeqSize;
+            nbSeq--;
+            ZSTD_decodeSequence(&sequence, &seqState);
+            oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
+        }
+
+        /* check if reached exact end */
+        if ( !BIT_endOfDStream(&(seqState.DStream)) ) return ERROR(corruption_detected);   /* DStream should be entirely and exactly consumed; otherwise data is corrupted */
+
+        /* last literal segment */
+        {
+            size_t lastLLSize = litEnd - litPtr;
+            if (litPtr > litEnd) return ERROR(corruption_detected);
+            if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
+            if (op != litPtr) memcpy(op, litPtr, lastLLSize);   /* skip the copy when source and destination are the same address */
+            op += lastLLSize;
+        }
+    }
+
+    return op-ostart;
+}
+
+
+/* ZSTD_checkContinuity
+ * When dst does not directly follow the previously written output, start a new segment at dst :
+ * the segment decoded so far becomes the "previous segment" (ending at dictEnd), and vBase is
+ * placed before dst by the size of that segment. Nothing changes when dst is contiguous. */
+static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
+{
+    if (dst == dctx->previousDstEnd) return;   /* contiguous : keep the current segment */
+
+    {   const char* const prevEnd = (const char*)(dctx->previousDstEnd);
+        const char* const prevBase = (const char*)(dctx->base);
+        dctx->dictEnd = dctx->previousDstEnd;
+        dctx->vBase = (const char*)dst - (prevEnd - prevBase);
+        dctx->base = dst;
+        dctx->previousDstEnd = dst;
+    }
+}
+
+
+/* ZSTD_decompressBlock_internal
+ * Decompress the content of one compressed block (block header already removed) :
+ * the literals section is decoded first, then the sequences that follow it.
+ * @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+                            void* dst, size_t maxDstSize,
+                      const void* src, size_t srcSize)
+{
+    /* blockType == blockCompressed */
+    const BYTE* ip = (const BYTE*)src;
+    size_t litCSize;
+
+    /* the size field of a block header can announce more than BLOCKSIZE bytes,
+     * while the literals decoder and dctx->litBuffer assume the block does not exceed it */
+    if (srcSize > BLOCKSIZE) return ERROR(corruption_detected);
+
+    /* Decode literals sub-block */
+    litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+    if (ZSTD_isError(litCSize)) return litCSize;
+    ip += litCSize;
+    srcSize -= litCSize;
+
+    return ZSTD_decompressSequences(dctx, dst, maxDstSize, ip, srcSize);
+}
+
+
+/* ZSTD_decompress_usingDict
+ * Decompress one complete frame from src into dst in a single call.
+ * When `dict` is not NULL, it is registered as the segment preceding dst, so matches may reference it.
+ * The context is reset first, so any previous state is discarded.
+ * bt_rle blocks are not supported and produce an error.
+ * @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompress_usingDict(ZSTD_DCtx* ctx,
+                                 void* dst, size_t maxDstSize,
+                                 const void* src, size_t srcSize,
+                                 const void* dict, size_t dictSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t remainingSize = srcSize;
+    blockProperties_t blockProperties;
+
+    /* init */
+    ZSTD_resetDCtx(ctx);
+    if (dict)
+    {
+        /* make the dictionary the previous segment, then start the current segment at dst */
+        ZSTD_decompress_insertDictionary(ctx, dict, dictSize);
+        ctx->dictEnd = ctx->previousDstEnd;
+        ctx->vBase = (const char*)dst - ((const char*)(ctx->previousDstEnd) - (const char*)(ctx->base));
+        ctx->base = dst;
+    }
+    else
+    {
+        /* no history : every reference must stay inside dst */
+        ctx->vBase = ctx->base = ctx->dictEnd = dst;
+    }
+
+    /* Frame Header */
+    {
+        size_t frameHeaderSize;
+        if (srcSize < ZSTD_frameHeaderSize_min+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+        frameHeaderSize = ZSTD_decodeFrameHeader_Part1(ctx, src, ZSTD_frameHeaderSize_min);
+        if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
+        if (srcSize < frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+        frameHeaderSize = ZSTD_decodeFrameHeader_Part2(ctx, src, frameHeaderSize);   /* from here on, only used as a status code */
+        if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t decodedSize=0;
+        size_t cBlockSize = ZSTD_getcBlockSize(ip, iend-ip, &blockProperties);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTD_decompressBlock_internal(ctx, op, oend-op, ip, cBlockSize);
+            break;
+        case bt_raw :
+            decodedSize = ZSTD_copyRawBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet supported */
+            break;
+        case bt_end :
+            /* end of frame */
+            if (remainingSize) return ERROR(srcSize_wrong);   /* nothing may follow the end mark */
+            break;
+        default:
+            return ERROR(GENERIC);   /* impossible */
+        }
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        if (ZSTD_isError(decodedSize)) return decodedSize;
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return op-ostart;
+}
+
+/* ZSTD_findFrameCompressedSize
+ * Measure the compressed size of the frame starting at src, by walking its block headers
+ * without decoding any block content.
+ * @return : nb of bytes from src up to and including the header of the first zero-sized block
+ *           (the end mark), or an error code (srcSize_wrong, prefix_unknown) */
+static size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    size_t remaining = srcSize;
+
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize_min) return ERROR(srcSize_wrong);
+    if (MEM_readLE32(src) != ZSTD_MAGICNUMBER) return ERROR(prefix_unknown);
+    ip += ZSTD_frameHeaderSize_min;
+    remaining -= ZSTD_frameHeaderSize_min;
+
+    /* Walk the blocks */
+    for (;;)
+    {
+        blockProperties_t bp;
+        size_t const cBlockSize = ZSTD_getcBlockSize(ip, remaining, &bp);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remaining -= ZSTD_blockHeaderSize;
+        if (cBlockSize > remaining) return ERROR(srcSize_wrong);
+
+        if (cBlockSize == 0) return ip - istart;   /* bt_end */
+
+        ip += cBlockSize;
+        remaining -= cBlockSize;
+    }
+}
+
+/* ******************************
+*  Streaming Decompression API
+********************************/
+/* @return : the exact input size that the next ZSTD_decompressContinue() call must receive */
+static size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx)
+{
+    size_t const nextSize = dctx->expected;
+    return nextSize;
+}
+
+/* ZSTD_decompressContinue
+ * Streaming decompression : consume the next piece of input and advance the state machine.
+ * srcSize must be exactly ctx->expected (see ZSTD_nextSrcSizeToDecompress()).
+ * Header stages (frame header, block header) produce no output and return 0;
+ * the block stage writes the decoded block into dst.
+ * bt_rle blocks are not supported and produce an error.
+ * @return : nb of bytes written into dst (0 for header stages), or an error code */
+static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    /* Sanity check */
+    if (srcSize != ctx->expected) return ERROR(srcSize_wrong);
+    ZSTD_checkContinuity(ctx, dst);
+
+    /* Decompress : frame header; part 1 */
+    switch (ctx->stage)
+    {
+    case ZSTDds_getFrameHeaderSize :
+        /* get frame header size */
+        if (srcSize != ZSTD_frameHeaderSize_min) return ERROR(srcSize_wrong);   /* impossible */
+        ctx->headerSize = ZSTD_decodeFrameHeader_Part1(ctx, src, ZSTD_frameHeaderSize_min);
+        if (ZSTD_isError(ctx->headerSize)) return ctx->headerSize;
+        memcpy(ctx->headerBuffer, src, ZSTD_frameHeaderSize_min);   /* keep the header bytes for the decoding stage */
+        if (ctx->headerSize > ZSTD_frameHeaderSize_min) return ERROR(GENERIC);   /* impossible */
+        ctx->expected = 0;   /* not necessary to copy more */
+        /* fallthrough */
+    case ZSTDds_decodeFrameHeader:
+        /* get frame header */
+        {   size_t const result = ZSTD_decodeFrameHeader_Part2(ctx, ctx->headerBuffer, ctx->headerSize);
+            if (ZSTD_isError(result)) return result;
+            ctx->expected = ZSTD_blockHeaderSize;
+            ctx->stage = ZSTDds_decodeBlockHeader;
+            return 0;
+        }
+    case ZSTDds_decodeBlockHeader:
+        /* Decode block header */
+        {   blockProperties_t bp;
+            size_t const blockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+            if (ZSTD_isError(blockSize)) return blockSize;
+            if (bp.blockType == bt_end)
+            {
+                /* end of frame : nothing more expected, ready for a new frame */
+                ctx->expected = 0;
+                ctx->stage = ZSTDds_getFrameHeaderSize;
+            }
+            else
+            {
+                /* next call must provide the whole block content */
+                ctx->expected = blockSize;
+                ctx->bType = bp.blockType;
+                ctx->stage = ZSTDds_decompressBlock;
+            }
+            return 0;
+        }
+    case ZSTDds_decompressBlock:
+        {
+            /* Decompress : block content */
+            size_t rSize;
+            switch(ctx->bType)
+            {
+            case bt_compressed:
+                rSize = ZSTD_decompressBlock_internal(ctx, dst, maxDstSize, src, srcSize);
+                break;
+            case bt_raw :
+                rSize = ZSTD_copyRawBlock(dst, maxDstSize, src, srcSize);
+                break;
+            case bt_rle :
+                return ERROR(GENERIC);   /* not yet handled */
+                break;
+            case bt_end :   /* should never happen (filtered at phase 1) */
+                rSize = 0;
+                break;
+            default:
+                return ERROR(GENERIC);
+            }
+            ctx->stage = ZSTDds_decodeBlockHeader;
+            ctx->expected = ZSTD_blockHeaderSize;
+            /* NOTE(review): rSize is not tested here, so when it holds an error code the state still
+             * advances and previousDstEnd is computed from that code - the caller is expected to stop on error */
+            ctx->previousDstEnd = (char*)dst + rSize;
+            return rSize;
+        }
+    default:
+        return ERROR(GENERIC);   /* impossible */
+    }
+}
+
+
+/* ZSTD_decompress_insertDictionary
+ * Register `dict` as the current segment : the segment tracked so far becomes the previous one
+ * (ending at dictEnd), and the output position is set right after the dictionary,
+ * so the next ZSTD_checkContinuity() call treats the dictionary as history. */
+static void ZSTD_decompress_insertDictionary(ZSTD_DCtx* ctx, const void* dict, size_t dictSize)
+{
+    const char* const prevEnd = (const char*)(ctx->previousDstEnd);
+    const char* const prevBase = (const char*)(ctx->base);
+    const char* const dictStart = (const char*)dict;
+
+    ctx->dictEnd = ctx->previousDstEnd;
+    ctx->vBase = dictStart - (prevEnd - prevBase);
+    ctx->base = dict;
+    ctx->previousDstEnd = dictStart + dictSize;
+}
+
+
+
+/*
+    Buffered version of Zstd compression library
+    Copyright (C) 2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* The objects defined in this file should be considered experimental.
+ * They are not labelled stable, as their prototype may change in the future.
+ * You can use them for tests, provide feedback, or if you can endure risk of future changes.
+ */
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stdlib.h>
+
+
+/** ************************************************
+*  Streaming decompression
+*
+*  A ZBUFF_DCtx object is required to track streaming operation.
+*  Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources.
+*  Use ZBUFF_decompressInit() to start a new decompression operation.
+*  ZBUFF_DCtx objects can be reused multiple times.
+*
+*  Use ZBUFF_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *maxDstSizePtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to call again the function with remaining input.
+*  The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst .
+*  return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*            or 0 when a frame is completely decoded
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory)
+*  output : 128 KB block size is the internal unit, it ensures it's always possible to write a full block when it's decoded.
+*  input : just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* **************************************************/
+
+/* Stages of the buffered streaming decoder (ZBUFF_decompressContinue()).
+ * ZBUFFds_init is the state of a freshly created context : decoding from it is an error (init_missing)
+ * until ZBUFF_decompressInit() moves the context to ZBUFFds_readHeader.
+ * ZBUFFds_readHeader parses the frame header directly from the input, ZBUFFds_loadHeader
+ * accumulates it into headerBuffer when the input was too short. */
+typedef enum { ZBUFFds_init, ZBUFFds_readHeader, ZBUFFds_loadHeader, ZBUFFds_decodeHeader,
+               ZBUFFds_read, ZBUFFds_load, ZBUFFds_flush } ZBUFF_dStage;
+
+/* *** Resource management *** */
+
+#define ZSTD_frameHeaderSize_max 5   /* too magical, should come from reference */
+/* State of a buffered streaming decompression. */
+struct ZBUFFv04_DCtx_s {
+    ZSTD_DCtx* zc;            /* underlying decompression context, released by ZBUFF_freeDCtx() */
+    ZSTD_parameters params;   /* frame parameters, filled by ZSTD_getFrameParams() */
+    char* inBuff;             /* heap buffer released by ZBUFF_freeDCtx(); presumably stages input - TODO confirm */
+    size_t inBuffSize;
+    size_t inPos;             /* reset to 0 by ZBUFF_decompressInit() */
+    char* outBuff;            /* heap buffer released by ZBUFF_freeDCtx(); presumably stages output - TODO confirm */
+    size_t outBuffSize;
+    size_t outStart;          /* reset to 0 by ZBUFF_decompressInit() */
+    size_t outEnd;            /* reset to 0 by ZBUFF_decompressInit() */
+    size_t hPos;              /* number of frame header bytes stored in headerBuffer so far */
+    const char* dict;         /* dictionary set by ZBUFF_decompressWithDictionary(); referenced, not copied */
+    size_t dictSize;          /* size of dict; reset to 0 by ZBUFF_decompressInit() */
+    ZBUFF_dStage stage;       /* current stage of the state machine */
+    unsigned char headerBuffer[ZSTD_frameHeaderSize_max];   /* frame header bytes gathered across calls */
+};   /* typedef'd to ZBUFF_DCtx within "zstd_buffered.h" */
+
+typedef ZBUFFv04_DCtx ZBUFF_DCtx;
+
+
+/* ZBUFF_createDCtx
+ * Allocate a zero-initialized buffered decompression context together with its inner ZSTD_DCtx.
+ * The context starts in stage ZBUFFds_init : ZBUFF_decompressInit() must be called before decoding.
+ * @return : the new context, or NULL when any allocation fails. Release it with ZBUFF_freeDCtx(). */
+static ZBUFF_DCtx* ZBUFF_createDCtx(void)
+{
+    ZBUFF_DCtx* zbc = (ZBUFF_DCtx*)malloc(sizeof(ZBUFF_DCtx));
+    if (zbc==NULL) return NULL;
+    memset(zbc, 0, sizeof(*zbc));
+    zbc->zc = ZSTD_createDCtx();
+    if (zbc->zc==NULL)
+    {
+        /* never hand out a half-built context : ZBUFF_decompressInit() dereferences zc */
+        free(zbc);
+        return NULL;
+    }
+    zbc->stage = ZBUFFds_init;
+    return zbc;
+}
+
+/* ZBUFF_freeDCtx
+ * Release a buffered decompression context : its inner ZSTD_DCtx, both of its buffers,
+ * then the context itself. A NULL context is accepted.
+ * @return : always 0 */
+static size_t ZBUFF_freeDCtx(ZBUFF_DCtx* zbc)
+{
+    if (zbc != NULL)
+    {
+        ZSTD_freeDCtx(zbc->zc);
+        free(zbc->inBuff);
+        free(zbc->outBuff);
+        free(zbc);
+    }
+    return 0;
+}
+
+
+/* *** Initialization *** */
+
+/* ZBUFF_decompressInit
+ * Start a new decompression operation : all positions and the dictionary size are cleared,
+ * the next stage is the reading of a frame header, and the inner ZSTD_DCtx is reset.
+ * @return : the result of ZSTD_resetDCtx() */
+static size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbc)
+{
+    zbc->dictSize = 0;
+    zbc->outEnd = 0;
+    zbc->outStart = 0;
+    zbc->inPos = 0;
+    zbc->hPos = 0;
+    zbc->stage = ZBUFFds_readHeader;
+    return ZSTD_resetDCtx(zbc->zc);
+}
+
+
+/* ZBUFF_decompressWithDictionary
+ * Remember the dictionary to use. Only the pointer and the size are stored (no copy),
+ * so `src` must remain valid for as long as the context uses it.
+ * @return : always 0 */
+static size_t ZBUFF_decompressWithDictionary(ZBUFF_DCtx* zbc, const void* src, size_t srcSize)
+{
+    zbc->dictSize = srcSize;
+    zbc->dict = (const char*)src;
+    return 0;
+}
+
+/* ZBUFF_limitCopy
+ * Copy as many bytes as both buffers allow, i.e. the smaller of maxDstSize and srcSize.
+ * @return : nb of bytes copied */
+static size_t ZBUFF_limitCopy(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    size_t const length = (maxDstSize < srcSize) ? maxDstSize : srcSize;
+    /* memcpy() requires valid pointers even for a size of 0 : skip it when there is nothing to copy,
+     * so that an empty (possibly NULL) input or output is harmless */
+    if (length > 0) memcpy(dst, src, length);
+    return length;
+}
+
+/* *** Decompression *** */
+
+static size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr)
+{
+    const char* const istart = (const char*)src;
+    const char* ip = istart;
+    const char* const iend = istart + *srcSizePtr;
+    char* const ostart = (char*)dst;
+    char* op = ostart;
+    char* const oend = ostart + *maxDstSizePtr;
+    U32 notDone = 1;
+
+    DEBUGLOG(5, "ZBUFF_decompressContinue");
+    while (notDone)
+    {
+        switch(zbc->stage)
+        {
+
+        case ZBUFFds_init :
+            DEBUGLOG(5, "ZBUFF_decompressContinue: stage==ZBUFFds_init => ERROR(init_missing)");
+            return ERROR(init_missing);
+
+        case ZBUFFds_readHeader :
+            /* read header from src */
+            {   size_t const headerSize = ZSTD_getFrameParams(&(zbc->params), src, *srcSizePtr);
+                if (ZSTD_isError(headerSize)) return headerSize;
+                if (headerSize) {
+                    /* not enough input to decode header : tell how many bytes would be necessary */
+                    memcpy(zbc->headerBuffer+zbc->hPos, src, *srcSizePtr);
+                    zbc->hPos += *srcSizePtr;
+                    *maxDstSizePtr = 0;
+                    zbc->stage = ZBUFFds_loadHeader;
+                    return headerSize - zbc->hPos;
+                }
+                zbc->stage = ZBUFFds_decodeHeader;
+                break;
+            }
+
+        case ZBUFFds_loadHeader:
+            /* complete header from src */
+            {   size_t headerSize = ZBUFF_limitCopy(
+                    zbc->headerBuffer + zbc->hPos, ZSTD_frameHeaderSize_max - zbc->hPos,
+                    src, *srcSizePtr);
+                zbc->hPos += headerSize;
+                ip += headerSize;
+                headerSize = ZSTD_getFrameParams(&(zbc->params), zbc->headerBuffer, zbc->hPos);
+                if (ZSTD_isError(headerSize)) return headerSize;
+                if (headerSize) {
+                    /* not enough input to decode header : tell how many bytes would be necessary */
+                    *maxDstSizePtr = 0;
+                    return headerSize - zbc->hPos;
+            }   }
+            /* intentional fallthrough */
+
+        case ZBUFFds_decodeHeader:
+                /* apply header to create / resize buffers */
+                {   size_t const neededOutSize = (size_t)1 << zbc->params.windowLog;
+                    size_t const neededInSize = BLOCKSIZE;   /* a block is never > BLOCKSIZE */
+                    if (zbc->inBuffSize < neededInSize) {
+                        free(zbc->inBuff);
+                        zbc->inBuffSize = neededInSize;
+                        zbc->inBuff = (char*)malloc(neededInSize);
+                        if (zbc->inBuff == NULL) return ERROR(memory_allocation);
+                    }
+                    if (zbc->outBuffSize < neededOutSize) {
+                        free(zbc->outBuff);
+                        zbc->outBuffSize = neededOutSize;
+                        zbc->outBuff = (char*)malloc(neededOutSize);
+                        if (zbc->outBuff == NULL) return ERROR(memory_allocation);
+                }   }
+                if (zbc->dictSize)
+                    ZSTD_decompress_insertDictionary(zbc->zc, zbc->dict, zbc->dictSize);
+                if (zbc->hPos) {
+                    /* some data already loaded into headerBuffer : transfer into inBuff */
+                    memcpy(zbc->inBuff, zbc->headerBuffer, zbc->hPos);
+                    zbc->inPos = zbc->hPos;
+                    zbc->hPos = 0;
+                    zbc->stage = ZBUFFds_load;
+                    break;
+                }
+                zbc->stage = ZBUFFds_read;
+		/* fall-through */
+        case ZBUFFds_read:
+            {
+                size_t neededInSize = ZSTD_nextSrcSizeToDecompress(zbc->zc);
+                if (neededInSize==0)   /* end of frame */
+                {
+                    zbc->stage = ZBUFFds_init;
+                    notDone = 0;
+                    break;
+                }
+                if ((size_t)(iend-ip) >= neededInSize)
+                {
+                    /* directly decode from src */
+                    size_t decodedSize = ZSTD_decompressContinue(zbc->zc,
+                        zbc->outBuff + zbc->outStart, zbc->outBuffSize - zbc->outStart,
+                        ip, neededInSize);
+                    if (ZSTD_isError(decodedSize)) return decodedSize;
+                    ip += neededInSize;
+                    if (!decodedSize) break;   /* this was just a header */
+                    zbc->outEnd = zbc->outStart +  decodedSize;
+                    zbc->stage = ZBUFFds_flush;
+                    break;
+                }
+                if (ip==iend) { notDone = 0; break; }   /* no more input */
+                zbc->stage = ZBUFFds_load;
+            }
+	    /* fall-through */
+        case ZBUFFds_load:
+            {
+                size_t neededInSize = ZSTD_nextSrcSizeToDecompress(zbc->zc);
+                size_t toLoad = neededInSize - zbc->inPos;   /* should always be <= remaining space within inBuff */
+                size_t loadedSize;
+                if (toLoad > zbc->inBuffSize - zbc->inPos) return ERROR(corruption_detected);   /* should never happen */
+                loadedSize = ZBUFF_limitCopy(zbc->inBuff + zbc->inPos, toLoad, ip, iend-ip);
+                ip += loadedSize;
+                zbc->inPos += loadedSize;
+                if (loadedSize < toLoad) { notDone = 0; break; }   /* not enough input, wait for more */
+                {
+                    size_t decodedSize = ZSTD_decompressContinue(zbc->zc,
+                        zbc->outBuff + zbc->outStart, zbc->outBuffSize - zbc->outStart,
+                        zbc->inBuff, neededInSize);
+                    if (ZSTD_isError(decodedSize)) return decodedSize;
+                    zbc->inPos = 0;   /* input is consumed */
+                    if (!decodedSize) { zbc->stage = ZBUFFds_read; break; }   /* this was just a header */
+                    zbc->outEnd = zbc->outStart +  decodedSize;
+                    zbc->stage = ZBUFFds_flush;
+                    /* ZBUFFds_flush follows */
+                }
+            }
+	    /* fall-through */
+        case ZBUFFds_flush:
+            {
+                size_t toFlushSize = zbc->outEnd - zbc->outStart;
+                size_t flushedSize = ZBUFF_limitCopy(op, oend-op, zbc->outBuff + zbc->outStart, toFlushSize);
+                op += flushedSize;
+                zbc->outStart += flushedSize;
+                if (flushedSize == toFlushSize)
+                {
+                    zbc->stage = ZBUFFds_read;
+                    if (zbc->outStart + BLOCKSIZE > zbc->outBuffSize)
+                        zbc->outStart = zbc->outEnd = 0;
+                    break;
+                }
+                /* cannot flush everything */
+                notDone = 0;
+                break;
+            }
+        default: return ERROR(GENERIC);   /* impossible */
+        }
+    }
+
+    *srcSizePtr = ip-istart;
+    *maxDstSizePtr = op-ostart;
+
+    {
+        size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zbc->zc);
+        if (nextSrcSizeHint > 3) nextSrcSizeHint+= 3;   /* get the next block header while at it */
+        nextSrcSizeHint -= zbc->inPos;   /* already loaded*/
+        return nextSrcSizeHint;
+    }
+}
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+unsigned ZBUFFv04_isError(size_t errorCode) { return ERR_isError(errorCode); }   /* public wrapper : forwards to ERR_isError() */
+const char* ZBUFFv04_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }   /* public wrapper : forwards to ERR_getErrorName() */
+/* Recommended buffer sizes : input = one block + the 3 extra bytes used to fetch the next block header, output = one block. Declared (void) to match the prototypes in zstd_v04.h. */
+size_t ZBUFFv04_recommendedDInSize(void)  { return BLOCKSIZE + 3; }
+size_t ZBUFFv04_recommendedDOutSize(void) { return BLOCKSIZE; }
+
+
+
+/*- ========================================================================= -*/
+
+/* final wrapping stage */
+/* ZSTDv04_decompressDCtx() : one-shot decompression with a caller-provided context and no dictionary (NULL, 0). */
+size_t ZSTDv04_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return ZSTD_decompress_usingDict(dctx, dst, maxDstSize, src, srcSize, NULL, 0);
+}
+/* ZSTDv04_decompress() : one-shot decompression with a temporary context, heap-allocated when ZSTD_HEAPMODE==1 and on the stack otherwise. */
+size_t ZSTDv04_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE==1)
+    size_t regenSize;
+    ZSTD_DCtx* dctx = ZSTD_createDCtx();
+    if (dctx==NULL) return ERROR(memory_allocation);
+    regenSize = ZSTDv04_decompressDCtx(dctx, dst, maxDstSize, src, srcSize);
+    ZSTD_freeDCtx(dctx);   /* released whether decompression succeeded or returned an error code */
+    return regenSize;
+#else
+    ZSTD_DCtx dctx;   /* NOTE(review): not initialized here; presumably reset inside ZSTD_decompress_usingDict() - confirm */
+    return ZSTDv04_decompressDCtx(&dctx, dst, maxDstSize, src, srcSize);
+#endif
+}
+/* ZSTDv04_findFrameCompressedSize() : public name for the internal ZSTD_findFrameCompressedSize(). */
+size_t ZSTDv04_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    return ZSTD_findFrameCompressedSize(src, srcSize);
+}
+/* ZSTDv04_resetDCtx() : public name for the internal ZSTD_resetDCtx(). */
+size_t ZSTDv04_resetDCtx(ZSTDv04_Dctx* dctx) { return ZSTD_resetDCtx(dctx); }
+/* ZSTDv04_nextSrcSizeToDecompress() : public name for the internal ZSTD_nextSrcSizeToDecompress(). */
+size_t ZSTDv04_nextSrcSizeToDecompress(ZSTDv04_Dctx* dctx)
+{
+    return ZSTD_nextSrcSizeToDecompress(dctx);
+}
+/* ZSTDv04_decompressContinue() : public name for the internal (bufferless) ZSTD_decompressContinue(). */
+size_t ZSTDv04_decompressContinue(ZSTDv04_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return ZSTD_decompressContinue(dctx, dst, maxDstSize, src, srcSize);
+}
+
+
+/* Public ZBUFFv04_* entry points : each one forwards to the file-local ZBUFF_* implementation. */
+ZBUFFv04_DCtx* ZBUFFv04_createDCtx(void) { return ZBUFF_createDCtx(); }
+size_t ZBUFFv04_freeDCtx(ZBUFFv04_DCtx* dctx) { return ZBUFF_freeDCtx(dctx); }
+/* Start a new frame, then optionally declare a dictionary : forwards to ZBUFF_decompressInit() / ZBUFF_decompressWithDictionary(). */
+size_t ZBUFFv04_decompressInit(ZBUFFv04_DCtx* dctx) { return ZBUFF_decompressInit(dctx); }
+size_t ZBUFFv04_decompressWithDictionary(ZBUFFv04_DCtx* dctx, const void* src, size_t srcSize)
+{ return ZBUFF_decompressWithDictionary(dctx, src, srcSize); }
+/* ZBUFFv04_decompressContinue() : logs the call at debug level 5, then forwards to ZBUFF_decompressContinue(). */
+size_t ZBUFFv04_decompressContinue(ZBUFFv04_DCtx* dctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr)
+{
+    DEBUGLOG(5, "ZBUFFv04_decompressContinue");
+    return ZBUFF_decompressContinue(dctx, dst, maxDstSizePtr, src, srcSizePtr);
+}
+/* Public create / free for the bufferless context : forward to ZSTD_createDCtx() / ZSTD_freeDCtx(). */
+ZSTD_DCtx* ZSTDv04_createDCtx(void) { return ZSTD_createDCtx(); }
+size_t ZSTDv04_freeDCtx(ZSTD_DCtx* dctx) { return ZSTD_freeDCtx(dctx); }
+/* ZSTDv04_getFrameParams() : public name for the internal ZSTD_getFrameParams(). */
+size_t ZSTDv04_getFrameParams(ZSTD_parameters* params, const void* src, size_t srcSize)
+{
+    return ZSTD_getFrameParams(params, src, srcSize);
+}
diff --git a/deps/SZ/zstd/legacy/zstd_v04.h b/deps/SZ/zstd/legacy/zstd_v04.h
new file mode 100644
index 0000000000000000000000000000000000000000..6391631fc4327021188e7acf05973ebc9a33cfdb
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v04.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V04_H_91868324769238
+#define ZSTD_V04_H_91868324769238
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stddef.h>   /* size_t */
+
+
+/* *************************************
+*  Simple one-step function
+***************************************/
+/**
+ZSTDv04_decompress() : decompress ZSTD frames compliant with v0.4.x format
+    compressedSize : is the exact source size
+    maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+                      It must be equal or larger than originalSize, otherwise decompression will fail.
+    return : the number of bytes decompressed into destination buffer (originalSize)
+             or an errorCode if it fails (which can be tested using ZSTDv04_isError())
+*/
+size_t ZSTDv04_decompress( void* dst, size_t maxOriginalSize,
+                     const void* src, size_t compressedSize);
+
+/**
+ZSTDv04_findFrameCompressedSize() : get the source length of a ZSTD frame compliant with v0.4.x format
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv04_isError())
+*/
+size_t ZSTDv04_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+/**
+ZSTDv04_isError() : tells if the result of ZSTDv04_decompress() is an error
+*/
+unsigned ZSTDv04_isError(size_t code);
+
+
+/* *************************************
+*  Advanced functions
+***************************************/
+typedef struct ZSTDv04_Dctx_s ZSTDv04_Dctx;   /* opaque here : only a forward declaration is exposed */
+ZSTDv04_Dctx* ZSTDv04_createDCtx(void);
+size_t ZSTDv04_freeDCtx(ZSTDv04_Dctx* dctx);
+
+size_t ZSTDv04_decompressDCtx(ZSTDv04_Dctx* dctx,
+                              void* dst, size_t maxOriginalSize,
+                        const void* src, size_t compressedSize);   /* like ZSTDv04_decompress(), but with a caller-provided context */
+
+
+/* *************************************
+*  Direct Streaming
+***************************************/
+size_t ZSTDv04_resetDCtx(ZSTDv04_Dctx* dctx);
+
+size_t ZSTDv04_nextSrcSizeToDecompress(ZSTDv04_Dctx* dctx);
+size_t ZSTDv04_decompressContinue(ZSTDv04_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+  Use the above functions alternately.
+  ZSTDv04_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTDv04_decompressContinue().
+  ZSTDv04_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block.
+  Result is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTDv04_decompressContinue() has decoded some header.
+*/
+
+
+/* *************************************
+*  Buffered Streaming
+***************************************/
+typedef struct ZBUFFv04_DCtx_s ZBUFFv04_DCtx;
+ZBUFFv04_DCtx* ZBUFFv04_createDCtx(void);
+size_t         ZBUFFv04_freeDCtx(ZBUFFv04_DCtx* dctx);
+
+size_t ZBUFFv04_decompressInit(ZBUFFv04_DCtx* dctx);
+size_t ZBUFFv04_decompressWithDictionary(ZBUFFv04_DCtx* dctx, const void* dict, size_t dictSize);
+
+size_t ZBUFFv04_decompressContinue(ZBUFFv04_DCtx* dctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr);
+
+/** ************************************************
+*  Streaming decompression
+*
+*  A ZBUFF_DCtx object is required to track streaming operation.
+*  Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources.
+*  Use ZBUFF_decompressInit() to start a new decompression operation.
+*  ZBUFF_DCtx objects can be reused multiple times.
+*
+*  Optionally, a reference to a static dictionary can be set, using ZBUFF_decompressWithDictionary()
+*  It must be the same content as the one set during compression phase.
+*  Dictionary content must remain accessible during the decompression process.
+*
+*  Use ZBUFF_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *maxDstSizePtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*            or 0 when a frame is completely decoded
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize / ZBUFF_recommendedDOutSize
+*  output : ZBUFF_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when it's decoded.
+*  input : ZBUFF_recommendedDInSize==128Kb+3; just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* **************************************************/
+unsigned ZBUFFv04_isError(size_t errorCode);
+const char* ZBUFFv04_getErrorName(size_t errorCode);
+
+
+/** The below functions provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are not compulsory, they just tend to offer better latency */
+size_t ZBUFFv04_recommendedDInSize(void);
+size_t ZBUFFv04_recommendedDOutSize(void);
+
+
+/* *************************************
+*  Prefix - version detection
+***************************************/
+#define ZSTDv04_magicNumber 0xFD2FB524   /* v0.4 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V04_H_91868324769238 */
diff --git a/deps/SZ/zstd/legacy/zstd_v05.c b/deps/SZ/zstd/legacy/zstd_v05.c
new file mode 100644
index 0000000000000000000000000000000000000000..a5e1b1ffc8ceda9287bbca7f13e8e2b94733d55d
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v05.c
@@ -0,0 +1,4011 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/*- Dependencies -*/
+#include "zstd_v05.h"
+#include "error_private.h"
+
+
+/* ******************************************************************
+   mem.h
+   low-level memory access routines
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSEv05 source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include <string.h>    /* memcpy */
+
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(__GNUC__)
+#  define MEM_STATIC static __attribute__((unused))   /* GNU C : mark helpers as possibly unused */
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline   /* C++ or C99 : standard inline */
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline   /* MSVC spelling of inline */
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# include <stdint.h>   /* exact-width integer types */
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+#else   /* no <stdint.h> : fall back on native types; assumes char/short/int/long long are 8/16/32/64 bits wide */
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/*-**************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ *            It can generate buggy code on targets depending on alignment.
+ *            In some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2   /* GNU C on ARMv6 : direct access */
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define MEM_FORCE_MEMORY_ACCESS 1   /* icc (non-Windows) or GNU C on ARMv7 : packed-union access */
+#  endif   /* otherwise left undefined : the memcpy()-based default is used */
+#endif
+/* Pointer-width probes : each returns 1 when sizeof(void*) matches (4 or 8 bytes), 0 otherwise. */
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(void*)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(void*)==8; }
+/* MEM_isLittleEndian() : returns 1 when the first byte in memory of the U32 value 1 is 1 (little-endian host), 0 otherwise. */
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* violates C standard, by lying on structure alignment.
+Only use if no other choice to achieve best performance on target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign;   /* every access below goes through a member of this packed union */
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign*)memPtr)->u64 = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;   /* byte copy into a local : value is in host byte order */
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(value));   /* byte copy of the by-value parameter, host byte order */
+}
+
+MEM_STATIC void MEM_write32(void* memPtr, U32 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write64(void* memPtr, U64 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+/* MEM_readLE16() : read a 16-bit little-endian value stored at memPtr, whatever the host byte order. */
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (!MEM_isLittleEndian()) {
+        /* host is not little-endian : rebuild the value from its bytes, least significant first */
+        const BYTE* const bytes = (const BYTE*)memPtr;
+        return (U16)(bytes[0] + (bytes[1]<<8));
+    }
+    return MEM_read16(memPtr);   /* little-endian host : the native read is already in the right order */
+}
+/* MEM_writeLE16() : store val at memPtr as a 16-bit little-endian value, whatever the host byte order. */
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    BYTE* const out = (BYTE*)memPtr;
+    if (MEM_isLittleEndian()) {
+        MEM_write16(memPtr, val);   /* native order is already little-endian */
+        return;
+    }
+    out[0] = (BYTE)val;        /* least significant byte first */
+    out[1] = (BYTE)(val>>8);   /* then the most significant byte */
+}
+/* MEM_readLE32() : read a 32-bit little-endian value stored at memPtr, whatever the host byte order. */
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    if (!MEM_isLittleEndian()) {
+        const BYTE* const bytes = (const BYTE*)memPtr;
+        return (U32)((U32)bytes[0] + ((U32)bytes[1]<<8) + ((U32)bytes[2]<<16) + ((U32)bytes[3]<<24));
+    }
+    /* little-endian host : the native read is already in the right order */
+    return MEM_read32(memPtr);
+}
+
+/* MEM_readLE64() : read a 64-bit little-endian value stored at memPtr, whatever the host byte order. */
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    if (!MEM_isLittleEndian()) {
+        const BYTE* const bytes = (const BYTE*)memPtr;
+        U64 const low  = (U64)bytes[0] + ((U64)bytes[1]<<8) + ((U64)bytes[2]<<16) + ((U64)bytes[3]<<24);
+        U64 const high = ((U64)bytes[4]<<32) + ((U64)bytes[5]<<40) + ((U64)bytes[6]<<48) + ((U64)bytes[7]<<56);
+        return (U64)(low + high);
+    }
+    return MEM_read64(memPtr);   /* little-endian host : the native read is already in the right order */
+}
+
+/* MEM_readLEST() : read a little-endian value of 4 bytes when MEM_32bits() is true, 8 bytes otherwise, returned as size_t. */
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    /* the reader is chosen from the pointer width reported by MEM_32bits() */
+    return MEM_32bits()
+        ? (size_t)MEM_readLE32(memPtr)
+        : (size_t)MEM_readLE64(memPtr);
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+
+/*
+    zstd - standard compression library
+    Header File for static linking only
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net
+*/
+#ifndef ZSTD_STATIC_H
+#define ZSTD_STATIC_H
+
+/* The prototypes defined within this file are considered experimental.
+ * They should not be used in the context of a DLL as they may change in the future.
+ * Prefer static linking if you need them, to control breaking version changes issues.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/*-*************************************
+*  Types
+***************************************/
+#define ZSTDv05_WINDOWLOG_ABSOLUTEMIN 11
+
+
+/*-*************************************
+*  Advanced functions
+***************************************/
+/*- Advanced Decompression functions -*/
+
+/*! ZSTDv05_decompress_usingPreparedDCtx() :
+*   Same as ZSTDv05_decompress_usingDict, but using a reference context `preparedDCtx`, where dictionary has been loaded.
+*   It avoids reloading the dictionary each time.
+*   `preparedDCtx` must have been properly initialized using ZSTDv05_decompressBegin_usingDict().
+*   Requires 2 contexts : 1 for reference, which will not be modified, and 1 to run the decompression operation */
+size_t ZSTDv05_decompress_usingPreparedDCtx(
+                                             ZSTDv05_DCtx* dctx, const ZSTDv05_DCtx* preparedDCtx,
+                                             void* dst, size_t dstCapacity,
+                                       const void* src, size_t srcSize);
+
+
+/* **************************************
+*  Streaming functions (direct mode)
+****************************************/
+size_t ZSTDv05_decompressBegin(ZSTDv05_DCtx* dctx);
+
+/*
+  Streaming decompression, direct mode (bufferless)
+
+  A ZSTDv05_DCtx object is required to track streaming operations.
+  Use ZSTDv05_createDCtx() / ZSTDv05_freeDCtx() to manage it.
+  A ZSTDv05_DCtx object can be re-used multiple times.
+
+  First typical operation is to retrieve frame parameters, using ZSTDv05_getFrameParams().
+  This operation is independent, and just needs enough input data to properly decode the frame header.
+  Objective is to retrieve *params.windowlog, to know minimum amount of memory required during decoding.
+  Result : 0 when successful, it means the ZSTDv05_parameters structure has been filled.
+           >0 : means there is not enough data into src. Provides the expected size to successfully decode header.
+           errorCode, which can be tested using ZSTDv05_isError()
+
+  Start decompression, with ZSTDv05_decompressBegin() or ZSTDv05_decompressBegin_usingDict()
+  Alternatively, you can copy a prepared context, using ZSTDv05_copyDCtx()
+
+  Then use ZSTDv05_nextSrcSizeToDecompress() and ZSTDv05_decompressContinue() alternately.
+  ZSTDv05_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTDv05_decompressContinue().
+  ZSTDv05_decompressContinue() requires this exact amount of bytes, or it will fail.
+  ZSTDv05_decompressContinue() needs previous data blocks during decompression, up to (1 << windowlog).
+  They should preferably be located contiguously, prior to current block. Alternatively, a round buffer is also possible.
+
+  @result of ZSTDv05_decompressContinue() is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTDv05_decompressContinue() has decoded some header.
+
+  A frame is fully decoded when ZSTDv05_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+*/
+
+
+/* **************************************
+*  Block functions
+****************************************/
+/*! Block functions produce and decode raw zstd blocks, without frame metadata.
+    The user is responsible for tracking the information required to regenerate data, such as block sizes.
+
+    A few rules to respect :
+    - Uncompressed block size must be <= 128 KB
+    - Compressing or decompressing requires a context structure
+      + Use ZSTDv05_createCCtx() and ZSTDv05_createDCtx()
+    - It is necessary to init context before starting
+      + compression : ZSTDv05_compressBegin()
+      + decompression : ZSTDv05_decompressBegin()
+      + variants _usingDict() are also allowed
+      + copyCCtx() and copyDCtx() work too
+    - When a block is considered not compressible enough, ZSTDv05_compressBlock() result will be zero.
+      In which case, nothing is produced into `dst`.
+      + User must test for such outcome and deal directly with uncompressed data
+      + ZSTDv05_decompressBlock() doesn't accept uncompressed data as input !!
+*/
+
+size_t ZSTDv05_decompressBlock(ZSTDv05_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv05_STATIC_H */
+
+
+/*
+    zstd_internal - common functions to include
+    Header File for include
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+*/
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+
+
+/*-*************************************
+*  Common macros
+***************************************/
+#define MIN(a,b) ((a)<(b) ? (a) : (b))   /* smaller of a and b; an argument may be evaluated twice, so avoid side effects */
+#define MAX(a,b) ((a)>(b) ? (a) : (b))   /* larger of a and b; same double-evaluation caveat */
+
+
+/*-*************************************
+*  Common constants
+***************************************/
+#define ZSTDv05_DICT_MAGIC  0xEC30A435
+
+#define KB *(1 <<10)   /* postfix multiplier : `128 KB` expands to `128 *(1 <<10)` */
+#define MB *(1 <<20)   /* same idea, times 2^20 */
+#define GB *(1U<<30)   /* same idea, times 2^30; unsigned literal, unlike KB and MB */
+
+#define BLOCKSIZE (128 KB)                 /* == 131072; define, for static allocation */
+
+static const size_t ZSTDv05_blockHeaderSize = 3;
+static const size_t ZSTDv05_frameHeaderSize_min = 5;
+#define ZSTDv05_frameHeaderSize_max 5         /* define, for static allocation */
+
+#define BITv057 128
+#define BITv056  64
+#define BITv055  32
+#define BITv054  16
+#define BITv051   2
+#define BITv050   1
+
+#define IS_HUFv05 0
+#define IS_PCH 1
+#define IS_RAW 2
+#define IS_RLE 3
+
+#define MINMATCH 4
+#define REPCODE_STARTVALUE 1
+
+#define Litbits  8
+#define MLbits   7
+#define LLbits   6
+#define Offbits  5
+#define MaxLit ((1<<Litbits) - 1)   /* == 255 */
+#define MaxML  ((1<<MLbits) - 1)   /* == 127 */
+#define MaxLL  ((1<<LLbits) - 1)   /* == 63 */
+#define MaxOff ((1<<Offbits)- 1)   /* == 31 */
+#define MLFSEv05Log   10
+#define LLFSEv05Log   10
+#define OffFSEv05Log   9
+#define MaxSeq MAX(MaxLL, MaxML)   /* == 127, i.e. MaxML */
+
+#define FSEv05_ENCODING_RAW     0
+#define FSEv05_ENCODING_RLE     1
+#define FSEv05_ENCODING_STATIC  2
+#define FSEv05_ENCODING_DYNAMIC 3
+
+
+#define HufLog 12
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+
+#define WILDCOPY_OVERLENGTH 8
+
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
+
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+static void ZSTDv05_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }   /* copies exactly 8 bytes from src to dst */
+/* COPY8 : copies 8 bytes from s to d, then advances both by 8; d and s must be modifiable pointer lvalues (BYTE* at the call site in ZSTDv05_wildcopy) */
+#define COPY8(d,s) { ZSTDv05_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTDv05_wildcopy() :
+*   memcpy() variant that always copies whole 8-byte chunks : it can copy up to 7 bytes too many (8 bytes if length==0) */
+MEM_STATIC void ZSTDv05_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* from = (const BYTE*)src;
+    BYTE* to = (BYTE*)dst;
+    BYTE* const limit = to + length;
+    do {   /* the first chunk is copied before the bound is tested */
+        ZSTDv05_copy8(to, from); to += 8; from += 8;
+    } while (to < limit);
+}
+
+
+/*-*******************************************
+*  Private interfaces
+*********************************************/
+typedef struct {   /* NOTE(review): fields come in <x>Start / <x> pairs, presumably buffer base + current cursor - confirm against the users of this type */
+    void* buffer;
+    U32*  offsetStart;
+    U32*  offset;
+    BYTE* offCodeStart;
+    BYTE* offCode;
+    BYTE* litStart;
+    BYTE* lit;
+    BYTE* litLengthStart;
+    BYTE* litLength;
+    BYTE* matchLengthStart;
+    BYTE* matchLength;
+    BYTE* dumpsStart;
+    BYTE* dumps;
+    /* opt : frequency tables and their sums (NOTE(review): presumably statistics for an optimal parser - confirm) */
+    U32* matchLengthFreq;
+    U32* litLengthFreq;
+    U32* litFreq;
+    U32* offCodeFreq;
+    U32  matchLengthSum;
+    U32  litLengthSum;
+    U32  litSum;
+    U32  offCodeSum;
+} seqStore_t;
+
+
+
+#endif   /* ZSTD_CCOMMON_H_MODULE */
+/* ******************************************************************
+   FSEv05 : Finite State Entropy coder
+   header file
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef FSEv05_H
+#define FSEv05_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* *****************************************
+*  Includes
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+
+
+/*-****************************************
+*  FSEv05 simple functions
+******************************************/
+size_t FSEv05_decompress(void* dst,  size_t maxDstSize,
+                const void* cSrc, size_t cSrcSize);
+/*!
+FSEv05_decompress():
+    Decompress FSEv05 data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'maxDstSize'.
+    return : size of regenerated data (<= maxDstSize)
+             or an error code, which can be tested using FSEv05_isError()
+
+    ** Important ** : FSEv05_decompress() doesn't decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+
+
+/* *****************************************
+*  Tool functions
+******************************************/
+/* Error Management */
+unsigned    FSEv05_isError(size_t code);        /* tells if a return value is an error code */
+const char* FSEv05_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
+
+
+
+/* *****************************************
+*  FSEv05 detailed API
+******************************************/
+/* *** DECOMPRESSION *** */
+
+/*!
+FSEv05_readNCount():
+   Read compactly saved 'normalizedCounter' from 'rBuffer'.
+   return : size read from 'rBuffer'
+            or an errorCode, which can be tested using FSEv05_isError()
+            maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+size_t FSEv05_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
+
+/*!
+Constructor and Destructor of type FSEv05_DTable
+    Note that its size depends on 'tableLog' */
+typedef unsigned FSEv05_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+FSEv05_DTable* FSEv05_createDTable(unsigned tableLog);
+void        FSEv05_freeDTable(FSEv05_DTable* dt);
+
+/*!
+FSEv05_buildDTable():
+   Builds 'dt', which must be already allocated, using FSEv05_createDTable()
+   @return : 0,
+             or an errorCode, which can be tested using FSEv05_isError() */
+size_t FSEv05_buildDTable (FSEv05_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*!
+FSEv05_decompress_usingDTable():
+   Decompress compressed source @cSrc of size @cSrcSize using `dt`
+   into `dst` which must be already allocated.
+   @return : size of regenerated data (necessarily <= @dstCapacity)
+             or an errorCode, which can be tested using FSEv05_isError() */
+size_t FSEv05_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSEv05_DTable* dt);
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSEv05_H */
+/* ******************************************************************
+   bitstream
+   Part of FSEv05 library
+   header file (to include)
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef BITv05STREAM_H_MODULE
+#define BITv05STREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+*  This API consists of small unitary functions, which highly benefit from being inlined.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+
+
+/*-********************************************
+*  bitStream decoding API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;   /* word of stream bits most recently loaded from `ptr` */
+    unsigned bitsConsumed;   /* number of bits of bitContainer already read (or counted as padding) */
+    const char* ptr;         /* position bitContainer was loaded from; moves toward `start` on reload */
+    const char* start;       /* first byte of the source buffer */
+} BITv05_DStream_t;
+
+typedef enum { BITv05_DStream_unfinished = 0,    /* container refilled; the step back was not clamped by the buffer start */
+               BITv05_DStream_endOfBuffer = 1,   /* read position is at (or was clamped to) the buffer start; unread bits remain */
+               BITv05_DStream_completed = 2,     /* read position is at the buffer start and every bit is consumed */
+               BITv05_DStream_overflow = 3 } BITv05_DStream_status;  /* more bits consumed than the container holds; result of BITv05_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BITv05_initDStream(BITv05_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BITv05_readBits(BITv05_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BITv05_DStream_status BITv05_reloadDStream(BITv05_DStream_t* bitD);
+MEM_STATIC unsigned BITv05_endOfDStream(const BITv05_DStream_t* bitD);
+
+
+/*-****************************************
+*  unsafe API
+******************************************/
+MEM_STATIC size_t BITv05_readBitsFast(BITv05_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/*-**************************************************************
+*  Helper functions
+****************************************************************/
+MEM_STATIC unsigned BITv05_highbit32 (U32 val)
+{   /* index (0 = least significant) of the highest set bit of val; val must not be 0 on the GCC path, where __builtin_clz(0) is undefined */
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    _BitScanReverse ( &r, val );
+    return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+    return 31 - __builtin_clz (val);
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    unsigned r;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    r = DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+    return r;
+#   endif
+}
+
+
+
+/*-********************************************************
+* bitStream decoding
+**********************************************************/
+/*!BITv05_initDStream
+*  Initialize a BITv05_DStream_t.
+*  @bitD : a pointer to an already allocated BITv05_DStream_t structure
+*  @srcBuffer must point at the beginning of a bitStream
+*  @srcSize must be the exact size of the bitStream
+*  @result : size of stream (== srcSize) or an errorCode if a problem is detected
+*/
+MEM_STATIC size_t BITv05_initDStream(BITv05_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    if (srcSize >=  sizeof(size_t)) {  /* normal case */
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);   /* the container is loaded from the last sizeof(size_t) bytes */
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];   /* last byte of the stream, which carries the end mark */
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BITv05_highbit32(contain32);   /* skip the zero padding above the end mark, and the mark bit itself */
+    } else {   /* fewer bytes than one container : assemble it byte by byte, byte 0 in the lowest bits */
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);/* fall-through */
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);/* fall-through */
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);/* fall-through */
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24; /* fall-through */
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16; /* fall-through */
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8; /* fall-through */
+            default: break;
+        }
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BITv05_highbit32(contain32);
+        bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8;   /* the container's missing high bytes count as consumed */
+    }
+
+    return srcSize;
+}
+
+MEM_STATIC size_t BITv05_lookBits(BITv05_DStream_t* bitD, U32 nbBits)
+{   /* returns the next nbBits unread bits without consuming them */
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask);   /* right shift split in two so each shift count stays below the container width, even for nbBits==0 */
+}
+
+/*! BITv05_lookBitsFast : unsafe variant of BITv05_lookBits(); the result is only correct if nbBits >= 1 */
+MEM_STATIC size_t BITv05_lookBitsFast(BITv05_DStream_t* bitD, U32 nbBits)
+{
+    const U32 topBit = sizeof(bitD->bitContainer)*8 - 1;
+    size_t const aligned = bitD->bitContainer << (bitD->bitsConsumed & topBit);   /* unread bits now start at the most significant bit */
+    return aligned >> (((topBit+1)-nbBits) & topBit);
+}
+
+MEM_STATIC void BITv05_skipBits(BITv05_DStream_t* bitD, U32 nbBits)
+{   /* marks nbBits more bits as consumed; no bounds check and no reload happens here */
+    bitD->bitsConsumed = bitD->bitsConsumed + nbBits;
+}
+
+MEM_STATIC size_t BITv05_readBits(BITv05_DStream_t* bitD, U32 nbBits)
+{   /* peek-then-advance : returns the next nbBits bits and marks them consumed */
+    size_t const bits = BITv05_lookBits(bitD, nbBits);
+    bitD->bitsConsumed += nbBits;   /* same effect as BITv05_skipBits(bitD, nbBits) */
+    return bits;
+}
+
+/*! BITv05_readBitsFast :
+*   same as BITv05_readBits() but built on BITv05_lookBitsFast(); only works if nbBits >= 1 */
+MEM_STATIC size_t BITv05_readBitsFast(BITv05_DStream_t* bitD, U32 nbBits)
+{
+    size_t const bits = BITv05_lookBitsFast(bitD, nbBits);
+    bitD->bitsConsumed += nbBits;   /* same effect as BITv05_skipBits(bitD, nbBits) */
+    return bits;
+}
+
+MEM_STATIC BITv05_DStream_status BITv05_reloadDStream(BITv05_DStream_t* bitD)
+{   /* refills bitContainer by moving ptr back over the fully consumed bytes, never going below start */
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BITv05_DStream_overflow;
+
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) {   /* enough room before ptr : the step back cannot cross start */
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;   /* only the bits of the partially consumed byte stay marked */
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BITv05_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start) {   /* nothing left to load : report whether unread bits remain */
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BITv05_DStream_endOfBuffer;
+        return BITv05_DStream_completed;
+    }
+    {
+        U32 nbBytes = bitD->bitsConsumed >> 3;
+        BITv05_DStream_status result = BITv05_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start) {   /* clamp the step so that ptr stops exactly at start */
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BITv05_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+/*! BITv05_endOfDStream
+*   @return 1 when the read position is back at the buffer start and every bit of the container is consumed, 0 otherwise
+*/
+MEM_STATIC unsigned BITv05_endOfDStream(const BITv05_DStream_t* DStream)
+{
+    return (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8) && (DStream->ptr == DStream->start);
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITv05STREAM_H_MODULE */
+/* ******************************************************************
+   FSEv05 : Finite State Entropy coder
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef FSEv05_STATIC_H
+#define FSEv05_STATIC_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* An FSEv05 DTable can be statically allocated as an array of unsigned, sized with the macro below (1 header cell + 2^maxTableLog decode cells) */
+#define FSEv05_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<(maxTableLog)))
+
+
+/* *****************************************
+*  FSEv05 advanced API
+*******************************************/
+size_t FSEv05_buildDTable_raw (FSEv05_DTable* dt, unsigned nbBits);
+/* build a fake FSEv05_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits */
+
+size_t FSEv05_buildDTable_rle (FSEv05_DTable* dt, unsigned char symbolValue);
+/* build a fake FSEv05_DTable, designed to always generate the same symbolValue */
+
+
+
+/* *****************************************
+*  FSEv05 symbol decompression API
+*******************************************/
+typedef struct
+{
+    size_t      state;   /* index of the current cell in the decode table */
+    const void* table;   /* decode cells (dt + 1, as set by FSEv05_initDState); precise table may vary, depending on U16 */
+} FSEv05_DState_t;
+
+
+static void     FSEv05_initDState(FSEv05_DState_t* DStatePtr, BITv05_DStream_t* bitD, const FSEv05_DTable* dt);
+
+static unsigned char FSEv05_decodeSymbol(FSEv05_DState_t* DStatePtr, BITv05_DStream_t* bitD);
+
+static unsigned FSEv05_endOfDState(const FSEv05_DState_t* DStatePtr);
+
+
+
+/* *****************************************
+*  FSEv05 unsafe API
+*******************************************/
+static unsigned char FSEv05_decodeSymbolFast(FSEv05_DState_t* DStatePtr, BITv05_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+/* decompression */
+
+typedef struct {
+    U16 tableLog;   /* number of bits read from the stream to form the initial state (see FSEv05_initDState) */
+    U16 fastMode;   /* NOTE(review): presumably tells callers whether the *Fast decode variants are safe - confirm */
+} FSEv05_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;   /* base of the next state; the freshly read bits are added to it */
+    unsigned char  symbol;     /* symbol emitted while in this state */
+    unsigned char  nbBits;     /* number of bits to read from the stream to reach the next state */
+} FSEv05_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSEv05_initDState(FSEv05_DState_t* DStatePtr, BITv05_DStream_t* bitD, const FSEv05_DTable* dt)
+{   /* the first cell of `dt` is the header; the initial state is read from the stream on tableLog bits */
+    const void* const raw = dt;
+    const FSEv05_DTableHeader* const header = (const FSEv05_DTableHeader*)raw;
+    DStatePtr->table = dt + 1;   /* decode cells start right after the header cell */
+    DStatePtr->state = BITv05_readBits(bitD, header->tableLog);
+    BITv05_reloadDStream(bitD);
+}
+
+MEM_STATIC BYTE FSEv05_peakSymbol(FSEv05_DState_t* DStatePtr)
+{   /* returns the symbol of the current state without reading bits or changing the state */
+    const FSEv05_decode_t* const cells = (const FSEv05_decode_t*)(DStatePtr->table);
+    return cells[DStatePtr->state].symbol;
+}
+
+MEM_STATIC BYTE FSEv05_decodeSymbol(FSEv05_DState_t* DStatePtr, BITv05_DStream_t* bitD)
+{   /* emits the symbol of the current state, then moves to the next state using nbBits fresh bits */
+    const FSEv05_decode_t* const cells = (const FSEv05_decode_t*)(DStatePtr->table);
+    const FSEv05_decode_t cell = cells[DStatePtr->state];   /* copied by value before the state is overwritten */
+    BYTE const emitted = cell.symbol;
+    size_t const extra = BITv05_readBits(bitD, (U32)cell.nbBits);
+
+    DStatePtr->state = cell.newState + extra;
+    return emitted;
+}
+
+MEM_STATIC BYTE FSEv05_decodeSymbolFast(FSEv05_DState_t* DStatePtr, BITv05_DStream_t* bitD)
+{   /* same as FSEv05_decodeSymbol() but reads with BITv05_readBitsFast(), so every cell must have nbBits >= 1 */
+    const FSEv05_decode_t* const cells = (const FSEv05_decode_t*)(DStatePtr->table);
+    const FSEv05_decode_t cell = cells[DStatePtr->state];   /* copied by value before the state is overwritten */
+    BYTE const emitted = cell.symbol;
+    size_t const extra = BITv05_readBitsFast(bitD, (U32)cell.nbBits);
+
+    DStatePtr->state = cell.newState + extra;
+    return emitted;
+}
+
+MEM_STATIC unsigned FSEv05_endOfDState(const FSEv05_DState_t* DStatePtr)
+{   /* 1 when the current state is 0, otherwise 0 */
+    return (DStatePtr->state != 0) ? 0 : 1;
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSEv05_STATIC_H */
+/* ******************************************************************
+   FSEv05 : Finite State Entropy coder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSEv05 source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+#ifndef FSEv05_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#define FSEv05_MAX_MEMORY_USAGE 14
+#define FSEv05_DEFAULT_MEMORY_USAGE 13
+
+/*!FSEv05_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#define FSEv05_MAX_SYMBOL_VALUE 255
+
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSEv05_FUNCTION_TYPE BYTE
+#define FSEv05_FUNCTION_EXTENSION
+#define FSEv05_DECODE_TYPE FSEv05_decode_t
+
+
+#endif   /* !FSEv05_COMMONDEFS_ONLY */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSEv05_MAX_TABLELOG  (FSEv05_MAX_MEMORY_USAGE-2)   /* == 12 with FSEv05_MAX_MEMORY_USAGE at 14 */
+#define FSEv05_MAX_TABLESIZE (1U<<FSEv05_MAX_TABLELOG)   /* == 4096 */
+#define FSEv05_MAXTABLESIZE_MASK (FSEv05_MAX_TABLESIZE-1)   /* == 4095 */
+#define FSEv05_DEFAULT_TABLELOG (FSEv05_DEFAULT_MEMORY_USAGE-2)   /* == 11 with FSEv05_DEFAULT_MEMORY_USAGE at 13 */
+#define FSEv05_MIN_TABLELOG 5
+
+#define FSEv05_TABLELOG_ABSOLUTE_MAX 15
+#if FSEv05_MAX_TABLELOG > FSEv05_TABLELOG_ABSOLUTE_MAX
+#error "FSEv05_MAX_TABLELOG > FSEv05_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSEv05_STATIC_ASSERT(c) { enum { FSEv05_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Complex types
+****************************************************************/
+typedef U32 DTable_max_t[FSEv05_DTABLE_SIZE_U32(FSEv05_MAX_TABLELOG)];   /* 1 + 2^FSEv05_MAX_TABLELOG cells : a table large enough for the biggest supported tableLog */
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSEv05_FUNCTION_EXTENSION
+#  error "FSEv05_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSEv05_FUNCTION_TYPE
+#  error "FSEv05_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSEv05_CAT(X,Y) X##Y
+#define FSEv05_FUNCTION_NAME(X,Y) FSEv05_CAT(X,Y)
+#define FSEv05_TYPE_NAME(X,Y) FSEv05_CAT(X,Y)
+
+
+/* Function templates */
+static U32 FSEv05_tableStep(U32 tableSize) { U32 const half = tableSize>>1; U32 const eighth = tableSize>>3; return half + eighth + 3; }   /* stride used when spreading symbols over the table */
+
+
+
+FSEv05_DTable* FSEv05_createDTable (unsigned tableLog)
+{   /* heap-allocates a table for tableLog (clamped to FSEv05_TABLELOG_ABSOLUTE_MAX); returns whatever malloc() returns, NULL included */
+    unsigned const cappedLog = (tableLog > FSEv05_TABLELOG_ABSOLUTE_MAX) ? FSEv05_TABLELOG_ABSOLUTE_MAX : tableLog;
+    return (FSEv05_DTable*)malloc( FSEv05_DTABLE_SIZE_U32(cappedLog) * sizeof (U32) );
+}
+
+void FSEv05_freeDTable (FSEv05_DTable* dt)
+{   /* releases a table with free(); counterpart of FSEv05_createDTable(), which allocates with malloc() */
+    free(dt);
+}
+
+size_t FSEv05_buildDTable(FSEv05_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{   /* Builds the decoding table `dt` from normalizedCounter[0..maxSymbolValue]; returns 0 on success or an ERROR() code. */
+    FSEv05_DTableHeader DTableH;
+    void* const tdPtr = dt+1;   /* because dt is unsigned, 32-bits aligned on 32-bits */
+    FSEv05_DECODE_TYPE* const tableDecode = (FSEv05_DECODE_TYPE*) (tdPtr);   /* decoding cells start one FSEv05_DTable element past dt; the header is copied to dt itself at the end */
+    const U32 tableSize = 1 << tableLog;   /* NOTE(review): evaluated before the tableLog sanity check below */
+    const U32 tableMask = tableSize-1;
+    const U32 step = FSEv05_tableStep(tableSize);
+    U16 symbolNext[FSEv05_MAX_SYMBOL_VALUE+1];   /* per symbol: the next state value to hand out in the final pass */
+    U32 position = 0;
+    U32 highThreshold = tableSize-1;   /* cells above this index are taken by the -1 (low probability) symbols */
+    const S16 largeLimit= (S16)(1 << (tableLog-1));   /* NOTE(review): tableLog==0 would give an invalid shift count here; presumably callers never pass 0 - confirm */
+    U32 noLarge = 1;   /* cleared as soon as one count reaches largeLimit; stored as fastMode */
+    U32 s;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSEv05_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSEv05_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    DTableH.tableLog = (U16)tableLog;
+    for (s=0; s<=maxSymbolValue; s++) {
+        if (normalizedCounter[s]==-1) {   /* count -1: the symbol gets exactly one cell, taken from the top of the table */
+            tableDecode[highThreshold--].symbol = (FSEv05_FUNCTION_TYPE)s;
+            symbolNext[s] = 1;
+        } else {
+            if (normalizedCounter[s] >= largeLimit) noLarge=0;
+            symbolNext[s] = normalizedCounter[s];
+    }   }
+
+    /* Spread symbols */
+    for (s=0; s<=maxSymbolValue; s++) {
+        int i;
+        for (i=0; i<normalizedCounter[s]; i++) {   /* counts <= 0 place nothing here */
+            tableDecode[position].symbol = (FSEv05_FUNCTION_TYPE)s;
+            position = (position + step) & tableMask;
+            while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+    }   }
+
+    if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+
+    /* Build Decoding table */
+    {
+        U32 i;
+        for (i=0; i<tableSize; i++) {
+            FSEv05_FUNCTION_TYPE symbol = (FSEv05_FUNCTION_TYPE)(tableDecode[i].symbol);
+            U16 nextState = symbolNext[symbol]++;
+            tableDecode[i].nbBits = (BYTE) (tableLog - BITv05_highbit32 ((U32)nextState) );   /* assuming BITv05_highbit32 returns the index of the highest set bit: shift that lifts nextState into [tableSize, 2*tableSize) */
+            tableDecode[i].newState = (U16) ( (nextState << tableDecode[i].nbBits) - tableSize);
+    }   }
+
+    DTableH.fastMode = (U16)noLarge;
+    memcpy(dt, &DTableH, sizeof(DTableH));   /* header is written last, in front of the cells */
+    return 0;
+}
+
+
+#ifndef FSEv05_COMMONDEFS_ONLY
+/*-****************************************
+*  FSEv05 helper functions
+******************************************/
+unsigned FSEv05_isError(size_t code) { return ERR_isError(code); }   /* forwards to ERR_isError() */
+/* Forwards to ERR_getErrorName() for the given code. */
+const char* FSEv05_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/*-**************************************************************
+*  FSEv05 NCount encoding-decoding
+****************************************************************/
+/* Absolute value of a short (the negation is computed as int and converted back on return). */
+static short FSEv05_abs(short a) { if (a < 0) return -a; return a; }
+
+size_t FSEv05_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{   /* Parses a normalized-counter header: fills normalizedCounter[] and *tableLogPtr, sets *maxSVPtr to the last symbol read; returns the bytes consumed or an ERROR() code. */
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);   /* the first read below is a 32-bit one */
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSEv05_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSEv05_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;   /* budget still to distribute, kept one above zero: the loop stops at 1 */
+    threshold = 1<<nbBits;
+    nbBits++;   /* from here nbBits is the width of the long field; the short field uses nbBits-1 */
+
+    while ((remaining>1) && (charnum<=*maxSVPtr)) {
+        if (previous0) {   /* last count was 0: a run of further zero-count symbols is encoded first */
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF) {   /* 16 set bits = eight 2-bit fields of value 3 = 24 more zeros */
+                n0+=24;
+                if (ip < iend-5) {
+                    ip+=2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                } else {
+                    bitStream >>= 16;
+                    bitCount+=16;
+            }   }
+            while ((bitStream & 3) == 3) {   /* each 2-bit field equal to 3 adds 3 zeros and continues the run */
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            n0 += bitStream & 3;   /* closing 2-bit field (0..2) ends the run */
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;
+        }
+        {
+            const short max = (short)((2*threshold-1)-remaining);   /* values below max are stored in the short (nbBits-1) field */
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max) {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            } else {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSEv05_abs(count);   /* a count of -1 also consumes one unit of the budget */
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            while (remaining < threshold) {   /* narrow the field as the remaining budget shrinks */
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {   /* room for another 32-bit read after advancing by whole bytes */
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {   /* near the end: pin ip to the last 4 bytes and carry the surplus in bitCount */
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> (bitCount & 31);   /* mask keeps the shift count below 32 */
+    }   }
+    if (remaining != 1) return ERROR(GENERIC);   /* the counts did not use up the budget exactly */
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;   /* round the consumed bits up to whole bytes */
+    if ((size_t)(ip-istart) > hbSize) return ERROR(srcSize_wrong);
+    return ip-istart;
+}
+
+
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+size_t FSEv05_buildDTable_rle (FSEv05_DTable* dt, BYTE symbolValue)
+{
+    /* Writes a header with tableLog 0 and a single cell holding `symbolValue` (nbBits 0, newState 0). Always returns 0. */
+    void* const headerSlot = dt;         /* void* hops avoid direct casts between unrelated pointer types */
+    void* const firstCellSlot = dt + 1;
+    FSEv05_DTableHeader* const header = (FSEv05_DTableHeader*)headerSlot;
+    FSEv05_decode_t* const onlyCell = (FSEv05_decode_t*)firstCellSlot;
+
+    header->fastMode = 0;
+    header->tableLog = 0;
+
+    onlyCell->nbBits = 0;
+    onlyCell->symbol = symbolValue;
+    onlyCell->newState = 0;
+    return 0;
+}
+
+
+size_t FSEv05_buildDTable_raw (FSEv05_DTable* dt, unsigned nbBits)
+{
+    /* Fills `dt` with 2^nbBits cells, cell s holding symbol (BYTE)s read with nbBits bits; returns 0, or ERROR(GENERIC) when nbBits is 0. */
+    void* const headerSlot = dt;
+    void* const cellSlot = dt + 1;
+    FSEv05_DTableHeader* const header = (FSEv05_DTableHeader*)headerSlot;
+    FSEv05_decode_t* const cells = (FSEv05_decode_t*)cellSlot;
+    const unsigned nbCells = 1 << nbBits;
+    const unsigned lastSymbol = nbCells - 1;
+    unsigned sym = 0;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Header : raw tables are always flagged fastMode */
+    header->fastMode = 1;
+    header->tableLog = (U16)nbBits;
+    /* Cells : one per symbol value, all returning to state 0 */
+    while (sym <= lastSymbol) {
+        FSEv05_decode_t* const cell = cells + sym;
+        cell->nbBits = (BYTE)nbBits;  cell->symbol = (BYTE)sym;  cell->newState = 0;
+        sym++;
+    }
+    return 0;
+}
+
+FORCE_INLINE size_t FSEv05_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSEv05_DTable* dt, const unsigned fast)
+{   /* Decodes cSrc into dst with table `dt`; `fast` picks the symbol decoder. Returns the number of bytes written or an ERROR() code. */
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;   /* the 4-symbol loop writes op[0..3], so it only runs while op < olimit */
+
+    BITv05_DStream_t bitD;
+    FSEv05_DState_t state1;   /* two decoding states take turns on the same bitstream */
+    FSEv05_DState_t state2;
+    size_t errorCode;
+
+    /* Init */
+    errorCode = BITv05_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+    if (FSEv05_isError(errorCode)) return errorCode;
+
+    FSEv05_initDState(&state1, &bitD, dt);
+    FSEv05_initDState(&state2, &bitD, dt);
+
+#define FSEv05_GETSYMBOL(statePtr) fast ? FSEv05_decodeSymbolFast(statePtr, &bitD) : FSEv05_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BITv05_reloadDStream(&bitD)==BITv05_DStream_unfinished) && (op<olimit) ; op+=4) {
+        op[0] = FSEv05_GETSYMBOL(&state1);
+
+        if (FSEv05_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BITv05_reloadDStream(&bitD);
+
+        op[1] = FSEv05_GETSYMBOL(&state2);
+
+        if (FSEv05_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BITv05_reloadDStream(&bitD) > BITv05_DStream_unfinished) { op+=2; break; } }   /* two symbols already written: account for them before leaving */
+
+        op[2] = FSEv05_GETSYMBOL(&state1);
+
+        if (FSEv05_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BITv05_reloadDStream(&bitD);
+
+        op[3] = FSEv05_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BITv05_reloadDStream(&bitD) >= FSEv05_DStream_partiallyFilled; Ends at exactly BITv05_DStream_completed */
+    while (1) {   /* one symbol at a time, alternating state1 / state2, until the stream, the states or the output buffer end */
+        if ( (BITv05_reloadDStream(&bitD)>BITv05_DStream_completed) || (op==omax) || (BITv05_endOfDStream(&bitD) && (fast || FSEv05_endOfDState(&state1))) )
+            break;
+
+        *op++ = FSEv05_GETSYMBOL(&state1);
+
+        if ( (BITv05_reloadDStream(&bitD)>BITv05_DStream_completed) || (op==omax) || (BITv05_endOfDStream(&bitD) && (fast || FSEv05_endOfDState(&state2))) )
+            break;
+
+        *op++ = FSEv05_GETSYMBOL(&state2);
+    }
+
+    /* end ? */
+    if (BITv05_endOfDStream(&bitD) && FSEv05_endOfDState(&state1) && FSEv05_endOfDState(&state2))
+        return op-ostart;   /* success requires the stream and both states to be finished */
+
+    if (op==omax) return ERROR(dstSize_tooSmall);   /* dst buffer is full, but cSrc unfinished */
+
+    return ERROR(corruption_detected);
+}
+
+
+size_t FSEv05_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSEv05_DTable* dt)
+{
+    /* Reads the fastMode flag stored in the table header and calls the generic decoder with the matching `fast` value. */
+    const void* const headerSlot = dt;
+    const FSEv05_DTableHeader* const header = (const FSEv05_DTableHeader*)headerSlot;
+    /* select fast mode (static) : `fast` is a literal in each call, presumably so each inlined copy sees a constant */
+    if (header->fastMode == 0)
+        return FSEv05_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+    return FSEv05_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+}
+
+
+size_t FSEv05_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* One-shot decode: parse the count header, build a stack-allocated table, then decode the bytes that follow the header. */
+    const BYTE* const srcStart = (const BYTE*)cSrc;
+    short counts[FSEv05_MAX_SYMBOL_VALUE+1];
+    DTable_max_t table;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    unsigned maxSymbol = FSEv05_MAX_SYMBOL_VALUE;
+    unsigned tableLog;
+    size_t headerSize;
+
+    if (cSrcSize<2) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* normal FSEv05 decoding mode : header first */
+    headerSize = FSEv05_readNCount (counts, &maxSymbol, &tableLog, srcStart, cSrcSize);
+    if (FSEv05_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* decoding table */
+    {   size_t const buildResult = FSEv05_buildDTable (table, counts, maxSymbol, tableLog);
+        if (FSEv05_isError(buildResult)) return buildResult; }
+
+    /* always return, even if it is an error code */
+    return FSEv05_decompress_usingDTable (dst, maxDstSize,
+                                          srcStart + headerSize, cSrcSize - headerSize, table);
+}
+
+
+
+#endif   /* FSEv05_COMMONDEFS_ONLY */
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   header file
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef HUFF0_H
+#define HUFF0_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/* ****************************************
+*  Huff0 simple functions
+******************************************/
+size_t HUFv05_decompress(void* dst,  size_t dstSize,
+                const void* cSrc, size_t cSrcSize);
+/*!
+HUFv05_decompress():
+    Decompress Huff0 data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstSize'.
+    @dstSize : must be the **exact** size of original (uncompressed) data.
+    Note : in contrast with FSEv05, HUFv05_decompress can regenerate
+           RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+           because it knows size to regenerate.
+    @return : size of regenerated data (== dstSize)
+              or an error code, which can be tested using HUFv05_isError()
+*/
+
+
+/* ****************************************
+*  Tool functions
+******************************************/
+/* Error Management */
+unsigned    HUFv05_isError(size_t code);        /* tells if a return value is an error code */
+const char* HUFv05_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* HUFF0_H */
+/* ******************************************************************
+   Huff0 : Huffman codec, part of New Generation Entropy library
+   header file, for static linking only
+   Copyright (C) 2013-2016, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef HUF0_STATIC_H
+#define HUF0_STATIC_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/* ****************************************
+*  Static allocation
+******************************************/
+/* static allocation of Huff0's DTable : the first element is initialized with maxTableLog; macro arguments are parenthesized so that expressions expand safely */
+#define HUFv05_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))
+#define HUFv05_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        unsigned short DTable[HUFv05_DTABLE_SIZE(maxTableLog)] = { (maxTableLog) }
+#define HUFv05_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        unsigned int DTable[HUFv05_DTABLE_SIZE(maxTableLog)] = { (maxTableLog) }
+#define HUFv05_CREATE_STATIC_DTABLEX6(DTable, maxTableLog) \
+        unsigned int DTable[HUFv05_DTABLE_SIZE(maxTableLog) * 3 / 2] = { (maxTableLog) }
+
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+size_t HUFv05_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUFv05_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbols decoder */
+
+
+/* ****************************************
+*  Huff0 detailed API
+******************************************/
+/*!
+HUFv05_decompress() does the following:
+1. select the decompression algorithm (X2, X4, X6) based on pre-computed heuristics
+2. build the Huffman table from its saved description, using HUFv05_readDTableXn()
+3. decode 1 or 4 segments in parallel using HUFv05_decompressSXn_usingDTable
+*/
+size_t HUFv05_readDTableX2 (unsigned short* DTable, const void* src, size_t srcSize);
+size_t HUFv05_readDTableX4 (unsigned* DTable, const void* src, size_t srcSize);
+
+size_t HUFv05_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
+size_t HUFv05_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+
+
+/* single stream variants */
+
+size_t HUFv05_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUFv05_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
+
+size_t HUFv05_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
+size_t HUFv05_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* HUF0_STATIC_H */
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSEv05+Huff0 source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+/* inline is defined */
+#elif defined(_MSC_VER)
+#  define inline __inline
+#else
+#  define inline /* disable inline */
+#endif
+
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+
+/* **************************************************************
+*  Constants
+****************************************************************/
+#define HUFv05_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUFv05_MAX_TABLELOG. Beyond that value, code does not work */
+#define HUFv05_MAX_TABLELOG  12           /* max configured tableLog (for static allocation); can be modified up to HUFv05_ABSOLUTEMAX_TABLELOG */
+#define HUFv05_DEFAULT_TABLELOG  HUFv05_MAX_TABLELOG   /* tableLog by default, when not specified */
+#define HUFv05_MAX_SYMBOL_VALUE 255
+#if (HUFv05_MAX_TABLELOG > HUFv05_ABSOLUTEMAX_TABLELOG)
+#  error "HUFv05_MAX_TABLELOG is too large !"
+#endif
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+unsigned HUFv05_isError(size_t code) { return ERR_isError(code); }   /* forwards to ERR_isError() */
+const char* HUFv05_getErrorName(size_t code) { return ERR_getErrorName(code); }   /* forwards to ERR_getErrorName() */
+#define HUFv05_STATIC_ASSERT(c) { enum { HUFv05_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations; a false `c` yields 1/0 in a constant expression, so compilation fails */
+
+
+/* *******************************************************
+*  Huff0 : Huffman block decompression
+*********************************************************/
+typedef struct { BYTE byte; BYTE nbBits; } HUFv05_DEltX2;   /* single-symbol decoding */
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUFv05_DEltX4;  /* double-symbols decoding */
+
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+
+/*! HUFv05_readStats() :
+    Reads the compact Huffman tree description (as saved by HUFv05_writeCTable) from `src` into `huffWeight`
+    (capacity `hwSize`), counts weights per rank in `rankStats`, and reports `*nbSymbolsPtr` / `*tableLogPtr`.
+    @return : number of bytes read from `src`, or an error code (test with HUFv05_isError())
+*/
+static size_t HUFv05_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                            U32* nbSymbolsPtr, U32* tableLogPtr,
+                            const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    U32 tableLog;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+    U32 n;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];   /* first byte selects the header layout */
+    /* no memset(huffWeight, 0, hwSize) here : the original author notes it is not necessary, even though some analyzers complain */
+    if (iSize >= 128)  { /* special header */
+        if (iSize >= (242)) {  /* RLE */
+            static const int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };
+            oSize = l[iSize-242];
+            if (oSize >= hwSize) return ERROR(corruption_detected);   /* huffWeight[0..oSize] is read and written below : keep it inside the buffer, as the incompressible branch does */
+            memset(huffWeight, 1, hwSize);
+            iSize = 0;
+        }
+        else {   /* Incompressible */
+            oSize = iSize - 127;
+            iSize = ((oSize+1)/2);   /* two 4-bit weights per input byte */
+            if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+            if (oSize >= hwSize) return ERROR(corruption_detected);
+            ip += 1;
+            for (n=0; n<oSize; n+=2) {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }
+    else  {   /* header compressed with FSEv05 (normal case) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSEv05_decompress(huffWeight, hwSize-1, ip+1, iSize);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSEv05_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUFv05_ABSOLUTEMAX_TABLELOG + 1) * sizeof(U32));   /* rankStats must provide HUFv05_ABSOLUTEMAX_TABLELOG+1 entries */
+    weightTotal = 0;
+    for (n=0; n<oSize; n++) {
+        if (huffWeight[n] >= HUFv05_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+        rankStats[huffWeight[n]]++;
+        weightTotal += (1 << huffWeight[n]) >> 1;   /* weight w counts for 2^(w-1); weight 0 counts for nothing */
+    }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    tableLog = BITv05_highbit32(weightTotal) + 1;
+    if (tableLog > HUFv05_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+    {   /* determine last weight */
+        U32 total = 1 << tableLog;
+        U32 rest = total - weightTotal;
+        U32 verif = 1 << BITv05_highbit32(rest);
+        U32 lastWeight = BITv05_highbit32(rest) + 1;
+        if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+        huffWeight[oSize] = (BYTE)lastWeight;
+        rankStats[lastWeight]++;
+    }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);   /* decoded weights plus the implied last one */
+    *tableLogPtr = tableLog;
+    return iSize+1;
+}
+
+
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
+
+size_t HUFv05_readDTableX2 (U16* DTable, const void* src, size_t srcSize)
+{   /* Builds the single-symbol decoding table in `DTable` from the description in `src`; returns the bytes read from `src` or an error code. */
+    BYTE huffWeight[HUFv05_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUFv05_ABSOLUTEMAX_TABLELOG + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    size_t iSize;
+    U32 nbSymbols = 0;
+    U32 n;
+    U32 nextRankStart;
+    void* const dtPtr = DTable + 1;   /* entries start after DTable[0], which holds the table log */
+    HUFv05_DEltX2* const dt = (HUFv05_DEltX2*)dtPtr;
+
+    HUFv05_STATIC_ASSERT(sizeof(HUFv05_DEltX2) == sizeof(U16));   /* if compilation fails here, assertion is false */
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUFv05_readStats(huffWeight, HUFv05_MAX_SYMBOL_VALUE + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUFv05_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > DTable[0]) return ERROR(tableLog_tooLarge);   /* DTable is too small */
+    DTable[0] = (U16)tableLog;   /* maybe should separate sizeof allocated DTable, from used size of DTable, in case of re-use */
+
+    /* Prepare ranks */
+    nextRankStart = 0;
+    for (n=1; n<=tableLog; n++) {   /* turn the per-rank symbol counts into the start index of each rank */
+        U32 current = nextRankStart;
+        nextRankStart += (rankVal[n] << (n-1));   /* each symbol of weight n occupies 2^(n-1) cells */
+        rankVal[n] = current;
+    }
+
+    /* fill DTable */
+    for (n=0; n<nbSymbols; n++) {
+        const U32 w = huffWeight[n];
+        const U32 length = (1 << w) >> 1;   /* 2^(w-1) cells, none for weight 0 */
+        U32 i;
+        HUFv05_DEltX2 D;
+        D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+        for (i = rankVal[w]; i < rankVal[w] + length; i++)
+            dt[i] = D;
+        rankVal[w] += length;   /* the next symbol of the same weight starts right after */
+    }
+
+    return iSize;
+}
+
+static BYTE HUFv05_decodeSymbolX2(BITv05_DStream_t* Dstream, const HUFv05_DEltX2* dt, const U32 dtLog)
+{
+    /* Index the table with the value from BITv05_lookBitsFast(), skip the bit count recorded in that entry, return its byte. */
+    const HUFv05_DEltX2* const entry = dt + BITv05_lookBitsFast(Dstream, dtLog);   /* note : dtLog >= 1 */
+    BITv05_skipBits(Dstream, entry->nbBits);
+    return entry->byte;
+}
+
+#define HUFv05_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUFv05_decodeSymbolX2(DStreamPtr, dt, dtLog)   /* always decodes one symbol; expects `dt` and `dtLog` in scope where expanded */
+/* _1 variant below : decodes only if MEM_64bits() or HUFv05_MAX_TABLELOG<=12. Expands to a brace-less `if`, so never follow it with an `else`. */
+#define HUFv05_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUFv05_MAX_TABLELOG<=12)) \
+        HUFv05_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+/* _2 variant below : decodes only if MEM_64bits(). Same brace-less `if` caveat. */
+#define HUFv05_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUFv05_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+static inline size_t HUFv05_decodeStreamX2(BYTE* p, BITv05_DStream_t* const bitDPtr, BYTE* const pEnd, const HUFv05_DEltX2* const dt, const U32 dtLog)
+{   /* Decodes symbols into [p, pEnd) from bitDPtr; always returns pEnd minus the initial p. */
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    while ((BITv05_reloadDStream(bitDPtr) == BITv05_DStream_unfinished) && (p <= pEnd-4)) {
+        HUFv05_DECODE_SYMBOLX2_2(p, bitDPtr);   /* conditional : only when MEM_64bits() */
+        HUFv05_DECODE_SYMBOLX2_1(p, bitDPtr);   /* conditional : MEM_64bits() or HUFv05_MAX_TABLELOG<=12 */
+        HUFv05_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUFv05_DECODE_SYMBOLX2_0(p, bitDPtr);   /* unconditional */
+    }
+
+    /* closer to the end */
+    while ((BITv05_reloadDStream(bitDPtr) == BITv05_DStream_unfinished) && (p < pEnd))
+        HUFv05_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, hence no need to reload */
+    while (p < pEnd)
+        HUFv05_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    return pEnd-pStart;   /* the loops above only stop once p reached pEnd */
+}
+
+size_t HUFv05_decompress1X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    /* Decodes dstSize bytes from a single bitstream with a prebuilt X2 table; returns dstSize or an error code. */
+    BYTE* const outStart = (BYTE*)dst;
+    BYTE* const outEnd = outStart + dstSize;
+    const void* const tableBase = DTable;
+    const HUFv05_DEltX2* const entries = ((const HUFv05_DEltX2*)tableBase)+1;   /* entries follow the header slot */
+    const U32 tableLog = DTable[0];                                            /* header slot holds the table log */
+    BITv05_DStream_t stream;
+    size_t initResult;
+
+    if (dstSize <= cSrcSize) return ERROR(dstSize_tooSmall);
+    initResult = BITv05_initDStream(&stream, cSrc, cSrcSize);
+    if (HUFv05_isError(initResult)) return initResult;
+
+    HUFv05_decodeStreamX2(outStart, &stream, outEnd, entries, tableLog);
+    /* check : a bitstream that is not at its end is reported as corruption */
+    if (!BITv05_endOfDStream(&stream)) return ERROR(corruption_detected);
+    return dstSize;
+}
+
+size_t HUFv05_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* Single-stream decode : read the table description at the start of cSrc, then decode what follows it. */
+    HUFv05_CREATE_STATIC_DTABLEX2(DTable, HUFv05_MAX_TABLELOG);
+    const BYTE* const srcStart = (const BYTE*) cSrc;
+    size_t const headerSize = HUFv05_readDTableX2 (DTable, cSrc, cSrcSize);
+
+    if (HUFv05_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing left after the table description */
+
+    /* payload starts right after the table description */
+    return HUFv05_decompress1X2_usingDTable (dst, dstSize,
+                                             srcStart + headerSize, cSrcSize - headerSize, DTable);
+}
+
+
+/* HUFv05_decompress4X2_usingDTable() :
+ * decodes 4 Huffman streams in parallel, using an already built single-symbol table.
+ * `cSrc` starts with a 6-byte jump table (three little-endian 16-bit sizes, one per
+ * stream 1..3); stream 4 takes whatever remains of `cSrcSize`.
+ * Streams 1..3 each regenerate (dstSize+3)/4 bytes; stream 4 ends at dst+dstSize.
+ * @return : dstSize, or an error code (testable with HUFv05_isError()) */
+size_t HUFv05_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    /* Check : must be done before the jump table is read below */
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable;
+        const HUFv05_DEltX2* const dt = ((const HUFv05_DEltX2*)dtPtr) +1;   /* `dt` and `dtLog` are used by the DECODE macros */
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BITv05_DStream_t bitD1;
+        BITv05_DStream_t bitD2;
+        BITv05_DStream_t bitD3;
+        BITv05_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        /* the first 3 segments must fit into dst, otherwise stream 3 would write past oend */
+        if (segmentSize * 3 > dstSize) return ERROR(corruption_detected);
+        errorCode = BITv05_initDStream(&bitD1, istart1, length1);
+        if (HUFv05_isError(errorCode)) return errorCode;
+        errorCode = BITv05_initDStream(&bitD2, istart2, length2);
+        if (HUFv05_isError(errorCode)) return errorCode;
+        errorCode = BITv05_initDStream(&bitD3, istart3, length3);
+        if (HUFv05_isError(errorCode)) return errorCode;
+        errorCode = BITv05_initDStream(&bitD4, istart4, length4);
+        if (HUFv05_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BITv05_reloadDStream(&bitD1) | BITv05_reloadDStream(&bitD2) | BITv05_reloadDStream(&bitD3) | BITv05_reloadDStream(&bitD4);
+        for ( ; (endSignal==BITv05_DStream_unfinished) && (op4<(oend-7)) ; ) {
+            HUFv05_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUFv05_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUFv05_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUFv05_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUFv05_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUFv05_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUFv05_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUFv05_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUFv05_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUFv05_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUFv05_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUFv05_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUFv05_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUFv05_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUFv05_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUFv05_DECODE_SYMBOLX2_0(op4, &bitD4);
+            endSignal = BITv05_reloadDStream(&bitD1) | BITv05_reloadDStream(&bitD2) | BITv05_reloadDStream(&bitD3) | BITv05_reloadDStream(&bitD4);
+        }
+
+        /* check corruption : no stream may have run into the next segment */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUFv05_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUFv05_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUFv05_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUFv05_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check : every stream must be fully consumed */
+        endSignal = BITv05_endOfDStream(&bitD1) & BITv05_endOfDStream(&bitD2) & BITv05_endOfDStream(&bitD3) & BITv05_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+/* HUFv05_decompress4X2() :
+ * reads the decoding table stored at the start of `cSrc` with HUFv05_readDTableX2(),
+ * then hands the remaining bytes to HUFv05_decompress4X2_usingDTable().
+ * @return : result of HUFv05_decompress4X2_usingDTable(), or an error code */
+size_t HUFv05_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv05_CREATE_STATIC_DTABLEX2(DTable, HUFv05_MAX_TABLELOG);
+    size_t const hSize = HUFv05_readDTableX2 (DTable, cSrc, cSrcSize);
+
+    if (HUFv05_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing left after the table header */
+
+    return HUFv05_decompress4X2_usingDTable (dst, dstSize,
+                                            (const BYTE*)cSrc + hSize, cSrcSize - hSize,
+                                            DTable);
+}
+
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+
+/* HUFv05_fillDTableX4Level2() :
+ * fills one sub-table of the double-symbols decoding table, for cells whose
+ * first symbol is `baseSeq`.
+ *  DTable         : start of the sub-table to fill
+ *  sizeLog        : a symbol of `nbBits` bits fills 1 << (sizeLog-nbBits) consecutive cells
+ *  consumed       : added to every cell's nbBits
+ *  rankValOrigin  : pre-calculated start position per weight; copied locally, never modified
+ *  minWeight      : when > 1, the first rankVal[minWeight] cells receive `baseSeq` alone (length 1)
+ *  sortedSymbols, sortedListSize : candidate second symbols with their weights
+ *  nbBitsBaseline : a symbol's nbBits is nbBitsBaseline - weight
+ *  baseSeq        : value combined with (symbol << 8) to form each cell's 16-bit sequence */
+static void HUFv05_fillDTableX4Level2(HUFv05_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    HUFv05_DEltX4 DElt;
+    U32 rankVal[HUFv05_ABSOLUTEMAX_TABLELOG + 1];
+    U32 s;
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values : these cells decode a single symbol (baseSeq) */
+    if (minWeight>1) {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable : these cells decode two symbols (baseSeq, then symbol) */
+    for (s=0; s<sortedListSize; s++) {   /* note : sortedSymbols already skipped */
+        const U32 symbol = sortedSymbols[s].symbol;
+        const U32 weight = sortedSymbols[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 length = 1 << (sizeLog-nbBits);
+        const U32 start = rankVal[weight];
+        U32 i = start;
+        const U32 end = start + length;
+
+        MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+        DElt.nbBits = (BYTE)(nbBits + consumed);
+        DElt.length = 2;
+        do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+        rankVal[weight] += length;   /* next symbol of same weight starts right after */
+    }
+}
+
+/* rankVal_t : one row of per-weight start positions for each possible row index
+ * (HUFv05_readDTableX4() indexes rows by `consumed`) */
+typedef U32 rankVal_t[HUFv05_ABSOLUTEMAX_TABLELOG][HUFv05_ABSOLUTEMAX_TABLELOG + 1];
+
+/* HUFv05_fillDTableX4() :
+ * fills the whole double-symbols decoding table.
+ * For each symbol of `sortedList`, either delegates its range of cells to
+ * HUFv05_fillDTableX4Level2() (when enough bits remain for a second symbol),
+ * or fills the range with a single-symbol cell (length 1).
+ *  targetLog      : log2 size of DTable
+ *  rankStart      : indexed by minWeight, gives the position in sortedList handed to Level2
+ *  rankValOrigin  : row [0] gives start positions in DTable; row [nbBits] is passed to Level2
+ *  nbBitsBaseline : a symbol's nbBits is nbBitsBaseline - weight */
+static void HUFv05_fillDTableX4(HUFv05_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{
+    U32 rankVal[HUFv05_ABSOLUTEMAX_TABLELOG + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;
+    U32 s;
+
+    /* sizeof(rankVal) is one row : only rankValOrigin[0] is copied */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++) {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits) {   /* enough room for a second symbol */
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];
+            HUFv05_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        } else {
+            /* not enough room : single-symbol cells */
+            U32 i;
+            const U32 end = start + length;
+            HUFv05_DEltX4 DElt;
+
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits   = (BYTE)(nbBits);
+            DElt.length   = 1;
+            for (i = start; i < end; i++)
+                DTable[i] = DElt;
+        }
+        rankVal[weight] += length;   /* next symbol of same weight starts right after */
+    }
+}
+
+/* HUFv05_readDTableX4() :
+ * builds a double-symbols decoding table from the Huffman header found in `src`.
+ * On entry, DTable[0] must hold the log2 capacity of the table (memLog);
+ * decoding cells are written starting at DTable[1].
+ * @return : the value returned by HUFv05_readStats() (size of the header read from `src`),
+ *           or an error code (testable with HUFv05_isError()) */
+size_t HUFv05_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
+{
+    BYTE weightList[HUFv05_MAX_SYMBOL_VALUE + 1];
+    sortedSymbol_t sortedSymbol[HUFv05_MAX_SYMBOL_VALUE + 1];
+    U32 rankStats[HUFv05_ABSOLUTEMAX_TABLELOG + 1] = { 0 };
+    U32 rankStart0[HUFv05_ABSOLUTEMAX_TABLELOG + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;   /* rankStart[w] aliases rankStart0[w+1] */
+    rankVal_t rankVal;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    const U32 memLog = DTable[0];
+    size_t iSize;
+    void* dtPtr = DTable;
+    HUFv05_DEltX4* const dt = ((HUFv05_DEltX4*)dtPtr) + 1;   /* skip the header cell DTable[0] */
+
+    HUFv05_STATIC_ASSERT(sizeof(HUFv05_DEltX4) == sizeof(U32));   /* if compilation fails here, assertion is false */
+    if (memLog > HUFv05_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUFv05_readStats(weightList, HUFv05_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUFv05_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+
+    /* Get start index of each weight */
+    {
+        U32 w, nextRankStart = 0;
+        for (w=1; w<=maxW; w++) {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;     /* nb of symbols with weight >= 1 */
+    }
+
+    /* sort symbols by weight */
+    {
+        U32 s;
+        for (s=0; s<nbSymbols; s++) {
+            U32 w = weightList[s];
+            U32 r = rankStart[w]++;   /* after this loop, rankStart[w] is the end of weight w */
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {
+        const U32 minBits = tableLog+1 - maxW;
+        U32 nextRankVal = 0;
+        U32 w, consumed;
+        const int rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
+        U32* rankVal0 = rankVal[0];
+        /* row 0 : start position of each weight */
+        for (w=1; w<=maxW; w++) {
+            U32 current = nextRankVal;
+            nextRankVal += rankStats[w] << (w+rescale);
+            rankVal0[w] = current;
+        }
+        /* row `consumed` : row 0 shifted right by `consumed` */
+        for (consumed = minBits; consumed <= memLog - minBits; consumed++) {
+            U32* rankValPtr = rankVal[consumed];
+            for (w = 1; w <= maxW; w++) {
+                rankValPtr[w] = rankVal0[w] >> consumed;
+    }   }   }
+
+    /* rankStart0 (not rankStart) is passed : rankStart0[w] == rankStart[w-1],
+     * which after the sort above is the position of the first symbol of weight w */
+    HUFv05_fillDTableX4(dt, memLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    return iSize;
+}
+
+
+/* HUFv05_decodeSymbolX4() :
+ * looks up the next `dtLog` bits in `dt`, copies the 2 first bytes of the matching cell
+ * into `op`, and consumes the cell's nbBits from the bitstream.
+ * @return : the cell's `length` field */
+static U32 HUFv05_decodeSymbolX4(void* op, BITv05_DStream_t* DStream, const HUFv05_DEltX4* dt, const U32 dtLog)
+{
+    /* note : dtLog >= 1 */
+    const HUFv05_DEltX4* const cell = dt + BITv05_lookBitsFast(DStream, dtLog);
+
+    memcpy(op, cell, 2);
+    BITv05_skipBits(DStream, cell->nbBits);
+    return cell->length;
+}
+
+/* HUFv05_decodeLastSymbolX4() :
+ * variant of HUFv05_decodeSymbolX4() used when a single output byte remains :
+ * only 1 byte of the matching cell is copied into `op`.
+ * @return : 1 (always) */
+static U32 HUFv05_decodeLastSymbolX4(void* op, BITv05_DStream_t* DStream, const HUFv05_DEltX4* dt, const U32 dtLog)
+{
+    /* note : dtLog >= 1 */
+    const HUFv05_DEltX4* const cell = dt + BITv05_lookBitsFast(DStream, dtLog);
+    const size_t containerBits = sizeof(DStream->bitContainer)*8;
+
+    memcpy(op, cell, 1);
+
+    if (cell->length == 1) {
+        BITv05_skipBits(DStream, cell->nbBits);
+    } else if (DStream->bitsConsumed < containerBits) {
+        /* 2-symbols cell, but only the first symbol is wanted :
+         * consume the cell's bits, then clamp to the container size.
+         * ugly hack; works only because it's the last symbol.
+         * Note : can't easily extract nbBits from just this symbol */
+        BITv05_skipBits(DStream, cell->nbBits);
+        if (DStream->bitsConsumed > containerBits)
+            DStream->bitsConsumed = containerBits;
+    }
+
+    return 1;
+}
+
+
+/* Decoding macros : each one advances `ptr` by the value returned by HUFv05_decodeSymbolX4().
+ * They rely on `dt` and `dtLog` being defined in the calling scope. */
+
+/* _0 : unconditional decode */
+#define HUFv05_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUFv05_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+/* _1 : decode only when MEM_64bits() is true, or HUFv05_MAX_TABLELOG <= 12 */
+#define HUFv05_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUFv05_MAX_TABLELOG<=12)) \
+        ptr += HUFv05_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+/* _2 : decode only when MEM_64bits() is true */
+#define HUFv05_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUFv05_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+/* HUFv05_decodeStreamX4() :
+ * decodes one bitstream into [p, pEnd) using the double-symbols table `dt`.
+ * @return : nb of bytes written (distance between the final and initial `p`) */
+static inline size_t HUFv05_decodeStreamX4(BYTE* p, BITv05_DStream_t* bitDPtr, BYTE* const pEnd, const HUFv05_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BITv05_reloadDStream(bitDPtr) == BITv05_DStream_unfinished) && (p < pEnd-7)) {
+        HUFv05_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUFv05_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUFv05_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUFv05_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to the end : one table lookup per reload, while at least 2 bytes of room remain */
+    while ((BITv05_reloadDStream(bitDPtr) == BITv05_DStream_unfinished) && (p <= pEnd-2))
+        HUFv05_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUFv05_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    /* exactly one byte of room left : a 2-bytes copy is no longer allowed */
+    if (p < pEnd)
+        p += HUFv05_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+
+/* HUFv05_decompress1X4_usingDTable() :
+ * decodes a single Huffman stream with an already built double-symbols table.
+ * DTable[0] holds the table log; decoding cells start at DTable[1].
+ * @return : dstSize, or an error code (testable with HUFv05_isError()) */
+size_t HUFv05_decompress1X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    const void* const tablePtr = DTable;
+    const HUFv05_DEltX4* const dt = ((const HUFv05_DEltX4*)tablePtr) +1;
+    const U32 dtLog = DTable[0];
+    BITv05_DStream_t bitD;
+
+    /* Init */
+    { size_t const initResult = BITv05_initDStream(&bitD, (const BYTE*) cSrc, cSrcSize);
+      if (HUFv05_isError(initResult)) return initResult; }
+
+    /* decode the whole stream */
+    HUFv05_decodeStreamX4(ostart, &bitD, ostart + dstSize, dt, dtLog);
+
+    /* the bitstream must be exactly consumed */
+    if (!BITv05_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    return dstSize;
+}
+
+/* HUFv05_decompress1X4() :
+ * reads the decoding table stored at the start of `cSrc` with HUFv05_readDTableX4(),
+ * then hands the remaining bytes to HUFv05_decompress1X4_usingDTable().
+ * @return : result of HUFv05_decompress1X4_usingDTable(), or an error code */
+size_t HUFv05_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv05_CREATE_STATIC_DTABLEX4(DTable, HUFv05_MAX_TABLELOG);
+    size_t const headerSize = HUFv05_readDTableX4 (DTable, cSrc, cSrcSize);
+
+    if (HUFv05_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing left after the table header */
+
+    return HUFv05_decompress1X4_usingDTable (dst, dstSize,
+                                            (const BYTE*)cSrc + headerSize, cSrcSize - headerSize,
+                                            DTable);
+}
+
+/* HUFv05_decompress4X4_usingDTable() :
+ * decodes 4 Huffman streams in parallel, using an already built double-symbols table.
+ * `cSrc` starts with a 6-byte jump table (three little-endian 16-bit sizes, one per
+ * stream 1..3); stream 4 takes whatever remains of `cSrcSize`.
+ * Streams 1..3 each regenerate (dstSize+3)/4 bytes; stream 4 ends at dst+dstSize.
+ * @return : dstSize, or an error code (testable with HUFv05_isError()) */
+size_t HUFv05_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable;
+        const HUFv05_DEltX4* const dt = ((const HUFv05_DEltX4*)dtPtr) +1;   /* `dt` and `dtLog` are used by the DECODE macros */
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BITv05_DStream_t bitD1;
+        BITv05_DStream_t bitD2;
+        BITv05_DStream_t bitD3;
+        BITv05_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        /* the first 3 segments must fit into dst, otherwise stream 3 would write past oend */
+        if (segmentSize * 3 > dstSize) return ERROR(corruption_detected);
+        errorCode = BITv05_initDStream(&bitD1, istart1, length1);
+        if (HUFv05_isError(errorCode)) return errorCode;
+        errorCode = BITv05_initDStream(&bitD2, istart2, length2);
+        if (HUFv05_isError(errorCode)) return errorCode;
+        errorCode = BITv05_initDStream(&bitD3, istart3, length3);
+        if (HUFv05_isError(errorCode)) return errorCode;
+        errorCode = BITv05_initDStream(&bitD4, istart4, length4);
+        if (HUFv05_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BITv05_reloadDStream(&bitD1) | BITv05_reloadDStream(&bitD2) | BITv05_reloadDStream(&bitD3) | BITv05_reloadDStream(&bitD4);
+        for ( ; (endSignal==BITv05_DStream_unfinished) && (op4<(oend-7)) ; ) {
+            HUFv05_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUFv05_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUFv05_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUFv05_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUFv05_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUFv05_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUFv05_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUFv05_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUFv05_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUFv05_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUFv05_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUFv05_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUFv05_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUFv05_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUFv05_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUFv05_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BITv05_reloadDStream(&bitD1) | BITv05_reloadDStream(&bitD2) | BITv05_reloadDStream(&bitD3) | BITv05_reloadDStream(&bitD4);
+        }
+
+        /* check corruption : no stream may have run into the next segment */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUFv05_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUFv05_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUFv05_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUFv05_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check : every stream must be fully consumed */
+        endSignal = BITv05_endOfDStream(&bitD1) & BITv05_endOfDStream(&bitD2) & BITv05_endOfDStream(&bitD3) & BITv05_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+/* HUFv05_decompress4X4() :
+ * reads the decoding table stored at the start of `cSrc` with HUFv05_readDTableX4(),
+ * then hands the remaining bytes to HUFv05_decompress4X4_usingDTable().
+ * @return : result of HUFv05_decompress4X4_usingDTable(), or an error code */
+size_t HUFv05_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv05_CREATE_STATIC_DTABLEX4(DTable, HUFv05_MAX_TABLELOG);
+    size_t const headerSize = HUFv05_readDTableX4 (DTable, cSrc, cSrcSize);
+
+    if (HUFv05_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing left after the table header */
+
+    return HUFv05_decompress4X4_usingDTable (dst, dstSize,
+                                            (const BYTE*)cSrc + headerSize, cSrcSize - headerSize,
+                                            DTable);
+}
+
+
+/* ********************************/
+/* Generic decompression selector */
+/* ********************************/
+
+/* algo_time_t : cost model of one decoder.
+ * HUFv05_decompress() estimates a decoder's cost as
+ *     tableTime + decode256Time * (dstSize >> 8)
+ * algoTime rows are indexed by Q = cSrcSize * 16 / dstSize, columns by decoder kind. */
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+/* common signature of the decoders selectable by HUFv05_decompress() */
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+/* HUFv05_decompress() :
+ * generic entry point : estimates the cost of each decoder from the algoTime table,
+ * then runs either HUFv05_decompress4X2() or HUFv05_decompress4X4().
+ * Special cases : dstSize == 0 and cSrcSize >= dstSize are errors;
+ * cSrcSize == 1 is RLE (dst is filled with the single source byte).
+ * @return : dstSize, or an error code (testable with HUFv05_isError()) */
+size_t HUFv05_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decoders[3] = { HUFv05_decompress4X2, HUFv05_decompress4X4, NULL };
+    U32 cost[3];
+    U32 quant;
+    int k;
+
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize >= dstSize) return ERROR(corruption_detected);   /* invalid, or not compressed, but not compressed already dealt with */
+    if (cSrcSize == 1) {   /* RLE */
+        memset(dst, *(const BYTE*)cSrc, dstSize);
+        return dstSize;
+    }
+
+    /* estimate decompression time of each decoder */
+    quant = (U32)(cSrcSize * 16 / dstSize);   /* quant < 16 since dstSize > cSrcSize */
+    {   const U32 nb256 = (U32)(dstSize >> 8);
+        for (k=0; k<3; k++) {
+            const algo_time_t* const model = &algoTime[quant][k];
+            cost[k] = model->tableTime + (model->decode256Time * nb256);
+        }
+    }
+
+    /* advantage to algorithms using less memory, for cache eviction */
+    cost[1] += cost[1] >> 4;
+    cost[2] += cost[2] >> 3;
+
+    /* only the single-symbol and double-symbols decoders are candidates */
+    return decoders[(cost[1] < cost[0]) ? 1 : 0](dst, dstSize, cSrc, cSrcSize);
+}
+/*
+    zstd - standard compression library
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+*/
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Select how default decompression function ZSTDv05_decompress() will allocate memory,
+ * in memory stack (0), or in memory heap (1, requires malloc())
+ */
+#ifndef ZSTDv05_HEAPMODE   /* can be overridden from the build command line */
+#  define ZSTDv05_HEAPMODE 1   /* default : memory heap */
+#endif
+
+
+/*-*******************************************************
+*  Dependencies
+*********************************************************/
+#include <stdlib.h>      /* calloc */
+#include <string.h>      /* memcpy, memmove */
+#include <stdio.h>       /* debug only : printf */
+
+
+/*-*******************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+
+/*-*************************************
+*  Local types
+***************************************/
+/* Block properties, as filled by ZSTDv05_getcBlockSize() */
+typedef struct
+{
+    blockType_t blockType;   /* decoded from the 2 top bits of the block header */
+    U32 origSize;            /* size field of the header for bt_rle blocks, 0 otherwise */
+} blockProperties_t;
+
+
+/* *******************************************************
+*  Memory operations
+**********************************************************/
+/* ZSTDv05_copy4() : copies exactly 4 bytes from `src` to `dst` */
+static void ZSTDv05_copy4(void* dst, const void* src)
+{
+    memcpy(dst, src, 4);
+}
+
+
+/* *************************************
+*  Error Management
+***************************************/
+/*! ZSTDv05_isError() :
+*   tells if a return value is an error code */
+unsigned ZSTDv05_isError(size_t code)
+{
+    /* thin public wrapper around the internal error test */
+    return ERR_isError(code);
+}
+
+
+/*! ZSTDv05_getErrorName() :
+*   provides error code string (useful for debugging) */
+const char* ZSTDv05_getErrorName(size_t code)
+{
+    /* thin public wrapper around the internal error-name lookup */
+    return ERR_getErrorName(code);
+}
+
+
+/* *************************************************************
+*   Context management
+***************************************************************/
+/* Decoding stages, stored in the `stage` field of ZSTDv05_DCtx_s.
+ * ZSTDv05_decompressBegin() resets the stage to ZSTDv05ds_getFrameHeaderSize. */
+typedef enum { ZSTDv05ds_getFrameHeaderSize, ZSTDv05ds_decodeFrameHeader,
+               ZSTDv05ds_decodeBlockHeader, ZSTDv05ds_decompressBlock } ZSTDv05_dStage;
+
+/* Decompression context.
+ * Layout matters : litBuffer and headerBuffer must remain the last two members,
+ * because ZSTDv05_copyDCtx() copies everything except these trailing buffers. */
+struct ZSTDv05_DCtx_s
+{
+    FSEv05_DTable LLTable[FSEv05_DTABLE_SIZE_U32(LLFSEv05Log)];
+    FSEv05_DTable OffTable[FSEv05_DTABLE_SIZE_U32(OffFSEv05Log)];
+    FSEv05_DTable MLTable[FSEv05_DTABLE_SIZE_U32(MLFSEv05Log)];
+    unsigned   hufTableX4[HUFv05_DTABLE_SIZE(HufLog)];   /* [0] is set to HufLog by ZSTDv05_decompressBegin() */
+    const void* previousDstEnd;
+    const void* base;
+    const void* vBase;
+    const void* dictEnd;
+    size_t expected;     /* initialised to ZSTDv05_frameHeaderSize_min by ZSTDv05_decompressBegin() */
+    size_t headerSize;   /* set by ZSTDv05_decodeFrameHeader_Part1() */
+    ZSTDv05_parameters params;   /* filled by ZSTDv05_getFrameParams() */
+    blockType_t bType;   /* used in ZSTDv05_decompressContinue(), to transfer blockType between header decoding and block decoding stages */
+    ZSTDv05_dStage stage;
+    U32 flagStaticTables;
+    const BYTE* litPtr;
+    size_t litSize;
+    BYTE litBuffer[BLOCKSIZE + WILDCOPY_OVERLENGTH];   /* workspace : not copied by ZSTDv05_copyDCtx() */
+    BYTE headerBuffer[ZSTDv05_frameHeaderSize_max];    /* workspace : not copied by ZSTDv05_copyDCtx() */
+};  /* typedef'd to ZSTDv05_DCtx within "zstd_static.h" */
+
+/* ZSTDv05_sizeofDCtx() : size in bytes of a decompression context */
+size_t ZSTDv05_sizeofDCtx (void)
+{
+    return sizeof(ZSTDv05_DCtx);
+}
+
+/* ZSTDv05_decompressBegin() :
+ * resets `dctx` so that it expects the start of a new frame.
+ * @return : 0 (always) */
+size_t ZSTDv05_decompressBegin(ZSTDv05_DCtx* dctx)
+{
+    /* state : wait for the minimal frame header */
+    dctx->stage    = ZSTDv05ds_getFrameHeaderSize;
+    dctx->expected = ZSTDv05_frameHeaderSize_min;
+
+    /* no history, no dictionary */
+    dctx->previousDstEnd = dctx->base = dctx->vBase = dctx->dictEnd = NULL;
+
+    /* entropy tables */
+    dctx->hufTableX4[0]    = HufLog;
+    dctx->flagStaticTables = 0;
+
+    return 0;
+}
+
+/* ZSTDv05_createDCtx() :
+ * allocates a decompression context with malloc() and initialises it
+ * with ZSTDv05_decompressBegin().
+ * @return : the new context, or NULL when allocation fails */
+ZSTDv05_DCtx* ZSTDv05_createDCtx(void)
+{
+    ZSTDv05_DCtx* const ctx = (ZSTDv05_DCtx*)malloc(sizeof(ZSTDv05_DCtx));
+    if (ctx != NULL)
+        ZSTDv05_decompressBegin(ctx);
+    return ctx;
+}
+
+/* ZSTDv05_freeDCtx() :
+ * releases a context with free().
+ * @return : 0 (reserved as a potential error code in the future) */
+size_t ZSTDv05_freeDCtx(ZSTDv05_DCtx* dctx)
+{
+    size_t const noError = 0;
+    free(dctx);
+    return noError;
+}
+
+/* ZSTDv05_copyDCtx() :
+ * copies `srcDCtx` into `dstDCtx`, except the trailing workspace buffers
+ * (litBuffer and headerBuffer), which do not need to be copied. */
+void ZSTDv05_copyDCtx(ZSTDv05_DCtx* dstDCtx, const ZSTDv05_DCtx* srcDCtx)
+{
+    size_t const workspaceSize = BLOCKSIZE+WILDCOPY_OVERLENGTH + ZSTDv05_frameHeaderSize_max;
+    size_t const copiedSize = sizeof(ZSTDv05_DCtx) - workspaceSize;
+
+    memcpy(dstDCtx, srcDCtx, copiedSize);
+}
+
+
+/* *************************************************************
+*   Decompression section
+***************************************************************/
+
+/* Frame format description
+   Frame Header -  [ Block Header - Block ] - Frame End
+   1) Frame Header
+      - 4 bytes - Magic Number : ZSTDv05_MAGICNUMBER (defined within zstd_internal.h)
+      - 1 byte  - Window Descriptor
+   2) Block Header
+      - 3 bytes, starting with a 2-bits descriptor
+                 Uncompressed, Compressed, Frame End, unused
+   3) Block
+      See Block Format Description
+   4) Frame End
+      - 3 bytes, compatible with Block Header
+*/
+
+/* Block format description
+
+   Block = Literal Section - Sequences Section
+   Prerequisite : size of (compressed) block, maximum size of regenerated data
+
+   1) Literal Section
+
+   1.1) Header : 1-5 bytes
+        flags: 2 bits
+            00 compressed by Huff0
+            01 unused
+            10 is Raw (uncompressed)
+            11 is Rle
+            Note : using 01 => Huff0 with precomputed table ?
+            Note : delta map ? => compressed ?
+
+   1.1.1) Huff0-compressed literal block : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10)
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+   1.1.2) Raw (uncompressed) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RAW<<6) + (0<<4) + size
+               12 bits: (IS_RAW<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RAW<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.3) Rle (repeated single byte) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RLE<<6) + (0<<4) + size
+               12 bits: (IS_RLE<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RLE<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.4) Huff0-compressed literal block, using precomputed CTables : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10)
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+        1- CTable available (stored into workspace ?)
+        2- Small input (fast heuristic ? Full comparison ? depend on clevel ?)
+
+
+   1.2) Literal block content
+
+   1.2.1) Huff0 block, using sizes from header
+        See Huff0 format
+
+   1.2.2) Huff0 block, using prepared table
+
+   1.2.3) Raw content
+
+   1.2.4) single byte
+
+
+   2) Sequences section
+      TO DO
+*/
+
+
+/** ZSTDv05_decodeFrameHeader_Part1() :
+*   checks the magic number at the start of the Frame Header and records the header size in `zc`.
+*   srcSize must be == ZSTDv05_frameHeaderSize_min.
+*   @return : the full size of the Frame Header (ZSTDv05_frameHeaderSize_min),
+*             or an error code : srcSize_wrong, prefix_unknown */
+static size_t ZSTDv05_decodeFrameHeader_Part1(ZSTDv05_DCtx* zc, const void* src, size_t srcSize)
+{
+    if (srcSize != ZSTDv05_frameHeaderSize_min) return ERROR(srcSize_wrong);
+    if (MEM_readLE32(src) != ZSTDv05_MAGICNUMBER) return ERROR(prefix_unknown);
+
+    zc->headerSize = ZSTDv05_frameHeaderSize_min;
+    return zc->headerSize;
+}
+
+
+/* ZSTDv05_getFrameParams() :
+ * decodes frame parameters from the start of a frame.
+ * `params` is zeroed, then windowLog is set from the low 4 bits of byte 4.
+ * @return : 0 on success,
+ *           ZSTDv05_frameHeaderSize_max when srcSize < ZSTDv05_frameHeaderSize_min (`params` untouched),
+ *           or an error code : prefix_unknown (bad magic number, `params` untouched),
+ *           frameParameter_unsupported (reserved bits set, `params` already written) */
+size_t ZSTDv05_getFrameParams(ZSTDv05_parameters* params, const void* src, size_t srcSize)
+{
+    const BYTE* const header = (const BYTE*)src;
+
+    if (srcSize < ZSTDv05_frameHeaderSize_min) return ZSTDv05_frameHeaderSize_max;
+    if (MEM_readLE32(src) != ZSTDv05_MAGICNUMBER) return ERROR(prefix_unknown);
+
+    memset(params, 0, sizeof(*params));
+    params->windowLog = (header[4] & 15) + ZSTDv05_WINDOWLOG_ABSOLUTEMIN;
+
+    /* reserved bits */
+    if ((header[4] >> 4) != 0) return ERROR(frameParameter_unsupported);
+
+    return 0;
+}
+
+/** ZSTDv05_decodeFrameHeader_Part2() :
+*   decode the full Frame Header into zc->params, using ZSTDv05_getFrameParams().
+*   srcSize must be the size provided by ZSTDv05_decodeFrameHeader_Part1().
+*   On 32-bits targets, a windowLog > 25 is rejected as frameParameter_unsupported.
+*   @return : 0, or an error code, which can be tested using ZSTDv05_isError() */
+static size_t ZSTDv05_decodeFrameHeader_Part2(ZSTDv05_DCtx* zc, const void* src, size_t srcSize)
+{
+    if (srcSize != zc->headerSize) return ERROR(srcSize_wrong);
+
+    {   size_t const paramsResult = ZSTDv05_getFrameParams(&(zc->params), src, srcSize);
+        if ((MEM_32bits()) && (zc->params.windowLog > 25)) return ERROR(frameParameter_unsupported);
+        return paramsResult;
+    }
+}
+
+
+/* ZSTDv05_getcBlockSize() :
+ * decodes a 3-byte block header : block type in the 2 top bits of byte 0,
+ * size field in the low 3 bits of byte 0 followed by bytes 1 and 2.
+ * Fills `bpPtr` (origSize is the size field for bt_rle blocks, 0 otherwise).
+ * @return : 0 for bt_end, 1 for bt_rle, the size field otherwise,
+ *           or srcSize_wrong when srcSize < 3 */
+size_t ZSTDv05_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const in = (const BYTE*)src;
+
+    if (srcSize < 3) return ERROR(srcSize_wrong);
+
+    {   const U32 sizeField = in[2] + (in[1]<<8) + ((in[0] & 7)<<16);
+        const blockType_t type = (blockType_t)(in[0] >> 6);
+
+        bpPtr->blockType = type;
+        bpPtr->origSize  = (type == bt_rle) ? sizeField : 0;
+
+        if (type == bt_end) return 0;
+        if (type == bt_rle) return 1;
+        return sizeField;
+    }
+}
+
+
+/** ZSTDv05_copyRawBlock() :
+*   Copies the `srcSize` bytes of an uncompressed block from `src` to `dst`.
+*   @return : nb of bytes written into `dst` (== srcSize),
+*             or dstSize_tooSmall when `srcSize` exceeds `maxDstSize` */
+static size_t ZSTDv05_copyRawBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall);
+    /* memcpy() requires valid pointers even for a zero length,
+     * while an empty destination buffer may be described by a NULL pointer */
+    if (srcSize > 0) memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+/*! ZSTDv05_decodeLiteralsBlock() :
+    Decodes the literals sub-block found at the start of `src` and makes
+    dctx->litPtr / dctx->litSize describe the resulting literals.
+    The top 2 bits of the first byte select the encoding (IS_HUFv05, IS_PCH, IS_RAW, IS_RLE);
+    the next 2 bits (lhSize) select the layout of the literals header.
+    Literals are regenerated into dctx->litBuffer, except for large enough raw literals,
+    which are referenced directly inside `src`.
+    @return : nb of bytes read from src (<= srcSize ), or an error code */
+size_t ZSTDv05_decodeLiteralsBlock(ZSTDv05_DCtx* dctx,
+                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+{
+    const BYTE* const istart = (const BYTE*) src;
+
+    /* any compressed block with literals segment must be at least this size */
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    switch(istart[0]>> 6)
+    {
+    case IS_HUFv05:
+        /* literals compressed with HUFv05, table description carried in the block */
+        {
+            size_t litSize, litCSize, singleStream=0;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                /* 2 - 2 - 10 - 10 */
+                lhSize=3;
+                singleStream = istart[0] & 16;   /* bit 4 set (i.e. original lhSize==1) : single stream */
+                litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+                litCSize = ((istart[1] &  3) << 8) + istart[2];
+                break;
+            case 2:
+                /* 2 - 2 - 14 - 14 */
+                lhSize=4;
+                litSize  = ((istart[0] & 15) << 10) + (istart[1] << 2) + (istart[2] >> 6);
+                litCSize = ((istart[2] & 63) <<  8) + istart[3];
+                break;
+            case 3:
+                /* 2 - 2 - 18 - 18 */
+                lhSize=5;
+                litSize  = ((istart[0] & 15) << 14) + (istart[1] << 6) + (istart[2] >> 2);
+                litCSize = ((istart[2] &  3) << 16) + (istart[3] << 8) + istart[4];
+                break;
+            }
+            /* regenerated size is bounded by BLOCKSIZE; compressed payload must fit inside src */
+            if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
+            if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+
+            if (HUFv05_isError(singleStream ?
+                            HUFv05_decompress1X2(dctx->litBuffer, litSize, istart+lhSize, litCSize) :
+                            HUFv05_decompress   (dctx->litBuffer, litSize, istart+lhSize, litCSize) ))
+                return ERROR(corruption_detected);
+
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            /* zero the WILDCOPY_OVERLENGTH bytes following the literals */
+            memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+            return litCSize + lhSize;
+        }
+    case IS_PCH:
+        /* literals compressed with the table already stored in dctx->hufTableX4 ;
+         * refused unless dctx->flagStaticTables is set */
+        {
+            size_t errorCode;
+            size_t litSize, litCSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            if (lhSize != 1)  /* only case supported for now : small litSize, single stream */
+                return ERROR(corruption_detected);
+            if (!dctx->flagStaticTables)
+                return ERROR(dictionary_corrupted);
+
+            /* 2 - 2 - 10 - 10 */
+            lhSize=3;
+            litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+            litCSize = ((istart[1] &  3) << 8) + istart[2];
+            if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+
+            errorCode = HUFv05_decompress1X4_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->hufTableX4);
+            if (HUFv05_isError(errorCode)) return ERROR(corruption_detected);
+
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            /* zero the WILDCOPY_OVERLENGTH bytes following the literals */
+            memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+            return litCSize + lhSize;
+        }
+    case IS_RAW:
+        /* literals stored uncompressed, right after a 1 to 3 bytes header */
+        {
+            size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                /* 1-byte header : 5-bit size */
+                lhSize=1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:
+                /* 2-byte header : 12-bit size */
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:
+                /* 3-byte header : 20-bit size */
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                break;
+            }
+
+            if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
+                /* too close to the end of src : copy into litBuffer instead of referencing src */
+                if (litSize+lhSize > srcSize) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart+lhSize, litSize);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+                return lhSize+litSize;
+            }
+            /* direct reference into compressed stream */
+            dctx->litPtr = istart+lhSize;
+            dctx->litSize = litSize;
+            return lhSize+litSize;
+        }
+    case IS_RLE:
+        /* literals made of one byte value repeated litSize times ; the value follows the header */
+        {
+            size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                lhSize = 1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                if (srcSize<4) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
+                break;
+            }
+            if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
+            /* fill the literals and the WILDCOPY_OVERLENGTH bytes following them with the repeated byte */
+            memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            return lhSize+1;
+        }
+    default:
+        return ERROR(corruption_detected);   /* impossible */
+    }
+}
+
+
+/** ZSTDv05_decodeSeqHeaders() :
+*   Parses the header of the sequences section starting at `src` and prepares
+*   the three FSE decoding tables (literal lengths, offsets, match lengths).
+*   @param nbSeq : receives the number of sequences
+*   @param dumpsPtr, dumpsLengthPtr : receive the position and size of the "dumps" area
+*          (left untouched when the section announces 0 sequences)
+*   @param DTableLL, DTableML, DTableOffb : tables to build; left as they are for
+*          FSEv05_ENCODING_STATIC, which is only accepted when `flagStaticTable` is set
+*   @return : nb of bytes read from `src`, or an error code */
+size_t ZSTDv05_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
+                         FSEv05_DTable* DTableLL, FSEv05_DTable* DTableML, FSEv05_DTable* DTableOffb,
+                         const void* src, size_t srcSize, U32 flagStaticTable)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* ip = istart;
+    const BYTE* const iend = istart + srcSize;
+    U32 LLtype, Offtype, MLtype;
+    U32 LLlog, Offlog, MLlog;
+    size_t dumpsLength;
+
+    /* check */
+    if (srcSize < MIN_SEQUENCES_SIZE)
+        return ERROR(srcSize_wrong);
+
+    /* SeqHead : 1 byte, or 2 bytes when the first one is >= 128 */
+    *nbSeq = *ip++;
+    if (*nbSeq==0) return 1;
+    if (*nbSeq >= 128) {
+        if (ip >= iend) return ERROR(srcSize_wrong);
+        *nbSeq = ((nbSeq[0]-128)<<8) + *ip++;
+    }
+
+    /* table types (2 bits each), followed by the dumps length on 9 or 16 bits */
+    if (ip >= iend) return ERROR(srcSize_wrong);
+    LLtype  = *ip >> 6;
+    Offtype = (*ip >> 4) & 3;
+    MLtype  = (*ip >> 2) & 3;
+    if (*ip & 2) {
+        if (ip+3 > iend) return ERROR(srcSize_wrong);
+        dumpsLength  = ip[2];
+        dumpsLength += ip[1] << 8;
+        ip += 3;
+    } else {
+        if (ip+2 > iend) return ERROR(srcSize_wrong);
+        dumpsLength  = ip[1];
+        dumpsLength += (ip[0] & 1) << 8;
+        ip += 2;
+    }
+    *dumpsPtr = ip;
+    ip += dumpsLength;
+    *dumpsLengthPtr = dumpsLength;
+
+    /* check */
+    if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */
+
+    /* sequences */
+    {
+        S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL >= MaxOff */
+        size_t headerSize;
+        size_t buildResult;   /* status of FSEv05_buildDTable() for dynamic tables */
+
+        /* Build DTables */
+        switch(LLtype)
+        {
+        case FSEv05_ENCODING_RLE :
+            LLlog = 0;
+            FSEv05_buildDTable_rle(DTableLL, *ip++);
+            break;
+        case FSEv05_ENCODING_RAW :
+            LLlog = LLbits;
+            FSEv05_buildDTable_raw(DTableLL, LLbits);
+            break;
+        case FSEv05_ENCODING_STATIC:
+            if (!flagStaticTable) return ERROR(corruption_detected);
+            break;
+        case FSEv05_ENCODING_DYNAMIC :
+        default :   /* impossible */
+            {   U32 max = MaxLL;
+                headerSize = FSEv05_readNCount(norm, &max, &LLlog, ip, iend-ip);
+                if (FSEv05_isError(headerSize)) return ERROR(GENERIC);
+                if (LLlog > LLFSEv05Log) return ERROR(corruption_detected);
+                ip += headerSize;
+                /* a table that failed to build must not be used for decoding */
+                buildResult = FSEv05_buildDTable(DTableLL, norm, max, LLlog);
+                if (FSEv05_isError(buildResult)) return ERROR(corruption_detected);
+        }   }
+
+        switch(Offtype)
+        {
+        case FSEv05_ENCODING_RLE :
+            Offlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong);   /* min : "raw", hence no header, but at least xxLog bits */
+            FSEv05_buildDTable_rle(DTableOffb, *ip++ & MaxOff); /* if *ip > MaxOff, data is corrupted */
+            break;
+        case FSEv05_ENCODING_RAW :
+            Offlog = Offbits;
+            FSEv05_buildDTable_raw(DTableOffb, Offbits);
+            break;
+        case FSEv05_ENCODING_STATIC:
+            if (!flagStaticTable) return ERROR(corruption_detected);
+            break;
+        case FSEv05_ENCODING_DYNAMIC :
+        default :   /* impossible */
+            {   U32 max = MaxOff;
+                headerSize = FSEv05_readNCount(norm, &max, &Offlog, ip, iend-ip);
+                if (FSEv05_isError(headerSize)) return ERROR(GENERIC);
+                if (Offlog > OffFSEv05Log) return ERROR(corruption_detected);
+                ip += headerSize;
+                buildResult = FSEv05_buildDTable(DTableOffb, norm, max, Offlog);
+                if (FSEv05_isError(buildResult)) return ERROR(corruption_detected);
+        }   }
+
+        switch(MLtype)
+        {
+        case FSEv05_ENCODING_RLE :
+            MLlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong); /* min : "raw", hence no header, but at least xxLog bits */
+            FSEv05_buildDTable_rle(DTableML, *ip++);
+            break;
+        case FSEv05_ENCODING_RAW :
+            MLlog = MLbits;
+            FSEv05_buildDTable_raw(DTableML, MLbits);
+            break;
+        case FSEv05_ENCODING_STATIC:
+            if (!flagStaticTable) return ERROR(corruption_detected);
+            break;
+        case FSEv05_ENCODING_DYNAMIC :
+        default :   /* impossible */
+            {   U32 max = MaxML;
+                headerSize = FSEv05_readNCount(norm, &max, &MLlog, ip, iend-ip);
+                if (FSEv05_isError(headerSize)) return ERROR(GENERIC);
+                if (MLlog > MLFSEv05Log) return ERROR(corruption_detected);
+                ip += headerSize;
+                buildResult = FSEv05_buildDTable(DTableML, norm, max, MLlog);
+                if (FSEv05_isError(buildResult)) return ERROR(corruption_detected);
+    }   }   }
+
+    return ip-istart;
+}
+
+
+/* One decoded sequence, as filled by ZSTDv05_decodeSequence()
+ * and consumed by ZSTDv05_execSequence(). */
+typedef struct {
+    size_t litLength;     /* nb of literals to copy before the match */
+    size_t matchLength;   /* nb of match bytes to copy (MINMATCH already added by the decoder) */
+    size_t offset;        /* distance of the match, counted back from the end of the copied literals */
+} seq_t;
+
+/* Running state of the sequence decoder, carried from one
+ * ZSTDv05_decodeSequence() call to the next. */
+typedef struct {
+    BITv05_DStream_t DStream;    /* bit stream read by the three FSE states and for offset extra bits */
+    FSEv05_DState_t stateLL;     /* FSE state for literal lengths */
+    FSEv05_DState_t stateOffb;   /* FSE state for offset codes */
+    FSEv05_DState_t stateML;     /* FSE state for match lengths */
+    size_t prevOffset;           /* offset kept for the repeat-offset code (offsetCode==0) */
+    const BYTE* dumps;           /* read cursor inside the dumps area (extra length bytes) */
+    const BYTE* dumpsEnd;        /* end of the dumps area */
+} seqState_t;
+
+
+
+/** ZSTDv05_decodeSequence() :
+*   Decodes the next sequence (literal length, offset, match length) from `seqState`
+*   and stores it into `seq`. `seq` must still hold the previously decoded sequence on entry :
+*   its offset feeds the repeat-offset logic.
+*   Lengths equal to MaxLL / MaxML are extended with bytes taken from the dumps area :
+*   one byte when it is < 255, otherwise a 2 or 3 bytes little-endian value whose lowest bit
+*   tells whether a third byte is present. Long-form bytes are only read when they lie inside
+*   the dumps area; a truncated long form leaves the length at its base value
+*   (data is corrupted in that case anyway). */
+static void ZSTDv05_decodeSequence(seq_t* seq, seqState_t* seqState)
+{
+    size_t litLength;
+    size_t prevOffset;
+    size_t offset;
+    size_t matchLength;
+    const BYTE* dumps = seqState->dumps;
+    const BYTE* const de = seqState->dumpsEnd;
+
+    /* Literal length */
+    litLength = FSEv05_peakSymbol(&(seqState->stateLL));
+    prevOffset = litLength ? seq->offset : seqState->prevOffset;
+    if (litLength == MaxLL) {
+        U32 const add = *dumps++;
+        if (add < 255) litLength += add;
+        else if (dumps + 2 <= de) {
+            /* long form : read byte by byte, never beyond the end of dumps */
+            litLength = (size_t)dumps[0] + ((size_t)dumps[1] << 8);
+            dumps += 2;
+            if ((litLength & 1) && (dumps < de)) {
+                litLength += (size_t)dumps[0] << 16;
+                dumps += 1;
+            }
+            litLength >>= 1;
+        }
+        if (dumps > de) { litLength = MaxLL+255; }  /* late correction, to avoid using uninitialized memory */
+        if (dumps >= de) { dumps = de-1; }  /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+
+    /* Offset */
+    {
+        static const U32 offsetPrefix[MaxOff+1] = {
+                1 /*fake*/, 1, 2, 4, 8, 16, 32, 64, 128, 256,
+                512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144,
+                524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, /*fake*/ 1, 1, 1, 1, 1 };
+        U32 offsetCode = FSEv05_peakSymbol(&(seqState->stateOffb));   /* <= maxOff, by table construction */
+        U32 nbBits = offsetCode - 1;
+        if (offsetCode==0) nbBits = 0;   /* cmove */
+        offset = offsetPrefix[offsetCode] + BITv05_readBits(&(seqState->DStream), nbBits);
+        if (MEM_32bits()) BITv05_reloadDStream(&(seqState->DStream));
+        if (offsetCode==0) offset = prevOffset;   /* repcode, cmove */
+        if (offsetCode | !litLength) seqState->prevOffset = seq->offset;   /* cmove */
+        FSEv05_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));    /* update */
+    }
+
+    /* Literal length update */
+    FSEv05_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));   /* update */
+    if (MEM_32bits()) BITv05_reloadDStream(&(seqState->DStream));
+
+    /* MatchLength */
+    matchLength = FSEv05_decodeSymbol(&(seqState->stateML), &(seqState->DStream));
+    if (matchLength == MaxML) {
+        U32 const add = *dumps++;
+        if (add < 255) matchLength += add;
+        else if (dumps + 2 <= de) {
+            /* long form : read byte by byte, never beyond the end of dumps */
+            matchLength = (size_t)dumps[0] + ((size_t)dumps[1] << 8);
+            dumps += 2;
+            if ((matchLength & 1) && (dumps < de)) {
+                matchLength += (size_t)dumps[0] << 16;
+                dumps += 1;
+            }
+            matchLength >>= 1;
+        }
+        if (dumps > de) { matchLength = MaxML+255; }  /* late correction, to avoid using uninitialized memory */
+        if (dumps >= de) { dumps = de-1; }  /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+    matchLength += MINMATCH;
+
+    /* save result */
+    seq->litLength = litLength;
+    seq->offset = offset;
+    seq->matchLength = matchLength;
+    seqState->dumps = dumps;
+
+#if 0   /* debug */
+    {
+        static U64 totalDecoded = 0;
+        printf("pos %6u : %3u literals & match %3u bytes at distance %6u \n",
+           (U32)(totalDecoded), (U32)litLength, (U32)matchLength, (U32)offset);
+        totalDecoded += litLength + matchLength;
+    }
+#endif
+}
+
+
+/** ZSTDv05_execSequence() :
+*   Executes one sequence : copies `sequence.litLength` literals from `*litPtr` to `op`,
+*   then `sequence.matchLength` bytes from `sequence.offset` bytes behind the copied literals.
+*   A match reaching before `base` is fetched from the segment ending at `dictEnd`
+*   (valid offsets extend back to `vBase`).
+*   `*litPtr` is advanced past the consumed literals.
+*   @return : nb of bytes written at `op` (litLength + matchLength), or an error code */
+static size_t ZSTDv05_execSequence(BYTE* op,
+                                BYTE* const oend, seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{
+    static const int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+    static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
+    BYTE* const oLitEnd = op + sequence.litLength;
+    const size_t sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_8 = oend-8;
+    const BYTE* const litEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check lengths against the remaining room first : the pointer comparisons below
+     * could be defeated by address wrap-around when lengths are huge */
+    if (sequenceLength > (size_t)(oend - op)) return ERROR(dstSize_tooSmall);
+    if (sequence.litLength > (size_t)(litLimit - *litPtr)) return ERROR(corruption_detected);
+
+    /* lengths are now known to be in range : pointer checks are reliable */
+    if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall);   /* last match must start at a minimum distance of 8 from oend */
+    if (oMatchEnd > oend) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
+    if (litEnd > litLimit) return ERROR(corruption_detected);   /* risk read beyond lit buffer */
+
+    /* copy Literals */
+    ZSTDv05_wildcopy(op, *litPtr, sequence.litLength);   /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = litEnd;   /* update for next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase))
+            return ERROR(corruption_detected);
+        match = dictEnd - (base-match);
+        if (match + sequence.matchLength <= dictEnd) {
+            /* match entirely inside the external segment */
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {
+            size_t length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
+            if (op > oend_8 || sequence.matchLength < MINMATCH) {
+              /* too close to the end, or too short for the 8-byte copies below : byte by byte */
+              while (op < oMatchEnd) *op++ = *match++;
+              return sequenceLength;
+            }
+    }   }
+    /* Requirement: op <= oend_8 */
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        const int sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTDv05_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTDv05_copy8(op, match);
+    }
+    op += 8; match += 8;
+
+    if (oMatchEnd > oend-(16-MINMATCH)) {
+        /* match ends near oend : wildcopy only up to oend_8, finish byte by byte */
+        if (op < oend_8) {
+            ZSTDv05_wildcopy(op, match, oend_8 - op);
+            match += oend_8 - op;
+            op = oend_8;
+        }
+        while (op < oMatchEnd)
+            *op++ = *match++;
+    } else {
+        ZSTDv05_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
+}
+
+
+/** ZSTDv05_decompressSequences() :
+*   Decodes the sequences section located at `seqStart` and regenerates the block into `dst`,
+*   using the literals previously referenced by dctx->litPtr / dctx->litSize.
+*   Literals left over after the last sequence are appended to the output.
+*   @return : nb of bytes written into `dst`, or an error code */
+static size_t ZSTDv05_decompressSequences(
+                               ZSTDv05_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t errorCode, dumpsLength;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    int nbSeq;
+    const BYTE* dumps;
+    U32* DTableLL = dctx->LLTable;
+    U32* DTableML = dctx->MLTable;
+    U32* DTableOffb = dctx->OffTable;
+    const BYTE* const base = (const BYTE*) (dctx->base);
+    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+
+    /* Build Decoding Tables */
+    errorCode = ZSTDv05_decodeSeqHeaders(&nbSeq, &dumps, &dumpsLength,
+                                      DTableLL, DTableML, DTableOffb,
+                                      ip, seqSize, dctx->flagStaticTables);
+    if (ZSTDv05_isError(errorCode)) return errorCode;
+    ip += errorCode;
+
+    /* Regen sequences */
+    if (nbSeq) {
+        seq_t sequence;
+        seqState_t seqState;
+
+        memset(&sequence, 0, sizeof(sequence));
+        sequence.offset = REPCODE_STARTVALUE;
+        seqState.dumps = dumps;
+        seqState.dumpsEnd = dumps + dumpsLength;
+        seqState.prevOffset = REPCODE_STARTVALUE;
+        errorCode = BITv05_initDStream(&(seqState.DStream), ip, iend-ip);
+        if (ERR_isError(errorCode)) return ERROR(corruption_detected);
+        FSEv05_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSEv05_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSEv05_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        /* decode and execute sequences while the bit stream has data and sequences remain */
+        for ( ; (BITv05_reloadDStream(&(seqState.DStream)) <= BITv05_DStream_completed) && nbSeq ; ) {
+            size_t oneSeqSize;
+            nbSeq--;
+            ZSTDv05_decodeSequence(&sequence, &seqState);
+            oneSeqSize = ZSTDv05_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+            if (ZSTDv05_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
+        }
+
+        /* check if reached exact end */
+        if (nbSeq) return ERROR(corruption_detected);
+    }
+
+    /* last literal segment */
+    {
+        size_t lastLLSize;
+        if (litPtr > litEnd) return ERROR(corruption_detected);   /* too many literals already used */
+        lastLLSize = litEnd - litPtr;
+        /* compare sizes rather than pointers : op+lastLLSize could wrap around */
+        if (lastLLSize > (size_t)(oend - op)) return ERROR(dstSize_tooSmall);
+        if (lastLLSize > 0) {   /* memcpy() requires valid pointers even for a zero length */
+            memcpy(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+    }
+
+    return op-ostart;
+}
+
+
+/** ZSTDv05_checkContinuity() :
+*   When `dst` does not directly follow the previous output (dctx->previousDstEnd),
+*   the data written so far becomes the external segment (ending at dctx->dictEnd)
+*   and a new prefix is started at `dst`. Does nothing when the output is contiguous. */
+static void ZSTDv05_checkContinuity(ZSTDv05_DCtx* dctx, const void* dst)
+{
+    const char* const prevEnd  = (const char*)(dctx->previousDstEnd);
+    const char* const prevBase = (const char*)(dctx->base);
+
+    if (dst == dctx->previousDstEnd) return;   /* contiguous : nothing to update */
+
+    dctx->dictEnd = dctx->previousDstEnd;
+    /* virtual base : `dst` shifted back by the size of the previous segment */
+    dctx->vBase = (const char*)dst - (prevEnd - prevBase);
+    dctx->base = dst;
+    dctx->previousDstEnd = dst;
+}
+
+
+/** ZSTDv05_decompressBlock_internal() :
+*   Decompresses one compressed block : decodes its literals sub-block,
+*   then hands the remainder of `src` to the sequences decoder.
+*   `srcSize` must be < BLOCKSIZE.
+*   @return : nb of bytes written into `dst`, or an error code */
+static size_t ZSTDv05_decompressBlock_internal(ZSTDv05_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{   /* blockType == blockCompressed */
+    size_t litSectionSize;
+
+    if (srcSize >= BLOCKSIZE) return ERROR(srcSize_wrong);
+
+    /* Decode literals sub-block */
+    litSectionSize = ZSTDv05_decodeLiteralsBlock(dctx, src, srcSize);
+    if (ZSTDv05_isError(litSectionSize)) return litSectionSize;
+
+    /* sequences start right after the literals sub-block */
+    return ZSTDv05_decompressSequences(dctx, dst, dstCapacity,
+                                       (const BYTE*)src + litSectionSize,
+                                       srcSize - litSectionSize);
+}
+
+
+/** ZSTDv05_decompressBlock() :
+*   Decompresses a single compressed block into `dst`,
+*   after updating the context for a possibly non-contiguous destination.
+*   @return : nb of bytes written into `dst`, or an error code */
+size_t ZSTDv05_decompressBlock(ZSTDv05_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    size_t regenSize;
+
+    ZSTDv05_checkContinuity(dctx, dst);
+    regenSize = ZSTDv05_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+    return regenSize;
+}
+
+
+/*! ZSTDv05_decompress_continueDCtx
+*   Decompresses one complete frame (frame header, then blocks up to the end marker)
+*   from `src` into `dst`.
+*   dctx must have been properly initialized
+*   @return : nb of bytes written into `dst`, or an error code */
+static size_t ZSTDv05_decompress_continueDCtx(ZSTDv05_DCtx* dctx,
+                                 void* dst, size_t maxDstSize,
+                                 const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t remainingSize = srcSize;
+    blockProperties_t blockProperties;
+
+    /* Frame Header */
+    {
+        size_t frameHeaderSize;
+        /* input must hold at least a minimal frame header plus one block header */
+        if (srcSize < ZSTDv05_frameHeaderSize_min+ZSTDv05_blockHeaderSize) return ERROR(srcSize_wrong);
+        frameHeaderSize = ZSTDv05_decodeFrameHeader_Part1(dctx, src, ZSTDv05_frameHeaderSize_min);
+        if (ZSTDv05_isError(frameHeaderSize)) return frameHeaderSize;
+        if (srcSize < frameHeaderSize+ZSTDv05_blockHeaderSize) return ERROR(srcSize_wrong);
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+        /* frameHeaderSize is reused below to carry the status of Part2 */
+        frameHeaderSize = ZSTDv05_decodeFrameHeader_Part2(dctx, src, frameHeaderSize);
+        if (ZSTDv05_isError(frameHeaderSize)) return frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t decodedSize=0;
+        size_t cBlockSize = ZSTDv05_getcBlockSize(ip, iend-ip, &blockProperties);
+        if (ZSTDv05_isError(cBlockSize)) return cBlockSize;
+
+        /* skip the block header, then make sure the announced content is present */
+        ip += ZSTDv05_blockHeaderSize;
+        remainingSize -= ZSTDv05_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTDv05_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize);
+            break;
+        case bt_raw :
+            decodedSize = ZSTDv05_copyRawBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet supported */
+            break;
+        case bt_end :
+            /* end of frame */
+            if (remainingSize) return ERROR(srcSize_wrong);
+            break;
+        default:
+            return ERROR(GENERIC);   /* impossible */
+        }
+        /* note : this test runs before decodedSize is checked for errors,
+         * so any block announcing a size of 0 ends the loop here */
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        if (ZSTDv05_isError(decodedSize)) return decodedSize;
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return op-ostart;
+}
+
+
+/** ZSTDv05_decompress_usingPreparedDCtx() :
+*   Decompresses one frame after copying the state of `refDCtx` into `dctx`.
+*   @return : nb of bytes written into `dst`, or an error code */
+size_t ZSTDv05_decompress_usingPreparedDCtx(ZSTDv05_DCtx* dctx, const ZSTDv05_DCtx* refDCtx,
+                                         void* dst, size_t maxDstSize,
+                                   const void* src, size_t srcSize)
+{
+    size_t regenSize;
+
+    ZSTDv05_copyDCtx(dctx, refDCtx);      /* start from the prepared reference state */
+    ZSTDv05_checkContinuity(dctx, dst);
+    regenSize = ZSTDv05_decompress_continueDCtx(dctx, dst, maxDstSize, src, srcSize);
+    return regenSize;
+}
+
+
+/** ZSTDv05_decompress_usingDict() :
+*   Decompresses one frame after (re)starting `dctx` with the dictionary `dict`.
+*   NOTE(review) : the outcome of ZSTDv05_decompressBegin_usingDict(), if it reports one,
+*   is not inspected here.
+*   @return : nb of bytes written into `dst`, or an error code */
+size_t ZSTDv05_decompress_usingDict(ZSTDv05_DCtx* dctx,
+                                 void* dst, size_t maxDstSize,
+                                 const void* src, size_t srcSize,
+                                 const void* dict, size_t dictSize)
+{
+    size_t regenSize;
+
+    ZSTDv05_decompressBegin_usingDict(dctx, dict, dictSize);
+    ZSTDv05_checkContinuity(dctx, dst);
+    regenSize = ZSTDv05_decompress_continueDCtx(dctx, dst, maxDstSize, src, srcSize);
+    return regenSize;
+}
+
+
+/** ZSTDv05_decompressDCtx() :
+*   Decompresses one frame with the provided context and no dictionary.
+*   @return : nb of bytes written into `dst`, or an error code */
+size_t ZSTDv05_decompressDCtx(ZSTDv05_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    const void* const noDict = NULL;
+    size_t const noDictSize = 0;
+    return ZSTDv05_decompress_usingDict(dctx, dst, maxDstSize, src, srcSize, noDict, noDictSize);
+}
+
+/** ZSTDv05_decompress() :
+*   Decompresses one frame from `src` into `dst` using a temporary decompression context.
+*   The context is created with ZSTDv05_createDCtx() and released with ZSTDv05_freeDCtx()
+*   when ZSTDv05_HEAPMODE is defined to 1; otherwise it is a local variable of this function.
+*   @return : the value returned by ZSTDv05_decompressDCtx(),
+*             or memory_allocation when the context cannot be created */
+size_t ZSTDv05_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+#if defined(ZSTDv05_HEAPMODE) && (ZSTDv05_HEAPMODE==1)
+    /* heap mode : context obtained from ZSTDv05_createDCtx() */
+    size_t regenSize;
+    ZSTDv05_DCtx* dctx = ZSTDv05_createDCtx();
+    if (dctx==NULL) return ERROR(memory_allocation);
+    regenSize = ZSTDv05_decompressDCtx(dctx, dst, maxDstSize, src, srcSize);
+    ZSTDv05_freeDCtx(dctx);
+    return regenSize;
+#else
+    /* default : context is a local variable */
+    ZSTDv05_DCtx dctx;
+    return ZSTDv05_decompressDCtx(&dctx, dst, maxDstSize, src, srcSize);
+#endif
+}
+
+/** ZSTDv05_findFrameCompressedSize() :
+*   Walks the block headers of the frame starting at `src`, without decompressing anything.
+*   @return : nb of bytes from `src` up to and including the header of the block
+*             that announces a size of 0, or an error code */
+size_t ZSTDv05_findFrameCompressedSize(const void *src, size_t srcSize)
+{
+    const BYTE* const start = (const BYTE*)src;
+    const BYTE* cursor = start;
+    size_t bytesLeft = srcSize;
+    blockProperties_t props;
+
+    /* Frame Header */
+    if (srcSize < ZSTDv05_frameHeaderSize_min) return ERROR(srcSize_wrong);
+    if (MEM_readLE32(src) != ZSTDv05_MAGICNUMBER) return ERROR(prefix_unknown);
+    cursor += ZSTDv05_frameHeaderSize_min;
+    bytesLeft -= ZSTDv05_frameHeaderSize_min;
+
+    /* Hop from block to block */
+    for (;;) {
+        size_t const cBlockSize = ZSTDv05_getcBlockSize(cursor, bytesLeft, &props);
+        if (ZSTDv05_isError(cBlockSize)) return cBlockSize;
+
+        cursor += ZSTDv05_blockHeaderSize;
+        bytesLeft -= ZSTDv05_blockHeaderSize;
+        if (cBlockSize > bytesLeft) return ERROR(srcSize_wrong);
+
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        cursor += cBlockSize;
+        bytesLeft -= cBlockSize;
+    }
+
+    return cursor - start;
+}
+
+/* ******************************
+*  Streaming Decompression API
+********************************/
+/** ZSTDv05_nextSrcSizeToDecompress() :
+*   @return : the exact `srcSize` that the next call to ZSTDv05_decompressContinue() will accept */
+size_t ZSTDv05_nextSrcSizeToDecompress(ZSTDv05_DCtx* dctx)
+{
+    size_t const wanted = dctx->expected;
+    return wanted;
+}
+
+/** ZSTDv05_decompressContinue() :
+*   Streaming decompression step. `srcSize` must be exactly the value of dctx->expected
+*   (as reported by ZSTDv05_nextSrcSizeToDecompress()).
+*   Depending on dctx->stage, the input is a frame header, a block header, or a block content.
+*   @return : nb of bytes written into `dst` (0 for header stages), or an error code.
+*   When a block fails to decode, dctx->previousDstEnd is left untouched :
+*   the returned value is then an error code, not a byte count. */
+size_t ZSTDv05_decompressContinue(ZSTDv05_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    /* Sanity check */
+    if (srcSize != dctx->expected) return ERROR(srcSize_wrong);
+    ZSTDv05_checkContinuity(dctx, dst);
+
+    /* Decompress : frame header; part 1 */
+    switch (dctx->stage)
+    {
+    case ZSTDv05ds_getFrameHeaderSize :
+        /* get frame header size */
+        if (srcSize != ZSTDv05_frameHeaderSize_min) return ERROR(srcSize_wrong);   /* impossible */
+        dctx->headerSize = ZSTDv05_decodeFrameHeader_Part1(dctx, src, ZSTDv05_frameHeaderSize_min);
+        if (ZSTDv05_isError(dctx->headerSize)) return dctx->headerSize;
+        memcpy(dctx->headerBuffer, src, ZSTDv05_frameHeaderSize_min);
+        if (dctx->headerSize > ZSTDv05_frameHeaderSize_min) return ERROR(GENERIC); /* should never happen */
+        dctx->expected = 0;   /* not necessary to copy more */
+        /* fallthrough */
+    case ZSTDv05ds_decodeFrameHeader:
+        /* get frame header */
+        {   size_t const result = ZSTDv05_decodeFrameHeader_Part2(dctx, dctx->headerBuffer, dctx->headerSize);
+            if (ZSTDv05_isError(result)) return result;
+            dctx->expected = ZSTDv05_blockHeaderSize;
+            dctx->stage = ZSTDv05ds_decodeBlockHeader;
+            return 0;
+        }
+    case ZSTDv05ds_decodeBlockHeader:
+        {
+            /* Decode block header */
+            blockProperties_t bp;
+            size_t blockSize = ZSTDv05_getcBlockSize(src, ZSTDv05_blockHeaderSize, &bp);
+            if (ZSTDv05_isError(blockSize)) return blockSize;
+            if (bp.blockType == bt_end) {
+                /* end of frame : get ready for a new frame header */
+                dctx->expected = 0;
+                dctx->stage = ZSTDv05ds_getFrameHeaderSize;
+            }
+            else {
+                dctx->expected = blockSize;
+                dctx->bType = bp.blockType;
+                dctx->stage = ZSTDv05ds_decompressBlock;
+            }
+            return 0;
+        }
+    case ZSTDv05ds_decompressBlock:
+        {
+            /* Decompress : block content */
+            size_t rSize;
+            switch(dctx->bType)
+            {
+            case bt_compressed:
+                rSize = ZSTDv05_decompressBlock_internal(dctx, dst, maxDstSize, src, srcSize);
+                break;
+            case bt_raw :
+                rSize = ZSTDv05_copyRawBlock(dst, maxDstSize, src, srcSize);
+                break;
+            case bt_rle :
+                return ERROR(GENERIC);   /* not yet handled */
+            case bt_end :   /* should never happen (filtered at phase 1) */
+                rSize = 0;
+                break;
+            default:
+                return ERROR(GENERIC);   /* impossible */
+            }
+            dctx->stage = ZSTDv05ds_decodeBlockHeader;
+            dctx->expected = ZSTDv05_blockHeaderSize;
+            /* an error code is not a byte count : do not add it to the output position */
+            if (ZSTDv05_isError(rSize)) return rSize;
+            dctx->previousDstEnd = (char*)dst + rSize;
+            return rSize;
+        }
+    default:
+        return ERROR(GENERIC);   /* impossible */
+    }
+}
+
+
+/** ZSTDv05_refDictContent() :
+*   References `dict` as raw content : the data known so far becomes the external segment
+*   (ending at dctx->dictEnd) and the dictionary becomes the current prefix,
+*   so that decoded data is expected to follow it. */
+static void ZSTDv05_refDictContent(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    const char* const content  = (const char*)dict;
+    const char* const prevEnd  = (const char*)(dctx->previousDstEnd);
+    const char* const prevBase = (const char*)(dctx->base);
+
+    dctx->dictEnd = dctx->previousDstEnd;
+    /* virtual base : `dict` shifted back by the size of the previous segment */
+    dctx->vBase = content - (prevEnd - prevBase);
+    dctx->base = dict;
+    dctx->previousDstEnd = content + dictSize;
+}
+
+/** ZSTDv05_loadEntropy() :
+*   Loads the entropy tables stored at the beginning of a dictionary, in this order :
+*   Huffman table (dctx->hufTableX4), then the FSE tables for offset codes,
+*   match lengths and literal lengths. Sets dctx->flagStaticTables on success.
+*   @return : nb of bytes read from `dict`, or dictionary_corrupted */
+static size_t ZSTDv05_loadEntropy(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    size_t hSize, offcodeHeaderSize, matchlengthHeaderSize, errorCode, litlengthHeaderSize;
+    short offcodeNCount[MaxOff+1];
+    U32 offcodeMaxValue=MaxOff, offcodeLog;
+    short matchlengthNCount[MaxML+1];
+    unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+    short litlengthNCount[MaxLL+1];
+    unsigned litlengthMaxValue = MaxLL, litlengthLog;
+
+    /* Huffman table */
+    hSize = HUFv05_readDTableX4(dctx->hufTableX4, dict, dictSize);
+    if (HUFv05_isError(hSize)) return ERROR(dictionary_corrupted);
+    dict = (const char*)dict + hSize;
+    dictSize -= hSize;
+
+    /* offset codes */
+    offcodeHeaderSize = FSEv05_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dict, dictSize);
+    if (FSEv05_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
+    if (offcodeLog > OffFSEv05Log) return ERROR(dictionary_corrupted);
+    errorCode = FSEv05_buildDTable(dctx->OffTable, offcodeNCount, offcodeMaxValue, offcodeLog);
+    if (FSEv05_isError(errorCode)) return ERROR(dictionary_corrupted);
+    dict = (const char*)dict + offcodeHeaderSize;
+    dictSize -= offcodeHeaderSize;
+
+    /* match lengths */
+    matchlengthHeaderSize = FSEv05_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dict, dictSize);
+    if (FSEv05_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
+    if (matchlengthLog > MLFSEv05Log) return ERROR(dictionary_corrupted);
+    errorCode = FSEv05_buildDTable(dctx->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog);
+    if (FSEv05_isError(errorCode)) return ERROR(dictionary_corrupted);
+    dict = (const char*)dict + matchlengthHeaderSize;
+    dictSize -= matchlengthHeaderSize;
+
+    /* literal lengths */
+    litlengthHeaderSize = FSEv05_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dict, dictSize);
+    /* test for failure first : litlengthLog is only meaningful once the header was read successfully */
+    if (FSEv05_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
+    if (litlengthLog > LLFSEv05Log) return ERROR(dictionary_corrupted);
+    errorCode = FSEv05_buildDTable(dctx->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog);
+    if (FSEv05_isError(errorCode)) return ERROR(dictionary_corrupted);
+
+    dctx->flagStaticTables = 1;
+    return hSize + offcodeHeaderSize + matchlengthHeaderSize + litlengthHeaderSize;
+}
+
+static size_t ZSTDv05_decompress_insertDictionary(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize)
+{   /* Load `dict` : entropy tables + content when it starts with ZSTDv05_DICT_MAGIC, raw content otherwise. Returns 0 or an error code. */
+    size_t eSize;
+    /* fewer than 4 bytes cannot hold the magic number : never read past the end, treat such input as raw content */
+    if ((dictSize < 4) || (MEM_readLE32(dict) != ZSTDv05_DICT_MAGIC)) {
+        /* pure content mode */
+        ZSTDv05_refDictContent(dctx, dict, dictSize);
+        return 0;
+    }
+    /* load entropy tables */
+    dict = (const char*)dict + 4;
+    dictSize -= 4;
+    eSize = ZSTDv05_loadEntropy(dctx, dict, dictSize);
+    if (ZSTDv05_isError(eSize)) return ERROR(dictionary_corrupted);
+
+    /* reference dictionary content */
+    dict = (const char*)dict + eSize;
+    dictSize -= eSize;
+    ZSTDv05_refDictContent(dctx, dict, dictSize);
+
+    return 0;
+}
+
+
+size_t ZSTDv05_decompressBegin_usingDict(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize)
+{   /* Calls ZSTDv05_decompressBegin(), then loads `dict` when one is supplied (NULL or empty means no dictionary). */
+    size_t const initResult = ZSTDv05_decompressBegin(dctx);
+    if (ZSTDv05_isError(initResult)) return initResult;
+
+    if ((dict == NULL) || (dictSize == 0)) return 0;   /* nothing to load */
+
+    /* any failure while loading the dictionary is reported as dictionary_corrupted */
+    if (ZSTDv05_isError(ZSTDv05_decompress_insertDictionary(dctx, dict, dictSize)))
+        return ERROR(dictionary_corrupted);
+
+    return 0;
+}
+
+/*
+    Buffered version of Zstd compression library
+    Copyright (C) 2015-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* The objects defined into this file should be considered experimental.
+ * They are not labelled stable, as their prototype may change in the future.
+ * You can use them for tests, provide feedback, or if you can endure risk of future changes.
+ */
+
+
+
+/* *************************************
+*  Constants
+***************************************/
+static const size_t ZBUFFv05_blockHeaderSize = 3;   /* read-only : added to size hints so the next block header is fetched with the block */
+
+
+
+/* *** Compression *** */
+
+static size_t ZBUFFv05_limitCopy(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{   /* Copies as much of `src` as fits in `dst`; returns the number of bytes copied, i.e. min(maxDstSize, srcSize). */
+    size_t const length = MIN(maxDstSize, srcSize);
+    if (length > 0) memcpy(dst, src, length);   /* memcpy() on a NULL pointer is undefined even for 0 bytes */
+    return length;
+}
+
+
+
+
+/** ************************************************
+*  Streaming decompression
+*
+*  A ZBUFFv05_DCtx object is required to track streaming operation.
+*  Use ZBUFFv05_createDCtx() and ZBUFFv05_freeDCtx() to create/release resources.
+*  Use ZBUFFv05_decompressInit() to start a new decompression operation.
+*  ZBUFFv05_DCtx objects can be reused multiple times.
+*
+*  Use ZBUFFv05_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *maxDstSizePtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to call again the function with remaining input.
+*  The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst .
+*  return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*            or 0 when a frame is completely decoded
+*            or an error code, which can be tested using ZBUFFv05_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory)
+*  output : 128 KB block size is the internal unit, it ensures it's always possible to write a full block when it's decoded.
+*  input : just follow indications from ZBUFFv05_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* **************************************************/
+
+typedef enum { ZBUFFv05ds_init, ZBUFFv05ds_readHeader, ZBUFFv05ds_loadHeader, ZBUFFv05ds_decodeHeader,
+               ZBUFFv05ds_read, ZBUFFv05ds_load, ZBUFFv05ds_flush } ZBUFFv05_dStage;   /* states switched on by ZBUFFv05_decompressContinue() */
+
+/* *** Resource management *** */
+
+#define ZSTDv05_frameHeaderSize_max 5   /* too magical, should come from reference */
+struct ZBUFFv05_DCtx_s {
+    ZSTDv05_DCtx* zc;            /* decoder context, created by ZBUFFv05_createDCtx() and driven by ZBUFFv05_decompressContinue() */
+    ZSTDv05_parameters params;   /* filled by ZSTDv05_getFrameParams(); windowLog determines the size of outBuff */
+    char* inBuff;                /* staging area where input is accumulated until the decoder's requested amount is available */
+    size_t inBuffSize;           /* size recorded for inBuff */
+    size_t inPos;                /* number of bytes currently held in inBuff */
+    char* outBuff;               /* buffer the decoder writes into before data is flushed to the caller */
+    size_t outBuffSize;          /* size recorded for outBuff */
+    size_t outStart;             /* outBuff offset of the first decoded byte not yet flushed */
+    size_t outEnd;               /* outBuff offset just past the last decoded byte */
+    size_t hPos;                 /* number of frame-header bytes held in headerBuffer */
+    ZBUFFv05_dStage stage;       /* current state of the streaming state machine */
+    unsigned char headerBuffer[ZSTDv05_frameHeaderSize_max];   /* collects a frame header that arrives split across calls */
+};   /* typedef'd to ZBUFFv05_DCtx within "zstd_buffered.h" */
+
+
+ZBUFFv05_DCtx* ZBUFFv05_createDCtx(void)
+{   /* Allocates a zero-filled streaming context together with its decoder; returns NULL when either allocation fails. */
+    ZBUFFv05_DCtx* zbc = (ZBUFFv05_DCtx*)calloc(1, sizeof(ZBUFFv05_DCtx));
+    if (zbc==NULL) return NULL;
+    zbc->zc = ZSTDv05_createDCtx();
+    if (zbc->zc==NULL) { free(zbc); return NULL; }   /* a context without a decoder is unusable : do not hand it out */
+    zbc->stage = ZBUFFv05ds_init;
+    return zbc;
+}
+
+size_t ZBUFFv05_freeDCtx(ZBUFFv05_DCtx* zbc)
+{   /* Releases the decoder, both staging buffers and the context itself; always returns 0. */
+    if (zbc != NULL) {   /* freeing a NULL context is a supported no-op */
+        ZSTDv05_freeDCtx(zbc->zc);
+        free(zbc->inBuff);
+        free(zbc->outBuff);
+        free(zbc);
+    }
+    return 0;
+}
+
+/* *** Initialization *** */
+/* Prepares `zbc` for a new frame : rewinds every buffer cursor, expects a frame header next, then restarts the decoder with `dict`. */
+size_t ZBUFFv05_decompressInitDictionary(ZBUFFv05_DCtx* zbc, const void* dict, size_t dictSize)
+{
+    zbc->outEnd = zbc->outStart = zbc->inPos = zbc->hPos = 0;   /* nothing buffered, nothing pending */
+    zbc->stage = ZBUFFv05ds_readHeader;
+    return ZSTDv05_decompressBegin_usingDict(zbc->zc, dict, dictSize);
+}
+
+size_t ZBUFFv05_decompressInit(ZBUFFv05_DCtx* zbc)
+{   /* Starts a new decompression without a dictionary : forwards to ZBUFFv05_decompressInitDictionary(zbc, NULL, 0). */
+    return ZBUFFv05_decompressInitDictionary(zbc, NULL, 0);
+}
+
+
+/* *** Decompression *** */
+
+size_t ZBUFFv05_decompressContinue(ZBUFFv05_DCtx* zbc, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr)
+{   /* Streaming step : reads from src, writes decoded bytes to dst; on normal exit *srcSizePtr / *maxDstSizePtr receive the bytes read / written. Returns an input-size hint for the next call, or an error code. */
+    const char* const istart = (const char*)src;
+    const char* ip = istart;
+    const char* const iend = istart + *srcSizePtr;
+    char* const ostart = (char*)dst;
+    char* op = ostart;
+    char* const oend = ostart + *maxDstSizePtr;
+    U32 notDone = 1;
+    /* state machine : loops on zbc->stage until the frame ends, input runs out, or output cannot be fully flushed */
+    while (notDone) {
+        switch(zbc->stage)
+        {
+        case ZBUFFv05ds_init :
+            return ERROR(init_missing);
+
+        case ZBUFFv05ds_readHeader :
+            /* read header from src */
+            {
+                size_t headerSize = ZSTDv05_getFrameParams(&(zbc->params), src, *srcSizePtr);
+                if (ZSTDv05_isError(headerSize)) return headerSize;
+                if (headerSize) {
+                    /* not enough input to decode header : tell how many bytes would be necessary */
+                    memcpy(zbc->headerBuffer+zbc->hPos, src, *srcSizePtr);
+                    zbc->hPos += *srcSizePtr;
+                    *maxDstSizePtr = 0;
+                    zbc->stage = ZBUFFv05ds_loadHeader;
+                    return headerSize - zbc->hPos;
+                }
+                zbc->stage = ZBUFFv05ds_decodeHeader;
+                break;
+            }
+            /* not reached : both paths above leave the switch */
+        case ZBUFFv05ds_loadHeader:
+            /* complete header from src */
+            {
+                size_t headerSize = ZBUFFv05_limitCopy(
+                    zbc->headerBuffer + zbc->hPos, ZSTDv05_frameHeaderSize_max - zbc->hPos,
+                    src, *srcSizePtr);
+                zbc->hPos += headerSize;
+                ip += headerSize;
+                headerSize = ZSTDv05_getFrameParams(&(zbc->params), zbc->headerBuffer, zbc->hPos);
+                if (ZSTDv05_isError(headerSize)) return headerSize;
+                if (headerSize) {
+                    /* not enough input to decode header : tell how many bytes would be necessary */
+                    *maxDstSizePtr = 0;
+                    return headerSize - zbc->hPos;
+                }
+                /* no need to set stage to ZBUFFv05ds_decodeHeader : that case follows directly */
+            }
+            /* fall-through */
+        case ZBUFFv05ds_decodeHeader:
+                /* apply header to create / resize buffers */
+                {
+                    size_t neededOutSize = (size_t)1 << zbc->params.windowLog;
+                    size_t neededInSize = BLOCKSIZE;   /* a block is never > BLOCKSIZE */
+                    if (zbc->inBuffSize < neededInSize) {
+                        free(zbc->inBuff);
+                        zbc->inBuff = (char*)malloc(neededInSize);
+                        if (zbc->inBuff == NULL) { zbc->inBuffSize = 0; return ERROR(memory_allocation); }   /* keep size in sync with the (absent) buffer */
+                        zbc->inBuffSize = neededInSize;   /* recorded only once the buffer really exists */
+                    }
+                    if (zbc->outBuffSize < neededOutSize) {
+                        free(zbc->outBuff);
+                        zbc->outBuff = (char*)malloc(neededOutSize);
+                        if (zbc->outBuff == NULL) { zbc->outBuffSize = 0; return ERROR(memory_allocation); }   /* keep size in sync with the (absent) buffer */
+                        zbc->outBuffSize = neededOutSize;   /* recorded only once the buffer really exists */
+                }   }
+                if (zbc->hPos) {
+                    /* some data already loaded into headerBuffer : transfer into inBuff */
+                    memcpy(zbc->inBuff, zbc->headerBuffer, zbc->hPos);
+                    zbc->inPos = zbc->hPos;
+                    zbc->hPos = 0;
+                    zbc->stage = ZBUFFv05ds_load;
+                    break;
+                }
+                zbc->stage = ZBUFFv05ds_read;
+                /* fall-through */
+        case ZBUFFv05ds_read:
+            {
+                size_t neededInSize = ZSTDv05_nextSrcSizeToDecompress(zbc->zc);
+                if (neededInSize==0) {  /* end of frame */
+                    zbc->stage = ZBUFFv05ds_init;
+                    notDone = 0;
+                    break;
+                }
+                if ((size_t)(iend-ip) >= neededInSize) {
+                    /* directly decode from src */
+                    size_t decodedSize = ZSTDv05_decompressContinue(zbc->zc,
+                        zbc->outBuff + zbc->outStart, zbc->outBuffSize - zbc->outStart,
+                        ip, neededInSize);
+                    if (ZSTDv05_isError(decodedSize)) return decodedSize;
+                    ip += neededInSize;
+                    if (!decodedSize) break;   /* this was just a header */
+                    zbc->outEnd = zbc->outStart +  decodedSize;
+                    zbc->stage = ZBUFFv05ds_flush;
+                    break;
+                }
+                if (ip==iend) { notDone = 0; break; }   /* no more input */
+                zbc->stage = ZBUFFv05ds_load;
+            }
+            /* fall-through */
+        case ZBUFFv05ds_load:
+            {
+                size_t neededInSize = ZSTDv05_nextSrcSizeToDecompress(zbc->zc);
+                size_t toLoad = neededInSize - zbc->inPos;   /* should always be <= remaining space within inBuff */
+                size_t loadedSize;
+                if (toLoad > zbc->inBuffSize - zbc->inPos) return ERROR(corruption_detected);   /* should never happen */
+                loadedSize = ZBUFFv05_limitCopy(zbc->inBuff + zbc->inPos, toLoad, ip, iend-ip);
+                ip += loadedSize;
+                zbc->inPos += loadedSize;
+                if (loadedSize < toLoad) { notDone = 0; break; }   /* not enough input, wait for more */
+                {
+                    size_t decodedSize = ZSTDv05_decompressContinue(zbc->zc,
+                        zbc->outBuff + zbc->outStart, zbc->outBuffSize - zbc->outStart,
+                        zbc->inBuff, neededInSize);
+                    if (ZSTDv05_isError(decodedSize)) return decodedSize;
+                    zbc->inPos = 0;   /* input is consumed */
+                    if (!decodedSize) { zbc->stage = ZBUFFv05ds_read; break; }   /* this was just a header */
+                    zbc->outEnd = zbc->outStart +  decodedSize;
+                    zbc->stage = ZBUFFv05ds_flush;
+                    /* no break : ZBUFFv05ds_flush follows */
+                }
+            }
+            /* fall-through */
+        case ZBUFFv05ds_flush:
+            {
+                size_t toFlushSize = zbc->outEnd - zbc->outStart;
+                size_t flushedSize = ZBUFFv05_limitCopy(op, oend-op, zbc->outBuff + zbc->outStart, toFlushSize);
+                op += flushedSize;
+                zbc->outStart += flushedSize;
+                if (flushedSize == toFlushSize) {
+                    zbc->stage = ZBUFFv05ds_read;
+                    if (zbc->outStart + BLOCKSIZE > zbc->outBuffSize)
+                        zbc->outStart = zbc->outEnd = 0;
+                    break;
+                }
+                /* cannot flush everything */
+                notDone = 0;
+                break;
+            }
+        default: return ERROR(GENERIC);   /* impossible */
+    }   }
+
+    *srcSizePtr = ip-istart;
+    *maxDstSizePtr = op-ostart;
+
+    {   size_t nextSrcSizeHint = ZSTDv05_nextSrcSizeToDecompress(zbc->zc);
+        if (nextSrcSizeHint > ZBUFFv05_blockHeaderSize) nextSrcSizeHint+= ZBUFFv05_blockHeaderSize;   /* get next block header too */
+        nextSrcSizeHint -= zbc->inPos;   /* already loaded*/
+        return nextSrcSizeHint;
+    }
+}
+
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+unsigned ZBUFFv05_isError(size_t errorCode) { return ERR_isError(errorCode); }
+const char* ZBUFFv05_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
+/* Recommended buffer sizes : BLOCKSIZE plus one block header on the input side, BLOCKSIZE on the output side. */
+size_t ZBUFFv05_recommendedDInSize(void)  { return BLOCKSIZE + ZBUFFv05_blockHeaderSize /* block header size*/ ; }
+size_t ZBUFFv05_recommendedDOutSize(void) { return BLOCKSIZE; }
diff --git a/deps/SZ/zstd/legacy/zstd_v05.h b/deps/SZ/zstd/legacy/zstd_v05.h
new file mode 100644
index 0000000000000000000000000000000000000000..b68fd578ee9f08fb7d9c69d8cb640a02cf8bca76
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v05.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv05_H
+#define ZSTDv05_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stddef.h>   /* size_t */
+#include "mem.h"      /* U64, U32 */
+
+
+/* *************************************
+*  Simple functions
+***************************************/
+/*! ZSTDv05_decompress() :
+    `compressedSize` : is the _exact_ size of the compressed blob, otherwise decompression will fail.
+    `dstCapacity` must be large enough, equal or larger than originalSize.
+    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+              or an errorCode if it fails (which can be tested using ZSTDv05_isError()) */
+size_t ZSTDv05_decompress( void* dst, size_t dstCapacity,
+                     const void* src, size_t compressedSize);
+
+/**
+ZSTDv05_findFrameCompressedSize() : get the source length of a ZSTD frame
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv05_isError())
+*/
+size_t ZSTDv05_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+/* *************************************
+*  Helper functions
+***************************************/
+/* Error Management */
+unsigned    ZSTDv05_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+const char* ZSTDv05_getErrorName(size_t code);     /*!< provides readable string for an error code */
+
+
+/* *************************************
+*  Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv05_DCtx_s ZSTDv05_DCtx;
+ZSTDv05_DCtx* ZSTDv05_createDCtx(void);
+size_t ZSTDv05_freeDCtx(ZSTDv05_DCtx* dctx);      /*!< @return : errorCode */
+
+/** ZSTDv05_decompressDCtx() :
+*   Same as ZSTDv05_decompress(), but requires an already allocated ZSTDv05_DCtx (see ZSTDv05_createDCtx()) */
+size_t ZSTDv05_decompressDCtx(ZSTDv05_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-***********************
+*  Simple Dictionary API
+*************************/
+/*! ZSTDv05_decompress_usingDict() :
+*   Decompression using a pre-defined Dictionary content (see dictBuilder).
+*   Dictionary must be identical to the one used during compression, otherwise regenerated data will be corrupted.
+*   Note : dict can be NULL, in which case, it's equivalent to ZSTDv05_decompressDCtx() */
+size_t ZSTDv05_decompress_usingDict(ZSTDv05_DCtx* dctx,
+                                            void* dst, size_t dstCapacity,
+                                      const void* src, size_t srcSize,
+                                      const void* dict,size_t dictSize);
+
+/*-************************
+*  Advanced Streaming API
+***************************/
+typedef enum { ZSTDv05_fast, ZSTDv05_greedy, ZSTDv05_lazy, ZSTDv05_lazy2, ZSTDv05_btlazy2, ZSTDv05_opt, ZSTDv05_btopt } ZSTDv05_strategy;
+typedef struct {
+    U64 srcSize;
+    U32 windowLog;     /* the only useful information to retrieve */
+    U32 contentLog; U32 hashLog; U32 searchLog; U32 searchLength; U32 targetLength; ZSTDv05_strategy strategy;
+} ZSTDv05_parameters;
+size_t ZSTDv05_getFrameParams(ZSTDv05_parameters* params, const void* src, size_t srcSize);
+
+size_t ZSTDv05_decompressBegin_usingDict(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize);
+void   ZSTDv05_copyDCtx(ZSTDv05_DCtx* dstDCtx, const ZSTDv05_DCtx* srcDCtx);
+size_t ZSTDv05_nextSrcSizeToDecompress(ZSTDv05_DCtx* dctx);
+size_t ZSTDv05_decompressContinue(ZSTDv05_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-***********************
+*  ZBUFF API
+*************************/
+typedef struct ZBUFFv05_DCtx_s ZBUFFv05_DCtx;
+ZBUFFv05_DCtx* ZBUFFv05_createDCtx(void);
+size_t         ZBUFFv05_freeDCtx(ZBUFFv05_DCtx* dctx);
+
+size_t ZBUFFv05_decompressInit(ZBUFFv05_DCtx* dctx);
+size_t ZBUFFv05_decompressInitDictionary(ZBUFFv05_DCtx* dctx, const void* dict, size_t dictSize);
+
+size_t ZBUFFv05_decompressContinue(ZBUFFv05_DCtx* dctx,
+                                            void* dst, size_t* dstCapacityPtr,
+                                      const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+*  Streaming decompression
+*
+*  A ZBUFFv05_DCtx object is required to track streaming operations.
+*  Use ZBUFFv05_createDCtx() and ZBUFFv05_freeDCtx() to create/release resources.
+*  Use ZBUFFv05_decompressInit() to start a new decompression operation,
+*   or ZBUFFv05_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFFv05_DCtx objects can be reused multiple times.
+*
+*  Use ZBUFFv05_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of @dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change @dst.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency)
+*            or 0 when a frame is completely decoded
+*            or an error code, which can be tested using ZBUFFv05_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFFv05_recommendedDInSize() / ZBUFFv05_recommendedDOutSize()
+*  output : ZBUFFv05_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFFv05_recommendedDInSize==128 KB + 3; just follow indications from ZBUFFv05_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+unsigned ZBUFFv05_isError(size_t errorCode);
+const char* ZBUFFv05_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are just hints, and tend to offer better latency */
+size_t ZBUFFv05_recommendedDInSize(void);
+size_t ZBUFFv05_recommendedDOutSize(void);
+
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define ZSTDv05_MAGICNUMBER 0xFD2FB525   /* v0.5 */
+
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv05_H */
diff --git a/deps/SZ/zstd/legacy/zstd_v06.c b/deps/SZ/zstd/legacy/zstd_v06.c
new file mode 100644
index 0000000000000000000000000000000000000000..8b068b3e546408ea8ff588952aa7bd1fce4be501
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v06.c
@@ -0,0 +1,4124 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/*- Dependencies -*/
+#include "zstd_v06.h"
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include <string.h>    /* memcpy */
+#include <stdlib.h>    /* malloc, free, qsort */
+#include "error_private.h"
+
+
+
+/* ******************************************************************
+   mem.h
+   low-level memory access routines
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+#if defined(__GNUC__)
+#  define MEM_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+#if  !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/*-**************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violate C standard.
+ *            It can generate buggy code on targets depending on alignment.
+ *            In some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+/* Width probes : each returns 1 when sizeof(size_t) equals the stated number of bytes (4 or 8), 0 otherwise. */
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{   /* Returns the first stored byte of the U32 value 1 : 1 on a little-endian host, 0 on a big-endian one. */
+    const union { U32 word; BYTE bytes[4]; } probe = { 1 };   /* don't use static : performance detrimental  */
+    return probe.bytes[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* violates C standard, by lying on structure alignment.
+Only use if no other choice to achieve best performance on target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+/* store counterpart of MEM_read16() : same direct dereference of the caller's pointer */
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign;
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+/* store counterpart of MEM_read16() : writes through the same packed `unalign` union */
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{   /* Loads 16 bits in host byte order through memcpy(). */
+    U16 loaded; memcpy(&loaded, memPtr, sizeof(U16)); return loaded;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{   /* Loads 32 bits in host byte order through memcpy(). */
+    U32 loaded; memcpy(&loaded, memPtr, sizeof(U32)); return loaded;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{   /* Loads 64 bits in host byte order through memcpy(). */
+    U64 loaded; memcpy(&loaded, memPtr, sizeof(U64)); return loaded;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{   /* Stores 16 bits in host byte order through memcpy(). */
+    memcpy(memPtr, &value, sizeof(U16));
+}
+
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+MEM_STATIC U32 MEM_swap32(U32 in)
+{   /* Returns `in` with its four bytes in reverse order; uses an intrinsic on MSVC and on compilers reporting GCC >= 4.3. */
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_ulong(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+    return __builtin_bswap32(in);
+#else
+    return  ((in << 24) & 0xff000000 ) |
+            ((in <<  8) & 0x00ff0000 ) |
+            ((in >>  8) & 0x0000ff00 ) |
+            ((in >> 24) & 0x000000ff );
+#endif
+}
+
+MEM_STATIC U64 MEM_swap64(U64 in)
+{   /* Returns `in` with its eight bytes in reverse order; uses an intrinsic on MSVC and on compilers reporting GCC >= 4.3. */
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_uint64(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+    return __builtin_bswap64(in);
+#else
+    return  ((in << 56) & 0xff00000000000000ULL) |
+            ((in << 40) & 0x00ff000000000000ULL) |
+            ((in << 24) & 0x0000ff0000000000ULL) |
+            ((in << 8)  & 0x000000ff00000000ULL) |
+            ((in >> 8)  & 0x00000000ff000000ULL) |
+            ((in >> 24) & 0x0000000000ff0000ULL) |
+            ((in >> 40) & 0x000000000000ff00ULL) |
+            ((in >> 56) & 0x00000000000000ffULL);
+#endif
+}
+
+
+/*=== Little endian r/w ===*/
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (!MEM_isLittleEndian()) {
+        /* host order differs : assemble the value byte by byte, low byte first */
+        const BYTE* const bytes = (const BYTE*)memPtr;
+        return (U16)((bytes[1]<<8) + bytes[0]);
+    }
+    return MEM_read16(memPtr);   /* host order already matches the stored order */
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    if (!MEM_isLittleEndian()) {
+        BYTE* const out = (BYTE*)memPtr;
+        out[0] = (BYTE)val;          /* low byte goes first */
+        out[1] = (BYTE)(val>>8);     /* high byte follows */
+        return;
+    }
+    MEM_write16(memPtr, val);        /* host order already matches the stored order */
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    /* a byte swap is applied only when MEM_isLittleEndian() reports false */
+    if (!MEM_isLittleEndian())
+        return MEM_swap32(MEM_read32(memPtr));
+    return MEM_read32(memPtr);
+}
+
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    /* a byte swap is applied only when MEM_isLittleEndian() reports false */
+    if (!MEM_isLittleEndian())
+        return MEM_swap64(MEM_read64(memPtr));
+    return MEM_read64(memPtr);
+}
+
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    /* little-endian read of a size_t : the width (32 or 64 bits) is chosen by MEM_32bits() */
+    if (!MEM_32bits())
+        return (size_t)MEM_readLE64(memPtr);
+    return (size_t)MEM_readLE32(memPtr);
+}
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+
+/*
+    zstd - standard compression library
+    Header File for static linking only
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net
+*/
+#ifndef ZSTDv06_STATIC_H
+#define ZSTDv06_STATIC_H
+
+/* The prototypes defined within this file are considered experimental.
+ * They should not be used in the context of a DLL, as they may change in the future.
+ * Prefer static linking if you need them, to keep control over breaking changes between versions.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/*- Advanced Decompression functions -*/
+
+/*! ZSTDv06_decompress_usingPreparedDCtx() :
+*   Same as ZSTDv06_decompress_usingDict(), but using a reference context `preparedDCtx` into which the dictionary has already been loaded.
+*   This avoids reloading the dictionary on every call.
+*   `preparedDCtx` must have been properly initialized using ZSTDv06_decompressBegin_usingDict().
+*   Requires 2 contexts : one for reference (`preparedDCtx`, passed as const, which will not be modified) and one to run the decompression operation (`dctx`) */
+ZSTDLIBv06_API size_t ZSTDv06_decompress_usingPreparedDCtx(
+                                           ZSTDv06_DCtx* dctx, const ZSTDv06_DCtx* preparedDCtx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize);
+
+
+
+#define ZSTDv06_FRAMEHEADERSIZE_MAX 13    /* a #define so it can size static arrays */
+static const size_t ZSTDv06_frameHeaderSize_min = 5;   /* lower bound of the variable-size frame header, in bytes */
+static const size_t ZSTDv06_frameHeaderSize_max = ZSTDv06_FRAMEHEADERSIZE_MAX;   /* upper bound of the frame header, in bytes */
+
+ZSTDLIBv06_API size_t ZSTDv06_decompressBegin(ZSTDv06_DCtx* dctx);   /* starts a decompression on `dctx`; NOTE(review): result is presumably testable with ZSTDv06_isError() - confirm */
+
+/*
+  Streaming decompression, direct mode (bufferless)
+
+  A ZSTDv06_DCtx object is required to track streaming operations.
+  Use ZSTDv06_createDCtx() / ZSTDv06_freeDCtx() to manage it.
+  A ZSTDv06_DCtx object can be re-used multiple times.
+
+  First optional operation is to retrieve frame parameters, using ZSTDv06_getFrameParams(), which doesn't consume the input.
+  It can provide the minimum size of rolling buffer required to properly decompress data,
+  and optionally the final size of uncompressed content.
+  (Note : content size is an optional info that may not be present. 0 means : content size unknown)
+  Frame parameters are extracted from the beginning of compressed frame.
+  The amount of data to read is variable, from ZSTDv06_frameHeaderSize_min to ZSTDv06_frameHeaderSize_max (so if `srcSize` >= ZSTDv06_frameHeaderSize_max, it will always work)
+  If `srcSize` is too small for operation to succeed, function will return the minimum size it requires to produce a result.
+  Result : 0 when successful, it means the ZSTDv06_frameParams structure has been filled.
+          >0 : means there is not enough data into `src`. Provides the expected size to successfully decode header.
+           errorCode, which can be tested using ZSTDv06_isError()
+
+  Start decompression, with ZSTDv06_decompressBegin() or ZSTDv06_decompressBegin_usingDict().
+  Alternatively, you can copy a prepared context, using ZSTDv06_copyDCtx().
+
+  Then use ZSTDv06_nextSrcSizeToDecompress() and ZSTDv06_decompressContinue() alternately.
+  ZSTDv06_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTDv06_decompressContinue().
+  ZSTDv06_decompressContinue() requires this exact amount of bytes, or it will fail.
+  ZSTDv06_decompressContinue() needs previous data blocks during decompression, up to (1 << windowlog).
+  They should preferably be located contiguously, prior to current block. Alternatively, a round buffer is also possible.
+
+  @result of ZSTDv06_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity)
+  It can be zero, which is not an error; it just means ZSTDv06_decompressContinue() has decoded some header.
+
+  A frame is fully decoded when ZSTDv06_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+*/
+
+
+/* **************************************
+*  Block functions
+****************************************/
+/*! Block functions produce and decode raw zstd blocks, without frame metadata.
+    The user is in charge of keeping the information required to regenerate data, such as compressed and content sizes.
+
+    A few rules to respect :
+    - Uncompressed block size must be <= ZSTDv06_BLOCKSIZE_MAX (128 KB)
+    - Compressing or decompressing requires a context structure
+      + Use ZSTDv06_createCCtx() and ZSTDv06_createDCtx()
+    - It is necessary to init context before starting
+      + compression : ZSTDv06_compressBegin()
+      + decompression : ZSTDv06_decompressBegin()
+      + variants _usingDict() are also allowed
+      + copyCCtx() and copyDCtx() work too
+    - When a block is considered not compressible enough, ZSTDv06_compressBlock() result will be zero.
+      In which case, nothing is produced into `dst`.
+      + User must test for such outcome and deal directly with uncompressed data
+      + ZSTDv06_decompressBlock() doesn't accept uncompressed data as input !!
+*/
+
+#define ZSTDv06_BLOCKSIZE_MAX (128 * 1024)   /* largest uncompressed block size (128 KB); a #define so it can size static arrays */
+ZSTDLIBv06_API size_t ZSTDv06_decompressBlock(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv06_STATIC_H */
+/*
+    zstd_internal - common functions to include
+    Header File for include
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : https://www.zstd.net
+*/
+#ifndef ZSTDv06_CCOMMON_H_MODULE
+#define ZSTDv06_CCOMMON_H_MODULE
+
+
+/*-*************************************
+*  Common macros
+***************************************/
+#define MIN(a,b) ((a)<(b) ? (a) : (b))   /* function-like macro : the selected argument is evaluated twice */
+#define MAX(a,b) ((a)>(b) ? (a) : (b))   /* same caveat : do not pass expressions with side effects */
+
+
+/*-*************************************
+*  Common constants
+***************************************/
+#define ZSTDv06_DICT_MAGIC  0xEC30A436   /* NOTE(review): presumably the magic number that opens a dictionary - confirm against the dictionary loader */
+
+#define ZSTDv06_REP_NUM    3
+#define ZSTDv06_REP_INIT   ZSTDv06_REP_NUM   /* sizes the rep[] array of ZSTDv06_optimal_t */
+#define ZSTDv06_REP_MOVE   (ZSTDv06_REP_NUM-1)
+
+#define KB *(1 <<10)   /* postfix unit macros : `128 KB` expands to `128 *(1 <<10)` */
+#define MB *(1 <<20)
+#define GB *(1U<<30)   /* note the unsigned constant, unlike KB and MB */
+
+#define BIT7 128   /* BITn == (1 << n) */
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define ZSTDv06_WINDOWLOG_ABSOLUTEMIN 12
+static const size_t ZSTDv06_fcs_fieldSize[4] = { 0, 1, 2, 8 };   /* NOTE(review): presumably the byte size of the frame-content-size field per 2-bit selector - confirm */
+
+#define ZSTDv06_BLOCKHEADERSIZE 3   /* because C standard does not allow a static const value to be defined using another static const value .... :( */
+static const size_t ZSTDv06_blockHeaderSize = ZSTDv06_BLOCKHEADERSIZE;
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;   /* enumerators take the values 0..3 in this order */
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+
+#define HufLog 12
+
+#define IS_HUF 0
+#define IS_PCH 1
+#define IS_RAW 2
+#define IS_RLE 3
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+#define EQUAL_READ32 4
+#define REPCODE_STARTVALUE 1
+
+#define Litbits  8
+#define MaxLit ((1<<Litbits) - 1)   /* == 255 */
+#define MaxML  52   /* ML_bits[] and ML_defaultNorm[] have MaxML+1 entries */
+#define MaxLL  35   /* LL_bits[] and LL_defaultNorm[] have MaxLL+1 entries */
+#define MaxOff 28   /* OF_defaultNorm[] has MaxOff+1 entries */
+#define MaxSeq MAX(MaxLL, MaxML)   /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog    9
+#define LLFSELog    9
+#define OffFSELog   8
+
+#define FSEv06_ENCODING_RAW     0
+#define FSEv06_ENCODING_RLE     1
+#define FSEv06_ENCODING_STATIC  2
+#define FSEv06_ENCODING_DYNAMIC 3
+
+static const U32 LL_bits[MaxLL+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* NOTE(review): presumably extra bits read per literal-length code - confirm */
+                                      1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9,10,11,12,
+                                     13,14,15,16 };
+static const S16 LL_defaultNorm[MaxLL+1] = { 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
+                                             2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
+                                            -1,-1,-1,-1 };
+static const U32 LL_defaultNormLog = 6;   /* the magnitudes in LL_defaultNorm[] sum to 64 == 1<<6 */
+
+static const U32 ML_bits[MaxML+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* NOTE(review): presumably extra bits read per match-length code - confirm */
+                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9,10,11,
+                                     12,13,14,15,16 };
+static const S16 ML_defaultNorm[MaxML+1] = { 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,
+                                            -1,-1,-1,-1,-1 };
+static const U32 ML_defaultNormLog = 6;   /* the magnitudes in ML_defaultNorm[] sum to 64 == 1<<6 */
+
+static const S16 OF_defaultNorm[MaxOff+1] = { 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+                                              1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1 };
+static const U32 OF_defaultNormLog = 5;   /* the magnitudes in OF_defaultNorm[] sum to 32 == 1<<5 */
+
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+static void ZSTDv06_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }   /* copies exactly 8 bytes from src to dst */
+#define COPY8(d,s) { ZSTDv06_copy8(d,s); d+=8; s+=8; }   /* copies 8 bytes, then advances both d and s by 8; d and s must be modifiable pointers */
+
+/*! ZSTDv06_wildcopy() :
+*   memcpy() variant that always moves whole 8-byte chunks : can write up to 7 bytes past `length` (8 bytes if length==0) */
+#define WILDCOPY_OVERLENGTH 8
+MEM_STATIC void ZSTDv06_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* from = (const BYTE*)src;
+    BYTE* to = (BYTE*)dst;
+    BYTE* const stop = to + length;
+    do {   /* the body runs at least once, hence the 8-byte write even when length==0 */
+        ZSTDv06_copy8(to, from); to += 8; from += 8;
+    } while (to < stop);
+}
+
+
+
+/*-*******************************************
+*  Private interfaces
+*********************************************/
+typedef struct {
+    U32 off;
+    U32 len;
+} ZSTDv06_match_t;   /* element type of seqStore_t.matchTable */
+
+typedef struct {
+    U32 price;
+    U32 off;
+    U32 mlen;
+    U32 litlen;
+    U32 rep[ZSTDv06_REP_INIT];   /* ZSTDv06_REP_INIT == ZSTDv06_REP_NUM == 3 entries */
+} ZSTDv06_optimal_t;   /* element type of seqStore_t.priceTable */
+
+typedef struct { U32  unused; } ZSTDv06_stats_t;   /* placeholder : its only member is named `unused` */
+
+typedef struct {
+    void* buffer;
+    U32*  offsetStart;   /* NOTE(review): each xxxStart / xxx pair looks like an array base plus a current position - confirm */
+    U32*  offset;
+    BYTE* offCodeStart;
+    BYTE* litStart;
+    BYTE* lit;
+    U16*  litLengthStart;
+    U16*  litLength;
+    BYTE* llCodeStart;
+    U16*  matchLengthStart;
+    U16*  matchLength;
+    BYTE* mlCodeStart;
+    U32   longLengthID;   /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
+    U32   longLengthPos;
+    /* opt : NOTE(review) the fields below presumably serve the optimal parser - confirm */
+    ZSTDv06_optimal_t* priceTable;
+    ZSTDv06_match_t* matchTable;
+    U32* matchLengthFreq;
+    U32* litLengthFreq;
+    U32* litFreq;
+    U32* offCodeFreq;
+    U32  matchLengthSum;
+    U32  matchSum;
+    U32  litLengthSum;
+    U32  litSum;
+    U32  offCodeSum;
+    U32  log2matchLengthSum;
+    U32  log2matchSum;
+    U32  log2litLengthSum;
+    U32  log2litSum;
+    U32  log2offCodeSum;
+    U32  factor;
+    U32  cachedPrice;
+    U32  cachedLitLength;
+    const BYTE* cachedLiterals;
+    ZSTDv06_stats_t stats;   /* placeholder type : holds a single unused U32 */
+} seqStore_t;
+
+void ZSTDv06_seqToCodes(const seqStore_t* seqStorePtr, size_t const nbSeq);   /* declaration only : no definition appears in this header */
+
+
+#endif   /* ZSTDv06_CCOMMON_H_MODULE */
+/* ******************************************************************
+   FSE : Finite State Entropy codec
+   Public Prototypes declaration
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef FSEv06_H
+#define FSEv06_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/*-****************************************
+*  FSE simple functions
+******************************************/
+/*! FSEv06_decompress():
+    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstCapacity'.
+    @return : size of regenerated data (<= dstCapacity),
+              or an error code, which can be tested using FSEv06_isError() .
+
+    ** Important ** : FSEv06_decompress() does not decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+size_t FSEv06_decompress(void* dst,  size_t dstCapacity,
+                const void* cSrc, size_t cSrcSize);
+
+
+/*-*****************************************
+*  Tool functions
+******************************************/
+size_t FSEv06_compressBound(size_t size);       /* maximum compressed size for an input of `size` bytes */
+
+/* Error Management */
+unsigned    FSEv06_isError(size_t code);        /* tells if a return value is an error code (non-zero when it is) */
+const char* FSEv06_getErrorName(size_t code);   /* provides the error code as a string (useful for debugging) */
+
+
+
+/*-*****************************************
+*  FSE detailed API
+******************************************/
+/*!
+
+FSEv06_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+
+/* *** DECOMPRESSION *** */
+
+/*! FSEv06_readNCount():
+    Read compactly saved 'normalizedCounter' from 'rBuffer' (of size 'rBuffSize').
+    @return : size read from 'rBuffer',
+              or an errorCode, which can be tested using FSEv06_isError().
+              maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+size_t FSEv06_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
+
+/*! Constructor and Destructor of FSEv06_DTable.
+    Note that the size of the table depends on 'tableLog' */
+typedef unsigned FSEv06_DTable;   /* do not allocate an FSEv06_DTable directly : the typedef only exists to be more restrictive than void* */
+FSEv06_DTable* FSEv06_createDTable(unsigned tableLog);
+void        FSEv06_freeDTable(FSEv06_DTable* dt);
+
+/*! FSEv06_buildDTable():
+    Builds 'dt', which must already have been allocated using FSEv06_createDTable().
+    @return : 0, or an errorCode, which can be tested using FSEv06_isError() */
+size_t FSEv06_buildDTable (FSEv06_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSEv06_decompress_usingDTable():
+    Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
+    into `dst`, which must already be allocated.
+    @return : size of regenerated data (necessarily <= `dstCapacity`),
+              or an errorCode, which can be tested using FSEv06_isError() */
+size_t FSEv06_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSEv06_DTable* dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSEv06_readNCount() if it was saved using FSEv06_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSEv06_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSEv06_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBuffSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSEv06_isError().
+
+The next step is to build the decompression tables 'FSEv06_DTable' from 'normalizedCounter'.
+This is performed by the function FSEv06_buildDTable().
+The space required by 'FSEv06_DTable' must be already allocated using FSEv06_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSEv06_isError().
+
+`FSEv06_DTable` can then be used to decompress `cSrc`, with FSEv06_decompress_usingDTable().
+`cSrcSize` must be strictly correct, otherwise decompression will fail.
+FSEv06_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`).
+If there is an error, the function will return an error code, which can be tested using FSEv06_isError(). (ex: dst buffer too small)
+*/
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSEv06_H */
+/* ******************************************************************
+   bitstream
+   Part of FSE library
+   header file (to include)
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+*  This API consists of small unitary functions, which must be inlined for best performance.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+
+/*=========================================
+*  Target specific
+=========================================*/
+#if defined(__BMI__) && defined(__GNUC__)
+#  include <immintrin.h>   /* support for bextr (experimental) */
+#endif
+
+
+
+/*-********************************************
+*  bitStream decoding API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;   /* bits currently loaded; refilled from `ptr` with MEM_readLEST() on reload */
+    unsigned bitsConsumed;   /* number of bits of bitContainer already used */
+    const char* ptr;         /* address the container is loaded from; only ever moves toward `start` */
+    const char* start;       /* first byte of the source buffer */
+} BITv06_DStream_t;
+
+typedef enum { BITv06_DStream_unfinished = 0,    /* container reloaded without having to clamp ptr to start */
+               BITv06_DStream_endOfBuffer = 1,   /* ptr is at (or was clamped to) start, and unread bits remain */
+               BITv06_DStream_completed = 2,     /* ptr is at start and every bit of the container is consumed */
+               BITv06_DStream_overflow = 3 } BITv06_DStream_status;  /* overflow : more bits consumed than the container holds. Result of BITv06_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BITv06_initDStream(BITv06_DStream_t* bitD, const void* srcBuffer, size_t srcSize);   /* @return srcSize, or an error code */
+MEM_STATIC size_t   BITv06_readBits(BITv06_DStream_t* bitD, unsigned nbBits);                            /* returns the next nbBits bits and consumes them */
+MEM_STATIC BITv06_DStream_status BITv06_reloadDStream(BITv06_DStream_t* bitD);                           /* refills the container from the source buffer */
+MEM_STATIC unsigned BITv06_endOfDStream(const BITv06_DStream_t* bitD);                                   /* non-zero once every bit has been consumed */
+
+
+
+/*-****************************************
+*  unsafe API
+******************************************/
+MEM_STATIC size_t BITv06_readBitsFast(BITv06_DStream_t* bitD, unsigned nbBits);
+/* faster than BITv06_readBits(), but only valid if nbBits >= 1 */
+
+
+
+/*-**************************************************************
+*  Internal functions
+****************************************************************/
+MEM_STATIC unsigned BITv06_highbit32 ( U32 val)
+{   /* index (0..31) of the highest set bit; callers are expected to pass val != 0 (the GCC intrinsic is undefined for 0) */
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long msbIndex=0;
+    _BitScanReverse ( &msbIndex, val );
+    return (unsigned) msbIndex;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+    return 31 - __builtin_clz (val);
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 smeared = val;
+    /* propagate the highest set bit into every lower position */
+    smeared |= smeared >> 1;
+    smeared |= smeared >> 2;
+    smeared |= smeared >> 4;
+    smeared |= smeared >> 8;
+    smeared |= smeared >> 16;
+    /* multiply-and-shift turns the smeared pattern into a 5-bit table index */
+    return DeBruijnClz[ (U32) (smeared * 0x07C4ACDDU) >> 27];
+#   endif
+}
+
+
+
+/*-********************************************************
+* bitStream decoding
+**********************************************************/
+/*! BITv06_initDStream() :
+*   Initialize a BITv06_DStream_t so that reading starts from the last byte of `srcBuffer` (the stream is read backward).
+*   `bitD` : a pointer to an already allocated BITv06_DStream_t structure.
+*   `srcSize` must be the *exact* size of the bitStream, in bytes; the last byte must be non-zero (it carries the end mark).
+*   @return : size of stream (== srcSize), or ERROR(srcSize_wrong) when srcSize==0, or ERROR(GENERIC) when the end mark is missing
+*/
+MEM_STATIC size_t BITv06_initDStream(BITv06_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }   /* empty input : clear bitD and fail */
+
+    if (srcSize >=  sizeof(bitD->bitContainer)) {  /* normal case : a full container can be loaded from the end of the buffer */
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          if (lastByte == 0) return ERROR(GENERIC);   /* endMark not present */
+          bitD->bitsConsumed = 8 - BITv06_highbit32(lastByte); }   /* the end mark and the zero bits above it count as consumed */
+    } else {   /* fewer bytes than the container holds : assemble the container byte by byte */
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);/* fall-through */
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);/* fall-through */
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);/* fall-through */
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; /* fall-through */
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; /* fall-through */
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) <<  8; /* fall-through */
+            default: break;   /* srcSize == 1 : byte 0 was already loaded above */
+        }
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          if (lastByte == 0) return ERROR(GENERIC);   /* endMark not present */
+          bitD->bitsConsumed = 8 - BITv06_highbit32(lastByte); }
+        bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;   /* the container bytes that were not filled count as consumed too */
+    }
+
+    return srcSize;
+}
+
+
+ MEM_STATIC size_t BITv06_lookBits(const BITv06_DStream_t* bitD, U32 nbBits)
+{   U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+    size_t const pending = bitD->bitContainer << (bitD->bitsConsumed & regMask);   /* consumed bits shifted out at the top */
+    return (pending >> 1) >> ((regMask-nbBits) & regMask);   /* split shift : nbBits==0 stays well-defined and yields 0 */
+}
+
+/*! BITv06_lookBitsFast() :
+*   unsafe variant of BITv06_lookBits() using a single right shift; only valid if nbBits >= 1 */
+MEM_STATIC size_t BITv06_lookBitsFast(const BITv06_DStream_t* bitD, U32 nbBits)
+{   U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+    size_t const pending = bitD->bitContainer << (bitD->bitsConsumed & regMask);   /* consumed bits shifted out at the top */
+    return pending >> (((regMask+1)-nbBits) & regMask);
+}
+
+MEM_STATIC void BITv06_skipBits(BITv06_DStream_t* bitD, U32 nbBits)
+{   /* mark nbBits more bits as consumed; the container itself is left untouched */
+    bitD->bitsConsumed = bitD->bitsConsumed + nbBits;
+}
+
+MEM_STATIC size_t BITv06_readBits(BITv06_DStream_t* bitD, U32 nbBits)
+{   /* peek at the next nbBits bits, then consume them */
+    size_t const bits = BITv06_lookBits(bitD, nbBits);
+    bitD->bitsConsumed += nbBits;   /* same effect as BITv06_skipBits(bitD, nbBits) */
+    return bits;
+}
+
+/*! BITv06_readBitsFast() :
+*   unsafe variant of BITv06_readBits(); only valid if nbBits >= 1 */
+MEM_STATIC size_t BITv06_readBitsFast(BITv06_DStream_t* bitD, U32 nbBits)
+{
+    size_t const bits = BITv06_lookBitsFast(bitD, nbBits);
+    bitD->bitsConsumed += nbBits;   /* same effect as BITv06_skipBits(bitD, nbBits) */
+    return bits;
+}
+
+MEM_STATIC BITv06_DStream_status BITv06_reloadDStream(BITv06_DStream_t* bitD)
+{   /* refill bitContainer by moving ptr back over the whole bytes already consumed; the status tells how close ptr is to start */
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BITv06_DStream_overflow;
+
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) {   /* ptr can step back by up to a full container without passing start */
+        bitD->ptr -= bitD->bitsConsumed >> 3;   /* step back by the number of whole bytes consumed */
+        bitD->bitsConsumed &= 7;                /* keep only the leftover bits of the partially consumed byte */
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BITv06_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start) {   /* already at the first byte : nothing left to load */
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BITv06_DStream_endOfBuffer;
+        return BITv06_DStream_completed;
+    }
+    {   U32 nbBytes = bitD->bitsConsumed >> 3;   /* close to start : step back, but never before start */
+        BITv06_DStream_status result = BITv06_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start) {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BITv06_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : ptr > start is only reached when srcSize >= sizeof(bitD->bitContainer), so a full container can be read here */
+        return result;
+    }
+}
+
+/*! BITv06_endOfDStream() :
+*   @return 1 only when ptr is back at `start` and every bit of the container has been consumed, 0 otherwise. */
+MEM_STATIC unsigned BITv06_endOfDStream(const BITv06_DStream_t* DStream)
+{
+    if (DStream->ptr != DStream->start) return 0;
+    return (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8);
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef FSEv06_STATIC_H
+#define FSEv06_STATIC_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* FSE buffer bounds (macro arguments are parenthesized so that expression arguments expand with the intended precedence) */
+#define FSEv06_NCOUNTBOUND 512
+#define FSEv06_BLOCKBOUND(size) ((size) + ((size)>>7))
+#define FSEv06_COMPRESSBOUND(size) (FSEv06_NCOUNTBOUND + FSEv06_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate FSE CTable/DTable as a table of unsigned using below macros */
+#define FSEv06_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<(maxTableLog)))
+
+
+/* *****************************************
+*  FSE advanced API
+*******************************************/
+size_t FSEv06_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+/* same as FSEv06_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr  */
+
+size_t FSEv06_buildDTable_raw (FSEv06_DTable* dt, unsigned nbBits);
+/* build a fake FSEv06_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits */
+
+size_t FSEv06_buildDTable_rle (FSEv06_DTable* dt, unsigned char symbolValue);
+/* build a fake FSEv06_DTable, designed to always generate the same symbolValue */
+
+
+/* *****************************************
+*  FSE symbol decompression API
+*******************************************/
+typedef struct
+{   /* Decoder state for one FSE stream. */
+    size_t      state;   /* index of the current cell in the decoding table (used as table[state] by the decode helpers) */
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSEv06_DState_t;
+
+
+static void     FSEv06_initDState(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD, const FSEv06_DTable* dt);
+
+static unsigned char FSEv06_decodeSymbol(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD);
+
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSEv06_decodeSymbolFast(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+
+
+/* ======    Decompression    ====== */
+
+typedef struct {
+    U16 tableLog;   /* number of bits FSEv06_initDState() reads to obtain the initial state */
+    U16 fastMode;   /* non-zero makes FSEv06_decompress_usingDTable() pick the FSEv06_decodeSymbolFast() path */
+} FSEv06_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;   /* base of the next state; the bits read from the stream are added to it */
+    unsigned char  symbol;     /* symbol produced when the decoder sits on this cell */
+    unsigned char  nbBits;     /* number of bits to read from the bitstream for the transition */
+} FSEv06_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSEv06_initDState(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD, const FSEv06_DTable* dt)
+{   /* Seed the state with tableLog bits from the stream, refill the stream, and point the state at the cells following the header. */
+    const FSEv06_DTableHeader* const header = (const FSEv06_DTableHeader*)(const void*)dt;
+
+    DStatePtr->state = BITv06_readBits(bitD, header->tableLog);
+    (void)BITv06_reloadDStream(bitD);   /* status intentionally ignored, as in the original */
+    DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSEv06_peekSymbol(const FSEv06_DState_t* DStatePtr)
+{   /* Report the symbol of the current cell without touching the state or the bitstream. */
+    const FSEv06_decode_t* const cells = (const FSEv06_decode_t*)(DStatePtr->table);
+    return cells[DStatePtr->state].symbol;
+}
+
+MEM_STATIC void FSEv06_updateState(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD)
+{   /* Move to the next state without producing a symbol. */
+    const FSEv06_decode_t* const cells = (const FSEv06_decode_t*)(DStatePtr->table);
+    FSEv06_decode_t const cell = cells[DStatePtr->state];
+    size_t const extra = BITv06_readBits(bitD, (U32)cell.nbBits);
+    DStatePtr->state = cell.newState + extra;
+}
+
+MEM_STATIC BYTE FSEv06_decodeSymbol(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD)
+{   /* Return the current cell's symbol and advance the state using that cell's nbBits fresh bits. */
+    const FSEv06_decode_t* const cells = (const FSEv06_decode_t*)(DStatePtr->table);
+    FSEv06_decode_t const cell = cells[DStatePtr->state];
+    size_t const extra = BITv06_readBits(bitD, (U32)cell.nbBits);
+
+    DStatePtr->state = cell.newState + extra;
+
+    return cell.symbol;
+}
+
+/*! FSEv06_decodeSymbolFast() :
+    unsafe, only works if no symbol has a probability > 50% */
+MEM_STATIC BYTE FSEv06_decodeSymbolFast(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD)
+{   /* Same steps as FSEv06_decodeSymbol(), but reads the bits with BITv06_readBitsFast(). */
+    const FSEv06_decode_t* const cells = (const FSEv06_decode_t*)(DStatePtr->table);
+    FSEv06_decode_t const cell = cells[DStatePtr->state];
+    size_t const extra = BITv06_readBitsFast(bitD, (U32)cell.nbBits);
+
+    DStatePtr->state = cell.newState + extra;
+
+    return cell.symbol;
+}
+
+
+
+#ifndef FSEv06_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#define FSEv06_MAX_MEMORY_USAGE 14   /* FSEv06_MAX_TABLELOG is derived as this value minus 2 */
+#define FSEv06_DEFAULT_MEMORY_USAGE 13   /* FSEv06_DEFAULT_TABLELOG is derived as this value minus 2 */
+
+/*!FSEv06_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#define FSEv06_MAX_SYMBOL_VALUE 255   /* sizes the symbolNext[] and counting[] stack arrays */
+
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSEv06_FUNCTION_TYPE BYTE   /* symbol type stored into the decoding table by FSEv06_buildDTable() */
+#define FSEv06_FUNCTION_EXTENSION
+#define FSEv06_DECODE_TYPE FSEv06_decode_t   /* cell type of the decoding table */
+
+
+#endif   /* !FSEv06_COMMONDEFS_ONLY */
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSEv06_MAX_TABLELOG  (FSEv06_MAX_MEMORY_USAGE-2)   /* largest tableLog accepted by FSEv06_buildDTable() */
+#define FSEv06_MAX_TABLESIZE (1U<<FSEv06_MAX_TABLELOG)
+#define FSEv06_MAXTABLESIZE_MASK (FSEv06_MAX_TABLESIZE-1)
+#define FSEv06_DEFAULT_TABLELOG (FSEv06_DEFAULT_MEMORY_USAGE-2)
+#define FSEv06_MIN_TABLELOG 5   /* added to the 4-bit header field by FSEv06_readNCount() */
+
+#define FSEv06_TABLELOG_ABSOLUTE_MAX 15   /* hard ceiling checked by FSEv06_readNCount() and FSEv06_createDTable() */
+#if FSEv06_MAX_TABLELOG > FSEv06_TABLELOG_ABSOLUTE_MAX
+#error "FSEv06_MAX_TABLELOG > FSEv06_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSEv06_TABLESTEP(tableSize) (((tableSize)>>1) + ((tableSize)>>3) + 3)   /* argument parenthesized so expression arguments expand safely */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSEv06_STATIC_H */
+/*
+   Common functions of New Generation Entropy library
+   Copyright (C) 2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+*************************************************************************** */
+
+
+/*-****************************************
+*  FSE Error Management
+******************************************/
+unsigned FSEv06_isError(size_t code) { return ERR_isError(code); }   /* non-static wrapper : forwards `code` to ERR_isError() */
+
+const char* FSEv06_getErrorName(size_t code) { return ERR_getErrorName(code); }   /* non-static wrapper : forwards `code` to ERR_getErrorName() */
+
+
+/* **************************************************************
+*  HUF Error Management
+****************************************************************/
+unsigned HUFv06_isError(size_t code) { return ERR_isError(code); }   /* HUF-named wrapper around the same ERR_isError() test */
+
+const char* HUFv06_getErrorName(size_t code) { return ERR_getErrorName(code); }   /* HUF-named wrapper around the same ERR_getErrorName() lookup */
+
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+static short FSEv06_abs(short a) { return (a >= 0) ? a : -a; }   /* magnitude of a normalized count */
+
+size_t FSEv06_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{   /* Parse a normalized-count header : fills normalizedCounter[], *tableLogPtr and *maxSVPtr (in : capacity, out : last symbol read); returns bytes read or an error code. */
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);   /* the parser always reads 32 bits at a time */
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSEv06_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSEv06_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;   /* table cells still to distribute, plus one : the loop stops when it reaches 1 */
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr)) {
+        if (previous0) {   /* the previous count was 0 : a run length of further zero-count symbols follows */
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF) {   /* each all-ones 16-bit group extends the run by 24 symbols */
+                n0+=24;
+                if (ip < iend-5) {
+                    ip+=2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                } else {   /* too close to the end to move ip : consume from the bits already loaded */
+                    bitStream >>= 16;
+                    bitCount+=16;
+            }   }
+            while ((bitStream & 3) == 3) {   /* each 2-bit value of 3 extends the run by 3 symbols */
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            n0 += bitStream & 3;   /* final 2-bit value (0..2) closes the run */
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {   /* advance only if a 32-bit read stays inside the buffer */
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;
+        }
+        {   short const max = (short)((2*threshold-1)-remaining);   /* values below max are coded on nbBits-1 bits */
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max) {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            } else {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSEv06_abs(count);   /* a stored count of -1 still uses up one unit of `remaining` */
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            while (remaining < threshold) {   /* fewer cells left : the next counts need fewer bits */
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {   /* pin ip to the last 4 bytes and express the position through bitCount instead */
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+    }   }   /* while ((remaining>1) && (charnum<=*maxSVPtr)) */
+    if (remaining != 1) return ERROR(GENERIC);   /* the counts did not add up exactly to the table size */
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;   /* round the consumed bits up to whole bytes */
+    if ((size_t)(ip-istart) > hbSize) return ERROR(srcSize_wrong);
+    return ip-istart;
+}
+/* ******************************************************************
+   FSE : Finite State Entropy decoder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else   /* other compilers : choose the strongest inlining form the language level offers */
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static   /* neither C++ nor C99 : fall back to plain static */
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSEv06_isError ERR_isError   /* from this point on, FSEv06_isError(...) expands directly to ERR_isError(...) */
+#define FSEv06_STATIC_ASSERT(c) { enum { FSEv06_static_assert = 1/(int)(!!(c)) }; }   /* compile-time check : a false `c` gives a division by zero in a constant expression; use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Complex types
+****************************************************************/
+typedef U32 DTable_max_t[FSEv06_DTABLE_SIZE_U32(FSEv06_MAX_TABLELOG)];   /* table storage sized for FSEv06_MAX_TABLELOG; declared on the stack by FSEv06_decompress() */
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks : both template parameters must be defined before this point */
+#ifndef FSEv06_FUNCTION_EXTENSION
+#  error "FSEv06_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSEv06_FUNCTION_TYPE
+#  error "FSEv06_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names : the extra macro level lets X and Y be macro-expanded before ## pastes them */
+#define FSEv06_CAT(X,Y) X##Y
+#define FSEv06_FUNCTION_NAME(X,Y) FSEv06_CAT(X,Y)
+#define FSEv06_TYPE_NAME(X,Y) FSEv06_CAT(X,Y)
+
+
+/* Function templates */
+FSEv06_DTable* FSEv06_createDTable (unsigned tableLog)
+{   /* Allocate a table for tableLog, capped at FSEv06_TABLELOG_ABSOLUTE_MAX; returns whatever malloc() returns. */
+    unsigned const cappedLog = (tableLog > FSEv06_TABLELOG_ABSOLUTE_MAX) ? FSEv06_TABLELOG_ABSOLUTE_MAX : tableLog;
+    return (FSEv06_DTable*)malloc( FSEv06_DTABLE_SIZE_U32(cappedLog) * sizeof (U32) );
+}
+
+void FSEv06_freeDTable (FSEv06_DTable* dt)
+{   /* Release `dt` with free(); pairs with the malloc() done in FSEv06_createDTable(). */
+    free(dt);
+}
+
+size_t FSEv06_buildDTable(FSEv06_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{   /* Fill `dt` (header followed by 1<<tableLog cells) from the normalized counts; returns 0 on success or an error code. */
+    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
+    FSEv06_DECODE_TYPE* const tableDecode = (FSEv06_DECODE_TYPE*) (tdPtr);
+    U16 symbolNext[FSEv06_MAX_SYMBOL_VALUE+1];
+
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+    U32 highThreshold = tableSize-1;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSEv06_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSEv06_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    {   FSEv06_DTableHeader DTableH;
+        DTableH.tableLog = (U16)tableLog;
+        DTableH.fastMode = 1;   /* cleared below as soon as one count reaches half the table */
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {   /* count -1 : the symbol gets exactly one cell, taken from the top of the table */
+                    tableDecode[highThreshold--].symbol = (FSEv06_FUNCTION_TYPE)s;
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                    symbolNext[s] = normalizedCounter[s];
+        }   }   }
+        memcpy(dt, &DTableH, sizeof(DTableH));   /* the header occupies the first element of dt */
+    }
+
+    /* Spread symbols */
+    {   U32 const tableMask = tableSize-1;
+        U32 const step = FSEv06_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {   /* one cell per unit of count; counts <= 0 place nothing here */
+                tableDecode[position].symbol = (FSEv06_FUNCTION_TYPE)s;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+
+        if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            FSEv06_FUNCTION_TYPE const symbol = (FSEv06_FUNCTION_TYPE)(tableDecode[u].symbol);
+            U16 nextState = symbolNext[symbol]++;   /* per-symbol counter : each cell of a symbol receives the next value */
+            tableDecode[u].nbBits = (BYTE) (tableLog - BITv06_highbit32 ((U32)nextState) );
+            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+    }   }
+
+    return 0;
+}
+
+
+
+#ifndef FSEv06_COMMONDEFS_ONLY
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+size_t FSEv06_buildDTable_rle (FSEv06_DTable* dt, BYTE symbolValue)
+{   /* Build a one-cell table (tableLog 0) whose only state always yields symbolValue and reads no bits; returns 0. */
+    FSEv06_DTableHeader* const header = (FSEv06_DTableHeader*)(void*)dt;
+    FSEv06_decode_t* const onlyCell = (FSEv06_decode_t*)(void*)(dt + 1);
+
+    /* header : zero-sized log, non-fast decoding path */
+    header->tableLog = 0;
+    header->fastMode = 0;
+
+    /* single cell : stay in state 0, consume nothing */
+    onlyCell->newState = 0;
+    onlyCell->symbol = symbolValue;
+    onlyCell->nbBits = 0;
+
+    return 0;
+}
+
+
+size_t FSEv06_buildDTable_raw (FSEv06_DTable* dt, unsigned nbBits)
+{   /* Build an identity table : every state s yields symbol (BYTE)s and reads nbBits bits; returns 0 or an error code. */
+    void* ptr = dt;
+    FSEv06_DTableHeader* const DTableH = (FSEv06_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSEv06_decode_t* const dinfo = (FSEv06_decode_t*)dPtr;
+    const unsigned tableSize = (nbBits <= FSEv06_TABLELOG_ABSOLUTE_MAX) ? (1U << nbBits) : 0;   /* never shift by an unchecked amount */
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSV1 = tableMask+1;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+    if (nbBits > FSEv06_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);   /* beyond the library's absolute limit : refuse instead of overrunning dt */
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;   /* every cell reads nbBits >= 1 bits, which is what the fast reader requires */
+    for (s=0; s<maxSV1; s++) {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+FORCE_INLINE size_t FSEv06_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSEv06_DTable* dt, const unsigned fast)
+{   /* Decode cSrc into dst using two interleaved FSE states; returns the number of bytes written or an error code. */
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;   /* the 4-symbol loop below only runs while op < olimit */
+
+    BITv06_DStream_t bitD;
+    FSEv06_DState_t state1;
+    FSEv06_DState_t state2;
+
+    /* Init */
+    { size_t const errorCode = BITv06_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+      if (FSEv06_isError(errorCode)) return errorCode; }
+
+    FSEv06_initDState(&state1, &bitD, dt);
+    FSEv06_initDState(&state2, &bitD, dt);
+
+#define FSEv06_GETSYMBOL(statePtr) fast ? FSEv06_decodeSymbolFast(statePtr, &bitD) : FSEv06_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BITv06_reloadDStream(&bitD)==BITv06_DStream_unfinished) && (op<olimit) ; op+=4) {
+        op[0] = FSEv06_GETSYMBOL(&state1);
+
+        if (FSEv06_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BITv06_reloadDStream(&bitD);
+
+        op[1] = FSEv06_GETSYMBOL(&state2);
+
+        if (FSEv06_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BITv06_reloadDStream(&bitD) > BITv06_DStream_unfinished) { op+=2; break; } }   /* only 2 symbols were written this round */
+
+        op[2] = FSEv06_GETSYMBOL(&state1);
+
+        if (FSEv06_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BITv06_reloadDStream(&bitD);
+
+        op[3] = FSEv06_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BITv06_reloadDStream(&bitD) >= FSEv06_DStream_partiallyFilled; Ends at exactly BITv06_DStream_completed */   /* NOTE(review): no status named *_partiallyFilled is visible in this code - confirm the intended name */
+    while (1) {   /* one symbol at a time, alternating state1 / state2 */
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);   /* up to 2 more bytes may be written before the next check */
+
+        *op++ = FSEv06_GETSYMBOL(&state1);
+
+        if (BITv06_reloadDStream(&bitD)==BITv06_DStream_overflow) {   /* overflow status : emit the other state's symbol and stop */
+            *op++ = FSEv06_GETSYMBOL(&state2);
+            break;
+        }
+
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+
+        *op++ = FSEv06_GETSYMBOL(&state2);
+
+        if (BITv06_reloadDStream(&bitD)==BITv06_DStream_overflow) {
+            *op++ = FSEv06_GETSYMBOL(&state1);
+            break;
+    }   }
+
+    return op-ostart;
+}
+
+
+size_t FSEv06_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSEv06_DTable* dt)
+{   /* Dispatch on the header's fastMode flag; `fast` is passed as a literal in each call (static selection). */
+    const FSEv06_DTableHeader* const header = (const FSEv06_DTableHeader*)(const void*)dt;
+
+    if (header->fastMode == 0) {
+        return FSEv06_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+    } else {
+        return FSEv06_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    }
+}
+
+
+size_t FSEv06_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{   /* One-shot FSE decoding : read the normalized counts, build the table on the stack, then decode the payload. */
+    const BYTE* payload = (const BYTE*)cSrc;
+    short normCounts[FSEv06_MAX_SYMBOL_VALUE+1];
+    DTable_max_t table;   /* filled by FSEv06_buildDTable() before it is read */
+    unsigned maxSymbol = FSEv06_MAX_SYMBOL_VALUE;
+    unsigned tblLog;
+    size_t headerSize;
+    size_t buildStatus;
+
+    if (cSrcSize<2) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* normal FSE decoding mode : header first */
+    headerSize = FSEv06_readNCount (normCounts, &maxSymbol, &tblLog, payload, cSrcSize);
+    if (FSEv06_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size */
+    payload += headerSize;
+    cSrcSize -= headerSize;
+
+    buildStatus = FSEv06_buildDTable (table, normCounts, maxSymbol, tblLog);
+    if (FSEv06_isError(buildStatus)) return buildStatus;
+
+    return FSEv06_decompress_usingDTable (dst, maxDstSize, payload, cSrcSize, table);   /* always return, even if it is an error code */
+}
+
+
+
+#endif   /* FSEv06_COMMONDEFS_ONLY */
+/* ******************************************************************
+   Huffman coder, part of New Generation Entropy library
+   header file
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef HUFv06_H
+#define HUFv06_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  HUF simple functions
+******************************************/
+size_t HUFv06_decompress(void* dst,  size_t dstSize,
+                const void* cSrc, size_t cSrcSize);
+/*
+HUFv06_decompress() :
+    Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstSize'.
+    `dstSize` : must be the **exact** size of original (uncompressed) data.
+    Note : in contrast with FSE, HUFv06_decompress can regenerate
+           RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+           because it knows size to regenerate.
+    @return : size of regenerated data (== dstSize)
+              or an error code, which can be tested using HUFv06_isError()
+*/
+
+
+/* ****************************************
+*  Tool functions
+******************************************/
+size_t HUFv06_compressBound(size_t size);       /**< maximum compressed size */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* HUFv06_H */
+/* ******************************************************************
+   Huffman codec, part of New Generation Entropy library
+   header file, for static linking only
+   Copyright (C) 2013-2016, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef HUFv06_STATIC_H
+#define HUFv06_STATIC_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  Static allocation
+******************************************/
+/* HUF buffer bounds (macro arguments are parenthesized so that expression arguments expand with the intended precedence) */
+#define HUFv06_CTABLEBOUND 129
+#define HUFv06_BLOCKBOUND(size) ((size) + ((size)>>8) + 8)   /* only true if incompressible pre-filtered with fast heuristic */
+#define HUFv06_COMPRESSBOUND(size) (HUFv06_CTABLEBOUND + HUFv06_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* static allocation of HUF's DTable : each macro declares an array whose first element is initialized to maxTableLog */
+#define HUFv06_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))
+#define HUFv06_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        unsigned short DTable[HUFv06_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUFv06_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        unsigned int DTable[HUFv06_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUFv06_CREATE_STATIC_DTABLEX6(DTable, maxTableLog) \
+        unsigned int DTable[HUFv06_DTABLE_SIZE(maxTableLog) * 3 / 2] = { maxTableLog }
+
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+size_t HUFv06_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUFv06_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbols decoder */
+
+
+
+/*!
+HUFv06_decompress() does the following:
+1. selects the decompression algorithm (X2, X4, X6) based on pre-computed heuristics
+2. builds the Huffman decoding table from its serialized description, using HUFv06_readDTableXn()
+3. decodes 1 or 4 segments in parallel using HUFv06_decompressSXn_usingDTable(), where S is the number of streams (1 or 4)
+*/
+size_t HUFv06_readDTableX2 (unsigned short* DTable, const void* src, size_t srcSize);
+size_t HUFv06_readDTableX4 (unsigned* DTable, const void* src, size_t srcSize);
+
+size_t HUFv06_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
+size_t HUFv06_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+
+
+/* single stream variants */
+size_t HUFv06_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUFv06_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
+
+size_t HUFv06_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
+size_t HUFv06_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+
+
+
+/* **************************************************************
+*  Constants
+****************************************************************/
+#define HUFv06_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUFv06_MAX_TABLELOG. Beyond that value, code does not work */
+#define HUFv06_MAX_TABLELOG  12           /* max configured tableLog (for static allocation); can be modified up to HUFv06_ABSOLUTEMAX_TABLELOG */
+#define HUFv06_DEFAULT_TABLELOG  HUFv06_MAX_TABLELOG   /* tableLog by default, when not specified */
+#define HUFv06_MAX_SYMBOL_VALUE 255   /* largest symbol value; the decoders size their weight buffers as HUFv06_MAX_SYMBOL_VALUE + 1 */
+#if (HUFv06_MAX_TABLELOG > HUFv06_ABSOLUTEMAX_TABLELOG)   /* compile-time guard on the configuration above */
+#  error "HUFv06_MAX_TABLELOG is too large !"
+#endif
+
+
+
+/*! HUFv06_readStats() :
+    Read compact Huffman tree, saved by HUFv06_writeCTable().
+    Fills `huffWeight` (capacity `hwSize`) with one weight per symbol, `rankStats` (HUFv06_ABSOLUTEMAX_TABLELOG+1 entries) with the number of symbols of each weight, plus `*nbSymbolsPtr` and `*tableLogPtr`.
+    @return : size read from `src`, or an error code (srcSize_wrong, corruption_detected, or the error returned by FSEv06_decompress())
+*/
+MEM_STATIC size_t HUFv06_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                            U32* nbSymbolsPtr, U32* tableLogPtr,
+                            const void* src, size_t srcSize)
+{
+    U32 weightTotal;   /* sum of 2^(weight-1) over the explicitly stored non-zero weights */
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;   /* header byte at first, then the number of input bytes that follow it */
+    size_t oSize;   /* number of explicitly stored weights (one more weight is implied) */
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    //memset(huffWeight, 0, hwSize);   /* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128)  { /* special header */
+        if (iSize >= (242)) {  /* RLE */
+            static U32 l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };   /* weight count selected by header bytes 242..255 */
+            oSize = l[iSize-242];
+            memset(huffWeight, 1, hwSize);   /* every weight is 1 */
+            iSize = 0;   /* no input byte follows the header byte */
+        }
+        else {   /* Incompressible */
+            oSize = iSize - 127;   /* header byte 128..241 => 1..114 weights */
+            iSize = ((oSize+1)/2);   /* weights are stored as 4-bit values, two per byte */
+            if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+            if (oSize >= hwSize) return ERROR(corruption_detected);   /* keeps index n+1 (below) and index oSize (implied weight) inside huffWeight */
+            ip += 1;
+            {   U32 n;
+                for (n=0; n<oSize; n+=2) {
+                    huffWeight[n]   = ip[n/2] >> 4;   /* high nibble first */
+                    huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }   }
+    else  {   /* header compressed with FSE (normal case) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSEv06_decompress(huffWeight, hwSize-1, ip+1, iSize);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSEv06_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUFv06_ABSOLUTEMAX_TABLELOG + 1) * sizeof(U32));
+    weightTotal = 0;
+    {   U32 n; for (n=0; n<oSize; n++) {
+            if (huffWeight[n] >= HUFv06_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);   /* also bounds the rankStats index and the shift below */
+            rankStats[huffWeight[n]]++;
+            weightTotal += (1 << huffWeight[n]) >> 1;   /* weight w adds 2^(w-1); weight 0 adds nothing */
+    }   }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    {   U32 const tableLog = BITv06_highbit32(weightTotal) + 1;
+        if (tableLog > HUFv06_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+        *tableLogPtr = tableLog;
+        /* determine last weight */
+        {   U32 const total = 1 << tableLog;
+            U32 const rest = total - weightTotal;   /* what the implied weight has to contribute to reach `total` */
+            U32 const verif = 1 << BITv06_highbit32(rest);
+            U32 const lastWeight = BITv06_highbit32(rest) + 1;
+            if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+            huffWeight[oSize] = (BYTE)lastWeight;   /* the implied weight is stored right after the explicit ones */
+            rankStats[lastWeight]++;
+    }   }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);   /* explicit weights + the implied one */
+    return iSize+1;   /* header byte + the bytes that followed it */
+}
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* HUFv06_STATIC_H */
+/* ******************************************************************
+   Huffman decoder, part of New Generation Entropy library
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+/* C++ and C99 (or later) provide `inline` natively : nothing to define */
+#elif defined(_MSC_VER)
+#  define inline __inline
+#else
+#  define inline /* disable inline */
+#endif
+
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUFv06_STATIC_ASSERT(c) { enum { HUFv06_static_assert = 1/(int)(!!(c)) }; }   /* compile-time check : a false `c` becomes a division by zero in a constant expression. Expands to a block, so use only *after* variable declarations */
+
+
+
+/* *******************************************************
+*  HUF : Huffman block decompression
+*********************************************************/
+typedef struct { BYTE byte; BYTE nbBits; } HUFv06_DEltX2;   /* single-symbol decoding cell : decoded byte + number of bits to skip */
+/* double-symbols cell : `sequence` carries 1 or 2 decoded bytes (`length` tells how many), `nbBits` is the number of bits to skip */
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUFv06_DEltX4;  /* double-symbols decoding */
+/* (symbol, weight) pair; HUFv06_readDTableX4() keeps an array of these ordered by weight */
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+
+
+
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
+
+size_t HUFv06_readDTableX2 (U16* DTable, const void* src, size_t srcSize)
+{   /* Builds the single-symbol decoding table described by the header at `src`. On entry DTable[0] is the largest table log the table may use; on success it is overwritten with the actual one. @return : header bytes read, or an error code */
+    BYTE huffWeight[HUFv06_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUFv06_ABSOLUTEMAX_TABLELOG + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    size_t iSize;
+    U32 nbSymbols = 0;
+    U32 n;
+    U32 nextRankStart;
+    void* const dtPtr = DTable + 1;   /* decoding cells start right after the DTable[0] slot */
+    HUFv06_DEltX2* const dt = (HUFv06_DEltX2*)dtPtr;
+
+    HUFv06_STATIC_ASSERT(sizeof(HUFv06_DEltX2) == sizeof(U16));   /* if compilation fails here, assertion is false */
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUFv06_readStats(huffWeight, HUFv06_MAX_SYMBOL_VALUE + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);   /* rankVal receives the number of symbols per weight */
+    if (HUFv06_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > DTable[0]) return ERROR(tableLog_tooLarge);   /* DTable is too small */
+    DTable[0] = (U16)tableLog;   /* maybe should separate sizeof allocated DTable, from used size of DTable, in case of re-use */
+
+    /* Prepare ranks : turn the per-weight symbol counts into the first table index of each weight */
+    nextRankStart = 0;
+    for (n=1; n<tableLog+1; n++) {
+        U32 current = nextRankStart;
+        nextRankStart += (rankVal[n] << (n-1));   /* each symbol of weight n occupies 2^(n-1) cells */
+        rankVal[n] = current;
+    }
+
+    /* fill DTable : walk symbols in increasing value, each one appended to the region of its weight */
+    for (n=0; n<nbSymbols; n++) {
+        const U32 w = huffWeight[n];
+        const U32 length = (1 << w) >> 1;   /* number of cells for this symbol; 0 when w == 0, so nothing is written */
+        U32 i;
+        HUFv06_DEltX2 D;
+        D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+        for (i = rankVal[w]; i < rankVal[w] + length; i++)
+            dt[i] = D;
+        rankVal[w] += length;   /* the next symbol of the same weight starts right after */
+    }
+
+    return iSize;
+}
+
+
+static BYTE HUFv06_decodeSymbolX2(BITv06_DStream_t* Dstream, const HUFv06_DEltX2* dt, const U32 dtLog)
+{   /* decodes one symbol : fetch the cell indexed by BITv06_lookBitsFast(Dstream, dtLog), skip that cell's nbBits, return its byte */
+    const size_t val = BITv06_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+    const BYTE c = dt[val].byte;
+    BITv06_skipBits(Dstream, dt[val].nbBits);
+    return c;
+}
+/* The three macros below must expand in a scope that provides `dt` and `dtLog`. _0 always decodes one symbol into *ptr++ */
+#define HUFv06_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUFv06_decodeSymbolX2(DStreamPtr, dt, dtLog)
+/* _1 decodes only when MEM_64bits() is true or HUFv06_MAX_TABLELOG <= 12. NOTE(review): expands to an unbraced `if`, never follow it with `else` */
+#define HUFv06_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUFv06_MAX_TABLELOG<=12)) \
+        HUFv06_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+/* _2 decodes only when MEM_64bits() is true. NOTE(review): expands to an unbraced `if`, never follow it with `else` */
+#define HUFv06_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUFv06_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+static inline size_t HUFv06_decodeStreamX2(BYTE* p, BITv06_DStream_t* const bitDPtr, BYTE* const pEnd, const HUFv06_DEltX2* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+    /* Decodes single symbols into [p, pEnd). Remaining room is measured as (pEnd - p), so no pointer located before a short buffer is ever formed. @return : pEnd - pStart */
+    /* up to 4 symbols at a time */
+    while ((BITv06_reloadDStream(bitDPtr) == BITv06_DStream_unfinished) && ((pEnd - p) >= 4)) {
+        HUFv06_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUFv06_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUFv06_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUFv06_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BITv06_reloadDStream(bitDPtr) == BITv06_DStream_unfinished) && (p < pEnd))
+        HUFv06_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, hence no need to reload */
+    while (p < pEnd)
+        HUFv06_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    return pEnd-pStart;
+}
+
+size_t HUFv06_decompress1X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    /* table layout : DTable[0] is the table log, the single-symbol cells follow it */
+    const void* const tableBase = DTable;
+    const HUFv06_DEltX2* const cells = ((const HUFv06_DEltX2*)tableBase) + 1;
+    const U32 tableLog = DTable[0];
+    BYTE* const outBegin = (BYTE*)dst;
+    BYTE* const outEnd = outBegin + dstSize;
+    BITv06_DStream_t stream;
+    size_t const initResult = BITv06_initDStream(&stream, cSrc, cSrcSize);
+
+    if (HUFv06_isError(initResult)) return initResult;
+    /* decode everything into [outBegin, outEnd) from the single bitstream */
+    HUFv06_decodeStreamX2(outBegin, &stream, outEnd, cells, tableLog);
+
+    /* anything other than a bitstream reported as finished is treated as corruption */
+    if (!BITv06_endOfDStream(&stream)) return ERROR(corruption_detected);
+    return dstSize;
+}
+
+size_t HUFv06_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* one-shot variant : the table description sits at the head of cSrc, the bitstream right after it */
+    HUFv06_CREATE_STATIC_DTABLEX2(DTable, HUFv06_MAX_TABLELOG);
+    size_t const headerSize = HUFv06_readDTableX2 (DTable, cSrc, cSrcSize);
+
+    if (HUFv06_isError(headerSize)) return headerSize;   /* forward table-reading errors as-is */
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing left to decode after the header */
+
+    {   const BYTE* const payload = (const BYTE*) cSrc + headerSize;
+        return HUFv06_decompress1X2_usingDTable (dst, dstSize, payload, cSrcSize - headerSize, DTable);
+    }
+}
+
+
+size_t HUFv06_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    /* Decodes 4 single-symbol streams : cSrc = 6-byte jump table (3 little-endian 16-bit stream sizes) + 4 bitstreams; streams 1-3 each fill (dstSize+3)/4 bytes of dst, stream 4 fills the rest. @return : dstSize, or an error code */
+    if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable;
+        const HUFv06_DEltX2* const dt = ((const HUFv06_DEltX2*)dtPtr) +1;   /* `dt` and `dtLog` are also picked up by name by the DECODE_SYMBOLX2 macros */
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BITv06_DStream_t bitD1;
+        BITv06_DStream_t bitD2;
+        BITv06_DStream_t bitD3;
+        BITv06_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;   /* not stored in the jump table : whatever remains of cSrc */
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        if (opStart4 > oend) return ERROR(corruption_detected);   /* 3 segments do not fit in dst (dstSize 1, 2 or 5) : streams 2-3 would write past oend */
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BITv06_initDStream(&bitD1, istart1, length1);
+        if (HUFv06_isError(errorCode)) return errorCode;
+        errorCode = BITv06_initDStream(&bitD2, istart2, length2);
+        if (HUFv06_isError(errorCode)) return errorCode;
+        errorCode = BITv06_initDStream(&bitD3, istart3, length3);
+        if (HUFv06_isError(errorCode)) return errorCode;
+        errorCode = BITv06_initDStream(&bitD4, istart4, length4);
+        if (HUFv06_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BITv06_reloadDStream(&bitD1) | BITv06_reloadDStream(&bitD2) | BITv06_reloadDStream(&bitD3) | BITv06_reloadDStream(&bitD4);
+        for ( ; (endSignal==BITv06_DStream_unfinished) && ((oend - op4) > 7) ; ) {   /* distance test : never forms a pointer before dst */
+            HUFv06_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUFv06_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUFv06_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUFv06_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX2_0(op4, &bitD4);
+            endSignal = BITv06_reloadDStream(&bitD1) | BITv06_reloadDStream(&bitD2) | BITv06_reloadDStream(&bitD3) | BITv06_reloadDStream(&bitD4);
+        }
+
+        /* check corruption : no stream may have run into the next segment */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUFv06_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUFv06_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUFv06_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUFv06_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check : all four bitstreams must be reported as finished */
+        endSignal = BITv06_endOfDStream(&bitD1) & BITv06_endOfDStream(&bitD2) & BITv06_endOfDStream(&bitD3) & BITv06_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+size_t HUFv06_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* one-shot variant : the table description sits at the head of cSrc, the 4-stream payload right after it */
+    HUFv06_CREATE_STATIC_DTABLEX2(DTable, HUFv06_MAX_TABLELOG);
+    size_t const headerSize = HUFv06_readDTableX2 (DTable, cSrc, cSrcSize);
+
+    if (HUFv06_isError(headerSize)) return headerSize;   /* forward table-reading errors as-is */
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing left to decode after the header */
+
+    {   const BYTE* const payload = (const BYTE*) cSrc + headerSize;
+        return HUFv06_decompress4X2_usingDTable (dst, dstSize, payload, cSrcSize - headerSize, DTable);
+    }
+}
+
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+
+static void HUFv06_fillDTableX4Level2(HUFv06_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{   /* Fills the DTable region that follows a first symbol `baseSeq` already decoded with `consumed` bits : cells get either that symbol alone (length 1) or the pair first+second symbol (length 2) */
+    HUFv06_DEltX4 DElt;
+    U32 rankVal[HUFv06_ABSOLUTEMAX_TABLELOG + 1];   /* private copy : it is advanced while filling */
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values : the first rankVal[minWeight] cells only carry the first symbol */
+    if (minWeight>1) {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    { U32 s; for (s=0; s<sortedListSize; s++) {   /* note : sortedSymbols already skipped */
+        const U32 symbol = sortedSymbols[s].symbol;
+        const U32 weight = sortedSymbols[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;   /* code length of the second symbol */
+        const U32 length = 1 << (sizeLog-nbBits);   /* number of cells sharing this pair */
+        const U32 start = rankVal[weight];
+        U32 i = start;
+        const U32 end = start + length;
+
+        MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));   /* first symbol in the low byte, second in the high byte */
+        DElt.nbBits = (BYTE)(nbBits + consumed);   /* bits of both symbols together */
+        DElt.length = 2;
+        do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+        rankVal[weight] += length;   /* next symbol of this weight starts right after */
+    }}
+}
+
+typedef U32 rankVal_t[HUFv06_ABSOLUTEMAX_TABLELOG][HUFv06_ABSOLUTEMAX_TABLELOG + 1];
+/* rankVal_t : rows are indexed by a number of already-consumed bits, columns by weight (see HUFv06_readDTableX4()) */
+static void HUFv06_fillDTableX4(HUFv06_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{   /* Fills the whole double-symbol table : each sorted symbol gets its region, completed with second symbols when enough bits remain */
+    U32 rankVal[HUFv06_ABSOLUTEMAX_TABLELOG + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;   /* code length of the heaviest weight present */
+    U32 s;
+
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));   /* copies one row's worth : row 0 of rankValOrigin */
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++) {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);   /* number of cells owned by this symbol */
+
+        if (targetLog-nbBits >= minBits) {   /* enough room for a second symbol */
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];   /* second symbols lighter than minWeight are skipped */
+            HUFv06_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        } else {   /* no room : every cell of the region carries this symbol alone */
+            HUFv06_DEltX4 DElt;
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits = (BYTE)(nbBits);
+            DElt.length = 1;
+            {   U32 u;
+                const U32 end = start + length;
+                for (u = start; u < end; u++) DTable[u] = DElt;
+        }   }
+        rankVal[weight] += length;
+    }
+}
+
+size_t HUFv06_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
+{   /* Builds the double-symbol decoding table described by the header at `src`. DTable[0] gives the table log to build with and is left unchanged. @return : header bytes read, or an error code */
+    BYTE weightList[HUFv06_MAX_SYMBOL_VALUE + 1];
+    sortedSymbol_t sortedSymbol[HUFv06_MAX_SYMBOL_VALUE + 1];
+    U32 rankStats[HUFv06_ABSOLUTEMAX_TABLELOG + 1] = { 0 };
+    U32 rankStart0[HUFv06_ABSOLUTEMAX_TABLELOG + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;   /* rankStart[w] aliases rankStart0[w+1] */
+    rankVal_t rankVal;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    const U32 memLog = DTable[0];
+    size_t iSize;
+    void* dtPtr = DTable;
+    HUFv06_DEltX4* const dt = ((HUFv06_DEltX4*)dtPtr) + 1;   /* decoding cells start right after the DTable[0] slot */
+
+    HUFv06_STATIC_ASSERT(sizeof(HUFv06_DEltX4) == sizeof(U32));   /* if compilation fails here, assertion is false */
+    if (memLog > HUFv06_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUFv06_readStats(weightList, HUFv06_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUFv06_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+
+    /* Get start index of each weight */
+    {   U32 w, nextRankStart = 0;
+        for (w=1; w<maxW+1; w++) {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;   /* number of symbols with a non-zero weight */
+    }
+
+    /* sort symbols by weight */
+    {   U32 s;
+        for (s=0; s<nbSymbols; s++) {
+            U32 const w = weightList[s];
+            U32 const r = rankStart[w]++;   /* after the loop rankStart[w] is the first index of weight w+1, i.e. rankStart0[w+1] is the start of weight w+1 */
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {   U32* const rankVal0 = rankVal[0];
+        {   int const rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
+            U32 nextRankVal = 0;
+            U32 w;
+            for (w=1; w<maxW+1; w++) {
+                U32 current = nextRankVal;
+                nextRankVal += rankStats[w] << (w+rescale);   /* same as rankStats[w] * 2^(w-1), scaled from tableLog up to memLog */
+                rankVal0[w] = current;
+        }   }
+        {   U32 const minBits = tableLog+1 - maxW;
+            U32 consumed;
+            for (consumed = minBits; consumed < memLog - minBits + 1; consumed++) {
+                U32* const rankValPtr = rankVal[consumed];   /* row used once `consumed` bits went to a first symbol */
+                U32 w;
+                for (w = 1; w < maxW+1; w++) {
+                    rankValPtr[w] = rankVal0[w] >> consumed;
+    }   }   }   }
+
+    HUFv06_fillDTableX4(dt, memLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    return iSize;
+}
+
+
+static U32 HUFv06_decodeSymbolX4(void* op, BITv06_DStream_t* DStream, const HUFv06_DEltX4* dt, const U32 dtLog)
+{   /* always copies the first 2 bytes of the selected cell (its `sequence`) to `op`; @return : the cell's `length`, which the DECODE_SYMBOLX4 macros add to the output pointer */
+    const size_t val = BITv06_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 2);
+    BITv06_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;
+}
+/* Variant for the last output byte : copies a single byte of the selected cell and always returns 1 */
+static U32 HUFv06_decodeLastSymbolX4(void* op, BITv06_DStream_t* DStream, const HUFv06_DEltX4* dt, const U32 dtLog)
+{
+    const size_t val = BITv06_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 1);
+    if (dt[val].length==1) BITv06_skipBits(DStream, dt[val].nbBits);
+    else {   /* 2-symbol cell : its nbBits covers both symbols, so bitsConsumed is capped at the container size below */
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
+            BITv06_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+    }   }
+    return 1;
+}
+/* The three macros below must expand in a scope that provides `dt` and `dtLog` */
+/* _0 always decodes : writes at ptr and advances ptr by the returned length */
+#define HUFv06_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUFv06_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+/* _1 decodes only when MEM_64bits() is true or HUFv06_MAX_TABLELOG <= 12. NOTE(review): expands to an unbraced `if`, never follow it with `else` */
+#define HUFv06_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUFv06_MAX_TABLELOG<=12)) \
+        ptr += HUFv06_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+/* _2 decodes only when MEM_64bits() is true. NOTE(review): expands to an unbraced `if`, never follow it with `else` */
+#define HUFv06_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUFv06_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+static inline size_t HUFv06_decodeStreamX4(BYTE* p, BITv06_DStream_t* bitDPtr, BYTE* const pEnd, const HUFv06_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+    /* Decodes 1 or 2 bytes per step into [p, pEnd). Remaining room is measured as (pEnd - p), so no pointer located before a short buffer is ever formed. @return : bytes written (p - pStart) */
+    /* up to 8 symbols at a time */
+    while ((BITv06_reloadDStream(bitDPtr) == BITv06_DStream_unfinished) && ((pEnd - p) > 7)) {
+        HUFv06_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUFv06_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUFv06_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUFv06_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to the end : each step copies 2 bytes, so at least 2 bytes of room are required */
+    while ((BITv06_reloadDStream(bitDPtr) == BITv06_DStream_unfinished) && ((pEnd - p) >= 2))
+        HUFv06_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while ((pEnd - p) >= 2)
+        HUFv06_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)   /* exactly one byte of room left */
+        p += HUFv06_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+
+size_t HUFv06_decompress1X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{
+    /* table layout : DTable[0] is the table log, the double-symbol cells follow it */
+    const void* const tableBase = DTable;
+    const HUFv06_DEltX4* const cells = ((const HUFv06_DEltX4*)tableBase) + 1;
+    const U32 tableLog = DTable[0];
+
+    /* output window */
+    BYTE* const outBegin = (BYTE*) dst;
+    BYTE* const outEnd = outBegin + dstSize;
+
+    /* open the single bitstream covering the whole input */
+    BITv06_DStream_t stream;
+    size_t const initResult = BITv06_initDStream(&stream, (const BYTE*) cSrc, cSrcSize);
+    if (HUFv06_isError(initResult)) return initResult;
+
+    /* decode everything into [outBegin, outEnd) */
+    HUFv06_decodeStreamX4(outBegin, &stream, outEnd, cells, tableLog);
+
+    /* anything other than a bitstream reported as finished is treated as corruption */
+    if (!BITv06_endOfDStream(&stream)) return ERROR(corruption_detected);
+
+    return dstSize;   /* decoded size */
+}
+
+size_t HUFv06_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* one-shot variant : the table description sits at the head of cSrc, the bitstream right after it */
+    HUFv06_CREATE_STATIC_DTABLEX4(DTable, HUFv06_MAX_TABLELOG);
+    size_t const headerSize = HUFv06_readDTableX4 (DTable, cSrc, cSrcSize);
+
+    if (HUFv06_isError(headerSize)) return headerSize;   /* forward table-reading errors as-is */
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing left to decode after the header */
+
+    {   const BYTE* const payload = (const BYTE*) cSrc + headerSize;
+        return HUFv06_decompress1X4_usingDTable (dst, dstSize, payload, cSrcSize - headerSize, DTable);
+    }
+}
+
+size_t HUFv06_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{   /* Decodes 4 double-symbol streams : cSrc = 6-byte jump table (3 little-endian 16-bit stream sizes) + 4 bitstreams; streams 1-3 each fill (dstSize+3)/4 bytes of dst, stream 4 fills the rest. @return : dstSize, or an error code */
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable;
+        const HUFv06_DEltX4* const dt = ((const HUFv06_DEltX4*)dtPtr) +1;   /* `dt` and `dtLog` are also picked up by name by the DECODE_SYMBOLX4 macros */
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BITv06_DStream_t bitD1;
+        BITv06_DStream_t bitD2;
+        BITv06_DStream_t bitD3;
+        BITv06_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;   /* not stored in the jump table : whatever remains of cSrc */
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        if (opStart4 > oend) return ERROR(corruption_detected);   /* 3 segments do not fit in dst (dstSize 1, 2 or 5) : streams 2-3 would write past oend */
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BITv06_initDStream(&bitD1, istart1, length1);
+        if (HUFv06_isError(errorCode)) return errorCode;
+        errorCode = BITv06_initDStream(&bitD2, istart2, length2);
+        if (HUFv06_isError(errorCode)) return errorCode;
+        errorCode = BITv06_initDStream(&bitD3, istart3, length3);
+        if (HUFv06_isError(errorCode)) return errorCode;
+        errorCode = BITv06_initDStream(&bitD4, istart4, length4);
+        if (HUFv06_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BITv06_reloadDStream(&bitD1) | BITv06_reloadDStream(&bitD2) | BITv06_reloadDStream(&bitD3) | BITv06_reloadDStream(&bitD4);
+        for ( ; (endSignal==BITv06_DStream_unfinished) && ((oend - op4) > 7) ; ) {   /* distance test : never forms a pointer before dst */
+            HUFv06_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUFv06_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUFv06_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUFv06_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BITv06_reloadDStream(&bitD1) | BITv06_reloadDStream(&bitD2) | BITv06_reloadDStream(&bitD3) | BITv06_reloadDStream(&bitD4);
+        }
+
+        /* check corruption : no stream may have run into the next segment */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUFv06_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUFv06_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUFv06_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUFv06_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check : all four bitstreams must be reported as finished */
+        endSignal = BITv06_endOfDStream(&bitD1) & BITv06_endOfDStream(&bitD2) & BITv06_endOfDStream(&bitD3) & BITv06_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+size_t HUFv06_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* Reads the decoding table stored at the start of cSrc, then decodes the
+     * remaining bytes with HUFv06_decompress4X4_usingDTable().
+     * @return : result of that decoder, or an error code */
+    HUFv06_CREATE_STATIC_DTABLEX4(DTable, HUFv06_MAX_TABLELOG);
+    size_t const tableSize = HUFv06_readDTableX4 (DTable, cSrc, cSrcSize);
+
+    if (HUFv06_isError(tableSize)) return tableSize;   /* propagate the table-reading error */
+    if (tableSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing left after the table */
+
+    return HUFv06_decompress4X4_usingDTable (dst, dstSize, (const BYTE*)cSrc + tableSize, cSrcSize - tableSize, DTable);
+}
+
+
+
+
+/* ********************************/
+/* Generic decompression selector */
+/* ********************************/
+
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;   /* cost model : fixed table cost + cost per 256 output bytes (dstSize >> 8) */
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* rows : Q = cSrcSize * 16 / dstSize ; columns : {tableTime, decode256Time} for single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+/* HUFv06_decompress() : handles raw (cSrcSize == dstSize) and RLE (cSrcSize == 1) inputs itself,
+ * otherwise runs HUFv06_decompress4X2() or HUFv06_decompress4X4(), whichever has the lower
+ * estimated cost according to `algoTime`, and returns its result. */
+size_t HUFv06_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decompress[3] = { HUFv06_decompress4X2, HUFv06_decompress4X4, NULL };
+    U32 Dtime[3];   /* estimated cost of each algorithm */
+    U32 algoNb;
+
+    /* sanity checks and trivial cases */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* stored uncompressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* one byte repeated dstSize times */
+
+    /* cost estimation : table construction + decoding cost per 256 output bytes */
+    {   U32 const Q = (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 since dstSize > cSrcSize */
+        U32 const D256 = (U32)(dstSize >> 8);
+        U32 n;
+        for (n=0; n<3; n++) {
+            algo_time_t const cost = algoTime[Q][n];
+            Dtime[n] = cost.tableTime + (cost.decode256Time * D256);
+    }   }
+
+    /* small handicap for the algorithms using more memory, for cache eviction */
+    Dtime[1] += Dtime[1] >> 4; Dtime[2] += Dtime[2] >> 3;
+
+    algoNb = (Dtime[1] < Dtime[0]) ? 1 : 0;   /* entry 2 (quad-symbols) is never selected */
+    return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
+}
+/*
+    Common functions of Zstd compression library
+    Copyright (C) 2015-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net/
+*/
+
+
+/*-****************************************
+*  Version
+******************************************/
+
+/*-****************************************
+*  ZSTD Error Management
+******************************************/
+/*! ZSTDv06_isError() :
+*   tells if a return value is an error code (forwards `code` to ERR_isError()) */
+unsigned ZSTDv06_isError(size_t code) { return ERR_isError(code); }
+
+/*! ZSTDv06_getErrorName() :
+*   provides error code string from function result (useful for debugging) ; forwards `code` to ERR_getErrorName() */
+const char* ZSTDv06_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/* **************************************************************
+*  ZBUFF Error Management
+****************************************************************/
+unsigned ZBUFFv06_isError(size_t errorCode) { return ERR_isError(errorCode); }   /* non-zero when `errorCode` is an error code, as decided by ERR_isError() */
+
+const char* ZBUFFv06_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }   /* string returned by ERR_getErrorName() for `errorCode` */
+/*
+    zstd - standard compression library
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net
+*/
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Select how default decompression function ZSTDv06_decompress() will allocate memory,
+ * in memory stack (0), or in memory heap (1, requires malloc())
+ */
+#ifndef ZSTDv06_HEAPMODE
+#  define ZSTDv06_HEAPMODE 1
+#endif
+
+
+
+/*-*******************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+
+/*-*************************************
+*  Macros
+***************************************/
+#define ZSTDv06_isError ERR_isError   /* for inlining */
+#define FSEv06_isError  ERR_isError   /* FSE results are tested with the same ERR_isError() */
+#define HUFv06_isError  ERR_isError   /* Huffman results are tested with the same ERR_isError() */
+
+
+/*_*******************************************************
+*  Memory operations
+**********************************************************/
+static void ZSTDv06_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }   /* copies exactly 4 bytes from src to dst */
+
+
+/*-*************************************************************
+*   Context management
+***************************************************************/
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock } ZSTDv06_dStage;   /* decoding stages ; a context starts in ZSTDds_getFrameHeaderSize */
+
+struct ZSTDv06_DCtx_s
+{
+    FSEv06_DTable LLTable[FSEv06_DTABLE_SIZE_U32(LLFSELog)];     /* presumably the literal-length FSE table - usage not visible here */
+    FSEv06_DTable OffTable[FSEv06_DTABLE_SIZE_U32(OffFSELog)];   /* presumably the offset FSE table - usage not visible here */
+    FSEv06_DTable MLTable[FSEv06_DTABLE_SIZE_U32(MLFSELog)];     /* presumably the match-length FSE table - usage not visible here */
+    unsigned   hufTableX4[HUFv06_DTABLE_SIZE(HufLog)];   /* Huffman table used by IS_PCH literal blocks ; [0] is initialized to HufLog */
+    const void* previousDstEnd;   /* reset to NULL by ZSTDv06_decompressBegin(), like the 3 pointers below */
+    const void* base;
+    const void* vBase;
+    const void* dictEnd;
+    size_t expected;   /* starts at ZSTDv06_frameHeaderSize_min */
+    size_t headerSize;
+    ZSTDv06_frameParams fParams;   /* filled by ZSTDv06_decodeFrameHeader() */
+    blockType_t bType;   /* used in ZSTDv06_decompressContinue(), to transfer blockType between header decoding and block decoding stages */
+    ZSTDv06_dStage stage;
+    U32 flagRepeatTable;   /* must be non-zero for IS_PCH literal blocks to be accepted ; reset to 0 by ZSTDv06_decompressBegin() */
+    const BYTE* litPtr;    /* literals of the current block : litBuffer, or a direct reference into the source for raw literals */
+    size_t litSize;        /* number of literal bytes available at litPtr */
+    BYTE litBuffer[ZSTDv06_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];   /* decoded literals, followed by WILDCOPY_OVERLENGTH padding bytes */
+    BYTE headerBuffer[ZSTDv06_FRAMEHEADERSIZE_MAX];
+};  /* typedef'd to ZSTDv06_DCtx within "zstd_static.h" */
+
+size_t ZSTDv06_sizeofDCtx (void) { return sizeof(ZSTDv06_DCtx); }   /* non published interface : size in bytes of a complete ZSTDv06_DCtx */
+
+size_t ZSTDv06_decompressBegin(ZSTDv06_DCtx* dctx)
+{
+    /* state machine : first thing expected is the smallest possible frame header */
+    dctx->stage = ZSTDds_getFrameHeaderSize;
+    dctx->expected = ZSTDv06_frameHeaderSize_min;
+    /* reset every window / dictionary pointer */
+    dctx->previousDstEnd = dctx->base = dctx->vBase = dctx->dictEnd = NULL;
+    /* entropy state : IS_PCH literal blocks are rejected while flagRepeatTable is 0 */
+    dctx->hufTableX4[0] = HufLog;
+    dctx->flagRepeatTable = 0;
+    return 0;   /* always succeeds */
+}
+
+ZSTDv06_DCtx* ZSTDv06_createDCtx(void)
+{
+    /* allocate a context on the heap and put it in its initial state */
+    ZSTDv06_DCtx* const newCtx = (ZSTDv06_DCtx*)malloc(sizeof(*newCtx));
+    if (newCtx != NULL) ZSTDv06_decompressBegin(newCtx);
+    return newCtx;   /* NULL when the allocation failed */
+}
+
+size_t ZSTDv06_freeDCtx(ZSTDv06_DCtx* dctx)
+{
+    free(dctx);   /* counterpart of the malloc() in ZSTDv06_createDCtx() ; free(NULL) is a no-op */
+    return 0;   /* reserved as a potential error code in the future */
+}
+
+void ZSTDv06_copyDCtx(ZSTDv06_DCtx* dstDCtx, const ZSTDv06_DCtx* srcDCtx)
+{
+    size_t const workspaceSize = ZSTDv06_BLOCKSIZE_MAX+WILDCOPY_OVERLENGTH + ZSTDv06_frameHeaderSize_max;   /* NOTE(review): assumed to be the size of the trailing litBuffer + headerBuffer members - confirm */
+    memcpy(dstDCtx, srcDCtx, sizeof(ZSTDv06_DCtx) - workspaceSize);   /* no need to copy workspace */
+}
+
+
+/*-*************************************************************
+*   Decompression section
+***************************************************************/
+
+/* Frame format description
+   Frame Header -  [ Block Header - Block ] - Frame End
+   1) Frame Header
+      - 4 bytes - Magic Number : ZSTDv06_MAGICNUMBER (defined within zstd_static.h)
+      - 1 byte  - Frame Descriptor
+   2) Block Header
+      - 3 bytes, starting with a 2-bits descriptor
+                 Uncompressed, Compressed, Frame End, unused
+   3) Block
+      See Block Format Description
+   4) Frame End
+      - 3 bytes, compatible with Block Header
+*/
+
+
+/* Frame descriptor
+
+   1 byte, using :
+   bit 0-3 : windowLog - ZSTDv06_WINDOWLOG_ABSOLUTEMIN   (see zstd_internal.h)
+   bit 4   : minmatch 4(0) or 3(1)
+   bit 5   : reserved (must be zero)
+   bit 6-7 : Frame content size : unknown, 1 byte, 2 bytes, 8 bytes
+
+   Optional : content size (0, 1, 2 or 8 bytes)
+   0 : unknown
+   1 : 0-255 bytes
+   2 : 256 - 65535+256
+   8 : up to 16 exa
+*/
+
+
+/* Compressed Block, format description
+
+   Block = Literal Section - Sequences Section
+   Prerequisite : size of (compressed) block, maximum size of regenerated data
+
+   1) Literal Section
+
+   1.1) Header : 1-5 bytes
+        flags: 2 bits
+            00 compressed by Huff0
+            01 unused
+            10 is Raw (uncompressed)
+            11 is Rle
+            Note : using 01 => Huff0 with precomputed table ?
+            Note : delta map ? => compressed ?
+
+   1.1.1) Huff0-compressed literal block : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => 4 streams
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+   1.1.2) Raw (uncompressed) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RAW<<6) + (0<<4) + size
+               12 bits: (IS_RAW<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RAW<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.3) Rle (repeated single byte) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RLE<<6) + (0<<4) + size
+               12 bits: (IS_RLE<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RLE<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.4) Huff0-compressed literal block, using precomputed CTables : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10)
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+        1- CTable available (stored into workspace ?)
+        2- Small input (fast heuristic ? Full comparison ? depend on clevel ?)
+
+
+   1.2) Literal block content
+
+   1.2.1) Huff0 block, using sizes from header
+        See Huff0 format
+
+   1.2.2) Huff0 block, using prepared table
+
+   1.2.3) Raw content
+
+   1.2.4) single byte
+
+
+   2) Sequences section
+      TO DO
+*/
+
+/** ZSTDv06_frameHeaderSize() :
+*   srcSize must be >= ZSTDv06_frameHeaderSize_min.
+*   @return : size of the Frame Header */
+static size_t ZSTDv06_frameHeaderSize(const void* src, size_t srcSize)
+{
+    const BYTE* const header = (const BYTE*)src;
+    if (srcSize < ZSTDv06_frameHeaderSize_min) return ERROR(srcSize_wrong);
+    return ZSTDv06_frameHeaderSize_min + ZSTDv06_fcs_fieldSize[header[4] >> 6];   /* 2 top bits of byte 4 select the width of the optional content-size field */
+}
+
+
+/** ZSTDv06_getFrameParams() :
+*   decode Frame Header, or provide expected `srcSize`.
+*   @return : 0, `fparamsPtr` is correctly filled,
+*            >0, `srcSize` is too small, result is expected `srcSize`,
+*             or an error code, which can be tested using ZSTDv06_isError() */
+size_t ZSTDv06_getFrameParams(ZSTDv06_frameParams* fparamsPtr, const void* src, size_t srcSize)
+{
+    const BYTE* const header = (const BYTE*)src;
+    size_t headerSize;
+    BYTE descriptor;
+
+    if (srcSize < ZSTDv06_frameHeaderSize_min) return ZSTDv06_frameHeaderSize_min;   /* not an error : tells how much input is needed */
+    if (MEM_readLE32(src) != ZSTDv06_MAGICNUMBER) return ERROR(prefix_unknown);
+
+    headerSize = ZSTDv06_frameHeaderSize(src, srcSize);
+    if (srcSize < headerSize) return headerSize;   /* the full header, optional content-size field included, must be present */
+
+    memset(fparamsPtr, 0, sizeof(*fparamsPtr));
+    descriptor = header[4];
+    fparamsPtr->windowLog = (descriptor & 0xF) + ZSTDv06_WINDOWLOG_ABSOLUTEMIN;
+    if (descriptor & 0x20) return ERROR(frameParameter_unsupported);   /* reserved bit must be zero */
+    switch(descriptor >> 6)   /* selects the width of the content-size field */
+    {
+        case 1 : fparamsPtr->frameContentSize = header[5]; break;
+        case 2 : fparamsPtr->frameContentSize = MEM_readLE16(header+5)+256; break;
+        case 3 : fparamsPtr->frameContentSize = MEM_readLE64(header+5); break;
+        default: fparamsPtr->frameContentSize = 0; break;   /* selector 0 : no content-size field */
+    }
+    return 0;
+}
+
+
+/** ZSTDv06_decodeFrameHeader() :
+*   `srcSize` must be the size provided by ZSTDv06_frameHeaderSize().
+*   @return : 0 if success, or an error code, which can be tested using ZSTDv06_isError() */
+static size_t ZSTDv06_decodeFrameHeader(ZSTDv06_DCtx* zc, const void* src, size_t srcSize)
+{
+    size_t const result = ZSTDv06_getFrameParams(&(zc->fParams), src, srcSize);
+    int const windowTooLarge = MEM_32bits() && (zc->fParams.windowLog > 25);   /* 32-bit builds refuse windowLog above 25 */
+    return windowTooLarge ? ERROR(frameParameter_unsupported) : result;
+}
+
+
+typedef struct
+{
+    blockType_t blockType;   /* 2 top bits of the first block header byte */
+    U32 origSize;            /* size field of the header for bt_rle blocks, 0 for every other type */
+} blockProperties_t;
+
+/*! ZSTDv06_getcBlockSize() :
+*   Provides the size of compressed block from block header `src` */
+size_t ZSTDv06_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const header = (const BYTE*)src;
+    U32 sizeField;
+
+    if (srcSize < ZSTDv06_blockHeaderSize) return ERROR(srcSize_wrong);
+
+    /* byte 0 : block type (2 top bits) + highest 3 bits of the size ; bytes 1-2 : remaining 16 bits of the size */
+    bpPtr->blockType = (blockType_t)(header[0] >> 6);
+    sizeField = ((header[0] & 7)<<16) + (header[1]<<8) + header[2];
+    bpPtr->origSize = (bpPtr->blockType == bt_rle) ? sizeField : 0;
+
+    if (bpPtr->blockType == bt_end) return 0;   /* nothing follows an end marker */
+    return (bpPtr->blockType == bt_rle) ? 1 : sizeField;   /* an rle block is stored as a single byte */
+}
+
+
+static size_t ZSTDv06_copyRawBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    int const fits = (srcSize <= dstCapacity);   /* the block is copied as is, so it must fit entirely in dst */
+    if (fits) memcpy(dst, src, srcSize);
+    return fits ? srcSize : ERROR(dstSize_tooSmall);
+}
+
+
+/*! ZSTDv06_decodeLiteralsBlock() :
+    @return : nb of bytes read from src (< srcSize ) */
+size_t ZSTDv06_decodeLiteralsBlock(ZSTDv06_DCtx* dctx,
+                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+{
+    const BYTE* const istart = (const BYTE*) src;
+
+    /* any compressed block with literals segment must be at least this size */
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    switch(istart[0]>> 6)   /* literals block type : 2 top bits of the first byte */
+    {
+    case IS_HUF:
+        {   size_t litSize, litCSize, singleStream=0;
+            U32 lhSize = ((istart[0]) >> 4) & 3;   /* header format selector ; replaced below by the header size in bytes */
+            if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for lhSize, + cSize (+nbSeq) */
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                /* 2 - 2 - 10 - 10 */
+                lhSize=3;
+                singleStream = istart[0] & 16;   /* selector 1 => single stream, selector 0 => 4 streams */
+                litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+                litCSize = ((istart[1] &  3) << 8) + istart[2];
+                break;
+            case 2:
+                /* 2 - 2 - 14 - 14 */
+                lhSize=4;
+                litSize  = ((istart[0] & 15) << 10) + (istart[1] << 2) + (istart[2] >> 6);
+                litCSize = ((istart[2] & 63) <<  8) + istart[3];
+                break;
+            case 3:
+                /* 2 - 2 - 18 - 18 */
+                lhSize=5;
+                litSize  = ((istart[0] & 15) << 14) + (istart[1] << 6) + (istart[2] >> 2);
+                litCSize = ((istart[2] &  3) << 16) + (istart[3] << 8) + istart[4];
+                break;
+            }
+            if (litSize > ZSTDv06_BLOCKSIZE_MAX) return ERROR(corruption_detected);   /* would not fit in litBuffer */
+            if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);   /* compressed literals extend beyond src */
+
+            if (HUFv06_isError(singleStream ?
+                            HUFv06_decompress1X2(dctx->litBuffer, litSize, istart+lhSize, litCSize) :
+                            HUFv06_decompress   (dctx->litBuffer, litSize, istart+lhSize, litCSize) ))
+                return ERROR(corruption_detected);
+
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);   /* clear the padding bytes located after the literals */
+            return litCSize + lhSize;
+        }
+    case IS_PCH:
+        {   size_t litSize, litCSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            if (lhSize != 1)  /* only case supported for now : small litSize, single stream */
+                return ERROR(corruption_detected);
+            if (!dctx->flagRepeatTable)   /* no table available in dctx->hufTableX4 to decode with */
+                return ERROR(dictionary_corrupted);
+
+            /* 2 - 2 - 10 - 10 */
+            lhSize=3;
+            litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+            litCSize = ((istart[1] &  3) << 8) + istart[2];
+            if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+
+            {   size_t const errorCode = HUFv06_decompress1X4_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->hufTableX4);
+                if (HUFv06_isError(errorCode)) return ERROR(corruption_detected);
+            }
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);   /* clear the padding bytes located after the literals */
+            return litCSize + lhSize;
+        }
+    case IS_RAW:
+        {   size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                lhSize=1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:   /* lhSize stays 2 : here the selector value is also the header size */
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:   /* lhSize stays 3 : here the selector value is also the header size */
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                break;
+            }
+
+            if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
+                if (litSize+lhSize > srcSize) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart+lhSize, litSize);   /* work on a private, padded copy instead */
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+                return lhSize+litSize;
+            }
+            /* direct reference into compressed stream */
+            dctx->litPtr = istart+lhSize;
+            dctx->litSize = litSize;
+            return lhSize+litSize;
+        }
+    case IS_RLE:
+        {   size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                lhSize = 1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                if (srcSize<4) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
+                break;
+            }
+            if (litSize > ZSTDv06_BLOCKSIZE_MAX) return ERROR(corruption_detected);   /* would not fit in litBuffer */
+            memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);   /* the byte following the header is repeated, padding included */
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            return lhSize+1;   /* header + the single byte to repeat */
+        }
+    default:
+        return ERROR(corruption_detected);   /* impossible */
+    }
+}
+
+
+/*! ZSTDv06_buildSeqTable() :
+    @return : nb bytes read from src,
+              or an error code if it fails, testable with ZSTDv06_isError()
+*/
+size_t ZSTDv06_buildSeqTable(FSEv06_DTable* DTable, U32 type, U32 max, U32 maxLog,
+                                 const void* src, size_t srcSize,
+                                 const S16* defaultNorm, U32 defaultLog, U32 flagRepeatTable)
+{
+    if (type == FSEv06_ENCODING_RLE) {   /* single symbol, stored in the first byte of src */
+        BYTE symbol;
+        if (!srcSize) return ERROR(srcSize_wrong);
+        symbol = *(const BYTE*)src;
+        if (symbol > max) return ERROR(corruption_detected);
+        FSEv06_buildDTable_rle(DTable, symbol);
+        return 1;
+    }
+    if (type == FSEv06_ENCODING_RAW) {   /* caller-provided default distribution : nothing is read from src */
+        FSEv06_buildDTable(DTable, defaultNorm, max, defaultLog);
+        return 0;
+    }
+    if (type == FSEv06_ENCODING_STATIC)   /* DTable is left untouched ; only accepted when flagRepeatTable is set */
+        return flagRepeatTable ? 0 : ERROR(corruption_detected);
+
+    /* FSEv06_ENCODING_DYNAMIC, and any other value : distribution is described at the start of src */
+    {   S16 norm[MaxSeq+1];
+        U32 tableLog;
+        size_t const headerSize = FSEv06_readNCount(norm, &max, &tableLog, src, srcSize);
+        if (FSEv06_isError(headerSize) || (tableLog > maxLog)) return ERROR(corruption_detected);
+        FSEv06_buildDTable(DTable, norm, max, tableLog);
+        return headerSize;
+    }
+}
+
+/*! ZSTDv06_decodeSeqHeaders() :
+    stores the number of sequences into *nbSeqPtr, then builds the LL, Off and ML decoding tables described by the section header.
+    @return : nb bytes read from src, or an error code if it fails, testable with ZSTDv06_isError() */
+size_t ZSTDv06_decodeSeqHeaders(int* nbSeqPtr,
+                             FSEv06_DTable* DTableLL, FSEv06_DTable* DTableML, FSEv06_DTable* DTableOffb, U32 flagRepeatTable,
+                             const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip = istart;
+
+    if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);   /* check */
+
+    /* SeqHead */
+    {   int nbSeq = *ip++;
+        if (!nbSeq) { *nbSeqPtr=0; return 1; }   /* no sequence : only the count byte is consumed */
+        if (nbSeq > 0x7F) {
+            if (nbSeq == 0xFF) {   /* 3-bytes format : 0xFF marker, then a 16-bit little-endian value */
+                if ((iend - ip) < 2) return ERROR(srcSize_wrong);
+                nbSeq = MEM_readLE16(ip) + LONGNBSEQ; ip+=2;
+            } else {               /* 2-bytes format */
+                if (ip >= iend) return ERROR(srcSize_wrong);
+                nbSeq = ((nbSeq-0x80)<<8) + *ip++;
+            }
+        }
+        *nbSeqPtr = nbSeq;
+    }
+
+    /* FSE table descriptors */
+    /* check BEFORE reading *ip : descriptor byte + min 3 bytes (all 3 tables "raw" : no header, but at least xxLog bits per type) */
+    if ((iend - ip) < 4) return ERROR(srcSize_wrong);
+    {   U32 const LLtype  = *ip >> 6;
+        U32 const Offtype = (*ip >> 4) & 3;
+        U32 const MLtype  = (*ip >> 2) & 3;
+        ip++;
+
+        /* Build DTables */
+        {   size_t const bhSize = ZSTDv06_buildSeqTable(DTableLL, LLtype, MaxLL, LLFSELog, ip, iend-ip, LL_defaultNorm, LL_defaultNormLog, flagRepeatTable);
+            if (ZSTDv06_isError(bhSize)) return ERROR(corruption_detected);
+            ip += bhSize;
+        }
+        {   size_t const bhSize = ZSTDv06_buildSeqTable(DTableOffb, Offtype, MaxOff, OffFSELog, ip, iend-ip, OF_defaultNorm, OF_defaultNormLog, flagRepeatTable);
+            if (ZSTDv06_isError(bhSize)) return ERROR(corruption_detected);
+            ip += bhSize;
+        }
+        {   size_t const bhSize = ZSTDv06_buildSeqTable(DTableML, MLtype, MaxML, MLFSELog, ip, iend-ip, ML_defaultNorm, ML_defaultNormLog, flagRepeatTable);
+            if (ZSTDv06_isError(bhSize)) return ERROR(corruption_detected);
+            ip += bhSize;
+    }   }
+
+    return ip-istart;
+}
+
+
+typedef struct {
+    size_t litLength;     /* number of literal bytes copied before the match */
+    size_t matchLength;   /* number of bytes copied from the match position */
+    size_t offset;        /* distance, counted backwards from the end of the literals, to the start of the match */
+} seq_t;
+
+typedef struct {
+    BITv06_DStream_t DStream;   /* bitstream shared by the 3 FSE states and the extra bits */
+    FSEv06_DState_t stateLL;    /* FSE state yielding literal-length codes */
+    FSEv06_DState_t stateOffb;  /* FSE state yielding offset codes */
+    FSEv06_DState_t stateML;    /* FSE state yielding match-length codes */
+    size_t prevOffset[ZSTDv06_REP_INIT];   /* most recent offsets, [0] being the latest ; referenced by repeat-offset codes */
+} seqState_t;
+
+
+
+static void ZSTDv06_decodeSequence(seq_t* seq, seqState_t* seqState)
+{
+    /* current symbol of each of the 3 FSE states ; the states themselves are only advanced at the end of the function */
+    U32 const llCode = FSEv06_peekSymbol(&(seqState->stateLL));
+    U32 const mlCode = FSEv06_peekSymbol(&(seqState->stateML));
+    U32 const ofCode = FSEv06_peekSymbol(&(seqState->stateOffb));   /* <= maxOff, by table construction */
+
+    U32 const llBits = LL_bits[llCode];
+    U32 const mlBits = ML_bits[mlCode];
+    U32 const ofBits = ofCode;   /* the offset code is also the number of extra bits to read */
+    U32 const totalBits = llBits+mlBits+ofBits;
+
+    static const U32 LL_base[MaxLL+1] = {
+                             0,  1,  2,  3,  4,  5,  6,  7,  8,  9,   10,    11,    12,    13,    14,     15,
+                            16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                            0x2000, 0x4000, 0x8000, 0x10000 };
+
+    static const U32 ML_base[MaxML+1] = {
+                             0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,   11,    12,    13,    14,    15,
+                            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,   27,    28,    29,    30,    31,
+                            32, 34, 36, 38, 40, 44, 48, 56, 64, 80, 96, 0x80, 0x100, 0x200, 0x400, 0x800,
+                            0x1000, 0x2000, 0x4000, 0x8000, 0x10000 };
+
+    static const U32 OF_base[MaxOff+1] = {
+                 0,        1,       3,       7,     0xF,     0x1F,     0x3F,     0x7F,
+                 0xFF,   0x1FF,   0x3FF,   0x7FF,   0xFFF,   0x1FFF,   0x3FFF,   0x7FFF,
+                 0xFFFF, 0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,
+                 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, /*fake*/ 1, 1 };
+
+    /* sequence */
+    {   size_t offset;
+        if (!ofCode)
+            offset = 0;
+        else {
+            offset = OF_base[ofCode] + BITv06_readBits(&(seqState->DStream), ofBits);   /* <=  26 bits */
+            if (MEM_32bits()) BITv06_reloadDStream(&(seqState->DStream));
+        }
+
+        if (offset < ZSTDv06_REP_NUM) {   /* small values are repeat-offset codes, indexing prevOffset[] */
+            if (llCode == 0 && offset <= 1) offset = 1-offset;   /* with llCode 0, codes 0 and 1 are swapped */
+
+            if (offset != 0) {
+                size_t temp = seqState->prevOffset[offset];   /* selected entry moves to the front of the history */
+                if (offset != 1) {
+                    seqState->prevOffset[2] = seqState->prevOffset[1];
+                }
+                seqState->prevOffset[1] = seqState->prevOffset[0];
+                seqState->prevOffset[0] = offset = temp;
+
+            } else {
+                offset = seqState->prevOffset[0];   /* re-use the latest offset ; history is unchanged */
+            }
+        } else {
+            offset -= ZSTDv06_REP_MOVE;   /* explicit offset : subtract ZSTDv06_REP_MOVE, then push it in front of the history */
+            seqState->prevOffset[2] = seqState->prevOffset[1];
+            seqState->prevOffset[1] = seqState->prevOffset[0];
+            seqState->prevOffset[0] = offset;
+        }
+        seq->offset = offset;
+    }
+
+    seq->matchLength = ML_base[mlCode] + MINMATCH + ((mlCode>31) ? BITv06_readBits(&(seqState->DStream), mlBits) : 0);   /* <=  16 bits */
+    if (MEM_32bits() && (mlBits+llBits>24)) BITv06_reloadDStream(&(seqState->DStream));
+
+    seq->litLength = LL_base[llCode] + ((llCode>15) ? BITv06_readBits(&(seqState->DStream), llBits) : 0);   /* <=  16 bits */
+    if (MEM_32bits() ||
+       (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BITv06_reloadDStream(&(seqState->DStream));   /* reload when the state updates below could need more bits than remain */
+
+    /* ANS state update */
+    FSEv06_updateState(&(seqState->stateLL), &(seqState->DStream));   /* <=  9 bits */
+    FSEv06_updateState(&(seqState->stateML), &(seqState->DStream));   /* <=  9 bits */
+    if (MEM_32bits()) BITv06_reloadDStream(&(seqState->DStream));     /* <= 18 bits */
+    FSEv06_updateState(&(seqState->stateOffb), &(seqState->DStream)); /* <=  8 bits */
+}
+
+
+/*! ZSTDv06_execSequence() :
+*   Regenerates one sequence at `op` : copies `sequence.litLength` literals from `*litPtr`,
+*   then copies `sequence.matchLength` bytes from the match located `sequence.offset` bytes before the end of those literals.
+*   A match may start inside the external dictionary segment (ending at `dictEnd`) and continue into the current prefix (starting at `base`).
+*   `*litPtr` is advanced past the literals consumed.
+*   @return : nb of bytes written (litLength + matchLength),
+*             or ERROR(dstSize_tooSmall) when the sequence does not fit before `oend`,
+*             or ERROR(corruption_detected) when literals or offset reach outside their valid range */
+size_t ZSTDv06_execSequence(BYTE* op,
+                                BYTE* const oend, seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_8 = oend-8;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check */
+    /* size-based checks come first : lengths are read from (possibly corrupted) input,
+     * and a pointer already advanced by a huge length can wrap around and defeat a pointer comparison */
+    if (sequenceLength > (size_t)(oend - op)) return ERROR(dstSize_tooSmall);
+    if (sequence.litLength > (size_t)(litLimit - *litPtr)) return ERROR(corruption_detected);
+    /* lengths are now known to be in range : pointer comparisons below are reliable */
+    if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall);   /* last match must start at a minimum distance of 8 from oend */
+    if (oMatchEnd > oend) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
+    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+
+    /* copy Literals */
+    ZSTDv06_wildcopy(op, *litPtr, sequence.litLength);   /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* update for next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected);
+        match = dictEnd - (base-match);
+        if (match + sequence.matchLength <= dictEnd) {
+            /* match entirely within the dictionary segment */
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
+            if (op > oend_8 || sequence.matchLength < MINMATCH) {
+              /* too close to the end of dst, or too short, for the 8-byte copies below : finish byte by byte */
+              while (op < oMatchEnd) *op++ = *match++;
+              return sequenceLength;
+            }
+    }   }
+    /* Requirement: op <= oend_8 */
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* substracted */
+        int const sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTDv06_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTDv06_copy8(op, match);
+    }
+    op += 8; match += 8;
+
+    if (oMatchEnd > oend-(16-MINMATCH)) {
+        /* match ends close to oend : wildcopy only up to oend_8, then finish byte by byte */
+        if (op < oend_8) {
+            ZSTDv06_wildcopy(op, match, oend_8 - op);
+            match += oend_8 - op;
+            op = oend_8;
+        }
+        while (op < oMatchEnd) *op++ = *match++;
+    } else {
+        ZSTDv06_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
+}
+
+
+/*! ZSTDv06_decompressSequences() :
+*   Decodes the sequences section [seqStart, seqStart+seqSize) of a compressed block and regenerates its content into `dst`,
+*   using the literals previously referenced by dctx->litPtr / dctx->litSize.
+*   Literals left over once all sequences are executed are appended at the end.
+*   @return : nb of bytes written into `dst`,
+*             or an error code (header decoding error, corruption_detected, dstSize_tooSmall) */
+static size_t ZSTDv06_decompressSequences(
+                               ZSTDv06_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + maxDstSize;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    FSEv06_DTable* DTableLL = dctx->LLTable;
+    FSEv06_DTable* DTableML = dctx->MLTable;
+    FSEv06_DTable* DTableOffb = dctx->OffTable;
+    const BYTE* const base = (const BYTE*) (dctx->base);
+    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+    int nbSeq;
+
+    /* Build Decoding Tables */
+    {   size_t const seqHSize = ZSTDv06_decodeSeqHeaders(&nbSeq, DTableLL, DTableML, DTableOffb, dctx->flagRepeatTable, ip, seqSize);
+        if (ZSTDv06_isError(seqHSize)) return seqHSize;
+        ip += seqHSize;
+        dctx->flagRepeatTable = 0;
+    }
+
+    /* Regen sequences */
+    if (nbSeq) {
+        seq_t sequence;
+        seqState_t seqState;
+
+        memset(&sequence, 0, sizeof(sequence));
+        sequence.offset = REPCODE_STARTVALUE;
+        { U32 i; for (i=0; i<ZSTDv06_REP_INIT; i++) seqState.prevOffset[i] = REPCODE_STARTVALUE; }
+        { size_t const errorCode = BITv06_initDStream(&(seqState.DStream), ip, iend-ip);
+          if (ERR_isError(errorCode)) return ERROR(corruption_detected); }
+        FSEv06_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSEv06_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSEv06_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        /* decode and execute one sequence per iteration, as long as the bitstream is not over-consumed */
+        for ( ; (BITv06_reloadDStream(&(seqState.DStream)) <= BITv06_DStream_completed) && nbSeq ; ) {
+            nbSeq--;
+            ZSTDv06_decodeSequence(&sequence, &seqState);
+
+#if 0  /* debug */
+            static BYTE* start = NULL;
+            if (start==NULL) start = op;
+            size_t pos = (size_t)(op-start);
+            if ((pos >= 5810037) && (pos < 5810400))
+                printf("Dpos %6u :%5u literals & match %3u bytes at distance %6u \n",
+                       pos, (U32)sequence.litLength, (U32)sequence.matchLength, (U32)sequence.offset);
+#endif
+
+            {   size_t const oneSeqSize = ZSTDv06_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+                if (ZSTDv06_isError(oneSeqSize)) return oneSeqSize;
+                op += oneSeqSize;
+        }   }
+
+        /* check if reached exact end */
+        if (nbSeq) return ERROR(corruption_detected);
+    }
+
+    /* last literal segment */
+    {   size_t const lastLLSize = litEnd - litPtr;
+        if (litPtr > litEnd) return ERROR(corruption_detected);   /* too many literals already used */
+        /* compare sizes rather than `op+lastLLSize` : the pointer sum could wrap around */
+        if (lastLLSize > (size_t)(oend - op)) return ERROR(dstSize_tooSmall);
+        /* skip memcpy() when there is nothing to copy : `op` or `litPtr` may be NULL in that case,
+         * and passing NULL to memcpy() is undefined behavior even with a size of 0 */
+        if (lastLLSize > 0) {
+            memcpy(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+    }
+
+    return op-ostart;
+}
+
+
+/* ZSTDv06_checkContinuity() :
+ * When `dst` does not directly follow the previously written output,
+ * the previous output becomes the "dictionary" segment (dictEnd / vBase)
+ * and `dst` becomes the start of a new prefix (base / previousDstEnd). */
+static void ZSTDv06_checkContinuity(ZSTDv06_DCtx* dctx, const void* dst)
+{
+    if (dst == dctx->previousDstEnd) return;   /* contiguous : nothing to update */
+
+    {   const char* const prevEnd  = (const char*)(dctx->previousDstEnd);
+        const char* const prevBase = (const char*)(dctx->base);
+        dctx->dictEnd = dctx->previousDstEnd;
+        dctx->vBase = (const char*)dst - (prevEnd - prevBase);
+        dctx->base = dst;
+        dctx->previousDstEnd = dst;
+    }
+}
+
+
+/* ZSTDv06_decompressBlock_internal() :
+ * Decodes one compressed block : the literals sub-block first, then the sequences that follow it.
+ * @return : result of ZSTDv06_decompressSequences(), or an error code */
+static size_t ZSTDv06_decompressBlock_internal(ZSTDv06_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{   /* blockType == blockCompressed */
+    size_t litCSize;
+
+    if (srcSize >= ZSTDv06_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);
+
+    /* Decode literals sub-block */
+    litCSize = ZSTDv06_decodeLiteralsBlock(dctx, src, srcSize);
+    if (ZSTDv06_isError(litCSize)) return litCSize;
+
+    /* Sequences start right after the literals sub-block */
+    return ZSTDv06_decompressSequences(dctx, dst, dstCapacity,
+                                       (const BYTE*)src + litCSize, srcSize - litCSize);
+}
+
+
+/* ZSTDv06_decompressBlock() :
+ * Updates the continuity information of `dctx` for `dst`, then decodes one compressed block.
+ * @return : result of ZSTDv06_decompressBlock_internal() */
+size_t ZSTDv06_decompressBlock(ZSTDv06_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    size_t blockResult;
+    ZSTDv06_checkContinuity(dctx, dst);
+    blockResult = ZSTDv06_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+    return blockResult;
+}
+
+
+/*! ZSTDv06_decompressFrame() :
+*   `dctx` must be properly initialized */
+static size_t ZSTDv06_decompressFrame(ZSTDv06_DCtx* dctx,
+                                 void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize)
+{   /* Decodes one complete frame from `src` into `dst` : frame header first, then every block until a bt_end block.
+     * @return : nb of bytes written into `dst`, or an error code.
+     * Note : bt_rle blocks are rejected with ERROR(GENERIC). */
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* const iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstCapacity;
+    size_t remainingSize = srcSize;   /* nb of input bytes not yet consumed */
+    blockProperties_t blockProperties = { bt_compressed, 0 };
+
+    /* check */
+    if (srcSize < ZSTDv06_frameHeaderSize_min+ZSTDv06_blockHeaderSize) return ERROR(srcSize_wrong);
+
+    /* Frame Header */
+    {   size_t const frameHeaderSize = ZSTDv06_frameHeaderSize(src, ZSTDv06_frameHeaderSize_min);
+        if (ZSTDv06_isError(frameHeaderSize)) return frameHeaderSize;
+        if (srcSize < frameHeaderSize+ZSTDv06_blockHeaderSize) return ERROR(srcSize_wrong);
+        if (ZSTDv06_decodeFrameHeader(dctx, src, frameHeaderSize)) return ERROR(corruption_detected);   /* any non-zero result is reported as corruption */
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1) {
+        size_t decodedSize=0;
+        size_t const cBlockSize = ZSTDv06_getcBlockSize(ip, iend-ip, &blockProperties);
+        if (ZSTDv06_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTDv06_blockHeaderSize;
+        remainingSize -= ZSTDv06_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);   /* block content must be fully present in src */
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTDv06_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize);
+            break;
+        case bt_raw :
+            decodedSize = ZSTDv06_copyRawBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet supported */
+            break;
+        case bt_end :
+            /* end of frame */
+            if (remainingSize) return ERROR(srcSize_wrong);   /* no input may remain after the end marker */
+            break;
+        default:
+            return ERROR(GENERIC);   /* impossible */
+        }
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        if (ZSTDv06_isError(decodedSize)) return decodedSize;
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return op-ostart;
+}
+
+
+/* ZSTDv06_decompress_usingPreparedDCtx() :
+ * Copies the state of `refDCtx` into `dctx`, then decodes one frame from `src` into `dst`.
+ * @return : result of ZSTDv06_decompressFrame() */
+size_t ZSTDv06_decompress_usingPreparedDCtx(ZSTDv06_DCtx* dctx, const ZSTDv06_DCtx* refDCtx,
+                                         void* dst, size_t dstCapacity,
+                                   const void* src, size_t srcSize)
+{
+    size_t frameResult;
+
+    ZSTDv06_copyDCtx(dctx, refDCtx);       /* start from the prepared state */
+    ZSTDv06_checkContinuity(dctx, dst);
+    frameResult = ZSTDv06_decompressFrame(dctx, dst, dstCapacity, src, srcSize);
+    return frameResult;
+}
+
+
+/* ZSTDv06_decompress_usingDict() :
+ * Initializes `dctx` (loading `dict` when one is provided), then decodes one frame from `src` into `dst`.
+ * @return : nb of bytes written into `dst`,
+ *           or an error code, including the one reported while initializing `dctx` with the dictionary */
+size_t ZSTDv06_decompress_usingDict(ZSTDv06_DCtx* dctx,
+                                 void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                 const void* dict, size_t dictSize)
+{
+    /* do not start decoding on top of a context whose initialization failed (e.g. corrupted dictionary) */
+    {   size_t const initResult = ZSTDv06_decompressBegin_usingDict(dctx, dict, dictSize);
+        if (ZSTDv06_isError(initResult)) return initResult; }
+    ZSTDv06_checkContinuity(dctx, dst);
+    return ZSTDv06_decompressFrame(dctx, dst, dstCapacity, src, srcSize);
+}
+
+
+/* ZSTDv06_decompressDCtx() :
+ * Same as ZSTDv06_decompress_usingDict(), without any dictionary. */
+size_t ZSTDv06_decompressDCtx(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    const void* const noDict = NULL;
+    size_t const noDictSize = 0;
+    return ZSTDv06_decompress_usingDict(dctx, dst, dstCapacity, src, srcSize, noDict, noDictSize);
+}
+
+
+/* ZSTDv06_decompress() :
+ * One-shot decompression of `src` into `dst`, using a temporary decompression context.
+ * The context is heap-allocated when ZSTDv06_HEAPMODE==1, and lives on the stack otherwise.
+ * @return : result of ZSTDv06_decompressDCtx(), or ERROR(memory_allocation) if the heap context cannot be created */
+size_t ZSTDv06_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+#if defined(ZSTDv06_HEAPMODE) && (ZSTDv06_HEAPMODE==1)
+    ZSTDv06_DCtx* const heapCtx = ZSTDv06_createDCtx();
+    if (heapCtx==NULL) return ERROR(memory_allocation);
+    {   size_t const result = ZSTDv06_decompressDCtx(heapCtx, dst, dstCapacity, src, srcSize);
+        ZSTDv06_freeDCtx(heapCtx);
+        return result;
+    }
+#else   /* stack mode */
+    ZSTDv06_DCtx stackCtx;
+    return ZSTDv06_decompressDCtx(&stackCtx, dst, dstCapacity, src, srcSize);
+#endif
+}
+
+/* ZSTDv06_findFrameCompressedSize() :
+ * Walks the frame header and every block header of the frame starting at `src`, without decoding any block content.
+ * @return : nb of bytes of `src` occupied by the frame (up to and including its end-of-frame block header),
+ *           or an error code : srcSize_wrong when `src` is too short, prefix_unknown when the magic number does not match */
+size_t ZSTDv06_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    size_t remainingSize = srcSize;
+    blockProperties_t blockProperties = { bt_compressed, 0 };
+
+    /* check : the header is inspected below assuming ZSTDv06_frameHeaderSize_min bytes are readable,
+     * so make sure `src` really provides them (same guard as ZSTDv06_decompressFrame()) */
+    if (srcSize < ZSTDv06_frameHeaderSize_min) return ERROR(srcSize_wrong);
+
+    /* Frame Header */
+    {   size_t const frameHeaderSize = ZSTDv06_frameHeaderSize(src, ZSTDv06_frameHeaderSize_min);
+        if (ZSTDv06_isError(frameHeaderSize)) return frameHeaderSize;
+        if (MEM_readLE32(src) != ZSTDv06_MAGICNUMBER) return ERROR(prefix_unknown);
+        if (srcSize < frameHeaderSize+ZSTDv06_blockHeaderSize) return ERROR(srcSize_wrong);
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1) {
+        size_t const cBlockSize = ZSTDv06_getcBlockSize(ip, remainingSize, &blockProperties);
+        if (ZSTDv06_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTDv06_blockHeaderSize;
+        remainingSize -= ZSTDv06_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);   /* block content must be fully present in src */
+
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        /* skip block content */
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return ip - (const BYTE*)src;
+}
+
+/*_******************************
+*  Streaming Decompression API
+********************************/
+/* ZSTDv06_nextSrcSizeToDecompress() :
+ * @return : the input size that the next call to ZSTDv06_decompressContinue() must receive (dctx->expected) */
+size_t ZSTDv06_nextSrcSizeToDecompress(ZSTDv06_DCtx* dctx)
+{
+    size_t const expectedSize = dctx->expected;
+    return expectedSize;
+}
+
+/* ZSTDv06_decompressContinue() :
+ * Streaming step : consumes exactly dctx->expected bytes from `src` and advances the state machine (dctx->stage) :
+ *   frame header (min size) -> remaining frame header (if any) -> block header -> block content -> block header -> ...
+ * Only the block-content stage writes into `dst`.
+ * @return : nb of bytes written into `dst` (0 for header stages),
+ *           or an error code (srcSize_wrong when srcSize != dctx->expected) */
+size_t ZSTDv06_decompressContinue(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    /* Sanity check */
+    if (srcSize != dctx->expected) return ERROR(srcSize_wrong);
+    if (dstCapacity) ZSTDv06_checkContinuity(dctx, dst);
+
+    /* Decompress : frame header; part 1 */
+    switch (dctx->stage)
+    {
+    case ZSTDds_getFrameHeaderSize :
+        if (srcSize != ZSTDv06_frameHeaderSize_min) return ERROR(srcSize_wrong);   /* impossible */
+        dctx->headerSize = ZSTDv06_frameHeaderSize(src, ZSTDv06_frameHeaderSize_min);
+        if (ZSTDv06_isError(dctx->headerSize)) return dctx->headerSize;
+        memcpy(dctx->headerBuffer, src, ZSTDv06_frameHeaderSize_min);
+        if (dctx->headerSize > ZSTDv06_frameHeaderSize_min) {
+            /* long header : its remaining bytes are requested for the next call */
+            dctx->expected = dctx->headerSize - ZSTDv06_frameHeaderSize_min;
+            dctx->stage = ZSTDds_decodeFrameHeader;
+            return 0;
+        }
+        dctx->expected = 0;   /* not necessary to copy more */
+        /* fall-through */
+    case ZSTDds_decodeFrameHeader:
+        {   size_t result;
+            memcpy(dctx->headerBuffer + ZSTDv06_frameHeaderSize_min, src, dctx->expected);
+            result = ZSTDv06_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize);
+            if (ZSTDv06_isError(result)) return result;
+            dctx->expected = ZSTDv06_blockHeaderSize;
+            dctx->stage = ZSTDds_decodeBlockHeader;
+            return 0;
+        }
+    case ZSTDds_decodeBlockHeader:
+        {   blockProperties_t bp;
+            size_t const cBlockSize = ZSTDv06_getcBlockSize(src, ZSTDv06_blockHeaderSize, &bp);
+            if (ZSTDv06_isError(cBlockSize)) return cBlockSize;
+            if (bp.blockType == bt_end) {
+                /* end of frame : nothing more expected, ready for a new frame header */
+                dctx->expected = 0;
+                dctx->stage = ZSTDds_getFrameHeaderSize;
+            } else {
+                dctx->expected = cBlockSize;
+                dctx->bType = bp.blockType;
+                dctx->stage = ZSTDds_decompressBlock;
+            }
+            return 0;
+        }
+    case ZSTDds_decompressBlock:
+        {   size_t rSize;
+            switch(dctx->bType)
+            {
+            case bt_compressed:
+                rSize = ZSTDv06_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+                break;
+            case bt_raw :
+                rSize = ZSTDv06_copyRawBlock(dst, dstCapacity, src, srcSize);
+                break;
+            case bt_rle :
+                return ERROR(GENERIC);   /* not yet handled */
+                break;
+            case bt_end :   /* should never happen (filtered at phase 1) */
+                rSize = 0;
+                break;
+            default:
+                return ERROR(GENERIC);   /* impossible */
+            }
+            dctx->stage = ZSTDds_decodeBlockHeader;
+            dctx->expected = ZSTDv06_blockHeaderSize;
+            /* an error code is not a size : do not use it to compute previousDstEnd */
+            if (ZSTDv06_isError(rSize)) return rSize;
+            dctx->previousDstEnd = (char*)dst + rSize;
+            return rSize;
+        }
+    default:
+        return ERROR(GENERIC);   /* impossible */
+    }
+}
+
+
+/* ZSTDv06_refDictContent() :
+ * References `dict` as the current prefix of `dctx` :
+ * what was the prefix so far becomes the "dictionary" segment (dictEnd / vBase),
+ * and [dict, dict+dictSize) becomes the new prefix (base / previousDstEnd). */
+static void ZSTDv06_refDictContent(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    const char* const prevEnd   = (const char*)(dctx->previousDstEnd);
+    const char* const prevBase  = (const char*)(dctx->base);
+    const char* const dictStart = (const char*)dict;
+
+    dctx->dictEnd = dctx->previousDstEnd;
+    dctx->vBase = dictStart - (prevEnd - prevBase);
+    dctx->base = dict;
+    dctx->previousDstEnd = dictStart + dictSize;
+}
+
+static size_t ZSTDv06_loadEntropy(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize)
+{   /* Loads the entropy tables stored at the start of `dict` into `dctx`, in this order :
+     * Huffman table (hufTableX4), then FSE tables for offset codes, match lengths and literal lengths.
+     * Sets dctx->flagRepeatTable to 1 on success.
+     * @return : total size of the four table headers read from `dict`,
+     *           or ERROR(dictionary_corrupted) when a header cannot be read, a table log is too large, or a table cannot be built */
+    size_t hSize, offcodeHeaderSize, matchlengthHeaderSize, litlengthHeaderSize;
+
+    hSize = HUFv06_readDTableX4(dctx->hufTableX4, dict, dictSize);
+    if (HUFv06_isError(hSize)) return ERROR(dictionary_corrupted);
+    dict = (const char*)dict + hSize;   /* skip the Huffman table header */
+    dictSize -= hSize;
+
+    {   short offcodeNCount[MaxOff+1];
+        U32 offcodeMaxValue=MaxOff, offcodeLog;
+        offcodeHeaderSize = FSEv06_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dict, dictSize);
+        if (FSEv06_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
+        if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
+        { size_t const errorCode = FSEv06_buildDTable(dctx->OffTable, offcodeNCount, offcodeMaxValue, offcodeLog);
+          if (FSEv06_isError(errorCode)) return ERROR(dictionary_corrupted); }
+        dict = (const char*)dict + offcodeHeaderSize;   /* skip the offset-code header */
+        dictSize -= offcodeHeaderSize;
+    }
+
+    {   short matchlengthNCount[MaxML+1];
+        unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+        matchlengthHeaderSize = FSEv06_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dict, dictSize);
+        if (FSEv06_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
+        { size_t const errorCode = FSEv06_buildDTable(dctx->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog);
+          if (FSEv06_isError(errorCode)) return ERROR(dictionary_corrupted); }
+        dict = (const char*)dict + matchlengthHeaderSize;   /* skip the match-length header */
+        dictSize -= matchlengthHeaderSize;
+    }
+
+    {   short litlengthNCount[MaxLL+1];
+        unsigned litlengthMaxValue = MaxLL, litlengthLog;
+        litlengthHeaderSize = FSEv06_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dict, dictSize);
+        if (FSEv06_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
+        { size_t const errorCode = FSEv06_buildDTable(dctx->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog);
+          if (FSEv06_isError(errorCode)) return ERROR(dictionary_corrupted); }
+    }   /* last table : `dict` is not advanced here, the caller uses the returned total instead */
+
+    dctx->flagRepeatTable = 1;
+    return hSize + offcodeHeaderSize + matchlengthHeaderSize + litlengthHeaderSize;
+}
+
+/* ZSTDv06_decompress_insertDictionary() :
+ * Inserts `dict` into `dctx`.
+ * A dictionary starting with ZSTDv06_DICT_MAGIC carries entropy tables, loaded first, followed by its content;
+ * any other dictionary is referenced as pure content.
+ * @return : 0, or ERROR(dictionary_corrupted) when the entropy tables cannot be loaded */
+static size_t ZSTDv06_decompress_insertDictionary(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    size_t eSize;
+    /* a dictionary shorter than the 4-byte magic number cannot carry one :
+     * test the size first so the magic is never read beyond the end of `dict` */
+    if ((dictSize < 4) || (MEM_readLE32(dict) != ZSTDv06_DICT_MAGIC)) {
+        /* pure content mode */
+        ZSTDv06_refDictContent(dctx, dict, dictSize);
+        return 0;
+    }
+    /* load entropy tables */
+    dict = (const char*)dict + 4;   /* skip the magic number */
+    dictSize -= 4;
+    eSize = ZSTDv06_loadEntropy(dctx, dict, dictSize);
+    if (ZSTDv06_isError(eSize)) return ERROR(dictionary_corrupted);
+
+    /* reference dictionary content */
+    dict = (const char*)dict + eSize;
+    dictSize -= eSize;
+    ZSTDv06_refDictContent(dctx, dict, dictSize);
+
+    return 0;
+}
+
+
+/* ZSTDv06_decompressBegin_usingDict() :
+ * Resets `dctx` for a new decompression, then inserts `dict` when one is provided (non-NULL and non-empty).
+ * @return : 0, or the error reported by ZSTDv06_decompressBegin(),
+ *           or ERROR(dictionary_corrupted) when the dictionary cannot be inserted */
+size_t ZSTDv06_decompressBegin_usingDict(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    size_t const beginResult = ZSTDv06_decompressBegin(dctx);
+    if (ZSTDv06_isError(beginResult)) return beginResult;
+
+    if (!dict || !dictSize) return 0;   /* no dictionary to insert */
+
+    {   size_t const dictResult = ZSTDv06_decompress_insertDictionary(dctx, dict, dictSize);
+        if (ZSTDv06_isError(dictResult)) return ERROR(dictionary_corrupted);
+    }
+    return 0;
+}
+
+/*
+    Buffered version of Zstd compression library
+    Copyright (C) 2015-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net/
+*/
+
+
+/*-***************************************************************************
+*  Streaming decompression howto
+*
+*  A ZBUFFv06_DCtx object is required to track streaming operations.
+*  Use ZBUFFv06_createDCtx() and ZBUFFv06_freeDCtx() to create/release resources.
+*  Use ZBUFFv06_decompressInit() to start a new decompression operation,
+*   or ZBUFFv06_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFFv06_DCtx objects can be re-init multiple times.
+*
+*  Use ZBUFFv06_decompressContinue() repeatedly to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of @dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change @dst.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency),
+*            or 0 when a frame is completely decoded,
+*            or an error code, which can be tested using ZBUFFv06_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFFv06_recommendedDInSize() and ZBUFFv06_recommendedDOutSize()
+*  output : ZBUFFv06_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFFv06_recommendedDInSize == 128KB + 3;
+*           just follow indications from ZBUFFv06_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+/* Stages of the buffered streaming decoder, see ZBUFFv06_decompressContinue() */
+typedef enum {
+    ZBUFFds_init,         /* not initialized : decompressContinue reports init_missing */
+    ZBUFFds_loadHeader,   /* gathering the frame header into headerBuffer */
+    ZBUFFds_read,         /* trying to decode directly from the caller's input */
+    ZBUFFds_load,         /* accumulating input into inBuff before decoding */
+    ZBUFFds_flush         /* copying decoded data from outBuff to the caller's output */
+} ZBUFFv06_dStage;
+
+/* *** Resource management *** */
+struct ZBUFFv06_DCtx_s {   /* state of one buffered streaming decompression */
+    ZSTDv06_DCtx* zd;              /* underlying decoder context, created by ZSTDv06_createDCtx() */
+    ZSTDv06_frameParams fParams;   /* frame parameters, filled by ZSTDv06_getFrameParams() */
+    ZBUFFv06_dStage stage;         /* current stage of the streaming state machine */
+    char*  inBuff;                 /* input staging buffer, malloc'ed by ZBUFFv06_decompressContinue() */
+    size_t inBuffSize;             /* capacity of inBuff */
+    size_t inPos;                  /* nb of bytes currently loaded into inBuff */
+    char*  outBuff;                /* decoded-output buffer, malloc'ed by ZBUFFv06_decompressContinue() */
+    size_t outBuffSize;            /* capacity of outBuff */
+    size_t outStart;               /* position in outBuff of the first byte not yet flushed */
+    size_t outEnd;                 /* position in outBuff just past the last decoded byte */
+    size_t blockSize;              /* MIN(1 << windowLog, ZSTDv06_BLOCKSIZE_MAX) for the current frame */
+    BYTE headerBuffer[ZSTDv06_FRAMEHEADERSIZE_MAX];   /* frame header bytes gathered so far */
+    size_t lhSize;                 /* nb of bytes currently stored in headerBuffer */
+};   /* typedef'd to ZBUFFv06_DCtx within "zstd_buffered.h" */
+
+
+/* ZBUFFv06_createDCtx() :
+ * Allocates a zero-initialized buffered decompression context, along with its underlying ZSTDv06_DCtx.
+ * @return : the new context, to be released with ZBUFFv06_freeDCtx(),
+ *           or NULL if any allocation fails */
+ZBUFFv06_DCtx* ZBUFFv06_createDCtx(void)
+{
+    ZBUFFv06_DCtx* zbd = (ZBUFFv06_DCtx*)malloc(sizeof(ZBUFFv06_DCtx));
+    if (zbd==NULL) return NULL;
+    memset(zbd, 0, sizeof(*zbd));
+    zbd->zd = ZSTDv06_createDCtx();
+    if (zbd->zd==NULL) {
+        /* never hand out a context without its decoder : later calls dereference zbd->zd */
+        free(zbd);
+        return NULL;
+    }
+    zbd->stage = ZBUFFds_init;
+    return zbd;
+}
+
+/* ZBUFFv06_freeDCtx() :
+ * Releases `zbd`, its underlying decoder context and both of its buffers.
+ * Accepts NULL.
+ * @return : always 0 */
+size_t ZBUFFv06_freeDCtx(ZBUFFv06_DCtx* zbd)
+{
+    if (zbd != NULL) {
+        ZSTDv06_freeDCtx(zbd->zd);
+        free(zbd->inBuff);
+        free(zbd->outBuff);
+        free(zbd);
+    }
+    return 0;
+}
+
+
+/* *** Initialization *** */
+
+/* ZBUFFv06_decompressInitDictionary() :
+ * Starts a new streaming decompression : resets all positions of `zbd`, moves it to the header-loading stage,
+ * and initializes the underlying decoder with `dict`.
+ * @return : result of ZSTDv06_decompressBegin_usingDict() */
+size_t ZBUFFv06_decompressInitDictionary(ZBUFFv06_DCtx* zbd, const void* dict, size_t dictSize)
+{
+    zbd->stage = ZBUFFds_loadHeader;
+    zbd->outEnd = 0;
+    zbd->outStart = 0;
+    zbd->inPos = 0;
+    zbd->lhSize = 0;
+    return ZSTDv06_decompressBegin_usingDict(zbd->zd, dict, dictSize);
+}
+
+/* ZBUFFv06_decompressInit() :
+ * Same as ZBUFFv06_decompressInitDictionary(), without any dictionary. */
+size_t ZBUFFv06_decompressInit(ZBUFFv06_DCtx* zbd)
+{
+    const void* const noDict = NULL;
+    return ZBUFFv06_decompressInitDictionary(zbd, noDict, 0);
+}
+
+
+
+/* ZBUFFv06_limitCopy() :
+ * Copies as many bytes as possible from `src` to `dst`, limited by both `dstCapacity` and `srcSize`.
+ * @return : nb of bytes copied, i.e. MIN(dstCapacity, srcSize) */
+MEM_STATIC size_t ZBUFFv06_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    size_t length = MIN(dstCapacity, srcSize);
+    /* nothing to copy : skip memcpy(), since `dst` or `src` may then be NULL,
+     * and passing NULL to memcpy() is undefined behavior even with a size of 0 */
+    if (length > 0) {
+        memcpy(dst, src, length);
+    }
+    return length;
+}
+
+
+/* *** Decompression *** */
+
+size_t ZBUFFv06_decompressContinue(ZBUFFv06_DCtx* zbd,
+                                void* dst, size_t* dstCapacityPtr,
+                          const void* src, size_t* srcSizePtr)
+{   /* Buffered streaming decompression step, driven by zbd->stage :
+     *   loadHeader -> read -> (load) -> flush -> read -> ... until the underlying decoder expects no more input.
+     * On input, *srcSizePtr and *dstCapacityPtr are the sizes of `src` and `dst`.
+     * On normal exit, *srcSizePtr is set to the nb of bytes consumed from `src`,
+     * and *dstCapacityPtr to the nb of bytes written into `dst`.
+     * @return : a hint for the size of the next input (derived from ZSTDv06_nextSrcSizeToDecompress()),
+     *           or an error code (init_missing when the context was not initialized) */
+    const char* const istart = (const char*)src;
+    const char* const iend = istart + *srcSizePtr;
+    const char* ip = istart;
+    char* const ostart = (char*)dst;
+    char* const oend = ostart + *dstCapacityPtr;
+    char* op = ostart;
+    U32 notDone = 1;
+
+    while (notDone) {
+        switch(zbd->stage)
+        {
+        case ZBUFFds_init :
+            return ERROR(init_missing);
+
+        case ZBUFFds_loadHeader :
+            {   size_t const hSize = ZSTDv06_getFrameParams(&(zbd->fParams), zbd->headerBuffer, zbd->lhSize);
+                if (hSize != 0) {
+                    size_t const toLoad = hSize - zbd->lhSize;   /* if hSize!=0, hSize > zbd->lhSize */
+                    if (ZSTDv06_isError(hSize)) return hSize;
+                    if (toLoad > (size_t)(iend-ip)) {   /* not enough input to load full header */
+                        memcpy(zbd->headerBuffer + zbd->lhSize, ip, iend-ip);
+                        zbd->lhSize += iend-ip; ip = iend; notDone = 0;
+                        *dstCapacityPtr = 0;   /* nothing written ; *srcSizePtr is left untouched, i.e. all input is reported as consumed */
+                        return (hSize - zbd->lhSize) + ZSTDv06_blockHeaderSize;   /* remaining header bytes + next block header */
+                    }
+                    memcpy(zbd->headerBuffer + zbd->lhSize, ip, toLoad); zbd->lhSize = hSize; ip += toLoad;
+                    break;   /* loop again on the same stage, now with the requested header bytes loaded */
+            }   }
+
+            /* Consume header */
+            {   size_t const h1Size = ZSTDv06_nextSrcSizeToDecompress(zbd->zd);  /* == ZSTDv06_frameHeaderSize_min */
+                size_t const h1Result = ZSTDv06_decompressContinue(zbd->zd, NULL, 0, zbd->headerBuffer, h1Size);
+                if (ZSTDv06_isError(h1Result)) return h1Result;
+                if (h1Size < zbd->lhSize) {   /* long header */
+                    size_t const h2Size = ZSTDv06_nextSrcSizeToDecompress(zbd->zd);
+                    size_t const h2Result = ZSTDv06_decompressContinue(zbd->zd, NULL, 0, zbd->headerBuffer+h1Size, h2Size);
+                    if (ZSTDv06_isError(h2Result)) return h2Result;
+            }   }
+
+            /* Frame header instruct buffer sizes */
+            {   size_t const blockSize = MIN(1 << zbd->fParams.windowLog, ZSTDv06_BLOCKSIZE_MAX);   /* NOTE(review): the shift is evaluated as `int` ; confirm windowLog is bounded well below 31 by the frame header decoder */
+                zbd->blockSize = blockSize;
+                if (zbd->inBuffSize < blockSize) {
+                    free(zbd->inBuff);
+                    zbd->inBuffSize = blockSize;   /* NOTE(review): size is recorded before the allocation result is known ; on failure inBuff is NULL while inBuffSize is non-zero */
+                    zbd->inBuff = (char*)malloc(blockSize);
+                    if (zbd->inBuff == NULL) return ERROR(memory_allocation);
+                }
+                {   size_t const neededOutSize = ((size_t)1 << zbd->fParams.windowLog) + blockSize + WILDCOPY_OVERLENGTH * 2;
+                    if (zbd->outBuffSize < neededOutSize) {
+                        free(zbd->outBuff);
+                        zbd->outBuffSize = neededOutSize;   /* NOTE(review): same ordering as inBuffSize above */
+                        zbd->outBuff = (char*)malloc(neededOutSize);
+                        if (zbd->outBuff == NULL) return ERROR(memory_allocation);
+            }   }   }
+            zbd->stage = ZBUFFds_read;
+	    /* fall-through */
+        case ZBUFFds_read:
+            {   size_t const neededInSize = ZSTDv06_nextSrcSizeToDecompress(zbd->zd);
+                if (neededInSize==0) {  /* end of frame */
+                    zbd->stage = ZBUFFds_init;
+                    notDone = 0;
+                    break;
+                }
+                if ((size_t)(iend-ip) >= neededInSize) {  /* decode directly from src */
+                    size_t const decodedSize = ZSTDv06_decompressContinue(zbd->zd,
+                        zbd->outBuff + zbd->outStart, zbd->outBuffSize - zbd->outStart,
+                        ip, neededInSize);
+                    if (ZSTDv06_isError(decodedSize)) return decodedSize;
+                    ip += neededInSize;
+                    if (!decodedSize) break;   /* this was just a header */
+                    zbd->outEnd = zbd->outStart +  decodedSize;
+                    zbd->stage = ZBUFFds_flush;
+                    break;
+                }
+                if (ip==iend) { notDone = 0; break; }   /* no more input */
+                zbd->stage = ZBUFFds_load;   /* partial input only : accumulate it into inBuff */
+            }
+	    /* fall-through */
+        case ZBUFFds_load:
+            {   size_t const neededInSize = ZSTDv06_nextSrcSizeToDecompress(zbd->zd);
+                size_t const toLoad = neededInSize - zbd->inPos;   /* should always be <= remaining space within inBuff */
+                size_t loadedSize;
+                if (toLoad > zbd->inBuffSize - zbd->inPos) return ERROR(corruption_detected);   /* should never happen */
+                loadedSize = ZBUFFv06_limitCopy(zbd->inBuff + zbd->inPos, toLoad, ip, iend-ip);
+                ip += loadedSize;
+                zbd->inPos += loadedSize;
+                if (loadedSize < toLoad) { notDone = 0; break; }   /* not enough input, wait for more */
+
+                /* decode loaded input */
+                {   size_t const decodedSize = ZSTDv06_decompressContinue(zbd->zd,
+                        zbd->outBuff + zbd->outStart, zbd->outBuffSize - zbd->outStart,
+                        zbd->inBuff, neededInSize);
+                    if (ZSTDv06_isError(decodedSize)) return decodedSize;
+                    zbd->inPos = 0;   /* input is consumed */
+                    if (!decodedSize) { zbd->stage = ZBUFFds_read; break; }   /* this was just a header */
+                    zbd->outEnd = zbd->outStart +  decodedSize;
+                    zbd->stage = ZBUFFds_flush;
+                    // break; /* ZBUFFds_flush follows */
+                }
+	    }
+	    /* fall-through */
+        case ZBUFFds_flush:
+            {   size_t const toFlushSize = zbd->outEnd - zbd->outStart;
+                size_t const flushedSize = ZBUFFv06_limitCopy(op, oend-op, zbd->outBuff + zbd->outStart, toFlushSize);
+                op += flushedSize;
+                zbd->outStart += flushedSize;
+                if (flushedSize == toFlushSize) {
+                    zbd->stage = ZBUFFds_read;
+                    if (zbd->outStart + zbd->blockSize > zbd->outBuffSize)   /* not enough room left in outBuff for another block : restart from its beginning */
+                        zbd->outStart = zbd->outEnd = 0;
+                    break;
+                }
+                /* cannot flush everything */
+                notDone = 0;
+                break;
+            }
+        default: return ERROR(GENERIC);   /* impossible */
+    }   }
+
+    /* result */
+    *srcSizePtr = ip-istart;
+    *dstCapacityPtr = op-ostart;
+    {   size_t nextSrcSizeHint = ZSTDv06_nextSrcSizeToDecompress(zbd->zd);
+        if (nextSrcSizeHint > ZSTDv06_blockHeaderSize) nextSrcSizeHint+= ZSTDv06_blockHeaderSize;   /* get following block header too */
+        nextSrcSizeHint -= zbd->inPos;   /* already loaded*/
+        return nextSrcSizeHint;
+    }
+}
+
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+/* ZBUFFv06_recommendedDInSize() :
+ * @return : recommended input buffer size : one full block plus one block header */
+size_t ZBUFFv06_recommendedDInSize(void)
+{
+    size_t const blockPlusHeader = ZSTDv06_BLOCKSIZE_MAX + ZSTDv06_blockHeaderSize /* block header size*/ ;
+    return blockPlusHeader;
+}
+/* ZBUFFv06_recommendedDOutSize() :
+ * @return : recommended output buffer size : one full block */
+size_t ZBUFFv06_recommendedDOutSize(void)
+{
+    size_t const oneBlock = ZSTDv06_BLOCKSIZE_MAX;
+    return oneBlock;
+}
diff --git a/deps/SZ/zstd/legacy/zstd_v06.h b/deps/SZ/zstd/legacy/zstd_v06.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb4eb37c89e2c1f04866bb026e34899676eb8b1f
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v06.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv06_H
+#define ZSTDv06_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*======  Dependency  ======*/
+#include <stddef.h>   /* size_t */
+
+
+/*======  Export for Windows  ======*/
+/*!
+*  ZSTDv06_DLL_EXPORT :
+*  Enable exporting of functions when building a Windows DLL
+*/
+#if defined(_WIN32) && defined(ZSTDv06_DLL_EXPORT) && (ZSTDv06_DLL_EXPORT==1)
+#  define ZSTDLIBv06_API __declspec(dllexport)
+#else
+#  define ZSTDLIBv06_API
+#endif
+
+
+/* *************************************
+*  Simple functions
+***************************************/
+/*! ZSTDv06_decompress() :
+    `compressedSize` : is the _exact_ size of the compressed blob, otherwise decompression will fail.
+    `dstCapacity` must be large enough, equal or larger than originalSize.
+    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+              or an errorCode if it fails (which can be tested using ZSTDv06_isError()) */
+ZSTDLIBv06_API size_t ZSTDv06_decompress( void* dst, size_t dstCapacity,
+                                    const void* src, size_t compressedSize);
+
+/**
+ZSTDv06_findFrameCompressedSize() : get the source length of a ZSTD frame
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv06_isError())
+*/
+size_t ZSTDv06_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+/* *************************************
+*  Helper functions
+***************************************/
+ZSTDLIBv06_API size_t      ZSTDv06_compressBound(size_t srcSize); /*!< maximum compressed size (worst case scenario) */
+
+/* Error Management */
+ZSTDLIBv06_API unsigned    ZSTDv06_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+ZSTDLIBv06_API const char* ZSTDv06_getErrorName(size_t code);     /*!< provides readable string for an error code */
+
+
+/* *************************************
+*  Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv06_DCtx_s ZSTDv06_DCtx;
+ZSTDLIBv06_API ZSTDv06_DCtx* ZSTDv06_createDCtx(void);
+ZSTDLIBv06_API size_t     ZSTDv06_freeDCtx(ZSTDv06_DCtx* dctx);      /*!< @return : errorCode */
+
+/** ZSTDv06_decompressDCtx() :
+*   Same as ZSTDv06_decompress(), but requires an already allocated ZSTDv06_DCtx (see ZSTDv06_createDCtx()) */
+ZSTDLIBv06_API size_t ZSTDv06_decompressDCtx(ZSTDv06_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-***********************
+*  Dictionary API
+*************************/
+/*! ZSTDv06_decompress_usingDict() :
+*   Decompression using a pre-defined Dictionary content (see dictBuilder).
+*   Dictionary must be identical to the one used during compression, otherwise regenerated data will be corrupted.
+*   Note : dict can be NULL, in which case, it's equivalent to ZSTDv06_decompressDCtx() */
+ZSTDLIBv06_API size_t ZSTDv06_decompress_usingDict(ZSTDv06_DCtx* dctx,
+                                                   void* dst, size_t dstCapacity,
+                                             const void* src, size_t srcSize,
+                                             const void* dict,size_t dictSize);
+
+
+/*-************************
+*  Advanced Streaming API
+***************************/
+struct ZSTDv06_frameParams_s { unsigned long long frameContentSize; unsigned windowLog; };
+typedef struct ZSTDv06_frameParams_s ZSTDv06_frameParams;
+
+ZSTDLIBv06_API size_t ZSTDv06_getFrameParams(ZSTDv06_frameParams* fparamsPtr, const void* src, size_t srcSize);   /**< doesn't consume input */
+ZSTDLIBv06_API size_t ZSTDv06_decompressBegin_usingDict(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIBv06_API void   ZSTDv06_copyDCtx(ZSTDv06_DCtx* dctx, const ZSTDv06_DCtx* preparedDCtx);
+
+ZSTDLIBv06_API size_t ZSTDv06_nextSrcSizeToDecompress(ZSTDv06_DCtx* dctx);
+ZSTDLIBv06_API size_t ZSTDv06_decompressContinue(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+
+/* *************************************
+*  ZBUFF API
+***************************************/
+
+typedef struct ZBUFFv06_DCtx_s ZBUFFv06_DCtx;
+ZSTDLIBv06_API ZBUFFv06_DCtx* ZBUFFv06_createDCtx(void);
+ZSTDLIBv06_API size_t         ZBUFFv06_freeDCtx(ZBUFFv06_DCtx* dctx);
+
+ZSTDLIBv06_API size_t ZBUFFv06_decompressInit(ZBUFFv06_DCtx* dctx);
+ZSTDLIBv06_API size_t ZBUFFv06_decompressInitDictionary(ZBUFFv06_DCtx* dctx, const void* dict, size_t dictSize);
+
+ZSTDLIBv06_API size_t ZBUFFv06_decompressContinue(ZBUFFv06_DCtx* dctx,
+                                                  void* dst, size_t* dstCapacityPtr,
+                                            const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+*  Streaming decompression howto
+*
+*  A ZBUFFv06_DCtx object is required to track streaming operations.
+*  Use ZBUFFv06_createDCtx() and ZBUFFv06_freeDCtx() to create/release resources.
+*  Use ZBUFFv06_decompressInit() to start a new decompression operation,
+*   or ZBUFFv06_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFFv06_DCtx objects can be re-init multiple times.
+*
+*  Use ZBUFFv06_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency),
+*            or 0 when a frame is completely decoded,
+*            or an error code, which can be tested using ZBUFFv06_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFFv06_recommendedDInSize() and ZBUFFv06_recommendedDOutSize()
+*  output : ZBUFFv06_recommendedDOutSize == 128 KB. Block size is the internal unit; this ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFFv06_recommendedDInSize == 128KB + 3;
+*           just follow indications from ZBUFFv06_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+ZSTDLIBv06_API unsigned ZBUFFv06_isError(size_t errorCode);
+ZSTDLIBv06_API const char* ZBUFFv06_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are just hints, they tend to offer better latency */
+ZSTDLIBv06_API size_t ZBUFFv06_recommendedDInSize(void);
+ZSTDLIBv06_API size_t ZBUFFv06_recommendedDOutSize(void);
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define ZSTDv06_MAGICNUMBER 0xFD2FB526   /* v0.6 */
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv06_H */
diff --git a/deps/SZ/zstd/legacy/zstd_v07.c b/deps/SZ/zstd/legacy/zstd_v07.c
new file mode 100644
index 0000000000000000000000000000000000000000..70b170f0f15460c73a2282736cde1df50052febd
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v07.c
@@ -0,0 +1,4502 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/*- Dependencies -*/
+#include <stddef.h>     /* size_t, ptrdiff_t */
+#include <string.h>     /* memcpy */
+#include <stdlib.h>     /* malloc, free, qsort */
+
+#ifndef XXH_STATIC_LINKING_ONLY
+#  define XXH_STATIC_LINKING_ONLY    /* XXH64_state_t */
+#endif
+#include "xxhash.h"                  /* XXH64_* */
+#include "zstd_v07.h"
+
+#define FSEv07_STATIC_LINKING_ONLY   /* FSEv07_MIN_TABLELOG */
+#define HUFv07_STATIC_LINKING_ONLY   /* HUFv07_TABLELOG_ABSOLUTEMAX */
+#define ZSTDv07_STATIC_LINKING_ONLY
+
+#include "error_private.h"
+
+
+#ifdef ZSTDv07_STATIC_LINKING_ONLY
+
+/* ====================================================================================
+ * The definitions in this section are considered experimental.
+ * They should never be used with a dynamic library, as they may change in the future.
+ * They are provided for advanced usages.
+ * Use them only in association with static linking.
+ * ==================================================================================== */
+
+/*--- Constants ---*/
+#define ZSTDv07_MAGIC_SKIPPABLE_START  0x184D2A50U
+
+#define ZSTDv07_WINDOWLOG_MAX_32  25
+#define ZSTDv07_WINDOWLOG_MAX_64  27
+#define ZSTDv07_WINDOWLOG_MAX    ((U32)(MEM_32bits() ? ZSTDv07_WINDOWLOG_MAX_32 : ZSTDv07_WINDOWLOG_MAX_64))
+#define ZSTDv07_WINDOWLOG_MIN     18
+#define ZSTDv07_CHAINLOG_MAX     (ZSTDv07_WINDOWLOG_MAX+1)
+#define ZSTDv07_CHAINLOG_MIN       4
+#define ZSTDv07_HASHLOG_MAX       ZSTDv07_WINDOWLOG_MAX
+#define ZSTDv07_HASHLOG_MIN       12
+#define ZSTDv07_HASHLOG3_MAX      17
+#define ZSTDv07_SEARCHLOG_MAX    (ZSTDv07_WINDOWLOG_MAX-1)
+#define ZSTDv07_SEARCHLOG_MIN      1
+#define ZSTDv07_SEARCHLENGTH_MAX   7
+#define ZSTDv07_SEARCHLENGTH_MIN   3
+#define ZSTDv07_TARGETLENGTH_MIN   4
+#define ZSTDv07_TARGETLENGTH_MAX 999
+
+#define ZSTDv07_FRAMEHEADERSIZE_MAX 18    /* for static allocation */
+static const size_t ZSTDv07_frameHeaderSize_min = 5;
+static const size_t ZSTDv07_frameHeaderSize_max = ZSTDv07_FRAMEHEADERSIZE_MAX;
+static const size_t ZSTDv07_skippableHeaderSize = 8;  /* magic number + skippable frame length */
+
+
+/* custom memory allocation functions */
+typedef void* (*ZSTDv07_allocFunction) (void* opaque, size_t size);
+typedef void  (*ZSTDv07_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTDv07_allocFunction customAlloc; ZSTDv07_freeFunction customFree; void* opaque; } ZSTDv07_customMem;
+
+
+/*--- Advanced Decompression functions ---*/
+
+/*! ZSTDv07_estimateDCtxSize() :
+ *  Gives the potential amount of memory allocated to create a ZSTDv07_DCtx */
+ZSTDLIBv07_API size_t ZSTDv07_estimateDCtxSize(void);
+
+/*! ZSTDv07_createDCtx_advanced() :
+ *  Create a ZSTD decompression context using external alloc and free functions */
+ZSTDLIBv07_API ZSTDv07_DCtx* ZSTDv07_createDCtx_advanced(ZSTDv07_customMem customMem);
+
+/*! ZSTDv07_sizeofDCtx() :
+ *  Gives the amount of memory used by a given ZSTDv07_DCtx */
+ZSTDLIBv07_API size_t ZSTDv07_sizeofDCtx(const ZSTDv07_DCtx* dctx);
+
+
+/* ******************************************************************
+*  Buffer-less streaming functions (synchronous mode)
+********************************************************************/
+
+ZSTDLIBv07_API size_t ZSTDv07_decompressBegin(ZSTDv07_DCtx* dctx);
+ZSTDLIBv07_API size_t ZSTDv07_decompressBegin_usingDict(ZSTDv07_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIBv07_API void   ZSTDv07_copyDCtx(ZSTDv07_DCtx* dctx, const ZSTDv07_DCtx* preparedDCtx);
+
+ZSTDLIBv07_API size_t ZSTDv07_nextSrcSizeToDecompress(ZSTDv07_DCtx* dctx);
+ZSTDLIBv07_API size_t ZSTDv07_decompressContinue(ZSTDv07_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+/*
+  Buffer-less streaming decompression (synchronous mode)
+
+  A ZSTDv07_DCtx object is required to track streaming operations.
+  Use ZSTDv07_createDCtx() / ZSTDv07_freeDCtx() to manage it.
+  A ZSTDv07_DCtx object can be re-used multiple times.
+
+  First optional operation is to retrieve frame parameters, using ZSTDv07_getFrameParams(), which doesn't consume the input.
+  It can provide the minimum size of rolling buffer required to properly decompress data (`windowSize`),
+  and optionally the final size of uncompressed content.
+  (Note : content size is an optional info that may not be present. 0 means : content size unknown)
+  Frame parameters are extracted from the beginning of compressed frame.
+  The amount of data to read is variable, from ZSTDv07_frameHeaderSize_min to ZSTDv07_frameHeaderSize_max (so if `srcSize` >= ZSTDv07_frameHeaderSize_max, it will always work)
+  If `srcSize` is too small for operation to succeed, function will return the minimum size it requires to produce a result.
+  Result : 0 when successful, it means the ZSTDv07_frameParams structure has been filled.
+          >0 : means there is not enough data in `src`. The value is the size required to successfully decode the header.
+           errorCode, which can be tested using ZSTDv07_isError()
+
+  Start decompression, with ZSTDv07_decompressBegin() or ZSTDv07_decompressBegin_usingDict().
+  Alternatively, you can copy a prepared context, using ZSTDv07_copyDCtx().
+
+  Then use ZSTDv07_nextSrcSizeToDecompress() and ZSTDv07_decompressContinue() alternately.
+  ZSTDv07_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTDv07_decompressContinue().
+  ZSTDv07_decompressContinue() requires this exact number of bytes, or it will fail.
+
+  @result of ZSTDv07_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+  It can be zero, which is not an error; it just means ZSTDv07_decompressContinue() has decoded some header.
+
+  ZSTDv07_decompressContinue() needs previous data blocks during decompression, up to `windowSize`.
+  They should preferably be located contiguously, prior to current block.
+  Alternatively, a round buffer of sufficient size is also possible. Sufficient size is determined by frame parameters.
+  ZSTDv07_decompressContinue() is very sensitive to contiguity,
+  if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place,
+    or that previous contiguous segment is large enough to properly handle maximum back-reference.
+
+  A frame is fully decoded when ZSTDv07_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+
+
+  == Special case : skippable frames ==
+
+  Skippable frames allow the integration of user-defined data into a flow of concatenated frames.
+  Skippable frames will be ignored (skipped) by a decompressor. The format of a skippable frame is as follows:
+  a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F
+  b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
+  c) Frame Content - any content (User Data) of length equal to Frame Size
+  For skippable frames ZSTDv07_decompressContinue() always returns 0.
+  For skippable frames ZSTDv07_getFrameParams() returns fparamsPtr->windowLog==0, which means that the frame is skippable.
+  It also returns Frame Size as fparamsPtr->frameContentSize.
+*/
+
+
+/* **************************************
+*  Block functions
+****************************************/
+/*! Block functions produce and decode raw zstd blocks, without frame metadata.
+    Frame metadata cost is typically ~18 bytes, which can be non-negligible for very small blocks (< 100 bytes).
+    The user is responsible for keeping the information required to regenerate data, such as compressed and content sizes.
+
+    A few rules to respect :
+    - Compressing and decompressing require a context structure
+      + Use ZSTDv07_createCCtx() and ZSTDv07_createDCtx()
+    - It is necessary to init context before starting
+      + compression : ZSTDv07_compressBegin()
+      + decompression : ZSTDv07_decompressBegin()
+      + variants _usingDict() are also allowed
+      + copyCCtx() and copyDCtx() work too
+    - Block size is limited, it must be <= ZSTDv07_getBlockSizeMax()
+      + If you need to compress more, cut data into multiple blocks
+      + Consider using the regular ZSTDv07_compress() instead, as frame metadata costs become negligible when source size is large.
+    - When a block is considered not compressible enough, ZSTDv07_compressBlock() result will be zero.
+      In which case, nothing is produced into `dst`.
+      + User must test for such outcome and deal directly with uncompressed data
+      + ZSTDv07_decompressBlock() doesn't accept uncompressed data as input !!!
+      + In case of multiple successive blocks, decoder must be informed of uncompressed block existence to follow proper history.
+        Use ZSTDv07_insertBlock() in such a case.
+*/
+
+#define ZSTDv07_BLOCKSIZE_ABSOLUTEMAX (128 * 1024)   /* define, for static allocation */
+ZSTDLIBv07_API size_t ZSTDv07_decompressBlock(ZSTDv07_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIBv07_API size_t ZSTDv07_insertBlock(ZSTDv07_DCtx* dctx, const void* blockStart, size_t blockSize);  /**< insert block into `dctx` history. Useful for uncompressed blocks */
+
+
+#endif   /* ZSTDv07_STATIC_LINKING_ONLY */
+
+
+/* ******************************************************************
+   mem.h
+   low-level memory access routines
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+#if defined(__GNUC__)
+#  define MEM_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+#if  !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/*-**************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ *            It can generate buggy code on targets depending on alignment.
+ *            In some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return (unsigned)(4 == sizeof(size_t)); }   /* 1 when size_t is 4 bytes wide, else 0 */
+MEM_STATIC unsigned MEM_64bits(void) { return (unsigned)(8 == sizeof(size_t)); }   /* 1 when size_t is 8 bytes wide, else 0 */
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+    const union { U32 word; BYTE bytes[4]; } probe = { 1 };   /* deliberately not static : a static here is detrimental to performance */
+    return probe.bytes[0];   /* the first byte holds the 1 only when the low-order byte is stored first */
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* Violates the C standard by lying about structure alignment.
+Only use when there is no other way to achieve best performance on the target platform. */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { const U16* const src = (const U16*)memPtr; return src[0]; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { const U32* const src = (const U32*)memPtr; return src[0]; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { const U64* const src = (const U64*)memPtr; return src[0]; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { U16* const dst = (U16*)memPtr; dst[0] = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign;
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { const unalign* const cell = (const unalign*)ptr; return cell->u16; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { const unalign* const cell = (const unalign*)ptr; return cell->u32; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { const unalign* const cell = (const unalign*)ptr; return cell->u64; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { unalign* const cell = (unalign*)memPtr; cell->u16 = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 out; memcpy(&out, memPtr, sizeof(U16)); return out;   /* byte-wise copy into a local, then return it */
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 out; memcpy(&out, memPtr, sizeof(U32)); return out;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 out; memcpy(&out, memPtr, sizeof(U64)); return out;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(U16));   /* byte-wise store of the 16-bit value */
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+MEM_STATIC U32 MEM_swap32(U32 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio intrinsic */
+    return _byteswap_ulong(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)   /* GCC 4.3 or newer */
+    return __builtin_bswap32(in);
+#else   /* portable fallback : move each byte to its mirrored position */
+    return  ((in >> 24) & 0x000000ff ) |
+            ((in >>  8) & 0x0000ff00 ) |
+            ((in <<  8) & 0x00ff0000 ) |
+            ((in << 24) & 0xff000000 );
+#endif
+}
+
+MEM_STATIC U64 MEM_swap64(U64 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio intrinsic */
+    return _byteswap_uint64(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)   /* GCC 4.3 or newer */
+    return __builtin_bswap64(in);
+#else   /* portable fallback : move each byte to its mirrored position */
+    return  ((in >> 56) & 0x00000000000000ffULL) |
+            ((in >> 40) & 0x000000000000ff00ULL) |
+            ((in >> 24) & 0x0000000000ff0000ULL) |
+            ((in >> 8)  & 0x00000000ff000000ULL) |
+            ((in << 8)  & 0x000000ff00000000ULL) |
+            ((in << 24) & 0x0000ff0000000000ULL) |
+            ((in << 40) & 0x00ff000000000000ULL) |
+            ((in << 56) & 0xff00000000000000ULL);
+#endif
+}
+
+
+/*=== Little endian r/w ===*/
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (!MEM_isLittleEndian()) {
+        /* non-little-endian host : assemble the value from its two bytes, low byte first */
+        const BYTE* const bytes = (const BYTE*)memPtr;
+        return (U16)(bytes[0] + (bytes[1]<<8));
+    }
+    return MEM_read16(memPtr);   /* native byte order already matches */
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    if (!MEM_isLittleEndian()) {   /* non-little-endian host : store the low byte first */
+        BYTE* const out = (BYTE*)memPtr;
+        out[0] = (BYTE)val;
+        out[1] = (BYTE)(val>>8);
+        return;
+    }
+    MEM_write16(memPtr, val);
+}
+
+/* Reads a 32-bit value stored in little-endian byte order. */
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    U32 const raw = MEM_read32(memPtr);
+    /* native order already matches on little-endian hosts; otherwise reverse the bytes */
+    return MEM_isLittleEndian() ? raw : MEM_swap32(raw);
+}
+
+
+/* Reads a 64-bit value stored in little-endian byte order. */
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    U64 const raw = MEM_read64(memPtr);
+    /* native order already matches on little-endian hosts; otherwise reverse the bytes */
+    return MEM_isLittleEndian() ? raw : MEM_swap64(raw);
+}
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    /* the width of the read follows the width of size_t on the target */
+    if (!MEM_32bits())
+        return (size_t)MEM_readLE64(memPtr);
+    return (size_t)MEM_readLE32(memPtr);
+}
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+/* ******************************************************************
+   bitstream
+   Part of FSE library
+   header file (to include)
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+*  This API consists of small unitary functions, which must be inlined for best performance.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined in a .h file meant to be included.
+*/
+
+
+/*=========================================
+*  Target specific
+=========================================*/
+#if defined(__BMI__) && defined(__GNUC__)
+#  include <immintrin.h>   /* support for bextr (experimental) */
+#endif
+
+/*-********************************************
+*  bitStream decoding API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;   /* bits loaded from the stream (filled via MEM_readLEST() on the normal init path) */
+    unsigned bitsConsumed;   /* presumably the count of bitContainer bits already used; init starts it past the end-mark padding */
+    const char* ptr;         /* position in the source buffer the container is loaded from */
+    const char* start;       /* first byte of the source buffer */
+} BITv07_DStream_t;
+
+typedef enum { BITv07_DStream_unfinished = 0,
+               BITv07_DStream_endOfBuffer = 1,
+               BITv07_DStream_completed = 2,
+               BITv07_DStream_overflow = 3 } BITv07_DStream_status;  /* result of BITv07_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but that slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BITv07_initDStream(BITv07_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BITv07_readBits(BITv07_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BITv07_DStream_status BITv07_reloadDStream(BITv07_DStream_t* bitD);
+MEM_STATIC unsigned BITv07_endOfDStream(const BITv07_DStream_t* bitD);
+
+
+
+/*-****************************************
+*  unsafe API
+******************************************/
+MEM_STATIC size_t BITv07_readBitsFast(BITv07_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/*-**************************************************************
+*  Internal functions
+****************************************************************/
+MEM_STATIC unsigned BITv07_highbit32 (U32 val)   /* index (0-31) of the highest set bit; NOTE(review): val==0 looks unsupported on the GCC path - confirm callers guard it */
+{
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    _BitScanReverse ( &r, val );
+    return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+    return 31 - __builtin_clz (val);   /* leading-zero count converted into a bit index */
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    v |= v >> 1;   /* this and the next four steps copy the top set bit into every lower position */
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];   /* top 5 bits of the product select the table entry */
+#   endif
+}
+
+
+
+/*-********************************************************
+* bitStream decoding
+**********************************************************/
+/*! BITv07_initDStream() :
+*   Initialize a BITv07_DStream_t so that `srcBuffer` can be read starting from its last byte.
+*   `bitD` : a pointer to an already allocated BITv07_DStream_t structure.
+*   `srcSize` must be the *exact* size of the bitStream, in bytes.
+*   The last byte of the stream must be non-zero : its highest set bit is the end mark.
+*   @return : size of stream (== srcSize) or an errorCode if a problem is detected
+*             (srcSize_wrong when srcSize == 0, GENERIC when the end mark is missing)
+*/
+MEM_STATIC size_t BITv07_initDStream(BITv07_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    /* empty input : leave a fully zeroed stream behind and report the error */
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    if (srcSize >=  sizeof(bitD->bitContainer)) {  /* normal case */
+        bitD->start = (const char*)srcBuffer;
+        /* position on, and load, the last container-sized word of the buffer */
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        /* the end mark and the zero bits above it (within the last byte) count as already consumed */
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          bitD->bitsConsumed = lastByte ? 8 - BITv07_highbit32(lastByte) : 0;
+          if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+    } else {
+        /* input shorter than the container : assemble it one byte at a time */
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        /* each case adds one more byte, then falls through to the lower ones.
+         * Shifts for bytes 4..6 are written relative to the container width;
+         * with an 8-byte container they evaluate to 32, 40 and 48. */
+        switch(srcSize)
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);/* fall-through */
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);/* fall-through */
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);/* fall-through */
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; /* fall-through */
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; /* fall-through */
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) <<  8; /* fall-through */
+            default: break;
+        }
+        /* same end-mark handling as the normal case */
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          bitD->bitsConsumed = lastByte ? 8 - BITv07_highbit32(lastByte) : 0;
+          if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+        /* the container bytes that the input could not fill carry no data : count them as consumed too */
+        bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
+    }
+
+    return srcSize;
+}
+
+
+/*! BITv07_lookBits() :
+*   Peek at the next `nbBits` bits of the stream without consuming them.
+*   The right shift is split in two (by 1, then by the remainder) so that each
+*   individual shift count stays below the container width; nbBits == 0 yields 0. */
+MEM_STATIC size_t BITv07_lookBits(const BITv07_DStream_t* bitD, U32 nbBits)
+{
+    U32 const regMask    = sizeof(bitD->bitContainer)*8 - 1;
+    U32 const leftShift  = bitD->bitsConsumed & regMask;   /* drop the bits already consumed */
+    U32 const rightShift = (regMask - nbBits) & regMask;   /* bring the wanted bits down to position 0 */
+
+    return ((bitD->bitContainer << leftShift) >> 1) >> rightShift;
+}
+
+/*! BITv07_lookBitsFast() :
+*   Unchecked variant of BITv07_lookBits() : works only if nbBits >= 1.
+*   A single right shift is used, so nbBits == 0 would wrap the shift count to 0
+*   and return the whole shifted container instead of 0. */
+MEM_STATIC size_t BITv07_lookBitsFast(const BITv07_DStream_t* bitD, U32 nbBits)
+{
+    U32 const regMask    = sizeof(bitD->bitContainer)*8 - 1;
+    U32 const leftShift  = bitD->bitsConsumed & regMask;
+    U32 const rightShift = ((regMask+1) - nbBits) & regMask;
+
+    return (bitD->bitContainer << leftShift) >> rightShift;
+}
+
+/*! BITv07_skipBits() :
+*   Record `nbBits` more bits as consumed. Nothing is read or reloaded here. */
+MEM_STATIC void BITv07_skipBits(BITv07_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed = bitD->bitsConsumed + nbBits;
+}
+
+/*! BITv07_readBits() :
+*   Fetch the next `nbBits` bits of the stream, then advance past them.
+*   @return : the extracted bits, in the low positions of the result */
+MEM_STATIC size_t BITv07_readBits(BITv07_DStream_t* bitD, U32 nbBits)
+{
+    size_t const peeked = BITv07_lookBits(bitD, nbBits);
+    bitD->bitsConsumed += nbBits;   /* same effect as BITv07_skipBits() */
+    return peeked;
+}
+
+/*! BITv07_readBitsFast() :
+*   Unchecked variant of BITv07_readBits() : works only if nbBits >= 1,
+*   because it relies on BITv07_lookBitsFast(). */
+MEM_STATIC size_t BITv07_readBitsFast(BITv07_DStream_t* bitD, U32 nbBits)
+{
+    size_t const peeked = BITv07_lookBitsFast(bitD, nbBits);
+    bitD->bitsConsumed += nbBits;   /* same effect as BITv07_skipBits() */
+    return peeked;
+}
+
+/*! BITv07_reloadDStream() :
+*   Refill `bitD->bitContainer` by moving `bitD->ptr` back towards `bitD->start`
+*   by the number of whole bytes already consumed.
+*   @return : BITv07_DStream_overflow    if more bits were consumed than the container holds,
+*             BITv07_DStream_unfinished  if a reload happened without being limited by the buffer start,
+*             BITv07_DStream_endOfBuffer if the start was reached while unconsumed bits remain
+*                                        (or the move had to be shortened to stop at `start`),
+*             BITv07_DStream_completed   if the start was reached and every bit is consumed.
+*/
+MEM_STATIC BITv07_DStream_status BITv07_reloadDStream(BITv07_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should not happen => corruption detected */
+        return BITv07_DStream_overflow;
+
+    /* at least one full container lies between start and ptr : the move back cannot pass `start` */
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) {
+        bitD->ptr -= bitD->bitsConsumed >> 3;   /* step back by the whole bytes consumed */
+        bitD->bitsConsumed &= 7;                /* keep only the leftover bit offset */
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BITv07_DStream_unfinished;
+    }
+    /* already at the start : nothing left to load, only report the status */
+    if (bitD->ptr == bitD->start) {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BITv07_DStream_endOfBuffer;
+        return BITv07_DStream_completed;
+    }
+    /* close to the start : step back as far as allowed, shortened so that ptr stops at `start` */
+    {   U32 nbBytes = bitD->bitsConsumed >> 3;
+        BITv07_DStream_status result = BITv07_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start) {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BITv07_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;   /* bits of the bytes not stepped over stay counted as consumed */
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+/*! BITv07_endOfDStream() :
+*   @return 1 when the stream has exactly reached its end, i.e. the read pointer is back
+*           at the start of the buffer and all bits of the container are consumed; 0 otherwise.
+*/
+MEM_STATIC unsigned BITv07_endOfDStream(const BITv07_DStream_t* DStream)
+{
+    if (DStream->ptr != DStream->start) return 0;
+    if (DStream->bitsConsumed != sizeof(DStream->bitContainer)*8) return 0;
+    return 1;
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
+/* ******************************************************************
+   FSE : Finite State Entropy codec
+   Public Prototypes declaration
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef FSEv07_H
+#define FSEv07_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/*-****************************************
+*  FSE simple functions
+******************************************/
+
+/*! FSEv07_decompress():
+    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstCapacity'.
+    @return : size of regenerated data (<= dstCapacity),
+              or an error code, which can be tested using FSEv07_isError() .
+
+    ** Important ** : FSEv07_decompress() does not decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+size_t FSEv07_decompress(void* dst,  size_t dstCapacity,
+                const void* cSrc, size_t cSrcSize);
+
+
+/* Error Management */
+unsigned    FSEv07_isError(size_t code);        /* tells if a return value is an error code */
+const char* FSEv07_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
+
+/*-*****************************************
+*  FSE detailed API
+******************************************/
+/*!
+FSEv07_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+
+/* *** DECOMPRESSION *** */
+
+/*! FSEv07_readNCount():
+    Read compactly saved 'normalizedCounter' from 'rBuffer'.
+    @return : size read from 'rBuffer',
+              or an errorCode, which can be tested using FSEv07_isError().
+              maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+size_t FSEv07_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
+
+/*! Constructor and Destructor of FSEv07_DTable.
+    Note that its size depends on 'tableLog' */
+typedef unsigned FSEv07_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+FSEv07_DTable* FSEv07_createDTable(unsigned tableLog);
+void        FSEv07_freeDTable(FSEv07_DTable* dt);
+
+/*! FSEv07_buildDTable():
+    Builds 'dt', which must be already allocated, using FSEv07_createDTable().
+    return : 0, or an errorCode, which can be tested using FSEv07_isError() */
+size_t FSEv07_buildDTable (FSEv07_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSEv07_decompress_usingDTable():
+    Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
+    into `dst` which must be already allocated.
+    @return : size of regenerated data (necessarily <= `dstCapacity`),
+              or an errorCode, which can be tested using FSEv07_isError() */
+size_t FSEv07_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSEv07_DTable* dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSEv07_readNCount() if it was saved using FSEv07_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSEv07_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSEv07_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSEv07_isError().
+
+The next step is to build the decompression tables 'FSEv07_DTable' from 'normalizedCounter'.
+This is performed by the function FSEv07_buildDTable().
+The space required by 'FSEv07_DTable' must be already allocated using FSEv07_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSEv07_isError().
+
+`FSEv07_DTable` can then be used to decompress `cSrc`, with FSEv07_decompress_usingDTable().
+`cSrcSize` must be strictly correct, otherwise decompression will fail.
+FSEv07_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`).
+If there is an error, the function will return an error code, which can be tested using FSEv07_isError(). (ex: dst buffer too small)
+*/
+
+
+#ifdef FSEv07_STATIC_LINKING_ONLY
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSEv07_NCOUNTBOUND 512
+/* `size` is parenthesized at each use so that expression arguments (e.g. `a | b`) expand correctly */
+#define FSEv07_BLOCKBOUND(size) ((size) + ((size)>>7))
+
+/* It is possible to statically allocate FSE CTable/DTable as a table of unsigned using below macros */
+/* number of U32 cells : 1 header cell + (1 << maxTableLog) decoding cells */
+#define FSEv07_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<(maxTableLog)))
+
+
+/* *****************************************
+*  FSE advanced API
+*******************************************/
+size_t FSEv07_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+/**< same as FSEv07_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr  */
+
+unsigned FSEv07_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
+/**< same as FSEv07_optimalTableLog(), which used `minus==2` */
+
+size_t FSEv07_buildDTable_raw (FSEv07_DTable* dt, unsigned nbBits);
+/**< build a fake FSEv07_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits */
+
+size_t FSEv07_buildDTable_rle (FSEv07_DTable* dt, unsigned char symbolValue);
+/**< build a fake FSEv07_DTable, designed to always generate the same symbolValue */
+
+
+
+/* *****************************************
+*  FSE symbol decompression API
+*******************************************/
+/* Decoding state of one FSE stream */
+typedef struct
+{
+    size_t      state;   /* index of the current cell within `table` */
+    const void* table;   /* precise table may vary, depending on U16 */
+                         /* FSEv07_initDState() points it just after the first (header) cell of the DTable */
+} FSEv07_DState_t;
+
+
+static void     FSEv07_initDState(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD, const FSEv07_DTable* dt);
+
+static unsigned char FSEv07_decodeSymbol(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD);
+
+
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSEv07_decodeSymbolFast(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* ======    Decompression    ====== */
+
+/* Layout of the first cell of a FSEv07_DTable */
+typedef struct {
+    U16 tableLog;   /* number of bits FSEv07_initDState() reads to obtain the initial state */
+    U16 fastMode;   /* NOTE(review): not read by the code in this section; presumably selects the *Fast decoding path - confirm */
+} FSEv07_DTableHeader;   /* sizeof U32 */
+
+/* One decoding cell, indexed by the current state */
+typedef struct
+{
+    unsigned short newState;   /* base of the next state; the bits read from the stream are added to it */
+    unsigned char  symbol;     /* symbol emitted when decoding from this state */
+    unsigned char  nbBits;     /* number of bits to read from the stream to compute the next state */
+} FSEv07_decode_t;   /* size == U32 */
+
+/*! FSEv07_initDState() :
+*   Prepare `DStatePtr` for decoding with table `dt` : the initial state is read from
+*   `bitD` (tableLog bits, as stored in the table header), the stream is reloaded,
+*   and the state's table pointer is set just past the header cell. */
+MEM_STATIC void FSEv07_initDState(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD, const FSEv07_DTable* dt)
+{
+    const void* const rawHeader = dt;   /* go through void* to reinterpret the first cell as a header */
+    U32 const stateBits = ((const FSEv07_DTableHeader*)rawHeader)->tableLog;
+
+    DStatePtr->state = BITv07_readBits(bitD, stateBits);
+    BITv07_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+/*! FSEv07_peekSymbol() :
+*   @return the symbol attached to the current state; neither the state nor any stream is modified. */
+MEM_STATIC BYTE FSEv07_peekSymbol(const FSEv07_DState_t* DStatePtr)
+{
+    const FSEv07_decode_t* const cells = (const FSEv07_decode_t*)(DStatePtr->table);
+    return cells[DStatePtr->state].symbol;
+}
+
+/*! FSEv07_updateState() :
+*   Move to the next state without returning a symbol : the current cell tells how many
+*   bits to read from `bitD`, and those bits are added to the cell's `newState` base. */
+MEM_STATIC void FSEv07_updateState(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD)
+{
+    const FSEv07_decode_t* const cells = (const FSEv07_decode_t*)(DStatePtr->table);
+    FSEv07_decode_t const cell = cells[DStatePtr->state];
+
+    DStatePtr->state = cell.newState + BITv07_readBits(bitD, cell.nbBits);
+}
+
+/*! FSEv07_decodeSymbol() :
+*   Decode one symbol : take the symbol of the current cell, then advance the state
+*   using `nbBits` bits read from `bitD`.
+*   @return the symbol of the cell that was current on entry */
+MEM_STATIC BYTE FSEv07_decodeSymbol(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD)
+{
+    const FSEv07_decode_t* const cells = (const FSEv07_decode_t*)(DStatePtr->table);
+    FSEv07_decode_t const cell = cells[DStatePtr->state];
+
+    /* advance the state first, then hand back the symbol of the cell just left */
+    DStatePtr->state = cell.newState + BITv07_readBits(bitD, cell.nbBits);
+    return cell.symbol;
+}
+
+/*! FSEv07_decodeSymbolFast() :
+*   Same as FSEv07_decodeSymbol(), but reads bits through BITv07_readBitsFast().
+*   unsafe, only works if no symbol has a probability > 50%
+*   (the fast bit reader requires nbBits >= 1 on every call) */
+MEM_STATIC BYTE FSEv07_decodeSymbolFast(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD)
+{
+    const FSEv07_decode_t* const cells = (const FSEv07_decode_t*)(DStatePtr->table);
+    FSEv07_decode_t const cell = cells[DStatePtr->state];
+
+    DStatePtr->state = cell.newState + BITv07_readBitsFast(bitD, cell.nbBits);
+    return cell.symbol;
+}
+
+
+
+#ifndef FSEv07_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#define FSEv07_MAX_MEMORY_USAGE 14
+#define FSEv07_DEFAULT_MEMORY_USAGE 13
+
+/*!FSEv07_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#define FSEv07_MAX_SYMBOL_VALUE 255
+
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSEv07_FUNCTION_TYPE BYTE
+#define FSEv07_FUNCTION_EXTENSION
+#define FSEv07_DECODE_TYPE FSEv07_decode_t
+
+
+#endif   /* !FSEv07_COMMONDEFS_ONLY */
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSEv07_MAX_TABLELOG  (FSEv07_MAX_MEMORY_USAGE-2)
+#define FSEv07_MAX_TABLESIZE (1U<<FSEv07_MAX_TABLELOG)
+#define FSEv07_MAXTABLESIZE_MASK (FSEv07_MAX_TABLESIZE-1)
+#define FSEv07_DEFAULT_TABLELOG (FSEv07_DEFAULT_MEMORY_USAGE-2)
+#define FSEv07_MIN_TABLELOG 5
+
+#define FSEv07_TABLELOG_ABSOLUTE_MAX 15
+#if FSEv07_MAX_TABLELOG > FSEv07_TABLELOG_ABSOLUTE_MAX
+#  error "FSEv07_MAX_TABLELOG > FSEv07_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+/* FSEv07_TABLESTEP() : tableSize/2 + tableSize/8 + 3.
+ * `tableSize` is parenthesized at each use so that expression arguments expand correctly. */
+#define FSEv07_TABLESTEP(tableSize) (((tableSize)>>1) + ((tableSize)>>3) + 3)
+
+
+#endif /* FSEv07_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSEv07_H */
+/* ******************************************************************
+   Huffman coder, part of New Generation Entropy library
+   header file
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef HUFv07_H_298734234
+#define HUFv07_H_298734234
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/* *** simple functions *** */
+/**
+HUFv07_decompress() :
+    Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated buffer 'dst', of minimum size 'dstSize'.
+    `dstSize` : **must** be the ***exact*** size of original (uncompressed) data.
+    Note : in contrast with FSE, HUFv07_decompress can regenerate
+           RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+           because it knows size to regenerate.
+    @return : size of regenerated data (== dstSize),
+              or an error code, which can be tested using HUFv07_isError()
+*/
+size_t HUFv07_decompress(void* dst,  size_t dstSize,
+                const void* cSrc, size_t cSrcSize);
+
+
+/* ****************************************
+*  Tool functions
+******************************************/
+#define HUFv07_BLOCKSIZE_MAX (128 * 1024)
+
+/* Error Management */
+unsigned    HUFv07_isError(size_t code);        /**< tells if a return value is an error code */
+const char* HUFv07_getErrorName(size_t code);   /**< provides error code string (useful for debugging) */
+
+
+/* *** Advanced function *** */
+
+
+#ifdef HUFv07_STATIC_LINKING_ONLY
+
+
+/* *** Constants *** */
+#define HUFv07_TABLELOG_ABSOLUTEMAX  16   /* absolute limit of HUFv07_TABLELOG_MAX. Beyond that value, code does not work */
+#define HUFv07_TABLELOG_MAX  12           /* max configured tableLog (for static allocation); can be modified up to HUFv07_TABLELOG_ABSOLUTEMAX */
+#define HUFv07_TABLELOG_DEFAULT  11       /* tableLog by default, when not specified */
+#define HUFv07_SYMBOLVALUE_MAX 255
+/* compile-time guard : refuse a configured maximum above the absolute limit */
+#if (HUFv07_TABLELOG_MAX > HUFv07_TABLELOG_ABSOLUTEMAX)
+#  error "HUFv07_TABLELOG_MAX is too large !"
+#endif
+
+
+/* ****************************************
+*  Static allocation
+******************************************/
+/* HUF buffer bounds */
+/* `size` is parenthesized at each use so that expression arguments (e.g. `a | b`) expand correctly */
+#define HUFv07_BLOCKBOUND(size) ((size) + ((size)>>8) + 8)   /* only true if incompressible pre-filtered with fast heuristic */
+
+/* static allocation of HUF's DTable */
+typedef U32 HUFv07_DTable;
+#define HUFv07_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))
+#define HUFv07_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        HUFv07_DTable DTable[HUFv07_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1)*0x1000001) }
+#define HUFv07_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        HUFv07_DTable DTable[HUFv07_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog)*0x1000001) }
+
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+size_t HUFv07_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUFv07_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+
+size_t HUFv07_decompress4X_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< decodes RLE and uncompressed */
+size_t HUFv07_decompress4X_hufOnly(HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */
+size_t HUFv07_decompress4X2_DCtx(HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUFv07_decompress4X4_DCtx(HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+
+size_t HUFv07_decompress1X_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+size_t HUFv07_decompress1X2_DCtx(HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUFv07_decompress1X4_DCtx(HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+
+
+/* ****************************************
+*  HUF detailed API
+******************************************/
+/*!
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and regenerate 'CTable' using external methods.
+*/
+/* FSEv07_count() : find it within "fse.h" */
+
+/*! HUFv07_readStats() :
+    Read compact Huffman tree, saved by HUFv07_writeCTable().
+    `huffWeight` is destination buffer.
+    @return : size read from `src` , or an error Code .
+    Note : Needed by HUFv07_readCTable() and HUFv07_readDTableXn() . */
+size_t HUFv07_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize);
+
+
+/*
+HUFv07_decompress() does the following:
+1. select the decompression algorithm (X2, X4) based on pre-computed heuristics
+2. build Huffman table from save, using HUFv07_readDTableXn()
+3. decode 1 or 4 segments in parallel using HUFv07_decompressSXn_usingDTable
+*/
+
+/** HUFv07_selectDecoder() :
+*   Tells which decoder is likely to decode faster,
+*   based on a set of pre-determined metrics.
+*   @return : 0==HUFv07_decompress4X2, 1==HUFv07_decompress4X4 .
+*   Assumption : 0 < cSrcSize < dstSize <= 128 KB */
+U32 HUFv07_selectDecoder (size_t dstSize, size_t cSrcSize);
+
+size_t HUFv07_readDTableX2 (HUFv07_DTable* DTable, const void* src, size_t srcSize);
+size_t HUFv07_readDTableX4 (HUFv07_DTable* DTable, const void* src, size_t srcSize);
+
+size_t HUFv07_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUFv07_DTable* DTable);
+size_t HUFv07_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUFv07_DTable* DTable);
+size_t HUFv07_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUFv07_DTable* DTable);
+
+
+/* single stream variants */
+size_t HUFv07_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUFv07_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
+
+size_t HUFv07_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUFv07_DTable* DTable);
+size_t HUFv07_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUFv07_DTable* DTable);
+size_t HUFv07_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUFv07_DTable* DTable);
+
+
+#endif /* HUFv07_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* HUFv07_H_298734234 */
+/*
+   Common functions of New Generation Entropy library
+   Copyright (C) 2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+*************************************************************************** */
+
+
+
+/*-****************************************
+*  FSE Error Management
+******************************************/
+/* FSEv07_isError() : forwards `code` to ERR_isError() and returns its verdict */
+unsigned FSEv07_isError(size_t code)
+{
+    unsigned const verdict = ERR_isError(code);
+    return verdict;
+}
+
+/* FSEv07_getErrorName() : forwards `code` to ERR_getErrorName() and returns the resulting string */
+const char* FSEv07_getErrorName(size_t code)
+{
+    const char* const errorName = ERR_getErrorName(code);
+    return errorName;
+}
+
+
+/* **************************************************************
+*  HUF Error Management
+****************************************************************/
+/* HUFv07_isError() : forwards `code` to ERR_isError() and returns its verdict */
+unsigned HUFv07_isError(size_t code)
+{
+    unsigned const verdict = ERR_isError(code);
+    return verdict;
+}
+
+/* HUFv07_getErrorName() : forwards `code` to ERR_getErrorName() and returns the resulting string */
+const char* HUFv07_getErrorName(size_t code)
+{
+    const char* const errorName = ERR_getErrorName(code);
+    return errorName;
+}
+
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+/* FSEv07_abs() : magnitude of `a`, narrowed back to short */
+static short FSEv07_abs(short a)
+{
+    if (a < 0) return (short)(-a);
+    return a;
+}
+
+/*! FSEv07_readNCount() :
+*   Decode the compact normalized-counter header stored in `headerBuffer` (`hbSize` bytes).
+*   On entry, *maxSVPtr is the largest symbol index that may be written into `normalizedCounter`.
+*   On success, *maxSVPtr is overwritten with the index of the last symbol decoded
+*   and *tableLogPtr with the table log read from the header.
+*   @return : number of bytes consumed from `headerBuffer`, or an error code
+*             (srcSize_wrong, tableLog_tooLarge, maxSymbolValue_tooSmall, GENERIC).
+*/
+size_t FSEv07_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;          /* bit width used to read the next count */
+    int remaining;       /* budget still to distribute, +1 : decoding ends when it reaches 1 */
+    int threshold;       /* power of 2 kept <= remaining; halved together with nbBits */
+    U32 bitStream;       /* not-yet-decoded bits, next bit at position 0 */
+    int bitCount;        /* bits of the word at `ip` already used */
+    unsigned charnum = 0;   /* next symbol index to fill */
+    int previous0 = 0;      /* set when the last decoded count was 0 */
+
+    /* a 32-bit word is read right away, so at least 4 bytes are required */
+    if (hbSize < 4) return ERROR(srcSize_wrong);
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSEv07_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSEv07_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr)) {
+        if (previous0) {
+            /* a zero count is followed by a repeat field giving how many more symbols get a zero count */
+            unsigned n0 = charnum;
+            /* 16 set bits (eight 2-bit fields equal to 3) : 24 more zero-count symbols */
+            while ((bitStream & 0xFFFF) == 0xFFFF) {
+                n0+=24;
+                if (ip < iend-5) {
+                    ip+=2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                } else {
+                    /* too close to the end to move ip : consume from the bits already loaded */
+                    bitStream >>= 16;
+                    bitCount+=16;
+            }   }
+            /* each 2-bit field equal to 3 : 3 more zero-count symbols, and another field follows */
+            while ((bitStream & 3) == 3) {
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            /* last field (0..2) closes the run */
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            /* refill only if a 4-byte read at the new position stays inside the buffer */
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;   /* no refill : just drop the closing 2-bit field */
+        }
+        {   short const max = (short)((2*threshold-1)-remaining);
+            short count;
+
+            /* values below `max` are coded on nbBits-1 bits, the others on nbBits bits */
+            if ((bitStream & (threshold-1)) < (U32)max) {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            } else {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSEv07_abs(count);   /* a stored -1 uses up one unit, like +1 */
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            /* shrink the read width while the remaining budget fits in fewer bits */
+            while (remaining < threshold) {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                /* near the end : pin ip on the last readable 4-byte word and adjust bitCount accordingly */
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> (bitCount & 31);   /* & 31 keeps the shift count below 32 */
+    }   }   /* while ((remaining>1) && (charnum<=*maxSVPtr)) */
+    /* the loop may also stop because charnum passed *maxSVPtr : then the budget was not fully used */
+    if (remaining != 1) return ERROR(GENERIC);
+    *maxSVPtr = charnum-1;
+
+    /* round the bits used in the current word up to whole bytes */
+    ip += (bitCount+7)>>3;
+    if ((size_t)(ip-istart) > hbSize) return ERROR(srcSize_wrong);
+    return ip-istart;
+}
+
+
+/*! HUFv07_readStats() :
+    Read compact Huffman tree, saved by HUFv07_writeCTable().
+    `huffWeight` is destination buffer (`hwSize` bytes); `rankStats` must hold HUFv07_TABLELOG_ABSOLUTEMAX+1 counters.
+    @return : size read from `src` , or an error Code . On success `*nbSymbolsPtr` and `*tableLogPtr` are set.
+    Note : Needed by HUFv07_readCTable() and HUFv07_readDTableXn() .
+*/
+size_t HUFv07_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;   /* header byte first, then the number of `src` bytes holding the weights */
+    size_t oSize;   /* number of weights stored explicitly (the last one is implied) */
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    if (!hwSize) return ERROR(dstSize_tooSmall);   /* also keeps (hwSize-1) below from wrapping */
+    iSize = ip[0];
+    if (iSize >= 128)  { /* special header */
+        if (iSize >= (242)) {  /* RLE */
+            static const U32 l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };
+            oSize = l[iSize-242];   /* iSize comes from one byte, so the index stays within 0..13 */
+            if (oSize >= hwSize) return ERROR(corruption_detected);   /* huffWeight[oSize] is written further down */
+            memset(huffWeight, 1, hwSize);
+            iSize = 0;
+        }
+        else {   /* Incompressible : weights stored raw, two 4-bit values per byte */
+            oSize = iSize - 127;
+            iSize = ((oSize+1)/2);
+            if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+            if (oSize >= hwSize) return ERROR(corruption_detected);
+            ip += 1;
+            {   U32 n;
+                for (n=0; n<oSize; n+=2) {
+                    huffWeight[n]   = ip[n/2] >> 4;
+                    huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }   }
+    else  {   /* header compressed with FSE (normal case) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSEv07_decompress(huffWeight, hwSize-1, ip+1, iSize);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSEv07_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats : rankStats[w] counts symbols of weight w, weightTotal sums 2^(w-1) over non-zero weights */
+    memset(rankStats, 0, (HUFv07_TABLELOG_ABSOLUTEMAX + 1) * sizeof(U32));
+    weightTotal = 0;
+    {   U32 n; for (n=0; n<oSize; n++) {
+            if (huffWeight[n] >= HUFv07_TABLELOG_ABSOLUTEMAX) return ERROR(corruption_detected);
+            rankStats[huffWeight[n]]++;
+            weightTotal += (1 << huffWeight[n]) >> 1;
+    }   }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    {   U32 const tableLog = BITv07_highbit32(weightTotal) + 1;
+        if (tableLog > HUFv07_TABLELOG_ABSOLUTEMAX) return ERROR(corruption_detected);
+        *tableLogPtr = tableLog;
+        /* determine last weight */
+        {   U32 const total = 1 << tableLog;
+            U32 const rest = total - weightTotal;
+            U32 const verif = 1 << BITv07_highbit32(rest);
+            U32 const lastWeight = BITv07_highbit32(rest) + 1;
+            if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+            huffWeight[oSize] = (BYTE)lastWeight;
+            rankStats[lastWeight]++;
+    }   }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    return iSize+1;
+}
+/* ******************************************************************
+   FSE : Finite State Entropy decoder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+
+/* **************************************************************
+*  Compiler specifics : pick the FORCE_INLINE qualifier for this compiler
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 ; note : && binds tighter than || */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static   /* fallback when neither C++ nor C99 is detected : plain static */
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSEv07_isError ERR_isError
+#define FSEv07_STATIC_ASSERT(c) { enum { FSEv07_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+/* FSEv07_STATIC_ASSERT(c) : when `c` is false the enum initializer becomes 1/0, which is rejected at compile time */
+
+/* **************************************************************
+*  Complex types
+****************************************************************/
+typedef U32 DTable_max_t[FSEv07_DTABLE_SIZE_U32(FSEv07_MAX_TABLELOG)];
+/* DTable_max_t : U32 array sized by FSEv07_DTABLE_SIZE_U32() for FSEv07_MAX_TABLELOG */
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks : both template parameters must be defined before this point, otherwise compilation stops */
+#ifndef FSEv07_FUNCTION_EXTENSION
+#  error "FSEv07_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSEv07_FUNCTION_TYPE
+#  error "FSEv07_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names : two-level paste, so that macro arguments are expanded before ## is applied */
+#define FSEv07_CAT(X,Y) X##Y
+#define FSEv07_FUNCTION_NAME(X,Y) FSEv07_CAT(X,Y)
+#define FSEv07_TYPE_NAME(X,Y) FSEv07_CAT(X,Y)
+
+
+/* Function templates */
+FSEv07_DTable* FSEv07_createDTable (unsigned tableLog)
+{   /* Allocates an uninitialized DTable; tableLog is capped at FSEv07_TABLELOG_ABSOLUTE_MAX. Returns malloc()'s result. */
+    unsigned const cappedLog = (tableLog > FSEv07_TABLELOG_ABSOLUTE_MAX) ? FSEv07_TABLELOG_ABSOLUTE_MAX : tableLog;
+    return (FSEv07_DTable*)malloc( FSEv07_DTABLE_SIZE_U32(cappedLog) * sizeof (U32) );
+}
+
+void FSEv07_freeDTable (FSEv07_DTable* dt)
+{   /* Counterpart of FSEv07_createDTable() : hands `dt` back to free(). */
+    free(dt);   /* free(NULL) is a no-op, so a NULL `dt` is accepted */
+}
+/* FSEv07_buildDTable() : fills `dt` (header, then 2^tableLog decode cells) from normalizedCounter[0..maxSymbolValue]. @return : 0, or an error code */
+size_t FSEv07_buildDTable(FSEv07_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
+    FSEv07_DECODE_TYPE* const tableDecode = (FSEv07_DECODE_TYPE*) (tdPtr);
+    U16 symbolNext[FSEv07_MAX_SYMBOL_VALUE+1];   /* per symbol : next state value to hand out in the final pass */
+    /* NOTE(review): tableSize is computed before tableLog is range-checked below - confirm callers never pass tableLog >= 32 */
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+    U32 highThreshold = tableSize-1;   /* cells above this index are reserved for the -1 (low probability) symbols */
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSEv07_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSEv07_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    {   FSEv07_DTableHeader DTableH;
+        DTableH.tableLog = (U16)tableLog;
+        DTableH.fastMode = 1;
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));   /* NOTE(review): assumes tableLog >= 1 - confirm */
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {   /* count of -1 : symbol gets exactly one cell, taken from the top of the table */
+                    tableDecode[highThreshold--].symbol = (FSEv07_FUNCTION_TYPE)s;
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;   /* any count >= half the table disables fast mode */
+                    symbolNext[s] = normalizedCounter[s];
+        }   }   }
+        memcpy(dt, &DTableH, sizeof(DTableH));   /* header is stored in dt[0] */
+    }
+
+    /* Spread symbols : each symbol s receives normalizedCounter[s] cells, visited with a fixed step, skipping the lowprob area */
+    {   U32 const tableMask = tableSize-1;
+        U32 const step = FSEv07_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {   /* runs zero times for counts of 0 and -1 */
+                tableDecode[position].symbol = (FSEv07_FUNCTION_TYPE)s;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+
+        if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table : nbBits = tableLog - highbit(nextState) ; newState = (nextState << nbBits) - tableSize */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            FSEv07_FUNCTION_TYPE const symbol = (FSEv07_FUNCTION_TYPE)(tableDecode[u].symbol);
+            U16 nextState = symbolNext[symbol]++;   /* successive cells of one symbol get consecutive state values */
+            tableDecode[u].nbBits = (BYTE) (tableLog - BITv07_highbit32 ((U32)nextState) );
+            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+    }   }
+
+    return 0;
+}
+
+
+
+#ifndef FSEv07_COMMONDEFS_ONLY
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+size_t FSEv07_buildDTable_rle (FSEv07_DTable* dt, BYTE symbolValue)
+{
+    /* Writes a header with tableLog 0 followed by a single cell that carries symbolValue. Always returns 0. */
+    void* const headerMem = dt;      /* header is stored in dt[0] */
+    void* const cellMem = dt + 1;    /* the cell is stored right after it */
+    FSEv07_DTableHeader* const header = (FSEv07_DTableHeader*)headerMem;
+    FSEv07_decode_t* const onlyCell = (FSEv07_decode_t*)cellMem;
+
+    header->tableLog = 0;
+    header->fastMode = 0;
+    onlyCell->symbol = symbolValue;
+    onlyCell->nbBits = 0;
+    onlyCell->newState = 0;
+
+    return 0;
+}
+
+
+size_t FSEv07_buildDTable_raw (FSEv07_DTable* dt, unsigned nbBits)
+{
+    /* Builds an identity table of 2^nbBits cells : cell s carries symbol s, nbBits bits and newState 0. */
+    void* const headerMem = dt;
+    void* const cellMem = dt + 1;
+    FSEv07_DTableHeader* const header = (FSEv07_DTableHeader*)headerMem;
+    FSEv07_decode_t* const cells = (FSEv07_decode_t*)cellMem;
+    const unsigned nbCells = 1 << nbBits;
+    unsigned idx = 0;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Header */
+    header->tableLog = (U16)nbBits;
+    header->fastMode = 1;
+    /* Cells */
+    while (idx < nbCells) {
+        FSEv07_decode_t* const cell = cells + idx;
+        cell->newState = 0;
+        cell->symbol = (BYTE)idx;
+        cell->nbBits = (BYTE)nbBits;
+        idx++;
+    }
+    return 0;
+}
+/* Decodes cSrc into dst with two alternating FSE states read from one bitstream. @return : bytes written, or an error code */
+FORCE_INLINE size_t FSEv07_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSEv07_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;   /* NOTE(review): points before dst when maxDstSize < 3 - confirm callers */
+
+    BITv07_DStream_t bitD;
+    FSEv07_DState_t state1;
+    FSEv07_DState_t state2;
+
+    /* Init */
+    { size_t const errorCode = BITv07_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+      if (FSEv07_isError(errorCode)) return errorCode; }
+
+    FSEv07_initDState(&state1, &bitD, dt);
+    FSEv07_initDState(&state2, &bitD, dt);
+    /* `fast` selects the decode routine; callers pass a literal 0 or 1 */
+#define FSEv07_GETSYMBOL(statePtr) fast ? FSEv07_decodeSymbolFast(statePtr, &bitD) : FSEv07_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BITv07_reloadDStream(&bitD)==BITv07_DStream_unfinished) && (op<olimit) ; op+=4) {
+        op[0] = FSEv07_GETSYMBOL(&state1);
+        /* extra reloads below are compiled in only when the bit container cannot hold that many symbols */
+        if (FSEv07_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BITv07_reloadDStream(&bitD);
+
+        op[1] = FSEv07_GETSYMBOL(&state2);
+
+        if (FSEv07_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BITv07_reloadDStream(&bitD) > BITv07_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSEv07_GETSYMBOL(&state1);
+
+        if (FSEv07_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BITv07_reloadDStream(&bitD);
+
+        op[3] = FSEv07_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BITv07_reloadDStream(&bitD) >= FSEv07_DStream_partiallyFilled; Ends at exactly BITv07_DStream_completed */
+    while (1) {
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        /* each half-step writes one symbol, plus one from the other state when the stream reports overflow */
+        *op++ = FSEv07_GETSYMBOL(&state1);
+
+        if (BITv07_reloadDStream(&bitD)==BITv07_DStream_overflow) {
+            *op++ = FSEv07_GETSYMBOL(&state2);
+            break;
+        }
+
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+
+        *op++ = FSEv07_GETSYMBOL(&state2);
+
+        if (BITv07_reloadDStream(&bitD)==BITv07_DStream_overflow) {
+            *op++ = FSEv07_GETSYMBOL(&state1);
+            break;
+    }   }
+
+    return op-ostart;
+}
+
+
+size_t FSEv07_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSEv07_DTable* dt)
+{
+    /* The header stored in dt[0] records which decoder variant the table allows. */
+    const void* const headerMem = dt;
+    const FSEv07_DTableHeader* const header = (const FSEv07_DTableHeader*)headerMem;
+
+    /* The last argument is passed as a literal so that the inlined generic body sees a constant `fast`. */
+    if (header->fastMode == 0) return FSEv07_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+    return FSEv07_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+}
+
+
+size_t FSEv07_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* One-shot FSE decoding : read the normalized counts, build a table on the stack, then decode the rest. */
+    const BYTE* const srcBegin = (const BYTE*)cSrc;
+    short normCounts[FSEv07_MAX_SYMBOL_VALUE+1];
+    DTable_max_t table;   /* written by FSEv07_buildDTable() before it is read */
+    unsigned maxSymbol = FSEv07_MAX_SYMBOL_VALUE;
+    unsigned tableLog;
+    size_t headerSize;
+    size_t buildStatus;
+
+    if (cSrcSize<2) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* header : normalized counts */
+    headerSize = FSEv07_readNCount (normCounts, &maxSymbol, &tableLog, srcBegin, cSrcSize);
+    if (FSEv07_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size */
+
+    buildStatus = FSEv07_buildDTable (table, normCounts, maxSymbol, tableLog);
+    if (FSEv07_isError(buildStatus)) return buildStatus;
+
+    /* payload follows the header; the result is returned as-is, error codes included */
+    return FSEv07_decompress_usingDTable (dst, maxDstSize, srcBegin + headerSize, cSrcSize - headerSize, table);
+}
+
+
+
+#endif   /* FSEv07_COMMONDEFS_ONLY */
+
+/* ******************************************************************
+   Huffman decoder, part of New Generation Entropy library
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics : make sure the `inline` keyword is usable
+****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+/* inline is defined */
+#elif defined(_MSC_VER)
+#  define inline __inline
+#else
+#  define inline /* disable inline */
+#endif
+
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* HUFv07_STATIC_ASSERT(c) : when `c` is false the enum initializer becomes 1/0, which is rejected at compile time */
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUFv07_STATIC_ASSERT(c) { enum { HUFv07_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/*-***************************/
+/*  generic DTableDesc       */
+/*-***************************/
+
+typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc;
+/* Returns, by value, the descriptor held in the first sizeof(DTableDesc) bytes of `table`. */
+static DTableDesc HUFv07_getDTableDesc(const HUFv07_DTable* table)
+{
+    DTableDesc desc;
+    memcpy(&desc, table, sizeof(DTableDesc));   /* byte copy instead of a pointer cast */
+    return desc;
+}
+
+
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
+
+typedef struct { BYTE byte; BYTE nbBits; } HUFv07_DEltX2;   /* single-symbol decoding */
+/* HUFv07_readDTableX2() : reads the weights from `src` and fills the single-symbol table. @return : bytes read from `src`, or an error code */
+size_t HUFv07_readDTableX2 (HUFv07_DTable* DTable, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUFv07_SYMBOLVALUE_MAX + 1];
+    U32 rankVal[HUFv07_TABLELOG_ABSOLUTEMAX + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+    size_t iSize;
+    void* const dtPtr = DTable + 1;   /* cells start right after the descriptor element */
+    HUFv07_DEltX2* const dt = (HUFv07_DEltX2*)dtPtr;
+
+    HUFv07_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUFv07_DTable));
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUFv07_readStats(huffWeight, HUFv07_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUFv07_isError(iSize)) return iSize;
+
+    /* Table header */
+    {   DTableDesc dtd = HUFv07_getDTableDesc(DTable);
+        if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge);   /* DTable too small, huffman tree cannot fit in */
+        dtd.tableType = 0;   /* NOTE(review): the +1 above presumably mirrors how the table macro stores maxTableLog - confirm */
+        dtd.tableLog = (BYTE)tableLog;
+        memcpy(DTable, &dtd, sizeof(dtd));
+    }
+
+    /* Prepare ranks : turn the per-weight counts into start indexes; one symbol of weight n spans 2^(n-1) cells */
+    {   U32 n, nextRankStart = 0;
+        for (n=1; n<tableLog+1; n++) {
+            U32 current = nextRankStart;
+            nextRankStart += (rankVal[n] << (n-1));
+            rankVal[n] = current;
+    }   }
+
+    /* fill DTable : symbol n of weight w writes `length` consecutive cells starting at rankVal[w] */
+    {   U32 n;
+        for (n=0; n<nbSymbols; n++) {
+            U32 const w = huffWeight[n];
+            U32 const length = (1 << w) >> 1;   /* 0 when w == 0, so such a symbol writes no cell */
+            U32 i;
+            HUFv07_DEltX2 D;
+            D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+            for (i = rankVal[w]; i < rankVal[w] + length; i++)
+                dt[i] = D;
+            rankVal[w] += length;
+    }   }
+
+    return iSize;
+}
+
+
+static BYTE HUFv07_decodeSymbolX2(BITv07_DStream_t* Dstream, const HUFv07_DEltX2* dt, const U32 dtLog)
+{   /* looks at dtLog bits to select a cell, then skips only that cell's nbBits */
+    const HUFv07_DEltX2* const cell = dt + BITv07_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+    BYTE const decoded = cell->byte;
+    BITv07_skipBits(Dstream, cell->nbBits);
+    return decoded;
+}
+/* Decode helpers : all expect `dt` and `dtLog` in the enclosing scope. _0 always decodes one symbol into *ptr++. */
+#define HUFv07_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUFv07_decodeSymbolX2(DStreamPtr, dt, dtLog)
+/* _1 : decodes only if MEM_64bits() or HUFv07_TABLELOG_MAX<=12. NOTE(review): expands to an unbraced `if`, beware of a following `else`. */
+#define HUFv07_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUFv07_TABLELOG_MAX<=12)) \
+        HUFv07_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+/* _2 : decodes only if MEM_64bits(). Same unbraced `if` caveat as _1. */
+#define HUFv07_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUFv07_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+/* HUFv07_decodeStreamX2() : decodes symbols from bitDPtr into [p, pEnd). @return : pEnd-pStart, whatever the state of the bitstream */
+static inline size_t HUFv07_decodeStreamX2(BYTE* p, BITv07_DStream_t* const bitDPtr, BYTE* const pEnd, const HUFv07_DEltX2* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    while ((BITv07_reloadDStream(bitDPtr) == BITv07_DStream_unfinished) && (p <= pEnd-4)) {
+        HUFv07_DECODE_SYMBOLX2_2(p, bitDPtr);   /* _2 and _1 are conditional, see their definitions */
+        HUFv07_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUFv07_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUFv07_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BITv07_reloadDStream(bitDPtr) == BITv07_DStream_unfinished) && (p < pEnd))
+        HUFv07_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, hence no need to reload */
+    while (p < pEnd)
+        HUFv07_DECODE_SYMBOLX2_0(p, bitDPtr);
+    /* the loop above runs until p reaches pEnd; callers check BITv07_endOfDStream() afterwards */
+    return pEnd-pStart;
+}
+
+static size_t HUFv07_decompress1X2_usingDTable_internal(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    /* Single-stream decode : fills dst[0..dstSize), then requires the bitstream to be at its end. */
+    BYTE* const outBegin = (BYTE*)dst;
+    const void* const cellsMem = DTable + 1;   /* cells follow the descriptor element */
+    const HUFv07_DEltX2* const cells = (const HUFv07_DEltX2*)cellsMem;
+    U32 const tableLog = HUFv07_getDTableDesc(DTable).tableLog;
+    BITv07_DStream_t stream;
+    size_t initStatus;
+
+    initStatus = BITv07_initDStream(&stream, cSrc, cSrcSize);
+    if (HUFv07_isError(initStatus)) return initStatus;
+
+    HUFv07_decodeStreamX2(outBegin, &stream, outBegin + dstSize, cells, tableLog);
+
+    /* check : anything other than end-of-stream is reported as corruption */
+    if (BITv07_endOfDStream(&stream) == 0) return ERROR(corruption_detected);
+
+    return dstSize;
+}
+
+size_t HUFv07_decompress1X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{   /* Public entry : the single-stream decoder only runs on a table whose descriptor has tableType 0. */
+    if (HUFv07_getDTableDesc(DTable).tableType == 0)
+        return HUFv07_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+    return ERROR(GENERIC);
+}
+
+size_t HUFv07_decompress1X2_DCtx (HUFv07_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* Reads the table description at the start of cSrc into DCtx, then decodes the remaining bytes as one stream. */
+    size_t const headerSize = HUFv07_readDTableX2 (DCtx, cSrc, cSrcSize);
+    if (HUFv07_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing would be left after the header */
+
+    return HUFv07_decompress1X2_usingDTable_internal (dst, dstSize,
+                                                      (const BYTE*)cSrc + headerSize, cSrcSize - headerSize,
+                                                      DCtx);
+}
+
+size_t HUFv07_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{   /* Convenience wrapper : same as HUFv07_decompress1X2_DCtx(), using a table declared locally by the macro below. */
+    HUFv07_CREATE_STATIC_DTABLEX2(DTable, HUFv07_TABLELOG_MAX);
+    return HUFv07_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+static size_t HUFv07_decompress4X2_usingDTable_internal(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    /* Decodes 4 bitstreams, located through a 6-byte jump table, each into its own segment of dst. @return : dstSize, or an error code. */
+    if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable + 1;
+        const HUFv07_DEltX2* const dt = (const HUFv07_DEltX2*)dtPtr;
+
+        /* Init */
+        BITv07_DStream_t bitD1;
+        BITv07_DStream_t bitD2;
+        BITv07_DStream_t bitD3;
+        BITv07_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);   /* wraps to a huge value if the 3 lengths exceed the input */
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;   /* first 3 segments have this size, the 4th takes what is left */
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        DTableDesc const dtd = HUFv07_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+        if (3*segmentSize > dstSize) return ERROR(corruption_detected);   /* dstSize of 1, 2 or 5 : opStart4 would lie past oend, tail decoding would write outside dst */
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        { size_t const errorCode = BITv07_initDStream(&bitD1, istart1, length1);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BITv07_initDStream(&bitD2, istart2, length2);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BITv07_initDStream(&bitD3, istart3, length3);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BITv07_initDStream(&bitD4, istart4, length4);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BITv07_reloadDStream(&bitD1) | BITv07_reloadDStream(&bitD2) | BITv07_reloadDStream(&bitD3) | BITv07_reloadDStream(&bitD4);
+        for ( ; (endSignal==BITv07_DStream_unfinished) && (op4<(oend-7)) ; ) {
+            HUFv07_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUFv07_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUFv07_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUFv07_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX2_0(op4, &bitD4);
+            endSignal = BITv07_reloadDStream(&bitD1) | BITv07_reloadDStream(&bitD2) | BITv07_reloadDStream(&bitD3) | BITv07_reloadDStream(&bitD4);
+        }
+
+        /* check corruption : no stream may already have run into the next segment */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUFv07_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUFv07_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUFv07_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUFv07_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check : every one of the 4 streams must be at its end */
+        endSignal = BITv07_endOfDStream(&bitD1) & BITv07_endOfDStream(&bitD2) & BITv07_endOfDStream(&bitD3) & BITv07_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+size_t HUFv07_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{   /* Public entry : the 4-stream decoder only runs on a table whose descriptor has tableType 0. */
+    if (HUFv07_getDTableDesc(DTable).tableType == 0)
+        return HUFv07_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+    return ERROR(GENERIC);
+}
+
+
+size_t HUFv07_decompress4X2_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* Reads the table description at the start of cSrc into dctx, then decodes the remaining bytes as 4 streams. */
+    size_t const headerSize = HUFv07_readDTableX2 (dctx, cSrc, cSrcSize);
+    if (HUFv07_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing would be left after the header */
+
+    return HUFv07_decompress4X2_usingDTable_internal (dst, dstSize,
+                                                      (const BYTE*)cSrc + headerSize, cSrcSize - headerSize,
+                                                      dctx);
+}
+
+size_t HUFv07_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{   /* Convenience wrapper : same as HUFv07_decompress4X2_DCtx(), using a table declared locally by the macro below. */
+    HUFv07_CREATE_STATIC_DTABLEX2(DTable, HUFv07_TABLELOG_MAX);
+    return HUFv07_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUFv07_DEltX4;  /* double-symbols decoding */
+
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+/* Fills a 2^sizeLog-cell sub-table for sequences starting with baseSeq : length-2 cells, plus length-1 cells for skipped weights */
+static void HUFv07_fillDTableX4Level2(HUFv07_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    HUFv07_DEltX4 DElt;
+    U32 rankVal[HUFv07_TABLELOG_ABSOLUTEMAX + 1];   /* private copy, advanced as cells get filled */
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values : cells before rankVal[minWeight] hold baseSeq alone (length 1, `consumed` bits) */
+    if (minWeight>1) {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    { U32 s; for (s=0; s<sortedListSize; s++) {   /* note : sortedSymbols already skipped */
+        const U32 symbol = sortedSymbols[s].symbol;
+        const U32 weight = sortedSymbols[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 length = 1 << (sizeLog-nbBits);
+        const U32 start = rankVal[weight];
+        U32 i = start;
+        const U32 end = start + length;
+
+        MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));   /* second symbol goes in the high byte */
+        DElt.nbBits = (BYTE)(nbBits + consumed);   /* bits of both symbols */
+        DElt.length = 2;
+        do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+        rankVal[weight] += length;
+    }}
+}
+
+typedef U32 rankVal_t[HUFv07_TABLELOG_ABSOLUTEMAX][HUFv07_TABLELOG_ABSOLUTEMAX + 1];
+/* Fills the double-symbol table : per sorted symbol, either a level-2 sub-table (room for a second symbol) or plain length-1 cells */
+static void HUFv07_fillDTableX4(HUFv07_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{
+    U32 rankVal[HUFv07_TABLELOG_ABSOLUTEMAX + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;
+    U32 s;
+    /* sizeof(rankVal) is one row, so only rankValOrigin[0] is copied */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++) {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits) {   /* enough room for a second symbol */
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];   /* second symbols are taken from this position of sortedList onward */
+            HUFv07_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        } else {
+            HUFv07_DEltX4 DElt;
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits = (BYTE)(nbBits);
+            DElt.length = 1;
+            {   U32 u;
+                const U32 end = start + length;
+                for (u = start; u < end; u++) DTable[u] = DElt;
+        }   }
+        rankVal[weight] += length;
+    }
+}
+
+/* HUFv07_readDTableX4() :
+ * Reads the Huffman weight description stored at `src` (through HUFv07_readStats())
+ * and builds the double-symbol decoding table inside `DTable`.
+ * The table is built for the maximum log recorded in the DTable descriptor
+ * (dtd.maxTableLog); on success the descriptor is updated with
+ * tableLog = maxTableLog and tableType = 1.
+ * @return : the value returned by HUFv07_readStats() (callers use it as the
+ *           size of the header to skip), or an error code testable with HUFv07_isError(). */
+size_t HUFv07_readDTableX4 (HUFv07_DTable* DTable, const void* src, size_t srcSize)
+{
+    BYTE weightList[HUFv07_SYMBOLVALUE_MAX + 1];
+    sortedSymbol_t sortedSymbol[HUFv07_SYMBOLVALUE_MAX + 1];
+    U32 rankStats[HUFv07_TABLELOG_ABSOLUTEMAX + 1] = { 0 };
+    U32 rankStart0[HUFv07_TABLELOG_ABSOLUTEMAX + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;   /* shifted view : rankStart[w] aliases rankStart0[w+1] */
+    rankVal_t rankVal;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    DTableDesc dtd = HUFv07_getDTableDesc(DTable);
+    U32 const maxTableLog = dtd.maxTableLog;
+    size_t iSize;
+    void* dtPtr = DTable+1;   /* force compiler to avoid strict-aliasing */
+    HUFv07_DEltX4* const dt = (HUFv07_DEltX4*)dtPtr;   /* decoding cells start right after the descriptor cell */
+
+    HUFv07_STATIC_ASSERT(sizeof(HUFv07_DEltX4) == sizeof(HUFv07_DTable));   /* if compilation fails here, assertion is false */
+    if (maxTableLog > HUFv07_TABLELOG_ABSOLUTEMAX) return ERROR(tableLog_tooLarge);
+    //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUFv07_readStats(weightList, HUFv07_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUFv07_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+
+    /* Get start index of each weight */
+    {   U32 w, nextRankStart = 0;
+        for (w=1; w<maxW+1; w++) {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;     /* nb of symbols with a non-zero weight */
+    }
+
+    /* sort symbols by weight */
+    {   U32 s;
+        for (s=0; s<nbSymbols; s++) {
+            U32 const w = weightList[s];
+            U32 const r = rankStart[w]++;   /* after the loop, rankStart[w] is the end of rank w, i.e. the start of rank w+1 */
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {   U32* const rankVal0 = rankVal[0];
+        /* row 0 : cumulated start positions, scaled to a table of log maxTableLog */
+        {   int const rescale = (maxTableLog-tableLog) - 1;   /* tableLog <= maxTableLog */
+            U32 nextRankVal = 0;
+            U32 w;
+            for (w=1; w<maxW+1; w++) {
+                U32 current = nextRankVal;
+                nextRankVal += rankStats[w] << (w+rescale);
+                rankVal0[w] = current;
+        }   }
+        /* other rows : row 0 shifted right by the number of bits already consumed */
+        {   U32 const minBits = tableLog+1 - maxW;
+            U32 consumed;
+            for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
+                U32* const rankValPtr = rankVal[consumed];
+                U32 w;
+                for (w = 1; w < maxW+1; w++) {
+                    rankValPtr[w] = rankVal0[w] >> consumed;
+    }   }   }   }
+
+    /* rankStart0 (unshifted array) is passed on purpose : combined with the
+     * post-sort values, index w then designates the first symbol of weight w */
+    HUFv07_fillDTableX4(dt, maxTableLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    /* record the table characteristics in the descriptor cell */
+    dtd.tableLog = (BYTE)maxTableLog;
+    dtd.tableType = 1;
+    memcpy(DTable, &dtd, sizeof(dtd));
+    return iSize;
+}
+
+
+/* HUFv07_decodeSymbolX4() :
+ * Looks up the next dtLog bits of the stream, copies the 2-byte sequence of the
+ * matching table cell to `op`, then consumes the cell's nbBits.
+ * Always writes 2 bytes at `op`.
+ * @return : the cell's `length` field (number of decoded symbols) */
+static U32 HUFv07_decodeSymbolX4(void* op, BITv07_DStream_t* DStream, const HUFv07_DEltX4* dt, const U32 dtLog)
+{
+    /* note : dtLog >= 1 */
+    const HUFv07_DEltX4* const cell = dt + BITv07_lookBitsFast(DStream, dtLog);
+
+    memcpy(op, cell, 2);
+    BITv07_skipBits(DStream, cell->nbBits);
+    return cell->length;
+}
+
+/* HUFv07_decodeLastSymbolX4() :
+ * Decodes the very last symbol of a stream : only 1 byte is written at `op`,
+ * even when the matching table cell holds a 2-symbol sequence.
+ * @return : always 1 */
+static U32 HUFv07_decodeLastSymbolX4(void* op, BITv07_DStream_t* DStream, const HUFv07_DEltX4* dt, const U32 dtLog)
+{
+    /* note : dtLog >= 1 */
+    const HUFv07_DEltX4* const cell = dt + BITv07_lookBitsFast(DStream, dtLog);
+
+    memcpy(op, cell, 1);
+
+    if (cell->length == 1) {
+        /* single-symbol cell : nbBits is exact */
+        BITv07_skipBits(DStream, cell->nbBits);
+        return 1;
+    }
+
+    /* 2-symbol cell : nbBits covers both symbols, only the first one is wanted */
+    if (DStream->bitsConsumed >= (sizeof(DStream->bitContainer)*8))
+        return 1;   /* container already exhausted : nothing to consume */
+
+    BITv07_skipBits(DStream, cell->nbBits);
+    if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+        DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+    return 1;
+}
+
+
+/* HUFv07_DECODE_SYMBOLX4_x() :
+ * Decode one table cell and advance `ptr` by the number of symbols produced.
+ * The macros expect `dt` and `dtLog` to be in scope at the expansion site.
+ * Caution : variants _1 and _2 expand to an unbraced `if` statement, so they
+ * must not be followed by an `else`. */
+
+/* _0 : unconditional decoding */
+#define HUFv07_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUFv07_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+/* _1 : decodes only on 64-bit targets, or when HUFv07_TABLELOG_MAX <= 12 */
+#define HUFv07_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUFv07_TABLELOG_MAX<=12)) \
+        ptr += HUFv07_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+/* _2 : decodes only on 64-bit targets */
+#define HUFv07_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUFv07_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+/* HUFv07_decodeStreamX4() :
+ * Decodes a single bitstream into [p, pEnd) using the double-symbol table `dt`.
+ * Each table lookup writes 2 bytes and advances by 1 or 2, hence the margins
+ * kept before pEnd in the loops below; the final byte, if any, is produced by
+ * HUFv07_decodeLastSymbolX4(), which writes a single byte.
+ * @return : number of bytes written */
+static inline size_t HUFv07_decodeStreamX4(BYTE* p, BITv07_DStream_t* bitDPtr, BYTE* const pEnd, const HUFv07_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    /* (4 lookups at most per iteration, each advancing by up to 2 bytes) */
+    while ((BITv07_reloadDStream(bitDPtr) == BITv07_DStream_unfinished) && (p < pEnd-7)) {
+        HUFv07_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUFv07_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUFv07_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUFv07_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to end : up to 2 symbols at a time */
+    while ((BITv07_reloadDStream(bitDPtr) == BITv07_DStream_unfinished) && (p <= pEnd-2))
+        HUFv07_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUFv07_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    /* at most one byte left : decode it without writing past pEnd */
+    if (p < pEnd)
+        p += HUFv07_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+
+/* HUFv07_decompress1X4_usingDTable_internal() :
+ * Decodes one Huffman bitstream (cSrc, cSrcSize) into exactly dstSize bytes of `dst`,
+ * using an already built double-symbol `DTable`.
+ * @return : dstSize, or an error code if the bitstream cannot be initialized
+ *           or is not fully consumed once dstSize bytes are produced. */
+static size_t HUFv07_decompress1X4_usingDTable_internal(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    BYTE* const out = (BYTE*) dst;
+    const void* const cells = DTable+1;   /* go through void* so the compiler does not assume strict-aliasing */
+    const HUFv07_DEltX4* const dt = (const HUFv07_DEltX4*)cells;
+    DTableDesc const desc = HUFv07_getDTableDesc(DTable);
+    BITv07_DStream_t bitD;
+
+    /* bitstream setup */
+    size_t const initResult = BITv07_initDStream(&bitD, cSrc, cSrcSize);
+    if (HUFv07_isError(initResult)) return initResult;
+
+    /* decoding */
+    HUFv07_decodeStreamX4(out, &bitD, out + dstSize, dt, desc.tableLog);
+
+    /* the bitstream must be exactly exhausted, otherwise input is corrupted */
+    return BITv07_endOfDStream(&bitD) ? dstSize : ERROR(corruption_detected);
+}
+
+/* HUFv07_decompress1X4_usingDTable() :
+ * Public entry point for single-stream decoding with a double-symbol table.
+ * Refuses a `DTable` whose descriptor does not carry tableType 1.
+ * @return : dstSize, or an error code */
+size_t HUFv07_decompress1X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    if (HUFv07_getDTableDesc(DTable).tableType != 1)
+        return ERROR(GENERIC);   /* not a double-symbol table */
+
+    return HUFv07_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+/* HUFv07_decompress1X4_DCtx() :
+ * Reads the table description at the start of `cSrc` into `DCtx`,
+ * then decodes the remaining bytes as a single Huffman stream.
+ * @return : dstSize, or an error code */
+size_t HUFv07_decompress1X4_DCtx (HUFv07_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    size_t const headerSize = HUFv07_readDTableX4 (DCtx, cSrc, cSrcSize);
+
+    if (HUFv07_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing left to decode after the header */
+
+    return HUFv07_decompress1X4_usingDTable_internal (dst, dstSize,
+                                                      (const BYTE*)cSrc + headerSize, cSrcSize - headerSize,
+                                                      DCtx);
+}
+
+/* HUFv07_decompress1X4() :
+ * Convenience wrapper around HUFv07_decompress1X4_DCtx() which provides its own
+ * table, declared through HUFv07_CREATE_STATIC_DTABLEX4 for HUFv07_TABLELOG_MAX.
+ * @return : result of HUFv07_decompress1X4_DCtx() */
+size_t HUFv07_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv07_CREATE_STATIC_DTABLEX4(DTable, HUFv07_TABLELOG_MAX);
+    return HUFv07_decompress1X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+/* HUFv07_decompress4X4_usingDTable_internal() :
+ * Decodes 4 Huffman bitstreams into exactly dstSize bytes of `dst`, using an
+ * already built double-symbol `DTable`.
+ * Input layout : a 6-byte jump table (three little-endian 16-bit sizes, for
+ * streams 1 to 3; the size of stream 4 is deduced from cSrcSize), followed by
+ * the 4 streams. Output is split into 4 segments of (dstSize+3)/4 bytes, the
+ * last one receiving whatever remains.
+ * @return : dstSize, or an error code (corrupted or inconsistent input). */
+static size_t HUFv07_decompress4X4_usingDTable_internal(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+    /* The first 3 segments must fit within dst, otherwise streams 2-4 would be
+     * decoded past the end of the destination buffer (happens for dstSize 1, 2 and 5). */
+    if (((dstSize+3) / 4) * 3 > dstSize) return ERROR(corruption_detected);
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;
+        const HUFv07_DEltX4* const dt = (const HUFv07_DEltX4*)dtPtr;
+
+        /* Init */
+        BITv07_DStream_t bitD1;
+        BITv07_DStream_t bitD2;
+        BITv07_DStream_t bitD3;
+        BITv07_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);   /* wraps around when the jump table is inconsistent; checked below */
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        size_t const segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        DTableDesc const dtd = HUFv07_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        { size_t const errorCode = BITv07_initDStream(&bitD1, istart1, length1);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BITv07_initDStream(&bitD2, istart2, length2);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BITv07_initDStream(&bitD3, istart3, length3);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BITv07_initDStream(&bitD4, istart4, length4);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        /* streams are interleaved; only op4 is bounds-checked here, op1-op3 are verified after the loop */
+        endSignal = BITv07_reloadDStream(&bitD1) | BITv07_reloadDStream(&bitD2) | BITv07_reloadDStream(&bitD3) | BITv07_reloadDStream(&bitD4);
+        for ( ; (endSignal==BITv07_DStream_unfinished) && (op4<(oend-7)) ; ) {
+            HUFv07_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUFv07_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUFv07_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUFv07_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BITv07_reloadDStream(&bitD1) | BITv07_reloadDStream(&bitD2) | BITv07_reloadDStream(&bitD3) | BITv07_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUFv07_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUFv07_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUFv07_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUFv07_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        /* every stream must be exactly exhausted */
+        { U32 const endCheck = BITv07_endOfDStream(&bitD1) & BITv07_endOfDStream(&bitD2) & BITv07_endOfDStream(&bitD3) & BITv07_endOfDStream(&bitD4);
+          if (!endCheck) return ERROR(corruption_detected); }
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+/* HUFv07_decompress4X4_usingDTable() :
+ * Public entry point for 4-streams decoding with a double-symbol table.
+ * Refuses a `DTable` whose descriptor does not carry tableType 1.
+ * @return : dstSize, or an error code */
+size_t HUFv07_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    if (HUFv07_getDTableDesc(DTable).tableType != 1)
+        return ERROR(GENERIC);   /* not a double-symbol table */
+
+    return HUFv07_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+
+/* HUFv07_decompress4X4_DCtx() :
+ * Reads the table description at the start of `cSrc` into `dctx`,
+ * then decodes the remaining bytes as 4 Huffman streams.
+ * @return : dstSize, or an error code */
+size_t HUFv07_decompress4X4_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    size_t const headerSize = HUFv07_readDTableX4 (dctx, cSrc, cSrcSize);
+
+    if (HUFv07_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing left to decode after the header */
+
+    return HUFv07_decompress4X4_usingDTable_internal(dst, dstSize,
+                                                     (const BYTE*)cSrc + headerSize, cSrcSize - headerSize,
+                                                     dctx);
+}
+
+/* HUFv07_decompress4X4() :
+ * Convenience wrapper around HUFv07_decompress4X4_DCtx() which provides its own
+ * table, declared through HUFv07_CREATE_STATIC_DTABLEX4 for HUFv07_TABLELOG_MAX.
+ * @return : result of HUFv07_decompress4X4_DCtx() */
+size_t HUFv07_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv07_CREATE_STATIC_DTABLEX4(DTable, HUFv07_TABLELOG_MAX);
+    return HUFv07_decompress4X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+/* ********************************/
+/* Generic decompression selector */
+/* ********************************/
+
+/* HUFv07_decompress1X_usingDTable() :
+ * Single-stream decoding; the decoder variant is selected from the tableType
+ * recorded in the DTable descriptor (0 : X2 decoder, non-zero : X4 decoder). */
+size_t HUFv07_decompress1X_usingDTable(void* dst, size_t maxDstSize,
+                                    const void* cSrc, size_t cSrcSize,
+                                    const HUFv07_DTable* DTable)
+{
+    if (HUFv07_getDTableDesc(DTable).tableType)
+        return HUFv07_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+
+    return HUFv07_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+}
+
+/* HUFv07_decompress4X_usingDTable() :
+ * 4-streams decoding; the decoder variant is selected from the tableType
+ * recorded in the DTable descriptor (0 : X2 decoder, non-zero : X4 decoder). */
+size_t HUFv07_decompress4X_usingDTable(void* dst, size_t maxDstSize,
+                                    const void* cSrc, size_t cSrcSize,
+                                    const HUFv07_DTable* DTable)
+{
+    if (HUFv07_getDTableDesc(DTable).tableType)
+        return HUFv07_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+
+    return HUFv07_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+}
+
+
+/* algo_time_t :
+ * cost model of a decoder : a fixed cost (tableTime) plus a cost per
+ * 256 bytes of output (decode256Time). Units are not documented here. */
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+/* algoTime[Q][algo] :
+ * pre-determined costs, indexed by quantized compression ratio Q
+ * (see HUFv07_selectDecoder() : Q = cSrcSize*16/dstSize) and decoder variant.
+ * Only columns 0 (single) and 1 (double) are read by HUFv07_selectDecoder(). */
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+/** HUFv07_selectDecoder() :
+*   Tells which decoder is likely to decode faster,
+*   based on a set of pre-determined metrics.
+*   @return : 0==HUFv07_decompress4X2, 1==HUFv07_decompress4X4 .
+*   Expected usage : 0 < cSrcSize < dstSize <= 128 KB.
+*   Inputs with cSrcSize >= dstSize (including dstSize == 0) are tolerated :
+*   they are evaluated with the last quantization row, instead of dividing
+*   by zero or reading beyond the end of algoTime[]. */
+U32 HUFv07_selectDecoder (size_t dstSize, size_t cSrcSize)
+{
+    /* decoder timing evaluation */
+    U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 */
+    U32 const D256 = (U32)(dstSize >> 8);   /* output size, in units of 256 bytes */
+    U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
+    U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
+    DTime1 += DTime1 >> 3;  /* advantage to algorithm using less memory, for cache eviction */
+
+    return DTime1 < DTime0;
+}
+
+
+/* decompressionAlgo : common signature of the self-contained decoders dispatched by HUFv07_decompress() */
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+/* HUFv07_decompress() :
+ * Decodes `cSrc` into exactly dstSize bytes of `dst`.
+ * Special cases handled before any Huffman decoding :
+ *   - cSrcSize == dstSize : input is copied as is (not compressed)
+ *   - cSrcSize == 1       : the single input byte is repeated dstSize times (RLE)
+ * Otherwise, the 4-streams decoder designated by HUFv07_selectDecoder() is used.
+ * @return : dstSize, or an error code */
+size_t HUFv07_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* indexed by the result of HUFv07_selectDecoder() */
+    static const decompressionAlgo decompress[2] = { HUFv07_decompress4X2, HUFv07_decompress4X4 };
+
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUFv07_selectDecoder(dstSize, cSrcSize);
+        return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
+    }
+
+    //return HUFv07_decompress4X2(dst, dstSize, cSrc, cSrcSize);   /* multi-streams single-symbol decoding */
+    //return HUFv07_decompress4X4(dst, dstSize, cSrc, cSrcSize);   /* multi-streams double-symbols decoding */
+}
+
+/* HUFv07_decompress4X_DCtx() :
+ * Same as HUFv07_decompress(), but the decoding table is built inside the
+ * caller-provided `dctx`. Raw (cSrcSize == dstSize) and RLE (cSrcSize == 1)
+ * inputs are handled without any Huffman decoding.
+ * @return : dstSize, or an error code */
+size_t HUFv07_decompress4X_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+
+    if (cSrcSize == dstSize) {   /* not compressed */
+        memcpy(dst, cSrc, dstSize);
+        return dstSize;
+    }
+    if (cSrcSize == 1) {   /* RLE */
+        memset(dst, *(const BYTE*)cSrc, dstSize);
+        return dstSize;
+    }
+
+    /* pick the decoder expected to be the fastest */
+    if (HUFv07_selectDecoder(dstSize, cSrcSize))
+        return HUFv07_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+
+    return HUFv07_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+}
+
+/* HUFv07_decompress4X_hufOnly() :
+ * Variant of HUFv07_decompress4X_DCtx() which accepts Huffman-compressed input only :
+ * raw (cSrcSize >= dstSize) and RLE-sized (cSrcSize <= 1) inputs are rejected as corrupted.
+ * @return : dstSize, or an error code */
+size_t HUFv07_decompress4X_hufOnly (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if ((cSrcSize >= dstSize) || (cSrcSize <= 1)) return ERROR(corruption_detected);   /* invalid */
+
+    /* pick the decoder expected to be the fastest */
+    if (HUFv07_selectDecoder(dstSize, cSrcSize))
+        return HUFv07_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+
+    return HUFv07_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+}
+
+/* HUFv07_decompress1X_DCtx() :
+ * Single-stream counterpart of HUFv07_decompress4X_DCtx() : raw and RLE inputs
+ * are handled directly, otherwise the single-stream decoder designated by
+ * HUFv07_selectDecoder() is used, building its table inside `dctx`.
+ * @return : dstSize, or an error code */
+size_t HUFv07_decompress1X_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+
+    if (cSrcSize == dstSize) {   /* not compressed */
+        memcpy(dst, cSrc, dstSize);
+        return dstSize;
+    }
+    if (cSrcSize == 1) {   /* RLE */
+        memset(dst, *(const BYTE*)cSrc, dstSize);
+        return dstSize;
+    }
+
+    /* pick the decoder expected to be the fastest */
+    if (HUFv07_selectDecoder(dstSize, cSrcSize))
+        return HUFv07_decompress1X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+
+    return HUFv07_decompress1X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+}
+/*
+    Common functions of Zstd compression library
+    Copyright (C) 2015-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net/
+*/
+
+
+
+/*-****************************************
+*  ZSTD Error Management
+******************************************/
+/*! ZSTDv07_isError() :
+*   tells if a function result is an error code (forwards to ERR_isError()) */
+unsigned ZSTDv07_isError(size_t code)
+{
+    return ERR_isError(code);
+}
+
+/*! ZSTDv07_getErrorName() :
+*   provides the error string matching a function result (useful for debugging);
+*   forwards to ERR_getErrorName() */
+const char* ZSTDv07_getErrorName(size_t code)
+{
+    return ERR_getErrorName(code);
+}
+
+
+
+/* **************************************************************
+*  ZBUFF Error Management
+****************************************************************/
+/*! ZBUFFv07_isError() :
+*   tells if a function result is an error code (forwards to ERR_isError()) */
+unsigned ZBUFFv07_isError(size_t errorCode) { return ERR_isError(errorCode); }
+
+/*! ZBUFFv07_getErrorName() :
+*   provides the error string matching a function result (forwards to ERR_getErrorName()) */
+const char* ZBUFFv07_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
+
+
+
+/* ZSTDv07_defaultAllocFunction() :
+ * default allocator : plain malloc(). `opaque` is accepted to match the
+ * custom allocator signature, and is ignored.
+ * @return : the pointer returned by malloc() (NULL on failure) */
+void* ZSTDv07_defaultAllocFunction(void* opaque, size_t size)
+{
+    (void)opaque;
+    return malloc(size);
+}
+
+/* ZSTDv07_defaultFreeFunction() :
+ * default deallocator : plain free(). `opaque` is accepted to match the
+ * custom allocator signature, and is ignored. */
+void ZSTDv07_defaultFreeFunction(void* opaque, void* address)
+{
+    free(address);
+    (void)opaque;
+}
+/*
+    zstd_internal - common functions to include
+    Header File for include
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : https://www.zstd.net
+*/
+#ifndef ZSTDv07_CCOMMON_H_MODULE
+#define ZSTDv07_CCOMMON_H_MODULE
+
+
+/*-*************************************
+*  Common macros
+***************************************/
+/* MIN() / MAX() : caution, the selected argument is evaluated twice; do not pass expressions with side effects */
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
+
+/*-*************************************
+*  Common constants
+***************************************/
+#define ZSTDv07_OPT_NUM    (1<<12)
+#define ZSTDv07_DICT_MAGIC  0xEC30A437   /* v0.7 */
+
+/* repeat-offset history : ZSTDv07_REP_NUM entries, initial values in repStartValue[] */
+#define ZSTDv07_REP_NUM    3
+#define ZSTDv07_REP_INIT   ZSTDv07_REP_NUM
+#define ZSTDv07_REP_MOVE   (ZSTDv07_REP_NUM-1)
+static const U32 repStartValue[ZSTDv07_REP_NUM] = { 1, 4, 8 };
+
+/* size suffixes, meant to be written after a number : `128 KB` expands to `128 *(1 <<10)` */
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+/* single-bit masks (BITn == 1<<n) */
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define ZSTDv07_WINDOWLOG_ABSOLUTEMIN 10
+/* field sizes in bytes, selected by a 2-bit code (index 0-3);
+ * presumably the frame-content-size and dictionary-ID fields of the frame header (going by the names) */
+static const size_t ZSTDv07_fcs_fieldSize[4] = { 0, 2, 4, 8 };
+static const size_t ZSTDv07_did_fieldSize[4] = { 0, 1, 2, 4 };
+
+#define ZSTDv07_BLOCKHEADERSIZE 3   /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
+static const size_t ZSTDv07_blockHeaderSize = ZSTDv07_BLOCKHEADERSIZE;
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+
+#define HufLog 12
+typedef enum { lbt_huffman, lbt_repeat, lbt_raw, lbt_rle } litBlockType_t;
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+#define EQUAL_READ32 4
+
+/* largest code values for literals (MaxLit), match lengths (MaxML),
+ * literal lengths (MaxLL) and offsets (MaxOff); they size the tables declared further down */
+#define Litbits  8
+#define MaxLit ((1<<Litbits) - 1)
+#define MaxML  52
+#define MaxLL  35
+#define MaxOff 28
+#define MaxSeq MAX(MaxLL, MaxML)   /* Assumption : MaxOff < MaxLL,MaxML */
+/* table logs, presumably the maximum FSE table log per symbol type (going by the names) */
+#define MLFSELog    9
+#define LLFSELog    9
+#define OffFSELog   8
+
+/* 2-bit encoding type values */
+#define FSEv07_ENCODING_RAW     0
+#define FSEv07_ENCODING_RLE     1
+#define FSEv07_ENCODING_STATIC  2
+#define FSEv07_ENCODING_DYNAMIC 3
+
+/* Per-code tables, indexed by code value (MaxLL+1, MaxML+1 and MaxOff+1 entries).
+ * NOTE(review) : xx_bits[] looks like the number of extra bits attached to each
+ * length code, and xx_defaultNorm[] / xx_defaultNormLog like the predefined
+ * normalized distributions with their table log - confirm against the users of these tables. */
+static const U32 LL_bits[MaxLL+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9,10,11,12,
+                                     13,14,15,16 };
+static const S16 LL_defaultNorm[MaxLL+1] = { 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
+                                             2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
+                                            -1,-1,-1,-1 };
+static const U32 LL_defaultNormLog = 6;
+
+static const U32 ML_bits[MaxML+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9,10,11,
+                                     12,13,14,15,16 };
+static const S16 ML_defaultNorm[MaxML+1] = { 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,
+                                            -1,-1,-1,-1,-1 };
+static const U32 ML_defaultNormLog = 6;
+
+static const S16 OF_defaultNorm[MaxOff+1] = { 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+                                              1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1 };
+static const U32 OF_defaultNormLog = 5;
+
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+/* ZSTDv07_copy8() : copies exactly 8 bytes from `src` to `dst` */
+static void ZSTDv07_copy8(void* dst, const void* src)
+{
+    memcpy(dst, src, 8);
+}
+/* COPY8() : copies 8 bytes, then advances both pointers by 8 */
+#define COPY8(d,s) { ZSTDv07_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTDv07_wildcopy() :
+*   memcpy() variant working by steps of 8 bytes : it always copies at least
+*   8 bytes, and can copy up to 7 bytes too many (8 bytes if length==0).
+*   Both buffers must therefore provide WILDCOPY_OVERLENGTH bytes of margin. */
+#define WILDCOPY_OVERLENGTH 8
+MEM_STATIC void ZSTDv07_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    BYTE* out = (BYTE*)dst;
+    const BYTE* in = (const BYTE*)src;
+    BYTE* const outLimit = out + length;
+
+    do {
+        ZSTDv07_copy8(out, in);
+        out += 8;
+        in += 8;
+    } while (out < outLimit);
+}
+
+
+/*-*******************************************
+*  Private interfaces
+*********************************************/
+/* forward declaration; the structure itself is defined below */
+typedef struct ZSTDv07_stats_s ZSTDv07_stats_t;
+
+/* ZSTDv07_match_t : an (offset, length) pair */
+typedef struct {
+    U32 off;
+    U32 len;
+} ZSTDv07_match_t;
+
+/* ZSTDv07_optimal_t : entry type of seqStore_t.priceTable;
+ * carries a price, an offset, lengths and ZSTDv07_REP_INIT repeat values */
+typedef struct {
+    U32 price;
+    U32 off;
+    U32 mlen;
+    U32 litlen;
+    U32 rep[ZSTDv07_REP_INIT];
+} ZSTDv07_optimal_t;
+
+/* placeholder : holds a single unused field */
+struct ZSTDv07_stats_s { U32 unused; };
+
+/* seqStore_t :
+ * storage for sequences (offsets, literals, literal lengths, match lengths and
+ * their codes). Most buffers come as a `xxxStart` pointer paired with a `xxx`
+ * pointer - presumably base and current write position (TODO confirm against users).
+ * Fields after the `opt` marker are statistics and caches grouped under that name. */
+typedef struct {
+    void* buffer;
+    U32*  offsetStart;
+    U32*  offset;
+    BYTE* offCodeStart;
+    BYTE* litStart;
+    BYTE* lit;
+    U16*  litLengthStart;
+    U16*  litLength;
+    BYTE* llCodeStart;
+    U16*  matchLengthStart;
+    U16*  matchLength;
+    BYTE* mlCodeStart;
+    U32   longLengthID;   /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
+    U32   longLengthPos;
+    /* opt */
+    ZSTDv07_optimal_t* priceTable;
+    ZSTDv07_match_t* matchTable;
+    U32* matchLengthFreq;
+    U32* litLengthFreq;
+    U32* litFreq;
+    U32* offCodeFreq;
+    U32  matchLengthSum;
+    U32  matchSum;
+    U32  litLengthSum;
+    U32  litSum;
+    U32  offCodeSum;
+    U32  log2matchLengthSum;
+    U32  log2matchSum;
+    U32  log2litLengthSum;
+    U32  log2litSum;
+    U32  log2offCodeSum;
+    U32  factor;
+    U32  cachedPrice;
+    U32  cachedLitLength;
+    const BYTE* cachedLiterals;
+    ZSTDv07_stats_t stats;
+} seqStore_t;
+
+/* ZSTDv07_seqToCodes() : declared here, defined in another module; operates on the nbSeq sequences of `seqStorePtr` */
+void ZSTDv07_seqToCodes(const seqStore_t* seqStorePtr, size_t const nbSeq);
+
+/* custom memory allocation functions */
+/* default set : malloc()/free() based functions, with a NULL third member (no opaque state) */
+static const ZSTDv07_customMem defaultCustomMem = { ZSTDv07_defaultAllocFunction, ZSTDv07_defaultFreeFunction, NULL };
+
+#endif   /* ZSTDv07_CCOMMON_H_MODULE */
+/*
+    zstd - standard compression library
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net
+*/
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Select how default decompression function ZSTDv07_decompress() will allocate memory,
+ * on the stack (0), or on the heap (1, requires malloc())
+ */
+#ifndef ZSTDv07_HEAPMODE
+#  define ZSTDv07_HEAPMODE 1
+#endif
+
+
+/*-*******************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#  pragma warning(disable : 4100)        /* disable: C4100: unreferenced formal parameter */
+#endif
+
+
+/*-*************************************
+*  Macros
+***************************************/
+#define ZSTDv07_isError ERR_isError   /* for inlining */
+#define FSEv07_isError  ERR_isError
+#define HUFv07_isError  ERR_isError
+
+
+/*_*******************************************************
+*  Memory operations
+**********************************************************/
+static void ZSTDv07_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }   /* copies exactly 4 bytes from src to dst */
+
+
+/*-*************************************************************
+*   Context management
+***************************************************************/
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock,
+               ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTDv07_dStage;   /* values held by ZSTDv07_DCtx_s::stage; ZSTDv07_decompressBegin() starts at ZSTDds_getFrameHeaderSize */
+
+struct ZSTDv07_DCtx_s
+{
+    FSEv07_DTable LLTable[FSEv07_DTABLE_SIZE_U32(LLFSELog)];    /* FSE decoding table, literal lengths */
+    FSEv07_DTable OffTable[FSEv07_DTABLE_SIZE_U32(OffFSELog)];  /* FSE decoding table, offsets */
+    FSEv07_DTable MLTable[FSEv07_DTABLE_SIZE_U32(MLFSELog)];    /* FSE decoding table, match lengths */
+    HUFv07_DTable hufTable[HUFv07_DTABLE_SIZE(HufLog)];  /* can accommodate HUFv07_decompress4X */
+    const void* previousDstEnd;   /* the four history pointers below are reset to NULL by ZSTDv07_decompressBegin() */
+    const void* base;             /* passed to ZSTDv07_execSequence() as `base` : offsets reaching before it are served from the dictionary segment */
+    const void* vBase;            /* passed as `vBase` : offsets reaching before it are rejected as corruption */
+    const void* dictEnd;          /* passed as `dictEnd` : end of the dictionary segment */
+    size_t expected;              /* ZSTDv07_decompressBegin() sets it to ZSTDv07_frameHeaderSize_min */
+    U32 rep[3];                   /* repeat offsets : loaded from repStartValue at begin, then saved from seqState.prevOffset after each block with sequences */
+    ZSTDv07_frameParams fParams;  /* filled by ZSTDv07_decodeFrameHeader() */
+    blockType_t bType;   /* used in ZSTDv07_decompressContinue(), to transfer blockType between header decoding and block decoding stages */
+    ZSTDv07_dStage stage;
+    U32 litEntropy;               /* set to 1 once a Huffman literals block was decoded; lbt_repeat blocks require it */
+    U32 fseEntropy;               /* set to 1 once a block with sequences was decoded; handed to ZSTDv07_decodeSeqHeaders() as flagRepeatTable */
+    XXH64_state_t xxhState;       /* reset by ZSTDv07_decodeFrameHeader() when the frame sets checksumFlag */
+    size_t headerSize;
+    U32 dictID;                   /* compared against the frame's dictID by ZSTDv07_decodeFrameHeader() */
+    const BYTE* litPtr;           /* literals of the current block : either litBuffer, or a direct reference into the compressed input */
+    ZSTDv07_customMem customMem;  /* allocator pair stored at creation; ZSTDv07_freeDCtx() frees through it */
+    size_t litSize;               /* nb of literals available at litPtr */
+    BYTE litBuffer[ZSTDv07_BLOCKSIZE_ABSOLUTEMAX + WILDCOPY_OVERLENGTH];
+    BYTE headerBuffer[ZSTDv07_FRAMEHEADERSIZE_MAX];
+};  /* typedef'd to ZSTDv07_DCtx within "zstd_static.h" */
+
+int ZSTDv07_isSkipFrame(ZSTDv07_DCtx* dctx);
+
+size_t ZSTDv07_sizeofDCtx (const ZSTDv07_DCtx* dctx) { (void)dctx; return sizeof(ZSTDv07_DCtx); }   /* a DCtx owns no extra allocation : its size is the struct size */
+
+size_t ZSTDv07_estimateDCtxSize(void) { return sizeof(struct ZSTDv07_DCtx_s); }   /* memory needed for one decompression context */
+
+size_t ZSTDv07_decompressBegin(ZSTDv07_DCtx* dctx)
+{   /* Reset `dctx` to its start-of-frame state (stage, history pointers, entropy flags, dictID, repcodes). @return : always 0 */
+    int repIdx;
+    /* the first thing expected from the input is the minimal frame header */
+    dctx->stage = ZSTDds_getFrameHeaderSize;
+    dctx->expected = ZSTDv07_frameHeaderSize_min;
+    /* no history, no dictionary */
+    dctx->previousDstEnd = dctx->base = dctx->vBase = dctx->dictEnd = NULL;
+    dctx->hufTable[0] = (HUFv07_DTable)((HufLog)*0x1000001);
+    dctx->fseEntropy = 0; dctx->litEntropy = 0;   /* no entropy table can be re-used yet */
+    dctx->dictID = 0;
+    for (repIdx=0; repIdx<ZSTDv07_REP_NUM; repIdx++) dctx->rep[repIdx] = repStartValue[repIdx];
+    return 0;
+}
+
+ZSTDv07_DCtx* ZSTDv07_createDCtx_advanced(ZSTDv07_customMem customMem)
+{   /* Allocates a DCtx through `customMem` and resets it. @return : NULL on half-specified allocator or failed allocation */
+    ZSTDv07_DCtx* newCtx;
+
+    /* neither callback provided : fall back on the default allocator */
+    if ((customMem.customAlloc == NULL) && (customMem.customFree == NULL)) customMem = defaultCustomMem;
+
+    /* alloc and free must come as a pair */
+    if ((customMem.customAlloc == NULL) || (customMem.customFree == NULL)) return NULL;
+
+    newCtx = (ZSTDv07_DCtx*) customMem.customAlloc(customMem.opaque, sizeof(*newCtx));
+    if (newCtx == NULL) return NULL;
+    newCtx->customMem = customMem;   /* remembered so that ZSTDv07_freeDCtx() releases through the matching callback */
+    ZSTDv07_decompressBegin(newCtx);
+    return newCtx;
+}
+
+ZSTDv07_DCtx* ZSTDv07_createDCtx(void)
+{   /* convenience constructor : ZSTDv07_createDCtx_advanced() with defaultCustomMem; @return : NULL if that call fails */
+    return ZSTDv07_createDCtx_advanced(defaultCustomMem);
+}
+
+size_t ZSTDv07_freeDCtx(ZSTDv07_DCtx* dctx)
+{   /* Releases `dctx` through the customFree callback stored inside it. @return : currently always 0 */
+    if (dctx != NULL)   /* freeing NULL is a supported no-op */
+        dctx->customMem.customFree(dctx->customMem.opaque, dctx);
+    return 0;   /* reserved as a potential error code in the future */
+}
+
+void ZSTDv07_copyDCtx(ZSTDv07_DCtx* dstDCtx, const ZSTDv07_DCtx* srcDCtx)
+{   /* Duplicates the decoding state of `srcDCtx` into `dstDCtx`, leaving out the tail of the struct (presumably litBuffer and headerBuffer). */
+    size_t const workspaceSize = ZSTDv07_BLOCKSIZE_ABSOLUTEMAX+WILDCOPY_OVERLENGTH + ZSTDv07_frameHeaderSize_max;   /* no need to copy workspace */
+    memcpy(dstDCtx, srcDCtx, sizeof(ZSTDv07_DCtx) - workspaceSize);
+}
+
+
+/*-*************************************************************
+*   Decompression section
+***************************************************************/
+
+/* Frame format description
+   Frame Header -  [ Block Header - Block ] - Frame End
+   1) Frame Header
+      - 4 bytes - Magic Number : ZSTDv07_MAGICNUMBER (defined within zstd.h)
+      - 1 byte  - Frame Descriptor
+   2) Block Header
+      - 3 bytes, starting with a 2-bits descriptor
+                 Uncompressed, Compressed, Frame End, unused
+   3) Block
+      See Block Format Description
+   4) Frame End
+      - 3 bytes, compatible with Block Header
+*/
+
+
+/* Frame Header :
+
+   1 byte - FrameHeaderDescription :
+   bit 0-1 : dictID (0, 1, 2 or 4 bytes)
+   bit 2   : checksumFlag
+   bit 3   : reserved (must be zero)
+   bit 4   : reserved (unused, can be any value)
+   bit 5   : Single Segment (if 1, WindowLog byte is not present)
+   bit 6-7 : FrameContentFieldSize (0, 2, 4, or 8)
+             if (SkippedWindowLog && !FrameContentFieldsize) FrameContentFieldsize=1;
+
+   Optional : WindowLog (0 or 1 byte)
+   bit 0-2 : octal Fractional (1/8th)
+   bit 3-7 : Power of 2, with 0 = 1 KB (up to 2 TB)
+
+   Optional : dictID (0, 1, 2 or 4 bytes)
+   Automatic adaptation
+   0 : no dictID
+   1 : 1 - 255
+   2 : 256 - 65535
+   4 : all other values
+
+   Optional : content size (0, 1, 2, 4 or 8 bytes)
+   0 : unknown          (fcfs==0 and swl==0)
+   1 : 0-255 bytes      (fcfs==0 and swl==1)
+   2 : 256 - 65535+256  (fcfs==1)
+   4 : 0 - 4GB-1        (fcfs==2)
+   8 : 0 - 16EB-1       (fcfs==3)
+*/
+
+
+/* Compressed Block, format description
+
+   Block = Literal Section - Sequences Section
+   Prerequisite : size of (compressed) block, maximum size of regenerated data
+
+   1) Literal Section
+
+   1.1) Header : 1-5 bytes
+        flags: 2 bits
+            00 compressed by Huff0
+            01 unused
+            10 is Raw (uncompressed)
+            11 is Rle
+            Note : using 01 => Huff0 with precomputed table ?
+            Note : delta map ? => compressed ?
+
+   1.1.1) Huff0-compressed literal block : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => 4 streams
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+   1.1.2) Raw (uncompressed) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RAW<<6) + (0<<4) + size
+               12 bits: (IS_RAW<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RAW<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.3) Rle (repeated single byte) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RLE<<6) + (0<<4) + size
+               12 bits: (IS_RLE<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RLE<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.4) Huff0-compressed literal block, using precomputed CTables : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10)
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+        1- CTable available (stored into workspace ?)
+        2- Small input (fast heuristic ? Full comparison ? depend on clevel ?)
+
+
+   1.2) Literal block content
+
+   1.2.1) Huff0 block, using sizes from header
+        See Huff0 format
+
+   1.2.2) Huff0 block, using prepared table
+
+   1.2.3) Raw content
+
+   1.2.4) single byte
+
+
+   2) Sequences section
+      TO DO
+*/
+
+/** ZSTDv07_frameHeaderSize() :
+*   srcSize must be >= ZSTDv07_frameHeaderSize_min.
+*   @return : size of the Frame Header */
+static size_t ZSTDv07_frameHeaderSize(const void* src, size_t srcSize)
+{
+    if (srcSize < ZSTDv07_frameHeaderSize_min) return ERROR(srcSize_wrong);
+    {   BYTE const descriptor = ((const BYTE*)src)[4];       /* frame header descriptor : the byte right after the 4-byte magic number */
+        U32 const singleSegment = (descriptor >> 5) & 1;      /* when set, there is no WindowLog byte */
+        size_t const dictIDSize = ZSTDv07_did_fieldSize[descriptor & 3];
+        size_t const fcsSize = ZSTDv07_fcs_fieldSize[descriptor >> 6];
+        size_t const implicitFcsByte = (singleSegment && !fcsSize);   /* single segment with a zero-size entry still stores 1 byte of content size */
+        return ZSTDv07_frameHeaderSize_min + !singleSegment + dictIDSize + fcsSize + implicitFcsByte;
+    }
+}
+
+
+/** ZSTDv07_getFrameParams() :
+*   decode Frame Header, or require larger `srcSize`.
+*   @return : 0, `fparamsPtr` is correctly filled,
+*            >0, `srcSize` is too small, result is expected `srcSize`,
+*             or an error code, which can be tested using ZSTDv07_isError() */
+size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    if (srcSize < ZSTDv07_frameHeaderSize_min) return ZSTDv07_frameHeaderSize_min;
+    if (MEM_readLE32(src) != ZSTDv07_MAGICNUMBER) {
+        if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTDv07_MAGIC_SKIPPABLE_START) {
+            if (srcSize < ZSTDv07_skippableHeaderSize) return ZSTDv07_skippableHeaderSize; /* magic number + skippable frame length */
+            memset(fparamsPtr, 0, sizeof(*fparamsPtr));
+            fparamsPtr->frameContentSize = MEM_readLE32((const char *)src + 4);
+            fparamsPtr->windowSize = 0; /* windowSize==0 means a frame is skippable */
+            return 0;
+        }
+        return ERROR(prefix_unknown);
+    }
+
+    /* ensure there is enough `srcSize` to fully read/decode frame header */
+    { size_t const fhsize = ZSTDv07_frameHeaderSize(src, srcSize); if (srcSize < fhsize) return fhsize; }
+
+    {   BYTE const fhdByte = ip[4];
+        size_t pos = 5;
+        U32 const dictIDSizeCode = fhdByte&3;
+        U32 const checksumFlag = (fhdByte>>2)&1;
+        U32 const directMode = (fhdByte>>5)&1;   /* single segment : no WindowLog byte, window derives from content size */
+        U32 const fcsID = fhdByte>>6;
+        U32 const windowSizeMax = 1U << ZSTDv07_WINDOWLOG_MAX;
+        U32 windowSize = 0;
+        U32 dictID = 0;
+        U64 frameContentSize = 0;
+        if ((fhdByte & 0x08) != 0) return ERROR(frameParameter_unsupported);   /* reserved bits, which must be zero */
+        if (!directMode) {
+            BYTE const wlByte = ip[pos++];
+            U32 const windowLog = (wlByte >> 3) + ZSTDv07_WINDOWLOG_ABSOLUTEMIN;
+            if (windowLog > ZSTDv07_WINDOWLOG_MAX) return ERROR(frameParameter_unsupported);
+            windowSize = (1U << windowLog);
+            windowSize += (windowSize >> 3) * (wlByte&7);   /* low 3 bits : fractional part, in 1/8th of the power of 2 */
+        }
+
+        switch(dictIDSizeCode)
+        {
+            default:   /* impossible */
+            case 0 : break;
+            case 1 : dictID = ip[pos]; pos++; break;
+            case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break;
+            case 3 : dictID = MEM_readLE32(ip+pos); pos+=4; break;
+        }
+        switch(fcsID)
+        {
+            default:   /* impossible */
+            case 0 : if (directMode) frameContentSize = ip[pos]; break;
+            case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break;
+            case 2 : frameContentSize = MEM_readLE32(ip+pos); break;
+            case 3 : frameContentSize = MEM_readLE64(ip+pos); break;
+        }
+        if (!windowSize) {   /* no explicit window : it equals the content size; test the 64-bit value, the U32 cast below would hide sizes >= 4 GB */
+            if (frameContentSize > windowSizeMax) return ERROR(frameParameter_unsupported);
+            windowSize = (U32)frameContentSize;
+        }
+        if (windowSize > windowSizeMax) return ERROR(frameParameter_unsupported);   /* still needed : the fractional part can exceed the max */
+        fparamsPtr->frameContentSize = frameContentSize;
+        fparamsPtr->windowSize = windowSize;
+        fparamsPtr->dictID = dictID;
+        fparamsPtr->checksumFlag = checksumFlag;
+    }
+    return 0;
+}
+
+
+/** ZSTDv07_getDecompressedSize() :
+*   compatible with legacy mode
+*   @return : decompressed size if known, 0 otherwise
+              note : 0 can mean any of the following :
+                   - decompressed size is not provided within frame header
+                   - frame header unknown / not supported
+                   - frame header not completely provided (`srcSize` too small) */
+unsigned long long ZSTDv07_getDecompressedSize(const void* src, size_t srcSize)
+{
+    ZSTDv07_frameParams header;
+    /* any non-zero result (error code, or request for more input) means the size cannot be determined */
+    if (ZSTDv07_getFrameParams(&header, src, srcSize) != 0)
+        return 0;
+    return header.frameContentSize;
+}
+
+
+/** ZSTDv07_decodeFrameHeader() :
+*   `srcSize` must be the size provided by ZSTDv07_frameHeaderSize().
+*   @return : 0 if success, or an error code, which can be tested using ZSTDv07_isError() */
+static size_t ZSTDv07_decodeFrameHeader(ZSTDv07_DCtx* dctx, const void* src, size_t srcSize)
+{   size_t const result = ZSTDv07_getFrameParams(&(dctx->fParams), src, srcSize);
+    /* non-zero : error code, or header incomplete. In both cases fParams was not filled, so it must not be inspected. */
+    if (result != 0) return result;
+    if (dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID)) return ERROR(dictionary_wrong);   /* frame names a dictionary other than the loaded one */
+    if (dctx->fParams.checksumFlag) XXH64_reset(&dctx->xxhState, 0);   /* frame carries a checksum : restart the running hash */
+    return result;
+}
+
+
+typedef struct
+{
+    blockType_t blockType;   /* top 2 bits of the block header, as decoded by ZSTDv07_getcBlockSize() */
+    U32 origSize;            /* size field of the header for bt_rle blocks, 0 for every other type */
+} blockProperties_t;
+
+/*! ZSTDv07_getcBlockSize() :
+*   Provides the size of compressed block from block header `src` */
+size_t ZSTDv07_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const hdr = (const BYTE*)src;
+
+    if (srcSize < ZSTDv07_blockHeaderSize) return ERROR(srcSize_wrong);
+
+    {   blockType_t const bType = (blockType_t)(hdr[0] >> 6);              /* top 2 bits : block type */
+        U32 const sizeField = ((hdr[0] & 7)<<16) + (hdr[1]<<8) + hdr[2];   /* 19-bit size, most significant bits first */
+        bpPtr->blockType = bType;
+        bpPtr->origSize = (bType == bt_rle) ? sizeField : 0;               /* only RLE blocks report a size through origSize */
+        if (bType == bt_end) return 0;
+        if (bType == bt_rle) return 1;                                     /* an RLE block is reported as 1 compressed byte */
+        return sizeField;
+    }
+}
+
+
+static size_t ZSTDv07_copyRawBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{   /* plain copy of `srcSize` bytes, provided the destination can hold them. @return : nb of bytes written, or an error code */
+    if (dstCapacity < srcSize) return ERROR(dstSize_tooSmall);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+/*! ZSTDv07_decodeLiteralsBlock() :
+    decodes the literals section starting at `src`, and leaves dctx->litPtr / dctx->litSize describing the literals. @return : nb of bytes read from src (< srcSize ), or an error code */
+size_t ZSTDv07_decodeLiteralsBlock(ZSTDv07_DCtx* dctx,
+                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+{
+    const BYTE* const istart = (const BYTE*) src;
+
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    switch((litBlockType_t)(istart[0]>> 6))   /* top 2 bits of the first byte : literals block type */
+    {
+    case lbt_huffman:
+        {   size_t litSize, litCSize, singleStream=0;
+            U32 lhSize = (istart[0] >> 4) & 3;   /* header size code; replaced below by the header size in bytes */
+            if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for lhSize, + cSize (+nbSeq) */
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                /* 2 - 2 - 10 - 10 */
+                lhSize=3;
+                singleStream = istart[0] & 16;   /* low bit of the size code : set => a single Huffman stream, else 4 streams */
+                litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+                litCSize = ((istart[1] &  3) << 8) + istart[2];
+                break;
+            case 2:
+                /* 2 - 2 - 14 - 14 */
+                lhSize=4;
+                litSize  = ((istart[0] & 15) << 10) + (istart[1] << 2) + (istart[2] >> 6);
+                litCSize = ((istart[2] & 63) <<  8) + istart[3];
+                break;
+            case 3:
+                /* 2 - 2 - 18 - 18 */
+                lhSize=5;
+                litSize  = ((istart[0] & 15) << 14) + (istart[1] << 6) + (istart[2] >> 2);
+                litCSize = ((istart[2] &  3) << 16) + (istart[3] << 8) + istart[4];
+                break;
+            }
+            if (litSize > ZSTDv07_BLOCKSIZE_ABSOLUTEMAX) return ERROR(corruption_detected);   /* would not fit into litBuffer */
+            if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);               /* compressed literals would extend past src */
+
+            if (HUFv07_isError(singleStream ?
+                            HUFv07_decompress1X2_DCtx(dctx->hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize) :
+                            HUFv07_decompress4X_hufOnly (dctx->hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize) ))
+                return ERROR(corruption_detected);
+
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            dctx->litEntropy = 1;   /* a later lbt_repeat block is now allowed to re-use hufTable */
+            memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);   /* zero the margin that follows the literals */
+            return litCSize + lhSize;
+        }
+    case lbt_repeat:
+        {   size_t litSize, litCSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            if (lhSize != 1)  /* only case supported for now : small litSize, single stream */
+                return ERROR(corruption_detected);
+            if (dctx->litEntropy==0)   /* no Huffman table available to repeat */
+                return ERROR(dictionary_corrupted);
+
+            /* 2 - 2 - 10 - 10 */
+            lhSize=3;
+            litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+            litCSize = ((istart[1] &  3) << 8) + istart[2];
+            if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+
+            {   size_t const errorCode = HUFv07_decompress1X4_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->hufTable);
+                if (HUFv07_isError(errorCode)) return ERROR(corruption_detected);
+            }
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);   /* zero the margin that follows the literals */
+            return litCSize + lhSize;
+        }
+    case lbt_raw:
+        {   size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                lhSize=1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:   /* lhSize keeps its value 2, which is also the header size in bytes */
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:   /* lhSize keeps its value 3, which is also the header size in bytes */
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                break;
+            }
+
+            if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
+                if (litSize+lhSize > srcSize) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart+lhSize, litSize);   /* copy into litBuffer, which has the margin wildcopy may read */
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+                return lhSize+litSize;
+            }
+            /* direct reference into compressed stream */
+            dctx->litPtr = istart+lhSize;
+            dctx->litSize = litSize;
+            return lhSize+litSize;
+        }
+    case lbt_rle:
+        {   size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                lhSize = 1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:   /* lhSize keeps its value 2, which is also the header size in bytes */
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:   /* lhSize keeps its value 3, which is also the header size in bytes */
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                if (srcSize<4) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
+                break;
+            }
+            if (litSize > ZSTDv07_BLOCKSIZE_ABSOLUTEMAX) return ERROR(corruption_detected);
+            memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);   /* fill with the repeated byte, margin included */
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            return lhSize+1;   /* header + the single repeated byte */
+        }
+    default:
+        return ERROR(corruption_detected);   /* impossible */
+    }
+}
+
+
+/*! ZSTDv07_buildSeqTable() :
+    @return : nb bytes read from src,
+              or an error code if it fails, testable with ZSTDv07_isError()
+*/
+size_t ZSTDv07_buildSeqTable(FSEv07_DTable* DTable, U32 type, U32 max, U32 maxLog,
+                                 const void* src, size_t srcSize,
+                                 const S16* defaultNorm, U32 defaultLog, U32 flagRepeatTable)
+{
+    if (type == FSEv07_ENCODING_RLE) {   /* one symbol only, stored as a single byte */
+        BYTE symbol;
+        if (srcSize == 0) return ERROR(srcSize_wrong);
+        symbol = *(const BYTE*)src;
+        if (symbol > max) return ERROR(corruption_detected);   /* symbol outside the alphabet : data is corrupted */
+        FSEv07_buildDTable_rle(DTable, symbol);
+        return 1;
+    }
+    if (type == FSEv07_ENCODING_RAW) {   /* table built from `defaultNorm` : nothing is read from src */
+        FSEv07_buildDTable(DTable, defaultNorm, max, defaultLog);
+        return 0;
+    }
+    if (type == FSEv07_ENCODING_STATIC)   /* DTable is left untouched; only accepted when the caller sets flagRepeatTable */
+        return flagRepeatTable ? 0 : ERROR(corruption_detected);
+    /* FSEv07_ENCODING_DYNAMIC, and any other value : the distribution is described in src */
+    {   U32 tableLog;
+        S16 norm[MaxSeq+1];
+        size_t const headerSize = FSEv07_readNCount(norm, &max, &tableLog, src, srcSize);
+        if (FSEv07_isError(headerSize)) return ERROR(corruption_detected);
+        if (tableLog > maxLog) return ERROR(corruption_detected);
+        FSEv07_buildDTable(DTable, norm, max, tableLog);
+        return headerSize;
+    }
+}
+
+
+size_t ZSTDv07_decodeSeqHeaders(int* nbSeqPtr,
+                             FSEv07_DTable* DTableLL, FSEv07_DTable* DTableML, FSEv07_DTable* DTableOffb, U32 flagRepeatTable,
+                             const void* src, size_t srcSize)
+{   /* Reads the nb of sequences into *nbSeqPtr, then builds the 3 FSE tables. @return : nb of bytes read from src, or an error code */
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip = istart;
+
+    /* check */
+    if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);
+
+    /* SeqHead */
+    {   int nbSeq = *ip++;
+        if (!nbSeq) { *nbSeqPtr=0; return 1; }   /* no sequence : the section is this single byte */
+        if (nbSeq > 0x7F) {
+            if (nbSeq == 0xFF) {   /* 3-byte form : 0xFF, then 16 bits little-endian, offset by LONGNBSEQ */
+                if (iend - ip < 2) return ERROR(srcSize_wrong);
+                nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+            } else {               /* 2-byte form : 15 bits */
+                if (ip >= iend) return ERROR(srcSize_wrong);
+                nbSeq = ((nbSeq-0x80)<<8) + *ip++;
+            }
+        }
+        *nbSeqPtr = nbSeq;
+    }
+
+    /* FSE table descriptors */
+    /* check, before reading anything : the descriptor byte must lie inside src, and be followed by at least 3 bytes
+     * (min : all 3 are "raw", hence no header, but at least xxLog bits per type) */
+    if (iend - ip < 4) return ERROR(srcSize_wrong);
+    {   U32 const LLtype  = *ip >> 6;
+        U32 const OFtype = (*ip >> 4) & 3;
+        U32 const MLtype  = (*ip >> 2) & 3;
+        ip++;
+
+        /* Build DTables */
+        {   size_t const llhSize = ZSTDv07_buildSeqTable(DTableLL, LLtype, MaxLL, LLFSELog, ip, iend-ip, LL_defaultNorm, LL_defaultNormLog, flagRepeatTable);
+            if (ZSTDv07_isError(llhSize)) return ERROR(corruption_detected);
+            ip += llhSize;
+        }
+        {   size_t const ofhSize = ZSTDv07_buildSeqTable(DTableOffb, OFtype, MaxOff, OffFSELog, ip, iend-ip, OF_defaultNorm, OF_defaultNormLog, flagRepeatTable);
+            if (ZSTDv07_isError(ofhSize)) return ERROR(corruption_detected);
+            ip += ofhSize;
+        }
+        {   size_t const mlhSize = ZSTDv07_buildSeqTable(DTableML, MLtype, MaxML, MLFSELog, ip, iend-ip, ML_defaultNorm, ML_defaultNormLog, flagRepeatTable);
+            if (ZSTDv07_isError(mlhSize)) return ERROR(corruption_detected);
+            ip += mlhSize;
+    }   }
+
+    return ip-istart;
+}
+
+
+typedef struct {
+    size_t litLength;     /* nb of literals to copy before the match */
+    size_t matchLength;   /* nb of bytes to copy from the match */
+    size_t offset;        /* distance of the match, counted backwards from the end of the copied literals */
+} seq_t;
+
+typedef struct {
+    BITv07_DStream_t DStream;   /* bitstream shared by the three FSE states and the extra bits */
+    FSEv07_DState_t stateLL;    /* FSE state, literal length codes */
+    FSEv07_DState_t stateOffb;  /* FSE state, offset codes */
+    FSEv07_DState_t stateML;    /* FSE state, match length codes */
+    size_t prevOffset[ZSTDv07_REP_INIT];   /* most recent offsets, [0] being the latest; loaded from dctx->rep[] before decoding a block */
+} seqState_t;
+
+
+static seq_t ZSTDv07_decodeSequence(seqState_t* seqState)
+{   /* Decodes one sequence from the bitstream, updating the offset history and the 3 FSE states. Extra bits are read in the order offset, matchLength, litLength. */
+    seq_t seq;
+
+    U32 const llCode = FSEv07_peekSymbol(&(seqState->stateLL));
+    U32 const mlCode = FSEv07_peekSymbol(&(seqState->stateML));
+    U32 const ofCode = FSEv07_peekSymbol(&(seqState->stateOffb));   /* <= maxOff, by table construction */
+
+    U32 const llBits = LL_bits[llCode];
+    U32 const mlBits = ML_bits[mlCode];
+    U32 const ofBits = ofCode;   /* an offset code is also its own nb of extra bits */
+    U32 const totalBits = llBits+mlBits+ofBits;
+
+    static const U32 LL_base[MaxLL+1] = {
+                             0,  1,  2,  3,  4,  5,  6,  7,  8,  9,   10,    11,    12,    13,    14,     15,
+                            16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                            0x2000, 0x4000, 0x8000, 0x10000 };
+
+    static const U32 ML_base[MaxML+1] = {
+                             3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,   14,    15,    16,    17,    18,
+                            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,   30,    31,    32,    33,    34,
+                            35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+                            0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
+
+    static const U32 OF_base[MaxOff+1] = {
+                 0,        1,       1,       5,     0xD,     0x1D,     0x3D,     0x7D,
+                 0xFD,   0x1FD,   0x3FD,   0x7FD,   0xFFD,   0x1FFD,   0x3FFD,   0x7FFD,
+                 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+                 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD };
+
+    /* sequence */
+    {   size_t offset;
+        if (!ofCode)
+            offset = 0;
+        else {
+            offset = OF_base[ofCode] + BITv07_readBits(&(seqState->DStream), ofBits);   /* <=  (ZSTDv07_WINDOWLOG_MAX-1) bits */
+            if (MEM_32bits()) BITv07_reloadDStream(&(seqState->DStream));
+        }
+
+        if (ofCode <= 1) {   /* repeat-offset codes : here offset is 0, 1 or 2, an index into the offset history */
+            if ((llCode == 0) & (offset <= 1)) offset = 1-offset;   /* with litLength code 0, indexes 0 and 1 swap */
+            if (offset) {
+                size_t const temp = seqState->prevOffset[offset];   /* offset is 1 or 2 : take that history slot and move it to the front */
+                if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+                seqState->prevOffset[1] = seqState->prevOffset[0];
+                seqState->prevOffset[0] = offset = temp;
+            } else {
+                offset = seqState->prevOffset[0];   /* re-use the latest offset, history unchanged */
+            }
+        } else {   /* new offset : push it at the front of the history */
+            seqState->prevOffset[2] = seqState->prevOffset[1];
+            seqState->prevOffset[1] = seqState->prevOffset[0];
+            seqState->prevOffset[0] = offset;
+        }
+        seq.offset = offset;
+    }
+
+    seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BITv07_readBits(&(seqState->DStream), mlBits) : 0);   /* <=  16 bits */
+    if (MEM_32bits() && (mlBits+llBits>24)) BITv07_reloadDStream(&(seqState->DStream));
+
+    seq.litLength = LL_base[llCode] + ((llCode>15) ? BITv07_readBits(&(seqState->DStream), llBits) : 0);   /* <=  16 bits */
+    if (MEM_32bits() ||
+       (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BITv07_reloadDStream(&(seqState->DStream));
+
+    /* ANS state update */
+    FSEv07_updateState(&(seqState->stateLL), &(seqState->DStream));   /* <=  9 bits */
+    FSEv07_updateState(&(seqState->stateML), &(seqState->DStream));   /* <=  9 bits */
+    if (MEM_32bits()) BITv07_reloadDStream(&(seqState->DStream));     /* <= 18 bits */
+    FSEv07_updateState(&(seqState->stateOffb), &(seqState->DStream)); /* <=  8 bits */
+
+    return seq;
+}
+
+
+static
+size_t ZSTDv07_execSequence(BYTE* op,
+                                BYTE* const oend, seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{   /* Writes one sequence at `op` : litLength literals taken from *litPtr (which is advanced), then matchLength bytes copied from `offset` back. @return : nb of bytes written, or an error code */
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* may wrap on 32-bits : only used after the size-based checks below have passed */
+    BYTE* const oend_w = oend-WILDCOPY_OVERLENGTH;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check : compare sizes, not the pointers computed above, since those may have wrapped around the address space (requires op <= oend and *litPtr <= litLimit) */
+    if ((sequence.litLength + WILDCOPY_OVERLENGTH > (size_t)(oend - op)) || (sequenceLength > (size_t)(oend - op))) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+    if (sequence.litLength > (size_t)(litLimit - *litPtr)) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+
+    /* copy Literals */
+    ZSTDv07_wildcopy(op, *litPtr, sequence.litLength);   /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* update for next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected);
+        match = dictEnd - (base-match);
+        if (match + sequence.matchLength <= dictEnd) {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
+            if (op > oend_w || sequence.matchLength < MINMATCH) {
+              while (op < oMatchEnd) *op++ = *match++;
+              return sequenceLength;
+            }
+    }   }
+    /* Requirement: op <= oend_w */
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
+        int const sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTDv07_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTDv07_copy8(op, match);
+    }
+    op += 8; match += 8;
+
+    if (oMatchEnd > oend-(16-MINMATCH)) {
+        if (op < oend_w) {
+            ZSTDv07_wildcopy(op, match, oend_w - op);
+            match += oend_w - op;
+            op = oend_w;
+        }
+        while (op < oMatchEnd) *op++ = *match++;   /* too close to oend for wildcopy : finish byte by byte */
+    } else {
+        ZSTDv07_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
+}
+
+
+/*! ZSTDv07_decompressSequences() :
+*   Regenerates block content from the sequences section starting at `seqStart`.
+*   Literals are read from `dctx->litPtr` (`dctx->litSize` bytes).
+*   Steps : decode the sequence headers into the LL/ML/Off tables held by `dctx`,
+*           execute each decoded sequence into `dst`,
+*           then append whatever literals are left.
+*   @return : nb of bytes written into `dst` (<= `maxDstSize`),
+*             or an error code, which can be tested using ZSTDv07_isError() */
+static size_t ZSTDv07_decompressSequences(
+                               ZSTDv07_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + maxDstSize;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    FSEv07_DTable* DTableLL = dctx->LLTable;
+    FSEv07_DTable* DTableML = dctx->MLTable;
+    FSEv07_DTable* DTableOffb = dctx->OffTable;
+    const BYTE* const base = (const BYTE*) (dctx->base);
+    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+    int nbSeq;
+
+    /* Build Decoding Tables */
+    {   size_t const seqHSize = ZSTDv07_decodeSeqHeaders(&nbSeq, DTableLL, DTableML, DTableOffb, dctx->fseEntropy, ip, seqSize);
+        if (ZSTDv07_isError(seqHSize)) return seqHSize;
+        ip += seqHSize;
+    }
+
+    /* Regen sequences */
+    if (nbSeq) {
+        seqState_t seqState;
+        dctx->fseEntropy = 1;
+        /* start from the repeat offsets saved by the previous block */
+        { U32 i; for (i=0; i<ZSTDv07_REP_INIT; i++) seqState.prevOffset[i] = dctx->rep[i]; }
+        { size_t const errorCode = BITv07_initDStream(&(seqState.DStream), ip, iend-ip);
+          if (ERR_isError(errorCode)) return ERROR(corruption_detected); }
+        FSEv07_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSEv07_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSEv07_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        /* stop when either the bitstream or the announced sequence count is exhausted */
+        for ( ; (BITv07_reloadDStream(&(seqState.DStream)) <= BITv07_DStream_completed) && nbSeq ; ) {
+            nbSeq--;
+            {   seq_t const sequence = ZSTDv07_decodeSequence(&seqState);
+                size_t const oneSeqSize = ZSTDv07_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+                if (ZSTDv07_isError(oneSeqSize)) return oneSeqSize;
+                op += oneSeqSize;
+        }   }
+
+        /* check if reached exact end */
+        if (nbSeq) return ERROR(corruption_detected);
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTDv07_REP_INIT; i++) dctx->rep[i] = (U32)(seqState.prevOffset[i]); }
+    }
+
+    /* last literal segment */
+    {   size_t const lastLLSize = litEnd - litPtr;
+        /* note : there is no explicit (litPtr > litEnd) test ;
+         * in that case the unsigned difference wraps to a very large value,
+         * which the capacity check below is expected to reject */
+        if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall);
+        if (lastLLSize > 0) {   /* memcpy() must not receive a NULL pointer, even for 0 bytes */
+            memcpy(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+    }
+
+    return op-ostart;
+}
+
+
+/* ZSTDv07_checkContinuity() :
+ * When `dst` does not start exactly where the previous output ended,
+ * the previous output becomes the "dictionary" segment (`dictEnd`),
+ * `vBase` is shifted so that it sits as far below `dst` as `base` sat below the previous end,
+ * and a new prefix segment is started at `dst`.
+ * Nothing changes when `dst` is contiguous with the previous output. */
+static void ZSTDv07_checkContinuity(ZSTDv07_DCtx* dctx, const void* dst)
+{
+    if (dst == dctx->previousDstEnd) return;   /* contiguous : nothing to update */
+
+    {   const char* const prevEnd = (const char*)(dctx->previousDstEnd);
+        ptrdiff_t const prefixLen = prevEnd - (const char*)(dctx->base);
+        dctx->dictEnd = dctx->previousDstEnd;
+        dctx->vBase = (const char*)dst - prefixLen;
+        dctx->base = dst;
+        dctx->previousDstEnd = dst;
+    }
+}
+
+
+/* ZSTDv07_decompressBlock_internal() :
+ * Decodes one compressed block : first its literals sub-block, then its sequences.
+ * @return : nb of bytes regenerated into `dst`, or an error code.
+ *           `srcSize` >= ZSTDv07_BLOCKSIZE_ABSOLUTEMAX is rejected with srcSize_wrong. */
+static size_t ZSTDv07_decompressBlock_internal(ZSTDv07_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{   /* blockType == blockCompressed */
+    size_t litCSize;
+
+    if (srcSize >= ZSTDv07_BLOCKSIZE_ABSOLUTEMAX) return ERROR(srcSize_wrong);
+
+    /* Decode literals sub-block */
+    litCSize = ZSTDv07_decodeLiteralsBlock(dctx, src, srcSize);
+    if (ZSTDv07_isError(litCSize)) return litCSize;
+
+    /* the sequences section starts right after the literals section */
+    return ZSTDv07_decompressSequences(dctx, dst, dstCapacity,
+                                       (const BYTE*)src + litCSize, srcSize - litCSize);
+}
+
+
+/*! ZSTDv07_decompressBlock() :
+*   Decompresses a single compressed block from `src` into `dst`,
+*   after updating the history segments of `dctx` when `dst` is not contiguous with the previous output.
+*   On success, the end of the regenerated data is recorded as the new previous output end.
+*   @return : nb of bytes written into `dst`,
+*             or an error code, which can be tested using ZSTDv07_isError() */
+size_t ZSTDv07_decompressBlock(ZSTDv07_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    size_t dSize;
+    ZSTDv07_checkContinuity(dctx, dst);
+    dSize = ZSTDv07_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+    /* an error code is not a length : never add it to `dst` */
+    if (ZSTDv07_isError(dSize)) return dSize;
+    dctx->previousDstEnd = (char*)dst + dSize;
+    return dSize;
+}
+
+
+/** ZSTDv07_insertBlock() :
+    Registers the block [`blockStart`, `blockStart`+`blockSize`) as part of `dctx` history,
+    without decoding anything. Useful to track uncompressed blocks.
+    @return : `blockSize` */
+ZSTDLIBv07_API size_t ZSTDv07_insertBlock(ZSTDv07_DCtx* dctx, const void* blockStart, size_t blockSize)
+{
+    const char* const blockEnd = (const char*)blockStart + blockSize;
+    ZSTDv07_checkContinuity(dctx, blockStart);
+    dctx->previousDstEnd = blockEnd;
+    return blockSize;
+}
+
+
+/* ZSTDv07_generateNxBytes() :
+ * Fills the first `length` bytes of `dst` with the value `byte`.
+ * @return : `length`, or dstSize_tooSmall when `length` > `dstCapacity` */
+size_t ZSTDv07_generateNxBytes(void* dst, size_t dstCapacity, BYTE byte, size_t length)
+{
+    if (length > dstCapacity) return ERROR(dstSize_tooSmall);
+    if (length > 0) {   /* `dst` may be NULL when nothing has to be written ; memset(NULL, ., 0) is undefined */
+        memset(dst, byte, length);
+    }
+    return length;
+}
+
+
+/*! ZSTDv07_decompressFrame() :
+*   Decodes one complete frame located at `src` into `dst`.
+*   `dctx` must be properly initialized.
+*   The frame header is decoded first, then blocks are processed one by one until a `bt_end` block is met.
+*   When the frame header requests a checksum, the hash state in `dctx` is updated with each decoded block ;
+*   this function does not compare the resulting hash against anything.
+*   @return : nb of bytes written into `dst`,
+*             or an error code, which can be tested using ZSTDv07_isError() */
+static size_t ZSTDv07_decompressFrame(ZSTDv07_DCtx* dctx,
+                                 void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* const iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    size_t remainingSize = srcSize;
+
+    /* check : need at least a minimal frame header plus one block header */
+    if (srcSize < ZSTDv07_frameHeaderSize_min+ZSTDv07_blockHeaderSize) return ERROR(srcSize_wrong);
+
+    /* Frame Header */
+    {   size_t const frameHeaderSize = ZSTDv07_frameHeaderSize(src, ZSTDv07_frameHeaderSize_min);
+        if (ZSTDv07_isError(frameHeaderSize)) return frameHeaderSize;
+        /* re-check now that the real header size is known */
+        if (srcSize < frameHeaderSize+ZSTDv07_blockHeaderSize) return ERROR(srcSize_wrong);
+        if (ZSTDv07_decodeFrameHeader(dctx, src, frameHeaderSize)) return ERROR(corruption_detected);
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1) {
+        size_t decodedSize;
+        blockProperties_t blockProperties;
+        size_t const cBlockSize = ZSTDv07_getcBlockSize(ip, iend-ip, &blockProperties);
+        if (ZSTDv07_isError(cBlockSize)) return cBlockSize;
+
+        /* skip the block header ; the block content must fit in what is left of `src` */
+        ip += ZSTDv07_blockHeaderSize;
+        remainingSize -= ZSTDv07_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTDv07_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize);
+            break;
+        case bt_raw :
+            decodedSize = ZSTDv07_copyRawBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            /* a single source byte, repeated `origSize` times */
+            decodedSize = ZSTDv07_generateNxBytes(op, oend-op, *ip, blockProperties.origSize);
+            break;
+        case bt_end :
+            /* end of frame : no input may remain after the end mark */
+            if (remainingSize) return ERROR(srcSize_wrong);
+            decodedSize = 0;
+            break;
+        default:
+            return ERROR(GENERIC);   /* impossible */
+        }
+        if (blockProperties.blockType == bt_end) break;   /* bt_end */
+
+        if (ZSTDv07_isError(decodedSize)) return decodedSize;
+        if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, op, decodedSize);
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return op-ostart;
+}
+
+
+/*! ZSTDv07_decompress_usingPreparedDCtx() :
+*   Same as ZSTDv07_decompress_usingDict(), but the dictionary comes from a reference context `refDCtx`
+*   in which it has already been loaded, so it is not reloaded on each call.
+*   `refDCtx` must have been properly initialized using ZSTDv07_decompressBegin_usingDict().
+*   Two contexts are involved : `refDCtx` is only read and copied into `dctx`,
+*   and `dctx` then runs the decompression operation.
+*   @return : result of the frame decompression (nb of bytes written into `dst`, or an error code) */
+size_t ZSTDv07_decompress_usingPreparedDCtx(ZSTDv07_DCtx* dctx, const ZSTDv07_DCtx* refDCtx,
+                                         void* dst, size_t dstCapacity,
+                                   const void* src, size_t srcSize)
+{
+    ZSTDv07_copyDCtx(dctx, refDCtx);       /* start from the prepared state */
+    ZSTDv07_checkContinuity(dctx, dst);
+    {   size_t const regenSize = ZSTDv07_decompressFrame(dctx, dst, dstCapacity, src, srcSize);
+        return regenSize;
+    }
+}
+
+
+/*! ZSTDv07_decompress_usingDict() :
+*   Single-pass frame decompression with an optional dictionary.
+*   (Re)starts `dctx`, loads `dict` when it is non-NULL and `dictSize` is non-zero,
+*   then decodes the frame located at `src` into `dst`.
+*   @return : nb of bytes written into `dst`,
+*             or an error code (including a failure to start or to load the dictionary),
+*             which can be tested using ZSTDv07_isError() */
+size_t ZSTDv07_decompress_usingDict(ZSTDv07_DCtx* dctx,
+                                 void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                 const void* dict, size_t dictSize)
+{
+    {   size_t const initResult = ZSTDv07_decompressBegin_usingDict(dctx, dict, dictSize);
+        /* do not attempt to decode a frame when the dictionary could not be loaded */
+        if (ZSTDv07_isError(initResult)) return initResult;
+    }
+    ZSTDv07_checkContinuity(dctx, dst);
+    return ZSTDv07_decompressFrame(dctx, dst, dstCapacity, src, srcSize);
+}
+
+
+/*! ZSTDv07_decompressDCtx() :
+*   Frame decompression using the caller-provided context `dctx`, without any dictionary.
+*   @return : same as ZSTDv07_decompress_usingDict() */
+size_t ZSTDv07_decompressDCtx(ZSTDv07_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    const void* const noDict = NULL;
+    size_t const noDictSize = 0;
+    return ZSTDv07_decompress_usingDict(dctx, dst, dstCapacity, src, srcSize, noDict, noDictSize);
+}
+
+
+/*! ZSTDv07_decompress() :
+*   Frame decompression with a temporary context :
+*   allocated through ZSTDv07_createDCtx() when ZSTDv07_HEAPMODE is defined to 1, placed on the stack otherwise.
+*   @return : result of ZSTDv07_decompressDCtx(), or memory_allocation when the heap context cannot be created */
+size_t ZSTDv07_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+#if defined(ZSTDv07_HEAPMODE) && (ZSTDv07_HEAPMODE==1)
+    ZSTDv07_DCtx* const heapCtx = ZSTDv07_createDCtx();
+    if (heapCtx == NULL) return ERROR(memory_allocation);
+    {   size_t const result = ZSTDv07_decompressDCtx(heapCtx, dst, dstCapacity, src, srcSize);
+        ZSTDv07_freeDCtx(heapCtx);
+        return result;
+    }
+#else   /* stack mode */
+    ZSTDv07_DCtx stackCtx;
+    return ZSTDv07_decompressDCtx(&stackCtx, dst, dstCapacity, src, srcSize);
+#endif
+}
+
+/*! ZSTDv07_findFrameCompressedSize() :
+*   Walks the frame header and every block header of the frame starting at `src`, without decoding any block.
+*   @return : nb of bytes from `src` up to and including the header of the `bt_end` block,
+*             or an error code, which can be tested using ZSTDv07_isError() */
+size_t ZSTDv07_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    const BYTE* const frameStart = (const BYTE*)src;
+    const BYTE* cursor = frameStart;
+    size_t bytesLeft = srcSize;
+
+    /* need at least a minimal frame header plus one block header */
+    if (srcSize < ZSTDv07_frameHeaderSize_min+ZSTDv07_blockHeaderSize) return ERROR(srcSize_wrong);
+
+    /* frame header */
+    {   size_t const headerSize = ZSTDv07_frameHeaderSize(src, ZSTDv07_frameHeaderSize_min);
+        if (ZSTDv07_isError(headerSize)) return headerSize;
+        if (MEM_readLE32(src) != ZSTDv07_MAGICNUMBER) return ERROR(prefix_unknown);
+        if (srcSize < headerSize+ZSTDv07_blockHeaderSize) return ERROR(srcSize_wrong);
+        cursor += headerSize;
+        bytesLeft -= headerSize;
+    }
+
+    /* hop from block header to block header until the end mark */
+    for (;;) {
+        blockProperties_t props;
+        size_t const blockSize = ZSTDv07_getcBlockSize(cursor, bytesLeft, &props);
+        if (ZSTDv07_isError(blockSize)) return blockSize;
+
+        cursor += ZSTDv07_blockHeaderSize;
+        bytesLeft -= ZSTDv07_blockHeaderSize;
+
+        if (props.blockType == bt_end) break;
+
+        if (blockSize > bytesLeft) return ERROR(srcSize_wrong);
+
+        cursor += blockSize;
+        bytesLeft -= blockSize;
+    }
+
+    return cursor - frameStart;
+}
+
+/*_******************************
+*  Streaming Decompression API
+********************************/
+/*! ZSTDv07_nextSrcSizeToDecompress() :
+*   @return : the `expected` field of `dctx`,
+*             i.e. the exact `srcSize` that the next call to ZSTDv07_decompressContinue() will accept */
+size_t ZSTDv07_nextSrcSizeToDecompress(ZSTDv07_DCtx* dctx)
+{
+    size_t const nextSize = dctx->expected;
+    return nextSize;
+}
+
+/*! ZSTDv07_isSkipFrame() :
+*   @return : 1 when `dctx` is currently in the ZSTDds_skipFrame stage, 0 otherwise */
+int ZSTDv07_isSkipFrame(ZSTDv07_DCtx* dctx)
+{
+    return (dctx->stage == ZSTDds_skipFrame) ? 1 : 0;
+}
+
+/** ZSTDv07_decompressContinue() :
+*   Advances the streaming state machine by one step.
+*   `srcSize` must be exactly the value reported by ZSTDv07_nextSrcSizeToDecompress().
+*   Depending on the current stage, `src` is interpreted as the start of a frame header,
+*   the rest of a frame header, a block header, a block content, or part of a skippable frame.
+*   Only the block content stage writes into `dst`.
+*   @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity`)
+*             or an error code, which can be tested using ZSTDv07_isError() */
+size_t ZSTDv07_decompressContinue(ZSTDv07_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    /* Sanity check */
+    if (srcSize != dctx->expected) return ERROR(srcSize_wrong);
+    if (dstCapacity) ZSTDv07_checkContinuity(dctx, dst);
+
+    switch (dctx->stage)
+    {
+    case ZSTDds_getFrameHeaderSize :
+        if (srcSize != ZSTDv07_frameHeaderSize_min) return ERROR(srcSize_wrong);   /* impossible */
+        if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTDv07_MAGIC_SKIPPABLE_START) {
+            memcpy(dctx->headerBuffer, src, ZSTDv07_frameHeaderSize_min);
+            dctx->expected = ZSTDv07_skippableHeaderSize - ZSTDv07_frameHeaderSize_min; /* magic number + skippable frame length */
+            dctx->stage = ZSTDds_decodeSkippableHeader;
+            return 0;
+        }
+        dctx->headerSize = ZSTDv07_frameHeaderSize(src, ZSTDv07_frameHeaderSize_min);
+        if (ZSTDv07_isError(dctx->headerSize)) return dctx->headerSize;
+        memcpy(dctx->headerBuffer, src, ZSTDv07_frameHeaderSize_min);
+        if (dctx->headerSize > ZSTDv07_frameHeaderSize_min) {
+            /* long header : ask for the remaining header bytes */
+            dctx->expected = dctx->headerSize - ZSTDv07_frameHeaderSize_min;
+            dctx->stage = ZSTDds_decodeFrameHeader;
+            return 0;
+        }
+        dctx->expected = 0;   /* not necessary to copy more */
+        /* fall-through */
+    case ZSTDds_decodeFrameHeader:
+        {   size_t result;
+            memcpy(dctx->headerBuffer + ZSTDv07_frameHeaderSize_min, src, dctx->expected);
+            result = ZSTDv07_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize);
+            if (ZSTDv07_isError(result)) return result;
+            dctx->expected = ZSTDv07_blockHeaderSize;
+            dctx->stage = ZSTDds_decodeBlockHeader;
+            return 0;
+        }
+    case ZSTDds_decodeBlockHeader:
+        {   blockProperties_t bp;
+            size_t const cBlockSize = ZSTDv07_getcBlockSize(src, ZSTDv07_blockHeaderSize, &bp);
+            if (ZSTDv07_isError(cBlockSize)) return cBlockSize;
+            if (bp.blockType == bt_end) {
+                if (dctx->fParams.checksumFlag) {
+                    /* compare 22 bits of the content hash with the 22 low bits carried by the end block header */
+                    U64 const h64 = XXH64_digest(&dctx->xxhState);
+                    U32 const h32 = (U32)(h64>>11) & ((1<<22)-1);
+                    const BYTE* const ip = (const BYTE*)src;
+                    U32 const check32 = ip[2] + (ip[1] << 8) + ((ip[0] & 0x3F) << 16);
+                    if (check32 != h32) return ERROR(checksum_wrong);
+                }
+                dctx->expected = 0;
+                dctx->stage = ZSTDds_getFrameHeaderSize;
+            } else {
+                dctx->expected = cBlockSize;
+                dctx->bType = bp.blockType;
+                dctx->stage = ZSTDds_decompressBlock;
+            }
+            return 0;
+        }
+    case ZSTDds_decompressBlock:
+        {   size_t rSize;
+            switch(dctx->bType)
+            {
+            case bt_compressed:
+                rSize = ZSTDv07_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+                break;
+            case bt_raw :
+                rSize = ZSTDv07_copyRawBlock(dst, dstCapacity, src, srcSize);
+                break;
+            case bt_rle :
+                return ERROR(GENERIC);   /* not yet handled */
+                break;
+            case bt_end :   /* should never happen (filtered at phase 1) */
+                rSize = 0;
+                break;
+            default:
+                return ERROR(GENERIC);   /* impossible */
+            }
+            dctx->stage = ZSTDds_decodeBlockHeader;
+            dctx->expected = ZSTDv07_blockHeaderSize;
+            /* test for an error first : an error code is not a length and must not be added to `dst` */
+            if (ZSTDv07_isError(rSize)) return rSize;
+            dctx->previousDstEnd = (char*)dst + rSize;
+            if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, dst, rSize);
+            return rSize;
+        }
+    case ZSTDds_decodeSkippableHeader:
+        {   memcpy(dctx->headerBuffer + ZSTDv07_frameHeaderSize_min, src, dctx->expected);
+            /* the 32-bit little-endian field at offset 4 gives the nb of bytes to skip */
+            dctx->expected = MEM_readLE32(dctx->headerBuffer + 4);
+            dctx->stage = ZSTDds_skipFrame;
+            return 0;
+        }
+    case ZSTDds_skipFrame:
+        {   dctx->expected = 0;
+            dctx->stage = ZSTDds_getFrameHeaderSize;
+            return 0;
+        }
+    default:
+        return ERROR(GENERIC);   /* impossible */
+    }
+}
+
+
+/* ZSTDv07_refDictContent() :
+ * References `dict` (`dictSize` bytes) as history content, without copying it :
+ * the previous output end becomes `dictEnd`, `vBase` is shifted accordingly,
+ * and the new prefix segment spans [`dict`, `dict`+`dictSize`).
+ * @return : 0 */
+static size_t ZSTDv07_refDictContent(ZSTDv07_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    const char* const prevEnd = (const char*)(dctx->previousDstEnd);
+    ptrdiff_t const prefixLen = prevEnd - (const char*)(dctx->base);
+
+    dctx->dictEnd = dctx->previousDstEnd;
+    dctx->vBase = (const char*)dict - prefixLen;
+    dctx->base = dict;
+    dctx->previousDstEnd = (const char*)dict + dictSize;
+    return 0;
+}
+
+/* ZSTDv07_loadEntropy() :
+ * Reads the entropy section of a dictionary, in this order :
+ * a Huffman table, then the FSE tables for offset codes, match lengths and literal lengths,
+ * then three 32-bit little-endian repeat offsets.
+ * The decoded tables and offsets are stored into `dctx`.
+ * Any failure is reported as dictionary_corrupted.
+ * @return : nb of bytes consumed from `dict`, or an error code */
+static size_t ZSTDv07_loadEntropy(ZSTDv07_DCtx* dctx, const void* const dict, size_t const dictSize)
+{
+    const BYTE* dictPtr = (const BYTE*)dict;
+    const BYTE* const dictEnd = dictPtr + dictSize;
+
+    /* Huffman table */
+    {   size_t const hSize = HUFv07_readDTableX4(dctx->hufTable, dict, dictSize);
+        if (HUFv07_isError(hSize)) return ERROR(dictionary_corrupted);
+        dictPtr += hSize;
+    }
+
+    /* offset codes */
+    {   short offcodeNCount[MaxOff+1];
+        U32 offcodeMaxValue=MaxOff, offcodeLog;
+        size_t const offcodeHeaderSize = FSEv07_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+        if (FSEv07_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
+        if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
+        { size_t const errorCode = FSEv07_buildDTable(dctx->OffTable, offcodeNCount, offcodeMaxValue, offcodeLog);
+          if (FSEv07_isError(errorCode)) return ERROR(dictionary_corrupted); }
+        dictPtr += offcodeHeaderSize;
+    }
+
+    /* match lengths */
+    {   short matchlengthNCount[MaxML+1];
+        unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+        size_t const matchlengthHeaderSize = FSEv07_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSEv07_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
+        { size_t const errorCode = FSEv07_buildDTable(dctx->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog);
+          if (FSEv07_isError(errorCode)) return ERROR(dictionary_corrupted); }
+        dictPtr += matchlengthHeaderSize;
+    }
+
+    /* literal lengths */
+    {   short litlengthNCount[MaxLL+1];
+        unsigned litlengthMaxValue = MaxLL, litlengthLog;
+        size_t const litlengthHeaderSize = FSEv07_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSEv07_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
+        { size_t const errorCode = FSEv07_buildDTable(dctx->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog);
+          if (FSEv07_isError(errorCode)) return ERROR(dictionary_corrupted); }
+        dictPtr += litlengthHeaderSize;
+    }
+
+    /* repeat offsets : 3 x 4 bytes ; each one must be non-zero and smaller than `dictSize` */
+    if (dictPtr+12 > dictEnd) return ERROR(dictionary_corrupted);
+    dctx->rep[0] = MEM_readLE32(dictPtr+0); if (dctx->rep[0] == 0 || dctx->rep[0] >= dictSize) return ERROR(dictionary_corrupted);
+    dctx->rep[1] = MEM_readLE32(dictPtr+4); if (dctx->rep[1] == 0 || dctx->rep[1] >= dictSize) return ERROR(dictionary_corrupted);
+    dctx->rep[2] = MEM_readLE32(dictPtr+8); if (dctx->rep[2] == 0 || dctx->rep[2] >= dictSize) return ERROR(dictionary_corrupted);
+    dictPtr += 12;
+
+    /* flags are raised only once every table has been loaded successfully */
+    dctx->litEntropy = dctx->fseEntropy = 1;
+    return dictPtr - (const BYTE*)dict;
+}
+
+/* ZSTDv07_decompress_insertDictionary() :
+ * Loads `dict` into `dctx`.
+ * A buffer shorter than 8 bytes, or one that does not start with ZSTDv07_DICT_MAGIC,
+ * is referenced entirely as raw content.
+ * Otherwise the layout is : magic (4 bytes), dictID (4 bytes), entropy tables, then content.
+ * @return : 0, or dictionary_corrupted when the entropy tables cannot be loaded */
+static size_t ZSTDv07_decompress_insertDictionary(ZSTDv07_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    const char* content = (const char*)dict;
+    size_t contentSize = dictSize;
+
+    /* pure content mode */
+    if ((dictSize < 8) || (MEM_readLE32(dict) != ZSTDv07_DICT_MAGIC))
+        return ZSTDv07_refDictContent(dctx, dict, dictSize);
+
+    dctx->dictID = MEM_readLE32(content + 4);
+    content += 8;
+    contentSize -= 8;
+
+    /* load entropy tables */
+    {   size_t const entropySize = ZSTDv07_loadEntropy(dctx, content, contentSize);
+        if (ZSTDv07_isError(entropySize)) return ERROR(dictionary_corrupted);
+        content += entropySize;
+        contentSize -= entropySize;
+    }
+
+    /* reference dictionary content */
+    return ZSTDv07_refDictContent(dctx, content, contentSize);
+}
+
+
+/*! ZSTDv07_decompressBegin_usingDict() :
+*   Starts a new decompression operation on `dctx`,
+*   then loads `dict` when it is non-NULL and `dictSize` is non-zero.
+*   @return : 0 on success,
+*             the error code from ZSTDv07_decompressBegin(),
+*             or dictionary_corrupted when the dictionary cannot be loaded */
+size_t ZSTDv07_decompressBegin_usingDict(ZSTDv07_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    size_t const beginResult = ZSTDv07_decompressBegin(dctx);
+    if (ZSTDv07_isError(beginResult)) return beginResult;
+
+    if ((dict == NULL) || (dictSize == 0)) return 0;   /* no dictionary to load */
+
+    {   size_t const loadResult = ZSTDv07_decompress_insertDictionary(dctx, dict, dictSize);
+        return ZSTDv07_isError(loadResult) ? ERROR(dictionary_corrupted) : 0;
+    }
+}
+
+
+/* Digested dictionary : a private copy of the dictionary bytes,
+ * plus a decompression context in which that dictionary has already been loaded. */
+struct ZSTDv07_DDict_s {
+    void* dict;                 /* private copy of the dictionary content, owned by this object */
+    size_t dictSize;            /* size of `dict`, in bytes */
+    ZSTDv07_DCtx* refContext;   /* context prepared with `dict` ; copied into the working context at decompression time */
+};  /* typedef'd to ZSTDv07_DDict in the library header */
+
+/*! ZSTDv07_createDDict_advanced() :
+*   Creates a digested dictionary using the allocation functions of `customMem`.
+*   When both `customAlloc` and `customFree` are NULL, the default allocator is used ;
+*   providing only one of the two is rejected.
+*   The content of `dict` is copied, then loaded into a dedicated reference context.
+*   @return : the new object, or NULL on allocation failure or when the dictionary cannot be loaded */
+ZSTDv07_DDict* ZSTDv07_createDDict_advanced(const void* dict, size_t dictSize, ZSTDv07_customMem customMem)
+{
+    if (!customMem.customAlloc && !customMem.customFree)
+        customMem = defaultCustomMem;
+
+    if (!customMem.customAlloc || !customMem.customFree)
+        return NULL;
+
+    {   ZSTDv07_DDict* const ddict = (ZSTDv07_DDict*) customMem.customAlloc(customMem.opaque, sizeof(*ddict));
+        void* const dictContent = customMem.customAlloc(customMem.opaque, dictSize);
+        ZSTDv07_DCtx* const dctx = ZSTDv07_createDCtx_advanced(customMem);
+        int ready = (dictContent != NULL) && (ddict != NULL) && (dctx != NULL);
+
+        if (ready) {
+            memcpy(dictContent, dict, dictSize);
+            {   size_t const errorCode = ZSTDv07_decompressBegin_usingDict(dctx, dictContent, dictSize);
+                if (ZSTDv07_isError(errorCode)) ready = 0;
+        }   }
+
+        if (!ready) {
+            /* single cleanup path ; a user-provided free function is never handed a NULL pointer */
+            if (dictContent != NULL) customMem.customFree(customMem.opaque, dictContent);
+            if (ddict != NULL) customMem.customFree(customMem.opaque, ddict);
+            if (dctx != NULL) customMem.customFree(customMem.opaque, dctx);
+            return NULL;
+        }
+
+        ddict->dict = dictContent;
+        ddict->dictSize = dictSize;
+        ddict->refContext = dctx;
+        return ddict;
+    }
+}
+
+/*! ZSTDv07_createDDict() :
+*   Create a digested dictionary, ready to start decompression without startup delay.
+*   The content of `dict` is copied, so `dict` can be released after `ZSTDv07_DDict` creation.
+*   An all-NULL allocator is passed on, which selects the default one. */
+ZSTDv07_DDict* ZSTDv07_createDDict(const void* dict, size_t dictSize)
+{
+    ZSTDv07_customMem unsetMem;
+    unsetMem.customAlloc = NULL;
+    unsetMem.customFree = NULL;
+    unsetMem.opaque = NULL;
+    return ZSTDv07_createDDict_advanced(dict, dictSize, unsetMem);
+}
+
+/*! ZSTDv07_freeDDict() :
+*   Releases a digested dictionary : its reference context, its private copy of the dictionary, and the object itself.
+*   The free function and opaque pointer are read from the reference context before that context is released.
+*   Accepts NULL, in which case nothing is done.
+*   @return : 0 */
+size_t ZSTDv07_freeDDict(ZSTDv07_DDict* ddict)
+{
+    if (ddict == NULL) return 0;   /* support free on null */
+    {   ZSTDv07_freeFunction const cFree = ddict->refContext->customMem.customFree;
+        void* const opaque = ddict->refContext->customMem.opaque;
+        ZSTDv07_freeDCtx(ddict->refContext);
+        cFree(opaque, ddict->dict);
+        cFree(opaque, ddict);
+    }
+    return 0;
+}
+
+/*! ZSTDv07_decompress_usingDDict() :
+*   Decompression using a pre-digested dictionary.
+*   The reference context stored in `ddict` is handed to ZSTDv07_decompress_usingPreparedDCtx(),
+*   so the dictionary is not parsed again.
+*   @return : same as ZSTDv07_decompress_usingPreparedDCtx() */
+ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDDict(ZSTDv07_DCtx* dctx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     const ZSTDv07_DDict* ddict)
+{
+    const ZSTDv07_DCtx* const prepared = ddict->refContext;
+    return ZSTDv07_decompress_usingPreparedDCtx(dctx, prepared, dst, dstCapacity, src, srcSize);
+}
+/*
+    Buffered version of Zstd compression library
+    Copyright (C) 2015-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net/
+*/
+
+
+
+/*-***************************************************************************
+*  Streaming decompression howto
+*
+*  A ZBUFFv07_DCtx object is required to track streaming operations.
+*  Use ZBUFFv07_createDCtx() and ZBUFFv07_freeDCtx() to create/release resources.
+*  Use ZBUFFv07_decompressInit() to start a new decompression operation,
+*   or ZBUFFv07_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFFv07_DCtx objects can be re-init multiple times.
+*
+*  Use ZBUFFv07_decompressContinue() repeatedly to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of @dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change @dst.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency),
+*            or 0 when a frame is completely decoded,
+*            or an error code, which can be tested using ZBUFFv07_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFFv07_recommendedDInSize() and ZBUFFv07_recommendedDOutSize()
+*  output : ZBUFFv07_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFFv07_recommendedDInSize == 128KB + 3;
+*           just follow indications from ZBUFFv07_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+/* Stages of the buffered streaming decoder :
+ * - ZBUFFds_init       : no operation started ; ZBUFFv07_decompressContinue() reports init_missing
+ * - ZBUFFds_loadHeader : set by ZBUFFv07_decompressInitDictionary() ; frame header bytes are being collected
+ * - ZBUFFds_read       : input is decoded directly from the caller's buffer when enough is available
+ * - ZBUFFds_load       : input is accumulated first (presumably into the internal input buffer - TODO confirm)
+ * - ZBUFFds_flush      : entered after a block was decoded into the internal output buffer
+ *                        (presumably to hand decoded data to the caller - TODO confirm) */
+typedef enum { ZBUFFds_init, ZBUFFds_loadHeader,
+               ZBUFFds_read, ZBUFFds_load, ZBUFFds_flush } ZBUFFv07_dStage;
+
+/* *** Resource management *** */
+/* State of a buffered streaming decompression operation */
+struct ZBUFFv07_DCtx_s {
+    ZSTDv07_DCtx* zd;               /* underlying decompression context, created and released with this object */
+    ZSTDv07_frameParams fParams;    /* frame parameters, filled by ZSTDv07_getFrameParams() while loading the header */
+    ZBUFFv07_dStage stage;          /* current stage of the state machine */
+    char*  inBuff;                  /* internal input buffer, (re)allocated once the frame header is known */
+    size_t inBuffSize;              /* allocated size of `inBuff` */
+    size_t inPos;                   /* presumably nb of bytes currently stored in `inBuff` - TODO confirm */
+    char*  outBuff;                 /* internal output buffer, (re)allocated once the frame header is known */
+    size_t outBuffSize;             /* allocated size of `outBuff` */
+    size_t outStart;                /* offset in `outBuff` where the next block gets decoded */
+    size_t outEnd;                  /* `outStart` + size of the last decoded block */
+    size_t blockSize;               /* MIN(window size, ZSTDv07_BLOCKSIZE_ABSOLUTEMAX) for the current frame */
+    BYTE headerBuffer[ZSTDv07_FRAMEHEADERSIZE_MAX];   /* frame header bytes collected so far */
+    size_t lhSize;                  /* nb of valid bytes in `headerBuffer` */
+    ZSTDv07_customMem customMem;    /* allocation functions used for this object and its buffers */
+};   /* typedef'd to ZBUFFv07_DCtx within "zstd_buffered.h" */
+
+ZSTDLIBv07_API ZBUFFv07_DCtx* ZBUFFv07_createDCtx_advanced(ZSTDv07_customMem customMem);
+
+/*! ZBUFFv07_createDCtx() :
+*   Creates a streaming decompression context using the default allocation functions.
+*   @return : same as ZBUFFv07_createDCtx_advanced() */
+ZBUFFv07_DCtx* ZBUFFv07_createDCtx(void)
+{
+    ZSTDv07_customMem const mem = defaultCustomMem;
+    return ZBUFFv07_createDCtx_advanced(mem);
+}
+
+/*! ZBUFFv07_createDCtx_advanced() :
+*   Creates a streaming decompression context using the allocation functions of `customMem`.
+*   When both `customAlloc` and `customFree` are NULL, the default allocator is used ;
+*   providing only one of the two is rejected.
+*   The new object is zero-filled, owns a fresh ZSTDv07_DCtx, and starts in stage ZBUFFds_init.
+*   @return : the new context, or NULL on failure */
+ZBUFFv07_DCtx* ZBUFFv07_createDCtx_advanced(ZSTDv07_customMem customMem)
+{
+    if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem;
+    if (!customMem.customAlloc || !customMem.customFree) return NULL;
+
+    {   ZBUFFv07_DCtx* const ctx = (ZBUFFv07_DCtx*)customMem.customAlloc(customMem.opaque, sizeof(*ctx));
+        if (ctx == NULL) return NULL;
+
+        memset(ctx, 0, sizeof(*ctx));
+        ctx->customMem = customMem;
+        ctx->zd = ZSTDv07_createDCtx_advanced(customMem);
+        if (ctx->zd == NULL) {
+            ZBUFFv07_freeDCtx(ctx);   /* releases the partially built object */
+            return NULL;
+        }
+        ctx->stage = ZBUFFds_init;
+        return ctx;
+    }
+}
+
+/*! ZBUFFv07_freeDCtx() :
+*   Releases the underlying ZSTDv07_DCtx, both internal buffers (when allocated), then the object itself,
+*   using the free function stored in the object.
+*   Accepts NULL.
+*   @return : 0 */
+size_t ZBUFFv07_freeDCtx(ZBUFFv07_DCtx* zbd)
+{
+    if (zbd != NULL) {
+        ZSTDv07_customMem const mem = zbd->customMem;
+        ZSTDv07_freeDCtx(zbd->zd);
+        if (zbd->inBuff != NULL) mem.customFree(mem.opaque, zbd->inBuff);
+        if (zbd->outBuff != NULL) mem.customFree(mem.opaque, zbd->outBuff);
+        mem.customFree(mem.opaque, zbd);
+    }
+    return 0;
+}
+
+
+/* *** Initialization *** */
+
+/*! ZBUFFv07_decompressInitDictionary() :
+*   Starts a new streaming decompression operation :
+*   resets the header / input / output positions, moves to stage ZBUFFds_loadHeader,
+*   then (re)starts the underlying context with `dict`.
+*   @return : result of ZSTDv07_decompressBegin_usingDict() */
+size_t ZBUFFv07_decompressInitDictionary(ZBUFFv07_DCtx* zbd, const void* dict, size_t dictSize)
+{
+    zbd->outEnd = 0;
+    zbd->outStart = 0;
+    zbd->inPos = 0;
+    zbd->lhSize = 0;
+    zbd->stage = ZBUFFds_loadHeader;
+    return ZSTDv07_decompressBegin_usingDict(zbd->zd, dict, dictSize);
+}
+
+/*! ZBUFFv07_decompressInit() :
+*   Starts a new streaming decompression operation without dictionary.
+*   @return : same as ZBUFFv07_decompressInitDictionary() */
+size_t ZBUFFv07_decompressInit(ZBUFFv07_DCtx* zbd)
+{
+    const void* const noDict = NULL;
+    return ZBUFFv07_decompressInitDictionary(zbd, noDict, 0);
+}
+
+
+/* internal util function :
+ * copies as many bytes as both buffers allow, i.e. MIN(dstCapacity, srcSize), from `src` to `dst`.
+ * @return : nb of bytes copied */
+MEM_STATIC size_t ZBUFFv07_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    size_t const length = MIN(dstCapacity, srcSize);
+    if (length > 0) {   /* either pointer may be NULL when its size is 0 ; memcpy() must not receive NULL */
+        memcpy(dst, src, length);
+    }
+    return length;
+}
+
+
+/* *** Decompression *** */
+
+size_t ZBUFFv07_decompressContinue(ZBUFFv07_DCtx* zbd,
+                                void* dst, size_t* dstCapacityPtr,
+                          const void* src, size_t* srcSizePtr)
+{
+    const char* const istart = (const char*)src;
+    const char* const iend = istart + *srcSizePtr;
+    const char* ip = istart;
+    char* const ostart = (char*)dst;
+    char* const oend = ostart + *dstCapacityPtr;
+    char* op = ostart;
+    U32 notDone = 1;
+
+    while (notDone) {
+        switch(zbd->stage)
+        {
+        case ZBUFFds_init :
+            return ERROR(init_missing);
+
+        case ZBUFFds_loadHeader :
+            {   size_t const hSize = ZSTDv07_getFrameParams(&(zbd->fParams), zbd->headerBuffer, zbd->lhSize);
+                if (ZSTDv07_isError(hSize)) return hSize;
+                if (hSize != 0) {
+                    size_t const toLoad = hSize - zbd->lhSize;   /* if hSize!=0, hSize > zbd->lhSize */
+                    if (toLoad > (size_t)(iend-ip)) {   /* not enough input to load full header */
+                        memcpy(zbd->headerBuffer + zbd->lhSize, ip, iend-ip);
+                        zbd->lhSize += iend-ip;
+                        *dstCapacityPtr = 0;
+                        return (hSize - zbd->lhSize) + ZSTDv07_blockHeaderSize;   /* remaining header bytes + next block header */
+                    }
+                    memcpy(zbd->headerBuffer + zbd->lhSize, ip, toLoad); zbd->lhSize = hSize; ip += toLoad;
+                    break;
+            }   }
+
+            /* Consume header */
+            {   size_t const h1Size = ZSTDv07_nextSrcSizeToDecompress(zbd->zd);  /* == ZSTDv07_frameHeaderSize_min */
+                size_t const h1Result = ZSTDv07_decompressContinue(zbd->zd, NULL, 0, zbd->headerBuffer, h1Size);
+                if (ZSTDv07_isError(h1Result)) return h1Result;
+                if (h1Size < zbd->lhSize) {   /* long header */
+                    size_t const h2Size = ZSTDv07_nextSrcSizeToDecompress(zbd->zd);
+                    size_t const h2Result = ZSTDv07_decompressContinue(zbd->zd, NULL, 0, zbd->headerBuffer+h1Size, h2Size);
+                    if (ZSTDv07_isError(h2Result)) return h2Result;
+            }   }
+
+            zbd->fParams.windowSize = MAX(zbd->fParams.windowSize, 1U << ZSTDv07_WINDOWLOG_ABSOLUTEMIN);
+
+            /* Frame header instruct buffer sizes */
+            {   size_t const blockSize = MIN(zbd->fParams.windowSize, ZSTDv07_BLOCKSIZE_ABSOLUTEMAX);
+                zbd->blockSize = blockSize;
+                if (zbd->inBuffSize < blockSize) {
+                    zbd->customMem.customFree(zbd->customMem.opaque, zbd->inBuff);
+                    zbd->inBuffSize = blockSize;
+                    zbd->inBuff = (char*)zbd->customMem.customAlloc(zbd->customMem.opaque, blockSize);
+                    if (zbd->inBuff == NULL) return ERROR(memory_allocation);
+                }
+                {   size_t const neededOutSize = zbd->fParams.windowSize + blockSize + WILDCOPY_OVERLENGTH * 2;
+                    if (zbd->outBuffSize < neededOutSize) {
+                        zbd->customMem.customFree(zbd->customMem.opaque, zbd->outBuff);
+                        zbd->outBuffSize = neededOutSize;
+                        zbd->outBuff = (char*)zbd->customMem.customAlloc(zbd->customMem.opaque, neededOutSize);
+                        if (zbd->outBuff == NULL) return ERROR(memory_allocation);
+            }   }   }
+            zbd->stage = ZBUFFds_read;
+            /* pass-through */
+	    /* fall-through */
+        case ZBUFFds_read:
+            {   size_t const neededInSize = ZSTDv07_nextSrcSizeToDecompress(zbd->zd);
+                if (neededInSize==0) {  /* end of frame */
+                    zbd->stage = ZBUFFds_init;
+                    notDone = 0;
+                    break;
+                }
+                if ((size_t)(iend-ip) >= neededInSize) {  /* decode directly from src */
+                    const int isSkipFrame = ZSTDv07_isSkipFrame(zbd->zd);
+                    size_t const decodedSize = ZSTDv07_decompressContinue(zbd->zd,
+                        zbd->outBuff + zbd->outStart, (isSkipFrame ? 0 : zbd->outBuffSize - zbd->outStart),
+                        ip, neededInSize);
+                    if (ZSTDv07_isError(decodedSize)) return decodedSize;
+                    ip += neededInSize;
+                    if (!decodedSize && !isSkipFrame) break;   /* this was just a header */
+                    zbd->outEnd = zbd->outStart +  decodedSize;
+                    zbd->stage = ZBUFFds_flush;
+                    break;
+                }
+                if (ip==iend) { notDone = 0; break; }   /* no more input */
+                zbd->stage = ZBUFFds_load;
+            }
+	    /* fall-through */
+        case ZBUFFds_load:
+            {   size_t const neededInSize = ZSTDv07_nextSrcSizeToDecompress(zbd->zd);
+                size_t const toLoad = neededInSize - zbd->inPos;   /* should always be <= remaining space within inBuff */
+                size_t loadedSize;
+                if (toLoad > zbd->inBuffSize - zbd->inPos) return ERROR(corruption_detected);   /* should never happen */
+                loadedSize = ZBUFFv07_limitCopy(zbd->inBuff + zbd->inPos, toLoad, ip, iend-ip);
+                ip += loadedSize;
+                zbd->inPos += loadedSize;
+                if (loadedSize < toLoad) { notDone = 0; break; }   /* not enough input, wait for more */
+
+                /* decode loaded input */
+                {  const int isSkipFrame = ZSTDv07_isSkipFrame(zbd->zd);
+                   size_t const decodedSize = ZSTDv07_decompressContinue(zbd->zd,
+                        zbd->outBuff + zbd->outStart, zbd->outBuffSize - zbd->outStart,
+                        zbd->inBuff, neededInSize);
+                    if (ZSTDv07_isError(decodedSize)) return decodedSize;
+                    zbd->inPos = 0;   /* input is consumed */
+                    if (!decodedSize && !isSkipFrame) { zbd->stage = ZBUFFds_read; break; }   /* this was just a header */
+                    zbd->outEnd = zbd->outStart +  decodedSize;
+                    zbd->stage = ZBUFFds_flush;
+                    /* break; */
+                    /* pass-through */
+                }
+	    }
+	    /* fall-through */
+        case ZBUFFds_flush:
+            {   size_t const toFlushSize = zbd->outEnd - zbd->outStart;
+                size_t const flushedSize = ZBUFFv07_limitCopy(op, oend-op, zbd->outBuff + zbd->outStart, toFlushSize);
+                op += flushedSize;
+                zbd->outStart += flushedSize;
+                if (flushedSize == toFlushSize) {
+                    zbd->stage = ZBUFFds_read;
+                    if (zbd->outStart + zbd->blockSize > zbd->outBuffSize)
+                        zbd->outStart = zbd->outEnd = 0;
+                    break;
+                }
+                /* cannot flush everything */
+                notDone = 0;
+                break;
+            }
+        default: return ERROR(GENERIC);   /* impossible */
+    }   }
+
+    /* result */
+    *srcSizePtr = ip-istart;
+    *dstCapacityPtr = op-ostart;
+    {   size_t nextSrcSizeHint = ZSTDv07_nextSrcSizeToDecompress(zbd->zd);
+        nextSrcSizeHint -= zbd->inPos;   /* already loaded*/
+        return nextSrcSizeHint;
+    }
+}
+
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+size_t ZBUFFv07_recommendedDInSize(void)  { return ZSTDv07_BLOCKSIZE_ABSOLUTEMAX + ZSTDv07_blockHeaderSize /* maximum block size plus one block header */ ; }
+size_t ZBUFFv07_recommendedDOutSize(void) { return ZSTDv07_BLOCKSIZE_ABSOLUTEMAX; }   /* maximum block size */
diff --git a/deps/SZ/zstd/legacy/zstd_v07.h b/deps/SZ/zstd/legacy/zstd_v07.h
new file mode 100644
index 0000000000000000000000000000000000000000..6591cd3014b07bf850d6d1c4499a907cea34859a
--- /dev/null
+++ b/deps/SZ/zstd/legacy/zstd_v07.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv07_H_235446
+#define ZSTDv07_H_235446
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*======  Dependency  ======*/
+#include <stddef.h>   /* size_t */
+
+
+/*======  Export for Windows  ======*/
+/*!
+*  ZSTDv07_DLL_EXPORT :
+*  Enable exporting of functions when building a Windows DLL
+*/
+#if defined(_WIN32) && defined(ZSTDv07_DLL_EXPORT) && (ZSTDv07_DLL_EXPORT==1)
+#  define ZSTDLIBv07_API __declspec(dllexport)
+#else
+#  define ZSTDLIBv07_API
+#endif
+
+
+/* *************************************
+*  Simple API
+***************************************/
+/*! ZSTDv07_getDecompressedSize() :
+*   @return : decompressed size if known, 0 otherwise.
+       note 1 : if `0`, follow up with ZSTDv07_getFrameParams() to know precise failure cause.
+       note 2 : decompressed size could be wrong or intentionally modified !
+                always ensure results fit within application's authorized limits */
+unsigned long long ZSTDv07_getDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTDv07_decompress() :
+    `compressedSize` : must be _exact_ size of compressed input, otherwise decompression will fail.
+    `dstCapacity` must be equal or larger than originalSize.
+    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+              or an errorCode if it fails (which can be tested using ZSTDv07_isError()) */
+ZSTDLIBv07_API size_t ZSTDv07_decompress( void* dst, size_t dstCapacity,
+                                    const void* src, size_t compressedSize);
+
+/**
+ZSTDv07_findFrameCompressedSize() : get the source length of a ZSTD frame
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv07_isError())
+*/
+size_t ZSTDv07_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+/*======  Helper functions  ======*/
+ZSTDLIBv07_API unsigned    ZSTDv07_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+ZSTDLIBv07_API const char* ZSTDv07_getErrorName(size_t code);     /*!< provides readable string from an error code */
+
+
+/*-*************************************
+*  Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv07_DCtx_s ZSTDv07_DCtx;
+ZSTDLIBv07_API ZSTDv07_DCtx* ZSTDv07_createDCtx(void);
+ZSTDLIBv07_API size_t     ZSTDv07_freeDCtx(ZSTDv07_DCtx* dctx);      /*!< @return : errorCode */
+
+/** ZSTDv07_decompressDCtx() :
+*   Same as ZSTDv07_decompress(), requires an allocated ZSTDv07_DCtx (see ZSTDv07_createDCtx()) */
+ZSTDLIBv07_API size_t ZSTDv07_decompressDCtx(ZSTDv07_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-************************
+*  Simple dictionary API
+***************************/
+/*! ZSTDv07_decompress_usingDict() :
+*   Decompression using a pre-defined Dictionary content (see dictBuilder).
+*   Dictionary must be identical to the one used during compression.
+*   Note : This function loads the dictionary, resulting in a significant startup time */
+ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDict(ZSTDv07_DCtx* dctx,
+                                                   void* dst, size_t dstCapacity,
+                                             const void* src, size_t srcSize,
+                                             const void* dict,size_t dictSize);
+
+
+/*-**************************
+*  Advanced Dictionary API
+****************************/
+/*! ZSTDv07_createDDict() :
+*   Create a digested dictionary, ready to start decompression operation without startup delay.
+*   `dict` can be released after creation */
+typedef struct ZSTDv07_DDict_s ZSTDv07_DDict;
+ZSTDLIBv07_API ZSTDv07_DDict* ZSTDv07_createDDict(const void* dict, size_t dictSize);
+ZSTDLIBv07_API size_t      ZSTDv07_freeDDict(ZSTDv07_DDict* ddict);
+
+/*! ZSTDv07_decompress_usingDDict() :
+*   Decompression using a pre-digested Dictionary
+*   Faster startup than ZSTDv07_decompress_usingDict(), recommended when same dictionary is used multiple times. */
+ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDDict(ZSTDv07_DCtx* dctx,
+                                                    void* dst, size_t dstCapacity,
+                                              const void* src, size_t srcSize,
+                                              const ZSTDv07_DDict* ddict);
+
+typedef struct {
+    unsigned long long frameContentSize;   /* NOTE(review): presumably the decompressed size announced by the frame header - confirm */
+    unsigned windowSize;                   /* the ZBUFFv07 streaming decoder derives its block size and buffer sizes from this value */
+    unsigned dictID;                       /* NOTE(review): presumably the ID of the dictionary the frame expects - confirm */
+    unsigned checksumFlag;                 /* NOTE(review): presumably non-zero when the frame carries a checksum - confirm */
+} ZSTDv07_frameParams;
+
+ZSTDLIBv07_API size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src, size_t srcSize);   /**< doesn't consume input */
+
+
+
+
+/* *************************************
+*  Streaming functions
+***************************************/
+typedef struct ZBUFFv07_DCtx_s ZBUFFv07_DCtx;
+ZSTDLIBv07_API ZBUFFv07_DCtx* ZBUFFv07_createDCtx(void);
+ZSTDLIBv07_API size_t      ZBUFFv07_freeDCtx(ZBUFFv07_DCtx* dctx);
+
+ZSTDLIBv07_API size_t ZBUFFv07_decompressInit(ZBUFFv07_DCtx* dctx);
+ZSTDLIBv07_API size_t ZBUFFv07_decompressInitDictionary(ZBUFFv07_DCtx* dctx, const void* dict, size_t dictSize);
+
+ZSTDLIBv07_API size_t ZBUFFv07_decompressContinue(ZBUFFv07_DCtx* dctx,
+                                            void* dst, size_t* dstCapacityPtr,
+                                      const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+*  Streaming decompression howto
+*
+*  A ZBUFFv07_DCtx object is required to track streaming operations.
+*  Use ZBUFFv07_createDCtx() and ZBUFFv07_freeDCtx() to create/release resources.
+*  Use ZBUFFv07_decompressInit() to start a new decompression operation,
+*   or ZBUFFv07_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFFv07_DCtx objects can be re-init multiple times.
+*
+*  Use ZBUFFv07_decompressContinue() repeatedly to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency),
+*            or 0 when a frame is completely decoded,
+*            or an error code, which can be tested using ZBUFFv07_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFFv07_recommendedDInSize() and ZBUFFv07_recommendedDOutSize()
+*  output : ZBUFFv07_recommendedDOutSize() == 128 KB. Block size is the internal unit, so this size ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFFv07_recommendedDInSize == 128KB + 3;
+*           just follow indications from ZBUFFv07_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+ZSTDLIBv07_API unsigned ZBUFFv07_isError(size_t errorCode);
+ZSTDLIBv07_API const char* ZBUFFv07_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for decompression operations.
+*   These sizes are just hints; they tend to offer better latency */
+ZSTDLIBv07_API size_t ZBUFFv07_recommendedDInSize(void);
+ZSTDLIBv07_API size_t ZBUFFv07_recommendedDOutSize(void);
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define ZSTDv07_MAGICNUMBER            0xFD2FB527   /* v0.7 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv07_H_235446 */
diff --git a/deps/SZ/zstd/zstd.h b/deps/SZ/zstd/zstd.h
new file mode 100644
index 0000000000000000000000000000000000000000..73821751338edb2056a9909d8c527e6859fb3bf9
--- /dev/null
+++ b/deps/SZ/zstd/zstd.h
@@ -0,0 +1,1468 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef ZSTD_H_235446
+#define ZSTD_H_235446
+
+/* ======   Dependency   ======*/
+#include <stddef.h>   /* size_t */
+
+void showme(void);   /* takes no arguments; NOTE(review): undocumented helper, purpose not visible in this header - confirm against its definition */
+
+/* =====   ZSTDLIB_API : control library symbols visibility   ===== */
+#ifndef ZSTDLIB_VISIBILITY
+#  if defined(__GNUC__) && (__GNUC__ >= 4)
+#    define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#  else
+#    define ZSTDLIB_VISIBILITY
+#  endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* Not required, but it lets the compiler generate better code, saving a function pointer load from the IAT and an indirect jump. */
+#else
+#  define ZSTDLIB_API ZSTDLIB_VISIBILITY
+#endif
+
+
+/*******************************************************************************************************
+  Introduction
+
+  zstd, short for Zstandard, is a fast lossless compression algorithm,
+  targeting real-time compression scenarios at zlib-level and better compression ratios.
+  The zstd compression library provides in-memory compression and decompression functions.
+  The library supports compression levels from 1 up to ZSTD_maxCLevel() which is currently 22.
+  Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory.
+  Compression can be done in:
+    - a single step (described as Simple API)
+    - a single step, reusing a context (described as Explicit context)
+    - unbounded multiple steps (described as Streaming compression)
+  The compression ratio achievable on small data can be highly improved using a dictionary in:
+    - a single step (described as Simple dictionary API)
+    - a single step, reusing a dictionary (described as Bulk-processing dictionary API)
+
+  Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h.
+  Advanced experimental APIs shall never be used with a dynamic library.
+  They are not "stable", their definition may change in the future. Only static linking is allowed.
+*********************************************************************************************************/
+
+/*------   Version   ------*/
+#define ZSTD_VERSION_MAJOR    1
+#define ZSTD_VERSION_MINOR    3
+#define ZSTD_VERSION_RELEASE  5
+
+#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+ZSTDLIB_API unsigned ZSTD_versionNumber(void);   /**< useful to check dll version */
+
+#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE
+#define ZSTD_QUOTE(str) #str
+#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str)
+#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION)
+ZSTDLIB_API const char* ZSTD_versionString(void);   /* added in v1.3.0 */
+
+/***************************************
+*  Default constant
+***************************************/
+#ifndef ZSTD_CLEVEL_DEFAULT
+#  define ZSTD_CLEVEL_DEFAULT 3
+#endif
+
+/***************************************
+*  Simple API
+***************************************/
+/*! ZSTD_compress() :
+ *  Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
+ *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ *  @return : compressed size written into `dst` (<= `dstCapacity`),
+ *            or an error code if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                                  int compressionLevel);
+
+/*! ZSTD_decompress() :
+ *  `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
+ *  `dstCapacity` is an upper bound of originalSize to regenerate.
+ *  If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
+ *  @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ *            or an errorCode if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
+                              const void* src, size_t compressedSize);
+
+/*! ZSTD_getFrameContentSize() : added in v1.3.0
+ *  `src` should point to the start of a ZSTD encoded frame.
+ *  `srcSize` must be at least as large as the frame header.
+ *            hint : any size >= `ZSTD_frameHeaderSize_max` is large enough.
+ *  @return : - decompressed size of `src` frame content, if known
+ *            - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ *            - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small)
+ *   note 1 : a 0 return value means the frame is valid but "empty".
+ *   note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode.
+ *            When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *            Optionally, application can rely on some implicit limit,
+ *            as ZSTD_decompress() only needs an upper bound of decompressed size.
+ *            (For example, data could be necessarily cut into blocks <= 16 KB).
+ *   note 3 : decompressed size is always present when compression is completed using single-pass functions,
+ *            such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict().
+ *   note 4 : decompressed size can be very large (64-bits value),
+ *            potentially larger than what local system can handle as a single memory segment.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ *            Always ensure return value fits within application's authorized limits.
+ *            Each application can set its own limits.
+ *   note 6 : This function replaces ZSTD_getDecompressedSize() */
+#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
+#define ZSTD_CONTENTSIZE_ERROR   (0ULL - 2)
+ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
+
+/*! ZSTD_getDecompressedSize() :
+ *  NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize().
+ *  Both functions work the same way, but ZSTD_getDecompressedSize() blends
+ *  "empty", "unknown" and "error" results to the same return value (0),
+ *  while ZSTD_getFrameContentSize() gives them separate return values.
+ * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */
+ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+
+
+/*======  Helper functions  ======*/
+#define ZSTD_COMPRESSBOUND(srcSize)   ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t      ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+ZSTDLIB_API unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
+ZSTDLIB_API int         ZSTD_maxCLevel(void);               /*!< maximum compression level available */
+
+
+/***************************************
+*  Explicit context
+***************************************/
+/*= Compression context
+ *  When compressing many times,
+ *  it is recommended to allocate a context just once, and re-use it for each successive compression operation.
+ *  This will make workload friendlier for system's memory.
+ *  Use one context per thread for parallel execution in multi-threaded environments. */
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void);
+ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);
+
+/*! ZSTD_compressCCtx() :
+ *  Same as ZSTD_compress(), requires an allocated ZSTD_CCtx (see ZSTD_createCCtx()). */
+ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* ctx,
+                                     void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                                     int compressionLevel);
+
+/*= Decompression context
+ *  When decompressing many times,
+ *  it is recommended to allocate a context only once,
+ *  and re-use it for each successive decompression operation.
+ *  This will make workload friendlier for system's memory.
+ *  Use one context per thread for parallel execution. */
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
+ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
+
+/*! ZSTD_decompressDCtx() :
+ *  Same as ZSTD_decompress(), requires an allocated ZSTD_DCtx (see ZSTD_createDCtx()) */
+ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* ctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize);
+
+
+/**************************
+*  Simple dictionary API
+***************************/
+/*! ZSTD_compress_usingDict() :
+ *  Compression using a predefined Dictionary (see dictBuilder/zdict.h).
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *  Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     const void* dict,size_t dictSize,
+                                           int compressionLevel);
+
+/*! ZSTD_decompress_usingDict() :
+ *  Decompression using a predefined Dictionary (see dictBuilder/zdict.h).
+ *  Dictionary must be identical to the one used during compression.
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *  Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+                                             void* dst, size_t dstCapacity,
+                                       const void* src, size_t srcSize,
+                                       const void* dict,size_t dictSize);
+
+
+/**********************************
+ *  Bulk processing dictionary API
+ *********************************/
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/*! ZSTD_createCDict() :
+ *  When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
+ *  ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
+ *  ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+ *  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
+                                         int compressionLevel);
+
+/*! ZSTD_freeCDict() :
+ *  Function frees memory allocated by ZSTD_createCDict(). */
+ZSTDLIB_API size_t      ZSTD_freeCDict(ZSTD_CDict* CDict);
+
+/*! ZSTD_compress_usingCDict() :
+ *  Compression using a digested Dictionary.
+ *  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
+ *  Note that compression level is decided during dictionary creation.
+ *  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+                                            void* dst, size_t dstCapacity,
+                                      const void* src, size_t srcSize,
+                                      const ZSTD_CDict* cdict);
+
+
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/*! ZSTD_createDDict() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  dictBuffer can be released after DDict creation, as its content is copied inside DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_freeDDict() :
+ *  Function frees memory allocated with ZSTD_createDDict() */
+ZSTDLIB_API size_t      ZSTD_freeDDict(ZSTD_DDict* ddict);
+
+/*! ZSTD_decompress_usingDDict() :
+ *  Decompression using a digested Dictionary.
+ *  Faster startup than ZSTD_decompress_usingDict(), recommended when same dictionary is used multiple times. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+                                              void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize,
+                                        const ZSTD_DDict* ddict);
+
+
+/****************************
+*  Streaming
+****************************/
+
+typedef struct ZSTD_inBuffer_s {
+  const void* src;    /**< start of input buffer */
+  size_t size;        /**< size of input buffer */
+  size_t pos;         /**< position where reading stopped; updated by the streaming functions. Always 0 <= pos <= size */
+} ZSTD_inBuffer;
+
+typedef struct ZSTD_outBuffer_s {
+  void*  dst;         /**< start of output buffer */
+  size_t size;        /**< size of output buffer */
+  size_t pos;         /**< position where writing stopped; updated by the streaming functions. Always 0 <= pos <= size */
+} ZSTD_outBuffer;
+
+
+
+/*-***********************************************************************
+*  Streaming compression - HowTo
+*
+*  A ZSTD_CStream object is required to track streaming operation.
+*  Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
+*  ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
+*  It is recommended to re-use ZSTD_CStream in situations where many streaming operations will be performed consecutively,
+*  since it will play nicer with system's memory, by re-using already allocated memory.
+*  Use one separate ZSTD_CStream per thread for parallel execution.
+*
+*  Start a new compression by initializing ZSTD_CStream context.
+*  Use ZSTD_initCStream() to start a new compression operation.
+*  Use variants ZSTD_initCStream_usingDict() or ZSTD_initCStream_usingCDict() for streaming with dictionary (experimental section)
+*
+*  Use ZSTD_compressStream() as many times as necessary to consume input stream.
+*  The function will automatically update both `pos` fields within `input` and `output`.
+*  Note that the function may not consume the entire input,
+*  for example, because the output buffer is already full,
+*  in which case `input.pos < input.size`.
+*  The caller must check if input has been entirely consumed.
+*  If not, the caller must make some room to receive more compressed data,
+*  typically by emptying output buffer, or allocating a new output buffer,
+*  and then present again remaining input data.
+*  @return : a size hint, preferred nb of bytes to use as input for next function call
+*            or an error code, which can be tested using ZSTD_isError().
+*            Note 1 : it's just a hint, to help latency a little, any other value will work fine.
+*            Note 2 : size hint is guaranteed to be <= ZSTD_CStreamInSize()
+*
+*  At any moment, it's possible to flush whatever data might remain stuck within internal buffer,
+*  using ZSTD_flushStream(). `output->pos` will be updated.
+*  Note that, if `output->size` is too small, a single invocation of ZSTD_flushStream() might not be enough (return code > 0).
+*  In which case, make some room to receive more compressed data, and call again ZSTD_flushStream().
+*  @return : 0 if internal buffers are entirely flushed,
+*            >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+*            or an error code, which can be tested using ZSTD_isError().
+*
+*  ZSTD_endStream() instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  The epilogue is required for decoders to consider a frame completed.
+*  flush() operation is the same, and follows same rules as ZSTD_flushStream().
+*  @return : 0 if frame fully completed and fully flushed,
+*            >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+*            or an error code, which can be tested using ZSTD_isError().
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream;  /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
+                                 /* Continue to distinguish them for compatibility with versions <= v1.2.0 */
+/*===== ZSTD_CStream management functions =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
+
+/*===== Streaming compression functions =====*/
+ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void);    /**< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block in all circumstances. */
+
+
+
+/*-***************************************************************************
+*  Streaming decompression - HowTo
+*
+*  A ZSTD_DStream object is required to track streaming operations.
+*  Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+*  ZSTD_DStream objects can be re-used multiple times.
+*
+*  Use ZSTD_initDStream() to start a new decompression operation,
+*   or ZSTD_initDStream_usingDict() if decompression requires a dictionary.
+*   @return : recommended first input size
+*
+*  Use ZSTD_decompressStream() repetitively to consume your input.
+*  The function will update both `pos` fields.
+*  If `input.pos < input.size`, some input has not been consumed.
+*  It's up to the caller to present again remaining data.
+*  If `output.pos < output.size`, decoder has flushed everything it could.
+*  @return : 0 when a frame is completely decoded and fully flushed,
+*            an error code, which can be tested using ZSTD_isError(),
+*            any other value > 0, which means there is still some decoding to do to complete current frame.
+*            The return value is a suggested next input size (a hint to improve latency) that will never load more than the current frame.
+* *******************************************************************************/
+
+typedef ZSTD_DCtx ZSTD_DStream;  /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
+                                 /* For compatibility with versions <= v1.2.0, continue to consider them separated. */
+/*===== ZSTD_DStream management functions =====*/
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);
+
+/*===== Streaming decompression functions =====*/
+ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void);    /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void);   /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */
+
+#endif  /* ZSTD_H_235446 */
+
+
+
+/****************************************************************************************
+ * START OF ADVANCED AND EXPERIMENTAL FUNCTIONS
+ * The definitions in this section are considered experimental.
+ * They should never be used with a dynamic library, as prototypes may change in the future.
+ * They are provided for advanced scenarios.
+ * Use them only in association with static linking.
+ * ***************************************************************************************/
+
+#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
+#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+
+/* --- Constants ---*/
+#define ZSTD_MAGICNUMBER            0xFD2FB528   /* >= v0.8.0 */
+#define ZSTD_MAGIC_SKIPPABLE_START  0x184D2A50U
+#define ZSTD_MAGIC_DICTIONARY       0xEC30A437   /* >= v0.7.0 */
+
+#define ZSTD_WINDOWLOG_MAX_32   30
+#define ZSTD_WINDOWLOG_MAX_64   31
+#define ZSTD_WINDOWLOG_MAX    ((unsigned)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN      10
+#define ZSTD_HASHLOG_MAX      ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
+#define ZSTD_HASHLOG_MIN         6
+#define ZSTD_CHAINLOG_MAX_32    29
+#define ZSTD_CHAINLOG_MAX_64    30
+#define ZSTD_CHAINLOG_MAX     ((unsigned)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
+#define ZSTD_CHAINLOG_MIN       ZSTD_HASHLOG_MIN
+#define ZSTD_HASHLOG3_MAX       17
+#define ZSTD_SEARCHLOG_MAX     (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN       1
+#define ZSTD_SEARCHLENGTH_MAX    7   /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_SEARCHLENGTH_MIN    3   /* only for ZSTD_btopt, other strategies are limited to 4 */
+#define ZSTD_LDM_MINMATCH_MIN    4
+#define ZSTD_LDM_MINMATCH_MAX 4096
+#define ZSTD_LDM_BUCKETSIZELOG_MAX 8
+
+#define ZSTD_FRAMEHEADERSIZE_PREFIX 5   /* minimum input size to know frame header size */
+#define ZSTD_FRAMEHEADERSIZE_MIN    6
+#define ZSTD_FRAMEHEADERSIZE_MAX   18   /* for static allocation */
+static const size_t ZSTD_frameHeaderSize_prefix = ZSTD_FRAMEHEADERSIZE_PREFIX;
+static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN;
+static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX;
+static const size_t ZSTD_skippableHeaderSize = 8;  /* magic number + skippable frame length */
+
+
+/*--- Advanced types ---*/
+typedef enum { ZSTD_fast=1, ZSTD_dfast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2,
+               ZSTD_btlazy2, ZSTD_btopt, ZSTD_btultra } ZSTD_strategy;   /* from faster to stronger */
+
+typedef struct {
+    unsigned windowLog;      /**< largest match distance : larger == more compression, more memory needed during decompression */
+    unsigned chainLog;       /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+    unsigned hashLog;        /**< dispatch table : larger == faster, more memory */
+    unsigned searchLog;      /**< nb of searches : larger == more compression, slower */
+    unsigned searchLength;   /**< match length searched : larger == faster decompression, sometimes less compression */
+    unsigned targetLength;   /**< acceptable match size for optimal parser (only) : larger == more compression, slower */
+    ZSTD_strategy strategy;  /**< one of the ZSTD_strategy values, which are ordered from faster to stronger */
+} ZSTD_compressionParameters;
+
+typedef struct {
+    unsigned contentSizeFlag; /**< 1: content size will be in frame header (when known) */
+    unsigned checksumFlag;    /**< 1: generate a 32-bits checksum at end of frame, for error detection */
+    unsigned noDictIDFlag;    /**< 1: no dictID will be saved into frame header (if dictionary compression) */
+} ZSTD_frameParameters;
+
+typedef struct {
+    ZSTD_compressionParameters cParams;   /**< compression parameters (windowLog, chainLog, hashLog, searchLog, searchLength, targetLength, strategy) */
+    ZSTD_frameParameters fParams;         /**< frame parameters (contentSizeFlag, checksumFlag, noDictIDFlag) */
+} ZSTD_parameters;
+
+typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
+
+typedef enum {
+    ZSTD_dct_auto=0,      /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+    ZSTD_dct_rawContent,  /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+    ZSTD_dct_fullDict     /* refuses to load a dictionary if it does not respect Zstandard's specification */
+} ZSTD_dictContentType_e;
+
+typedef enum {
+    ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */
+    ZSTD_dlm_byRef       /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
+} ZSTD_dictLoadMethod_e;   /* no trailing comma after the last enumerator : keeps the header valid in strict C90 / C++98 */
+
+
+
+/***************************************
+*  Frame size functions
+***************************************/
+
+/*! ZSTD_findFrameCompressedSize() :
+ *  `src` should point to the start of a ZSTD encoded frame or skippable frame
+ *  `srcSize` must be >= first frame size
+ *  @return : the compressed size of the first frame starting at `src`,
+ *            suitable to pass to `ZSTD_decompress` or similar,
+ *            or an error code if input is invalid */
+ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_findDecompressedSize() :
+ *  `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ *  `srcSize` must be the _exact_ size of this series
+ *       (i.e. there should be a frame boundary exactly at `srcSize` bytes after `src`)
+ *  @return : - decompressed size of all data in all successive frames
+ *            - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN
+ *            - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ *   note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode.
+ *            When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 2 : decompressed size is always present when compression is done with ZSTD_compress()
+ *   note 3 : decompressed size can be very large (64-bits value),
+ *            potentially larger than what local system can handle as a single memory segment.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ *            Always ensure result fits within application's authorized limits.
+ *            Each application can set its own limits.
+ *   note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to
+ *            read each contained frame header.  This is fast as most of the data is skipped,
+ *            however it does mean that all frame data must be present and valid. */
+ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize);
+
+/** ZSTD_frameHeaderSize() :
+ *  srcSize must be >= ZSTD_frameHeaderSize_prefix.
+ * @return : size of the Frame Header,
+ *           or an error code (if srcSize is too small) */
+ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
+
+/***************************************
+*  Memory management
+***************************************/
+
+/*! ZSTD_sizeof_*() :
+ *  These functions give the current memory usage of selected object.
+ *  Object memory usage can evolve when re-used. */
+ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
+ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict);
+ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_estimate*() :
+ *  These functions make it possible to estimate memory usage
+ *  of a future {D,C}Ctx, before its creation.
+ *  ZSTD_estimateCCtxSize() will provide a budget large enough for any compression level up to selected one.
+ *  It will also consider src size to be arbitrarily "large", which is worst case.
+ *  If srcSize is known to always be small, ZSTD_estimateCCtxSize_usingCParams() can provide a tighter estimation.
+ *  ZSTD_estimateCCtxSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ *  ZSTD_estimateCCtxSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbWorkers is >= 1.
+ *  Note : CCtx size estimation is only correct for single-threaded compression. */
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void);
+
+/*! ZSTD_estimateCStreamSize() :
+ *  ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
+ *  It will also consider src size to be arbitrarily "large", which is worst case.
+ *  If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
+ *  ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ *  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbWorkers is >= 1.
+ *  Note : CStream size estimation is only correct for single-threaded compression.
+ *  ZSTD_DStream memory budget depends on window Size.
+ *  This information can be passed manually, using ZSTD_estimateDStreamSize,
+ *  or deduced from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+ *  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
+ *         an internal ?Dict will be created, whose additional size is not estimated here.
+ *         In this case, get total size by adding ZSTD_estimate?DictSize */
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
+
+/*! ZSTD_estimate?DictSize() :
+ *  ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
+ *  ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced().
+ *  Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod);
+ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod);
+
+/*! ZSTD_initStatic*() :
+ *  Initialize an object using a pre-allocated fixed-size buffer.
+ *  workspace: The memory area to emplace the object into.
+ *             Provided pointer *must be 8-bytes aligned*.
+ *             Buffer must outlive object.
+ *  workspaceSize: Use ZSTD_estimate*Size() to determine
+ *                 how large workspace must be to support target scenario.
+ * @return : pointer to object (same address as workspace, just different type),
+ *           or NULL if error (size too small, incorrect alignment, etc.)
+ *  Note : zstd will never resize nor malloc() when using a static buffer.
+ *         If the object requires more memory than available,
+ *         zstd will just error out (typically ZSTD_error_memory_allocation).
+ *  Note 2 : there is no corresponding "free" function.
+ *           Since workspace is allocated externally, it must be freed externally too.
+ *  Note 3 : cParams : use ZSTD_getCParams() to convert a compression level
+ *           into its associated cParams.
+ *  Limitation 1 : currently not compatible with internal dictionary creation, triggered by
+ *                 ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict().
+ *  Limitation 2 : static cctx currently not compatible with multi-threading.
+ *  Limitation 3 : static dctx is incompatible with legacy support.
+ */
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticCCtx() */
+
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticDCtx() */
+
+ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType,
+                                        ZSTD_compressionParameters cParams);
+
+ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType);
+
+/*! Custom memory allocation :
+ *  These prototypes make it possible to pass your own allocation/free functions.
+ *  ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
+ *  All allocation/free operations will be completed using these custom variants instead of regular <stdlib.h> ones.
+ */
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void  (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL };  /**< this constant defers to stdlib's functions */
+
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
+                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
+                                                  ZSTD_dictContentType_e dictContentType,
+                                                  ZSTD_compressionParameters cParams,
+                                                  ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
+                                                  ZSTD_dictContentType_e dictContentType,
+                                                  ZSTD_customMem customMem);
+
+
+
+/***************************************
+*  Advanced compression functions
+***************************************/
+
+/*! ZSTD_createCDict_byReference() :
+ *  Create a digested dictionary for compression
+ *  Dictionary content is simply referenced, and therefore stays in dictBuffer.
+ *  It is important that dictBuffer outlives CDict, it must remain read accessible throughout the lifetime of CDict */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);
+
+/*! ZSTD_getCParams() :
+*   @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize.
+*   `estimatedSrcSize` value is optional, select 0 if not known */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_getParams() :
+*   same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`.
+*   All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */
+ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_checkCParams() :
+*   Ensure param values remain within authorized range */
+ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
+
+/*! ZSTD_adjustCParams() :
+ *  optimize params for a given `srcSize` and `dictSize`.
+ *  both values are optional, select `0` if unknown. */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
+
+/*! ZSTD_compress_advanced() :
+*   Same as ZSTD_compress_usingDict(), with fine-tune control over each compression parameter */
+ZSTDLIB_API size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
+                                  void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                            const void* dict,size_t dictSize,
+                                  ZSTD_parameters params);
+
+/*! ZSTD_compress_usingCDict_advanced() :
+*   Same as ZSTD_compress_usingCDict(), with fine-tune control over frame parameters */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+                                  void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                            const ZSTD_CDict* cdict, ZSTD_frameParameters fParams);
+
+
+/*--- Advanced decompression functions ---*/
+
+/*! ZSTD_isFrame() :
+ *  Tells if the content of `buffer` starts with a valid Frame Identifier.
+ *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ *  Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ *  Note 3 : Skippable Frame Identifiers are considered valid. */
+ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size);
+
+/*! ZSTD_createDDict_byReference() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  Dictionary content is referenced, and therefore stays in dictBuffer.
+ *  It is important that dictBuffer outlives DDict,
+ *  it must remain read accessible throughout the lifetime of DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);
+
+
+/*! ZSTD_getDictID_fromDict() :
+ *  Provides the dictID stored within dictionary.
+ *  if @return == 0, the dictionary is not conformant with Zstandard specification.
+ *  It can still be loaded, but as a content-only dictionary. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize);
+
+/*! ZSTD_getDictID_fromDDict() :
+ *  Provides the dictID of the dictionary loaded into `ddict`.
+ *  If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() :
+ *  Provides the dictID required to decompress the frame stored within `src`.
+ *  If @return == 0, the dictID could not be decoded.
+ *  This could be for one of the following reasons :
+ *  - The frame does not require a dictionary to be decoded (most common case).
+ *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is hidden information.
+ *    Note : this use case also happens when using a non-conformant dictionary.
+ *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ *  - This is not a Zstandard frame.
+ *  When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+
+
+/********************************************************************
+*  Advanced streaming functions
+********************************************************************/
+
+/*=====   Advanced Streaming compression functions  =====*/
+ZSTDLIB_API size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize);   /**< pledgedSrcSize must be correct. If it is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, "0" also disables frame content size field. It may be enabled in the future. */
+ZSTDLIB_API size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); /**< creates an internal CDict (incompatible with static CCtx), except if dict == NULL or dictSize < 8, in which case no dict is used. Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy. */
+ZSTDLIB_API size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize,
+                                             ZSTD_parameters params, unsigned long long pledgedSrcSize);  /**< pledgedSrcSize must be correct. If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy. */
+ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);  /**< note : cdict will just be referenced, and must outlive compression session */
+ZSTDLIB_API size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, unsigned long long pledgedSrcSize);  /**< same as ZSTD_initCStream_usingCDict(), with control over frame parameters. pledgedSrcSize must be correct. If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. */
+
+/*! ZSTD_resetCStream() :
+ *  start a new compression job, using same parameters from previous job.
+ *  This is typically useful to skip dictionary loading stage, since it will re-use it in-place.
+ *  Note that zcs must be init at least once before using ZSTD_resetCStream().
+ *  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
+ *  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
+ *  For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs,
+ *  but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
+
+
+typedef struct {
+    unsigned long long ingested;   /* amount of data read from input, for current frame */
+    unsigned long long consumed;   /* amount of input actually compressed, for current frame */
+    unsigned long long produced;   /* amount of output generated, for current frame */
+} ZSTD_frameProgression;
+
+/*! ZSTD_getFrameProgression() :
+ *  tells how much data has been ingested (read from input),
+ *  consumed (input actually compressed) and produced (output) for current frame.
+ *  Therefore, (ingested - consumed) is amount of input data buffered internally, not yet compressed.
+ *  Can report progression inside worker threads (multi-threading and non-blocking mode).
+ *  Declared with ZSTDLIB_API, like every other public prototype of this header, so the symbol is exported. */
+ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx);
+
+
+
+/*=====   Advanced Streaming decompression functions  =====*/
+typedef enum { DStream_p_maxWindowSize } ZSTD_DStreamParameter_e;
+ZSTDLIB_API size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds, ZSTD_DStreamParameter_e paramType, unsigned paramValue);   /* obsolete : this API will be removed in a future version */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /**< note: no dictionary will be used if dict == NULL or dictSize < 8 */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);  /**< note : ddict is referenced, it must outlive decompression session */
+ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);  /**< re-use decompression parameters from previous init; saves dictionary loading */
+
+
+/*********************************************************************
+*  Buffer-less and synchronous inner streaming functions
+*
+*  This is an advanced API, giving full control over buffer management, for users who need direct control over memory.
+*  But it's also a complex one, with several restrictions, documented below.
+*  Prefer normal streaming API for an easier experience.
+********************************************************************* */
+
+/**
+  Buffer-less streaming compression (synchronous mode)
+
+  A ZSTD_CCtx object is required to track streaming operations.
+  Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
+  ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+
+  Start by initializing a context.
+  Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression,
+  or ZSTD_compressBegin_advanced(), for finer parameter control.
+  It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()
+
+  Then, consume your input using ZSTD_compressContinue().
+  There are some important considerations to keep in mind when using this advanced function :
+  - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only.
+  - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks.
+  - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario.
+    Worst case evaluation is provided by ZSTD_compressBound().
+    ZSTD_compressContinue() doesn't guarantee recovery after a failed compression.
+  - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog).
+    It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consist of multiple contiguous blocks)
+  - ZSTD_compressContinue() detects that prior input has been overwritten when `src` buffer overlaps.
+    In which case, it will "discard" the relevant memory section from its history.
+
+  Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum.
+  It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
+  Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
+
+  `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
+*/
+
+/*=====   Buffer-less streaming compression functions  =====*/
+ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize);   /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**<  note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-
+  Buffer-less streaming decompression (synchronous mode)
+
+  A ZSTD_DCtx object is required to track streaming operations.
+  Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+  A ZSTD_DCtx object can be re-used multiple times.
+
+  First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
+  Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
+  Data fragment must be large enough to ensure successful decoding.
+ `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
+  @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+           >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+           errorCode, which can be tested using ZSTD_isError().
+
+  It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
+  such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`).
+  Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information.
+  As a consequence, check that values remain within valid application range.
+  For example, do not allocate memory blindly, check that `windowSize` is within expectation.
+  Each application can set its own limits, depending on local restrictions.
+  For extended interoperability, it is recommended to support `windowSize` of at least 8 MB.
+
+  ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes.
+  ZSTD_decompressContinue() is very sensitive to contiguity,
+  if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place,
+  or that previous contiguous segment is large enough to properly handle maximum back-reference distance.
+  There are multiple ways to guarantee this condition.
+
+  The most memory efficient way is to use a round buffer of sufficient size.
+  Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
+  which can @return an error code if required value is too large for current system (in 32-bits mode).
+  In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
+  up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
+  whose maximum size is provided in the `ZSTD_frameHeader` structure, field `blockSizeMax`.
+  At which point, decoding can resume from the beginning of the buffer.
+  Note that already decoded data stored in the buffer should be flushed before being overwritten.
+
+  There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory.
+
+  Finally, if you control the compression process, you can also ignore all buffer size rules,
+  as long as the encoder and decoder progress in "lock-step",
+  aka use exactly the same buffer sizes, break contiguity at the same place, etc.
+
+  Once buffers are setup, start decompression, with ZSTD_decompressBegin().
+  If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict().
+
+  Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately.
+  ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
+
+ @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+  It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
+  It can also be an error code, which can be tested with ZSTD_isError().
+
+  A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+
+  Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType().
+  This information is not required to properly decode a frame.
+
+  == Special case : skippable frames ==
+
+  Skippable frames allow integration of user-defined data into a flow of concatenated frames.
+  Skippable frames will be ignored (skipped) by decompressor.
+  The format of skippable frames is as follows :
+  a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F
+  b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
+  c) Frame Content - any content (User Data) of length equal to Frame Size
+  For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame.
+  For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content.
+*/
+
+/*=====   Buffer-less streaming decompression functions  =====*/
+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;   /* frame kind, reported in ZSTD_frameHeader.frameType */
+typedef struct {
+    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
+    unsigned long long windowSize;       /* maximum back-reference distance; can be very large, up to frameContentSize */
+    unsigned blockSizeMax;               /* maximum size of a single block within this frame */
+    ZSTD_frameType_e frameType;          /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+    unsigned headerSize;
+    unsigned dictID;                     /* dictionary ID */
+    unsigned checksumFlag;
+} ZSTD_frameHeader;
+/** ZSTD_getFrameHeader() :
+ *  decode Frame Header, or requires larger `srcSize`.
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ *           or an error code, which can be tested using ZSTD_isError() */
+ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /**< doesn't consume input */
+ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize);  /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+/* misc */
+ZSTDLIB_API void   ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
+typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
+ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
+
+
+
+/* ============================================ */
+/**       New advanced API (experimental)       */
+/* ============================================ */
+
+/* API design :
+ *   In this advanced API, parameters are pushed one by one into an existing context,
+ *   using ZSTD_CCtx_set*() functions.
+ *   Pushed parameters are sticky : they are applied to next job, and any subsequent job.
+ *   It's possible to reset parameters to "default" using ZSTD_CCtx_reset().
+ *   Important : "sticky" parameters only work with `ZSTD_compress_generic()` !
+ *               For any other entry point, "sticky" parameters are ignored !
+ *
+ *   This API is intended to replace all others advanced / experimental API entry points.
+ */
+
+/* note on enum design :
+ * All enum will be pinned to explicit values before reaching "stable API" status */
+
+typedef enum {
+    /* Open question : should we have a format ZSTD_f_auto ?
+     * Today, it would mean exactly the same as ZSTD_f_zstd1.
+     * But, in the future, should several formats become supported,
+     * on the compression side, it would mean "default format".
+     * On the decompression side, it would mean "automatic format detection",
+     * so that ZSTD_f_zstd1 would mean "accept *only* zstd frames".
+     * Since the meaning is a little different, another option could be to define different enums for compression and decompression.
+     * This question could be kept for later, when there are actually multiple formats to support,
+     * but there is also the question of pinning enum values, and pinning value `0` is especially important */
+    ZSTD_f_zstd1 = 0,        /* zstd frame format, specified in zstd_compression_format.md (default) */
+    ZSTD_f_zstd1_magicless,  /* Variant of zstd frame format, without initial 4-bytes magic number.
+                              * Useful to save 4 bytes per generated frame.
+                              * Decoder cannot automatically recognise this format : it requires explicit instructions. */
+} ZSTD_format_e;
+
+typedef enum {
+    /* compression format */
+    ZSTD_p_format = 10,      /* See ZSTD_format_e enum definition.
+                              * Cast selected format as unsigned for ZSTD_CCtx_setParameter() compatibility. */
+
+    /* compression parameters */
+    ZSTD_p_compressionLevel=100, /* Update all compression parameters according to pre-defined cLevel table
+                              * Default level is ZSTD_CLEVEL_DEFAULT==3.
+                              * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT.
+                              * Note 1 : it's possible to pass a negative compression level by casting it to unsigned type.
+                              * Note 2 : setting a level sets all default values of other compression parameters.
+                              * Note 3 : setting compressionLevel automatically updates ZSTD_p_compressLiterals. */
+    ZSTD_p_windowLog,        /* Maximum allowed back-reference distance, expressed as power of 2.
+                              * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX.
+                              * Special: value 0 means "use default windowLog".
+                              * Note: Using a window size greater than ZSTD_MAXWINDOWSIZE_DEFAULT (default: 2^27)
+                              *       requires explicitly allowing such window size during decompression stage. */
+    ZSTD_p_hashLog,          /* Size of the initial probe table, as a power of 2.
+                              * Resulting table size is (1 << (hashLog+2)).
+                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX.
+                              * Larger tables improve compression ratio of strategies <= dFast,
+                              * and improve speed of strategies > dFast.
+                              * Special: value 0 means "use default hashLog". */
+    ZSTD_p_chainLog,         /* Size of the multi-probe search table, as a power of 2.
+                              * Resulting table size is (1 << (chainLog+2)).
+                              * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX.
+                              * Larger tables result in better and slower compression.
+                              * This parameter is useless when using "fast" strategy.
+                              * Note it's still useful when using "dfast" strategy,
+                              * in which case it defines a secondary probe table.
+                              * Special: value 0 means "use default chainLog". */
+    ZSTD_p_searchLog,        /* Number of search attempts, as a power of 2.
+                              * More attempts result in better and slower compression.
+                              * This parameter is useless when using "fast" and "dFast" strategies.
+                              * Special: value 0 means "use default searchLog". */
+    ZSTD_p_minMatch,         /* Minimum size of searched matches (note : repCode matches can be smaller).
+                              * Larger values make faster compression and decompression, but decrease ratio.
+                              * Must be clamped between ZSTD_SEARCHLENGTH_MIN and ZSTD_SEARCHLENGTH_MAX.
+                              * Note that currently, for all strategies < btopt, effective minimum is 4.
+                              *                    , for all strategies > fast, effective maximum is 6.
+                              * Special: value 0 means "use default minMatchLength". */
+    ZSTD_p_targetLength,     /* Impact of this field depends on strategy.
+                              * For strategies btopt & btultra:
+                              *     Length of Match considered "good enough" to stop search.
+                              *     Larger values make compression stronger, and slower.
+                              * For strategy fast:
+                              *     Distance between match sampling.
+                              *     Larger values make compression faster, and weaker.
+                              * Special: value 0 means "use default targetLength". */
+    ZSTD_p_compressionStrategy, /* See ZSTD_strategy enum definition.
+                              * Cast selected strategy as unsigned for ZSTD_CCtx_setParameter() compatibility.
+                              * The higher the value of selected strategy, the more complex it is,
+                              * resulting in stronger and slower compression.
+                              * Special: value 0 means "use default strategy". */
+
+    ZSTD_p_enableLongDistanceMatching=160, /* Enable long distance matching.
+                                         * This parameter is designed to improve compression ratio
+                                         * for large inputs, by finding large matches at long distance.
+                                         * It increases memory usage and window size.
+                                         * Note: enabling this parameter increases ZSTD_p_windowLog to 128 MB
+                                         * except when expressly set to a different value. */
+    ZSTD_p_ldmHashLog,       /* Size of the table for long distance matching, as a power of 2.
+                              * Larger values increase memory usage and compression ratio,
+                              * but decrease compression speed.
+                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX.
+                              * Default: windowLog - 7.
+                              * Special: value 0 means "automatically determine hashlog". */
+    ZSTD_p_ldmMinMatch,      /* Minimum match size for long distance matcher.
+                              * Larger/too small values usually decrease compression ratio.
+                              * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX.
+                              * Special: value 0 means "use default value" (default: 64). */
+    ZSTD_p_ldmBucketSizeLog, /* Log size of each bucket in the LDM hash table for collision resolution.
+                              * Larger values improve collision resolution but decrease compression speed.
+                              * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX.
+                              * Special: value 0 means "use default value" (default: 3). */
+    ZSTD_p_ldmHashEveryLog,  /* Frequency of inserting/looking up entries in the LDM hash table.
+                              * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN).
+                              * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage.
+                              * Larger values improve compression speed.
+                              * Deviating far from default value will likely result in a compression ratio decrease.
+                              * Special: value 0 means "automatically determine hashEveryLog". */
+
+    /* frame parameters */
+    ZSTD_p_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1)
+                              * Content size must be known at the beginning of compression,
+                              * it is provided using ZSTD_CCtx_setPledgedSrcSize() */
+    ZSTD_p_checksumFlag,     /* A 32-bits checksum of content is written at end of frame (default:0) */
+    ZSTD_p_dictIDFlag,       /* When applicable, dictionary's ID is written into frame header (default:1) */
+
+    /* multi-threading parameters */
+    /* These parameters are only useful if multi-threading is enabled (ZSTD_MULTITHREAD).
+     * They return an error otherwise. */
+    ZSTD_p_nbWorkers=400,    /* Select how many threads will be spawned to compress in parallel.
+                              * When nbWorkers >= 1, triggers asynchronous mode :
+                              * ZSTD_compress_generic() consumes some input, flushes some output if possible, and immediately gives back control to caller,
+                              * while compression work is performed in parallel, within worker threads.
+                              * (note : a strong exception to this rule is when first invocation sets ZSTD_e_end : it becomes a blocking call).
+                              * More workers improve speed, but also increase memory usage.
+                              * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking. */
+    ZSTD_p_jobSize,          /* Size of a compression job. This value is enforced only in non-blocking mode.
+                              * Each compression job is completed in parallel, so this value indirectly controls the nb of active threads.
+                              * 0 means default, which is dynamically determined based on compression parameters.
+                              * Job size must be a minimum of overlapSize, or 1 MB, whichever is larger.
+                              * The minimum size is automatically and transparently enforced. */
+    ZSTD_p_overlapSizeLog,   /* Size of previous input reloaded at the beginning of each job.
+                              * 0 => no overlap, 6(default) => use 1/8th of windowSize, >=9 => use full windowSize */
+
+    /* =================================================================== */
+    /* experimental parameters - no stability guaranteed                   */
+    /* =================================================================== */
+
+    ZSTD_p_forceMaxWindow=1100, /* Force back-reference distances to remain < windowSize,
+                              * even when referencing into Dictionary content (default:0) */
+    ZSTD_p_forceAttachDict,  /* ZSTD supports usage of a CDict in-place
+                              * (avoiding having to copy the compression tables
+                              * from the CDict into the working context). Using
+                              * a CDict in this way saves an initial setup step,
+                              * but comes at the cost of more work per byte of
+                              * input. ZSTD has a simple internal heuristic that
+                              * guesses which strategy will be faster. You can
+                              * use this flag to override that guess.
+                              *
+                              * Note that the by-reference, in-place strategy is
+                              * only used when reusing a compression context
+                              * with compatible compression parameters. (If
+                              * incompatible / uninitialized, the working
+                              * context needs to be cleared anyway, which is
+                              * about as expensive as overwriting it with the
+                              * dictionary context, so there's no savings in
+                              * using the CDict by-ref.)
+                              *
+                              * Values greater than 0 force attaching the dict.
+                              * Values less than 0 force copying the dict.
+                              * 0 selects the default heuristic-guided behavior.
+                              */
+
+} ZSTD_cParameter;
+
+
+/*! ZSTD_CCtx_setParameter() :
+ *  Set one compression parameter, selected by enum ZSTD_cParameter.
+ *  Setting a parameter is generally only possible during frame initialization (before starting compression).
+ *  Exception : when using multi-threading mode (nbWorkers >= 1),
+ *              the following parameters can be updated _during_ compression (within same frame):
+ *              => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy.
+ *              New parameters will be active on next job, or after a flush().
+ *  Note : when `value` type is not unsigned (int, or enum), cast it to unsigned for proper type checking.
+ *  @result : informational value (typically, value being set, correctly clamped),
+ *            or an error code (which can be tested with ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned value);
+
+/*! ZSTD_CCtx_getParameter() :
+ * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned* value);
+
+/*! ZSTD_CCtx_setPledgedSrcSize() :
+ *  Total input data size to be compressed as a single frame.
+ *  This value will be checked at the end, and result in an error if not respected.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : 0 means zero, empty.
+ *           In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN.
+ *           ZSTD_CONTENTSIZE_UNKNOWN is the default value for any new compression job.
+ *  Note 2 : If all data is provided and consumed in a single round,
+ *           this value is overridden by srcSize instead. */
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
+
+/*! ZSTD_CCtx_loadDictionary() :
+ *  Create an internal CDict from `dict` buffer.
+ *  Decompression will have to use the same dictionary.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special: Adding a NULL (or 0-size) dictionary invalidates previous dictionary,
+ *           meaning "return to no-dictionary mode".
+ *  Note 1 : Dictionary will be used for all future compression jobs.
+ *           To return to "no-dictionary" situation, load a NULL dictionary.
+ *  Note 2 : Loading a dictionary involves building tables, which are dependent on compression parameters.
+ *           For this reason, compression parameters cannot be changed anymore after loading a dictionary.
+ *           It's also a CPU consuming operation, with non-negligible impact on latency.
+ *  Note 3 : `dict` content will be copied internally.
+ *           Use ZSTD_CCtx_loadDictionary_byReference() to reference dictionary content instead.
+ *           In such a case, dictionary buffer must outlive its users.
+ *  Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+ *           to precisely select how dictionary content must be interpreted. */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+
+/*! ZSTD_CCtx_refCDict() :
+ *  Reference a prepared dictionary, to be used for all next compression jobs.
+ *  Note that compression parameters are enforced from within CDict,
+ *  and supersede any compression parameter previously set within CCtx.
+ *  The dictionary will remain valid for future compression jobs using same CCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : adding a NULL CDict means "return to no-dictionary mode".
+ *  Note 1 : Currently, only one dictionary can be managed.
+ *           Adding a new dictionary effectively "discards" any previous one.
+ *  Note 2 : CDict is just referenced, its lifetime must outlive CCtx. */
+ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+/*! ZSTD_CCtx_refPrefix() :
+ *  Reference a prefix (single-usage dictionary) for next compression job.
+ *  Decompression needs the same prefix to properly regenerate data.
+ *  Prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end).
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary.
+ *  Note 1 : Prefix buffer is referenced. It **must** outlive compression job.
+ *           Its content must remain unmodified up to end of compression (ZSTD_e_end).
+ *  Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ *           It's a CPU consuming operation, with non-negligible impact on latency.
+ *           If there is a need to use same prefix multiple times, consider loadDictionary instead.
+ *  Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
+ *           Use ZSTD_CCtx_refPrefix_advanced() to alter dictContentType. */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
+                                       const void* prefix, size_t prefixSize);
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx,
+                                       const void* prefix, size_t prefixSize,
+                                       ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_CCtx_reset() :
+ *  Return a CCtx to clean state.
+ *  Useful after an error, or to interrupt an ongoing compression job and start a new one.
+ *  Any internal data not yet flushed is cancelled.
+ *  The parameters and dictionary are kept unchanged, to reset them use ZSTD_CCtx_resetParameters().
+ */
+ZSTDLIB_API void ZSTD_CCtx_reset(ZSTD_CCtx* cctx);
+
+/*! ZSTD_CCtx_resetParameters() :
+ *  All parameters are back to default values (compression level is ZSTD_CLEVEL_DEFAULT).
+ *  Dictionary (if any) is dropped.
+ *  Resetting parameters is only possible during frame initialization (before starting compression).
+ *  To reset the context use ZSTD_CCtx_reset().
+ *  @return 0 or an error code (which can be checked with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_resetParameters(ZSTD_CCtx* cctx);
+
+
+
+typedef enum {
+    ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal conditions */
+    ZSTD_e_flush,      /* flush any data provided so far - frame will continue, future data can still reference previous data for better compression */
+    ZSTD_e_end         /* flush any remaining data and close current frame. Any additional data starts a new frame. */
+} ZSTD_EndDirective;
+
+/*! ZSTD_compress_generic() :
+ *  Behave about the same as ZSTD_compressStream. To note :
+ *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_setParameter()
+ *  - Compression parameters cannot be changed once compression is started.
+ *  - output->pos must be <= dstCapacity, input->pos must be <= srcSize
+ *  - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit.
+ *  - In single-thread mode (default), function is blocking : it completes its job before returning to caller.
+ *  - In multi-thread mode, function is non-blocking : it just acquires a copy of input, and distributes jobs to internal worker threads,
+ *                                                     and then immediately returns, just indicating that there is some data remaining to be flushed.
+ *                                                     The function nonetheless guarantees forward progress : it will return only after it reads or writes at least 1 byte.
+ *  - Exception : in multi-threading mode, if the first call requests a ZSTD_e_end directive, it is blocking : it will complete compression before giving back control to caller.
+ *  - @return provides a minimum amount of data remaining to be flushed from internal buffers
+ *            or an error code, which can be tested using ZSTD_isError().
+ *            if @return != 0, flush is not fully completed, there is still some data left within internal buffers.
+ *            This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers.
+ *            For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed.
+ *  - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0),
+ *            only ZSTD_e_end or ZSTD_e_flush operations are allowed.
+ *            Before starting a new compression job, or changing compression parameters,
+ *            it is required to fully flush internal buffers.
+ */
+ZSTDLIB_API size_t ZSTD_compress_generic (ZSTD_CCtx* cctx,
+                                          ZSTD_outBuffer* output,
+                                          ZSTD_inBuffer* input,
+                                          ZSTD_EndDirective endOp);
+
+
+/*! ZSTD_compress_generic_simpleArgs() :
+ *  Same as ZSTD_compress_generic(),
+ *  but using only integral types as arguments.
+ *  Argument list is larger than ZSTD_{in,out}Buffer,
+ *  but can be helpful for binders from dynamic languages
+ *  which have troubles handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_compress_generic_simpleArgs (
+                            ZSTD_CCtx* cctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos,
+                            ZSTD_EndDirective endOp);
+
+
+/*! ZSTD_CCtx_params :
+ *  Quick howto :
+ *  - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
+ *  - ZSTD_CCtxParam_setParameter() : Push parameters one by one into
+ *                                    an existing ZSTD_CCtx_params structure.
+ *                                    This is similar to
+ *                                    ZSTD_CCtx_setParameter().
+ *  - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to
+ *                                    an existing CCtx.
+ *                                    These parameters will be applied to
+ *                                    all subsequent compression jobs.
+ *  - ZSTD_compress_generic() : Do compression using the CCtx.
+ *  - ZSTD_freeCCtxParams() : Free the memory.
+ *
+ *  This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams()
+ *  for static allocation for single-threaded compression.
+ */
+ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void);
+ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);
+
+
+/*! ZSTD_CCtxParams_reset() :
+ *  Reset params to default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params);
+
+/*! ZSTD_CCtxParams_init() :
+ *  Initializes the compression parameters of cctxParams according to
+ *  compression level. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel);
+
+/*! ZSTD_CCtxParams_init_advanced() :
+ *  Initializes the compression and frame parameters of cctxParams according to
+ *  params. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);
+
+
+/*! ZSTD_CCtxParam_setParameter() :
+ *  Similar to ZSTD_CCtx_setParameter.
+ *  Set one compression parameter, selected by enum ZSTD_cParameter.
+ *  Parameters must be applied to a ZSTD_CCtx using ZSTD_CCtx_setParametersUsingCCtxParams().
+ *  Note : when `value` is an enum, cast it to unsigned for proper type checking.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParam_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, unsigned value);
+
+/*! ZSTD_CCtxParam_getParameter() :
+ * Similar to ZSTD_CCtx_getParameter.
+ * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParam_getParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, unsigned* value);
+
+/*! ZSTD_CCtx_setParametersUsingCCtxParams() :
+ *  Apply a set of ZSTD_CCtx_params to the compression context.
+ *  This can be done even after compression is started,
+ *    if nbWorkers==0, this will have no impact until a new compression is started.
+ *    if nbWorkers>=1, new parameters will be picked up at next job,
+ *       with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+        ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
+
+
+/* ==================================== */
+/*===   Advanced decompression API   ===*/
+/* ==================================== */
+
+/* The following API works the same way as the advanced compression API :
+ * a context is created, parameters are pushed into it one by one,
+ * then the context can be used to decompress data using an interface similar to the streaming API.
+ */
+
+/*! ZSTD_DCtx_loadDictionary() :
+ *  Create an internal DDict from dict buffer,
+ *  to be used to decompress next frames.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
+ *            meaning "return to no-dictionary mode".
+ *  Note 1 : `dict` content will be copied internally.
+ *            Use ZSTD_DCtx_loadDictionary_byReference()
+ *            to reference dictionary content instead.
+ *            In which case, the dictionary buffer must outlive its users.
+ *  Note 2 : Loading a dictionary involves building tables,
+ *           which has a non-negligible impact on CPU usage and latency.
+ *  Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to select
+ *           how dictionary content will be interpreted and loaded.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+
+/*! ZSTD_DCtx_refDDict() :
+ *  Reference a prepared dictionary, to be used to decompress next frames.
+ *  The dictionary remains active for decompression of future frames using same DCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : Currently, only one dictionary can be managed.
+ *           Referencing a new dictionary effectively "discards" any previous one.
+ *  Special : adding a NULL DDict means "return to no-dictionary mode".
+ *  Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+
+/*! ZSTD_DCtx_refPrefix() :
+ *  Reference a prefix (single-usage dictionary) for next decompression job.
+ *  Prefix is **only used once**. Reference is discarded at end of frame.
+ *  End of frame is reached when ZSTD_decompress_generic() returns 0.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary
+ *  Note 2 : Prefix buffer is referenced. It **must** outlive decompression job.
+ *           Prefix buffer must remain unmodified up to the end of frame,
+ *           reached when ZSTD_decompress_generic() returns 0.
+ *  Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
+ *           Use ZSTD_DCtx_refPrefix_advanced() to alter dictContentType.
+ *  Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost.
+ *           A fulldict prefix is more costly though.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx,
+                                    const void* prefix, size_t prefixSize);
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx,
+                                    const void* prefix, size_t prefixSize,
+                                    ZSTD_dictContentType_e dictContentType);
+
+
+/*! ZSTD_DCtx_setMaxWindowSize() :
+ *  Refuses allocating internal buffers for frames requiring a window size larger than provided limit.
+ *  This is useful to prevent a decoder context from reserving too much memory for itself (potential attack scenario).
+ *  This parameter is only useful in streaming mode, since no internal buffer is allocated in direct mode.
+ *  By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_MAX)
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize);
+
+
+/*! ZSTD_DCtx_setFormat() :
+ *  Instruct the decoder context about what kind of data to decode next.
+ *  This instruction is mandatory to decode data without a fully-formed header,
+ *  such as ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
+
+
+/** ZSTD_getFrameHeader_advanced() :
+ *  same as ZSTD_getFrameHeader(),
+ *  with added capability to select a format (like ZSTD_f_zstd1_magicless) */
+ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr,
+                        const void* src, size_t srcSize, ZSTD_format_e format);
+
+
+/*! ZSTD_decompress_generic() :
+ *  Behave the same as ZSTD_decompressStream.
+ *  Decompression parameters cannot be changed once decompression is started.
+ * @return : an error code, which can be tested using ZSTD_isError()
+ *           if >0, a hint, nb of expected input bytes for next invocation.
+ *           `0` means : a frame has just been fully decoded and flushed.
+ */
+ZSTDLIB_API size_t ZSTD_decompress_generic(ZSTD_DCtx* dctx,
+                                           ZSTD_outBuffer* output,
+                                           ZSTD_inBuffer* input);
+
+
+/*! ZSTD_decompress_generic_simpleArgs() :
+ *  Same as ZSTD_decompress_generic(),
+ *  but using only integral types as arguments.
+ *  Argument list is larger than ZSTD_{in,out}Buffer,
+ *  but can be helpful for binders from dynamic languages
+ *  which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_decompress_generic_simpleArgs (
+                            ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos);
+
+
+/*! ZSTD_DCtx_reset() :
+ *  Return a DCtx to clean state.
+ *  If a decompression was ongoing, any internal data not yet flushed is cancelled.
+ *  All parameters are back to default values, including sticky ones.
+ *  Dictionary (if any) is dropped.
+ *  Parameters can be modified again after a reset.
+ */
+ZSTDLIB_API void ZSTD_DCtx_reset(ZSTD_DCtx* dctx);
+
+
+
+/* ============================ */
+/**       Block level API       */
+/* ============================ */
+
+/*!
+    Block functions produce and decode raw zstd blocks, without frame metadata.
+    Frame metadata cost is typically ~18 bytes, which can be non-negligible for very small blocks (< 100 bytes).
+    The user is responsible for keeping the information required to regenerate data, such as compressed and content sizes.
+
+    A few rules to respect :
+    - Compressing and decompressing require a context structure
+      + Use ZSTD_createCCtx() and ZSTD_createDCtx()
+    - It is necessary to init context before starting
+      + compression : any ZSTD_compressBegin*() variant, including with dictionary
+      + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
+      + copyCCtx() and copyDCtx() can be used too
+    - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
+      + If input is larger than a block size, it's necessary to split input data into multiple blocks
+      + For inputs larger than a single block size, consider using the regular ZSTD_compress() instead.
+        Frame metadata is not that costly, and quickly becomes negligible as source size grows larger.
+    - When a block is considered not compressible enough, ZSTD_compressBlock() result will be zero.
+      In which case, nothing is produced into `dst`.
+      + User must test for such outcome and deal directly with uncompressed data
+      + ZSTD_decompressBlock() doesn't accept uncompressed data as input !!!
+      + In case of multiple successive blocks, should some of them be uncompressed,
+        decoder must be informed of their existence in order to follow proper history.
+        Use ZSTD_insertBlock() for such a case.
+*/
+
+#define ZSTD_BLOCKSIZELOG_MAX 17
+#define ZSTD_BLOCKSIZE_MAX   (1<<ZSTD_BLOCKSIZELOG_MAX)   /* define, for static allocation */
+/*=====   Raw zstd block functions  =====*/
+ZSTDLIB_API size_t ZSTD_getBlockSize   (const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_insertBlock    (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
+
+
+#endif   /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt
index fb43751b9e8fd715d538abb1198e1bdfd0a2e9ae..e4c276de0251b31b590bebdd82e973b099068a97 100644
--- a/src/client/CMakeLists.txt
+++ b/src/client/CMakeLists.txt
@@ -5,13 +5,14 @@ INCLUDE_DIRECTORIES(inc)
 INCLUDE_DIRECTORIES(jni)
 INCLUDE_DIRECTORIES(${TD_COMMUNITY_DIR}/src/query/inc)
 AUX_SOURCE_DIRECTORY(src SRC)
+link_directories(/share/lib)
 
 IF (TD_LINUX)
   INCLUDE_DIRECTORIES(${TD_COMMUNITY_DIR}/deps/jni/linux)
 
   # set the static lib name
   ADD_LIBRARY(taos_static STATIC ${SRC})
-  TARGET_LINK_LIBRARIES(taos_static common query trpc tutil pthread m rt)
+  TARGET_LINK_LIBRARIES(taos_static common query trpc tutil pthread m rt SZ)
   SET_TARGET_PROPERTIES(taos_static PROPERTIES OUTPUT_NAME "taos_static")
   SET_TARGET_PROPERTIES(taos_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
 
diff --git a/src/common/src/ttypes.c b/src/common/src/ttypes.c
index 34dda32401b604450b3179cf0f344f53b6e2cb51..fd6403c518224d66a41821bf5a6e229719f01940 100644
--- a/src/common/src/ttypes.c
+++ b/src/common/src/ttypes.c
@@ -374,8 +374,8 @@ tDataTypeDescriptor tDataTypes[15] = {
   {TSDB_DATA_TYPE_SMALLINT,  8,  SHORT_BYTES,  "SMALLINT",           tsCompressSmallint,  tsDecompressSmallint,  getStatics_i16},
   {TSDB_DATA_TYPE_INT,       3,  INT_BYTES,    "INT",                tsCompressInt,       tsDecompressInt,       getStatics_i32},
   {TSDB_DATA_TYPE_BIGINT,    6,  LONG_BYTES,   "BIGINT",             tsCompressBigint,    tsDecompressBigint,    getStatics_i64},
-  {TSDB_DATA_TYPE_FLOAT,     5,  FLOAT_BYTES,  "FLOAT",              tsCompressFloat,     tsDecompressFloat,     getStatics_f},
-  {TSDB_DATA_TYPE_DOUBLE,    6,  DOUBLE_BYTES, "DOUBLE",             tsCompressDouble,    tsDecompressDouble,    getStatics_d},
+  {TSDB_DATA_TYPE_FLOAT,     5,  FLOAT_BYTES,  "FLOAT",              tsCompressFloatLossy,  tsDecompressFloatLossy,     getStatics_f},
+  {TSDB_DATA_TYPE_DOUBLE,    6,  DOUBLE_BYTES, "DOUBLE",             tsCompressDoubleLossy, tsDecompressDoubleLossy,    getStatics_d},
   {TSDB_DATA_TYPE_BINARY,    6,  0,      "BINARY",             tsCompressString,    tsDecompressString,    getStatics_bin},
   {TSDB_DATA_TYPE_TIMESTAMP, 9,  LONG_BYTES,   "TIMESTAMP",          tsCompressTimestamp, tsDecompressTimestamp, getStatics_i64},
   {TSDB_DATA_TYPE_NCHAR,     5,  8,      "NCHAR",              tsCompressString,    tsDecompressString,    getStatics_nchr},
@@ -383,6 +383,7 @@ tDataTypeDescriptor tDataTypes[15] = {
   {TSDB_DATA_TYPE_USMALLINT, 17, SHORT_BYTES,  "SMALLINT UNSIGNED",  tsCompressSmallint,  tsDecompressSmallint,  getStatics_u16},
   {TSDB_DATA_TYPE_UINT,      12, INT_BYTES,    "INT UNSIGNED",       tsCompressInt,       tsDecompressInt,       getStatics_u32},
   {TSDB_DATA_TYPE_UBIGINT,   15, LONG_BYTES,   "BIGINT UNSIGNED",    tsCompressBigint,    tsDecompressBigint,    getStatics_u64},
+
 };
 
 char tTokenTypeSwitcher[13] = {
diff --git a/src/kit/taospack/CMakeLists.txt b/src/kit/taospack/CMakeLists.txt
index e03c00fabfbb6cb5bd3507c1d2ba0c249a867c2d..dc3d436a71ed5889b9326f5d9b835398dc97b960 100644
--- a/src/kit/taospack/CMakeLists.txt
+++ b/src/kit/taospack/CMakeLists.txt
@@ -8,7 +8,7 @@ INCLUDE_DIRECTORIES(inc)
 IF (TD_LINUX)
   AUX_SOURCE_DIRECTORY(. SRC)
   ADD_EXECUTABLE(taospack ${SRC})
-  TARGET_LINK_LIBRARIES(taospack os tutil)
+  TARGET_LINK_LIBRARIES(taospack os tutil tsdb SZ)
 ELSEIF (TD_WINDOWS)
   AUX_SOURCE_DIRECTORY(. SRC)
   ADD_EXECUTABLE(taospack ${SRC})
diff --git a/src/tsdb/src/tsdbMain.c b/src/tsdb/src/tsdbMain.c
index d44f8ec74874fa990992fc008671878088d872f0..32748a5a3798f84ba495cd675112c5111750c595 100644
--- a/src/tsdb/src/tsdbMain.c
+++ b/src/tsdb/src/tsdbMain.c
@@ -15,6 +15,7 @@
 
 // no test file errors here
 #include "tsdbint.h"
+#include "tscompression.h"
 
 #define IS_VALID_PRECISION(precision) \
   (((precision) >= TSDB_TIME_PRECISION_MILLI) && ((precision) <= TSDB_TIME_PRECISION_NANO))
@@ -66,6 +67,9 @@ STsdbRepo *tsdbOpenRepo(STsdbCfg *pCfg, STsdbAppH *pAppH) {
 
   terrno = TSDB_CODE_SUCCESS;
 
+  // Compress Init
+  tsCompressInit();
+
   // Check and set default configurations
   if (tsdbCheckAndSetDefaultCfg(&config) < 0) {
     tsdbError("vgId:%d failed to open TSDB repository since %s", config.tsdbId, tstrerror(terrno));
@@ -139,6 +143,9 @@ int tsdbCloseRepo(STsdbRepo *repo, int toCommit) {
   tsdbFreeRepo(pRepo);
   tsdbDebug("vgId:%d repository is closed", vgId);
 
+  // compress exit
+  tsCompressExit();
+
   if (terrno != TSDB_CODE_SUCCESS) {
     return -1;
   } else {
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
index e8a1d61ee52c6461e88f6cdc16069b2b6b523ab5..fd9dd7dd26164adc9b867e1200ac7389eca8b652 100644
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -6,7 +6,9 @@ INCLUDE_DIRECTORIES(${TD_COMMUNITY_DIR}/src/sync/inc)
 INCLUDE_DIRECTORIES(${TD_COMMUNITY_DIR}/deps/rmonotonic/inc)
 AUX_SOURCE_DIRECTORY(src SRC)
 ADD_LIBRARY(tutil ${SRC})
-TARGET_LINK_LIBRARIES(tutil pthread os lz4 z rmonotonic)
+TARGET_LINK_LIBRARIES(tutil pthread os lz4 z rmonotonic SZ )
+
+
 
 IF (TD_LINUX)
   TARGET_LINK_LIBRARIES(tutil m rt)
diff --git a/src/util/inc/tscompression.h b/src/util/inc/tscompression.h
index cca6d6e25054e7736ea28590fa4cb5ab788d6b07..54a11e7f21c9b663052b395d1a1106c800718ce3 100644
--- a/src/util/inc/tscompression.h
+++ b/src/util/inc/tscompression.h
@@ -47,6 +47,9 @@ extern int tsDecompressDoubleImp(const char *const input, const int nelements, c
 extern int tsCompressFloatImp(const char *const input, const int nelements, char *const output);
 extern int tsDecompressFloatImp(const char *const input, const int nelements, char *const output);
 
+bool tsCompressInit();
+void tsCompressExit();
+
 static FORCE_INLINE int tsCompressTinyint(const char *const input, int inputSize, const int nelements, char *const output, int outputSize, char algorithm,
                       char *const buffer, int bufferSize) {
   if (algorithm == ONE_STAGE_COMP) {
@@ -239,6 +242,27 @@ static FORCE_INLINE int tsDecompressDouble(const char *const input, int compress
   }
 }
 
+static FORCE_INLINE int tsCompressFloatLossy(const char *const input, int inputSize, const int nelements, char *const output, int outputSize,
+                    char algorithm, char *const buffer, int bufferSize) {
+                      return -1;
+                    }
+
+static FORCE_INLINE int tsDecompressFloatLossy(const char *const input, int compressedSize, const int nelements, char *const output,
+                      int outputSize, char algorithm, char *const buffer, int bufferSize){
+                        return -1;
+                      }
+
+static FORCE_INLINE int tsCompressDoubleLossy(const char *const input, int inputSize, const int nelements, char *const output, int outputSize,
+                     char algorithm, char *const buffer, int bufferSize){
+                       return -1;
+                     }
+
+static FORCE_INLINE int tsDecompressDoubleLossy(const char *const input, int compressedSize, const int nelements, char *const output,
+                       int outputSize, char algorithm, char *const buffer, int bufferSize){
+                         return -1;
+                       }
+
+
 static FORCE_INLINE int tsCompressTimestamp(const char *const input, int inputSize, const int nelements, char *const output, int outputSize,
                         char algorithm, char *const buffer, int bufferSize) {
   if (algorithm == ONE_STAGE_COMP) {
diff --git a/src/util/src/tcompression.c b/src/util/src/tcompression.c
index 1de6e76f7150b85dd804fbe6cbfa7cb3b1487895..2d917d361d9139d0960b91ce51e02784675fcff1 100644
--- a/src/util/src/tcompression.c
+++ b/src/util/src/tcompression.c
@@ -886,3 +886,57 @@ int tsDecompressFloatImp(const char *const input, const int nelements, char *con
 
   return nelements * FLOAT_BYTES;
 }
+
+//
+// ----------- global init and exit resource ------
+//
+int SZ_Init(const char *configFilePath);
+
+
+bool tsCompressInit() {
+  int i = 7*9;
+  i %= 10;
+  if (i > 10000) {
+  tsCompressFloatLossy(NULL, 0, 0, NULL, 0, 0, NULL, 0);
+  tsDecompressFloatLossy(NULL, 0, 0, NULL, 0, 0, NULL, 0);
+  tsCompressDoubleLossy(NULL, 0, 0, NULL, 0, 0, NULL, 0);
+  tsDecompressDoubleLossy(NULL, 0, 0, NULL, 0, 0, NULL, 0); 
+  tsCompressExit();
+ 
+  }
+
+  SZ_Init("./sz.config");
+  return true;
+}
+
+void tsCompressExit(){
+  tsCompressInit();
+}
+
+//
+//   ----------  float double lossy  -----------
+//
+
+/*
+static int  tsCompressFloatLossy(const char *const input, int inputSize, const int nelements, char *const output, int outputSize,
+                    char algorithm, char *const buffer, int bufferSize) {
+
+    return -1;
+}
+
+static int tsDecompressFloatLossy(const char *const input, int compressedSize, const int nelements, char *const output,
+                      int outputSize, char algorithm, char *const buffer, int bufferSize) {
+
+    return -1;
+}
+
+static int tsCompressDoubleLossy(const char *const input, int inputSize, const int nelements, char *const output, int outputSize,
+                     char algorithm, char *const buffer, int bufferSize) {
+    return -1;
+}
+
+static int tsDecompressDoubleLossy(const char *const input, int compressedSize, const int nelements, char *const output,
+                       int outputSize, char algorithm, char *const buffer, int bufferSize) {
+    return -1;
+}
+*/
\ No newline at end of file
diff --git a/tests/examples/c/subscribe.c b/tests/examples/c/subscribe.c
index ad12f0e7a55b0f471f249f92f30cf659c94586a5..e158aa2da90de03fc744db9c98b454d2513be5ea 100644
--- a/tests/examples/c/subscribe.c
+++ b/tests/examples/c/subscribe.c
@@ -7,257 +7,135 @@
 #include <taos.h>  // include TDengine header file
 #include <unistd.h>
 
-int nTotalRows;
 
-void print_result(TAOS_RES* res, int blockFetch) {
-  TAOS_ROW    row = NULL;
-  int         num_fields = taos_num_fields(res);
-  TAOS_FIELD* fields = taos_fetch_fields(res);
-  int         nRows = 0;
-  
-  if (blockFetch) {
-    nRows = taos_fetch_block(res, &row);
-    //for (int i = 0; i < nRows; i++) {
-    //  taos_print_row(buf, row + i, fields, num_fields);
-    //  puts(buf);
-    //}
-  } else {
-    while ((row = taos_fetch_row(res))) {
-      char buf[4096] = {0};
-      taos_print_row(buf, row, fields, num_fields);
-      puts(buf);
-      nRows++;
+void showme();
+float calculate_delta_t(size_t size);
+int is_lossless_compressed_data(unsigned char* compressedBytes, size_t cmpSize);
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <taos.h>  // TAOS header file
+
+static void queryDB(TAOS *taos, char *command) {
+    
+  printf("aaa");
+  /*
+    int i;
+  TAOS_RES *pSql = NULL;
+  int32_t   code = -1;
+
+  for (i = 0; i < 5; i++) {
+    if (NULL != pSql) {
+      taos_free_result(pSql);
+      pSql = NULL;
+    }
+    
+    pSql = taos_query(taos, command);
+    code = taos_errno(pSql);
+    if (0 == code) {
+      break;
     }
   }
 
-  nTotalRows += nRows;
-  printf("%d rows consumed.\n", nRows);
-}
-
-
-void subscribe_callback(TAOS_SUB* tsub, TAOS_RES *res, void* param, int code) {
-  print_result(res, *(int*)param);
-}
-
-
-void check_row_count(int line, TAOS_RES* res, int expected) {
-  int actual = 0;
-  TAOS_ROW    row;
-  while ((row = taos_fetch_row(res))) {
-    actual++;
-  }
-  if (actual != expected) {
-    printf("line %d: row count mismatch, expected: %d, actual: %d\n", line, expected, actual);
-  } else {
-    printf("line %d: %d rows consumed as expected\n", line, actual);
+  if (code != 0) {
+    fprintf(stderr, "Failed to run %s, reason: %s\n", command, taos_errstr(pSql));
+    taos_free_result(pSql);
+    taos_close(taos);
+    exit(EXIT_FAILURE);
   }
-}
-
-
-void do_query(TAOS* taos, const char* sql) {
-  TAOS_RES* res = taos_query(taos, sql);
-  taos_free_result(res);
-}
-
-
-void run_test(TAOS* taos) {
-  do_query(taos, "drop database if exists test;");
-  
-  usleep(100000);
-  do_query(taos, "create database test;");
-  usleep(100000);
-  do_query(taos, "use test;");
-
-  usleep(100000);
-  do_query(taos, "create table meters(ts timestamp, a int) tags(area int);");
-
-  do_query(taos, "create table t0 using meters tags(0);");
-  do_query(taos, "create table t1 using meters tags(1);");
-  do_query(taos, "create table t2 using meters tags(2);");
-  do_query(taos, "create table t3 using meters tags(3);");
-  do_query(taos, "create table t4 using meters tags(4);");
-  do_query(taos, "create table t5 using meters tags(5);");
-  do_query(taos, "create table t6 using meters tags(6);");
-  do_query(taos, "create table t7 using meters tags(7);");
-  do_query(taos, "create table t8 using meters tags(8);");
-  do_query(taos, "create table t9 using meters tags(9);");
-
-  do_query(taos, "insert into t0 values('2020-01-01 00:00:00.000', 0);");
-  do_query(taos, "insert into t0 values('2020-01-01 00:01:00.000', 0);");
-  do_query(taos, "insert into t0 values('2020-01-01 00:02:00.000', 0);");
-  do_query(taos, "insert into t1 values('2020-01-01 00:00:00.000', 0);");
-  do_query(taos, "insert into t1 values('2020-01-01 00:01:00.000', 0);");
-  do_query(taos, "insert into t1 values('2020-01-01 00:02:00.000', 0);");
-  do_query(taos, "insert into t1 values('2020-01-01 00:03:00.000', 0);");
-  do_query(taos, "insert into t2 values('2020-01-01 00:00:00.000', 0);");
-  do_query(taos, "insert into t2 values('2020-01-01 00:01:00.000', 0);");
-  do_query(taos, "insert into t2 values('2020-01-01 00:01:01.000', 0);");
-  do_query(taos, "insert into t2 values('2020-01-01 00:01:02.000', 0);");
-  do_query(taos, "insert into t3 values('2020-01-01 00:01:02.000', 0);");
-  do_query(taos, "insert into t4 values('2020-01-01 00:01:02.000', 0);");
-  do_query(taos, "insert into t5 values('2020-01-01 00:01:02.000', 0);");
-  do_query(taos, "insert into t6 values('2020-01-01 00:01:02.000', 0);");
-  do_query(taos, "insert into t7 values('2020-01-01 00:01:02.000', 0);");
-  do_query(taos, "insert into t8 values('2020-01-01 00:01:02.000', 0);");
-  do_query(taos, "insert into t9 values('2020-01-01 00:01:02.000', 0);");
-
-  // super tables subscription
-  usleep(1000000);
-
-  TAOS_SUB* tsub = taos_subscribe(taos, 0, "test", "select * from meters;", NULL, NULL, 0);
-  TAOS_RES* res = taos_consume(tsub);
-  check_row_count(__LINE__, res, 18);
-
-  res = taos_consume(tsub);
-  check_row_count(__LINE__, res, 0);
-
-  do_query(taos, "insert into t0 values('2020-01-01 00:02:00.001', 0);");
-  do_query(taos, "insert into t8 values('2020-01-01 00:01:03.000', 0);");
-  res = taos_consume(tsub);
-  check_row_count(__LINE__, res, 2);
-
-  do_query(taos, "insert into t2 values('2020-01-01 00:01:02.001', 0);");
-  do_query(taos, "insert into t1 values('2020-01-01 00:03:00.001', 0);");
-  res = taos_consume(tsub);
-  check_row_count(__LINE__, res, 2);
-
-  do_query(taos, "insert into t1 values('2020-01-01 00:03:00.002', 0);");
-  res = taos_consume(tsub);
-  check_row_count(__LINE__, res, 1);
-
-  // keep progress information and restart subscription
-  taos_unsubscribe(tsub, 1);
-  do_query(taos, "insert into t0 values('2020-01-01 00:04:00.000', 0);");
-  tsub = taos_subscribe(taos, 1, "test", "select * from meters;", NULL, NULL, 0);
-  res = taos_consume(tsub);
-  check_row_count(__LINE__, res, 24);
 
-  // keep progress information and continue previous subscription
-  taos_unsubscribe(tsub, 1);
-  tsub = taos_subscribe(taos, 0, "test", "select * from meters;", NULL, NULL, 0);
-  res = taos_consume(tsub);
-  check_row_count(__LINE__, res, 0);
-
-  // don't keep progress information and continue previous subscription
-  taos_unsubscribe(tsub, 0);
-  tsub = taos_subscribe(taos, 0, "test", "select * from meters;", NULL, NULL, 0);
-  res = taos_consume(tsub);
-  check_row_count(__LINE__, res, 24);
-
-  // single meter subscription
-
-  taos_unsubscribe(tsub, 0);
-  tsub = taos_subscribe(taos, 0, "test", "select * from t0;", NULL, NULL, 0);
-  res = taos_consume(tsub);
-  check_row_count(__LINE__, res, 5);
-
-  res = taos_consume(tsub);
-  check_row_count(__LINE__, res, 0);
-
-  do_query(taos, "insert into t0 values('2020-01-01 00:04:00.001', 0);");
-  res = taos_consume(tsub);
-  check_row_count(__LINE__, res, 1);
-
-  taos_unsubscribe(tsub, 0);
+  taos_free_result(pSql);
+   */
 }
 
+void Test(TAOS *taos, char *qstr, int i);
 
 int main(int argc, char *argv[]) {
-  const char* host = "127.0.0.1";
-  const char* user = "root";
-  const char* passwd = "taosdata";
-  const char* sql = "select * from meters;";
-  const char* topic = "test-multiple";
-  int async = 1, restart = 0, keep = 1, test = 0, blockFetch = 0;
-
-  for (int i = 1; i < argc; i++) {
-    if (strncmp(argv[i], "-h=", 3) == 0) {
-      host = argv[i] + 3;
-      continue;
-    }
-    if (strncmp(argv[i], "-u=", 3) == 0) {
-      user = argv[i] + 3;
-      continue;
-    }
-    if (strncmp(argv[i], "-p=", 3) == 0) {
-      passwd = argv[i] + 3;
-      continue;
-    }
-    if (strcmp(argv[i], "-sync") == 0) {
-      async = 0;
-      continue;
-    }
-    if (strcmp(argv[i], "-restart") == 0) {
-      restart = 1;
-      continue;
-    }
-    if (strcmp(argv[i], "-single") == 0) {
-      sql = "select * from t0;";
-      topic = "test-single";
-      continue;
-    }
-    if (strcmp(argv[i], "-nokeep") == 0) {
-      keep = 0;
-      continue;
-    }
-    if (strncmp(argv[i], "-sql=", 5) == 0) {
-      sql = argv[i] + 5;
-      topic = "test-custom";
-      continue;
-    }
-    if (strcmp(argv[i], "-test") == 0) {
-      test = 1;
-      continue;
-    }
-    if (strcmp(argv[i], "-block-fetch") == 0) {
-      blockFetch = 1;
-      continue;
-    }
+  //char      qstr[1024];
+    
+ is_lossless_compressed_data(NULL,0);
+
+  // connect to server
+  if (argc < 2) {
+    printf("please input server-ip \n");
+    return 0;
   }
 
-  TAOS* taos = taos_connect(host, user, passwd, "", 0);
+  TAOS *taos = taos_connect(argv[1], "root", "taosdata", NULL, 0);
   if (taos == NULL) {
-    printf("failed to connect to db, reason:%s\n", taos_errstr(taos));
+    printf("failed to connect to server, reason:%s\n", "null taos"/*taos_errstr(taos)*/);
     exit(1);
   }
-
-  if (test) {
-    run_test(taos);
-    taos_close(taos);
-    exit(0);
-  }
-
-  taos_select_db(taos, "test");
-  TAOS_SUB* tsub = NULL;
-  if (async) {
-    // create an asynchronized subscription, the callback function will be called every 1s
-    tsub = taos_subscribe(taos, restart, topic, sql, subscribe_callback, &blockFetch, 1000);
-  } else {
-    // create an synchronized subscription, need to call 'taos_consume' manually
-    tsub = taos_subscribe(taos, restart, topic, sql, NULL, NULL, 0);
+  /*
+  for (int i = 0; i < 100; i++) {
+    Test(taos, qstr, i);
   }
-
-  if (tsub == NULL) {
-    printf("failed to create subscription.\n");
-    exit(0);
-  } 
-
-  if (async) {
-    getchar();
-  } else while(1) {
-    TAOS_RES* res = taos_consume(tsub);
-    if (res == NULL) {
-      printf("failed to consume data.");
-      break;
+  taos_close(taos);
+  taos_cleanup();
+   */
+}
+void Test(TAOS *taos, char *qstr, int index)  {
+    
+  printf("==================test at %d\n================================", index);
+  
+  queryDB(taos, "drop database if exists demo");
+  queryDB(taos, "create database demo");
+  //TAOS_RES *result;
+  queryDB(taos, "use demo");
+
+  queryDB(taos, "create table m1 (ts timestamp, ti tinyint, si smallint, i int, bi bigint, f float, d double, b binary(10))");
+  printf("success to create table\n");
+
+     /*
+  int i = 0;
+  for (i = 0; i < 10; ++i) {
+    sprintf(qstr, "insert into m1 values (%" PRId64 ", %d, %d, %d, %d, %f, %lf, '%s')", (uint64_t)(1546300800000 + i * 1000), i, i, i, i*10000000, i*1.0, i*2.0, "hello");
+    printf("qstr: %s\n", qstr);
+    
+    // note: how do you wanna do if taos_query returns non-NULL
+    // if (taos_query(taos, qstr)) {
+    //   printf("insert row: %i, reason:%s\n", i, taos_errstr(taos));
+    // }
+    TAOS_RES *result1 = taos_query(taos, qstr);
+    if (result1 == NULL || taos_errno(result1) != 0) {
+      printf("failed to insert row, reason:%s\n", taos_errstr(result1));
+      taos_free_result(result1);
+      exit(1);
     } else {
-      print_result(res, blockFetch);
-      getchar();
+      printf("insert row: %i\n", i);
     }
+    taos_free_result(result1);
+  }
+  printf("success to insert rows, total %d rows\n", i);
+
+  // query the records
+  sprintf(qstr, "SELECT * FROM m1");
+  result = taos_query(taos, qstr);
+  if (result == NULL || taos_errno(result) != 0) {
+    printf("failed to select, reason:%s\n", taos_errstr(result));
+    taos_free_result(result);
+    exit(1);
   }
 
-  printf("total rows consumed: %d\n", nTotalRows);
-  taos_unsubscribe(tsub, keep);
-  taos_close(taos);
+  TAOS_ROW    row;
+  int         rows = 0;
+  int         num_fields = taos_field_count(result);
+  TAOS_FIELD *fields = taos_fetch_fields(result);
+
+  printf("num_fields = %d\n", num_fields);
+  printf("select * from table, result:\n");
+  // fetch the records row by row
+  while ((row = taos_fetch_row(result))) {
+    char temp[1024] = {0};
+    rows++;
+    taos_print_row(temp, row, fields, num_fields);
+    printf("%s\n", temp);
+  }
 
-  return 0;
+  taos_free_result(result);
+  printf("====demo end====\n\n");
+      */
 }
+