From 8dd9d6b1f6ccc2742692f81e78f3f1c326e4af84 Mon Sep 17 00:00:00 2001 From: Liangliang He Date: Mon, 28 May 2018 20:00:51 +0800 Subject: [PATCH] Add kernels benchmark against eigen --- WORKSPACE | 34 +- mace/kernels/BUILD | 89 +- mace/kernels/matmul_benchmark.cc | 95 + mace/kernels/memory_benchmark.cc | 111 + third_party/eigen3/BUILD | 33 + third_party/eigen3/Eigen/Cholesky | 1 + third_party/eigen3/Eigen/Core | 1 + third_party/eigen3/Eigen/Eigenvalues | 1 + third_party/eigen3/Eigen/LU | 1 + third_party/eigen3/Eigen/QR | 1 + third_party/eigen3/Eigen/SVD | 1 + third_party/eigen3/LICENSE | 1936 +++++++++++++++++ third_party/eigen3/eigen.BUILD | 71 + .../eigen3/unsupported/Eigen/CXX11/Core | 46 + .../eigen3/unsupported/Eigen/CXX11/FixedPoint | 55 + .../unsupported/Eigen/CXX11/NeuralNetworks | 35 + .../eigen3/unsupported/Eigen/CXX11/Tensor | 15 + .../eigen3/unsupported/Eigen/CXX11/ThreadPool | 1 + .../CXX11/src/FixedPoint/FixedPointTypes.h | 342 +++ .../CXX11/src/FixedPoint/MatMatProduct.h | 255 +++ .../CXX11/src/FixedPoint/MatMatProductAVX2.h | 1754 +++++++++++++++ .../CXX11/src/FixedPoint/MatMatProductNEON.h | 95 + .../CXX11/src/FixedPoint/MatVecProduct.h | 123 ++ .../CXX11/src/FixedPoint/PacketMathAVX2.h | 476 ++++ .../CXX11/src/FixedPoint/PacketMathAVX512.h | 545 +++++ .../CXX11/src/FixedPoint/TypeCastingAVX2.h | 66 + .../CXX11/src/FixedPoint/TypeCastingAVX512.h | 180 ++ .../CXX11/src/NeuralNetworks/Activations.h | 116 + .../CXX11/src/NeuralNetworks/Attention.h | 209 ++ .../BackwardCuboidConvolutions.h | 523 +++++ .../BackwardSpatialConvolutions.h | 351 +++ .../src/NeuralNetworks/CuboidConvolution.h | 179 ++ .../Eigen/CXX11/src/NeuralNetworks/Patch3d.h | 240 ++ .../Eigen/CXX11/src/NeuralNetworks/Pooling.h | 433 ++++ .../Eigen/CXX11/src/NeuralNetworks/SoftMax.h | 83 + .../src/NeuralNetworks/SpatialConvolutions.h | 775 +++++++ .../NeuralNetworks/TensorConvolutionByFFT.h | 289 +++ .../eigen3/unsupported/Eigen/SpecialFunctions | 1 + third_party/gemmlowp/LICENSE | 202 ++ 39 files changed, 9739 insertions(+), 25 deletions(-) create mode 100644 mace/kernels/matmul_benchmark.cc create mode 100644 mace/kernels/memory_benchmark.cc create mode 100644 third_party/eigen3/BUILD create mode 100644 third_party/eigen3/Eigen/Cholesky create mode 100644 third_party/eigen3/Eigen/Core create mode 100644 third_party/eigen3/Eigen/Eigenvalues create mode 100644 third_party/eigen3/Eigen/LU create mode 100644 third_party/eigen3/Eigen/QR create mode 100644 third_party/eigen3/Eigen/SVD create mode 100644 third_party/eigen3/LICENSE create mode 100644 third_party/eigen3/eigen.BUILD create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/Core create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/Tensor create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h create mode 100644 
third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h create mode 100644 third_party/eigen3/unsupported/Eigen/SpecialFunctions create mode 100644 third_party/gemmlowp/LICENSE diff --git a/WORKSPACE b/WORKSPACE index 9b620936..e62557bd 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -9,7 +9,7 @@ http_archive( strip_prefix = "protobuf-3.4.0", urls = [ "https://cnbj1.fds.api.xiaomi.com/mace/third-party/protobuf/protobuf-3.4.0.zip", - "https://github.com/google/protobuf/archive/v3.4.0.zip" + "https://github.com/google/protobuf/archive/v3.4.0.zip", ], ) @@ -20,7 +20,7 @@ new_http_archive( strip_prefix = "googletest-release-1.8.0", urls = [ "https://cnbj1.fds.api.xiaomi.com/mace/third-party/googletest/googletest-release-1.8.0.zip", - "https://github.com/google/googletest/archive/release-1.8.0.zip" + "https://github.com/google/googletest/archive/release-1.8.0.zip", ], ) @@ -31,7 +31,7 @@ new_http_archive( strip_prefix = "OpenCL-Headers-master", urls = [ "https://cnbj1.fds.api.xiaomi.com/mace/third-party/OpenCL-Headers/OpenCL-Headers-master.zip", - "https://github.com/KhronosGroup/OpenCL-Headers/archive/master.zip" + "https://github.com/KhronosGroup/OpenCL-Headers/archive/master.zip", ], ) @@ -42,7 +42,7 @@ new_http_archive( strip_prefix = "OpenCL-CLHPP-4c6f7d56271727e37fb19a9b47649dd175df2b12", urls = [ "https://cnbj1.fds.api.xiaomi.com/mace/third-party/OpenCL-CLHPP/OpenCL-CLHPP-4c6f7d56271727e37fb19a9b47649dd175df2b12.zip", - "https://github.com/KhronosGroup/OpenCL-CLHPP/archive/4c6f7d56271727e37fb19a9b47649dd175df2b12.zip" + "https://github.com/KhronosGroup/OpenCL-CLHPP/archive/4c6f7d56271727e37fb19a9b47649dd175df2b12.zip", ], ) @@ -53,7 +53,29 @@ new_http_archive( strip_prefix = "half-code-356-trunk", urls = [ "https://cnbj1.fds.api.xiaomi.com/mace/third-party/half/half-code-356-trunk.zip", - "https://sourceforge.net/code-snapshots/svn/h/ha/half/code/half-code-356-trunk.zip" + "https://sourceforge.net/code-snapshots/svn/h/ha/half/code/half-code-356-trunk.zip", + ], +) + +new_http_archive( + name = "eigen", + build_file = "third_party/eigen3/eigen.BUILD", + sha256 = "ca7beac153d4059c02c8fc59816c82d54ea47fe58365e8aded4082ded0b820c4", + strip_prefix = "eigen-eigen-f3a22f35b044", + urls = [ + "http://cnbj1.fds.api.xiaomi.com/mace/third-party/eigen/f3a22f35b044.tar.gz", + 
"http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz", + "https://bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz", + ], +) + +http_archive( + name = "gemmlowp", + sha256 = "b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658", + strip_prefix = "gemmlowp-38ebac7b059e84692f53e5938f97a9943c120d98", + urls = [ + "http://cnbj1.fds.api.xiaomi.com/mace/third-party/gemmlowp/38ebac7b059e84692f53e5938f97a9943c120d98.zip", + "https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip", ], ) @@ -81,7 +103,7 @@ http_archive( strip_prefix = "gflags-30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e", urls = [ "https://cnbj1.fds.api.xiaomi.com/mace/third-party/gflags/gflags-30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e.zip", - "https://github.com/gflags/gflags/archive/30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e.zip" + "https://github.com/gflags/gflags/archive/30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e.zip", ], ) diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD index 3e837d85..a1200cfa 100644 --- a/mace/kernels/BUILD +++ b/mace/kernels/BUILD @@ -18,14 +18,17 @@ cc_library( ], exclude = [ "*_test.cc", + "*_benchmark.cc", "arm/*_test.cc", ], - ) + if_android(glob([ + ) + if_android(glob( + [ "opencl/*.cc", ], exclude = [ "opencl/*_test.cc", - ])), + ], + )), hdrs = glob( [ "*.h", @@ -35,16 +38,26 @@ cc_library( "buffer_to_image.h", ], ) + if_android(glob([ - "opencl/*.h", - "buffer_to_image.h", - ])), - copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] + - if_openmp_enabled(["-fopenmp"]) + - if_neon_enabled(["-DMACE_ENABLE_NEON"]) + - if_android_armv7(["-mfpu=neon"]) + - if_android_armv7(["-mfloat-abi=softfp"]) + - if_android(["-DMACE_ENABLE_OPENCL"]) + - if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), + "opencl/*.h", + "buffer_to_image.h", + ])), + copts = [ + "-Werror", + "-Wextra", + "-Wno-missing-field-initializers", + ] + if_openmp_enabled([ + "-fopenmp", + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + ]) + if_android_armv7([ + "-mfloat-abi=softfp", + ]) + if_android([ + "-DMACE_ENABLE_OPENCL", + ]) + if_hexagon_enabled([ + "-DMACE_ENABLE_HEXAGON", + ]), linkopts = if_android(["-lm"]), deps = [ "//mace/core", @@ -62,13 +75,22 @@ cc_test( "opencl/*_test.cc", ], ), - copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] + - if_openmp_enabled(["-fopenmp"]) + - if_neon_enabled(["-DMACE_ENABLE_NEON"]) + - if_android_armv7(["-mfpu=neon"]) + - if_android_armv7(["-mfloat-abi=softfp"]) + - if_android(["-DMACE_ENABLE_OPENCL"]) + - if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), + copts = [ + "-Werror", + "-Wextra", + "-Wno-missing-field-initializers", + ] + if_openmp_enabled([ + "-fopenmp", + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + "-mfloat-abi=softfp", + ]) + if_android([ + "-DMACE_ENABLE_OPENCL", + ]) + if_hexagon_enabled([ + "-DMACE_ENABLE_HEXAGON", + ]), linkopts = ["-fopenmp"], linkstatic = 1, deps = [ @@ -77,3 +99,32 @@ cc_test( "@gtest//:gtest_main", ], ) + +cc_test( + name = "kernels_benchmark", + testonly = 1, + srcs = glob(["*_benchmark.cc"]), + copts = [ + "-Werror", + "-Wextra", + "-Wno-missing-field-initializers", + ] + if_openmp_enabled([ + "-fopenmp", + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + "-mfloat-abi=softfp", + ]) + if_android([ + "-DMACE_ENABLE_OPENCL", + ]) + if_hexagon_enabled([ + "-DMACE_ENABLE_HEXAGON", + ]), + linkopts = ["-fopenmp"], + linkstatic = 1, + deps = [ + 
":kernels", + "//mace/core:test_benchmark_main", + "//third_party/eigen3", + ], +) diff --git a/mace/kernels/matmul_benchmark.cc b/mace/kernels/matmul_benchmark.cc new file mode 100644 index 00000000..d06ff317 --- /dev/null +++ b/mace/kernels/matmul_benchmark.cc @@ -0,0 +1,95 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "mace/core/testing/test_benchmark.h" +#include "mace/kernels/gemm.h" +#include "public/gemmlowp.h" + +namespace mace { +namespace kernels { +namespace test { + +// Test the speed of different access order of a NHWC buffer + +namespace { + +// Matmul with (m, k) x (k, n) +void MatmulBenchmark_Mace(int iters, int m, int k, int n) { + mace::testing::StopTiming(); + std::vector lhs(m * k); + std::vector rhs(k * n); + std::vector result(m * n); + // warm up + Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data()); + mace::testing::StartTiming(); + while (iters--) { + Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data()); + } +} + +void MatmulBenchmark_Eigen(int iters, int m, int k, int n) { + mace::testing::StopTiming(); + Eigen::MatrixXd lhs = Eigen::MatrixXd::Random(m, k); + Eigen::MatrixXd rhs = Eigen::MatrixXd::Random(k, n); + Eigen::MatrixXd result = Eigen::MatrixXd::Zero(m, n); + // warm up + result = lhs * rhs; + mace::testing::StartTiming(); + while (iters--) { + result = lhs * rhs; + } +} + +} // namespace + +#define MACE_BM_MATMUL_FUNC(M, K, N, FUNC) \ + static void MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC(int iters) { \ + const int64_t macc = static_cast(iters) * M * K * N; \ + const int64_t tot = static_cast(iters) * (M + N) * K; \ + mace::testing::MaccProcessed(macc); \ + mace::testing::BytesProcessed(tot * sizeof(float)); \ + MatmulBenchmark_##FUNC(iters, M, K, N); \ + } \ + MACE_BENCHMARK(MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC) + +#define MACE_BM_MATMUL(M, K, N) \ + MACE_BM_MATMUL_FUNC(M, K, N, Mace); \ + MACE_BM_MATMUL_FUNC(M, K, N, Eigen); + +// Embedding size 384 +MACE_BM_MATMUL(7, 384, 384); +MACE_BM_MATMUL(7, 384, 1536); +MACE_BM_MATMUL(7, 1536, 384); + +MACE_BM_MATMUL(15, 384, 384); +MACE_BM_MATMUL(15, 384, 1536); +MACE_BM_MATMUL(15, 1536, 384); + +MACE_BM_MATMUL(1, 384, 384); +MACE_BM_MATMUL(1, 384, 1536); +MACE_BM_MATMUL(1, 1536, 384); +MACE_BM_MATMUL(1, 384, 44678); + +// Embedding size 128 +MACE_BM_MATMUL(1, 128, 1536); +MACE_BM_MATMUL(1, 128, 44678); + +} // namespace test +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/memory_benchmark.cc b/mace/kernels/memory_benchmark.cc new file mode 100644 index 00000000..5d9ab1f4 --- /dev/null +++ b/mace/kernels/memory_benchmark.cc @@ -0,0 +1,111 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <algorithm> +#include <string> +#include <vector> + +#include "mace/core/testing/test_benchmark.h" + +namespace mace { +namespace kernels { +namespace test { + +// Test the speed of different access order of a NHWC buffer + +namespace { +void MemoryAccessBenchmark_NHWC( + int iters, int batch, int height, int width, int channels) { + mace::testing::StopTiming(); + std::vector<float> buffer(batch * height * width * channels); + std::fill_n(buffer.begin(), buffer.size(), 0.1); + mace::testing::StartTiming(); + + while (iters--) { + for (int n = 0; n < batch; ++n) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + for (int c = 0; c < channels; ++c) { + buffer[n * height * width * channels + h * width * channels + + w * channels + c] = 1.0f; + } + } + } + } + } +} + +void MemoryAccessBenchmark_NWCH( + int iters, int batch, int height, int width, int channels) { + mace::testing::StopTiming(); + std::vector<float> buffer(batch * height * width * channels); + std::fill_n(buffer.begin(), buffer.size(), 0.1); + mace::testing::StartTiming(); + + while (iters--) { + for (int n = 0; n < batch; ++n) { + for (int w = 0; w < width; ++w) { + for (int c = 0; c < channels; ++c) { + for (int h = 0; h < height; ++h) { + buffer[n * height * width * channels + h * width * channels + + w * channels + c] = 1.0f; + } + } + } + } + } +} + +void MemoryAccessBenchmark_NHCW( + int iters, int batch, int height, int width, int channels) { + mace::testing::StopTiming(); + std::vector<float> buffer(batch * height * width * channels); + std::fill_n(buffer.begin(), buffer.size(), 0.1); + mace::testing::StartTiming(); + + while (iters--) { + for (int n = 0; n < batch; ++n) { + for (int h = 0; h < height; ++h) { + for (int c = 0; c < channels; ++c) { + for (int w = 0; w < width; ++w) { + buffer[n * height * width * channels + h * width * channels + + w * channels + c] = 1.0f; + } + } + } + } + } +} + +} // namespace + +#define MACE_BM_MEMORY_ACCESS(N, H, W, C, ORDER) \ + static void MACE_BM_MEMORY_ACCESS_##N##_##H##_##W##_##C##_##ORDER( \ + int iters) { \ + const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot * sizeof(float)); \ + MemoryAccessBenchmark_##ORDER(iters, N, H, W, C); \ + } \ + MACE_BENCHMARK(MACE_BM_MEMORY_ACCESS_##N##_##H##_##W##_##C##_##ORDER) + +MACE_BM_MEMORY_ACCESS(10, 64, 64, 1024, NHWC); +MACE_BM_MEMORY_ACCESS(10, 64, 64, 1024, NHCW); +MACE_BM_MEMORY_ACCESS(10, 64, 64, 1024, NWCH); +MACE_BM_MEMORY_ACCESS(10, 64, 1024, 64, NHCW); +MACE_BM_MEMORY_ACCESS(10, 64, 1024, 64, NWCH); + +} // namespace test +} // namespace kernels +} // namespace mace diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD new file mode 100644 index 00000000..89e25857 --- /dev/null +++ b/third_party/eigen3/BUILD @@ -0,0 +1,33 @@ +# Description: +# Eigen is a C++ template library for linear algebra: vectors, +# matrices, and related algorithms. +# This file is mostly adapted from TensorFlow. + +licenses([ + # Note: Eigen is an MPL2 library that includes GPL v3 and LGPL v2.1+ code.
+ # We've taken special care to not reference any restricted code. + "reciprocal", # MPL2 + "notice", # Portions BSD +]) + +exports_files(["LICENSE"]) + +cc_library( + name = "eigen3", + hdrs = glob(["unsupported/Eigen/CXX11/src/FixedPoint/*.h"]) + [ + "Eigen/Core", + "Eigen/LU", + "Eigen/Cholesky", + "Eigen/Eigenvalues", + "Eigen/QR", + "Eigen/SVD", + "unsupported/Eigen/SpecialFunctions", + "unsupported/Eigen/CXX11/ThreadPool", + "unsupported/Eigen/CXX11/Tensor", + "unsupported/Eigen/CXX11/FixedPoint", + ], + visibility = ["//visibility:public"], + deps = [ + "@eigen//:eigen", + ], +) diff --git a/third_party/eigen3/Eigen/Cholesky b/third_party/eigen3/Eigen/Cholesky new file mode 100644 index 00000000..c199a025 --- /dev/null +++ b/third_party/eigen3/Eigen/Cholesky @@ -0,0 +1 @@ +#include "Eigen/Cholesky" diff --git a/third_party/eigen3/Eigen/Core b/third_party/eigen3/Eigen/Core new file mode 100644 index 00000000..d4b03677 --- /dev/null +++ b/third_party/eigen3/Eigen/Core @@ -0,0 +1 @@ +#include "Eigen/Core" diff --git a/third_party/eigen3/Eigen/Eigenvalues b/third_party/eigen3/Eigen/Eigenvalues new file mode 100644 index 00000000..bf739b9b --- /dev/null +++ b/third_party/eigen3/Eigen/Eigenvalues @@ -0,0 +1 @@ +#include "Eigen/Eigenvalues" diff --git a/third_party/eigen3/Eigen/LU b/third_party/eigen3/Eigen/LU new file mode 100644 index 00000000..536149ce --- /dev/null +++ b/third_party/eigen3/Eigen/LU @@ -0,0 +1 @@ +#include "Eigen/LU" diff --git a/third_party/eigen3/Eigen/QR b/third_party/eigen3/Eigen/QR new file mode 100644 index 00000000..be067d3e --- /dev/null +++ b/third_party/eigen3/Eigen/QR @@ -0,0 +1 @@ +#include "Eigen/QR" diff --git a/third_party/eigen3/Eigen/SVD b/third_party/eigen3/Eigen/SVD new file mode 100644 index 00000000..eecf47c1 --- /dev/null +++ b/third_party/eigen3/Eigen/SVD @@ -0,0 +1 @@ +#include "Eigen/SVD" diff --git a/third_party/eigen3/LICENSE b/third_party/eigen3/LICENSE new file mode 100644 index 00000000..a25d8e6f --- /dev/null +++ b/third_party/eigen3/LICENSE @@ -0,0 +1,1936 @@ +Eigen is primarily MPL2 licensed. See COPYING.MPL2 and these links: + http://www.mozilla.org/MPL/2.0/ + http://www.mozilla.org/MPL/2.0/FAQ.html + +Some files contain third-party code under BSD or LGPL licenses, whence +the other COPYING.* files here. + +All the LGPL code is either LGPL 2.1-only, or LGPL 2.1-or-later. +For this reason, the COPYING.LGPL file contains the LGPL 2.1 text. + +If you want to guarantee that the Eigen code that you are #including +is licensed under the MPL2 and possibly more permissive licenses (like +BSD), #define this preprocessor symbol: EIGEN_MPL2_ONLY +For example, with most compilers, you could add this to your project + CXXFLAGS: -DEIGEN_MPL2_ONLY +This will cause a compilation error to be generated if you #include +any code that is LGPL licensed. 
+ +---------------------------------------------------------------------- +Following applies to: +./test/mapstaticmethods.cpp +./test/schur_real.cpp +./test/prec_inverse_4x4.cpp +./test/smallvectors.cpp +./test/redux.cpp +./test/special_numbers.cpp +./test/adjoint.cpp +./test/resize.cpp +./test/mixingtypes.cpp +./test/product_trmv.cpp +./test/sparse_solvers.cpp +./test/cholesky.cpp +./test/geo_quaternion.cpp +./test/miscmatrices.cpp +./test/stddeque.cpp +./test/integer_types.cpp +./test/product_large.cpp +./test/eigensolver_generic.cpp +./test/householder.cpp +./test/geo_orthomethods.cpp +./test/array_for_matrix.cpp +./test/sparseLM.cpp +./test/upperbidiagonalization.cpp +./test/nomalloc.cpp +./test/packetmath.cpp +./test/jacobisvd.cpp +./test/geo_transformations.cpp +./test/swap.cpp +./test/eigensolver_selfadjoint.cpp +./test/inverse.cpp +./test/product_selfadjoint.cpp +./test/product_trsolve.cpp +./test/product_extra.cpp +./test/sparse_solver.h +./test/mapstride.cpp +./test/mapped_matrix.cpp +./test/geo_eulerangles.cpp +./test/eigen2support.cpp +./test/denseLM.cpp +./test/stdvector.cpp +./test/nesting_ops.cpp +./test/sparse_permutations.cpp +./test/zerosized.cpp +./test/exceptions.cpp +./test/vectorwiseop.cpp +./test/cwiseop.cpp +./test/basicstuff.cpp +./test/product_trmm.cpp +./test/linearstructure.cpp +./test/sparse_product.cpp +./test/stdvector_overload.cpp +./test/stable_norm.cpp +./test/umeyama.cpp +./test/unalignedcount.cpp +./test/triangular.cpp +./test/product_mmtr.cpp +./test/sparse_basic.cpp +./test/sparse_vector.cpp +./test/meta.cpp +./test/real_qz.cpp +./test/ref.cpp +./test/eigensolver_complex.cpp +./test/cholmod_support.cpp +./test/conjugate_gradient.cpp +./test/sparse.h +./test/simplicial_cholesky.cpp +./test/bicgstab.cpp +./test/dynalloc.cpp +./test/product_notemporary.cpp +./test/geo_hyperplane.cpp +./test/lu.cpp +./test/qr.cpp +./test/hessenberg.cpp +./test/sizeof.cpp +./test/main.h +./test/selfadjoint.cpp +./test/permutationmatrices.cpp +./test/superlu_support.cpp +./test/qtvector.cpp +./test/geo_homogeneous.cpp +./test/determinant.cpp +./test/array_reverse.cpp +./test/unalignedassert.cpp +./test/stdlist.cpp +./test/product_symm.cpp +./test/corners.cpp +./test/dontalign.cpp +./test/visitor.cpp +./test/geo_alignedbox.cpp +./test/diagonalmatrices.cpp +./test/product_small.cpp +./test/eigensolver_generalized_real.cpp +./test/umfpack_support.cpp +./test/first_aligned.cpp +./test/qr_fullpivoting.cpp +./test/array_replicate.cpp +./test/geo_parametrizedline.cpp +./test/eigen2/eigen2_unalignedassert.cpp +./test/eigen2/eigen2_prec_inverse_4x4.cpp +./test/eigen2/eigen2_alignedbox.cpp +./test/eigen2/eigen2_sparse_product.cpp +./test/eigen2/eigen2_meta.cpp +./test/eigen2/eigen2_nomalloc.cpp +./test/eigen2/eigen2_visitor.cpp +./test/eigen2/eigen2_packetmath.cpp +./test/eigen2/eigen2_svd.cpp +./test/eigen2/eigen2_mixingtypes.cpp +./test/eigen2/eigen2_qr.cpp +./test/eigen2/eigen2_cwiseop.cpp +./test/eigen2/eigen2_geometry_with_eigen2_prefix.cpp +./test/eigen2/eigen2_smallvectors.cpp +./test/eigen2/eigen2_commainitializer.cpp +./test/eigen2/eigen2_sparse_solvers.cpp +./test/eigen2/eigen2_hyperplane.cpp +./test/eigen2/eigen2_eigensolver.cpp +./test/eigen2/eigen2_linearstructure.cpp +./test/eigen2/eigen2_sizeof.cpp +./test/eigen2/eigen2_parametrizedline.cpp +./test/eigen2/eigen2_lu.cpp +./test/eigen2/eigen2_adjoint.cpp +./test/eigen2/eigen2_geometry.cpp +./test/eigen2/eigen2_stdvector.cpp +./test/eigen2/eigen2_newstdvector.cpp +./test/eigen2/eigen2_submatrices.cpp 
+./test/eigen2/sparse.h +./test/eigen2/eigen2_swap.cpp +./test/eigen2/eigen2_triangular.cpp +./test/eigen2/eigen2_basicstuff.cpp +./test/eigen2/gsl_helper.h +./test/eigen2/eigen2_dynalloc.cpp +./test/eigen2/eigen2_array.cpp +./test/eigen2/eigen2_map.cpp +./test/eigen2/main.h +./test/eigen2/eigen2_miscmatrices.cpp +./test/eigen2/eigen2_product_large.cpp +./test/eigen2/eigen2_first_aligned.cpp +./test/eigen2/eigen2_cholesky.cpp +./test/eigen2/eigen2_determinant.cpp +./test/eigen2/eigen2_sum.cpp +./test/eigen2/eigen2_inverse.cpp +./test/eigen2/eigen2_regression.cpp +./test/eigen2/eigen2_product_small.cpp +./test/eigen2/eigen2_qtvector.cpp +./test/eigen2/eigen2_sparse_vector.cpp +./test/eigen2/product.h +./test/eigen2/eigen2_sparse_basic.cpp +./test/eigen2/eigen2_bug_132.cpp +./test/array.cpp +./test/product_syrk.cpp +./test/commainitializer.cpp +./test/conservative_resize.cpp +./test/qr_colpivoting.cpp +./test/nullary.cpp +./test/bandmatrix.cpp +./test/pastix_support.cpp +./test/product.h +./test/block.cpp +./test/vectorization_logic.cpp +./test/jacobi.cpp +./test/diagonal.cpp +./test/schur_complex.cpp +./test/sizeoverflow.cpp +./bench/BenchTimer.h +./bench/benchFFT.cpp +./bench/eig33.cpp +./bench/spbench/spbenchsolver.h +./bench/spbench/spbenchstyle.h +./lapack/complex_double.cpp +./lapack/cholesky.cpp +./lapack/lapack_common.h +./lapack/eigenvalues.cpp +./lapack/single.cpp +./lapack/lu.cpp +./lapack/complex_single.cpp +./lapack/double.cpp +./demos/mix_eigen_and_c/binary_library.cpp +./demos/mix_eigen_and_c/binary_library.h +./demos/mix_eigen_and_c/example.c +./demos/mandelbrot/mandelbrot.cpp +./demos/mandelbrot/mandelbrot.h +./demos/opengl/icosphere.cpp +./demos/opengl/icosphere.h +./demos/opengl/camera.cpp +./demos/opengl/quaternion_demo.h +./demos/opengl/camera.h +./demos/opengl/trackball.h +./demos/opengl/gpuhelper.h +./demos/opengl/trackball.cpp +./demos/opengl/gpuhelper.cpp +./demos/opengl/quaternion_demo.cpp +./debug/gdb/printers.py +./unsupported/test/minres.cpp +./unsupported/test/openglsupport.cpp +./unsupported/test/jacobisvd.cpp +./unsupported/test/dgmres.cpp +./unsupported/test/matrix_square_root.cpp +./unsupported/test/bdcsvd.cpp +./unsupported/test/matrix_exponential.cpp +./unsupported/test/forward_adolc.cpp +./unsupported/test/polynomialsolver.cpp +./unsupported/test/matrix_function.cpp +./unsupported/test/sparse_extra.cpp +./unsupported/test/matrix_functions.h +./unsupported/test/svd_common.h +./unsupported/test/FFTW.cpp +./unsupported/test/alignedvector3.cpp +./unsupported/test/autodiff.cpp +./unsupported/test/gmres.cpp +./unsupported/test/BVH.cpp +./unsupported/test/levenberg_marquardt.cpp +./unsupported/test/matrix_power.cpp +./unsupported/test/kronecker_product.cpp +./unsupported/test/splines.cpp +./unsupported/test/polynomialutils.cpp +./unsupported/bench/bench_svd.cpp +./unsupported/Eigen/IterativeSolvers +./unsupported/Eigen/src/IterativeSolvers/DGMRES.h +./unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h +./unsupported/Eigen/src/IterativeSolvers/GMRES.h +./unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h +./unsupported/Eigen/src/IterativeSolvers/Scaling.h +./unsupported/Eigen/src/IterativeSolvers/MINRES.h +./unsupported/Eigen/src/SparseExtra/RandomSetter.h +./unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h +./unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h +./unsupported/Eigen/src/SparseExtra/MarketIO.h +./unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h +./unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h 
+./unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h +./unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h +./unsupported/Eigen/src/BVH/BVAlgorithms.h +./unsupported/Eigen/src/BVH/KdBVH.h +./unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +./unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h +./unsupported/Eigen/src/AutoDiff/AutoDiffVector.h +./unsupported/Eigen/src/Splines/Spline.h +./unsupported/Eigen/src/Splines/SplineFitting.h +./unsupported/Eigen/src/Splines/SplineFwd.h +./unsupported/Eigen/src/SVD/JacobiSVD.h +./unsupported/Eigen/src/SVD/BDCSVD.h +./unsupported/Eigen/src/SVD/SVDBase.h +./unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +./unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h +./unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +./unsupported/Eigen/src/MatrixFunctions/StemFunction.h +./unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +./unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +./unsupported/Eigen/src/MatrixFunctions/MatrixFunctionAtomic.h +./unsupported/Eigen/src/MoreVectorization/MathFunctions.h +./unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h +./unsupported/Eigen/src/FFT/ei_fftw_impl.h +./unsupported/Eigen/src/FFT/ei_kissfft_impl.h +./unsupported/Eigen/src/Polynomials/PolynomialSolver.h +./unsupported/Eigen/src/Polynomials/Companion.h +./unsupported/Eigen/src/Polynomials/PolynomialUtils.h +./unsupported/Eigen/src/NumericalDiff/NumericalDiff.h +./unsupported/Eigen/src/Skyline/SkylineProduct.h +./unsupported/Eigen/src/Skyline/SkylineMatrixBase.h +./unsupported/Eigen/src/Skyline/SkylineStorage.h +./unsupported/Eigen/src/Skyline/SkylineUtil.h +./unsupported/Eigen/src/Skyline/SkylineInplaceLU.h +./unsupported/Eigen/src/Skyline/SkylineMatrix.h +./unsupported/Eigen/SparseExtra +./unsupported/Eigen/AdolcForward +./unsupported/Eigen/KroneckerProduct +./unsupported/Eigen/NonLinearOptimization +./unsupported/Eigen/BVH +./unsupported/Eigen/OpenGLSupport +./unsupported/Eigen/ArpackSupport +./unsupported/Eigen/AutoDiff +./unsupported/Eigen/Splines +./unsupported/Eigen/MPRealSupport +./unsupported/Eigen/MatrixFunctions +./unsupported/Eigen/MoreVectorization +./unsupported/Eigen/LevenbergMarquardt +./unsupported/Eigen/AlignedVector3 +./unsupported/Eigen/FFT +./unsupported/Eigen/Polynomials +./unsupported/Eigen/NumericalDiff +./unsupported/Eigen/Skyline +./COPYING.README +./COPYING.README +./LICENSE +./LICENSE +./LICENSE +./Eigen/Eigen2Support +./Eigen/src/Eigen2Support/VectorBlock.h +./Eigen/src/Eigen2Support/Cwise.h +./Eigen/src/Eigen2Support/Minor.h +./Eigen/src/Eigen2Support/Lazy.h +./Eigen/src/Eigen2Support/Memory.h +./Eigen/src/Eigen2Support/MathFunctions.h +./Eigen/src/Eigen2Support/Geometry/AlignedBox.h +./Eigen/src/Eigen2Support/Geometry/Hyperplane.h +./Eigen/src/Eigen2Support/Geometry/Quaternion.h +./Eigen/src/Eigen2Support/Geometry/Rotation2D.h +./Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h +./Eigen/src/Eigen2Support/Geometry/RotationBase.h +./Eigen/src/Eigen2Support/Geometry/Translation.h +./Eigen/src/Eigen2Support/Geometry/Scaling.h +./Eigen/src/Eigen2Support/Geometry/AngleAxis.h +./Eigen/src/Eigen2Support/Geometry/Transform.h +./Eigen/src/Eigen2Support/TriangularSolver.h +./Eigen/src/Eigen2Support/LU.h +./Eigen/src/Eigen2Support/QR.h +./Eigen/src/Eigen2Support/SVD.h +./Eigen/src/Eigen2Support/Meta.h +./Eigen/src/Eigen2Support/Block.h +./Eigen/src/Eigen2Support/Macros.h +./Eigen/src/Eigen2Support/LeastSquares.h +./Eigen/src/Eigen2Support/CwiseOperators.h 
+./Eigen/src/Jacobi/Jacobi.h +./Eigen/src/misc/Kernel.h +./Eigen/src/misc/SparseSolve.h +./Eigen/src/misc/Solve.h +./Eigen/src/misc/Image.h +./Eigen/src/SparseCore/SparseColEtree.h +./Eigen/src/SparseCore/SparseTranspose.h +./Eigen/src/SparseCore/SparseUtil.h +./Eigen/src/SparseCore/SparseCwiseBinaryOp.h +./Eigen/src/SparseCore/SparseDiagonalProduct.h +./Eigen/src/SparseCore/SparseProduct.h +./Eigen/src/SparseCore/SparseDot.h +./Eigen/src/SparseCore/SparseCwiseUnaryOp.h +./Eigen/src/SparseCore/SparseSparseProductWithPruning.h +./Eigen/src/SparseCore/SparseBlock.h +./Eigen/src/SparseCore/SparseDenseProduct.h +./Eigen/src/SparseCore/CompressedStorage.h +./Eigen/src/SparseCore/SparseMatrixBase.h +./Eigen/src/SparseCore/MappedSparseMatrix.h +./Eigen/src/SparseCore/SparseTriangularView.h +./Eigen/src/SparseCore/SparseView.h +./Eigen/src/SparseCore/SparseFuzzy.h +./Eigen/src/SparseCore/TriangularSolver.h +./Eigen/src/SparseCore/SparseSelfAdjointView.h +./Eigen/src/SparseCore/SparseMatrix.h +./Eigen/src/SparseCore/SparseVector.h +./Eigen/src/SparseCore/AmbiVector.h +./Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +./Eigen/src/SparseCore/SparseRedux.h +./Eigen/src/SparseCore/SparsePermutation.h +./Eigen/src/Eigenvalues/RealSchur.h +./Eigen/src/Eigenvalues/ComplexEigenSolver.h +./Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +./Eigen/src/Eigenvalues/ComplexSchur.h +./Eigen/src/Eigenvalues/RealQZ.h +./Eigen/src/Eigenvalues/EigenSolver.h +./Eigen/src/Eigenvalues/HessenbergDecomposition.h +./Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +./Eigen/src/Eigenvalues/Tridiagonalization.h +./Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +./Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +./Eigen/src/SuperLUSupport/SuperLUSupport.h +./Eigen/src/StlSupport/StdDeque.h +./Eigen/src/StlSupport/StdVector.h +./Eigen/src/StlSupport/StdList.h +./Eigen/src/StlSupport/details.h +./Eigen/src/SparseQR/SparseQR.h +./Eigen/src/LU/Inverse.h +./Eigen/src/LU/arch/Inverse_SSE.h +./Eigen/src/LU/Determinant.h +./Eigen/src/LU/PartialPivLU.h +./Eigen/src/LU/FullPivLU.h +./Eigen/src/UmfPackSupport/UmfPackSupport.h +./Eigen/src/OrderingMethods/Ordering.h +./Eigen/src/OrderingMethods/Eigen_Colamd.h +./Eigen/src/QR/HouseholderQR.h +./Eigen/src/QR/ColPivHouseholderQR.h +./Eigen/src/QR/FullPivHouseholderQR.h +./Eigen/src/SVD/JacobiSVD.h +./Eigen/src/SVD/UpperBidiagonalization.h +./Eigen/src/Geometry/OrthoMethods.h +./Eigen/src/Geometry/AlignedBox.h +./Eigen/src/Geometry/Hyperplane.h +./Eigen/src/Geometry/Quaternion.h +./Eigen/src/Geometry/EulerAngles.h +./Eigen/src/Geometry/Rotation2D.h +./Eigen/src/Geometry/ParametrizedLine.h +./Eigen/src/Geometry/RotationBase.h +./Eigen/src/Geometry/arch/Geometry_SSE.h +./Eigen/src/Geometry/Umeyama.h +./Eigen/src/Geometry/Homogeneous.h +./Eigen/src/Geometry/Translation.h +./Eigen/src/Geometry/Scaling.h +./Eigen/src/Geometry/AngleAxis.h +./Eigen/src/Geometry/Transform.h +./Eigen/src/plugins/BlockMethods.h +./Eigen/src/plugins/CommonCwiseUnaryOps.h +./Eigen/src/plugins/CommonCwiseBinaryOps.h +./Eigen/src/plugins/MatrixCwiseUnaryOps.h +./Eigen/src/plugins/MatrixCwiseBinaryOps.h +./Eigen/src/Householder/Householder.h +./Eigen/src/Householder/HouseholderSequence.h +./Eigen/src/Householder/BlockHouseholder.h +./Eigen/src/Core/VectorBlock.h +./Eigen/src/Core/Matrix.h +./Eigen/src/Core/Ref.h +./Eigen/src/Core/SelfAdjointView.h +./Eigen/src/Core/MathFunctions.h +./Eigen/src/Core/GlobalFunctions.h +./Eigen/src/Core/MapBase.h +./Eigen/src/Core/EigenBase.h 
+./Eigen/src/Core/GenericPacketMath.h +./Eigen/src/Core/NestByValue.h +./Eigen/src/Core/CwiseUnaryOp.h +./Eigen/src/Core/SolveTriangular.h +./Eigen/src/Core/Fuzzy.h +./Eigen/src/Core/Visitor.h +./Eigen/src/Core/Map.h +./Eigen/src/Core/NoAlias.h +./Eigen/src/Core/Diagonal.h +./Eigen/src/Core/StableNorm.h +./Eigen/src/Core/CoreIterators.h +./Eigen/src/Core/products/Parallelizer.h +./Eigen/src/Core/products/SelfadjointMatrixVector.h +./Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +./Eigen/src/Core/products/TriangularSolverMatrix.h +./Eigen/src/Core/products/GeneralMatrixMatrix.h +./Eigen/src/Core/products/SelfadjointProduct.h +./Eigen/src/Core/products/CoeffBasedProduct.h +./Eigen/src/Core/products/TriangularMatrixVector.h +./Eigen/src/Core/products/SelfadjointMatrixMatrix.h +./Eigen/src/Core/products/TriangularSolverVector.h +./Eigen/src/Core/products/SelfadjointRank2Update.h +./Eigen/src/Core/products/GeneralBlockPanelKernel.h +./Eigen/src/Core/products/GeneralMatrixVector.h +./Eigen/src/Core/products/TriangularMatrixMatrix.h +./Eigen/src/Core/Reverse.h +./Eigen/src/Core/BooleanRedux.h +./Eigen/src/Core/Replicate.h +./Eigen/src/Core/arch/AltiVec/PacketMath.h +./Eigen/src/Core/arch/AltiVec/Complex.h +./Eigen/src/Core/arch/SSE/PacketMath.h +./Eigen/src/Core/arch/SSE/Complex.h +./Eigen/src/Core/arch/SSE/MathFunctions.h +./Eigen/src/Core/arch/NEON/PacketMath.h +./Eigen/src/Core/arch/NEON/Complex.h +./Eigen/src/Core/arch/Default/Settings.h +./Eigen/src/Core/CwiseUnaryView.h +./Eigen/src/Core/Array.h +./Eigen/src/Core/ArrayWrapper.h +./Eigen/src/Core/Swap.h +./Eigen/src/Core/Transpositions.h +./Eigen/src/Core/Random.h +./Eigen/src/Core/IO.h +./Eigen/src/Core/SelfCwiseBinaryOp.h +./Eigen/src/Core/VectorwiseOp.h +./Eigen/src/Core/Select.h +./Eigen/src/Core/ArrayBase.h +./Eigen/src/Core/DenseCoeffsBase.h +./Eigen/src/Core/DiagonalProduct.h +./Eigen/src/Core/Assign.h +./Eigen/src/Core/Redux.h +./Eigen/src/Core/ForceAlignedAccess.h +./Eigen/src/Core/BandMatrix.h +./Eigen/src/Core/PlainObjectBase.h +./Eigen/src/Core/DenseBase.h +./Eigen/src/Core/Flagged.h +./Eigen/src/Core/CwiseBinaryOp.h +./Eigen/src/Core/ProductBase.h +./Eigen/src/Core/TriangularMatrix.h +./Eigen/src/Core/Transpose.h +./Eigen/src/Core/DiagonalMatrix.h +./Eigen/src/Core/Dot.h +./Eigen/src/Core/Functors.h +./Eigen/src/Core/PermutationMatrix.h +./Eigen/src/Core/NumTraits.h +./Eigen/src/Core/MatrixBase.h +./Eigen/src/Core/DenseStorage.h +./Eigen/src/Core/util/Memory.h +./Eigen/src/Core/util/StaticAssert.h +./Eigen/src/Core/util/BlasUtil.h +./Eigen/src/Core/util/MatrixMapper.h +./Eigen/src/Core/util/XprHelper.h +./Eigen/src/Core/util/ForwardDeclarations.h +./Eigen/src/Core/util/Meta.h +./Eigen/src/Core/util/Macros.h +./Eigen/src/Core/util/Constants.h +./Eigen/src/Core/CwiseNullaryOp.h +./Eigen/src/Core/Block.h +./Eigen/src/Core/GeneralProduct.h +./Eigen/src/Core/CommaInitializer.h +./Eigen/src/Core/ReturnByValue.h +./Eigen/src/Core/Stride.h +./Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +./Eigen/src/SparseLU/SparseLU_column_dfs.h +./Eigen/src/SparseLU/SparseLU_panel_dfs.h +./Eigen/src/SparseLU/SparseLU_relax_snode.h +./Eigen/src/SparseLU/SparseLU_panel_bmod.h +./Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +./Eigen/src/SparseLU/SparseLU_Utils.h +./Eigen/src/SparseLU/SparseLU_gemm_kernel.h +./Eigen/src/SparseLU/SparseLU_kernel_bmod.h +./Eigen/src/SparseLU/SparseLU_pivotL.h +./Eigen/src/SparseLU/SparseLU_Memory.h +./Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +./Eigen/src/SparseLU/SparseLUImpl.h 
+./Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +./Eigen/src/SparseLU/SparseLU_Structs.h +./Eigen/src/SparseLU/SparseLU.h +./Eigen/src/SparseLU/SparseLU_column_bmod.h +./Eigen/src/SparseLU/SparseLU_pruneL.h +./Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +./Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +./Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +./Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +./Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +./Eigen/src/SparseCholesky/SimplicialCholesky.h +./Eigen/src/Cholesky/LDLT.h +./Eigen/src/Cholesky/LLT.h +./Eigen/src/CholmodSupport/CholmodSupport.h +./Eigen/src/PaStiXSupport/PaStiXSupport.h +./Eigen/src/MetisSupport/MetisSupport.h +./Eigen/StdVector +./Eigen/Core +./Eigen/SparseLU +./Eigen/StdList +./Eigen/StdDeque +./Eigen/SparseCholesky +./scripts/relicense.py +./scripts/relicense.py +./blas/BandTriangularSolver.h +./blas/PackedTriangularMatrixVector.h +./blas/complex_double.cpp +./blas/level2_real_impl.h +./blas/level1_cplx_impl.h +./blas/level1_impl.h +./blas/level1_real_impl.h +./blas/level3_impl.h +./blas/single.cpp +./blas/level2_cplx_impl.h +./blas/PackedSelfadjointProduct.h +./blas/Rank2Update.h +./blas/complex_single.cpp +./blas/PackedTriangularSolverVector.h +./blas/double.cpp +./blas/common.h +./blas/level2_impl.h +./blas/GeneralRank1Update.h + +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. 
"Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. 
+ +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. 
Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. 
Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. + +---------------------------------------------------------------------- +Following applies to: +./doc/UsingIntelMKL.dox +./doc/UsingIntelMKL.dox +./Eigen/src/Eigenvalues/ComplexSchur_MKL.h +./Eigen/src/Eigenvalues/ComplexSchur_MKL.h +./Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h +./Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h +./Eigen/src/Eigenvalues/RealSchur_MKL.h +./Eigen/src/Eigenvalues/RealSchur_MKL.h +./Eigen/src/LU/arch/Inverse_SSE.h +./Eigen/src/LU/arch/Inverse_SSE.h +./Eigen/src/LU/PartialPivLU_MKL.h +./Eigen/src/LU/PartialPivLU_MKL.h +./Eigen/src/QR/HouseholderQR_MKL.h +./Eigen/src/QR/HouseholderQR_MKL.h +./Eigen/src/QR/ColPivHouseholderQR_MKL.h +./Eigen/src/QR/ColPivHouseholderQR_MKL.h +./Eigen/src/SVD/JacobiSVD_MKL.h +./Eigen/src/SVD/JacobiSVD_MKL.h +./Eigen/src/PardisoSupport/PardisoSupport.h +./Eigen/src/PardisoSupport/PardisoSupport.h +./Eigen/src/Core/Assign_MKL.h +./Eigen/src/Core/Assign_MKL.h +./Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h +./Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h +./Eigen/src/Core/products/GeneralMatrixVector_MKL.h +./Eigen/src/Core/products/GeneralMatrixVector_MKL.h +./Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h +./Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h +./Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h +./Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h +./Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h +./Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h +./Eigen/src/Core/products/TriangularMatrixVector_MKL.h +./Eigen/src/Core/products/TriangularMatrixVector_MKL.h +./Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h +./Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h +./Eigen/src/Core/products/TriangularSolverMatrix_MKL.h +./Eigen/src/Core/products/TriangularSolverMatrix_MKL.h +./Eigen/src/Core/util/MKL_support.h +./Eigen/src/Core/util/MKL_support.h +./Eigen/src/Cholesky/LLT_MKL.h +./Eigen/src/Cholesky/LLT_MKL.h + +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. * + Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. * Neither the name of Intel Corporation nor the + names of its contributors may be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +---------------------------------------------------------------------- +Following applies to: + everything under ./bench/btl + + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. 
Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds +of works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, +family, or household purposes, or (2) anything designed or sold for +incorporation into a dwelling. In determining whether a product is a +consumer product, doubtful cases shall be resolved in favor of +coverage. For a particular product received by a particular user, +"normally used" refers to a typical or common use of that class of +product, regardless of the status of the particular user or of the way +in which the particular user actually uses, or expects or is expected +to use, the product. A product is a consumer product regardless of +whether the product has substantial commercial, industrial or +non-consumer uses, unless such uses represent the only significant +mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to +install and execute modified versions of a covered work in that User +Product from a modified version of its Corresponding Source. The +information must suffice to ensure that the continued functioning of +the modified object code is in no case prevented or interfered with +solely because modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include +a requirement to continue to provide support service, warranty, or +updates for a work that has been modified or installed by the +recipient, or for the User Product in which it has been modified or +installed. Access to a network may be denied when the modification +itself materially and adversely affects the operation of the network +or violates the rules and protocols for communication across the +network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material +you add to a covered work, you may (if authorized by the copyright +holders of that material) supplement the terms of this License with +terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions + of it) with contractual assumptions of liability to the recipient, + for any liability that these contractual assumptions directly + impose on those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement +or otherwise) that contradict the conditions of this License, they do +not excuse you from the conditions of this License. If you cannot +convey a covered work so as to satisfy simultaneously your obligations +under this License and any other pertinent obligations, then as a +consequence you may not convey it at all. For example, if you agree +to terms that obligate you to collect a royalty for further conveying +from those to whom you convey the Program, the only way you could +satisfy both those terms and this License would be to refrain entirely +from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT +WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND +PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE +DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR +CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES +AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR +DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL +DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM +(INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED +INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF +THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER +OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these +terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) This program comes + with ABSOLUTELY NO WARRANTY; for details type `show w'. This is + free software, and you are welcome to redistribute it under + certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the +appropriate parts of the General Public License. Of course, your +program's commands might be different; for a GUI interface, you would +use an "about box". + + You should also get your employer (if you work as a programmer) or +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. For more information on this, and how to apply and follow +the GNU GPL, see . + + The GNU General Public License does not permit incorporating your +program into proprietary programs. If your program is a subroutine +library, you may consider it more useful to permit linking proprietary +applications with the library. If this is what you want to do, use +the GNU Lesser General Public License instead of this License. But +first, please read . + + +---------------------------------------------------------------------- +Following applies to: +./test/metis_support.cpp +./test/sparselu.cpp +./unsupported/test/mpreal/mpreal.h +./unsupported/Eigen/src/IterativeSolvers/IterationController.h +./unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h +./unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h +./Eigen/src/OrderingMethods/Amd.h +./Eigen/src/SparseCholesky/SimplicialCholesky_impl.h + + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. 
+ + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the +GNU General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this + license document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this + license document. 
+ + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of + this License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. 
+ + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. + + +---------------------------------------------------------------------- +Following applies to: +./unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h +./unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h +./unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h +./unsupported/Eigen/src/LevenbergMarquardt/LMpar.h +./unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h + +Minpack Copyright Notice (1999) University of Chicago. All rights +reserved + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the +following conditions are met: + +1. Redistributions of source code must retain the above +copyright notice, this list of conditions and the following +disclaimer. + +2. Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials +provided with the distribution. + +3. The end-user documentation included with the +redistribution, if any, must include the following +acknowledgment: + + "This product includes software developed by the + University of Chicago, as Operator of Argonne National + Laboratory. + +Alternately, this acknowledgment may appear in the software +itself, if and wherever such third-party acknowledgments +normally appear. + +4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS" +WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE +UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND +THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE +OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY +OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR +USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF +THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4) +DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION +UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL +BE CORRECTED. + +5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT +HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF +ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT, +INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF +ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF +PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER +SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT +(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE, +EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE +POSSIBILITY OF SUCH LOSS OR DAMAGES. diff --git a/third_party/eigen3/eigen.BUILD b/third_party/eigen3/eigen.BUILD new file mode 100644 index 00000000..0bde8b8a --- /dev/null +++ b/third_party/eigen3/eigen.BUILD @@ -0,0 +1,71 @@ +# Description: +# Eigen is a C++ template library for linear algebra: vectors, +# matrices, and related algorithms. +# This file is mostly stolen from tensorflow. + +licenses([ + # Note: Eigen is an MPL2 library that includes GPL v3 and LGPL v2.1+ code. + # We've taken special care to not reference any restricted code. + "reciprocal", # MPL2 + "notice", # Portions BSD +]) + +exports_files(["COPYING.MPL2"]) + +# License-restricted (i.e. 
not reciprocal or notice) files inside Eigen/...
+EIGEN_RESTRICTED_FILES = [
+    "Eigen/src/OrderingMethods/Amd.h",
+    "Eigen/src/SparseCholesky/**",
+]
+
+# Notable transitive dependencies of restricted files inside Eigen/...
+EIGEN_RESTRICTED_DEPS = [
+    "Eigen/Eigen",
+    "Eigen/IterativeLinearSolvers",
+    "Eigen/MetisSupport",
+    "Eigen/Sparse",
+    "Eigen/SparseCholesky",
+    "Eigen/SparseLU",
+]
+
+# Note: unsupported/Eigen is unsupported and might go away at any time.
+EIGEN_FILES = [
+    "Eigen/**",
+    "unsupported/Eigen/CXX11/**",
+    "unsupported/Eigen/FFT",
+    "unsupported/Eigen/KroneckerProduct",
+    "unsupported/Eigen/src/FFT/**",
+    "unsupported/Eigen/src/KroneckerProduct/**",
+    "unsupported/Eigen/MatrixFunctions",
+    "unsupported/Eigen/SpecialFunctions",
+    "unsupported/Eigen/src/SpecialFunctions/**",
+]
+
+# List of files picked up by glob but actually part of another target.
+EIGEN_EXCLUDE_FILES = [
+    "Eigen/src/Core/arch/AVX/PacketMathGoogleTest.cc",
+]
+
+# Files known to be under MPL2 license.
+EIGEN_MPL2_HEADER_FILES = glob(
+    EIGEN_FILES,
+    exclude = EIGEN_EXCLUDE_FILES +
+              EIGEN_RESTRICTED_FILES +
+              EIGEN_RESTRICTED_DEPS + [
+        # Guarantees any file missed by excludes above will not compile.
+        "Eigen/src/Core/util/NonMPL2.h",
+        "Eigen/**/CMakeLists.txt",
+    ],
+)
+
+cc_library(
+    name = "eigen",
+    hdrs = EIGEN_MPL2_HEADER_FILES,
+    defines = [
+        # This define (mostly) guarantees we don't link any problematic
+        # code. We use it, but we do not rely on it, as evidenced above.
+        "EIGEN_MPL2_ONLY",
+    ],
+    includes = ["."],
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Core b/third_party/eigen3/unsupported/Eigen/CXX11/Core
new file mode 100644
index 00000000..1b369071
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/Core
@@ -0,0 +1,46 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_CORE_MODULE
+#define EIGEN_CXX11_CORE_MODULE
+
+#include <Eigen/Core>
+
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+/** \defgroup CXX11_Core_Module C++11 Core Module
+ *
+ * This module provides common core features for all modules that
+ * explicitly depend on C++11. Currently, this is only the Tensor
+ * module. Note that at this stage, you should not need to include
+ * this module directly.
+ *
+ * It also provides a limited fallback for compilers that don't support
+ * CXX11 yet, such as nvcc.
+ *
+ * \code
+ * #include
+ * \endcode
+ */
+
+// Only a subset of cxx11 is allowed at Google, so we default to emulate the
+// cxx11 functionality that we need.
+#include "src/Core/util/FixedSizeVector.h"
+#if 1
+#include
+#include "src/Core/util/EmulateCXX11Meta.h"
+#else
+#include "src/Core/util/CXX11Workarounds.h"
+#include "src/Core/util/CXX11Meta.h"
+#endif
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+
+#endif // EIGEN_CXX11_CORE_MODULE
+
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint b/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint
new file mode 100644
index 00000000..eb604d38
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint
@@ -0,0 +1,55 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_FIXED_POINT_MODULE
+#define EIGEN_CXX11_FIXED_POINT_MODULE
+
+#include <Eigen/Core>
+#include <stdint.h>
+
+/** \defgroup CXX11_FixedPoint_Module Fixed Point Module
+ *
+ * This module provides common core features for all modules that
+ * explicitly depend on C++11. Currently, this is only the Tensor
+ * module. Note that at this stage, you should not need to include
+ * this module directly.
+ *
+ * It also provides a limited fallback for compilers that don't support
+ * CXX11 yet, such as nvcc.
+ *
+ * \code
+ * #include
+ * \endcode
+ */
+
+#include "src/FixedPoint/FixedPointTypes.h"
+
+// Use optimized implementations whenever available
+#if defined (EIGEN_VECTORIZE_AVX512DQ) || defined (EIGEN_VECTORIZE_AVX512BW)
+#include "src/FixedPoint/PacketMathAVX512.h"
+#include "src/FixedPoint/TypeCastingAVX512.h"
+
+#elif defined EIGEN_VECTORIZE_AVX2
+#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
+#define EIGEN_USE_OPTIMIZED_INT16_INT16_MAT_MAT_PRODUCT
+#include "src/FixedPoint/PacketMathAVX2.h"
+#include "src/FixedPoint/MatMatProductAVX2.h"
+#include "src/FixedPoint/TypeCastingAVX2.h"
+
+#elif defined EIGEN_VECTORIZE_NEON
+#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
+#include "src/FixedPoint/MatMatProductNEON.h"
+#endif
+
+// Use the default implementation when no optimized code is available
+#include "src/FixedPoint/MatMatProduct.h"
+#include "src/FixedPoint/MatVecProduct.h"
+
+
+#endif // EIGEN_CXX11_FIXED_POINT_MODULE
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks b/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks
new file mode 100644
index 00000000..7741b68d
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks
@@ -0,0 +1,35 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_NEURAL_NETWORKS_MODULE
+#define EIGEN_CXX11_NEURAL_NETWORKS_MODULE
+
+#include "unsupported/Eigen/CXX11/Tensor"
+
+/** \defgroup CXX11_NeuralNetworks_Module Neural Networks Module
+ *
+ * This module provides an efficient implementation of the common primitives
+ * used by neural networks.
+ * The primitives are built on top of the tensor library.
+ *
+ * \code
+ * #include
+ * \endcode
+ */
+
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h"
+
+#endif // EIGEN_CXX11_NEURAL_NETWORKS_MODULE
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
new file mode 100644
index 00000000..861a87b6
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -0,0 +1,15 @@
+#include "unsupported/Eigen/CXX11/Tensor"
+
+#ifdef _WIN32
+#ifndef SLEEP_FUNC_HEADER_GUARD
+#define SLEEP_FUNC_HEADER_GUARD
+inline void sleep(unsigned int seconds) { Sleep(1000*seconds); }
+#endif
+
+// On Windows, Eigen will include Windows.h, which defines various
+// macros that conflict with TensorFlow symbols. Undefine them here to
+// prevent clashes.
+#undef DeleteFile
+#undef ERROR
+#undef LoadLibrary
+#endif // _WIN32
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool b/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool
new file mode 100644
index 00000000..d2639af4
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool
@@ -0,0 +1 @@
+#include "unsupported/Eigen/CXX11/ThreadPool"
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
new file mode 100644
index 00000000..6b625abc
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
@@ -0,0 +1,342 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_FIXED_POINT_TYPES_H
+#define EIGEN_CXX11_FIXED_POINT_TYPES_H
+
+#include <cmath>
+#include <iostream>
+
+namespace Eigen {
+
+// The mantissa part of the fixed point representation. See
+// go/tensorfixedpoint for details
+struct QInt8;
+struct QUInt8;
+struct QInt16;
+struct QUInt16;
+struct QInt32;
+
+template <>
+struct NumTraits<QInt8> : GenericNumTraits<QInt8> {};
+template <>
+struct NumTraits<QUInt8> : GenericNumTraits<QUInt8> {};
+template <>
+struct NumTraits<QInt16> : GenericNumTraits<QInt16> {};
+template <>
+struct NumTraits<QUInt16> : GenericNumTraits<QUInt16> {};
+template <>
+struct NumTraits<QInt32> : GenericNumTraits<QInt32> {};
+
+namespace internal {
+template <>
+struct scalar_product_traits {
+  enum {
+    // Cost = NumTraits::MulCost,
+    Defined = 1
+  };
+  typedef QInt32 ReturnType;
+};
+}
+
+// Wrap the 8bit int into a QInt8 struct instead of using a typedef to prevent
+// the compiler from silently type cast the mantissa into a bigger or a smaller
+// representation.
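// ----------------------------------------------------------------------------
// [Editorial note, not part of the upstream patch] In other words: because the
// quantized types below are distinct structs rather than typedefs of int8_t
// and friends, the NumTraits specializations above (and the packet/matmul
// specializations pulled in by the FixedPoint module header) can dispatch on
// the exact quantized width, and arithmetic between quantized values goes
// through the explicitly defined widening operators further down, which return
// QInt32 instead of whatever the built-in integer promotions would pick.
// ----------------------------------------------------------------------------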
+struct QInt8 {
+  QInt8() {}
+  QInt8(const int8_t v) : value(v) {}
+  QInt8(const QInt32 v);
+
+  operator int() const { return static_cast<int>(value); }
+
+  int8_t value;
+};
+
+struct QUInt8 {
+  QUInt8() {}
+  QUInt8(const uint8_t v) : value(v) {}
+  QUInt8(const QInt32 v);
+
+  operator int() const { return static_cast<int>(value); }
+
+  uint8_t value;
+};
+
+struct QInt16 {
+  QInt16() {}
+  QInt16(const int16_t v) : value(v) {}
+  QInt16(const QInt32 v);
+  operator int() const { return static_cast<int>(value); }
+
+  int16_t value;
+};
+
+struct QUInt16 {
+  QUInt16() {}
+  QUInt16(const uint16_t v) : value(v) {}
+  QUInt16(const QInt32 v);
+  operator int() const { return static_cast<int>(value); }
+
+  uint16_t value;
+};
+
+struct QInt32 {
+  QInt32() {}
+  QInt32(const int8_t v) : value(v) {}
+  QInt32(const int32_t v) : value(v) {}
+  QInt32(const uint32_t v) : value(static_cast<int32_t>(v)) {}
+  QInt32(const QInt8 v) : value(v.value) {}
+  QInt32(const float v) : value(static_cast<int32_t>(lrint(v))) {}
+#ifdef EIGEN_MAKING_DOCS
+  // Workaround to fix build on PPC.
+  QInt32(unsigned long v) : value(v) {}
+#endif
+
+  operator float() const { return static_cast<float>(value); }
+
+  int32_t value;
+};
+
+EIGEN_STRONG_INLINE QInt8::QInt8(const QInt32 v)
+    : value(v.value > 127 ? 127 : (v.value < -128 ? -128 : v.value)) {}
+EIGEN_STRONG_INLINE QUInt8::QUInt8(const QInt32 v)
+    : value(v.value > 255 ? 255 : (v.value < 0 ? 0 : v.value)) {}
+EIGEN_STRONG_INLINE QInt16::QInt16(const QInt32 v)
+    : value(v.value > 32767 ? 32767 : (v.value < -32768 ? -32768 : v.value)) {}
+EIGEN_STRONG_INLINE QUInt16::QUInt16(const QInt32 v)
+    : value(v.value > 65535 ? 65535 : (v.value < 0 ? 0 : v.value)) {}
+
+// Basic widening 8-bit operations: This will be vectorized in future CLs.
+EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt8 b) {
+  return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
+}
+EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QUInt8 b) {
+  return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
+}
+EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt8 b) {
+  return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value));
+}
+EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt8 b) {
+  return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value));
+}
+
+// Basic widening 16-bit operations: This will be vectorized in future CLs.
+EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt16 b) {
+  return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
+}
+EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QUInt16 b) {
+  return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
+}
+EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt16 b) {
+  return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value));
+}
+EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt16 b) {
+  return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value));
+}
+
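// ----------------------------------------------------------------------------
// [Editorial sketch, not part of the upstream patch] A hypothetical helper
// showing how the definitions above compose: the widening operator* keeps the
// exact 32-bit product, and the converting QInt8(QInt32) constructor saturates
// when narrowing back down. The name is illustrative only.
EIGEN_STRONG_INLINE QInt8 ExampleRequantize(const QInt8 a, const QInt8 b) {
  QInt32 wide = a * b;  // exact product, e.g. 100 * 50 == 5000
  return QInt8(wide);   // clamped to [-128, 127] by the constructor above
}
// ----------------------------------------------------------------------------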
+EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt8 b) { + return QInt32(a.value + static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt32 b) { + return QInt32(static_cast(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt8 b) { + return QInt32(a.value - static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt32 b) { + return QInt32(static_cast(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt8 b) { + return QInt32(a.value * static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt32 b) { + return QInt32(static_cast(a.value) * b.value); +} + +// Mixed QInt32 op QInt16 operations. This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt16 b) { + return QInt32(a.value + static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt32 b) { + return QInt32(static_cast(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt16 b) { + return QInt32(a.value - static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt32 b) { + return QInt32(static_cast(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt16 b) { + return QInt32(a.value * static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt32 b) { + return QInt32(static_cast(a.value) * b.value); +} + +// Mixed QInt32 op QUInt8 operations. This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt8 b) { + return QInt32(a.value + static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QUInt8 a, const QInt32 b) { + return QInt32(static_cast(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt8 b) { + return QInt32(a.value - static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QUInt8 a, const QInt32 b) { + return QInt32(static_cast(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt8 b) { + return QInt32(a.value * static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QUInt8 a, const QInt32 b) { + return QInt32(static_cast(a.value) * b.value); +} + +// Mixed QInt32 op QUInt16 operations. This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt16 b) { + return QInt32(a.value + static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QUInt16 a, const QInt32 b) { + return QInt32(static_cast(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt16 b) { + return QInt32(a.value - static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QUInt16 a, const QInt32 b) { + return QInt32(static_cast(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt16 b) { + return QInt32(a.value * static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QUInt16 a, const QInt32 b) { + return QInt32(static_cast(a.value) * b.value); +} + +// Basic arithmetic operations on QInt32, which behaves like a int32_t. 
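Those plain 32-bit operators follow below. Going the other way, narrowing an accumulated QInt32 back to 8 bits goes through the clamping constructors defined near the top of this file; a scalar sketch of that behaviour is shown here (the helper name is illustrative, not part of the patch).

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar sketch of the clamping done by QInt8(QInt32) above.
static int8_t saturate_to_int8(int32_t v) {
  return static_cast<int8_t>(std::min(127, std::max(-128, v)));
}

int main() {
  int32_t acc = int32_t(100) * int32_t(100);            // widened product
  std::printf("%d %d\n", acc, saturate_to_int8(acc));   // 10000 127
  std::printf("%d\n", saturate_to_int8(-500));          // -128
  return 0;
}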
+EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt32 b) { + return a.value + b.value; +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt32 b) { + return a.value - b.value; +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt32 b) { + return a.value * b.value; +} +EIGEN_STRONG_INLINE QInt32 operator/(const QInt32 a, const QInt32 b) { + return a.value / b.value; +} +EIGEN_STRONG_INLINE QInt32& operator+=(QInt32& a, const QInt32 b) { + a.value += b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32& operator-=(QInt32& a, const QInt32 b) { + a.value -= b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const QInt32 b) { + a.value *= b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32& operator/=(QInt32& a, const QInt32 b) { + a.value /= b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a) { + return -a.value; +} + +// Scaling QInt32 by double. We do the arithmetic in double because +// float only has 23 bits of mantissa, so casting QInt32 to float might reduce +// accuracy by discarding up to 7 (least significant) bits. +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const double b) { + return static_cast(lrint(static_cast(a.value) * b)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const double a, const QInt32 b) { + return static_cast(lrint(a * static_cast(b.value))); +} +EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const double b) { + a.value = static_cast(lrint(static_cast(a.value) * b)); + return a; +} + +// Comparisons +EIGEN_STRONG_INLINE bool operator==(const QInt8 a, const QInt8 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QUInt8 a, const QUInt8 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QInt16 a, const QInt16 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QUInt16 a, const QUInt16 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QInt32 a, const QInt32 b) { + return a.value == b.value; +} + +EIGEN_STRONG_INLINE bool operator<(const QInt8 a, const QInt8 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QUInt8 a, const QUInt8 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QInt16 a, const QInt16 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QUInt16 a, const QUInt16 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QInt32 a, const QInt32 b) { + return a.value < b.value; +} + +EIGEN_STRONG_INLINE bool operator>(const QInt8 a, const QInt8 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QUInt8 a, const QUInt8 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QInt16 a, const QInt16 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QUInt16 a, const QUInt16 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QInt32 a, const QInt32 b) { + return a.value > b.value; +} + +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt8 a) { + os << static_cast(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QUInt8 a) { + os << static_cast(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt16 a) { + os << static_cast(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QUInt16 a) { + os << static_cast(a.value); + 
return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt32 a) { + os << a.value; + return os; +} + +} // namespace Eigen + +#endif // EIGEN_CXX11_FIXED_POINT_TYPES_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h new file mode 100644 index 00000000..4d0dca07 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h @@ -0,0 +1,255 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H +#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H + + +namespace Eigen { +namespace internal { + +// Accumulate the product of 2 QInt8 inputs on 32 bits to prevent +// overflows +template<> struct scalar_product_traits +{ + enum { + Defined = 1 + }; + typedef QInt32 ReturnType; +}; + +// Accumulate the product of QInt8 inputs with QUint8 inputs on 32 bits +// to prevent overflows +template<> struct scalar_product_traits +{ + enum { + Defined = 1 + }; + typedef QInt32 ReturnType; +}; + +// Description of the product implementation. It's pretty simple now since +// nothing is vectorized yet. +// This definition tackle the case where both lhs and rhs are encoded using +// signed 8bit integers +#ifndef EIGEN_USE_OPTIMIZED_INT8_INT8_MAT_MAT_PRODUCT + +template +class gebp_traits +{ +public: + typedef QInt8 LhsScalar; + typedef QInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + +// The signed 8bit Mat-Mat product itself. +template +struct gebp_kernel +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +EIGEN_DONT_INLINE +void gebp_kernel +::operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + + +// This definition tackle the case where the lhs is encoded using signed 8bit +// integers and the rhs using unsigned 8bit integers. 
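That mixed signed/unsigned specialization follows the same triple loop as the signed/signed kernel above. Stripped of the Eigen traits and mappers, the loop over packed blocks amounts to the scalar sketch below; function and parameter names are illustrative, and it assumes blockA packs each lhs row contiguously along depth, blockB packs each rhs column contiguously along depth, and the result is column-major.

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar sketch (illustration only) of the reference kernel above:
// every 8-bit product is widened to int32_t before accumulation.
void ref_gebp(int32_t* res, int res_stride,
              const int8_t* blockA, const int8_t* blockB,
              int rows, int depth, int cols) {
  for (int j = 0; j < cols; ++j) {
    const int startB = j * depth;
    for (int i = 0; i < rows; ++i) {
      const int startA = i * depth;
      int32_t acc = 0;
      for (int k = 0; k < depth; ++k) {
        acc += static_cast<int32_t>(blockA[startA + k]) * blockB[startB + k];
      }
      res[j * res_stride + i] += acc;
    }
  }
}

int main() {
  const int rows = 2, depth = 3, cols = 2;
  const int8_t blockA[rows * depth] = {1, 2, 3, 4, 5, 6};   // rows x depth
  const int8_t blockB[cols * depth] = {1, 0, -1, 2, 2, 2};  // cols x depth
  std::vector<int32_t> res(rows * cols, 0);
  ref_gebp(res.data(), rows, blockA, blockB, rows, depth, cols);
  // res(0,0) = 1*1 + 2*0 + 3*(-1) = -2
  std::printf("%d %d %d %d\n", res[0], res[1], res[2], res[3]);
  return 0;
}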
+#ifndef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT +template +class gebp_traits +{ +public: + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + +// Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs +template +struct gebp_kernel +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +EIGEN_DONT_INLINE +void gebp_kernel +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + + +// This definition tackle the case where the khs is encoded using unsigned 8bit +// integers and the rhs using signed 8bit integers. 
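All three scalar kernels rely on scalar_product_traits specializations to force the accumulator type to QInt32. The standalone sketch below shows the same trait pattern with hypothetical names (product_traits, dot); it is an illustration of the mechanism, not the Eigen definitions themselves.

#include <cstdint>

// Map a pair of narrow scalar types to a wide accumulator type.
template <typename Lhs, typename Rhs>
struct product_traits;   // primary template intentionally undefined

template <>
struct product_traits<int8_t, int8_t>  { typedef int32_t ReturnType; };
template <>
struct product_traits<int8_t, uint8_t> { typedef int32_t ReturnType; };
template <>
struct product_traits<uint8_t, int8_t> { typedef int32_t ReturnType; };

template <typename Lhs, typename Rhs>
typename product_traits<Lhs, Rhs>::ReturnType
dot(const Lhs* a, const Rhs* b, int n) {
  typename product_traits<Lhs, Rhs>::ReturnType acc = 0;
  for (int i = 0; i < n; ++i) acc += static_cast<int32_t>(a[i]) * b[i];
  return acc;  // accumulated in 32 bits, as the kernels require
}

int main() {
  const int8_t a[3] = {1, -2, 3};
  const uint8_t b[3] = {4, 5, 200};
  return dot(a, b, 3) == (1 * 4 - 2 * 5 + 3 * 200) ? 0 : 1;
}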
+#ifndef EIGEN_USE_OPTIMIZED_UINT8_INT8_MAT_MAT_PRODUCT +template +class gebp_traits +{ +public: + typedef QUInt8 LhsScalar; + typedef QInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + + +// Mat-Mat product of an unsigned 8bit lhs with a signed 8bit rhs +template +struct gebp_kernel +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +EIGEN_DONT_INLINE +void gebp_kernel +::operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + +} // namespace internal +} // namespace Eigen + + + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h new file mode 100644 index 00000000..6b4b0edc --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h @@ -0,0 +1,1754 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// Copyright (C) 2015 Matthew Sarett +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H +#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H + +namespace Eigen { +namespace internal { + +// AVX2 optimized implementation of Mat-Mat product. +// LHS is encoded using signed 8-bit integers. +// RHS is encoded using unsigned 8-bit integers. +#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT + +// Define quantized traits +template +class gebp_traits +{ +public: + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // Define register blocking scheme. + nr = 32, + mr = 32, + kr = 8, + // Ignore progress tracking per loop iteration. + LhsProgress = -1, + RhsProgress = -1 + }; +}; + +// Specialized blocking for quantized implementations. +// Used by TensorContractionThreadPool, inputs must have dimensions that are +// multiples of 32. 
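The blocking class that follows rounds every dimension up to the next multiple of 32 and, when sharding by column, divides the columns across threads before rounding. The arithmetic in isolation looks like the sketch below; the helper names are illustrative only.

#include <cassert>

// Round x up to the next multiple of 32, as used for kc_/mc_/nc_ below.
inline long round_up_32(long x) { return ((x + 31) / 32) * 32; }

// Columns handed to each thread in the ShardByCol case:
// (((n / num_threads) + 31) / 32) * 32
inline long cols_per_thread(long n, long num_threads) {
  return round_up_32(n / num_threads);
}

int main() {
  assert(round_up_32(1) == 32);
  assert(round_up_32(32) == 32);
  assert(round_up_32(33) == 64);
  assert(cols_per_thread(256, 3) == 96);   // 256/3 = 85, rounded up to 96
  return 0;
}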
+template +class TensorContractionBlocking, TensorContractionInputMapper, Index, ShardingType> { + public: + + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + + TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + kc_(k), mc_(m), nc_(n) + { + eigen_assert(m % 32 == 0); + eigen_assert(k % 32 == 0); + if (!k || !m || !n) { + return; + } + + if (ShardingType == ShardByCol) { + eigen_assert(n % 32 == 0); + nc_ = (((n / num_threads) + 31) / 32) * 32; + } + else { + eigen_assert(n % 32 == 0 || n == 1); + // Special case to avoid breaking the unimplemented matrix-vector case + if (n == 1) { + nc_ = 32; + } + mc_ = (((m / num_threads) + 31) / 32) * 32; + } + } + + EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } + EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } + EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } + + private: + Index kc_; + Index mc_; + Index nc_; +}; + +// Specialized blocking for quantized implementations. +// Used by TensorContraction and GeneralMatrixMatrix, inputs are padded to +// multiples of 32. +template +class gemm_blocking_space + : public level3_blocking { + DenseIndex m_sizeA; + DenseIndex m_sizeB; + + public: + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, + DenseIndex /*num_threads*/, bool /*l3_blocking*/) { + this->m_mc = ((rows + 31) / 32) * 32; + this->m_nc = ((cols + 31) / 32) * 32; + this->m_kc = ((depth + 31) / 32) * 32; + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } + void allocateA() { + if (this->m_blockA == 0) this->m_blockA = aligned_new(m_sizeA); + } + void allocateB() { + if (this->m_blockB == 0) this->m_blockB = aligned_new(m_sizeB); + } + void allocateAll() { + allocateA(); + allocateB(); + } + ~gemm_blocking_space() { + aligned_delete(this->m_blockA, m_sizeA); + aligned_delete(this->m_blockB, m_sizeB); + } +}; + + +template +class gemm_blocking_space + : public level3_blocking { + DenseIndex m_sizeA; + DenseIndex m_sizeB; + + public: + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, + DenseIndex /*num_threads*/, bool /*l3_blocking*/) { + this->m_mc = ((rows + 31) / 32) * 32; + this->m_nc = ((cols + 31) / 32) * 32; + this->m_kc = ((depth + 31) / 32) * 32; + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } + void allocateA() { + if (this->m_blockA == 0) this->m_blockA = aligned_new(m_sizeA); + } + void allocateB() { + if (this->m_blockB == 0) this->m_blockB = aligned_new(m_sizeB); + } + void allocateAll() { + allocateA(); + allocateB(); + } + ~gemm_blocking_space() { + aligned_delete(this->m_blockA, m_sizeA); + aligned_delete(this->m_blockB, m_sizeB); + } +}; + +// Alternate templates for any input sizes +template +struct gemm_pack_lhs_any; +template +struct gemm_pack_lhs_any { + EIGEN_DONT_INLINE void operator() + (QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0); +}; + +template +struct gemm_pack_rhs_any; +template +struct gemm_pack_rhs_any { + EIGEN_DONT_INLINE void operator() + (QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0); +}; + +template +struct gebp_kernel_any; +template +struct gebp_kernel_any +{ + typedef typename DataMapper::LinearMapper LinearMapper; + + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +// 
Alternate implementations for any input sizes +template +EIGEN_DONT_INLINE void gemm_pack_lhs_any:: +operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Get vector pointer + __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA); + + // Get even multiples of the dimensions + Index rows_32 = (rows / 32) * 32; + Index depth_8 = (depth / 8) * 8; + + // Get padding for when depth is not a multiple of 32 + int padding = 0; + if (depth % 32 != 0) { + int depth_32 = (depth / 32) * 32; + int extra_depth = depth - depth_32; + int extra_depth_8 = ((extra_depth + 7) / 8) * 8; + padding = 32 - extra_depth_8; + } + + // Pack rows in sets of 32 + for (Index m = 0; m < rows_32; m += 32) { + // Pack depth in sets of 8 + for (Index k = 0; k < depth_8; k += 8) { + // Load vectors + __m256i L_A = lhs.loadPacket(m, k); + __m256i L_B = lhs.loadPacket(m, k + 1); + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + + __m256i L_C = lhs.loadPacket(m, k + 2); + __m256i L_D = lhs.loadPacket(m, k + 3); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing for 32 x 8 block + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_E = lhs.loadPacket(m, k + 4); + __m256i L_F = lhs.loadPacket(m, k + 5); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_G = lhs.loadPacket(m, k + 6); + __m256i L_H = lhs.loadPacket(m, k + 7); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + + // Finish the k dimension, padding with zeros + if (depth_8 < depth) { + __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H; + switch (depth - depth_8) { + case 1: + L_A = lhs.loadPacket(m, depth_8); 
+ L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 2: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 3: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 4: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 5: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = lhs.loadPacket(m, depth_8 + 4); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 6: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = lhs.loadPacket(m, depth_8 + 4); + L_F = lhs.loadPacket(m, depth_8 + 5); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 7: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = lhs.loadPacket(m, depth_8 + 4); + L_F = lhs.loadPacket(m, depth_8 + 5); + L_G = lhs.loadPacket(m, depth_8 + 6); + L_H = _mm256_setzero_si256(); + break; + } + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = 
_mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + blockA_256 += padding; + } + + // Finish the m dimension, padding with zeros + if (rows_32 < rows) { + // Pack depth in sets of 8 + for (Index k = 0; k < depth_8; k += 8) { + // Load vectors + __m256i L_A = _mm256_setzero_si256(); + __m256i L_B = _mm256_setzero_si256(); + __m256i L_C = _mm256_setzero_si256(); + __m256i L_D = _mm256_setzero_si256(); + __m256i L_E = _mm256_setzero_si256(); + __m256i L_F = _mm256_setzero_si256(); + __m256i L_G = _mm256_setzero_si256(); + __m256i L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + QInt8* ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, k); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, k + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, k + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, k + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, k + 4); + ptr = (QInt8*) &L_F; + ptr[m] = lhs(rows_32 + m, k + 5); + ptr = (QInt8*) &L_G; + ptr[m] = lhs(rows_32 + m, k + 6); + ptr = (QInt8*) &L_H; + ptr[m] = lhs(rows_32 + m, k + 7); + } + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing for 32 x 8 block + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + 
__m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + + // Finish the k dimension, padding with zeros + if (depth_8 < depth) { + __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H; + QInt8* ptr; + switch (depth - depth_8) { + case 1: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + QInt8* ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + } + break; + case 2: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + } + break; + case 3: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + } + break; + case 4: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + } + break; + case 5: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + } + break; + case 6: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + 
ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + ptr = (QInt8*) &L_F; + ptr[m] = lhs(rows_32 + m, depth_8 + 5); + } + break; + case 7: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + ptr = (QInt8*) &L_F; + ptr[m] = lhs(rows_32 + m, depth_8 + 5); + ptr = (QInt8*) &L_G; + ptr[m] = lhs(rows_32 + m, depth_8 + 6); + } + break; + } + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + } +} + +template +EIGEN_DONT_INLINE void gemm_pack_rhs_any:: +operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Get vector pointer + __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB); + + // Get even multiples of the dimensions + Index cols_32 = (cols / 32) * 32; + Index depth_32 = (depth / 32) * 32; + + // Perform a step of 
the packing for 4 columns + __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24; +#define PACK_STEP \ + R_AB_L = _mm256_unpacklo_epi64(R_A, R_B); \ + R_CD_L = _mm256_unpacklo_epi64(R_C, R_D); \ + R_AB_H = _mm256_unpackhi_epi64(R_A, R_B); \ + R_CD_H = _mm256_unpackhi_epi64(R_C, R_D); \ + R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20); \ + R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \ + R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20); \ + R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \ + _mm256_store_si256(blockB_256, R_AD_0); \ + _mm256_store_si256(blockB_256 + 8, R_AD_8); \ + _mm256_store_si256(blockB_256 + 16, R_AD_16); \ + _mm256_store_si256(blockB_256 + 24, R_AD_24); \ + blockB_256++; + + // Pack cols in sets of 32 + for (Index n = 0; n < cols_32; n += 32) { + // Pack depth in sets of 32 + for (Index k = 0; k < depth_32; k += 32) { + __m256i R_A = rhs.loadPacket(k, n); + __m256i R_B = rhs.loadPacket(k, n + 1); + __m256i R_C = rhs.loadPacket(k, n + 2); + __m256i R_D = rhs.loadPacket(k, n + 3); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 4); + R_B = rhs.loadPacket(k, n + 5); + R_C = rhs.loadPacket(k, n + 6); + R_D = rhs.loadPacket(k, n + 7); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 8); + R_B = rhs.loadPacket(k, n + 9); + R_C = rhs.loadPacket(k, n + 10); + R_D = rhs.loadPacket(k, n + 11); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 12); + R_B = rhs.loadPacket(k, n + 13); + R_C = rhs.loadPacket(k, n + 14); + R_D = rhs.loadPacket(k, n + 15); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 16); + R_B = rhs.loadPacket(k, n + 17); + R_C = rhs.loadPacket(k, n + 18); + R_D = rhs.loadPacket(k, n + 19); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 20); + R_B = rhs.loadPacket(k, n + 21); + R_C = rhs.loadPacket(k, n + 22); + R_D = rhs.loadPacket(k, n + 23); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 24); + R_B = rhs.loadPacket(k, n + 25); + R_C = rhs.loadPacket(k, n + 26); + R_D = rhs.loadPacket(k, n + 27); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 28); + R_B = rhs.loadPacket(k, n + 29); + R_C = rhs.loadPacket(k, n + 30); + R_D = rhs.loadPacket(k, n + 31); + PACK_STEP; + + blockB_256 += 24; + } + + if (depth_32 < depth) { + QUInt8* ptr; + __m256i R_A = _mm256_setzero_si256(); + __m256i R_B = _mm256_setzero_si256(); + __m256i R_C = _mm256_setzero_si256(); + __m256i R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 2); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 3); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 4); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 5); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 6); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 7); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 8); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 9); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 10); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 
11); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 12); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 13); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 14); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 15); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 16); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 17); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 18); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 19); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 20); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 21); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 22); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 23); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 24); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 25); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 26); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 27); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 28); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 29); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 30); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 31); + } + PACK_STEP; + blockB_256 += 24; + } + } + + // Finish packing cols + if (cols_32 < cols) { + // Pack depth in sets of 32 + for (Index k = 0; k < depth_32; k += 32) { + __m256i R_A, R_B, R_C, R_D; + Index n; + for (n = cols_32; n < cols; n += 4) { + switch (cols - n) { + case 1: + R_A = rhs.loadPacket(k, n); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + case 2: + R_A = rhs.loadPacket(k, n); + R_B = rhs.loadPacket(k, n + 1); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + case 3: + R_A = rhs.loadPacket(k, n); + R_B = rhs.loadPacket(k, n + 1); + R_C = rhs.loadPacket(k, n + 2); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + default: + R_A = rhs.loadPacket(k, n); + R_B = rhs.loadPacket(k, n + 1); + R_C = rhs.loadPacket(k, n + 2); + R_D = rhs.loadPacket(k, n + 3); + PACK_STEP; + break; + } + } + + // Increment the block pointer. + // We must pad if cols is not a multiple of 32. 
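The increment on the line that follows keeps each 32-depth by 32-column tile of the packed rhs at a fixed stride of 32 vectors: after s PACK_STEPs for a ragged column tail, the pointer advances by 32 - s so the next tile starts exactly one full tile later. A small standalone check of that accounting (illustration only, assuming the +0/+8/+16/+24 store offsets of PACK_STEP):

#include <cassert>

int main() {
  for (int cols = 33; cols < 64; ++cols) {
    int cols_32 = (cols / 32) * 32;
    int steps = (cols - cols_32 + 3) / 4;   // PACK_STEPs used for the tail
    int highest_slot = (steps - 1) + 24;    // last store lands at +0/+8/+16/+24
    assert(highest_slot < 32);              // stays inside the 32-vector tile
    assert(steps + (32 - steps) == 32);     // total advance is one full tile
  }
  return 0;
}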
+ blockB_256 += 32 - (n - cols_32) / 4; + } + + if (depth_32 < depth) { + for (Index n = cols_32; n < cols; n += 4) { + QUInt8* ptr; + __m256i R_A = _mm256_setzero_si256(); + __m256i R_B = _mm256_setzero_si256(); + __m256i R_C = _mm256_setzero_si256(); + __m256i R_D = _mm256_setzero_si256(); + switch (cols - n) { + case 1: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + } + PACK_STEP; + break; + case 2: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + } + PACK_STEP; + break; + case 3: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 2); + } + PACK_STEP; + break; + default: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 2); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 3); + } + PACK_STEP; + break; + } + } + } + } +#undef PACK_STEP +} + +template +EIGEN_DONT_INLINE +void gebp_kernel_any +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + Index rows_32 = ((rows + 31) / 32) * 32; + Index cols_32 = ((cols + 31) / 32) * 32; + Index depth_32 = ((depth + 31) / 32) * 32; + + // Create result block + ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0); + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + + // Get vectorized pointers + __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO); + const __m256i* blockA_256 = reinterpret_cast(blockA); + const __m256i* blockB_256 = reinterpret_cast(blockB); + + // Loop over blocks of 32 columns + for (Index n = 0; n < cols_32; n += 32) { + // Reset index into blockA + Index indexL = 0; + // Loop over blocks of 32 rows + for (Index m = 0; m < rows_32; m += 32) { + // Reset index into blockB + Index indexR = n / 32 * depth_32; + // Loop over blocks of 8 on depth + for (Index k = 0; k < depth_32; k += 8) { + // Load inputs + __m256i L_AD0 = blockA_256[indexL++]; + __m256i L_AD8 = blockA_256[indexL++]; + __m256i L_AD16 = blockA_256[indexL++]; + __m256i L_AD24 = blockA_256[indexL++]; + __m256i L_EH0 = blockA_256[indexL++]; + __m256i L_EH8 = blockA_256[indexL++]; + __m256i L_EH16 = blockA_256[indexL++]; + __m256i L_EH24 = blockA_256[indexL++]; + __m256i R_AH0 = blockB_256[indexR++]; + __m256i R_AH4 = blockB_256[indexR++]; + __m256i R_AH8 = blockB_256[indexR++]; + __m256i R_AH12 = blockB_256[indexR++]; + __m256i R_AH16 = blockB_256[indexR++]; + __m256i R_AH20 = blockB_256[indexR++]; + __m256i R_AH24 = blockB_256[indexR++]; + __m256i R_AH28 = blockB_256[indexR++]; + + // This constant is used with madd to convert 16 bit to 32 bit + const __m256i ONE = 
_mm256_set1_epi32(0x00010001); + + // Declare variables used in COMPUTE_STEP + __m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32; + +#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET) \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 1, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 2, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 3, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32)); + + // Permute and shuffle to copy a single value across the entire vector + // Then compute the multiplication + __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00); + __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 0); + __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 1); + R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11); + __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 2); + __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 3); + + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 4); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 5); + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 6); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 7); + + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 8); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 9); + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = 
_mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 10); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 11); + + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 12); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 13); + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 14); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 15); + + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 16); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 17); + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 18); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 19); + + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 20); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 21); + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 22); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 23); + + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 24); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 25); + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 26); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 27); + + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 28); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 29); + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 30); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 31); + +#undef COMPUTE_STEP + } + + // Transfer the results to the result matrix. 
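Before the write-back below, it is worth spelling out what one maddubs/madd pair inside COMPUTE_STEP produces: _mm256_maddubs_epi16 multiplies unsigned rhs bytes with signed lhs bytes and adds adjacent pairs with 16-bit saturation, and _mm256_madd_epi16 with the 0x00010001 constant then sums adjacent 16-bit results into 32 bits. The scalar model below reproduces one 32-bit lane of that sequence (illustration only, names hypothetical).

#include <algorithm>
#include <cstdint>
#include <cstdio>

// One 32-bit lane of a maddubs/madd pair:
//   t0 = sat16(u0*s0 + u1*s1), t1 = sat16(u2*s2 + u3*s3), lane = t0 + t1
static int16_t sat16(int32_t v) {
  return static_cast<int16_t>(std::min(32767, std::max(-32768, v)));
}

int32_t maddubs_madd_lane(const uint8_t u[4], const int8_t s[4]) {
  int16_t t0 = sat16(int32_t(u[0]) * s[0] + int32_t(u[1]) * s[1]);
  int16_t t1 = sat16(int32_t(u[2]) * s[2] + int32_t(u[3]) * s[3]);
  return int32_t(t0) + int32_t(t1);
}

int main() {
  const uint8_t u[4] = {1, 2, 3, 4};        // packed rhs bytes (unsigned)
  const int8_t s[4] = {10, -20, 30, -40};   // packed lhs bytes (signed)
  // 1*10 + 2*(-20) + 3*30 + 4*(-40) = -100
  std::printf("%d\n", maddubs_madd_lane(u, s));
  return 0;
}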
+ if (m + 32 <= rows && n + 32 <= cols) { + Index i = 0; + for (Index j = n; j < n + 32; j++) { + LinearMapper r0 = res.getLinearMapper(m, j); + LinearMapper r1 = res.getLinearMapper(m + 8, j); + LinearMapper r2 = res.getLinearMapper(m + 16, j); + LinearMapper r3 = res.getLinearMapper(m + 24, j); + r0.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0))); + r1.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0))); + r2.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0))); + r3.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0))); + } + } + else { + for (Index j = n; j < cols; j++) { + for (Index i = m; i < rows; i++) { + res(i, j) = blockO[(j - n) * 32 + (i - m)]; + } + } + } + + // Zero the result block so it can be reused + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + } + } +} + +// Below are the fully optimized versions that are correct only for sizes that +// are multiple of 32. It is about a 10% performance benefit to keep these +// implementations separate. + +// Arrange a block of the left input matrix in contiguous memory. +// +// Given column major input (A0 beside A1 in memory): +// A0 B0 C0 D0 E0 F0 G0 H0 ... +// A1 B1 C1 D1 E1 F1 G1 H1 ... +// A2 B2 C2 D2 E2 F2 G2 H2 ... +// A3 B3 C3 D3 E3 F3 G3 H3 ... +// A4 B4 C4 D4 E4 F4 G4 H4 ... +// A5 B5 C5 D5 E5 F5 G5 H5 ... +// A6 B6 C6 D6 E6 F6 G6 H6 ... +// A7 B7 C7 D7 E7 F7 G7 H7 ... +// A8 ... +// ... +// +// Packing yields output (A0 beside B0 in memory): +// A0 B0 C0 D0 +// A1 B1 C1 D1 +// A2 B2 C2 D2 +// A3 B3 C3 D3 +// A4 B4 C4 D4 +// A5 B5 C5 D5 +// A6 B6 C6 D6 +// A7 B7 C7 D7 +// ... +// A31 B31 C31 D31 +// E0 F0 G0 H0 +// E1 F1 G1 H1 +// E2 F2 G2 H2 +// E3 F3 G3 H3 +// E4 F4 G4 H4 +// E5 F5 G5 H5 +// E6 F6 G6 H6 +// E7 F7 G7 H7 +// ... +// +// Four elements of the same row are arranged contiguously because maddubs and +// madd both perform an adjacent addition in the kernel. 
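The gemm_pack_lhs specialization that follows produces exactly this layout with AVX2 unpack/permute sequences. As a plain scalar reference of the documented ordering, the sketch below packs one 32-row by 8-depth block; it assumes a column-major lhs addressed through a raw pointer with leading dimension lhs_rows, and the function and parameter names are illustrative only.

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar reference (illustration only) of the documented 32x8 lhs block
// layout: rows 0..31 of depth columns A..D first, four adjacent depth
// entries per row, then rows 0..31 of depth columns E..H.
void pack_lhs_block_reference(int8_t* out, const int8_t* lhs, int lhs_rows,
                              int m0, int k0) {
  int idx = 0;
  for (int half = 0; half < 2; ++half) {      // columns A..D, then E..H
    for (int r = 0; r < 32; ++r) {            // the 32 rows of the block
      for (int c = 0; c < 4; ++c) {           // 4 adjacent depth entries
        out[idx++] = lhs[(k0 + half * 4 + c) * lhs_rows + (m0 + r)];
      }
    }
  }
}

int main() {
  const int rows = 32, depth = 8;
  std::vector<int8_t> lhs(rows * depth);
  for (int i = 0; i < rows * depth; ++i) lhs[i] = static_cast<int8_t>(i);
  std::vector<int8_t> packed(32 * 8);
  pack_lhs_block_reference(packed.data(), lhs.data(), rows, 0, 0);
  // The first packed row holds A0 B0 C0 D0, i.e. lhs(0, 0..3).
  std::printf("%d %d %d %d\n", packed[0], packed[1], packed[2], packed[3]);
  return 0;
}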
+template +struct gemm_pack_lhs { + EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs, + Index depth, Index rows, Index stride = 0, + Index offset = 0); +}; + +template +EIGEN_DONT_INLINE void gemm_pack_lhs:: +operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, + Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Use alternate function for weird sizes + if (rows % 32 != 0 || depth % 32 != 0) { + gemm_pack_lhs_any lhs_pack; + return lhs_pack(blockA, lhs, depth, rows, stride, offset); + } + + // Get vector pointer + __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA); + + // Pack rows in sets of 32 + for (Index m = 0; m < rows; m += 32) { + // Pack depth in sets of 8 + for (Index k = 0; k < depth; k += 8) { + // Load vectors + __m256i L_A = lhs.loadPacket(m, k); + __m256i L_B = lhs.loadPacket(m, k + 1); + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + + __m256i L_C = lhs.loadPacket(m, k + 2); + __m256i L_D = lhs.loadPacket(m, k + 3); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing for 32 x 8 block + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_E = lhs.loadPacket(m, k + 4); + __m256i L_F = lhs.loadPacket(m, k + 5); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_G = lhs.loadPacket(m, k + 6); + __m256i L_H = lhs.loadPacket(m, k + 7); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + } +} + +// Arrange a block of the right input matrix in contiguous memory. +// +// Given column major input (A0 beside A1 in memory): +// A0 B0 C0 D0 E0 F0 G0 H0 ... +// A1 B1 C1 D1 E1 F1 G1 H1 ... +// A2 B2 C2 D2 E2 F2 G2 H2 ... +// A3 B3 C3 D3 E3 F3 G3 H3 ... 
+// A4 B4 C4 D4 E4 F4 G4 H4 ... +// A5 B5 C5 D5 E5 F5 G5 H5 ... +// A6 B6 C6 D6 E6 F6 G6 H6 ... +// A7 B7 C7 D7 E7 F7 G7 H7 ... +// A8 ... +// ... +// +// Packing yields row major output (A0 beside A1 in memory): +// A0 A1 A2 A3 A4 A5 A6 A7 +// B0 B1 B2 B3 B4 B5 B6 B7 +// ... +// +// At least four elements of the same col are arranged contiguously because +// maddubs and madd both perform an adjacent addition in the kernel. We can +// save work by leaving 8 adjacent elements because kr = 8. +template +struct gemm_pack_rhs { + EIGEN_DONT_INLINE void operator()(QUInt8* blockB, const DataMapper& rhs, + Index depth, Index cols, Index stride = 0, + Index offset = 0); +}; + +template +EIGEN_DONT_INLINE void gemm_pack_rhs:: +operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, + Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Use alternate function for weird sizes + if (cols % 32 != 0 || depth % 32 != 0) { + gemm_pack_rhs_any rhs_pack; + return rhs_pack(blockB, rhs, depth, cols, stride, offset); + } + + // Get vector pointer + __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB); + + // Perform a step of the packing for 4 columns + __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24; +#define PACK_STEP \ + R_AB_L = _mm256_unpacklo_epi64(R_A, R_B); \ + R_CD_L = _mm256_unpacklo_epi64(R_C, R_D); \ + R_AB_H = _mm256_unpackhi_epi64(R_A, R_B); \ + R_CD_H = _mm256_unpackhi_epi64(R_C, R_D); \ + R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20); \ + R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \ + R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20); \ + R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \ + _mm256_store_si256(blockB_256, R_AD_0); \ + _mm256_store_si256(blockB_256 + 8, R_AD_8); \ + _mm256_store_si256(blockB_256 + 16, R_AD_16); \ + _mm256_store_si256(blockB_256 + 24, R_AD_24); \ + blockB_256++; + + // Pack cols in sets of 32 + for (Index n = 0; n < cols; n += 32) { + // Pack depth in sets of 32 + for (Index k = 0; k < depth; k += 32) { + __m256i R_A = rhs.loadPacket(k, n); + __m256i R_B = rhs.loadPacket(k, n + 1); + __m256i R_C = rhs.loadPacket(k, n + 2); + __m256i R_D = rhs.loadPacket(k, n + 3); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 4); + R_B = rhs.loadPacket(k, n + 5); + R_C = rhs.loadPacket(k, n + 6); + R_D = rhs.loadPacket(k, n + 7); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 8); + R_B = rhs.loadPacket(k, n + 9); + R_C = rhs.loadPacket(k, n + 10); + R_D = rhs.loadPacket(k, n + 11); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 12); + R_B = rhs.loadPacket(k, n + 13); + R_C = rhs.loadPacket(k, n + 14); + R_D = rhs.loadPacket(k, n + 15); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 16); + R_B = rhs.loadPacket(k, n + 17); + R_C = rhs.loadPacket(k, n + 18); + R_D = rhs.loadPacket(k, n + 19); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 20); + R_B = rhs.loadPacket(k, n + 21); + R_C = rhs.loadPacket(k, n + 22); + R_D = rhs.loadPacket(k, n + 23); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 24); + R_B = rhs.loadPacket(k, n + 25); + R_C = rhs.loadPacket(k, n + 26); + R_D = rhs.loadPacket(k, n + 27); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 28); + R_B = rhs.loadPacket(k, n + 29); + R_C = rhs.loadPacket(k, n + 30); + R_D = rhs.loadPacket(k, n + 31); + PACK_STEP; + + blockB_256 += 24; + } + } +#undef PACK_STEP +} + +// Perform the actual multiplication on packed inputs +template +struct gebp_kernel +{ + typedef typename DataMapper::LinearMapper LinearMapper; + 
+ EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +EIGEN_DONT_INLINE +void gebp_kernel +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + // Use alternate function for weird sizes + if (rows % 32 != 0 || cols % 32 != 0 || depth % 32 != 0) { + gebp_kernel_any gebp; + return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + + // Create result block + QInt32* blockO = aligned_new(32 * 32); + // Allocating the result block is about 5-10% faster than declaring stack + // space. It is unclear why this is the case. + // ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0); + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + + // Get vectorized pointers + __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO); + const __m256i* blockA_256 = reinterpret_cast(blockA); + const __m256i* blockB_256 = reinterpret_cast(blockB); + + // Loop over blocks of 32 columns + for (Index n = 0; n < cols; n += 32) { + // Reset index into blockA + Index indexL = 0; + // Loop over blocks of 32 rows + for (Index m = 0; m < rows; m += 32) { + // Reset index into blockB + Index indexR = n / 32 * depth; + // Loop over blocks of 8 on depth + for (Index k = 0; k < depth; k += 8) { + // Load inputs + __m256i L_AD0 = blockA_256[indexL++]; + __m256i L_AD8 = blockA_256[indexL++]; + __m256i L_AD16 = blockA_256[indexL++]; + __m256i L_AD24 = blockA_256[indexL++]; + __m256i L_EH0 = blockA_256[indexL++]; + __m256i L_EH8 = blockA_256[indexL++]; + __m256i L_EH16 = blockA_256[indexL++]; + __m256i L_EH24 = blockA_256[indexL++]; + __m256i R_AH0 = blockB_256[indexR++]; + __m256i R_AH4 = blockB_256[indexR++]; + __m256i R_AH8 = blockB_256[indexR++]; + __m256i R_AH12 = blockB_256[indexR++]; + __m256i R_AH16 = blockB_256[indexR++]; + __m256i R_AH20 = blockB_256[indexR++]; + __m256i R_AH24 = blockB_256[indexR++]; + __m256i R_AH28 = blockB_256[indexR++]; + + // This constant is used with madd to convert 16 bit to 32 bit + const __m256i ONE = _mm256_set1_epi32(0x00010001); + + // Declare variables used in COMPUTE_STEP + __m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32; + +#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET) \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + 
_mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 1, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 2, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 3, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32)); + + // Permute and shuffle to copy a single value across the entire vector + // Then compute the multiplication + __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00); + __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 0); + __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 1); + R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11); + __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 2); + __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 3); + + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 4); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 5); + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 6); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 7); + + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 8); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 9); + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 10); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 11); + + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 12); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 13); + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 14); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 
15); + + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 16); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 17); + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 18); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 19); + + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 20); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 21); + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 22); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 23); + + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 24); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 25); + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 26); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 27); + + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 28); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 29); + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 30); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 31); + +#undef COMPUTE_STEP + } + + // Transfer the results to the result matrix + Index i = 0; + for (Index j = n; j < n + 32; j++) { + LinearMapper r0 = res.getLinearMapper(m, j); + LinearMapper r1 = res.getLinearMapper(m + 8, j); + LinearMapper r2 = res.getLinearMapper(m + 16, j); + LinearMapper r3 = res.getLinearMapper(m + 24, j); + r0.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0))); + r1.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0))); + r2.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0))); + r3.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0))); + } + + // Zero the result block so it can be reused + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + } + } + aligned_delete(blockO, 32 * 32); +} + +#endif // EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h 
b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h new file mode 100644 index 00000000..99894caf --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h @@ -0,0 +1,95 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// Copyright (C) 2015 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H +#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H + + +namespace Eigen { +namespace internal { + + +// AVX2 optimized implementation of the case where the lhs is encoded using signed 8bit +// integers and the rhs using unsigned 8bit integers. +#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT + +template +class gebp_traits +{ +public: + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + +// Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs +template +struct gebp_kernel +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +EIGEN_DONT_INLINE +void gebp_kernel +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + + +} // namespace internal +} // namespace Eigen + + + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h new file mode 100644 index 00000000..18b5085b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h @@ -0,0 +1,123 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H +#define EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H + + +namespace Eigen { +namespace internal { + +// Mat-Vec product +// Both lhs and rhs are encoded as 8bit signed integers +template +struct general_matrix_vector_product +{ +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha); +}; + +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha) +{ + eigen_assert(alpha.value == 1); + eigen_assert(resIncr == 1); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + + for (Index i = 0; i < rows; ++i) { + for (Index j = 0; j < cols; ++j) { + res[i] += lhs(i, j) * rhs(j, 0); + } + } +} + + +// Mat-Vec product +// The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned integers +template +struct general_matrix_vector_product +{ +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QUInt8 alpha); +}; + +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QUInt8 alpha) +{ + eigen_assert(alpha.value == 1); + eigen_assert(resIncr == 1); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + + for (Index i = 0; i < rows; ++i) { + for (Index j = 0; j < cols; ++j) { + res[i] += lhs(i, j) * rhs(j, 0); + } + } +} + + +// Mat-Vec product +// The lhs is encoded using bit unsigned integers, the rhs using 8bit signed integers +template +struct general_matrix_vector_product +{ +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha); +}; + +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha) +{ + eigen_assert(alpha.value == 1); + eigen_assert(resIncr == 1); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + + for (Index i = 0; i < rows; ++i) { + for (Index j = 0; j < cols; ++j) { + res[i] += lhs(i, j) * rhs(j, 0); + } + } +} + +} // namespace internal +} // namespace Eigen + + + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h new file mode 100644 index 00000000..078be83e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h @@ -0,0 +1,476 @@ +#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ +#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ + +namespace Eigen { +namespace internal { + +typedef struct Packet32q8i { + __m256i val; + operator __m256i() const { return val; } + Packet32q8i(); + Packet32q8i(__m256i val) : val(val) {} +} Packet32q8i; + +typedef struct Packet16q16i { + __m256i val; + operator __m256i() const { return val; } + Packet16q16i(); + Packet16q16i(__m256i val) : val(val) {} +} Packet16q16i; + +typedef struct Packet32q8u { + __m256i val; + operator __m256i() const { return val; } + Packet32q8u(); + Packet32q8u(__m256i val) : val(val) {} +} Packet32q8u; + +typedef struct Packet16q8i { 
+ __m128i val; + operator __m128i() const { return val; } + Packet16q8i(); + Packet16q8i(__m128i val) : val(val) {} +} Packet16q8i; + +typedef struct Packet16q8u { + __m128i val; + operator __m128i() const { return val; } + Packet16q8u(); + Packet16q8u(__m128i val) : val(val) {} +} Packet16q8u; + +typedef struct Packet8q16i { + __m128i val; + operator __m128i() const { return val; } + Packet8q16i(); + Packet8q16i(__m128i val) : val(val) {} +} Packet8q16i; + +typedef struct Packet8q32i { + __m256i val; + operator __m256i() const { return val; } + Packet8q32i(); + Packet8q32i(__m256i val) : val(val) {} +} Packet8q32i; + +typedef struct Packet4q32i { + __m128i val; + operator __m128i() const { return val; } + Packet4q32i(); + Packet4q32i(__m128i val) : val(val) {} +} Packet4q32i; + +#ifndef EIGEN_VECTORIZE_AVX512 +template <> +struct packet_traits : default_packet_traits { + typedef Packet32q8i type; + typedef Packet16q8i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits : default_packet_traits { + typedef Packet32q8u type; + typedef Packet16q8u half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits : default_packet_traits { + typedef Packet16q16i type; + typedef Packet8q16i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits : default_packet_traits { + typedef Packet8q32i type; + typedef Packet4q32i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + }; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +#endif + +template <> +struct unpacket_traits { + typedef QInt8 type; + typedef Packet16q8i half; + enum { size = 32, alignment=Aligned32 }; +}; +template <> +struct unpacket_traits { + typedef QInt16 type; + typedef Packet8q16i half; + enum { size = 16, alignment=Aligned32 }; +}; +template <> +struct unpacket_traits { + typedef QUInt8 type; + typedef Packet16q8u half; + enum { size = 32, alignment=Aligned32 }; +}; +template <> +struct unpacket_traits { + typedef QInt32 type; + typedef Packet4q32i half; + enum { size = 8, alignment=Aligned32 }; +}; + +// Unaligned load +template <> +EIGEN_STRONG_INLINE Packet32q8i ploadu(const QInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u ploadu(const QUInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16q16i ploadu(const QInt16* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i ploadu(const QInt32* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast(from)); +} + +// Aligned load +template <> +EIGEN_STRONG_INLINE Packet32q8i pload(const QInt8* from) { 
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u pload(const QUInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16q16i pload(const QInt16* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pload(const QInt32* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast(from)); +} + +// Unaligned store +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet32q8i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QUInt8* to, const Packet32q8u& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt16* to, const Packet16q16i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt32* to, const Packet8q32i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} + +// Aligned store +template <> +EIGEN_STRONG_INLINE void pstore(QInt32* to, const Packet8q32i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QInt16* to, const Packet16q16i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QUInt8* to, const Packet32q8u& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet32q8i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} + +// Extract first element. +template <> +EIGEN_STRONG_INLINE QInt32 pfirst(const Packet8q32i& a) { + return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); +} +template <> +EIGEN_STRONG_INLINE QInt16 pfirst(const Packet16q16i& a) { + return _mm256_extract_epi16(a.val, 0); +} +template <> +EIGEN_STRONG_INLINE QUInt8 pfirst(const Packet32q8u& a) { + return static_cast(_mm256_extract_epi8(a.val, 0)); +} +template <> +EIGEN_STRONG_INLINE QInt8 pfirst(const Packet32q8i& a) { + return _mm256_extract_epi8(a.val, 0); +} + +// Initialize to constant value. +template <> +EIGEN_STRONG_INLINE Packet32q8i pset1(const QInt8& from) { + return _mm256_set1_epi8(from.value); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u pset1(const QUInt8& from) { + return _mm256_set1_epi8(static_cast(from.value)); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pset1(const QInt32& from) { + return _mm256_set1_epi32(from.value); +} + +// Basic arithmetic packet ops for QInt32. +template <> +EIGEN_STRONG_INLINE Packet8q32i padd(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_add_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet16q16i pset1(const QInt16& from) { + return _mm256_set1_epi16(from.value); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i psub(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_sub_epi32(a.val, b.val); +} +// Note: mullo truncates the result to 32 bits. 
+template <> +EIGEN_STRONG_INLINE Packet8q32i pmul(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_mullo_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pnegate(const Packet8q32i& a) { + return _mm256_sub_epi32(_mm256_setzero_si256(), a.val); +} + +// Min and max. +template <> +EIGEN_STRONG_INLINE Packet8q32i pmin(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_min_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pmax(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_max_epi32(a.val, b.val); +} + +template <> +EIGEN_STRONG_INLINE Packet16q16i pmin(const Packet16q16i& a, + const Packet16q16i& b) { + return _mm256_min_epi16(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet16q16i pmax(const Packet16q16i& a, + const Packet16q16i& b) { + return _mm256_max_epi16(a.val, b.val); +} + +template <> +EIGEN_STRONG_INLINE Packet32q8u pmin(const Packet32q8u& a, + const Packet32q8u& b) { + return _mm256_min_epu8(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u pmax(const Packet32q8u& a, + const Packet32q8u& b) { + return _mm256_max_epu8(a.val, b.val); +} + +template <> +EIGEN_STRONG_INLINE Packet32q8i pmin(const Packet32q8i& a, + const Packet32q8i& b) { + return _mm256_min_epi8(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8i pmax(const Packet32q8i& a, + const Packet32q8i& b) { + return _mm256_max_epi8(a.val, b.val); +} + +// Reductions. +template <> +EIGEN_STRONG_INLINE QInt32 predux_min(const Packet8q32i& a) { + __m256i tmp = _mm256_min_epi32(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return pfirst( + _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, 1))); +} +template <> +EIGEN_STRONG_INLINE QInt32 predux_max(const Packet8q32i& a) { + __m256i tmp = _mm256_max_epi32(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return pfirst( + _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1))); +} + +template <> +EIGEN_STRONG_INLINE QInt16 predux_min(const Packet16q16i& a) { + __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1)); + return std::min(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1)); +} +template <> +EIGEN_STRONG_INLINE QInt16 predux_max(const Packet16q16i& a) { + __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1)); + return std::max(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1)); +} + +template <> +EIGEN_STRONG_INLINE QUInt8 predux_min(const Packet32q8u& a) { + __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_min_epu8(tmp, + _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::min(static_cast(_mm256_extract_epi8(tmp, 0)), + static_cast(_mm256_extract_epi8(tmp, 1))); +} +template <> +EIGEN_STRONG_INLINE QUInt8 predux_max(const Packet32q8u& a) { + __m256i tmp = _mm256_max_epu8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 
_MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_max_epu8(tmp, + _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::max(static_cast(_mm256_extract_epi8(tmp, 0)), + static_cast(_mm256_extract_epi8(tmp, 1))); +} + +template <> +EIGEN_STRONG_INLINE QInt8 predux_min(const Packet32q8i& a) { + __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_min_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::min(_mm256_extract_epi8(tmp, 0), _mm256_extract_epi8(tmp, 1)); +} +template <> +EIGEN_STRONG_INLINE QInt8 predux_max(const Packet32q8i& a) { + __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_max_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::max(_mm256_extract_epi8(tmp, 0), _mm256_extract_epi8(tmp, 1)); +} + +// Vectorized scaling of Packet32q8i by float. +template<> +struct scalar_product_op : binary_op_base { + typedef typename ScalarBinaryOpTraits::ReturnType result_type; +#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN + EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op) +#else + scalar_product_op() { + EIGEN_SCALAR_BINARY_OP_PLUGIN + } +#endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const QInt32& a, const double& b) const { return a * b; } + + EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a, const double& b) const { + __m256d scale = _mm256_set1_pd(b); + __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a)); + __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo)); + __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1)); + __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); + } +}; + +template <> +struct functor_traits> { + enum { Cost = 4 * NumTraits::MulCost, PacketAccess = true }; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h new file mode 100644 index 00000000..7a222fdd --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h @@ -0,0 +1,545 @@ +#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_ +#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_ + +#include "PacketMathAVX2.h" + +namespace Eigen { +namespace internal { + +typedef struct Packet64q8i { + __m512i val; + operator __m512i() const { return val; } + Packet64q8i(); + Packet64q8i(__m512i val) : val(val) {} +} Packet64q8i; + +typedef struct Packet32q16i { + __m512i val; + operator __m512i() const { return val; } + Packet32q16i(); + Packet32q16i(__m512i val) : val(val) {} +} Packet32q16i; + +typedef struct Packet64q8u { + __m512i val; + operator __m512i() const { return val; } + Packet64q8u(); + Packet64q8u(__m512i val) : val(val) {} +} Packet64q8u; + +typedef struct Packet16q32i { + __m512i val; + operator __m512i() const { return 
val; } + Packet16q32i(); + Packet16q32i(__m512i val) : val(val) {} +} Packet16q32i; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet64q8i type; + typedef Packet32q8i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 64, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits : default_packet_traits { + typedef Packet64q8u type; + typedef Packet32q8u half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 64, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits : default_packet_traits { + typedef Packet32q16i type; + typedef Packet16q16i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits : default_packet_traits { + typedef Packet16q32i type; + typedef Packet8q32i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + }; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef QInt8 type; + typedef Packet32q8i half; + enum { size = 64, alignment=Aligned64 }; +}; +template <> +struct unpacket_traits { + typedef QInt16 type; + typedef Packet16q16i half; + enum { size = 32, alignment=Aligned64 }; +}; +template <> +struct unpacket_traits { + typedef QUInt8 type; + typedef Packet32q8u half; + enum { size = 64, alignment=Aligned64 }; +}; +template <> +struct unpacket_traits { + typedef QInt32 type; + typedef Packet8q32i half; + enum { size = 16, alignment=Aligned64 }; +}; + +// Unaligned load +template <> +EIGEN_STRONG_INLINE Packet64q8i ploadu(const QInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet32q16i ploadu(const QInt16* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet64q8u ploadu(const QUInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16q32i ploadu(const QInt32* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( + reinterpret_cast(from)); +} + +// Aligned load +template <> +EIGEN_STRONG_INLINE Packet64q8i pload(const QInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet32q16i pload(const QInt16* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet64q8u pload(const QUInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16q32i pload(const QInt32* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( + reinterpret_cast(from)); +} + +// Unaligned store +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet64q8i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( + 
reinterpret_cast<__m512i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt16* to, const Packet32q16i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( + reinterpret_cast<__m512i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QUInt8* to, const Packet64q8u& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( + reinterpret_cast<__m512i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt32* to, const Packet16q32i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( + reinterpret_cast<__m512i*>(to), from.val); +} + +// Aligned store +template <> +EIGEN_STRONG_INLINE void pstore(QInt32* to, const Packet16q32i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QUInt8* to, const Packet64q8u& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet64q8i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QInt16* to, const Packet32q16i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), + from.val); +} + +// Extract first element. +template <> +EIGEN_STRONG_INLINE QInt32 pfirst(const Packet16q32i& a) { + return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a, 0)); +} +template <> +EIGEN_STRONG_INLINE QUInt8 pfirst(const Packet64q8u& a) { + return static_cast( + _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0)); +} +template <> +EIGEN_STRONG_INLINE QInt8 pfirst(const Packet64q8i& a) { + return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0); +} +template <> +EIGEN_STRONG_INLINE QInt16 pfirst(const Packet32q16i& a) { + return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.val, 0), 0); +} + +// Initialize to constant value. +template <> +EIGEN_STRONG_INLINE Packet64q8i pset1(const QInt8& from) { + return _mm512_set1_epi8(from.value); +} +template <> +EIGEN_STRONG_INLINE Packet32q16i pset1(const QInt16& from) { + return _mm512_set1_epi16(from.value); +} +template <> +EIGEN_STRONG_INLINE Packet64q8u pset1(const QUInt8& from) { + return _mm512_set1_epi8(static_cast(from.value)); +} +template <> +EIGEN_STRONG_INLINE Packet16q32i pset1(const QInt32& from) { + return _mm512_set1_epi32(from.value); +} + +// Basic arithmetic packet ops for QInt32. +template <> +EIGEN_STRONG_INLINE Packet16q32i padd(const Packet16q32i& a, + const Packet16q32i& b) { + return _mm512_add_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet16q32i psub(const Packet16q32i& a, + const Packet16q32i& b) { + return _mm512_sub_epi32(a.val, b.val); +} +// Note: mullo truncates the result to 32 bits. +template <> +EIGEN_STRONG_INLINE Packet16q32i pmul(const Packet16q32i& a, + const Packet16q32i& b) { + return _mm512_mullo_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet16q32i pnegate(const Packet16q32i& a) { + return _mm512_sub_epi32(_mm512_setzero_si512(), a.val); +} + +// Min and max. 
+template <> +EIGEN_STRONG_INLINE Packet16q32i pmin(const Packet16q32i& a, + const Packet16q32i& b) { + return _mm512_min_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet16q32i pmax(const Packet16q32i& a, + const Packet16q32i& b) { + return _mm512_max_epi32(a.val, b.val); +} + +template <> +EIGEN_STRONG_INLINE Packet64q8u pmin(const Packet64q8u& a, + const Packet64q8u& b) { +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_min_epu8(a.val, b.val); +#else + __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i r0 = _mm256_min_epu8(ap0, bp0); + __m256i r1 = _mm256_min_epu8(ap1, bp1); + return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); +#endif +} +template <> +EIGEN_STRONG_INLINE Packet64q8u pmax(const Packet64q8u& a, + const Packet64q8u& b) { +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_max_epu8(a.val, b.val); +#else + __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i r0 = _mm256_max_epu8(ap0, bp0); + __m256i r1 = _mm256_max_epu8(ap1, bp1); + return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet64q8i pmin(const Packet64q8i& a, + const Packet64q8i& b) { +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_min_epi8(a.val, b.val); +#else + __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i r0 = _mm256_min_epi8(ap0, bp0); + __m256i r1 = _mm256_min_epi8(ap1, bp1); + return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); +#endif +} +template <> +EIGEN_STRONG_INLINE Packet32q16i pmin(const Packet32q16i& a, + const Packet32q16i& b) { +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_min_epi16(a.val, b.val); +#else + __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i r0 = _mm256_min_epi16(ap0, bp0); + __m256i r1 = _mm256_min_epi16(ap1, bp1); + return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); +#endif +} +template <> +EIGEN_STRONG_INLINE Packet64q8i pmax(const Packet64q8i& a, + const Packet64q8i& b) { +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_max_epi8(a.val, b.val); +#else + __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i r0 = _mm256_max_epi8(ap0, bp0); + __m256i r1 = _mm256_max_epi8(ap1, bp1); + return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); +#endif +} +template <> +EIGEN_STRONG_INLINE Packet32q16i pmax(const Packet32q16i& a, + const Packet32q16i& b) { +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_max_epi16(a.val, b.val); +#else + __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i r0 = _mm256_max_epi16(ap0, bp0); + __m256i r1 = _mm256_max_epi16(ap1, bp1); + return 
_mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); +#endif +} + +// Reductions. +template <> +EIGEN_STRONG_INLINE QInt32 predux_min(const Packet16q32i& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3)); + res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst( + _mm_min_epi32( + res, + _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); +} +template <> +EIGEN_STRONG_INLINE QInt32 predux_max(const Packet16q32i& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3)); + res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst( + _mm_max_epi32( + res, + _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); +} +template <> +EIGEN_STRONG_INLINE QInt16 predux_min(const Packet32q16i& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3)); + res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + std::uint32_t w = + pfirst( + _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::min({ + static_cast(w >> 16), + static_cast(w) + }); +} +template <> +EIGEN_STRONG_INLINE QInt16 predux_max(const Packet32q16i& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3)); + res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + std::uint32_t w = + pfirst( + _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::max({ + static_cast(w >> 16), + static_cast(w) + }); +} +template <> +EIGEN_STRONG_INLINE QUInt8 predux_min(const Packet64q8u& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3)); + res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + std::uint32_t w = + pfirst( + _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::min({ + static_cast(w >> 24), + static_cast(w >> 16), + static_cast(w >> 8), + static_cast(w) + }); +} +template <> +EIGEN_STRONG_INLINE QUInt8 predux_max(const Packet64q8u& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_max_epu8(_mm_max_epu8(lane0, lane1), 
_mm_max_epu8(lane2, lane3)); + res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + std::uint32_t w = + pfirst( + _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::max({ + static_cast(w >> 24), + static_cast(w >> 16), + static_cast(w >> 8), + static_cast(w) + }); +} +template <> +EIGEN_STRONG_INLINE QInt8 predux_min(const Packet64q8i& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3)); + res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + std::uint32_t w = + pfirst( + _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::min({ + static_cast(w >> 24), + static_cast(w >> 16), + static_cast(w >> 8), + static_cast(w) + }); +} +template <> +EIGEN_STRONG_INLINE QInt8 predux_max(const Packet64q8i& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3)); + res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + std::uint32_t w = + pfirst( + _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::min({ + static_cast(w >> 24), + static_cast(w >> 16), + static_cast(w >> 8), + static_cast(w) + }); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h new file mode 100644 index 00000000..045384d7 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h @@ -0,0 +1,66 @@ +#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ +#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ + +namespace Eigen { +namespace internal { + +typedef __m256 Packet8f; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet8f pcast(const Packet8q32i& a) { + return _mm256_cvtepi32_ps(a.val); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet8q32i pcast(const Packet8f& a) { + return _mm256_cvtps_epi32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q8i +pcast(const Packet8q32i& a, const Packet8q32i& b, + const Packet8q32i& c, const Packet8q32i& d) { + __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.val, b.val), + _mm256_packs_epi32(c.val, d.val)); + // Since packs does not cross 128 bit lane boundaries, + // we have to permute to properly order the final result. 
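+  // After the two per-lane pack steps, the 32-bit chunks of `converted` hold
+  // a0..3, b0..3, c0..3, d0..3, a4..7, b4..7, c4..7, d4..7; gathering the
+  // chunks in the order 0,4,1,5,2,6,3,7 restores a0..7, b0..7, c0..7, d0..7.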
+ const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + return _mm256_permutevar8x32_epi32(converted, permute_mask); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q8u +pcast(const Packet8q32i& a, const Packet8q32i& b, + const Packet8q32i& c, const Packet8q32i& d) { + const __m256i converted = _mm256_packus_epi16( + _mm256_packs_epi32(a.val, b.val), _mm256_packs_epi32(c.val, d.val)); + // Since packus does not cross 128 bit lane boundaries, + // we have to permute to properly order the final result. + const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + return _mm256_permutevar8x32_epi32(converted, permute_mask); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h new file mode 100644 index 00000000..cd7120ec --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h @@ -0,0 +1,180 @@ +#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_ +#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_ + +namespace Eigen { +namespace internal { + +typedef __m512 Packet16f; +typedef __m512i Packet16i; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet16f pcast(const Packet16q32i& a) { + return _mm512_cvtepi32_ps(a.val); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet16q32i pcast(const Packet16f& a) { + return _mm512_cvtps_epi32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q16i +pcast(const Packet16f& a, const Packet16f& b) { + Packet16i a_int = _mm512_cvtps_epi32(a); + Packet16i b_int = _mm512_cvtps_epi32(b); +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_packs_epi32(a_int, b_int); +#else + Packet8i ab_int16_low = + _mm256_permute4x64_epi64( + _mm256_packs_epi32( + _mm512_castsi512_si256(a_int), + _mm512_castsi512_si256(b_int)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i ab_int16_high = + _mm256_permute4x64_epi64( + _mm256_packs_epi32( + _mm512_extracti32x8_epi32(a_int, 1), + _mm512_extracti32x8_epi32(b_int, 1)), + _MM_SHUFFLE(0, 2, 1, 3)); + return _mm512_inserti32x8( + _mm512_castsi256_si512(ab_int16_low), + ab_int16_high, 1); +#endif +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet64q8i +pcast(const Packet16f& a, + const Packet16f& b, + const Packet16f& c, + const Packet16f& d) { + Packet16i a_int = _mm512_cvtps_epi32(a); + Packet16i b_int = _mm512_cvtps_epi32(b); + Packet16i c_int = _mm512_cvtps_epi32(c); + Packet16i d_int = _mm512_cvtps_epi32(d); +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_packs_epi16( + _mm512_packs_epi32(a_int, b_int), + _mm512_packs_epi32(c_int, d_int)); +#else + Packet8i ab_int16_low = + _mm256_permute4x64_epi64( + _mm256_packs_epi32( + _mm512_castsi512_si256(a_int), + 
_mm512_castsi512_si256(b_int)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i cd_int16_low = + _mm256_permute4x64_epi64( + _mm256_packs_epi32( + _mm512_castsi512_si256(c_int), + _mm512_castsi512_si256(d_int)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i ab_int16_high = + _mm256_permute4x64_epi64( + _mm256_packs_epi32( + _mm512_extracti32x8_epi32(a_int, 1), + _mm512_extracti32x8_epi32(b_int, 1)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i cd_int16_high = + _mm256_permute4x64_epi64( + _mm256_packs_epi32( + _mm512_extracti32x8_epi32(c_int, 1), + _mm512_extracti32x8_epi32(d_int, 1)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i abcd_int8_low = + _mm256_permute4x64_epi64( + _mm256_packs_epi16(ab_int16_low, cd_int16_low), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i abcd_int8_high = + _mm256_permute4x64_epi64( + _mm256_packs_epi16(ab_int16_high, cd_int16_high), + _MM_SHUFFLE(0, 2, 1, 3)); + return _mm512_inserti32x8( + _mm512_castsi256_si512(abcd_int8_low), + abcd_int8_high, 1); +#endif +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet64q8i +pcast(const Packet16q32i& a, + const Packet16q32i& b, + const Packet16q32i& c, + const Packet16q32i& d) { + __m512i converted = _mm512_packs_epi16(_mm512_packs_epi32(a.val, b.val), + _mm512_packs_epi32(c.val, d.val)); + return converted; +} + +template <> +EIGEN_STRONG_INLINE Packet32q16i +pcast(const Packet16q32i& a, + const Packet16q32i& b) { + __m512i converted = _mm512_packs_epi32(a.val, b.val); + return converted; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet64q8u +pcast(const Packet16q32i& a, const Packet16q32i& b, + const Packet16q32i& c, const Packet16q32i& d) { + const __m512i converted = _mm512_packus_epi16( + _mm512_packus_epi32(a.val, b.val), _mm512_packus_epi32(c.val, d.val)); + return converted; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; + +#if 0 +template <> +EIGEN_STRONG_INLINE Packet32q16u +pcast(const Packet16q32i& a, + const Packet16q32i& b) { + const __m512i converted = _mm512_packus_epi32(a.val, b.val); + return converted; +} +#endif + +} // end namespace internal +} // end namespace Eigen + +#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h new file mode 100644 index 00000000..cbcce9e2 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h @@ -0,0 +1,116 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+#ifndef EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H + +namespace Eigen { + +/** scalar_sigmoid_fast_derivative_op + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a sigmoid + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative() + */ +template +struct scalar_sigmoid_fast_derivative_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_fast_derivative_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const { + const T one = T(1); + return (one - y) * y; + } + + template + inline Packet packetOp(const Packet& y) const { + const Packet one = internal::pset1(1); + return internal::pmul(internal::psub(one, y), y); + } +}; + +namespace internal { +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost * 2 + NumTraits::MulCost, + PacketAccess = packet_traits::HasAdd && packet_traits::HasMul && + packet_traits::HasNegate + }; +}; +} // namespace internal + +/** scalar_tanh_fast_derivative_op + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a tanh + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative() + */ +template +struct scalar_tanh_fast_derivative_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_fast_derivative_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const { + const T one = T(1); + return one - (y * y); + } + + template + inline Packet packetOp(const Packet& y) const { + const Packet one = internal::pset1(1); + return internal::psub(one, internal::pmul(y, y)); + } +}; + +namespace internal { +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost * 2 + NumTraits::MulCost * 1, + PacketAccess = packet_traits::HasAdd && packet_traits::HasMul && + packet_traits::HasNegate + }; +}; +} // namespace internal + +/** + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to clip the magnitude of the first scalar. + * + * \sa class CwiseBinaryOp, MatrixBase::Clip + */ +template +struct scalar_clip_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar + operator()(const Scalar& a, const Scalar& b) const { + return numext::mini(numext::maxi(a, -b), b); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& a, const Packet& b) const { + return internal::pmin(internal::pmax(a, internal::pnegate(b)), b); + } +}; + +namespace internal { +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost * 3, + PacketAccess = packet_traits::HasMax && + packet_traits::HasMin && + packet_traits::HasNegate + }; +}; +} // namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h new file mode 100644 index 00000000..d4bc7a35 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h @@ -0,0 +1,209 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H +#define EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H + +namespace Eigen { + +/** ExtractGlimpses + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Extract glimpses from an input tensor. + * + * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch). + * The width and height parameters specify the extension of the returned glimpses. + * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension. + * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension. + * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center. + * + * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch). + * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size. + */ +namespace { +template +struct GlimpseExtractionOp { + GlimpseExtractionOp(const Index width, const Index height, + const std::vector >& offsets, + const bool normalized, + const bool centered, + const bool uniform_noise) : + width_(width), height_(height), offsets_(offsets), + normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { } + + template + DSizes dimensions(const Input& input) const { + typedef typename internal::traits::Index IndexType; + typedef TensorRef::Scalar, 4, + internal::traits::Layout, IndexType> > Ref; + Ref in(input); + + DSizes dims = in.dimensions(); + + dims[0] = in.dimension(0); + dims[1] = width_; + dims[2] = height_; + dims[3] = in.dimension(3); + return dims; + } + + template + EIGEN_DEVICE_FUNC + void eval(const Input& input, Output& output, const Device& device) const + { + typedef typename internal::traits::Index IndexType; + typedef TensorRef::Scalar, 4, + internal::traits::Layout, IndexType> > Ref; + Ref in(input); + + const Index num_channels = in.dimension(0); + const Index input_width = in.dimension(1); + const Index input_height = in.dimension(2); + const Index batch_size = in.dimension(3); + eigen_assert(input_width > 0); + eigen_assert(input_height > 0); + + for (Index i = 0; i < batch_size; ++i) { + float x = offsets_[i].first, y = offsets_[i].second; + + // Un-normalize coordinates back to pixel space if normalized. + if (normalized_) { + x *= input_width; + y *= input_height; + } + // Un-center if coordinates are centered on the image center. + if (centered_) { + x /= 2.0f; + y /= 2.0f; + x += input_width / 2.0f; + y += input_height / 2.0f; + } + // Remove half of the glimpse window. 
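+        // After the two adjustments below, (x, y) holds the top-left corner of
+        // the requested glimpse in input pixel coordinates; it may be negative
+        // or extend past the image, which is handled by the partial-overlap
+        // logic further down.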
+ x -= width_ / 2.0f; + y -= height_ / 2.0f; + + const Index offset_x = (Index) x; + const Index offset_y = (Index) y; + Index glimpse_width = width_; + Index glimpse_height = height_; + bool partial_overlap = false; + DSizes slice_offset(0, offset_x, offset_y); + DSizes slice_extent(num_channels, width_, height_); + DSizes base_offset(0, 0, 0); + + if (offset_x < 0) { + slice_offset[1] = 0; + glimpse_width = (std::max)(0, width_ + offset_x); + slice_extent[1] = glimpse_width; + base_offset[1] = width_ - glimpse_width; + partial_overlap = true; + } else if (offset_x + width_ >= input_width) { + glimpse_width = (std::max)(0, input_width - offset_x); + slice_extent[1] = glimpse_width; + partial_overlap = true; + } + if (offset_y < 0) { + slice_offset[2] = 0; + glimpse_height = (std::max)(0, height_ + offset_y); + slice_extent[2] = glimpse_height; + base_offset[2] = height_ - glimpse_height; + partial_overlap = true; + } else if (offset_y + height_ >= input_height) { + glimpse_height = (std::max)(0, input_height - offset_y); + slice_extent[2] = glimpse_height; + partial_overlap = true; + } + slice_extent[1] = std::min(input_width, slice_extent[1]); + slice_extent[2] = std::min(input_height, slice_extent[2]); + + if (partial_overlap) { + if (uniform_noise_) { + // Initialize the glimpse with uniform noise. + typedef typename internal::remove_const< + typename internal::traits::Scalar>::type Scalar; + TensorFixedSize > mini; + mini.device(device) = input.template chip<3>(i).minimum(); + TensorFixedSize > range; + range.device(device) = + (input.template chip<3>(i).maximum() - mini).template cast(); + + DSizes glimpse_size(num_channels, width_, height_); + TensorMap > tmp(NULL, glimpse_size); + output.template chip<3>(i).device(device) = + mini.reshape(Sizes<1,1,1>()).broadcast(glimpse_size) + + (tmp.random() * range.reshape(Sizes<1,1,1>()).broadcast(glimpse_size)).template cast(); + } else { + // Initialize the glimpse with white noise: compute the mean and sigma + // of each channel, and use them to shape the gaussian. + DSizes glimpse_size(width_, height_); + DSizes input_size(input_width, input_height); + typedef typename internal::remove_const< + typename internal::traits::Scalar>::type Scalar; + + for (int j = 0; j < num_channels; ++j) { + TensorFixedSize > mean; + mean.device(device) = input.template chip<3>(i).template chip<0>(j).template cast().mean(); + TensorFixedSize > sigma; + sigma.device(device) = + (input.template chip<3>(i).template chip<0>(j).template cast() - mean.reshape(Sizes<1,1>()).broadcast(input_size)).square().mean().sqrt(); + TensorFixedSize > mini; + mini.device(device) = input.template chip<3>(i).template chip<0>(j).minimum(); + TensorFixedSize > maxi; + maxi.device(device) = input.template chip<3>(i).template chip<0>(j).maximum(); + + TensorMap > tmp(NULL, glimpse_size); + output.template chip<3>(i).template chip<0>(j).device(device) = + (mean.reshape(Sizes<1,1>()).broadcast(glimpse_size) + + (tmp.random(internal::NormalRandomGenerator()) * sigma.reshape(Sizes<1,1>()).broadcast(glimpse_size)).template cast()).cwiseMin(maxi.reshape(Sizes<1,1>()).broadcast(glimpse_size)).cwiseMax(mini.reshape(Sizes<1,1>()).broadcast(glimpse_size)); + } + } + + // Copy the part of the glimpse that cover the input image if any. 
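+        // When the glimpse only partially overlaps the image, the noise
+        // initialization above fills the whole glimpse; the slice copy below
+        // then overwrites just the overlapping sub-block (placed at
+        // base_offset inside the glimpse). A zero-sized overlap is skipped.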
+ if (glimpse_width == 0 || glimpse_height == 0) { + continue; + } + output.template chip<3>(i).slice(base_offset, slice_extent).device(device) = input.template chip<3>(i).slice(slice_offset, slice_extent); + } else { + output.template chip<3>(i).device(device) = input.template chip<3>(i).slice(slice_offset, slice_extent); + } + } + } + + private: + const Index width_; + const Index height_; + const std::vector > offsets_; + const bool normalized_; + const bool centered_; + const bool uniform_noise_; +}; +} + + +template +EIGEN_ALWAYS_INLINE +static const TensorCustomUnaryOp::Index>, const Input> +ExtractGlimpses(const Input& input, + const typename internal::traits::Index width, + const typename internal::traits::Index height, + const std::vector >& offsets, + const bool normalized = true, const bool centered = true, + const bool uniform_noise = true) +{ + EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits::Index Index; + const GlimpseExtractionOp op(width, height, offsets, normalized, + centered, uniform_noise); + return input.customOp(op); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h new file mode 100644 index 00000000..12ce2344 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h @@ -0,0 +1,523 @@ +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H + +#include "Patch3d.h" + +namespace Eigen { + +/** CuboidConvolutionBackwardInput + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the input of a 3D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others) + * The kernel parameter is expected to be a 5D tensor (filters, channels, kernel_depth, kernel_height, kernel_width) + * output_backward and kernel have to be in the same layout. + * + * The dimensions of the result will be filters, depth, height, width (and others if applicable). + * + * It is possible to swap the order of the depth, width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + * All dimension orders above are given for col-major, and should be reversed for row-major. 
+ */ + +template +EIGEN_ALWAYS_INLINE static const typename internal::conditional< + internal::traits::Layout == ColMajor, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array< IndexPair::Index>, 2>, + const TensorReshapingOp< + const DSizes< typename internal::traits::Index, 3>, + const TensorReverseOp, const Kernel> + >, + const TensorReshapingOp< + const DSizes< typename internal::traits::Index, 3>, + const TensorVolumePatchOp + > + > + >, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array< IndexPair::Index>, 2>, + const TensorReshapingOp< + const DSizes< typename internal::traits::Index, 3>, + const TensorVolumePatchOp + >, + const TensorReshapingOp< + const DSizes::Index, 3>, + const TensorReverseOp, const Kernel> + > + > + > +>::type +CuboidConvolutionBackwardInput( + const Kernel& kernel, const OutputBackward& output_backward, + typename internal::traits::Index inputPlanes, + typename internal::traits::Index inputRows, + typename internal::traits::Index inputCols, + const DenseIndex stridePlanes = 1, const DenseIndex strideRows = 1, + const DenseIndex strideCols = 1) { + typedef typename internal::traits::Index TensorIndex; + const TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > kern(kernel); + const TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[4]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[3]; + const TensorIndex kernelPlanes = isColMajor ? kern.dimensions()[2] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[3] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[4] : kern.dimensions()[0]; + + const TensorIndex outputPlanes = isColMajor ? out.dimensions()[1] : out.dimensions()[NumDims - 2]; + const TensorIndex outputRows = isColMajor ? out.dimensions()[2] : out.dimensions()[NumDims - 3]; + const TensorIndex outputCols = isColMajor ? out.dimensions()[3] : out.dimensions()[NumDims - 4]; + + TensorIndex forward_pad_z, forward_pad_y, forward_pad_x; + const TensorIndex size_z = ceil(inputPlanes / static_cast(stridePlanes)); + const TensorIndex size_y = ceil(inputRows / static_cast(strideRows)); + const TensorIndex size_x = ceil(inputCols / static_cast(strideCols)); + + // Infer padding type. + if (size_z == outputPlanes && size_y == outputRows && size_x == outputCols) { + // SAME padding. + const TensorIndex dz = size_z * stridePlanes + kernelPlanes - 1 - inputPlanes; + const TensorIndex dy = size_y * strideRows + kernelRows - 1 - inputRows; + const TensorIndex dx = size_x * strideCols + kernelCols - 1 - inputCols; + + forward_pad_z = dz - dz / 2; + forward_pad_y = dy - dy / 2; + forward_pad_x = dx - dx / 2; + } else { + // VALID padding. 
+ forward_pad_z = 0; + forward_pad_y = 0; + forward_pad_x = 0; + } + const TensorIndex padding_ztop = kernelPlanes - 1 - forward_pad_z; + const TensorIndex padding_top = kernelRows - 1 - forward_pad_y; + const TensorIndex padding_left = kernelCols - 1 - forward_pad_x; + + const TensorIndex padding_zbottom = inputPlanes + kernelPlanes - 1 - (outputPlanes - 1) * stridePlanes - 1 - padding_ztop; + const TensorIndex padding_bottom = inputRows + kernelRows - 1 - (outputRows - 1) * strideRows - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelCols - 1 - (outputCols - 1) * strideCols - 1 - padding_left; + + eigen_assert(padding_ztop >= 0); + eigen_assert(padding_zbottom >= 0); + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The kernel has dimensions filters X channels X patch_planes X patch_rows X patch_cols. + // We need to reverse the kernel along the spatial dimensions. + array kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + kernel_reverse[4] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = true; + kernel_reverse[3] = false; + kernel_reverse[4] = false; + } + + DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelRows * kernelCols * kernelPlanes; + } else { + kernel_dims[0] = kernelRows * kernelCols * kernelPlanes; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelFilters; + } + + // The output_backward has dimensions out_depth X out_planes X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward, it will have dimensions: + // out_depth X (patch_planes * patch_rows * patch_cols) X (input_planes * input_rows * input_cols * OTHERS) + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[2] = inputRows * inputCols * inputPlanes; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[2] *= out.dimension(i); + } + } else { + pre_contract_dims[2] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[0] = inputRows * inputCols * inputPlanes; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // We will contract along dimensions (0, 2) in kernel and (0, 1) in + // output_backward, if this is col-major, and + // dimensions (0, 2) in kernel and (1, 2) in output_backward, if this row-major. 
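+  // Concretely (col-major): the reshaped kernel is
+  //   (filters, channels, kernel_planes * kernel_rows * kernel_cols)
+  // and the reshaped patches are
+  //   (filters, kernel_planes * kernel_rows * kernel_cols, input positions * OTHERS),
+  // so contracting the filter dimension and the flattened-patch dimension
+  // leaves (channels, input positions * OTHERS), which is reshaped to
+  // (channels, planes, rows, cols, OTHERS) below.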
+ array, 2> contract_dims; + if (isColMajor) { + // col-major: kernel.contract(output.patches) + contract_dims[0] = IndexPair(0, 0); + contract_dims[1] = IndexPair(2, 1); + } else { + // row-major: output.patches.contract(kernel) + contract_dims[0] = IndexPair(1, 0); + contract_dims[1] = IndexPair(2, 2); + } + + // Post contraction, the dimensions of the input_backprop is + // channels X input_planes X input_rows X input_cols X OTHERS + DSizes post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelChannels; + post_contract_dims[1] = inputPlanes; + post_contract_dims[2] = inputRows; + post_contract_dims[3] = inputCols; + for (int i = 4; i < NumDims; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelChannels; + post_contract_dims[NumDims - 2] = inputPlanes; + post_contract_dims[NumDims - 3] = inputRows; + post_contract_dims[NumDims - 4] = inputCols; + for (int i = 0; i < NumDims - 4; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } + + DSizes strides; + for (int i = 0; i < NumDims; i++) { + strides[i] = 1; + } + if (isColMajor) { + strides[1] = stridePlanes; + strides[2] = strideRows; + strides[3] = strideCols; + } else { + strides[NumDims - 2] = stridePlanes; + strides[NumDims - 3] = strideRows; + strides[NumDims - 4] = strideCols; + } + + return choose( + Cond::Layout == ColMajor>(), + kernel.reverse(kernel_reverse) + .reshape(kernel_dims) + .contract( + output_backward.extract_volume_patches(kernelPlanes, kernelRows, kernelCols, + 1, 1, 1, stridePlanes, strideRows, strideCols, + padding_ztop, padding_zbottom, + padding_top, padding_bottom, + padding_left, padding_right) + .reshape(pre_contract_dims), + contract_dims) + .reshape(post_contract_dims), + output_backward.extract_volume_patches(kernelPlanes, kernelRows, kernelCols, + 1, 1, 1, stridePlanes, strideRows, strideCols, + padding_ztop, padding_zbottom, + padding_top, padding_bottom, + padding_left, padding_right) + .reshape(pre_contract_dims) + .contract(kernel.reverse(kernel_reverse).reshape(kernel_dims), + contract_dims) + .reshape(post_contract_dims)); +} + + +/** CuboidConvolutionBackwardKernel + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the filter of a 3D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_depth, kernel_height, kernel_width) + * output_backward and kernel have to be in the same layout. + * + * The dimensions of the result will be filters, depth, height, width (and others if applicable). + * + * It is possible to swap the order of the depth, width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + * All dimension orders above are given for col-major, and should be reversed for row-major. 
+ */ +template +EIGEN_ALWAYS_INLINE static const typename internal::conditional< + internal::traits::Layout == ColMajor, + const TensorShufflingOp< + const array::Index, 5>, + const TensorReverseOp< + const array, + const TensorReshapingOp< + const DSizes::Index, 5>, + const TensorContractionOp< + const array< IndexPair::Index>, 2>, + const TensorReshapingOp< + const DSizes::Index, 3>, + const Input>, + const TensorReshapingOp< + const DSizes< typename internal::traits::Index, 4>, + const TensorVolumePatchOp + > + > + > + > + >, + const TensorShufflingOp< + const array::Index, 5>, + const TensorReverseOp< + const array, + const TensorReshapingOp< + const DSizes::Index, 5>, + const TensorContractionOp< + const array< IndexPair::Index>, 2>, + const TensorReshapingOp< + const DSizes< typename internal::traits::Index, 4>, + const TensorVolumePatchOp + >, + const TensorReshapingOp< + const DSizes::Index, 3>, + const Input + > + > + > + > + > +>::type +CuboidConvolutionBackwardKernel( + const Input& input, const OutputBackward& output_backward, + typename internal::traits::Index kernelPlanes, + typename internal::traits::Index kernelRows, + typename internal::traits::Index kernelCols, + const DenseIndex stridePlanes = 1, + const DenseIndex strideRows = 1, + const DenseIndex strideCols = 1) { + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == internal::traits::NumDimensions, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4); + + const TensorIndex outputPlanes = isColMajor ? out.dimension(1) : out.dimension(NumDims - 2); + const TensorIndex outputRows = isColMajor ? out.dimension(2) : out.dimension(NumDims - 3); + const TensorIndex outputCols = isColMajor ? out.dimension(3) : out.dimension(NumDims - 4); + + const TensorIndex kernelFilters = isColMajor ? out.dimension(0) : out.dimension(NumDims - 1); + const TensorIndex kernelChannels = isColMajor ? in.dimension(0) : in.dimension(NumDims - 1); + + TensorIndex forward_pad_z, forward_pad_y, forward_pad_x; + const TensorIndex size_z = ceil(inputPlanes / static_cast(stridePlanes)); + const TensorIndex size_y = ceil(inputRows / static_cast(strideRows)); + const TensorIndex size_x = ceil(inputCols / static_cast(strideCols)); + + // Infer padding type. + if (size_z == outputPlanes && size_y == outputRows && size_x == outputCols) { + // SAME padding. + const TensorIndex dz = size_z * stridePlanes + kernelPlanes - 1 - inputPlanes; + const TensorIndex dy = size_y * strideRows + kernelRows - 1 - inputRows; + const TensorIndex dx = size_x * strideCols + kernelCols - 1 - inputCols; + + forward_pad_z = dz - dz / 2; + forward_pad_y = dy - dy / 2; + forward_pad_x = dx - dx / 2; + } else { + // VALID padding. 
+ forward_pad_z = 0; + forward_pad_y = 0; + forward_pad_x = 0; + } + + const TensorIndex padding_ztop = kernelPlanes - 1 - forward_pad_z; + const TensorIndex padding_top = kernelRows - 1 - forward_pad_y; + const TensorIndex padding_left = kernelCols - 1 - forward_pad_x; + + const TensorIndex padding_zbottom = inputPlanes + kernelPlanes - 1 - (outputPlanes - 1) * stridePlanes - 1 - padding_ztop; + const TensorIndex padding_bottom = inputRows + kernelRows - 1 - (outputRows - 1) * strideRows - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelCols - 1 - (outputCols - 1) * strideCols - 1 - padding_left; + + eigen_assert(padding_ztop >= 0); + eigen_assert(padding_zbottom >= 0); + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The output_backward has dimensions out_depth X out_plaens X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward (with input as the + // kernel), it will have dimensions + // (out_depth) X (input_planes * input_rows * input_cols) X (kernel_planes * kernel_rows * kernel_cols) X OTHERS + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = inputRows * inputCols * inputPlanes; + pre_contract_dims[2] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[3] = 1; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[3] *= out.dimension(i); + } + } else { + pre_contract_dims[3] = kernelFilters; + pre_contract_dims[2] = inputRows * inputCols * inputPlanes; + pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[0] = 1; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // The input has dimensions in_depth X (input_planes * input_rows * input_cols) X OTHERS + DSizes input_dims; + if (isColMajor) { + input_dims[0] = kernelChannels; + input_dims[1] = inputRows * inputCols * inputPlanes; + input_dims[2] = 1; + for (int i = 4; i < NumDims; ++i) { + input_dims[2] *= in.dimension(i); + } + eigen_assert(input_dims[2] == pre_contract_dims[3]); + } else { + input_dims[2] = kernelChannels; + input_dims[1] = inputRows * inputCols * inputPlanes; + input_dims[0] = 1; + for (int i = 0; i < NumDims - 4; ++i) { + input_dims[0] *= in.dimension(i); + } + eigen_assert(input_dims[0] == pre_contract_dims[0]); + } + + // We will contract along dimensions (1, 2) in in and (1, 3) in out, if + // this is col-major. + // For row-major, it's dimensions (0, 1) in in and (0, 2) in out. + array, 2> contract_dims; + if (isColMajor) { + // col-major: in.contract(output.patches) + contract_dims[0] = IndexPair(1, 1); + contract_dims[1] = IndexPair(2, 3); + } else { + // row-major: output.patches.contract(in) + contract_dims[0] = IndexPair(0, 0); + contract_dims[1] = IndexPair(2, 1); + } + + // After the contraction, the kernel will have dimension + // in_depth X out_depth X kernel_patches X kernel_rows X kernel_cols + // We will need to shuffle the first two dimensions and reverse the spatial dimensions. + // The end shape is: + // out_depth X in_shape X kernel_planes X kernel_rows X kernel_cols + + // This is the shape of the kernel *before* the shuffling. 
+ DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelChannels; + kernel_dims[1] = kernelFilters; + kernel_dims[2] = kernelPlanes; + kernel_dims[3] = kernelRows; + kernel_dims[4] = kernelCols; + } else { + kernel_dims[0] = kernelCols; + kernel_dims[1] = kernelRows; + kernel_dims[2] = kernelPlanes; + kernel_dims[3] = kernelFilters; + kernel_dims[4] = kernelChannels; + } + + // Flip filters and channels. + array kernel_shuffle; + if (isColMajor) { + kernel_shuffle[0] = 1; + kernel_shuffle[1] = 0; + kernel_shuffle[2] = 2; + kernel_shuffle[3] = 3; + kernel_shuffle[4] = 4; + } else { + kernel_shuffle[0] = 0; + kernel_shuffle[1] = 1; + kernel_shuffle[2] = 2; + kernel_shuffle[3] = 4; + kernel_shuffle[4] = 3; + } + + // Reverse the spatial dimensions. + array kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + kernel_reverse[4] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = true; + kernel_reverse[3] = false; + kernel_reverse[4] = false; + } + + DSizes strides; + for (int i = 0; i < NumDims; i++) { + strides[i] = 1; + } + if (isColMajor) { + strides[1] = stridePlanes; + strides[2] = strideRows; + strides[3] = strideCols; + } else { + strides[NumDims - 2] = stridePlanes; + strides[NumDims - 3] = strideRows; + strides[NumDims - 4] = strideCols; + } + return choose( + Cond::Layout == ColMajor>(), + input.reshape(input_dims) + .contract( + output_backward.extract_volume_patches( + inputPlanes, inputRows, inputCols, 1, + 1, 1, stridePlanes, strideRows, strideCols, + + padding_ztop, padding_zbottom, padding_top, + padding_bottom, padding_left, padding_right) + .reshape(pre_contract_dims), + contract_dims) + .reshape(kernel_dims) + .reverse(kernel_reverse) + .shuffle(kernel_shuffle), + output_backward.extract_volume_patches( + inputPlanes, inputRows, inputCols, 1, 1, 1, + stridePlanes, strideRows, strideCols, padding_ztop, + padding_zbottom, padding_top, padding_bottom, + padding_left, padding_right) + .reshape(pre_contract_dims) + .contract(input.reshape(input_dims), contract_dims) + .reshape(kernel_dims) + .reverse(kernel_reverse) + .shuffle(kernel_shuffle)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h new file mode 100644 index 00000000..188dc75b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h @@ -0,0 +1,351 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Ke Yang +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H + +namespace Eigen { + +/** SpatialConvolutionBackwardInput + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the input of a 2D convolution. 
+ * + * The output_backward parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width) + * The output_backward and the kernel must both be in col-major layout. The result will also be in col-major layout. + * + * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the output_backward. The dimensions of the result will be filters, height, width (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + */ + +template +EIGEN_ALWAYS_INLINE +static const typename internal::conditional< + internal::traits::Layout == ColMajor, + TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorContractionOp::Index>, 2>, const TensorReshapingOp::Index, 3>, const TensorReverseOp, const Kernel> >, const TensorReshapingOp::Index, 3>, const TensorImagePatchOp > > >, + TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorContractionOp::Index>, 2>, const TensorReshapingOp::Index, 3>, const TensorImagePatchOp >, const TensorReshapingOp::Index, 3>, const TensorReverseOp, const Kernel> > > > >::type +SpatialConvolutionBackwardInput(const Kernel& kernel, const OutputBackward& output_backward, typename internal::traits::Index inputRows, typename internal::traits::Index inputCols, const DenseIndex stride = 1, const DenseIndex in_stride = 1) { + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > kern(kernel); + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[2] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0]; + + // This is the effective kernel size, taking into account the (in_stride - 1) zero-values + // inserted between consecutive kernel elements in atrous convolution + const TensorIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1); + const TensorIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1); + + const TensorIndex outputRows = isColMajor ? output_backward.dimension(1) : output_backward.dimension(NumDims - 2); + const TensorIndex outputCols = isColMajor ? 
output_backward.dimension(2) : output_backward.dimension(NumDims - 3); + + // Computing the forward padding + const TensorIndex forward_pad_top = ((outputRows - 1) * stride + kernelRowsEff - inputRows) / 2; + const TensorIndex forward_pad_left = ((outputCols - 1) * stride + kernelColsEff - inputCols) / 2; + + const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top; + const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left; + const TensorIndex padding_bottom = inputRows + kernelRowsEff - 1 - (outputRows - 1) * stride - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelColsEff - 1 - (outputCols - 1) * stride - 1 - padding_left; + + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The kernel has dimensions filters X channels X patch_rows X patch_cols + // We need to reverse the kernel along dimensions corresponding to rows and + // cols. + // TODO(yangke): we can make things slightly faster by collapsing the dimensions + // where we don't reverse. Try that once we have a faster compiler. + array kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = false; + kernel_reverse[3] = false; + } + + DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelRows * kernelCols; + } else { + kernel_dims[0] = kernelRows * kernelCols; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelFilters; + } + + // The output_backward has dimensions out_depth X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward, it will have dimensions + // out_depth X (patch_rows * patch_cols) X (input_rows * input_cols * OTHERS) + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols; + pre_contract_dims[2] = inputRows * inputCols; + for (int i = 3; i < NumDims; ++i) { + pre_contract_dims[2] *= out.dimension(i); + } + } else { + pre_contract_dims[2] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols; + pre_contract_dims[0] = inputRows * inputCols; + for (int i = 0; i < NumDims - 3; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // We will contract along dimensions (0, 2) in kernel and (0, 1) in + // output_backward, if this is col-major, and + // dimensions (0, 2) in kernel and (1, 2) in output_backward, if this row-major. 
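+  // Shape walkthrough with assumed example sizes (illustrative only): for 16
+  // filters, 3 input channels, a 5x5 kernel, a 32x32 input and batch size N
+  // (col-major, stride 1, in_stride 1), the reversed kernel reshapes to
+  // (16, 3, 25) and the extracted patches reshape to (16, 25, 32*32*N).
+  // Contracting the filter dimension and the flattened 25-element patch
+  // dimension yields (3, 32*32*N), which is reshaped to (3, 32, 32, N) below.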
+ array, 2> contract_dims; + if (isColMajor) { + // col-major: kernel.contract(output.patches) + contract_dims[0] = IndexPair(0, 0); + contract_dims[1] = IndexPair(2, 1); + } else { + // row-major: output.patches.contract(kernel) + contract_dims[0] = IndexPair(1, 0); + contract_dims[1] = IndexPair(2, 2); + } + + // Post contraction, the dimensions of the input_backprop is + // channels X input_rows X input_cols X OTHERS + DSizes post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelChannels; + post_contract_dims[1] = inputRows; + post_contract_dims[2] = inputCols; + for (int i = 3; i < NumDims; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelChannels; + post_contract_dims[NumDims - 2] = inputRows; + post_contract_dims[NumDims - 3] = inputCols; + for (int i = 0; i < NumDims - 3; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } + + return choose(Cond::Layout == ColMajor>(), + kernel.reverse(kernel_reverse).reshape(kernel_dims).contract(output_backward.extract_image_patches(kernelRows, kernelCols, 1, 1, in_stride, in_stride, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims), contract_dims).reshape(post_contract_dims), + output_backward.extract_image_patches(kernelRows, kernelCols, 1, 1, in_stride, in_stride, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).contract(kernel.reverse(kernel_reverse).reshape(kernel_dims), contract_dims).reshape(post_contract_dims)); +} + + +/** SpatialConvolutionBackwardKernel + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the filter of a 2D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width) + * The output_backward and the kernel must both be in col-major layout. The result will also be in col-major layout. + * + * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the output_backward. The dimensions of the result will be filters, height, width (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + */ +// TODO(gpapan): Resolve a bug in TensorContractionInputMapper at SpatialConvolutions.h that yangke circumvented by using .reshape().reshape(). +// This can significantly accelerate SpatialConvolutionBackwardKernel. 
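+// Usage sketch (illustrative; the shapes, names and values below are assumed
+// examples, not taken from this header). With a col-major layout of
+// (channels, rows, cols, batch) and a forward convolution that used SAME
+// padding and stride 1:
+//
+//   Eigen::Tensor<float, 4> input(3, 32, 32, 8);
+//   Eigen::Tensor<float, 4> kernel(16, 3, 5, 5);            // (filters, channels, kRows, kCols)
+//   Eigen::Tensor<float, 4> output_backward(16, 32, 32, 8); // gradient w.r.t. the conv output
+//
+//   Eigen::Tensor<float, 4> input_grad =
+//       Eigen::SpatialConvolutionBackwardInput(kernel, output_backward,
+//                                              /*inputRows=*/32, /*inputCols=*/32);
+//   Eigen::Tensor<float, 4> kernel_grad =
+//       Eigen::SpatialConvolutionBackwardKernel(input, output_backward,
+//                                               /*kernelRows=*/5, /*kernelCols=*/5);
+//
+// input_grad has the shape of `input` and kernel_grad the shape of `kernel`.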
+ +template +EIGEN_ALWAYS_INLINE +static const typename internal::conditional< + internal::traits::Layout == ColMajor, + const TensorShufflingOp::Index, 4>, const TensorReverseOp, const TensorReshapingOp::Index, 4>, const TensorContractionOp::Index>, 2>, const TensorReshapingOp::Index, 3>, const Input>, const TensorReshapingOp::Index, 4>, const TensorReshapingOp::Index, 4>, const TensorImagePatchOp > > > > > >, + const TensorShufflingOp::Index, 4>, const TensorReverseOp, const TensorReshapingOp::Index, 4>, const TensorContractionOp::Index>, 2>, const TensorReshapingOp::Index, 4>, const TensorReshapingOp::Index, 4>, const TensorImagePatchOp > >, const TensorReshapingOp::Index, 3>, const Input> > > > > >::type +SpatialConvolutionBackwardKernel(const Input& input, const OutputBackward& output_backward, typename internal::traits::Index kernelRows, typename internal::traits::Index kernelCols, const DenseIndex stride = 1, const DenseIndex in_stride = 1) { + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + // stride and in_stride cannot both be larger than 1 + eigen_assert(!(stride > 1 && in_stride > 1)); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == internal::traits::NumDimensions, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const TensorIndex inputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputCols = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + + const TensorIndex outputRows = isColMajor ? output_backward.dimension(1) : output_backward.dimension(NumDims - 2); + const TensorIndex outputCols = isColMajor ? output_backward.dimension(2) : output_backward.dimension(NumDims - 3); + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? out.dimensions()[0] : out.dimensions()[NumDims - 1]; + + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? in.dimensions()[0] : in.dimensions()[NumDims - 1]; + + // This is the effective kernel size, taking into account the (in_stride - 1) zero-values + // inserted between consecutive kernel elements in atrous convolution + const TensorIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1); + const TensorIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1); + + // Computing the forward padding + const TensorIndex forward_pad_top = ((outputRows - 1) * stride + kernelRowsEff - inputRows) / 2; + const TensorIndex forward_pad_left = ((outputCols - 1) * stride + kernelColsEff - inputCols) / 2; + + // TODO: factor out the padding computation. 
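+  // Worked numeric example (assumed sizes, for illustration only): with
+  // inputRows = outputRows = 32, stride = 1, in_stride = 1 and kernelRows = 5
+  // (so kernelRowsEff = 5), forward_pad_top = (31 + 5 - 32) / 2 = 2, hence
+  // padding_top = 5 - 1 - 2 = 2 and padding_bottom = 32 + 5 - 1 - 31 - 1 - 2 = 2,
+  // i.e. the output_backward patches are extracted with a symmetric 2-pixel border.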
+ const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top; + const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left; + const TensorIndex padding_bottom = inputRows + kernelRowsEff - 1 - (outputRows - 1) * stride - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelColsEff - 1 - (outputCols - 1) * stride - 1 - padding_left; + + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The output_backward has dimensions out_depth X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward (with input as the + // kernel), it will have dimensions + // (out_depth) X (input_rows * input_cols) X (kernel_rows * kernel_cols) X OTHERS + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = inputRows * inputCols; + pre_contract_dims[2] = kernelRows * kernelCols; + pre_contract_dims[3] = 1; + for (int i = 3; i < NumDims; ++i) { + pre_contract_dims[3] *= out.dimension(i); + } + } else { + pre_contract_dims[3] = kernelFilters; + pre_contract_dims[2] = inputRows * inputCols; + pre_contract_dims[1] = kernelRows * kernelCols; + pre_contract_dims[0] = 1; + for (int i = 0; i < NumDims - 3; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // The input has dimensions in_depth X (input_rows * input_cols) X OTHERS + DSizes input_dims; + if (isColMajor) { + input_dims[0] = kernelChannels; + input_dims[1] = inputRows * inputCols; + input_dims[2] = 1; + for (int i = 3; i < NumDims; ++i) { + input_dims[2] *= in.dimension(i); + } + eigen_assert(input_dims[2] == pre_contract_dims[3]); + } else { + input_dims[2] = kernelChannels; + input_dims[1] = inputRows * inputCols; + input_dims[0] = 1; + for (int i = 0; i < NumDims - 3; ++i) { + input_dims[0] *= in.dimension(i); + } + eigen_assert(input_dims[0] == pre_contract_dims[0]); + } + + // We will contract along dimensions (1, 2) in in and (1, 3) in out, if + // this is col-major. + // For row-major, it's dimensions (0, 1) in in and (0, 2) in out. + array, 2> contract_dims; + if (isColMajor) { + // col-major: in.contract(output.patches) + contract_dims[0] = IndexPair(1, 1); + contract_dims[1] = IndexPair(2, 3); + } else { + // row-major: output.patches.contract(in) + contract_dims[0] = IndexPair(0, 0); + contract_dims[1] = IndexPair(2, 1); + } + + // After the contraction, the kernel will have dimension + // in_depth X out_depth X kernel_rows X kernel_cols + // We will need to shuffle the first two dimensions and reverse the latter + // two dimensions. + // The end shape is + // out_depth X in_shape X kernel_rows X kernel_cols + + // This is the shape of the kernel *before* the shuffling. 
+ DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelChannels; + kernel_dims[1] = kernelFilters; + kernel_dims[2] = kernelRows; + kernel_dims[3] = kernelCols; + } else { + kernel_dims[0] = kernelCols; + kernel_dims[1] = kernelRows; + kernel_dims[2] = kernelFilters; + kernel_dims[3] = kernelChannels; + } + + array kernel_shuffle; + if (isColMajor) { + kernel_shuffle[0] = 1; + kernel_shuffle[1] = 0; + kernel_shuffle[2] = 2; + kernel_shuffle[3] = 3; + } else { + kernel_shuffle[0] = 0; + kernel_shuffle[1] = 1; + kernel_shuffle[2] = 3; + kernel_shuffle[3] = 2; + } + + array kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = false; + kernel_reverse[3] = false; + } + + return choose(Cond::Layout == ColMajor>(), + input.reshape(input_dims).contract(output_backward.extract_image_patches(inputRows, inputCols, in_stride, in_stride, 1, 1, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).reshape(pre_contract_dims), contract_dims).reshape(kernel_dims).reverse(kernel_reverse).shuffle(kernel_shuffle), + output_backward.extract_image_patches(inputRows, inputCols, in_stride, in_stride, 1, 1, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).reshape(pre_contract_dims).contract(input.reshape(input_dims), contract_dims).reshape(kernel_dims).reverse(kernel_reverse).shuffle(kernel_shuffle)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h new file mode 100644 index 00000000..dfb9dced --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h @@ -0,0 +1,179 @@ +#ifndef EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H +#define EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H + +#include "Patch3d.h" + +namespace Eigen { + +/** CuboidConvolution + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a 3D convolution over a multichannel input voxel block. + * + * The input parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others). + * The kernel parameter is expected to be a 5D tensor (filters, channels, kernel_depth, kernel_height, kernel_width). + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be filters, depth, height, width (and others if applicable). + * + * The input and kernel have to be in the same layout, and both row-major and + * col-major are supported. The shapes given above are for col-major layout. + * For row-major, all dimensions should be reversed. + * + * It is possible to swap the order of the depth, width, and height dimensions provided that the same order is used in the input, the kernel, and the output. 
+ */ +template +EIGEN_ALWAYS_INLINE +static const typename internal::conditional < + internal::traits::Layout == ColMajor, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array::Index>, 1>, + const TensorReshapingOp< + const DSizes::Index, 2>, + const Kernel>, + const TensorReshapingOp< + const DSizes::Index, 2>, + const TensorVolumePatchOp > > >, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array::Index>, 1>, + const TensorReshapingOp< + const DSizes::Index, 2>, + const TensorVolumePatchOp > , + const TensorReshapingOp< + const DSizes::Index, 2>, + const Kernel> > > >::type +CuboidConvolution(const Input& input, const Kernel& kernel, + const DenseIndex stridePlanes = 1, + const DenseIndex strideRows = 1, + const DenseIndex strideCols = 1, + const PaddingType padding_type = PADDING_SAME) { + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > kern(kernel); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits::Layout == ColMajor); + static const int NumDims = internal::traits::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result. + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[4]; + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[3]; + + // Spatial size of the kernel. + const TensorIndex kernelDepth = isColMajor ? kern.dimensions()[2] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[3] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[4] : kern.dimensions()[0]; + + if (isColMajor) { + eigen_assert(kernelChannels == in.dimension(0)); + } else { + eigen_assert(kernelChannels == in.dimension(NumDims - 1)); + } + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? 
in.dimension(3) : in.dimension(NumDims - 4); + + const float stride_planes_f = static_cast(stridePlanes); + const float stride_rows_f = static_cast(strideRows); + const float stride_cols_f = static_cast(strideCols); + TensorIndex out_depth; + TensorIndex out_height; + TensorIndex out_width; + switch (padding_type) { + case PADDING_VALID: + out_depth = ceil((inputPlanes - kernelDepth + 1.f) / stride_planes_f); + out_height = ceil((inputRows - kernelRows + 1.f) / stride_rows_f); + out_width = ceil((inputCols - kernelCols + 1.f) / stride_cols_f); + break; + case PADDING_SAME: + out_depth = ceil(inputPlanes / stride_planes_f); + out_height = ceil(inputRows / stride_rows_f); + out_width = ceil(inputCols / stride_cols_f); + break; + default: + eigen_assert(false && "unexpected padding"); + } + + DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels * kernelDepth * kernelRows * kernelCols; + } else { + kernel_dims[0] = kernelChannels * kernelDepth * kernelRows * kernelCols; + kernel_dims[1] = kernelFilters; + } + + // Molds the output of the patch extraction result into a 2D tensor: + // - the first dimension (dims[0]): the patch values to be multiplied with the kernels + // - the second dimension (dims[1]): everything else + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelChannels * kernelDepth * kernelRows * kernelCols; + pre_contract_dims[1] = out_depth * out_height * out_width; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[1] *= in.dimension(i); + } + } else { + pre_contract_dims[1] = kernelChannels * kernelDepth * kernelRows * kernelCols; + pre_contract_dims[0] = out_depth * out_height * out_width; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= in.dimension(i); + } + } + + array, 1> contract_dims; + contract_dims[0] = IndexPair(1, 0); + + // Molds the output of the contraction into the shape expected by the user + // (assuming ColMajor): + // - 1st dim: kernel filters + // - 2nd dim: output depth + // - 3nd dim: output height + // - 4rd dim: output width + // - 5th dim and beyond: everything else including batch size + DSizes post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelFilters; + post_contract_dims[1] = out_depth; + post_contract_dims[2] = out_height; + post_contract_dims[3] = out_width; + for (int i = 4; i < NumDims; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelFilters; + post_contract_dims[NumDims - 2] = out_depth; + post_contract_dims[NumDims - 3] = out_height; + post_contract_dims[NumDims - 4] = out_width; + for (int i = 0; i < NumDims - 4; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } + + return choose( + Cond::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input.extract_volume_patches( + kernelDepth, kernelRows, kernelCols, stridePlanes, + strideRows, strideCols, padding_type) + .reshape(pre_contract_dims), + contract_dims) + .reshape(post_contract_dims), + input.extract_volume_patches(kernelDepth, kernelRows, kernelCols, + stridePlanes, strideRows, strideCols, + padding_type) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims) + .reshape(post_contract_dims)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h new file 
mode 100644 index 00000000..89190eb1 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h @@ -0,0 +1,240 @@ +#ifndef EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H +#define EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H + +#if not defined(__CUDACC__) +#include +#endif + +namespace Eigen { +namespace internal { + +/** Extract3DPatches + * \ingroup CXX11_NeuralNetworksModule + * + * \brief Extracts 3D patches from a multichannel input volume. + * + * The input parameter is expected to be a tensor with a rank of 4 or more + * (channels, depth, height, width, optional others in col-major, and the + * reverse order in row-major). + + * The return value will be a tensor of 3 more dimension than the input tensor. + * In col-major, the first 4 dimensions of the result are: channels, patch_depth, + * patch_height, patch_width. The next dimensions will identify the patch + * position on the 3D grid of extracted patches: z, y, x. The remaining + * dimensions, if any, will be the same as the 'other' dimensions of the input + * tensor. + */ + +template +EIGEN_ALWAYS_INLINE static const TensorStridingOp< + const array::Index, + internal::traits::NumDimensions + 3>, + const TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions + 3>, + const TensorPatchOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorPaddingOp< + const array::Index>, + internal::traits::NumDimensions>, + const Input> > > > +Extract3DPatches( + const Input& input, const DenseIndex patchPlanes, + const DenseIndex patchRows, const DenseIndex patchCols, + const DenseIndex stridePlanes, const DenseIndex strideRows, + const DenseIndex strideCols, + const DenseIndex paddingZTop, const DenseIndex paddingZBottom, + const DenseIndex paddingTop, const DenseIndex paddingBottom, + const DenseIndex paddingLeft, const DenseIndex paddingRight, + const typename internal::traits::Scalar padding_value = 0) { + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + static const int NumDims = internal::traits::NumDimensions; + static const int ExtDims = NumDims + 3; + + // Tensor size after patch extraction. We add three dimensions to unpack the + // linear patch index into a 3D grid over which stride() can work. + DSizes pre_stride_dims; + + if (isColMajor) { + pre_stride_dims[0] = in.dimension(0); + pre_stride_dims[1] = patchPlanes; + pre_stride_dims[2] = patchRows; + pre_stride_dims[3] = patchCols; + } else { + pre_stride_dims[ExtDims - 1] = in.dimension(NumDims - 1); + pre_stride_dims[ExtDims - 4] = patchCols; + pre_stride_dims[ExtDims - 3] = patchRows; + pre_stride_dims[ExtDims - 2] = patchPlanes; + } + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4); + + array, NumDims> paddings; + for (int i = 0; i < NumDims; ++i) { + paddings[i] = IndexPair(0, 0); + } + + paddings[isColMajor ? 1 : (NumDims - 2)] = IndexPair(paddingZTop, paddingZBottom); + paddings[isColMajor ? 2 : (NumDims - 3)] = IndexPair(paddingTop, paddingBottom); + paddings[isColMajor ? 
3 : (NumDims - 4)] = IndexPair(paddingLeft, paddingRight); + + pre_stride_dims[isColMajor ? 4 : (ExtDims - 5)] = inputPlanes + paddingZBottom + paddingZTop - patchPlanes + 1; + pre_stride_dims[isColMajor ? 5 : (ExtDims - 6)] = inputRows + paddingTop + paddingBottom - patchRows + 1; + pre_stride_dims[isColMajor ? 6 : (ExtDims - 7)] = inputCols + paddingLeft + paddingRight - patchCols + 1; + + if (isColMajor) { + for (int i = 7; i < NumDims + 3; ++i) { + pre_stride_dims[i] = in.dimension(i - 3); + } + } else { + for (int i = 0; i < NumDims - 4; ++i) { + pre_stride_dims[i] = in.dimension(i); + } + } + + DSizes patch_dims; + if (isColMajor) { + patch_dims[0] = in.dimension(0); + patch_dims[1] = patchPlanes; + patch_dims[2] = patchRows; + patch_dims[3] = patchCols; + for (int i = 4; i < NumDims; ++i) { + patch_dims[i] = 1; + } + } else { + patch_dims[NumDims - 1] = in.dimension(NumDims - 1); + patch_dims[NumDims - 4] = patchCols; + patch_dims[NumDims - 3] = patchRows; + patch_dims[NumDims - 2] = patchPlanes; + for (int i = 0; i < NumDims - 4; i++) { + patch_dims[i] = 1; + } + } + + array strides; + if (isColMajor) { + // No striding within the patches. + for (int i = 0; i < 4; ++i) { + strides[i] = 1; + } + // Apply striding in the spatial patch grid dimensions only. + strides[4] = stridePlanes; + strides[5] = strideRows; + strides[6] = strideCols; + // No striding in the remaining dimensions (batches, ...). + for (int i = 7; i < NumDims + 3; i++) { + strides[i] = 1; + } + } else { + // No striding within the patches. + for (int i = 1; i <= 4; ++i) { + strides[ExtDims - i] = 1; + } + // Apply striding in the spatial patch grid dimensions only. + strides[ExtDims - 7] = strideCols; + strides[ExtDims - 6] = strideRows; + strides[ExtDims - 5] = stridePlanes; + // No striding in the remaining dimensions (batches, ...). + for (int i = 0; i < NumDims - 4; i++) { + strides[i] = 1; + } + } + + // TODO(mjanusz): Consider getting rid of pad(), and stride() and extend + // extract_patches to take additional parameters for padding/striding, + // similarly to etract_image_patches. + return input.pad(paddings, padding_value).extract_patches(patch_dims).reshape(pre_stride_dims).stride(strides); +} + + +template +EIGEN_ALWAYS_INLINE static const TensorStridingOp< + const array::Index, + internal::traits::NumDimensions + 3>, + const TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions + 3>, + const TensorPatchOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorPaddingOp< + const array::Index>, + internal::traits::NumDimensions>, + const Input> > > > +Extract3DPatches( + const Input& input, const DenseIndex patchPlanes, + const DenseIndex patchRows, const DenseIndex patchCols, + const DenseIndex stridePlanes, const DenseIndex strideRows, + const DenseIndex strideCols, const PaddingType padding_type, + const typename internal::traits::Scalar padding_value = 0) { + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + static const int NumDims = internal::traits::NumDimensions; + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? 
in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4); + + switch (padding_type) { + case PADDING_VALID: + // No padding in any dimension. + return Extract3DPatches(input, patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + 0, 0, 0, 0, 0, 0, padding_value); + case PADDING_SAME: { + // The side of the tensor before striding should be just the expected + // output times the stride. + const TensorIndex size_z = ceil(inputPlanes / static_cast(stridePlanes)) * stridePlanes; + const TensorIndex size_y = ceil(inputRows / static_cast(strideRows)) * strideRows; + const TensorIndex size_x = ceil(inputCols / static_cast(strideCols)) * strideCols; + + // The size of the patch space is going to be: padded_input_size - patch_size + 1. + // This has to match the expected size before striding (pre_stride_dims). + // The deltas below extend the input to the expected size. + const TensorIndex dz = size_z + patchPlanes - 1 - inputPlanes; + const TensorIndex dy = size_y + patchRows - 1 - inputRows; + const TensorIndex dx = size_x + patchCols - 1 - inputCols; + + return Extract3DPatches(input, patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + dz - dz / 2, dz / 2, + dy - dy / 2, dy / 2, + dx - dx / 2, dx / 2, + padding_value); + } + default: + eigen_assert(false && "unexpected padding"); + // unreachable code to avoid missing return warning. + return Extract3DPatches(input, patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + 0, 0, 0, 0, 0, 0, padding_value); + } +} + +// TODO(mjanusz): Switch this to a 'using' alias once CUDA supports C++11. +template +struct Extract3DPatchesType { + typedef const TensorStridingOp< const array::Index, internal::traits::NumDimensions + 3>, + const TensorReshapingOp< const DSizes::Index, internal::traits::NumDimensions + 3>, + const TensorPatchOp< const DSizes::Index, internal::traits::NumDimensions>, + const TensorPaddingOp< const array< IndexPair::Index>, internal::traits::NumDimensions>, + const Input> > > > type; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h new file mode 100644 index 00000000..942b060b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h @@ -0,0 +1,433 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H +#define EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H + +#include "Patch3d.h" + +namespace Eigen { + +/** SpatialMaxPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a max-pooling over a multichannel input image. + * + * The input parameter is expected to be a with a rank of 4 (channels, height, width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, height, width, and others (in col-major, and the reverse of that if the input was row-major). 
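 *
 * A minimal usage sketch (illustrative only; assumes the unsupported CXX11
 * Tensor and NeuralNetworks headers are included, and the dimension values
 * below are arbitrary):
 * \code
 * Eigen::Tensor<float, 4> input(3, 11, 11, 7);   // channels, rows, cols, batch (col-major)
 * input.setRandom();
 * Eigen::Tensor<float, 4> pooled =
 *     Eigen::SpatialMaxPooling(input, 2, 2,      // patchRows, patchCols
 *                              2, 2,             // strideRows, strideCols
 *                              Eigen::PADDING_VALID);
 * // pooled has dimensions (3, 5, 5, 7): ceil((11 - 2 + 1) / 2) = 5 per spatial dim.
 * \endcode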
+ * + * The order of the width and height dimensions can be swapped if needed. + * +*/ +#if !defined(EIGEN_HAS_INDEX_LIST) +template +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorReductionOp::Scalar>::type>, const Eigen::array, const TensorImagePatchOp > > +#else +template +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorReductionOp::Scalar>::type>, typename internal::conditional::Layout == ColMajor, const Eigen::IndexList, Eigen::type2index<2> >, const Eigen::IndexList, Eigen::type2index<3> > >::type, const TensorImagePatchOp > > +#endif +SpatialMaxPooling(const Input& input, DenseIndex patchRows, DenseIndex patchCols, + DenseIndex strideRows, DenseIndex strideCols, const PaddingType padding_type, + DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1) +{ + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + + const DenseIndex patchRowsEff = patchRows + (patchRows - 1) * (in_strideRows - 1); + const DenseIndex patchColsEff = patchCols + (patchCols - 1) * (in_strideCols - 1); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + static const int idxRows = isColMajor ? 1 : 2; + static const int idxCols = isColMajor ? 2 : 1; + + // Molds the output of the reduction into the shape expected by the user. + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + Eigen::DSizes::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRowsEff + 1.f) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchColsEff + 1.f) / static_cast(strideCols)); + } else { + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast(strideCols)); + } + post_reduce_dims[3] = in.dimension(3); + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array reduction_dims; + if (isColMajor) { + reduction_dims[0] = 1; + reduction_dims[1] = 2; + } else { + reduction_dims[0] = 2; + reduction_dims[1] = 3; + } +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + typename internal::conditional::Layout == ColMajor, const Eigen::IndexList, Eigen::type2index<2> >, const Eigen::IndexList, Eigen::type2index<3> > >::type reduction_dims; +#endif + + return input.extract_image_patches(patchRows, patchCols, strideRows, strideCols, in_strideRows, in_strideCols, padding_type, -Eigen::NumTraits::Scalar>::type>::highest()).maximum(reduction_dims).reshape(post_reduce_dims); +} + +/** CuboidMaxPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a max-pooling over a multichannel input volume. + * + * The input parameter is expected to be a tensor with a rank of 5 (channels, depth, height, width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. 
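 * For instance (a minimal sketch; the dimension values are arbitrary):
 * \code
 * Eigen::Tensor<float, 5> input(3, 8, 8, 8, 2);  // channels, depth, rows, cols, batch (col-major)
 * input.setRandom();
 * Eigen::Tensor<float, 5> pooled =
 *     Eigen::CuboidMaxPooling(input, 2, 2, 2,    // patchPlanes, patchRows, patchCols
 *                             2, 2, 2,           // stridePlanes, strideRows, strideCols
 *                             Eigen::PADDING_VALID);
 * // pooled has dimensions (3, 4, 4, 4, 2): ceil((8 - 2 + 1) / 2) = 4 per pooled dim.
 * \endcode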
The dimensions of the result will be channels, depth, height, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the depth, width and height dimensions can be swapped if needed. + * +*/ +#if !defined(EIGEN_HAS_INDEX_LIST) +template +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes::NumDimensions>, + const TensorReductionOp< + internal::MaxReducer, const Eigen::array, + const TensorReshapingOp< + const Eigen::DSizes, + const TensorVolumePatchOp > > > +#else +template +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes::NumDimensions>, + const TensorReductionOp< + internal::MaxReducer, + const Eigen::IndexList >, + const TensorReshapingOp< + const Eigen::DSizes, + const TensorVolumePatchOp > > > +#endif +CuboidMaxPooling(const Input& input, DenseIndex patchPlanes, + DenseIndex patchRows, DenseIndex patchCols, + DenseIndex stridePlanes, DenseIndex strideRows, + DenseIndex strideCols, const PaddingType padding_type) { + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 5, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + + static const int idxPlanes = isColMajor ? 1 : 3; + static const int idxRows = 2; + static const int idxCols = isColMajor ? 3 : 1; + + // Molds the output of the reduction into the shape expected by the used + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output depth + // - 3rd dim: output height + // - 4th dim: output width + // - 5th dim and beyond: everything else including batch size + Eigen::DSizes::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxPlanes] = numext::ceil((in.dimension(idxPlanes) - patchPlanes + 1.f) / static_cast(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRows + 1.f) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchCols + 1.f) / static_cast(strideCols)); + } else { + post_reduce_dims[idxPlanes] = numext::ceil(in.dimension(idxPlanes) / static_cast(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast(strideCols)); + } + post_reduce_dims[4] = in.dimension(4); + + Eigen::DSizes pre_reduce_dims; + pre_reduce_dims[1] = patchRows * patchCols * patchPlanes; + if (isColMajor) { + pre_reduce_dims[0] = post_reduce_dims[0]; + pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3] * post_reduce_dims[4]; + } else { + pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3]; + pre_reduce_dims[2] = post_reduce_dims[4]; + } + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array reduction_dims; + reduction_dims[0] = 1; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. 
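  // The return expression below works as follows: extract_volume_patches()
  // yields one patch per output location (padded entries are filled with
  // -highest, so they can never win the max), reshape(pre_reduce_dims)
  // flattens each patch into dimension 1, maximum(reduction_dims) collapses
  // that dimension, and the final reshape restores the col-major
  // (channels, depth, height, width, batch) output layout.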
+ Eigen::IndexList > reduction_dims; +#endif + return input.extract_volume_patches(patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + padding_type, -Eigen::NumTraits::highest()) + .reshape(pre_reduce_dims) + .maximum(reduction_dims) + .reshape(post_reduce_dims); +} + + +/** SpatialAvgPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies an average pooling over a multichannel input image. + * + * The input parameter is expected to be a tensor with a rank of 4 (channels, height, width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, height, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the width and height dimensions can be swapped if needed. + * +*/ +namespace internal { + +template struct AvgPoolMeanReducer +{ +#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__) + // We only support packet access for floats. + static const bool PacketAccess = internal::is_same::value; +#else + static const bool PacketAccess = false; +#endif + static const bool IsStateful = true; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE AvgPoolMeanReducer() : scalarCount_(0) { + typedef typename packet_traits::type Packet; + packetCount_ = pset1(0.0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + if (t != -Eigen::NumTraits::highest()) { + (*accum) = (*accum) + t; + scalarCount_++; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast(0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + eigen_assert(scalarCount_ > 0); + return accum / scalarCount_; + } + +#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__) +#ifdef EIGEN_VECTORIZE_AVX +#define pequal(a,b) _mm256_cmp_ps(a,b,_CMP_EQ_UQ) +#define psel(a,b,false_mask) _mm256_blendv_ps(a,b,false_mask) +#else +#define pequal(a,b) _mm_cmpeq_ps(a,b) +#define psel(a,b,false_mask) _mm_or_ps(_mm_andnot_ps(false_mask, a), _mm_and_ps(false_mask, b)) +#endif + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + reducePacketWithType(static_cast(0), p, accum); + } + + template + void reducePacketWithType(T, const Packet& p, Packet* accum) { + Packet skip_mask = pequal(p, pset1(-Eigen::NumTraits::highest())); + (*accum) = padd(*accum, psel(p, pset1(0), skip_mask)); + packetCount_ = padd(packetCount_, psel(pset1(1), pset1(0), skip_mask)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(0); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, packetCount_); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (saccum + predux(vaccum)) / (scalarCount_ + predux(packetCount_)); + } +#endif + + protected: + typedef typename packet_traits::type Packet; + int scalarCount_; + Packet packetCount_; +}; + +} // namespace internal + +#if !defined(EIGEN_HAS_INDEX_LIST) +template +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorReductionOp::Scalar>::type>, const Eigen::array, const TensorImagePatchOp > > +#else +template +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp::Index, 
internal::traits::NumDimensions>, const TensorReductionOp::Scalar>::type>, typename internal::conditional::Layout == ColMajor, const Eigen::IndexList, Eigen::type2index<2> >, const Eigen::IndexList, Eigen::type2index<3> > >::type, const TensorImagePatchOp > > +#endif +SpatialAvgPooling(const Input& input, DenseIndex patchRows, DenseIndex patchCols, + DenseIndex strideRows, DenseIndex strideCols, const PaddingType padding_type, + DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1) +{ + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + + const DenseIndex patchRowsEff = patchRows + (patchRows - 1) * (in_strideRows - 1); + const DenseIndex patchColsEff = patchCols + (patchCols - 1) * (in_strideCols - 1); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + static const int idxRows = isColMajor ? 1 : 2; + static const int idxCols = isColMajor ? 2 : 1; + + // Molds the output of the reduction into the shape expected by the user. + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + Eigen::DSizes::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRowsEff + 1.f) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchColsEff + 1.f) / static_cast(strideCols)); + } else { + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast(strideCols)); + } + post_reduce_dims[3] = in.dimension(3); + + typedef typename internal::remove_const::Scalar>::type CoeffReturnType; + internal::AvgPoolMeanReducer mean_with_nan; + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array reduction_dims; + if (isColMajor) { + reduction_dims[0] = 1; + reduction_dims[1] = 2; + } else { + reduction_dims[0] = 2; + reduction_dims[1] = 3; + } +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + typename internal::conditional::Layout == ColMajor, const Eigen::IndexList, Eigen::type2index<2> >, const Eigen::IndexList, Eigen::type2index<3> > >::type reduction_dims; +#endif + return input.extract_image_patches(patchRows, patchCols, strideRows, strideCols, in_strideRows, in_strideCols, padding_type, -Eigen::NumTraits::Scalar>::type>::highest()).reduce(reduction_dims, mean_with_nan).reshape(post_reduce_dims); +} + + +/** CuboidAvgPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies an average pooling over a multichannel input volume. + * + * The input parameter is expected to be a tensor with a rank of 5 (channels, depth, height, width, others, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, depth, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the depth, width and height dimensions can be swapped if needed. 
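 *
 * A minimal usage sketch (illustrative only; the dimension values are arbitrary):
 * \code
 * Eigen::Tensor<float, 5> input(3, 8, 8, 8, 2);  // channels, depth, rows, cols, batch (col-major)
 * input.setRandom();
 * Eigen::Tensor<float, 5> averaged =
 *     Eigen::CuboidAvgPooling(input, 2, 2, 2,    // patchPlanes, patchRows, patchCols
 *                             2, 2, 2,           // stridePlanes, strideRows, strideCols
 *                             Eigen::PADDING_SAME);
 * // averaged has dimensions (3, 4, 4, 4, 2); with PADDING_SAME, border patches
 * // are averaged only over the entries that actually fall inside the input.
 * \endcode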
+ * +*/ +#if !defined(EIGEN_HAS_INDEX_LIST) +template +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes::NumDimensions>, + const TensorReductionOp< + internal::AvgPoolMeanReducer, const Eigen::array, + const TensorReshapingOp< + const Eigen::DSizes, + const TensorVolumePatchOp > > > +#else +template +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes::NumDimensions>, + const TensorReductionOp< + internal::AvgPoolMeanReducer, + const Eigen::IndexList >, + const TensorReshapingOp< + const Eigen::DSizes, + const TensorVolumePatchOp > > > +#endif +CuboidAvgPooling(const Input& input, DenseIndex patchPlanes, + DenseIndex patchRows, DenseIndex patchCols, + DenseIndex stridePlanes, DenseIndex strideRows, + DenseIndex strideCols, const PaddingType padding_type) { + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 5, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + + static const int idxPlanes = isColMajor ? 1 : 3; + static const int idxRows = 2; + static const int idxCols = isColMajor ? 3 : 1; + // Molds the output of the reduction into the shape expected by the used + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: outupt depth + // - 3rd dim: output height + // - 4th dim: output width + // - 5th dim and beyond: everything else including batch size + Eigen::DSizes::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxPlanes] = numext::ceil((in.dimension(idxPlanes) - patchPlanes + 1.f) / static_cast(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRows + 1.f) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchCols + 1.f) / static_cast(strideCols)); + } else { + post_reduce_dims[idxPlanes] = numext::ceil(in.dimension(idxPlanes) / static_cast(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast(strideCols)); + } + post_reduce_dims[4] = in.dimension(4); + + Eigen::DSizes pre_reduce_dims; + pre_reduce_dims[1] = patchRows * patchCols * patchPlanes; + if (isColMajor) { + pre_reduce_dims[0] = post_reduce_dims[0]; + pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3] * post_reduce_dims[4]; + } else { + pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3]; + pre_reduce_dims[2] = post_reduce_dims[4]; + } + + typedef typename internal::remove_const::Scalar>::type CoeffReturnType; + internal::AvgPoolMeanReducer mean_with_nan; + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array reduction_dims; + reduction_dims[0] = 1; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. 
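  // As in CuboidMaxPooling, padded positions are filled with -highest; the
  // AvgPoolMeanReducer treats that value as a sentinel and excludes it from
  // both the running sum and the element count, so each output is the mean of
  // only the in-bounds entries of its patch.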
+ Eigen::IndexList > reduction_dims; +#endif + return input.extract_volume_patches(patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + padding_type, -Eigen::NumTraits::highest()) + .reshape(pre_reduce_dims) + .reduce(reduction_dims, mean_with_nan) + .reshape(post_reduce_dims); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h new file mode 100644 index 00000000..f0e21ab9 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h @@ -0,0 +1,83 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H +#define EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H + +namespace Eigen { + +/** SoftMax + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a softmax + * + * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other). + * + * The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order. + * +*/ + +namespace { +class SoftmaxOp { + public: + EIGEN_ALWAYS_INLINE SoftmaxOp(const float beta) : beta_(beta) { } + + template EIGEN_ALWAYS_INLINE + typename Input::Dimensions dimensions(const Input& input) const { + return input.dimensions(); + } + + template + void eval(const Input& input, Output& output, const Device& device) const + { +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array::Index, 1> depth_dim; + depth_dim[0] = 0; + Eigen::array::Index, 2> bcast; + bcast[0] = dimensions(input)[0]; + bcast[1] = 1; + DSizes::Index, 2> dims2d; + dims2d[0] = 1; + dims2d[1] = dimensions(input)[1]; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. 
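      // The two device-assignments below compute, for every column j of the
      // rank-2 input (depth runs along dimension 0):
      //   output(i, j) = exp(beta * (input(i, j) - max_k input(k, j)))
      //                  / sum_k exp(beta * (input(k, j) - max_k input(k, j)))
      // i.e. a numerically stable softmax along the depth dimension, scaled by beta.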
+ Eigen::IndexList> depth_dim; + Eigen::IndexList> bcast; + bcast.set(0, dimensions(input)[0]); + Eigen::IndexList, typename internal::traits::Index> dims2d; + dims2d.set(1, dimensions(input)[1]); +#endif + + output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp(); + output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + } + + private: + const float beta_; +}; +} + + +template +EIGEN_ALWAYS_INLINE +static const TensorCustomUnaryOp +SoftMax(const Input& input, const float beta) +{ + EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const SoftmaxOp op(beta); + return input.customOp(op); +} + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h new file mode 100644 index 00000000..8e2ddca6 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h @@ -0,0 +1,775 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H + +namespace Eigen { + +namespace internal { + +// These optimizations require vector instructions +#ifdef EIGEN_VECTORIZE + +// TODO: Consolidate this part of the code with the image patch extraction code +// since they are both very similar. 
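// The TensorContractionInputMapper specialization below lets the contraction
// kernel read image-patch values on the fly from the original input tensor,
// rather than materializing the full patch tensor in memory. Linearized patch
// indices are decomposed into depth/row/column offsets with precomputed
// TensorIntDivisor helpers, and positions that fall into the padding region
// simply yield zero.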
+template +class TensorContractionInputMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> +{ + public: + typedef TensorContractionInputMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self; + typedef TensorContractionSubMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper; + typedef SubMapper VectorMapper; + typedef SubMapper LinearMapper; + typedef Scalar_ Scalar; + typedef typename packet_traits::type Packet; + + TensorContractionInputMapper(const TensorEvaluator >, Device>& tensor, + const nocontract_t&, const nocontract_t&, + const contract_t&, const contract_t&) + : m_impl(tensor.impl().impl()) + { + Index patch_rows; + Index patch_depth; + if (internal::traits::Layout == ColMajor) { + patch_depth = tensor.impl().dimensions()[0]; + patch_rows = tensor.impl().dimensions()[1]; + m_patch_cols = tensor.impl().dimensions()[2]; + m_num_patches = tensor.impl().dimensions()[3]; + } else { + static const int NumDims = tensor.impl().dimensions().size(); + patch_depth = tensor.impl().dimensions()[NumDims - 1]; + patch_rows = tensor.impl().dimensions()[NumDims - 2]; + m_patch_cols = tensor.impl().dimensions()[NumDims - 3]; + m_num_patches = tensor.impl().dimensions()[NumDims - 4]; + } + m_patch_row_inflate_strides = tensor.impl().rowInflateStride(); + m_patch_col_inflate_strides = tensor.impl().colInflateStride(); + + m_colStride = patch_rows; + + m_outputRows = tensor.impl().outputRows(); + m_row_strides = tensor.impl().userRowStride(); + m_col_strides = tensor.impl().userColStride(); + + m_in_row_strides = tensor.impl().userInRowStride(); + m_in_col_strides = tensor.impl().userInColStride(); + + if (internal::traits::Layout == ColMajor) { + m_inputRows = tensor.impl().impl().dimensions()[1]; + m_inputCols = tensor.impl().impl().dimensions()[2]; + } else { + static const int NumDims = tensor.impl().impl().dimensions().size(); + m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2]; + m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3]; + } + + m_rowInputStride = patch_depth; + m_colInputStride = patch_depth * m_inputRows; + m_patchInputStride = patch_depth * m_inputRows * m_inputCols; + + m_rowPaddingTop = tensor.impl().rowPaddingTop(); + m_colPaddingLeft = tensor.impl().colPaddingLeft(); + + m_fastInputRowStride = internal::TensorIntDivisor(m_patch_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor(m_patch_col_inflate_strides); + m_fastNumPatches = internal::TensorIntDivisor(m_num_patches); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); + m_fastDimZero = internal::TensorIntDivisor(patch_depth); + } + + TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper) : + m_impl(base_mapper.m_impl) { + m_patch_cols = base_mapper.m_patch_cols; + m_num_patches = base_mapper.m_num_patches; + m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides; + m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides; + + m_colStride = base_mapper.m_colStride; + + m_rowInputStride = base_mapper.m_rowInputStride; + m_colInputStride = base_mapper.m_colInputStride; + m_patchInputStride = base_mapper.m_patchInputStride; + + m_inputRows = base_mapper.m_inputRows; + m_inputCols = base_mapper.m_inputCols; + + m_outputRows = base_mapper.m_outputRows; + m_row_strides = 
base_mapper.m_row_strides; + m_col_strides = base_mapper.m_col_strides; + + m_in_row_strides = base_mapper.m_in_row_strides; + m_in_col_strides = base_mapper.m_in_col_strides; + + m_rowPaddingTop = base_mapper.m_rowPaddingTop; + m_colPaddingLeft = base_mapper.m_colPaddingLeft; + + m_fastInputRowStride = base_mapper.m_fastInputRowStride; + m_fastInputColStride = base_mapper.m_fastInputColStride; + m_fastNumPatches = base_mapper.m_fastNumPatches; + m_fastColStride = base_mapper.m_fastColStride; + m_fastOutputRows = base_mapper.m_fastOutputRows; + m_fastDimZero = base_mapper.m_fastDimZero; + } + + // If true, turns off some optimizations for loading packets since the image + // patches are "non-standard" such as there are non-trivial strides or + // inflations in the input. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool nonStandardPatches() const { + return m_in_row_strides != 1 || m_in_col_strides != 1 || m_patch_row_inflate_strides != 1 || m_patch_col_inflate_strides != 1; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const { + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(0, rowIndex, colIndex, otherIndex); + return loadCoeff(row, rowIndex, colIndex, otherIndex); + } + + // Load the coefficient at the patchIndex location instead of the usual m_rowIndex, + // m_colIndex, m_otherIndex. This is currently only used by the gpu code. EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const { + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex); + return loadCoeff(row, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const { + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(0, rowIndex, colIndex, otherIndex); + return loadPacket(row, rowIndex, colIndex, otherIndex); + } + + // Load the packet at the patchIndex location instead of the usual m_rowIndex, + // m_colIndex, m_otherIndex. This is currently only used by the gpu code. 
+ EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const { + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex); + return loadPacket(row, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE const TensorEvaluator& impl() const { return m_impl; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_rowInputStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRows() const { return m_colStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, const Index baseIndex) const { + const Index inputIndex = depth + baseIndex; + return m_impl.template packet(inputIndex); + } + + private: + friend class TensorContractionSubMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex + colOffset * m_in_col_strides; + const Index origInputCol = (m_patch_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex + rowOffset * m_in_row_strides; + const Index origInputRow = (m_patch_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (origInputCol < 0 | origInputRow < 0 | origInputCol >= m_inputCols | origInputRow >= m_inputRows | + (inputCol != origInputCol * m_patch_col_inflate_strides) | (inputRow != origInputRow * m_patch_row_inflate_strides)) { + return Scalar(0); + } + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + eigen_assert(!nonStandardPatches()); + + // Find the offset of the element wrt the location of the first element. 
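    // patchId enumerates the patch depth-major within each (row, col) position:
    //   patchId = depth + patchDepth() * (rowOffset + patchRows() * colOffset)
    // so dividing by patchDepth() (m_fastDimZero) and then by patchRows()
    // (m_fastColStride) recovers the column and row offsets inside the patch.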
+ const Index patchOffset = patchId / m_fastDimZero; + + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex + colOffset; + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex + rowOffset; + if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 || inputRow >= m_inputRows) { + return Scalar(0); + } + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + const Index packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < patchDepth()*patchRows()*m_patch_cols); + + if (nonStandardPatches()) { + return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); + } + return loadPacketStandard(patchId, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + const Index packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < patchDepth()*patchRows()*m_patch_cols); + + eigen_assert(!nonStandardPatches()); + + if ((patchDepth() % packetSize) == 0) { + return loadPacketFast(patchId, rowIndex, colIndex, otherIndex); + } + else { + const Index patchOffsets[2] = {patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero}; + + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + const Index inputCols[2] = {colIndex + colOffsets[0], colIndex + colOffsets[1]}; + if (inputCols[0] >= m_inputCols | inputCols[1] < 0) { + // all zeros + return internal::pset1(Scalar(0)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + const Index inputRows[2] = {rowIndex + rowOffsets[0], rowIndex + rowOffsets[1]}; + + if (inputRows[0] >= m_inputRows | inputRows[1] < 0) { + // all zeros + return internal::pset1(Scalar(0)); + } + + if (inputRows[0] >= 0 & inputRows[1] < m_inputRows) { + // no padding + const Index depth = patchId - patchOffsets[0] * patchDepth(); + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; + return m_impl.template packet(inputIndex); + } + } + } + return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + const Index packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < patchDepth()*patchRows()*m_patch_cols); + + eigen_assert(!nonStandardPatches()); + eigen_assert((patchDepth() % packetSize) == 0); + // Find the offset of the element wrt the location of the first element. 
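    // Because patchDepth() is a multiple of the packet size (asserted above)
    // and the whole packet stays within a single (row, col) position of the
    // patch (asserted just below), one bounds check covers every lane of the
    // packet.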
+ const Index patchOffset = patchId / m_fastDimZero; + eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset); + + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex + colOffset; + const Index rowOffset = patchOffset - colOffset*m_colStride; + const Index inputRow = rowIndex + rowOffset; + if (inputCol < 0 | inputRow < 0 | inputCol >= m_inputCols | inputRow >= m_inputRows) { + // all zeros + return internal::pset1(Scalar(0)); + } + // no padding + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + return m_impl.template packet(inputIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetWithPossibleZero(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = loadCoeff(patchId+i, rowIndex, colIndex, otherIndex); + } + Packet rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void computeBaseIndices(Index patchIndex, Index& rowIndex, Index& colIndex, Index& otherIndex) const { + const int NumInputDims = array_size::Dimensions>::value; + otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches; + const Index patch2DIndex = (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches); + otherIndex *= m_patchInputStride; + colIndex = patch2DIndex / m_fastOutputRows; + rowIndex = patch2DIndex - colIndex * m_outputRows; + colIndex = colIndex * m_col_strides - m_colPaddingLeft; + rowIndex = rowIndex * m_row_strides - m_rowPaddingTop; + } + + Index m_patch_cols; // number of colums in the patch + Index m_num_patches; // number of patches to extract. + Index m_patch_row_inflate_strides; // the strides for row inflation in the image patch + Index m_patch_col_inflate_strides; // the strides for col inflation in the image patch + // Fast representation of inflation strides. 
+ internal::TensorIntDivisor m_fastInputRowStride; + internal::TensorIntDivisor m_fastInputColStride; + + Index m_otherStride; + Index m_colStride; + internal::TensorIntDivisor m_fastNumPatches; + internal::TensorIntDivisor m_fastColStride; + + Index m_rowInputStride; // row stride in the input tensor + Index m_colInputStride; // col stride in the input tensor + Index m_patchInputStride; // patch stride in the input tensor + + Index m_inputRows; // Number of rows in the input tensor + Index m_inputCols; // Number of cols in the input tensor + + Index m_outputRows; // Number of patch rows + + Index m_row_strides; // User specified row stride + Index m_col_strides; // User specified col stride + + Index m_in_row_strides; // User specified input row stride + Index m_in_col_strides; // User specified input col stride + + Index m_rowPaddingTop; // Row padding + Index m_colPaddingLeft; // Column padding + + internal::TensorIntDivisor m_fastOutputRows; + internal::TensorIntDivisor m_fastDimZero; + + const TensorEvaluator m_impl; +}; + + +template +class TensorContractionSubMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> +{ + public: + typedef Scalar_ Scalar; + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef TensorContractionInputMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper; + typedef TensorContractionSubMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_depth_offset(vert_offset), m_col_offset(horiz_offset) { + m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const Self& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper.m_base_mapper), m_depth_offset(vert_offset+base_mapper.m_depth_offset), m_col_offset(horiz_offset+base_mapper.m_col_offset) { + m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper.loadCoeff(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_depth_offset, j + m_col_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.loadPacket(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return m_base_mapper.template loadPacket(i + m_depth_offset, j + m_col_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar loadCoeffStandard(Index i) const { + return m_base_mapper.loadCoeffStandard(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index i) const { + return m_base_mapper.loadPacketFast(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index i) const { + return m_base_mapper.loadPacketStandard(i + m_depth_offset, 
m_rowIndex, m_colIndex, m_otherIndex); + } + template + EIGEN_DEVICE_FUNC bool aligned(Index) const { + return false; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool nonStandardPatches() const { + return m_base_mapper.nonStandardPatches(); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_base_mapper.m_rowInputStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRows() const { return m_base_mapper.m_colStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchCols() const { return m_base_mapper.m_patch_cols; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, const Index baseIndex) const { + const Index inputIndex = depth + baseIndex; + return m_base_mapper.m_impl.template packet(inputIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padRow(const Index row) const { + const Index r = m_rowIndex + row; + return r < 0 | r >= m_base_mapper.m_inputRows; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padCol(const Index col) const { + const Index c = m_colIndex + col; + return c < 0 | c >= m_base_mapper.m_inputCols; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index baseIndex(const Index row, const Index col) const { + const Index r = m_rowIndex + row; + const Index c = m_colIndex + col; + return r * m_base_mapper.m_rowInputStride + c * m_base_mapper.m_colInputStride + m_otherIndex; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index rowOffset() const { + const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero; + const Index colOffset = patchOffset / m_base_mapper.m_fastColStride; + return patchOffset-colOffset*m_base_mapper.m_colStride; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index colOffset() const { + const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero; + const Index colOffset = patchOffset / m_base_mapper.m_fastColStride; + return colOffset; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index depthOffset() const { + const Index patchOffset = m_depth_offset % m_base_mapper.patchDepth(); + return patchOffset; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_depth_offset, j + m_col_offset); + } + + private: + const ParentMapper& m_base_mapper; // that was a reference before + Index m_depth_offset; // First row in the input matrix + Index m_col_offset; // First col in the input matrix + + Index m_rowIndex; // precomputed row index corresponding to the col offset + Index m_colIndex; // precomputed col index corresponding to the col offset + Index m_otherIndex; // precomputed other index corresponding to the col offset + +}; + + +template +struct gemm_pack_rhs >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>, nr, ColMajor, false, false> { + + typedef TensorContractionSubMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper; + typedef SubMapper DataMapper; + + static inline Index ceil_div(Index a, Index b) { + return (a + b - 1) / b; + } + + EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0) const { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + typedef typename DataMapper::LinearMapper LinearMapper; + typedef typename packet_traits::type Packet; + + const Index packet_cols4 = 
(cols/4) * 4; + const Index peeled_k = (depth/packet_size) * packet_size; + const bool non_standard_patches = rhs.nonStandardPatches(); + + for(Index j2=0; j2(ceil_div(peeled_k, patch_rows*patch_depth)+startCol, patch_cols); + + for (Index c = startCol; c < max_cols; ++c) { + eigen_assert(k < peeled_k); + const Index startRow = (c == startCol) ? rhs.rowOffset() : 0; + const Index max_rows = std::min(ceil_div(peeled_k-c*patch_rows*patch_depth, patch_depth)+startRow, patch_rows); + + const bool pad_col0 = dm0.padCol(c); + const bool pad_col1 = dm1.padCol(c); + const bool pad_col2 = dm2.padCol(c); + const bool pad_col3 = dm3.padCol(c); + for (Index r = startRow; r < max_rows; ++r) { + eigen_assert(k < peeled_k); + const bool pad0 = pad_col0 || dm0.padRow(r); + const bool pad1 = pad_col1 || dm1.padRow(r); + const bool pad2 = pad_col2 || dm2.padRow(r); + const bool pad3 = pad_col3 || dm3.padRow(r); + + const Index idx0 = dm0.baseIndex(r, c); + const Index idx1 = dm1.baseIndex(r, c); + const Index idx2 = dm2.baseIndex(r, c); + const Index idx3 = dm3.baseIndex(r, c); + + const Index startDepth = ((c == startCol) && (r == startRow)) ? rhs.depthOffset() : 0; + const Index max_depth = std::min(peeled_k-c*patch_rows*patch_depth-r*patch_depth+startDepth, patch_depth); + eigen_assert(max_depth % packet_size == 0); + for (Index d = startDepth; d < max_depth; d += packet_size) { + eigen_assert(k < peeled_k); + PacketBlock kernel; + kernel.packet[0] = pad0 ? pset1(0) : rhs.packetNoPadding(d, idx0); + kernel.packet[1] = pad1 ? pset1(0) : rhs.packetNoPadding(d, idx1); + kernel.packet[2] = pad2 ? pset1(0) : rhs.packetNoPadding(d, idx2); + kernel.packet[3] = pad3 ? pset1(0) : rhs.packetNoPadding(d, idx3); + ptranspose(kernel); + pstoreu(block+0*packet_size, kernel.packet[0]); + pstoreu(block+1*packet_size, kernel.packet[1]); + pstoreu(block+2*packet_size, kernel.packet[2]); + pstoreu(block+3*packet_size, kernel.packet[3]); + block+=4*packet_size; + k += packet_size; + } + } + } + + for(; k kernel; + kernel.packet[0] = dm0.loadPacketFast(k); + kernel.packet[1] = dm1.loadPacketFast(k); + kernel.packet[2] = dm2.loadPacketFast(k); + kernel.packet[3] = dm3.loadPacketFast(k); + ptranspose(kernel); + pstoreu(block+0*packet_size, kernel.packet[0]); + pstoreu(block+1*packet_size, kernel.packet[1]); + pstoreu(block+2*packet_size, kernel.packet[2]); + pstoreu(block+3*packet_size, kernel.packet[3]); + block+=4*packet_size; + } + } + else { + for(; k kernel; + kernel.packet[0] = dm0.loadPacketStandard(k); + kernel.packet[1] = dm1.loadPacketStandard(k); + kernel.packet[2] = dm2.loadPacketStandard(k); + kernel.packet[3] = dm3.loadPacketStandard(k); + ptranspose(kernel); + pstoreu(block+0*packet_size, kernel.packet[0]); + pstoreu(block+1*packet_size, kernel.packet[1]); + pstoreu(block+2*packet_size, kernel.packet[2]); + pstoreu(block+3*packet_size, kernel.packet[3]); + block+=4*packet_size; + } + } + } + if (!rhs.nonStandardPatches()) { + for(; k 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be filters, height, width (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output. 
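 *
 * A minimal usage sketch (illustrative only; assumes the unsupported CXX11
 * Tensor and NeuralNetworks headers are included, and the dimension values
 * below are arbitrary):
 * \code
 * Eigen::Tensor<float, 4> input(3, 32, 32, 8);   // channels, rows, cols, batch (col-major)
 * Eigen::Tensor<float, 4> kernel(16, 3, 5, 5);   // filters, channels, kernel_rows, kernel_cols
 * input.setRandom();
 * kernel.setRandom();
 * Eigen::Tensor<float, 4> output =
 *     Eigen::SpatialConvolution(input, kernel, 1, Eigen::PADDING_VALID);
 * // output has dimensions (16, 28, 28, 8): 32 - 5 + 1 = 28 per spatial dim.
 * \endcode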
+ * + */ +template +EIGEN_ALWAYS_INLINE +static const typename internal::conditional< + internal::traits::Layout == ColMajor, + TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorContractionOp::Index>, 1>, const TensorReshapingOp::Index, 2>, const Kernel>, const TensorReshapingOp::Index, 2>, const TensorImagePatchOp > > >, + TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorContractionOp::Index>, 1>, const TensorReshapingOp::Index, 2>, const TensorImagePatchOp >, const TensorReshapingOp::Index, 2>, const Kernel> > > >::type +SpatialConvolution(const Input& input, const Kernel& kernel, const DenseIndex stride = 1, const PaddingType padding_type = PADDING_SAME, const DenseIndex in_stride = 1) { + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > kern(kernel); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[2] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0]; + + const DenseIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1); + const DenseIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1); + + array, 1> contract_dims; + contract_dims[0] = IndexPair(1, 0); + + const TensorIndex InputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex InputCols = isColMajor ? 
in.dimension(2) : in.dimension(NumDims - 3); + + TensorIndex out_height; + TensorIndex out_width; + switch (padding_type) { + case PADDING_VALID: + out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) / static_cast(stride)); + out_width = numext::ceil((InputCols - kernelColsEff + 1.f) / static_cast(stride)); + break; + case PADDING_SAME: + out_height = numext::ceil(InputRows / static_cast(stride)); + out_width = numext::ceil(InputCols / static_cast(stride)); + break; + default: + eigen_assert(false && "unexpected padding"); + } + + // Molds the output of the patch extraction code into a 2d tensor: + // - the first dimension (dims[0]): the patch values to be multiplied with the kernels + // - the second dimension (dims[1]): everything else + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols; + pre_contract_dims[1] = out_height * out_width; + for (int i = 3; i < NumDims; ++i) { + pre_contract_dims[1] *= in.dimension(i); + } + } else { + pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols; + pre_contract_dims[0] = out_height * out_width; + for (int i = 0; i < NumDims - 3; ++i) { + pre_contract_dims[0] *= in.dimension(i); + } + } + + // Molds the output of the contraction into the shape expected by the used + // (assuming this is ColMajor): + // - 1st dim: kernel filters + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + DSizes post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelFilters; + post_contract_dims[1] = out_height; + post_contract_dims[2] = out_width; + for (int i = 3; i < NumDims; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelFilters; + post_contract_dims[NumDims - 2] = out_height; + post_contract_dims[NumDims - 3] = out_width; + for (int i = 0; i < NumDims - 3; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } + + DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels * kernelRows * kernelCols; + } else { + kernel_dims[0] = kernelChannels * kernelRows * kernelCols; + kernel_dims[1] = kernelFilters; + } + // TODO(yangke): choose() is defined in TensorContraction.h -- consider + // moving it to somewhere more "common". + return choose(Cond::Layout == ColMajor>(), + kernel.reshape(kernel_dims).contract(input.extract_image_patches(kernelRows, kernelCols, stride, stride, in_stride, in_stride, padding_type).reshape(pre_contract_dims), contract_dims).reshape(post_contract_dims), + input.extract_image_patches(kernelRows, kernelCols, stride, stride, in_stride, in_stride, padding_type).reshape(pre_contract_dims).contract(kernel.reshape(kernel_dims), contract_dims).reshape(post_contract_dims)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h new file mode 100644 index 00000000..0e721735 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h @@ -0,0 +1,289 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2015 Jianwei Cui +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
+// If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H
+
+namespace Eigen {
+
+/** \class TensorConvolutionByFFT
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor convolution class.
+  *
+  *
+  */
+namespace internal {
+
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct traits<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> >
+{
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef typename promote_storage_type<typename InputXprType::Scalar,
+                                        typename KernelXprType::Scalar>::ret Scalar;
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
+                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<InputXprType>::Index,
+                                      typename traits<KernelXprType>::Index>::type Index;
+  typedef typename InputXprType::Nested LhsNested;
+  typedef typename KernelXprType::Nested RhsNested;
+  typedef typename remove_reference<LhsNested>::type _LhsNested;
+  typedef typename remove_reference<RhsNested>::type _RhsNested;
+  static const int NumDimensions = traits<InputXprType>::NumDimensions;
+  static const int Layout = traits<InputXprType>::Layout;
+
+  enum {
+    Flags = 0,
+  };
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct eval<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
+{
+  typedef const TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>& type;
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct nested<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> >::type>
+{
+  typedef TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename Indices, typename InputXprType, typename KernelXprType>
+class TensorConvolutionByFFTOp : public TensorBase<TensorConvolutionByFFTOp<Indices, InputXprType, KernelXprType> >
+{
+  public:
+    typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Scalar Scalar;
+    typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Packet Packet;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
+                                                    typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
+    typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType,
+                                                    typename KernelXprType::PacketReturnType>::ret PacketReturnType;
+    typedef typename Eigen::internal::nested<TensorConvolutionByFFTOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionByFFTOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
+        : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Indices& indices() const { return m_indices; }
+
+    /** \returns the nested expressions */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<typename InputXprType::Nested>::type&
+    inputExpression() const { return m_input_xpr; }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<typename KernelXprType::Nested>::type&
+    kernelExpression() const { return m_kernel_xpr; }
+
+  protected:
+    typename InputXprType::Nested m_input_xpr;
+    typename KernelXprType::Nested m_kernel_xpr;
+    const Indices m_indices;
+};
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
+struct TensorEvaluator<const TensorConvolutionByFFTOp<Indices, InputArgType, KernelArgType>, Device>
+{
+  typedef TensorConvolutionByFFTOp<Indices, InputArgType, KernelArgType> XprType;
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+
+  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
+  static const int NumKernelDims = internal::array_size<Indices>::value;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  enum {
+    IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned &
+                TensorEvaluator<KernelArgType, Device>::IsAligned,
+    PacketAccess = false,
+    BlockAccess = false,
+    Layout = TensorEvaluator<InputArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
+    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStride[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
+      }
+    } else {
+      m_inputStride[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
+      }
+    }
+
+    m_dimensions = m_inputImpl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < NumKernelDims; ++i) {
+        const Index index = op.indices()[i];
+        const Index input_dim = input_dims[index];
+        const Index kernel_dim = kernel_dims[i];
+        const Index result_dim = input_dim - kernel_dim + 1;
+        m_dimensions[index] = result_dim;
+        if (i > 0) {
+          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
+        } else {
+          m_kernelStride[0] = 1;
+        }
+        m_indexStride[i] = m_inputStride[index];
+      }
+
+      m_outputStride[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
+      }
+    } else {
+      for (int i = NumKernelDims - 1; i >= 0; --i) {
+        const Index index = op.indices()[i];
+        const Index input_dim = input_dims[index];
+        const Index kernel_dim = kernel_dims[i];
+        const Index result_dim = input_dim - kernel_dim + 1;
+        m_dimensions[index] = result_dim;
+        if (i < NumKernelDims - 1) {
+          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
+        } else {
+          m_kernelStride[NumKernelDims - 1] = 1;
+        }
+        m_indexStride[i] = m_inputStride[index];
+      }
+
+      m_outputStride[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    m_inputImpl.evalSubExprsIfNeeded(NULL);
+    m_kernelImpl.evalSubExprsIfNeeded(NULL);
+
+    typedef typename internal::traits<InputArgType>::Index TensorIndex;
+
+    Tensor<Scalar, NumDims, Layout, TensorIndex> input(m_inputImpl.dimensions());
+    for (int i = 0; i < m_inputImpl.dimensions().TotalSize(); ++i) {
+      input.data()[i] = m_inputImpl.coeff(i);
+    }
+
+    Tensor<Scalar, NumDims, Layout, TensorIndex> kernel(m_kernelImpl.dimensions());
+    for (int i = 0; i < m_kernelImpl.dimensions().TotalSize(); ++i) {
+      kernel.data()[i] = m_kernelImpl.coeff(i);
+    }
+
+    array<std::pair<TensorIndex, TensorIndex>, NumDims> paddings;
+    for (int i = 0; i < NumDims; ++i) {
+      paddings[i] = std::make_pair(0, m_inputImpl.dimensions()[i] - m_kernelImpl.dimensions()[i]);
+    }
+
+    Eigen::array<bool, NumKernelDims> reverse;
+    for (int i = 0; i < NumKernelDims; ++i) {
+      reverse[i] = true;
+    }
+
+    Eigen::array<int, NumDims> fft;
+    for (int i = 0; i < NumDims; ++i) {
+      fft[i] = i;
+    }
+
+    Eigen::DSizes<TensorIndex, NumDims> slice_offsets;
+    for (int i = 0; i < NumDims; ++i) {
+      slice_offsets[i] = m_kernelImpl.dimensions()[i] - 1;
+    }
+
+    Eigen::DSizes<TensorIndex, NumDims> slice_extents;
+    for (int i = 0; i < NumDims; ++i) {
+      slice_extents[i] = m_inputImpl.dimensions()[i] - m_kernelImpl.dimensions()[i] + 1;
+    }
+
+    Tensor<Scalar, NumDims, Layout, TensorIndex> kernel_variant = kernel.reverse(reverse).pad(paddings);
+    Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> kernel_fft = kernel_variant.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+    //Tensor<std::complex<Scalar>, NumDims, Layout|IndexType> kernel_fft = kernel.reverse(reverse).pad(paddings).template fft<2>(fft);
+    Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> input_fft = input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+    Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> prod = (input_fft * kernel_fft).template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+    Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> tensor_result = prod.slice(slice_offsets, slice_extents);
+
+    for (int i = 0; i < tensor_result.size(); ++i) {
+      data[i] = std::real(tensor_result.data()[i]);
+    }
+    return false;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_inputImpl.cleanup();
+    if (m_local_kernel) {
+      m_device.deallocate((void*)m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
+  }
+
+  void evalTo(typename XprType::Scalar* buffer) {
+    evalSubExprsIfNeeded(NULL);
+    for (int i = 0; i < dimensions().TotalSize(); ++i) {
+      buffer[i] += coeff(i);
+    }
+    cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    CoeffReturnType result = CoeffReturnType(0);
+    return result;
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ private:
+  array<Index, NumDims> m_inputStride;
+  array<Index, NumDims> m_outputStride;
+
+  array<Index, NumKernelDims> m_indexStride;
+  array<Index, NumKernelDims> m_kernelStride;
+  TensorEvaluator<InputArgType, Device> m_inputImpl;
+  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
+  Dimensions m_dimensions;
+
+  KernelArgType m_kernelArg;
+  const Scalar* m_kernel;
+  bool m_local_kernel;
+  const Device& m_device;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H
diff --git a/third_party/eigen3/unsupported/Eigen/SpecialFunctions b/third_party/eigen3/unsupported/Eigen/SpecialFunctions
new file mode 100644
index 00000000..ad13359a
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/SpecialFunctions
@@ -0,0 +1 @@
+#include "unsupported/Eigen/SpecialFunctions"
diff --git a/third_party/gemmlowp/LICENSE b/third_party/gemmlowp/LICENSE
new file mode 100644
index 00000000..d6456956
--- /dev/null
+++ b/third_party/gemmlowp/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--
GitLab
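
For completeness, here is a minimal usage sketch of the SpatialConvolution helper vendored above (illustrative only, not part of the patch). It assumes the vendored headers resolve on the include path as laid out under third_party/eigen3, and it includes the src/ header directly for brevity; the tensor sizes are arbitrary. Dimension ordering follows the ColMajor convention visible in SpatialConvolutions.h: input is (channels, rows, cols, batch) and kernel is (filters, channels, kernel_rows, kernel_cols).

    // Illustrative sketch only -- not part of the patch. Include paths are
    // assumed to match the third_party/eigen3 layout introduced above.
    #include <iostream>

    #include "unsupported/Eigen/CXX11/Tensor"
    #include "third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h"

    int main() {
      // ColMajor ordering, matching the isColMajor branch in SpatialConvolution:
      // input  = (channels, rows, cols, batch), kernel = (filters, channels, kRows, kCols).
      Eigen::Tensor<float, 4> input(3, 8, 8, 1);
      Eigen::Tensor<float, 4> kernel(16, 3, 3, 3);
      input.setRandom();
      kernel.setRandom();

      // stride = 1 with PADDING_SAME keeps the 8x8 spatial extent.
      Eigen::Tensor<float, 4> output =
          Eigen::SpatialConvolution(input, kernel, /*stride=*/1, Eigen::PADDING_SAME);

      // Expected dimensions: 16 x 8 x 8 x 1 (filters, out_height, out_width, batch).
      std::cout << output.dimension(0) << "x" << output.dimension(1) << "x"
                << output.dimension(2) << "x" << output.dimension(3) << std::endl;
      return 0;
    }

Switching PADDING_SAME to PADDING_VALID shrinks the output of this 3x3 kernel to 6x6, per the out_height/out_width arithmetic in the function.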