From 8dd9d6b1f6ccc2742692f81e78f3f1c326e4af84 Mon Sep 17 00:00:00 2001 From: Liangliang He Date: Mon, 28 May 2018 20:00:51 +0800 Subject: [PATCH] Add kernels benchmark against eigen --- WORKSPACE | 34 +- mace/kernels/BUILD | 89 +- mace/kernels/matmul_benchmark.cc | 95 + mace/kernels/memory_benchmark.cc | 111 + third_party/eigen3/BUILD | 33 + third_party/eigen3/Eigen/Cholesky | 1 + third_party/eigen3/Eigen/Core | 1 + third_party/eigen3/Eigen/Eigenvalues | 1 + third_party/eigen3/Eigen/LU | 1 + third_party/eigen3/Eigen/QR | 1 + third_party/eigen3/Eigen/SVD | 1 + third_party/eigen3/LICENSE | 1936 +++++++++++++++++ third_party/eigen3/eigen.BUILD | 71 + .../eigen3/unsupported/Eigen/CXX11/Core | 46 + .../eigen3/unsupported/Eigen/CXX11/FixedPoint | 55 + .../unsupported/Eigen/CXX11/NeuralNetworks | 35 + .../eigen3/unsupported/Eigen/CXX11/Tensor | 15 + .../eigen3/unsupported/Eigen/CXX11/ThreadPool | 1 + .../CXX11/src/FixedPoint/FixedPointTypes.h | 342 +++ .../CXX11/src/FixedPoint/MatMatProduct.h | 255 +++ .../CXX11/src/FixedPoint/MatMatProductAVX2.h | 1754 +++++++++++++++ .../CXX11/src/FixedPoint/MatMatProductNEON.h | 95 + .../CXX11/src/FixedPoint/MatVecProduct.h | 123 ++ .../CXX11/src/FixedPoint/PacketMathAVX2.h | 476 ++++ .../CXX11/src/FixedPoint/PacketMathAVX512.h | 545 +++++ .../CXX11/src/FixedPoint/TypeCastingAVX2.h | 66 + .../CXX11/src/FixedPoint/TypeCastingAVX512.h | 180 ++ .../CXX11/src/NeuralNetworks/Activations.h | 116 + .../CXX11/src/NeuralNetworks/Attention.h | 209 ++ .../BackwardCuboidConvolutions.h | 523 +++++ .../BackwardSpatialConvolutions.h | 351 +++ .../src/NeuralNetworks/CuboidConvolution.h | 179 ++ .../Eigen/CXX11/src/NeuralNetworks/Patch3d.h | 240 ++ .../Eigen/CXX11/src/NeuralNetworks/Pooling.h | 433 ++++ .../Eigen/CXX11/src/NeuralNetworks/SoftMax.h | 83 + .../src/NeuralNetworks/SpatialConvolutions.h | 775 +++++++ .../NeuralNetworks/TensorConvolutionByFFT.h | 289 +++ .../eigen3/unsupported/Eigen/SpecialFunctions | 1 + third_party/gemmlowp/LICENSE | 202 ++ 39 files changed, 9739 insertions(+), 25 deletions(-) create mode 100644 mace/kernels/matmul_benchmark.cc create mode 100644 mace/kernels/memory_benchmark.cc create mode 100644 third_party/eigen3/BUILD create mode 100644 third_party/eigen3/Eigen/Cholesky create mode 100644 third_party/eigen3/Eigen/Core create mode 100644 third_party/eigen3/Eigen/Eigenvalues create mode 100644 third_party/eigen3/Eigen/LU create mode 100644 third_party/eigen3/Eigen/QR create mode 100644 third_party/eigen3/Eigen/SVD create mode 100644 third_party/eigen3/LICENSE create mode 100644 third_party/eigen3/eigen.BUILD create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/Core create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/Tensor create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h create mode 100644 
third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h create mode 100644 third_party/eigen3/unsupported/Eigen/SpecialFunctions create mode 100644 third_party/gemmlowp/LICENSE diff --git a/WORKSPACE b/WORKSPACE index 9b620936..e62557bd 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -9,7 +9,7 @@ http_archive( strip_prefix = "protobuf-3.4.0", urls = [ "https://cnbj1.fds.api.xiaomi.com/mace/third-party/protobuf/protobuf-3.4.0.zip", - "https://github.com/google/protobuf/archive/v3.4.0.zip" + "https://github.com/google/protobuf/archive/v3.4.0.zip", ], ) @@ -20,7 +20,7 @@ new_http_archive( strip_prefix = "googletest-release-1.8.0", urls = [ "https://cnbj1.fds.api.xiaomi.com/mace/third-party/googletest/googletest-release-1.8.0.zip", - "https://github.com/google/googletest/archive/release-1.8.0.zip" + "https://github.com/google/googletest/archive/release-1.8.0.zip", ], ) @@ -31,7 +31,7 @@ new_http_archive( strip_prefix = "OpenCL-Headers-master", urls = [ "https://cnbj1.fds.api.xiaomi.com/mace/third-party/OpenCL-Headers/OpenCL-Headers-master.zip", - "https://github.com/KhronosGroup/OpenCL-Headers/archive/master.zip" + "https://github.com/KhronosGroup/OpenCL-Headers/archive/master.zip", ], ) @@ -42,7 +42,7 @@ new_http_archive( strip_prefix = "OpenCL-CLHPP-4c6f7d56271727e37fb19a9b47649dd175df2b12", urls = [ "https://cnbj1.fds.api.xiaomi.com/mace/third-party/OpenCL-CLHPP/OpenCL-CLHPP-4c6f7d56271727e37fb19a9b47649dd175df2b12.zip", - "https://github.com/KhronosGroup/OpenCL-CLHPP/archive/4c6f7d56271727e37fb19a9b47649dd175df2b12.zip" + "https://github.com/KhronosGroup/OpenCL-CLHPP/archive/4c6f7d56271727e37fb19a9b47649dd175df2b12.zip", ], ) @@ -53,7 +53,29 @@ new_http_archive( strip_prefix = "half-code-356-trunk", urls = [ "https://cnbj1.fds.api.xiaomi.com/mace/third-party/half/half-code-356-trunk.zip", - "https://sourceforge.net/code-snapshots/svn/h/ha/half/code/half-code-356-trunk.zip" + "https://sourceforge.net/code-snapshots/svn/h/ha/half/code/half-code-356-trunk.zip", + ], +) + +new_http_archive( + name = "eigen", + build_file = "third_party/eigen3/eigen.BUILD", + sha256 = "ca7beac153d4059c02c8fc59816c82d54ea47fe58365e8aded4082ded0b820c4", + strip_prefix = "eigen-eigen-f3a22f35b044", + urls = [ + "http://cnbj1.fds.api.xiaomi.com/mace/third-party/eigen/f3a22f35b044.tar.gz", + 
"http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz", + "https://bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz", + ], +) + +http_archive( + name = "gemmlowp", + sha256 = "b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658", + strip_prefix = "gemmlowp-38ebac7b059e84692f53e5938f97a9943c120d98", + urls = [ + "http://cnbj1.fds.api.xiaomi.com/mace/third-party/gemmlowp/38ebac7b059e84692f53e5938f97a9943c120d98.zip", + "https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip", ], ) @@ -81,7 +103,7 @@ http_archive( strip_prefix = "gflags-30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e", urls = [ "https://cnbj1.fds.api.xiaomi.com/mace/third-party/gflags/gflags-30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e.zip", - "https://github.com/gflags/gflags/archive/30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e.zip" + "https://github.com/gflags/gflags/archive/30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e.zip", ], ) diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD index 3e837d85..a1200cfa 100644 --- a/mace/kernels/BUILD +++ b/mace/kernels/BUILD @@ -18,14 +18,17 @@ cc_library( ], exclude = [ "*_test.cc", + "*_benchmark.cc", "arm/*_test.cc", ], - ) + if_android(glob([ + ) + if_android(glob( + [ "opencl/*.cc", ], exclude = [ "opencl/*_test.cc", - ])), + ], + )), hdrs = glob( [ "*.h", @@ -35,16 +38,26 @@ cc_library( "buffer_to_image.h", ], ) + if_android(glob([ - "opencl/*.h", - "buffer_to_image.h", - ])), - copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] + - if_openmp_enabled(["-fopenmp"]) + - if_neon_enabled(["-DMACE_ENABLE_NEON"]) + - if_android_armv7(["-mfpu=neon"]) + - if_android_armv7(["-mfloat-abi=softfp"]) + - if_android(["-DMACE_ENABLE_OPENCL"]) + - if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), + "opencl/*.h", + "buffer_to_image.h", + ])), + copts = [ + "-Werror", + "-Wextra", + "-Wno-missing-field-initializers", + ] + if_openmp_enabled([ + "-fopenmp", + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + ]) + if_android_armv7([ + "-mfloat-abi=softfp", + ]) + if_android([ + "-DMACE_ENABLE_OPENCL", + ]) + if_hexagon_enabled([ + "-DMACE_ENABLE_HEXAGON", + ]), linkopts = if_android(["-lm"]), deps = [ "//mace/core", @@ -62,13 +75,22 @@ cc_test( "opencl/*_test.cc", ], ), - copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] + - if_openmp_enabled(["-fopenmp"]) + - if_neon_enabled(["-DMACE_ENABLE_NEON"]) + - if_android_armv7(["-mfpu=neon"]) + - if_android_armv7(["-mfloat-abi=softfp"]) + - if_android(["-DMACE_ENABLE_OPENCL"]) + - if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), + copts = [ + "-Werror", + "-Wextra", + "-Wno-missing-field-initializers", + ] + if_openmp_enabled([ + "-fopenmp", + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + "-mfloat-abi=softfp", + ]) + if_android([ + "-DMACE_ENABLE_OPENCL", + ]) + if_hexagon_enabled([ + "-DMACE_ENABLE_HEXAGON", + ]), linkopts = ["-fopenmp"], linkstatic = 1, deps = [ @@ -77,3 +99,32 @@ cc_test( "@gtest//:gtest_main", ], ) + +cc_test( + name = "kernels_benchmark", + testonly = 1, + srcs = glob(["*_benchmark.cc"]), + copts = [ + "-Werror", + "-Wextra", + "-Wno-missing-field-initializers", + ] + if_openmp_enabled([ + "-fopenmp", + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + "-mfloat-abi=softfp", + ]) + if_android([ + "-DMACE_ENABLE_OPENCL", + ]) + if_hexagon_enabled([ + "-DMACE_ENABLE_HEXAGON", + ]), + linkopts = ["-fopenmp"], + linkstatic = 1, + deps = [ + 
":kernels", + "//mace/core:test_benchmark_main", + "//third_party/eigen3", + ], +) diff --git a/mace/kernels/matmul_benchmark.cc b/mace/kernels/matmul_benchmark.cc new file mode 100644 index 00000000..d06ff317 --- /dev/null +++ b/mace/kernels/matmul_benchmark.cc @@ -0,0 +1,95 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "mace/core/testing/test_benchmark.h" +#include "mace/kernels/gemm.h" +#include "public/gemmlowp.h" + +namespace mace { +namespace kernels { +namespace test { + +// Test the speed of different access order of a NHWC buffer + +namespace { + +// Matmul with (m, k) x (k, n) +void MatmulBenchmark_Mace(int iters, int m, int k, int n) { + mace::testing::StopTiming(); + std::vector lhs(m * k); + std::vector rhs(k * n); + std::vector result(m * n); + // warm up + Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data()); + mace::testing::StartTiming(); + while (iters--) { + Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data()); + } +} + +void MatmulBenchmark_Eigen(int iters, int m, int k, int n) { + mace::testing::StopTiming(); + Eigen::MatrixXd lhs = Eigen::MatrixXd::Random(m, k); + Eigen::MatrixXd rhs = Eigen::MatrixXd::Random(k, n); + Eigen::MatrixXd result = Eigen::MatrixXd::Zero(m, n); + // warm up + result = lhs * rhs; + mace::testing::StartTiming(); + while (iters--) { + result = lhs * rhs; + } +} + +} // namespace + +#define MACE_BM_MATMUL_FUNC(M, K, N, FUNC) \ + static void MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC(int iters) { \ + const int64_t macc = static_cast(iters) * M * K * N; \ + const int64_t tot = static_cast(iters) * (M + N) * K; \ + mace::testing::MaccProcessed(macc); \ + mace::testing::BytesProcessed(tot * sizeof(float)); \ + MatmulBenchmark_##FUNC(iters, M, K, N); \ + } \ + MACE_BENCHMARK(MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC) + +#define MACE_BM_MATMUL(M, K, N) \ + MACE_BM_MATMUL_FUNC(M, K, N, Mace); \ + MACE_BM_MATMUL_FUNC(M, K, N, Eigen); + +// Embedding size 384 +MACE_BM_MATMUL(7, 384, 384); +MACE_BM_MATMUL(7, 384, 1536); +MACE_BM_MATMUL(7, 1536, 384); + +MACE_BM_MATMUL(15, 384, 384); +MACE_BM_MATMUL(15, 384, 1536); +MACE_BM_MATMUL(15, 1536, 384); + +MACE_BM_MATMUL(1, 384, 384); +MACE_BM_MATMUL(1, 384, 1536); +MACE_BM_MATMUL(1, 1536, 384); +MACE_BM_MATMUL(1, 384, 44678); + +// Embedding size 128 +MACE_BM_MATMUL(1, 128, 1536); +MACE_BM_MATMUL(1, 128, 44678); + +} // namespace test +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/memory_benchmark.cc b/mace/kernels/memory_benchmark.cc new file mode 100644 index 00000000..5d9ab1f4 --- /dev/null +++ b/mace/kernels/memory_benchmark.cc @@ -0,0 +1,111 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <algorithm> +#include <string> +#include <vector> + +#include "mace/core/testing/test_benchmark.h" + +namespace mace { +namespace kernels { +namespace test { + +// Test the speed of different access order of a NHWC buffer + +namespace { +void MemoryAccessBenchmark_NHWC( + int iters, int batch, int height, int width, int channels) { + mace::testing::StopTiming(); + std::vector<float> buffer(batch * height * width * channels); + std::fill_n(buffer.begin(), buffer.size(), 0.1); + mace::testing::StartTiming(); + + while (iters--) { + for (int n = 0; n < batch; ++n) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + for (int c = 0; c < channels; ++c) { + buffer[n * height * width * channels + h * width * channels + + w * channels + c] = 1.0f; + } + } + } + } + } +} + +void MemoryAccessBenchmark_NWCH( + int iters, int batch, int height, int width, int channels) { + mace::testing::StopTiming(); + std::vector<float> buffer(batch * height * width * channels); + std::fill_n(buffer.begin(), buffer.size(), 0.1); + mace::testing::StartTiming(); + + while (iters--) { + for (int n = 0; n < batch; ++n) { + for (int w = 0; w < width; ++w) { + for (int c = 0; c < channels; ++c) { + for (int h = 0; h < height; ++h) { + buffer[n * height * width * channels + h * width * channels + + w * channels + c] = 1.0f; + } + } + } + } + } +} + +void MemoryAccessBenchmark_NHCW( + int iters, int batch, int height, int width, int channels) { + mace::testing::StopTiming(); + std::vector<float> buffer(batch * height * width * channels); + std::fill_n(buffer.begin(), buffer.size(), 0.1); + mace::testing::StartTiming(); + + while (iters--) { + for (int n = 0; n < batch; ++n) { + for (int h = 0; h < height; ++h) { + for (int c = 0; c < channels; ++c) { + for (int w = 0; w < width; ++w) { + buffer[n * height * width * channels + h * width * channels + + w * channels + c] = 1.0f; + } + } + } + } + } +} + +} // namespace + +#define MACE_BM_MEMORY_ACCESS(N, H, W, C, ORDER) \ + static void MACE_BM_MEMORY_ACCESS_##N##_##H##_##W##_##C##_##ORDER( \ + int iters) { \ + const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot * sizeof(float)); \ + MemoryAccessBenchmark_##ORDER(iters, N, H, W, C); \ + } \ + MACE_BENCHMARK(MACE_BM_MEMORY_ACCESS_##N##_##H##_##W##_##C##_##ORDER) + +MACE_BM_MEMORY_ACCESS(10, 64, 64, 1024, NHWC); +MACE_BM_MEMORY_ACCESS(10, 64, 64, 1024, NHCW); +MACE_BM_MEMORY_ACCESS(10, 64, 64, 1024, NWCH); +MACE_BM_MEMORY_ACCESS(10, 64, 1024, 64, NHCW); +MACE_BM_MEMORY_ACCESS(10, 64, 1024, 64, NWCH); + +} // namespace test +} // namespace kernels +} // namespace mace diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD new file mode 100644 index 00000000..89e25857 --- /dev/null +++ b/third_party/eigen3/BUILD @@ -0,0 +1,33 @@ +# Description: +# Eigen is a C++ template library for linear algebra: vectors, +# matrices, and related algorithms. +# This file is mostly adapted from TensorFlow. + +licenses([ + # Note: Eigen is an MPL2 library that includes GPL v3 and LGPL v2.1+ code.
+ # We've taken special care to not reference any restricted code. + "reciprocal", # MPL2 + "notice", # Portions BSD +]) + +exports_files(["LICENSE"]) + +cc_library( + name = "eigen3", + hdrs = glob(["unsupported/Eigen/CXX11/src/FixedPoint/*.h"]) + [ + "Eigen/Core", + "Eigen/LU", + "Eigen/Cholesky", + "Eigen/Eigenvalues", + "Eigen/QR", + "Eigen/SVD", + "unsupported/Eigen/SpecialFunctions", + "unsupported/Eigen/CXX11/ThreadPool", + "unsupported/Eigen/CXX11/Tensor", + "unsupported/Eigen/CXX11/FixedPoint", + ], + visibility = ["//visibility:public"], + deps = [ + "@eigen//:eigen", + ], +) diff --git a/third_party/eigen3/Eigen/Cholesky b/third_party/eigen3/Eigen/Cholesky new file mode 100644 index 00000000..c199a025 --- /dev/null +++ b/third_party/eigen3/Eigen/Cholesky @@ -0,0 +1 @@ +#include "Eigen/Cholesky" diff --git a/third_party/eigen3/Eigen/Core b/third_party/eigen3/Eigen/Core new file mode 100644 index 00000000..d4b03677 --- /dev/null +++ b/third_party/eigen3/Eigen/Core @@ -0,0 +1 @@ +#include "Eigen/Core" diff --git a/third_party/eigen3/Eigen/Eigenvalues b/third_party/eigen3/Eigen/Eigenvalues new file mode 100644 index 00000000..bf739b9b --- /dev/null +++ b/third_party/eigen3/Eigen/Eigenvalues @@ -0,0 +1 @@ +#include "Eigen/Eigenvalues" diff --git a/third_party/eigen3/Eigen/LU b/third_party/eigen3/Eigen/LU new file mode 100644 index 00000000..536149ce --- /dev/null +++ b/third_party/eigen3/Eigen/LU @@ -0,0 +1 @@ +#include "Eigen/LU" diff --git a/third_party/eigen3/Eigen/QR b/third_party/eigen3/Eigen/QR new file mode 100644 index 00000000..be067d3e --- /dev/null +++ b/third_party/eigen3/Eigen/QR @@ -0,0 +1 @@ +#include "Eigen/QR" diff --git a/third_party/eigen3/Eigen/SVD b/third_party/eigen3/Eigen/SVD new file mode 100644 index 00000000..eecf47c1 --- /dev/null +++ b/third_party/eigen3/Eigen/SVD @@ -0,0 +1 @@ +#include "Eigen/SVD" diff --git a/third_party/eigen3/LICENSE b/third_party/eigen3/LICENSE new file mode 100644 index 00000000..a25d8e6f --- /dev/null +++ b/third_party/eigen3/LICENSE @@ -0,0 +1,1936 @@ +Eigen is primarily MPL2 licensed. See COPYING.MPL2 and these links: + http://www.mozilla.org/MPL/2.0/ + http://www.mozilla.org/MPL/2.0/FAQ.html + +Some files contain third-party code under BSD or LGPL licenses, whence +the other COPYING.* files here. + +All the LGPL code is either LGPL 2.1-only, or LGPL 2.1-or-later. +For this reason, the COPYING.LGPL file contains the LGPL 2.1 text. + +If you want to guarantee that the Eigen code that you are #including +is licensed under the MPL2 and possibly more permissive licenses (like +BSD), #define this preprocessor symbol: EIGEN_MPL2_ONLY +For example, with most compilers, you could add this to your project + CXXFLAGS: -DEIGEN_MPL2_ONLY +This will cause a compilation error to be generated if you #include +any code that is LGPL licensed. 
+ +---------------------------------------------------------------------- +Following applies to: +./test/mapstaticmethods.cpp +./test/schur_real.cpp +./test/prec_inverse_4x4.cpp +./test/smallvectors.cpp +./test/redux.cpp +./test/special_numbers.cpp +./test/adjoint.cpp +./test/resize.cpp +./test/mixingtypes.cpp +./test/product_trmv.cpp +./test/sparse_solvers.cpp +./test/cholesky.cpp +./test/geo_quaternion.cpp +./test/miscmatrices.cpp +./test/stddeque.cpp +./test/integer_types.cpp +./test/product_large.cpp +./test/eigensolver_generic.cpp +./test/householder.cpp +./test/geo_orthomethods.cpp +./test/array_for_matrix.cpp +./test/sparseLM.cpp +./test/upperbidiagonalization.cpp +./test/nomalloc.cpp +./test/packetmath.cpp +./test/jacobisvd.cpp +./test/geo_transformations.cpp +./test/swap.cpp +./test/eigensolver_selfadjoint.cpp +./test/inverse.cpp +./test/product_selfadjoint.cpp +./test/product_trsolve.cpp +./test/product_extra.cpp +./test/sparse_solver.h +./test/mapstride.cpp +./test/mapped_matrix.cpp +./test/geo_eulerangles.cpp +./test/eigen2support.cpp +./test/denseLM.cpp +./test/stdvector.cpp +./test/nesting_ops.cpp +./test/sparse_permutations.cpp +./test/zerosized.cpp +./test/exceptions.cpp +./test/vectorwiseop.cpp +./test/cwiseop.cpp +./test/basicstuff.cpp +./test/product_trmm.cpp +./test/linearstructure.cpp +./test/sparse_product.cpp +./test/stdvector_overload.cpp +./test/stable_norm.cpp +./test/umeyama.cpp +./test/unalignedcount.cpp +./test/triangular.cpp +./test/product_mmtr.cpp +./test/sparse_basic.cpp +./test/sparse_vector.cpp +./test/meta.cpp +./test/real_qz.cpp +./test/ref.cpp +./test/eigensolver_complex.cpp +./test/cholmod_support.cpp +./test/conjugate_gradient.cpp +./test/sparse.h +./test/simplicial_cholesky.cpp +./test/bicgstab.cpp +./test/dynalloc.cpp +./test/product_notemporary.cpp +./test/geo_hyperplane.cpp +./test/lu.cpp +./test/qr.cpp +./test/hessenberg.cpp +./test/sizeof.cpp +./test/main.h +./test/selfadjoint.cpp +./test/permutationmatrices.cpp +./test/superlu_support.cpp +./test/qtvector.cpp +./test/geo_homogeneous.cpp +./test/determinant.cpp +./test/array_reverse.cpp +./test/unalignedassert.cpp +./test/stdlist.cpp +./test/product_symm.cpp +./test/corners.cpp +./test/dontalign.cpp +./test/visitor.cpp +./test/geo_alignedbox.cpp +./test/diagonalmatrices.cpp +./test/product_small.cpp +./test/eigensolver_generalized_real.cpp +./test/umfpack_support.cpp +./test/first_aligned.cpp +./test/qr_fullpivoting.cpp +./test/array_replicate.cpp +./test/geo_parametrizedline.cpp +./test/eigen2/eigen2_unalignedassert.cpp +./test/eigen2/eigen2_prec_inverse_4x4.cpp +./test/eigen2/eigen2_alignedbox.cpp +./test/eigen2/eigen2_sparse_product.cpp +./test/eigen2/eigen2_meta.cpp +./test/eigen2/eigen2_nomalloc.cpp +./test/eigen2/eigen2_visitor.cpp +./test/eigen2/eigen2_packetmath.cpp +./test/eigen2/eigen2_svd.cpp +./test/eigen2/eigen2_mixingtypes.cpp +./test/eigen2/eigen2_qr.cpp +./test/eigen2/eigen2_cwiseop.cpp +./test/eigen2/eigen2_geometry_with_eigen2_prefix.cpp +./test/eigen2/eigen2_smallvectors.cpp +./test/eigen2/eigen2_commainitializer.cpp +./test/eigen2/eigen2_sparse_solvers.cpp +./test/eigen2/eigen2_hyperplane.cpp +./test/eigen2/eigen2_eigensolver.cpp +./test/eigen2/eigen2_linearstructure.cpp +./test/eigen2/eigen2_sizeof.cpp +./test/eigen2/eigen2_parametrizedline.cpp +./test/eigen2/eigen2_lu.cpp +./test/eigen2/eigen2_adjoint.cpp +./test/eigen2/eigen2_geometry.cpp +./test/eigen2/eigen2_stdvector.cpp +./test/eigen2/eigen2_newstdvector.cpp +./test/eigen2/eigen2_submatrices.cpp 
+./test/eigen2/sparse.h +./test/eigen2/eigen2_swap.cpp +./test/eigen2/eigen2_triangular.cpp +./test/eigen2/eigen2_basicstuff.cpp +./test/eigen2/gsl_helper.h +./test/eigen2/eigen2_dynalloc.cpp +./test/eigen2/eigen2_array.cpp +./test/eigen2/eigen2_map.cpp +./test/eigen2/main.h +./test/eigen2/eigen2_miscmatrices.cpp +./test/eigen2/eigen2_product_large.cpp +./test/eigen2/eigen2_first_aligned.cpp +./test/eigen2/eigen2_cholesky.cpp +./test/eigen2/eigen2_determinant.cpp +./test/eigen2/eigen2_sum.cpp +./test/eigen2/eigen2_inverse.cpp +./test/eigen2/eigen2_regression.cpp +./test/eigen2/eigen2_product_small.cpp +./test/eigen2/eigen2_qtvector.cpp +./test/eigen2/eigen2_sparse_vector.cpp +./test/eigen2/product.h +./test/eigen2/eigen2_sparse_basic.cpp +./test/eigen2/eigen2_bug_132.cpp +./test/array.cpp +./test/product_syrk.cpp +./test/commainitializer.cpp +./test/conservative_resize.cpp +./test/qr_colpivoting.cpp +./test/nullary.cpp +./test/bandmatrix.cpp +./test/pastix_support.cpp +./test/product.h +./test/block.cpp +./test/vectorization_logic.cpp +./test/jacobi.cpp +./test/diagonal.cpp +./test/schur_complex.cpp +./test/sizeoverflow.cpp +./bench/BenchTimer.h +./bench/benchFFT.cpp +./bench/eig33.cpp +./bench/spbench/spbenchsolver.h +./bench/spbench/spbenchstyle.h +./lapack/complex_double.cpp +./lapack/cholesky.cpp +./lapack/lapack_common.h +./lapack/eigenvalues.cpp +./lapack/single.cpp +./lapack/lu.cpp +./lapack/complex_single.cpp +./lapack/double.cpp +./demos/mix_eigen_and_c/binary_library.cpp +./demos/mix_eigen_and_c/binary_library.h +./demos/mix_eigen_and_c/example.c +./demos/mandelbrot/mandelbrot.cpp +./demos/mandelbrot/mandelbrot.h +./demos/opengl/icosphere.cpp +./demos/opengl/icosphere.h +./demos/opengl/camera.cpp +./demos/opengl/quaternion_demo.h +./demos/opengl/camera.h +./demos/opengl/trackball.h +./demos/opengl/gpuhelper.h +./demos/opengl/trackball.cpp +./demos/opengl/gpuhelper.cpp +./demos/opengl/quaternion_demo.cpp +./debug/gdb/printers.py +./unsupported/test/minres.cpp +./unsupported/test/openglsupport.cpp +./unsupported/test/jacobisvd.cpp +./unsupported/test/dgmres.cpp +./unsupported/test/matrix_square_root.cpp +./unsupported/test/bdcsvd.cpp +./unsupported/test/matrix_exponential.cpp +./unsupported/test/forward_adolc.cpp +./unsupported/test/polynomialsolver.cpp +./unsupported/test/matrix_function.cpp +./unsupported/test/sparse_extra.cpp +./unsupported/test/matrix_functions.h +./unsupported/test/svd_common.h +./unsupported/test/FFTW.cpp +./unsupported/test/alignedvector3.cpp +./unsupported/test/autodiff.cpp +./unsupported/test/gmres.cpp +./unsupported/test/BVH.cpp +./unsupported/test/levenberg_marquardt.cpp +./unsupported/test/matrix_power.cpp +./unsupported/test/kronecker_product.cpp +./unsupported/test/splines.cpp +./unsupported/test/polynomialutils.cpp +./unsupported/bench/bench_svd.cpp +./unsupported/Eigen/IterativeSolvers +./unsupported/Eigen/src/IterativeSolvers/DGMRES.h +./unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h +./unsupported/Eigen/src/IterativeSolvers/GMRES.h +./unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h +./unsupported/Eigen/src/IterativeSolvers/Scaling.h +./unsupported/Eigen/src/IterativeSolvers/MINRES.h +./unsupported/Eigen/src/SparseExtra/RandomSetter.h +./unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h +./unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h +./unsupported/Eigen/src/SparseExtra/MarketIO.h +./unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h +./unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h 
+./unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h +./unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h +./unsupported/Eigen/src/BVH/BVAlgorithms.h +./unsupported/Eigen/src/BVH/KdBVH.h +./unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +./unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h +./unsupported/Eigen/src/AutoDiff/AutoDiffVector.h +./unsupported/Eigen/src/Splines/Spline.h +./unsupported/Eigen/src/Splines/SplineFitting.h +./unsupported/Eigen/src/Splines/SplineFwd.h +./unsupported/Eigen/src/SVD/JacobiSVD.h +./unsupported/Eigen/src/SVD/BDCSVD.h +./unsupported/Eigen/src/SVD/SVDBase.h +./unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +./unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h +./unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +./unsupported/Eigen/src/MatrixFunctions/StemFunction.h +./unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +./unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +./unsupported/Eigen/src/MatrixFunctions/MatrixFunctionAtomic.h +./unsupported/Eigen/src/MoreVectorization/MathFunctions.h +./unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h +./unsupported/Eigen/src/FFT/ei_fftw_impl.h +./unsupported/Eigen/src/FFT/ei_kissfft_impl.h +./unsupported/Eigen/src/Polynomials/PolynomialSolver.h +./unsupported/Eigen/src/Polynomials/Companion.h +./unsupported/Eigen/src/Polynomials/PolynomialUtils.h +./unsupported/Eigen/src/NumericalDiff/NumericalDiff.h +./unsupported/Eigen/src/Skyline/SkylineProduct.h +./unsupported/Eigen/src/Skyline/SkylineMatrixBase.h +./unsupported/Eigen/src/Skyline/SkylineStorage.h +./unsupported/Eigen/src/Skyline/SkylineUtil.h +./unsupported/Eigen/src/Skyline/SkylineInplaceLU.h +./unsupported/Eigen/src/Skyline/SkylineMatrix.h +./unsupported/Eigen/SparseExtra +./unsupported/Eigen/AdolcForward +./unsupported/Eigen/KroneckerProduct +./unsupported/Eigen/NonLinearOptimization +./unsupported/Eigen/BVH +./unsupported/Eigen/OpenGLSupport +./unsupported/Eigen/ArpackSupport +./unsupported/Eigen/AutoDiff +./unsupported/Eigen/Splines +./unsupported/Eigen/MPRealSupport +./unsupported/Eigen/MatrixFunctions +./unsupported/Eigen/MoreVectorization +./unsupported/Eigen/LevenbergMarquardt +./unsupported/Eigen/AlignedVector3 +./unsupported/Eigen/FFT +./unsupported/Eigen/Polynomials +./unsupported/Eigen/NumericalDiff +./unsupported/Eigen/Skyline +./COPYING.README +./COPYING.README +./LICENSE +./LICENSE +./LICENSE +./Eigen/Eigen2Support +./Eigen/src/Eigen2Support/VectorBlock.h +./Eigen/src/Eigen2Support/Cwise.h +./Eigen/src/Eigen2Support/Minor.h +./Eigen/src/Eigen2Support/Lazy.h +./Eigen/src/Eigen2Support/Memory.h +./Eigen/src/Eigen2Support/MathFunctions.h +./Eigen/src/Eigen2Support/Geometry/AlignedBox.h +./Eigen/src/Eigen2Support/Geometry/Hyperplane.h +./Eigen/src/Eigen2Support/Geometry/Quaternion.h +./Eigen/src/Eigen2Support/Geometry/Rotation2D.h +./Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h +./Eigen/src/Eigen2Support/Geometry/RotationBase.h +./Eigen/src/Eigen2Support/Geometry/Translation.h +./Eigen/src/Eigen2Support/Geometry/Scaling.h +./Eigen/src/Eigen2Support/Geometry/AngleAxis.h +./Eigen/src/Eigen2Support/Geometry/Transform.h +./Eigen/src/Eigen2Support/TriangularSolver.h +./Eigen/src/Eigen2Support/LU.h +./Eigen/src/Eigen2Support/QR.h +./Eigen/src/Eigen2Support/SVD.h +./Eigen/src/Eigen2Support/Meta.h +./Eigen/src/Eigen2Support/Block.h +./Eigen/src/Eigen2Support/Macros.h +./Eigen/src/Eigen2Support/LeastSquares.h +./Eigen/src/Eigen2Support/CwiseOperators.h 
+./Eigen/src/Jacobi/Jacobi.h +./Eigen/src/misc/Kernel.h +./Eigen/src/misc/SparseSolve.h +./Eigen/src/misc/Solve.h +./Eigen/src/misc/Image.h +./Eigen/src/SparseCore/SparseColEtree.h +./Eigen/src/SparseCore/SparseTranspose.h +./Eigen/src/SparseCore/SparseUtil.h +./Eigen/src/SparseCore/SparseCwiseBinaryOp.h +./Eigen/src/SparseCore/SparseDiagonalProduct.h +./Eigen/src/SparseCore/SparseProduct.h +./Eigen/src/SparseCore/SparseDot.h +./Eigen/src/SparseCore/SparseCwiseUnaryOp.h +./Eigen/src/SparseCore/SparseSparseProductWithPruning.h +./Eigen/src/SparseCore/SparseBlock.h +./Eigen/src/SparseCore/SparseDenseProduct.h +./Eigen/src/SparseCore/CompressedStorage.h +./Eigen/src/SparseCore/SparseMatrixBase.h +./Eigen/src/SparseCore/MappedSparseMatrix.h +./Eigen/src/SparseCore/SparseTriangularView.h +./Eigen/src/SparseCore/SparseView.h +./Eigen/src/SparseCore/SparseFuzzy.h +./Eigen/src/SparseCore/TriangularSolver.h +./Eigen/src/SparseCore/SparseSelfAdjointView.h +./Eigen/src/SparseCore/SparseMatrix.h +./Eigen/src/SparseCore/SparseVector.h +./Eigen/src/SparseCore/AmbiVector.h +./Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +./Eigen/src/SparseCore/SparseRedux.h +./Eigen/src/SparseCore/SparsePermutation.h +./Eigen/src/Eigenvalues/RealSchur.h +./Eigen/src/Eigenvalues/ComplexEigenSolver.h +./Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +./Eigen/src/Eigenvalues/ComplexSchur.h +./Eigen/src/Eigenvalues/RealQZ.h +./Eigen/src/Eigenvalues/EigenSolver.h +./Eigen/src/Eigenvalues/HessenbergDecomposition.h +./Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +./Eigen/src/Eigenvalues/Tridiagonalization.h +./Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +./Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +./Eigen/src/SuperLUSupport/SuperLUSupport.h +./Eigen/src/StlSupport/StdDeque.h +./Eigen/src/StlSupport/StdVector.h +./Eigen/src/StlSupport/StdList.h +./Eigen/src/StlSupport/details.h +./Eigen/src/SparseQR/SparseQR.h +./Eigen/src/LU/Inverse.h +./Eigen/src/LU/arch/Inverse_SSE.h +./Eigen/src/LU/Determinant.h +./Eigen/src/LU/PartialPivLU.h +./Eigen/src/LU/FullPivLU.h +./Eigen/src/UmfPackSupport/UmfPackSupport.h +./Eigen/src/OrderingMethods/Ordering.h +./Eigen/src/OrderingMethods/Eigen_Colamd.h +./Eigen/src/QR/HouseholderQR.h +./Eigen/src/QR/ColPivHouseholderQR.h +./Eigen/src/QR/FullPivHouseholderQR.h +./Eigen/src/SVD/JacobiSVD.h +./Eigen/src/SVD/UpperBidiagonalization.h +./Eigen/src/Geometry/OrthoMethods.h +./Eigen/src/Geometry/AlignedBox.h +./Eigen/src/Geometry/Hyperplane.h +./Eigen/src/Geometry/Quaternion.h +./Eigen/src/Geometry/EulerAngles.h +./Eigen/src/Geometry/Rotation2D.h +./Eigen/src/Geometry/ParametrizedLine.h +./Eigen/src/Geometry/RotationBase.h +./Eigen/src/Geometry/arch/Geometry_SSE.h +./Eigen/src/Geometry/Umeyama.h +./Eigen/src/Geometry/Homogeneous.h +./Eigen/src/Geometry/Translation.h +./Eigen/src/Geometry/Scaling.h +./Eigen/src/Geometry/AngleAxis.h +./Eigen/src/Geometry/Transform.h +./Eigen/src/plugins/BlockMethods.h +./Eigen/src/plugins/CommonCwiseUnaryOps.h +./Eigen/src/plugins/CommonCwiseBinaryOps.h +./Eigen/src/plugins/MatrixCwiseUnaryOps.h +./Eigen/src/plugins/MatrixCwiseBinaryOps.h +./Eigen/src/Householder/Householder.h +./Eigen/src/Householder/HouseholderSequence.h +./Eigen/src/Householder/BlockHouseholder.h +./Eigen/src/Core/VectorBlock.h +./Eigen/src/Core/Matrix.h +./Eigen/src/Core/Ref.h +./Eigen/src/Core/SelfAdjointView.h +./Eigen/src/Core/MathFunctions.h +./Eigen/src/Core/GlobalFunctions.h +./Eigen/src/Core/MapBase.h +./Eigen/src/Core/EigenBase.h 
+./Eigen/src/Core/GenericPacketMath.h +./Eigen/src/Core/NestByValue.h +./Eigen/src/Core/CwiseUnaryOp.h +./Eigen/src/Core/SolveTriangular.h +./Eigen/src/Core/Fuzzy.h +./Eigen/src/Core/Visitor.h +./Eigen/src/Core/Map.h +./Eigen/src/Core/NoAlias.h +./Eigen/src/Core/Diagonal.h +./Eigen/src/Core/StableNorm.h +./Eigen/src/Core/CoreIterators.h +./Eigen/src/Core/products/Parallelizer.h +./Eigen/src/Core/products/SelfadjointMatrixVector.h +./Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +./Eigen/src/Core/products/TriangularSolverMatrix.h +./Eigen/src/Core/products/GeneralMatrixMatrix.h +./Eigen/src/Core/products/SelfadjointProduct.h +./Eigen/src/Core/products/CoeffBasedProduct.h +./Eigen/src/Core/products/TriangularMatrixVector.h +./Eigen/src/Core/products/SelfadjointMatrixMatrix.h +./Eigen/src/Core/products/TriangularSolverVector.h +./Eigen/src/Core/products/SelfadjointRank2Update.h +./Eigen/src/Core/products/GeneralBlockPanelKernel.h +./Eigen/src/Core/products/GeneralMatrixVector.h +./Eigen/src/Core/products/TriangularMatrixMatrix.h +./Eigen/src/Core/Reverse.h +./Eigen/src/Core/BooleanRedux.h +./Eigen/src/Core/Replicate.h +./Eigen/src/Core/arch/AltiVec/PacketMath.h +./Eigen/src/Core/arch/AltiVec/Complex.h +./Eigen/src/Core/arch/SSE/PacketMath.h +./Eigen/src/Core/arch/SSE/Complex.h +./Eigen/src/Core/arch/SSE/MathFunctions.h +./Eigen/src/Core/arch/NEON/PacketMath.h +./Eigen/src/Core/arch/NEON/Complex.h +./Eigen/src/Core/arch/Default/Settings.h +./Eigen/src/Core/CwiseUnaryView.h +./Eigen/src/Core/Array.h +./Eigen/src/Core/ArrayWrapper.h +./Eigen/src/Core/Swap.h +./Eigen/src/Core/Transpositions.h +./Eigen/src/Core/Random.h +./Eigen/src/Core/IO.h +./Eigen/src/Core/SelfCwiseBinaryOp.h +./Eigen/src/Core/VectorwiseOp.h +./Eigen/src/Core/Select.h +./Eigen/src/Core/ArrayBase.h +./Eigen/src/Core/DenseCoeffsBase.h +./Eigen/src/Core/DiagonalProduct.h +./Eigen/src/Core/Assign.h +./Eigen/src/Core/Redux.h +./Eigen/src/Core/ForceAlignedAccess.h +./Eigen/src/Core/BandMatrix.h +./Eigen/src/Core/PlainObjectBase.h +./Eigen/src/Core/DenseBase.h +./Eigen/src/Core/Flagged.h +./Eigen/src/Core/CwiseBinaryOp.h +./Eigen/src/Core/ProductBase.h +./Eigen/src/Core/TriangularMatrix.h +./Eigen/src/Core/Transpose.h +./Eigen/src/Core/DiagonalMatrix.h +./Eigen/src/Core/Dot.h +./Eigen/src/Core/Functors.h +./Eigen/src/Core/PermutationMatrix.h +./Eigen/src/Core/NumTraits.h +./Eigen/src/Core/MatrixBase.h +./Eigen/src/Core/DenseStorage.h +./Eigen/src/Core/util/Memory.h +./Eigen/src/Core/util/StaticAssert.h +./Eigen/src/Core/util/BlasUtil.h +./Eigen/src/Core/util/MatrixMapper.h +./Eigen/src/Core/util/XprHelper.h +./Eigen/src/Core/util/ForwardDeclarations.h +./Eigen/src/Core/util/Meta.h +./Eigen/src/Core/util/Macros.h +./Eigen/src/Core/util/Constants.h +./Eigen/src/Core/CwiseNullaryOp.h +./Eigen/src/Core/Block.h +./Eigen/src/Core/GeneralProduct.h +./Eigen/src/Core/CommaInitializer.h +./Eigen/src/Core/ReturnByValue.h +./Eigen/src/Core/Stride.h +./Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +./Eigen/src/SparseLU/SparseLU_column_dfs.h +./Eigen/src/SparseLU/SparseLU_panel_dfs.h +./Eigen/src/SparseLU/SparseLU_relax_snode.h +./Eigen/src/SparseLU/SparseLU_panel_bmod.h +./Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +./Eigen/src/SparseLU/SparseLU_Utils.h +./Eigen/src/SparseLU/SparseLU_gemm_kernel.h +./Eigen/src/SparseLU/SparseLU_kernel_bmod.h +./Eigen/src/SparseLU/SparseLU_pivotL.h +./Eigen/src/SparseLU/SparseLU_Memory.h +./Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +./Eigen/src/SparseLU/SparseLUImpl.h 
+./Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +./Eigen/src/SparseLU/SparseLU_Structs.h +./Eigen/src/SparseLU/SparseLU.h +./Eigen/src/SparseLU/SparseLU_column_bmod.h +./Eigen/src/SparseLU/SparseLU_pruneL.h +./Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +./Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +./Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +./Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +./Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +./Eigen/src/SparseCholesky/SimplicialCholesky.h +./Eigen/src/Cholesky/LDLT.h +./Eigen/src/Cholesky/LLT.h +./Eigen/src/CholmodSupport/CholmodSupport.h +./Eigen/src/PaStiXSupport/PaStiXSupport.h +./Eigen/src/MetisSupport/MetisSupport.h +./Eigen/StdVector +./Eigen/Core +./Eigen/SparseLU +./Eigen/StdList +./Eigen/StdDeque +./Eigen/SparseCholesky +./scripts/relicense.py +./scripts/relicense.py +./blas/BandTriangularSolver.h +./blas/PackedTriangularMatrixVector.h +./blas/complex_double.cpp +./blas/level2_real_impl.h +./blas/level1_cplx_impl.h +./blas/level1_impl.h +./blas/level1_real_impl.h +./blas/level3_impl.h +./blas/single.cpp +./blas/level2_cplx_impl.h +./blas/PackedSelfadjointProduct.h +./blas/Rank2Update.h +./blas/complex_single.cpp +./blas/PackedTriangularSolverVector.h +./blas/double.cpp +./blas/common.h +./blas/level2_impl.h +./blas/GeneralRank1Update.h + +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. 
"Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. 
+ +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. 
Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. 
Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. + +---------------------------------------------------------------------- +Following applies to: +./doc/UsingIntelMKL.dox +./doc/UsingIntelMKL.dox +./Eigen/src/Eigenvalues/ComplexSchur_MKL.h +./Eigen/src/Eigenvalues/ComplexSchur_MKL.h +./Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h +./Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h +./Eigen/src/Eigenvalues/RealSchur_MKL.h +./Eigen/src/Eigenvalues/RealSchur_MKL.h +./Eigen/src/LU/arch/Inverse_SSE.h +./Eigen/src/LU/arch/Inverse_SSE.h +./Eigen/src/LU/PartialPivLU_MKL.h +./Eigen/src/LU/PartialPivLU_MKL.h +./Eigen/src/QR/HouseholderQR_MKL.h +./Eigen/src/QR/HouseholderQR_MKL.h +./Eigen/src/QR/ColPivHouseholderQR_MKL.h +./Eigen/src/QR/ColPivHouseholderQR_MKL.h +./Eigen/src/SVD/JacobiSVD_MKL.h +./Eigen/src/SVD/JacobiSVD_MKL.h +./Eigen/src/PardisoSupport/PardisoSupport.h +./Eigen/src/PardisoSupport/PardisoSupport.h +./Eigen/src/Core/Assign_MKL.h +./Eigen/src/Core/Assign_MKL.h +./Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h +./Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h +./Eigen/src/Core/products/GeneralMatrixVector_MKL.h +./Eigen/src/Core/products/GeneralMatrixVector_MKL.h +./Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h +./Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h +./Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h +./Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h +./Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h +./Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h +./Eigen/src/Core/products/TriangularMatrixVector_MKL.h +./Eigen/src/Core/products/TriangularMatrixVector_MKL.h +./Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h +./Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h +./Eigen/src/Core/products/TriangularSolverMatrix_MKL.h +./Eigen/src/Core/products/TriangularSolverMatrix_MKL.h +./Eigen/src/Core/util/MKL_support.h +./Eigen/src/Core/util/MKL_support.h +./Eigen/src/Cholesky/LLT_MKL.h +./Eigen/src/Cholesky/LLT_MKL.h + +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. * + Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. * Neither the name of Intel Corporation nor the + names of its contributors may be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +---------------------------------------------------------------------- +Following applies to: + everything under ./bench/btl + + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. 
Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds +of works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, +family, or household purposes, or (2) anything designed or sold for +incorporation into a dwelling. In determining whether a product is a +consumer product, doubtful cases shall be resolved in favor of +coverage. For a particular product received by a particular user, +"normally used" refers to a typical or common use of that class of +product, regardless of the status of the particular user or of the way +in which the particular user actually uses, or expects or is expected +to use, the product. A product is a consumer product regardless of +whether the product has substantial commercial, industrial or +non-consumer uses, unless such uses represent the only significant +mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to +install and execute modified versions of a covered work in that User +Product from a modified version of its Corresponding Source. The +information must suffice to ensure that the continued functioning of +the modified object code is in no case prevented or interfered with +solely because modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include +a requirement to continue to provide support service, warranty, or +updates for a work that has been modified or installed by the +recipient, or for the User Product in which it has been modified or +installed. Access to a network may be denied when the modification +itself materially and adversely affects the operation of the network +or violates the rules and protocols for communication across the +network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material +you add to a covered work, you may (if authorized by the copyright +holders of that material) supplement the terms of this License with +terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions + of it) with contractual assumptions of liability to the recipient, + for any liability that these contractual assumptions directly + impose on those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement +or otherwise) that contradict the conditions of this License, they do +not excuse you from the conditions of this License. If you cannot +convey a covered work so as to satisfy simultaneously your obligations +under this License and any other pertinent obligations, then as a +consequence you may not convey it at all. For example, if you agree +to terms that obligate you to collect a royalty for further conveying +from those to whom you convey the Program, the only way you could +satisfy both those terms and this License would be to refrain entirely +from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT +WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND +PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE +DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR +CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES +AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR +DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL +DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM +(INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED +INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF +THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER +OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these +terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) This program comes + with ABSOLUTELY NO WARRANTY; for details type `show w'. This is + free software, and you are welcome to redistribute it under + certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the +appropriate parts of the General Public License. Of course, your +program's commands might be different; for a GUI interface, you would +use an "about box". + + You should also get your employer (if you work as a programmer) or +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. For more information on this, and how to apply and follow +the GNU GPL, see . + + The GNU General Public License does not permit incorporating your +program into proprietary programs. If your program is a subroutine +library, you may consider it more useful to permit linking proprietary +applications with the library. If this is what you want to do, use +the GNU Lesser General Public License instead of this License. But +first, please read . + + +---------------------------------------------------------------------- +Following applies to: +./test/metis_support.cpp +./test/sparselu.cpp +./unsupported/test/mpreal/mpreal.h +./unsupported/Eigen/src/IterativeSolvers/IterationController.h +./unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h +./unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h +./Eigen/src/OrderingMethods/Amd.h +./Eigen/src/SparseCholesky/SimplicialCholesky_impl.h + + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. 
+ + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the +GNU General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this + license document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this + license document. 
+ + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of + this License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. 
+ + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. + + +---------------------------------------------------------------------- +Following applies to: +./unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h +./unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h +./unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h +./unsupported/Eigen/src/LevenbergMarquardt/LMpar.h +./unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h + +Minpack Copyright Notice (1999) University of Chicago. All rights +reserved + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the +following conditions are met: + +1. Redistributions of source code must retain the above +copyright notice, this list of conditions and the following +disclaimer. + +2. Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials +provided with the distribution. + +3. The end-user documentation included with the +redistribution, if any, must include the following +acknowledgment: + + "This product includes software developed by the + University of Chicago, as Operator of Argonne National + Laboratory. + +Alternately, this acknowledgment may appear in the software +itself, if and wherever such third-party acknowledgments +normally appear. + +4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS" +WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE +UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND +THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE +OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY +OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR +USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF +THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4) +DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION +UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL +BE CORRECTED. + +5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT +HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF +ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT, +INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF +ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF +PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER +SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT +(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE, +EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE +POSSIBILITY OF SUCH LOSS OR DAMAGES. diff --git a/third_party/eigen3/eigen.BUILD b/third_party/eigen3/eigen.BUILD new file mode 100644 index 00000000..0bde8b8a --- /dev/null +++ b/third_party/eigen3/eigen.BUILD @@ -0,0 +1,71 @@ +# Description: +# Eigen is a C++ template library for linear algebra: vectors, +# matrices, and related algorithms. +# This file is mostly stolen from tensorflow. + +licenses([ + # Note: Eigen is an MPL2 library that includes GPL v3 and LGPL v2.1+ code. + # We've taken special care to not reference any restricted code. + "reciprocal", # MPL2 + "notice", # Portions BSD +]) + +exports_files(["COPYING.MPL2"]) + +# License-restricted (i.e. 
not reciprocal or notice) files inside Eigen/...
+EIGEN_RESTRICTED_FILES = [
+    "Eigen/src/OrderingMethods/Amd.h",
+    "Eigen/src/SparseCholesky/**",
+]
+
+# Notable transitive dependencies of restricted files inside Eigen/...
+EIGEN_RESTRICTED_DEPS = [
+    "Eigen/Eigen",
+    "Eigen/IterativeLinearSolvers",
+    "Eigen/MetisSupport",
+    "Eigen/Sparse",
+    "Eigen/SparseCholesky",
+    "Eigen/SparseLU",
+]
+
+# Note: unsupported/Eigen is unsupported and might go away at any time.
+EIGEN_FILES = [
+    "Eigen/**",
+    "unsupported/Eigen/CXX11/**",
+    "unsupported/Eigen/FFT",
+    "unsupported/Eigen/KroneckerProduct",
+    "unsupported/Eigen/src/FFT/**",
+    "unsupported/Eigen/src/KroneckerProduct/**",
+    "unsupported/Eigen/MatrixFunctions",
+    "unsupported/Eigen/SpecialFunctions",
+    "unsupported/Eigen/src/SpecialFunctions/**",
+]
+
+# List of files picked up by glob but actually part of another target.
+EIGEN_EXCLUDE_FILES = [
+    "Eigen/src/Core/arch/AVX/PacketMathGoogleTest.cc",
+]
+
+# Files known to be under MPL2 license.
+EIGEN_MPL2_HEADER_FILES = glob(
+    EIGEN_FILES,
+    exclude = EIGEN_EXCLUDE_FILES +
+              EIGEN_RESTRICTED_FILES +
+              EIGEN_RESTRICTED_DEPS + [
+        # Guarantees any file missed by excludes above will not compile.
+        "Eigen/src/Core/util/NonMPL2.h",
+        "Eigen/**/CMakeLists.txt",
+    ],
+)
+
+cc_library(
+    name = "eigen",
+    hdrs = EIGEN_MPL2_HEADER_FILES,
+    defines = [
+        # This define (mostly) guarantees we don't link any problematic
+        # code. We use it, but we do not rely on it, as evidenced above.
+        "EIGEN_MPL2_ONLY",
+    ],
+    includes = ["."],
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Core b/third_party/eigen3/unsupported/Eigen/CXX11/Core
new file mode 100644
index 00000000..1b369071
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/Core
@@ -0,0 +1,46 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_CORE_MODULE
+#define EIGEN_CXX11_CORE_MODULE
+
+#include <Eigen/Core>
+
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+/** \defgroup CXX11_Core_Module C++11 Core Module
+ *
+ * This module provides common core features for all modules that
+ * explicitly depend on C++11. Currently, this is only the Tensor
+ * module. Note that at this stage, you should not need to include
+ * this module directly.
+ *
+ * It also provides a limited fallback for compilers that don't support
+ * CXX11 yet, such as nvcc.
+ *
+ * \code
+ * #include
+ * \endcode
+ */
+
+// Only a subset of cxx11 is allowed at Google, so we default to emulate the
+// cxx11 functionality that we need.
+#include "src/Core/util/FixedSizeVector.h"
+#if 1
+#include
+#include "src/Core/util/EmulateCXX11Meta.h"
+#else
+#include "src/Core/util/CXX11Workarounds.h"
+#include "src/Core/util/CXX11Meta.h"
+#endif
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+
+#endif // EIGEN_CXX11_CORE_MODULE
+
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint b/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint
new file mode 100644
index 00000000..eb604d38
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint
@@ -0,0 +1,55 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_FIXED_POINT_MODULE
+#define EIGEN_CXX11_FIXED_POINT_MODULE
+
+#include <Eigen/Core>
+#include <stdint.h>
+
+/** \defgroup CXX11_FixedPoint_Module Fixed Point Module
+ *
+ * This module provides common core features for all modules that
+ * explicitly depend on C++11. Currently, this is only the Tensor
+ * module. Note that at this stage, you should not need to include
+ * this module directly.
+ *
+ * It also provides a limited fallback for compilers that don't support
+ * CXX11 yet, such as nvcc.
+ *
+ * \code
+ * #include
+ * \endcode
+ */
+
+#include "src/FixedPoint/FixedPointTypes.h"
+
+// Use optimized implementations whenever available
+#if defined (EIGEN_VECTORIZE_AVX512DQ) || defined (EIGEN_VECTORIZE_AVX512BW)
+#include "src/FixedPoint/PacketMathAVX512.h"
+#include "src/FixedPoint/TypeCastingAVX512.h"
+
+#elif defined EIGEN_VECTORIZE_AVX2
+#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
+#define EIGEN_USE_OPTIMIZED_INT16_INT16_MAT_MAT_PRODUCT
+#include "src/FixedPoint/PacketMathAVX2.h"
+#include "src/FixedPoint/MatMatProductAVX2.h"
+#include "src/FixedPoint/TypeCastingAVX2.h"
+
+#elif defined EIGEN_VECTORIZE_NEON
+#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
+#include "src/FixedPoint/MatMatProductNEON.h"
+#endif
+
+// Use the default implementation when no optimized code is available
+#include "src/FixedPoint/MatMatProduct.h"
+#include "src/FixedPoint/MatVecProduct.h"
+
+
+#endif // EIGEN_CXX11_FIXED_POINT_MODULE
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks b/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks
new file mode 100644
index 00000000..7741b68d
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks
@@ -0,0 +1,35 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_NEURAL_NETWORKS_MODULE
+#define EIGEN_CXX11_NEURAL_NETWORKS_MODULE
+
+#include "unsupported/Eigen/CXX11/Tensor"
+
+/** \defgroup CXX11_NeuralNetworks_Module Neural Networks Module
+ *
+ * This module provides an efficient implementation of the common primitives
+ * used by neural networks.
+ * The primitives are built on top of the tensor library.
+ *
+ * \code
+ * #include
+ * \endcode
+ */
+
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h"
+#include "unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h"
+
+#endif // EIGEN_CXX11_NEURAL_NETWORKS_MODULE
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
new file mode 100644
index 00000000..861a87b6
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -0,0 +1,15 @@
+#include "unsupported/Eigen/CXX11/Tensor"
+
+#ifdef _WIN32
+#ifndef SLEEP_FUNC_HEADER_GUARD
+#define SLEEP_FUNC_HEADER_GUARD
+inline void sleep(unsigned int seconds) { Sleep(1000*seconds); }
+#endif
+
+// On Windows, Eigen will include Windows.h, which defines various
+// macros that conflict with TensorFlow symbols. Undefine them here to
+// prevent clashes.
+#undef DeleteFile
+#undef ERROR
+#undef LoadLibrary
+#endif // _WIN32
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool b/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool
new file mode 100644
index 00000000..d2639af4
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool
@@ -0,0 +1 @@
+#include "unsupported/Eigen/CXX11/ThreadPool"
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
new file mode 100644
index 00000000..6b625abc
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
@@ -0,0 +1,342 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_FIXED_POINT_TYPES_H
+#define EIGEN_CXX11_FIXED_POINT_TYPES_H
+
+#include <cmath>
+#include <iostream>
+
+namespace Eigen {
+
+// The mantissa part of the fixed point representation. See
+// go/tensorfixedpoint for details
+struct QInt8;
+struct QUInt8;
+struct QInt16;
+struct QUInt16;
+struct QInt32;
+
+template <>
+struct NumTraits<QInt8> : GenericNumTraits<QInt8> {};
+template <>
+struct NumTraits<QUInt8> : GenericNumTraits<QUInt8> {};
+template <>
+struct NumTraits<QInt16> : GenericNumTraits<QInt16> {};
+template <>
+struct NumTraits<QUInt16> : GenericNumTraits<QUInt16> {};
+template <>
+struct NumTraits<QInt32> : GenericNumTraits<QInt32> {};
+
+namespace internal {
+template <>
+struct scalar_product_traits {
+  enum {
+    // Cost = NumTraits::MulCost,
+    Defined = 1
+  };
+  typedef QInt32 ReturnType;
+};
+}
+
+// Wrap the 8bit int into a QInt8 struct instead of using a typedef to prevent
+// the compiler from silently type cast the mantissa into a bigger or a smaller
+// representation.
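// ----------------------------------------------------------------------------
// [Editorial note, not part of the upstream patch] In other words: because the
// quantized types below are distinct structs rather than typedefs of int8_t
// and friends, the NumTraits specializations above (and the packet/matmul
// specializations pulled in by the FixedPoint module header) can dispatch on
// the exact quantized width, and arithmetic between quantized values goes
// through the explicitly defined widening operators further down, which return
// QInt32 instead of whatever the built-in integer promotions would pick.
// ----------------------------------------------------------------------------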
+struct QInt8 {
+  QInt8() {}
+  QInt8(const int8_t v) : value(v) {}
+  QInt8(const QInt32 v);
+
+  operator int() const { return static_cast<int>(value); }
+
+  int8_t value;
+};
+
+struct QUInt8 {
+  QUInt8() {}
+  QUInt8(const uint8_t v) : value(v) {}
+  QUInt8(const QInt32 v);
+
+  operator int() const { return static_cast<int>(value); }
+
+  uint8_t value;
+};
+
+struct QInt16 {
+  QInt16() {}
+  QInt16(const int16_t v) : value(v) {}
+  QInt16(const QInt32 v);
+  operator int() const { return static_cast<int>(value); }
+
+  int16_t value;
+};
+
+struct QUInt16 {
+  QUInt16() {}
+  QUInt16(const uint16_t v) : value(v) {}
+  QUInt16(const QInt32 v);
+  operator int() const { return static_cast<int>(value); }
+
+  uint16_t value;
+};
+
+struct QInt32 {
+  QInt32() {}
+  QInt32(const int8_t v) : value(v) {}
+  QInt32(const int32_t v) : value(v) {}
+  QInt32(const uint32_t v) : value(static_cast<int32_t>(v)) {}
+  QInt32(const QInt8 v) : value(v.value) {}
+  QInt32(const float v) : value(static_cast<int32_t>(lrint(v))) {}
+#ifdef EIGEN_MAKING_DOCS
+  // Workaround to fix build on PPC.
+  QInt32(unsigned long v) : value(v) {}
+#endif
+
+  operator float() const { return static_cast<float>(value); }
+
+  int32_t value;
+};
+
+EIGEN_STRONG_INLINE QInt8::QInt8(const QInt32 v)
+    : value(v.value > 127 ? 127 : (v.value < -128 ? -128 : v.value)) {}
+EIGEN_STRONG_INLINE QUInt8::QUInt8(const QInt32 v)
+    : value(v.value > 255 ? 255 : (v.value < 0 ? 0 : v.value)) {}
+EIGEN_STRONG_INLINE QInt16::QInt16(const QInt32 v)
+    : value(v.value > 32767 ? 32767 : (v.value < -32768 ? -32768 : v.value)) {}
+EIGEN_STRONG_INLINE QUInt16::QUInt16(const QInt32 v)
+    : value(v.value > 65535 ? 65535 : (v.value < 0 ? 0 : v.value)) {}
+
+// Basic widening 8-bit operations: This will be vectorized in future CLs.
+EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt8 b) {
+  return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
+}
+EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QUInt8 b) {
+  return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
+}
+EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt8 b) {
+  return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value));
+}
+EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt8 b) {
+  return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value));
+}
+
+// Basic widening 16-bit operations: This will be vectorized in future CLs.
+EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt16 b) {
+  return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
+}
+EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QUInt16 b) {
+  return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
+}
+EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt16 b) {
+  return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value));
+}
+EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt16 b) {
+  return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value));
+}
+
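// ----------------------------------------------------------------------------
// [Editorial sketch, not part of the upstream patch] A hypothetical helper
// showing how the definitions above compose: the widening operator* keeps the
// exact 32-bit product, and the converting QInt8(QInt32) constructor saturates
// when narrowing back down. The name is illustrative only.
EIGEN_STRONG_INLINE QInt8 ExampleRequantize(const QInt8 a, const QInt8 b) {
  QInt32 wide = a * b;  // exact product, e.g. 100 * 50 == 5000
  return QInt8(wide);   // clamped to [-128, 127] by the constructor above
}
// ----------------------------------------------------------------------------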
+EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt8 b) { + return QInt32(a.value + static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt32 b) { + return QInt32(static_cast(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt8 b) { + return QInt32(a.value - static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt32 b) { + return QInt32(static_cast(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt8 b) { + return QInt32(a.value * static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt32 b) { + return QInt32(static_cast(a.value) * b.value); +} + +// Mixed QInt32 op QInt16 operations. This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt16 b) { + return QInt32(a.value + static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt32 b) { + return QInt32(static_cast(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt16 b) { + return QInt32(a.value - static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt32 b) { + return QInt32(static_cast(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt16 b) { + return QInt32(a.value * static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt32 b) { + return QInt32(static_cast(a.value) * b.value); +} + +// Mixed QInt32 op QUInt8 operations. This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt8 b) { + return QInt32(a.value + static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QUInt8 a, const QInt32 b) { + return QInt32(static_cast(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt8 b) { + return QInt32(a.value - static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QUInt8 a, const QInt32 b) { + return QInt32(static_cast(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt8 b) { + return QInt32(a.value * static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QUInt8 a, const QInt32 b) { + return QInt32(static_cast(a.value) * b.value); +} + +// Mixed QInt32 op QUInt16 operations. This will be vectorized in future CLs. +EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt16 b) { + return QInt32(a.value + static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator+(const QUInt16 a, const QInt32 b) { + return QInt32(static_cast(a.value) + b.value); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt16 b) { + return QInt32(a.value - static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator-(const QUInt16 a, const QInt32 b) { + return QInt32(static_cast(a.value) - b.value); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt16 b) { + return QInt32(a.value * static_cast(b.value)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const QUInt16 a, const QInt32 b) { + return QInt32(static_cast(a.value) * b.value); +} + +// Basic arithmetic operations on QInt32, which behaves like a int32_t. 
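Those plain 32-bit operators follow below. Going the other way, narrowing an accumulated QInt32 back to 8 bits goes through the clamping constructors defined near the top of this file; a scalar sketch of that behaviour is shown here (the helper name is illustrative, not part of the patch).

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar sketch of the clamping done by QInt8(QInt32) above.
static int8_t saturate_to_int8(int32_t v) {
  return static_cast<int8_t>(std::min(127, std::max(-128, v)));
}

int main() {
  int32_t acc = int32_t(100) * int32_t(100);            // widened product
  std::printf("%d %d\n", acc, saturate_to_int8(acc));   // 10000 127
  std::printf("%d\n", saturate_to_int8(-500));          // -128
  return 0;
}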
+EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt32 b) { + return a.value + b.value; +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt32 b) { + return a.value - b.value; +} +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt32 b) { + return a.value * b.value; +} +EIGEN_STRONG_INLINE QInt32 operator/(const QInt32 a, const QInt32 b) { + return a.value / b.value; +} +EIGEN_STRONG_INLINE QInt32& operator+=(QInt32& a, const QInt32 b) { + a.value += b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32& operator-=(QInt32& a, const QInt32 b) { + a.value -= b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const QInt32 b) { + a.value *= b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32& operator/=(QInt32& a, const QInt32 b) { + a.value /= b.value; + return a; +} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a) { + return -a.value; +} + +// Scaling QInt32 by double. We do the arithmetic in double because +// float only has 23 bits of mantissa, so casting QInt32 to float might reduce +// accuracy by discarding up to 7 (least significant) bits. +EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const double b) { + return static_cast(lrint(static_cast(a.value) * b)); +} +EIGEN_STRONG_INLINE QInt32 operator*(const double a, const QInt32 b) { + return static_cast(lrint(a * static_cast(b.value))); +} +EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const double b) { + a.value = static_cast(lrint(static_cast(a.value) * b)); + return a; +} + +// Comparisons +EIGEN_STRONG_INLINE bool operator==(const QInt8 a, const QInt8 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QUInt8 a, const QUInt8 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QInt16 a, const QInt16 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QUInt16 a, const QUInt16 b) { + return a.value == b.value; +} +EIGEN_STRONG_INLINE bool operator==(const QInt32 a, const QInt32 b) { + return a.value == b.value; +} + +EIGEN_STRONG_INLINE bool operator<(const QInt8 a, const QInt8 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QUInt8 a, const QUInt8 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QInt16 a, const QInt16 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QUInt16 a, const QUInt16 b) { + return a.value < b.value; +} +EIGEN_STRONG_INLINE bool operator<(const QInt32 a, const QInt32 b) { + return a.value < b.value; +} + +EIGEN_STRONG_INLINE bool operator>(const QInt8 a, const QInt8 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QUInt8 a, const QUInt8 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QInt16 a, const QInt16 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QUInt16 a, const QUInt16 b) { + return a.value > b.value; +} +EIGEN_STRONG_INLINE bool operator>(const QInt32 a, const QInt32 b) { + return a.value > b.value; +} + +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt8 a) { + os << static_cast(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QUInt8 a) { + os << static_cast(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt16 a) { + os << static_cast(a.value); + return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QUInt16 a) { + os << static_cast(a.value); + 
return os; +} +EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt32 a) { + os << a.value; + return os; +} + +} // namespace Eigen + +#endif // EIGEN_CXX11_FIXED_POINT_TYPES_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h new file mode 100644 index 00000000..4d0dca07 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h @@ -0,0 +1,255 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H +#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H + + +namespace Eigen { +namespace internal { + +// Accumulate the product of 2 QInt8 inputs on 32 bits to prevent +// overflows +template<> struct scalar_product_traits +{ + enum { + Defined = 1 + }; + typedef QInt32 ReturnType; +}; + +// Accumulate the product of QInt8 inputs with QUint8 inputs on 32 bits +// to prevent overflows +template<> struct scalar_product_traits +{ + enum { + Defined = 1 + }; + typedef QInt32 ReturnType; +}; + +// Description of the product implementation. It's pretty simple now since +// nothing is vectorized yet. +// This definition tackle the case where both lhs and rhs are encoded using +// signed 8bit integers +#ifndef EIGEN_USE_OPTIMIZED_INT8_INT8_MAT_MAT_PRODUCT + +template +class gebp_traits +{ +public: + typedef QInt8 LhsScalar; + typedef QInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + +// The signed 8bit Mat-Mat product itself. +template +struct gebp_kernel +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +EIGEN_DONT_INLINE +void gebp_kernel +::operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + + +// This definition tackle the case where the lhs is encoded using signed 8bit +// integers and the rhs using unsigned 8bit integers. 
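That mixed signed/unsigned specialization follows the same triple loop as the signed/signed kernel above. Stripped of the Eigen traits and mappers, the loop over packed blocks amounts to the scalar sketch below; function and parameter names are illustrative, and it assumes blockA packs each lhs row contiguously along depth, blockB packs each rhs column contiguously along depth, and the result is column-major.

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar sketch (illustration only) of the reference kernel above:
// every 8-bit product is widened to int32_t before accumulation.
void ref_gebp(int32_t* res, int res_stride,
              const int8_t* blockA, const int8_t* blockB,
              int rows, int depth, int cols) {
  for (int j = 0; j < cols; ++j) {
    const int startB = j * depth;
    for (int i = 0; i < rows; ++i) {
      const int startA = i * depth;
      int32_t acc = 0;
      for (int k = 0; k < depth; ++k) {
        acc += static_cast<int32_t>(blockA[startA + k]) * blockB[startB + k];
      }
      res[j * res_stride + i] += acc;
    }
  }
}

int main() {
  const int rows = 2, depth = 3, cols = 2;
  const int8_t blockA[rows * depth] = {1, 2, 3, 4, 5, 6};   // rows x depth
  const int8_t blockB[cols * depth] = {1, 0, -1, 2, 2, 2};  // cols x depth
  std::vector<int32_t> res(rows * cols, 0);
  ref_gebp(res.data(), rows, blockA, blockB, rows, depth, cols);
  // res(0,0) = 1*1 + 2*0 + 3*(-1) = -2
  std::printf("%d %d %d %d\n", res[0], res[1], res[2], res[3]);
  return 0;
}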
+#ifndef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT +template +class gebp_traits +{ +public: + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + +// Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs +template +struct gebp_kernel +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +EIGEN_DONT_INLINE +void gebp_kernel +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + + +// This definition tackle the case where the khs is encoded using unsigned 8bit +// integers and the rhs using signed 8bit integers. 
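All three scalar kernels rely on scalar_product_traits specializations to force the accumulator type to QInt32. The standalone sketch below shows the same trait pattern with hypothetical names (product_traits, dot); it is an illustration of the mechanism, not the Eigen definitions themselves.

#include <cstdint>

// Map a pair of narrow scalar types to a wide accumulator type.
template <typename Lhs, typename Rhs>
struct product_traits;   // primary template intentionally undefined

template <>
struct product_traits<int8_t, int8_t>  { typedef int32_t ReturnType; };
template <>
struct product_traits<int8_t, uint8_t> { typedef int32_t ReturnType; };
template <>
struct product_traits<uint8_t, int8_t> { typedef int32_t ReturnType; };

template <typename Lhs, typename Rhs>
typename product_traits<Lhs, Rhs>::ReturnType
dot(const Lhs* a, const Rhs* b, int n) {
  typename product_traits<Lhs, Rhs>::ReturnType acc = 0;
  for (int i = 0; i < n; ++i) acc += static_cast<int32_t>(a[i]) * b[i];
  return acc;  // accumulated in 32 bits, as the kernels require
}

int main() {
  const int8_t a[3] = {1, -2, 3};
  const uint8_t b[3] = {4, 5, 200};
  return dot(a, b, 3) == (1 * 4 - 2 * 5 + 3 * 200) ? 0 : 1;
}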
+#ifndef EIGEN_USE_OPTIMIZED_UINT8_INT8_MAT_MAT_PRODUCT +template +class gebp_traits +{ +public: + typedef QUInt8 LhsScalar; + typedef QInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + + +// Mat-Mat product of an unsigned 8bit lhs with a signed 8bit rhs +template +struct gebp_kernel +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +EIGEN_DONT_INLINE +void gebp_kernel +::operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + +} // namespace internal +} // namespace Eigen + + + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h new file mode 100644 index 00000000..6b4b0edc --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h @@ -0,0 +1,1754 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// Copyright (C) 2015 Matthew Sarett +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H +#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H + +namespace Eigen { +namespace internal { + +// AVX2 optimized implementation of Mat-Mat product. +// LHS is encoded using signed 8-bit integers. +// RHS is encoded using unsigned 8-bit integers. +#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT + +// Define quantized traits +template +class gebp_traits +{ +public: + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // Define register blocking scheme. + nr = 32, + mr = 32, + kr = 8, + // Ignore progress tracking per loop iteration. + LhsProgress = -1, + RhsProgress = -1 + }; +}; + +// Specialized blocking for quantized implementations. +// Used by TensorContractionThreadPool, inputs must have dimensions that are +// multiples of 32. 
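The blocking class that follows rounds every dimension up to the next multiple of 32 and, when sharding by column, divides the columns across threads before rounding. The arithmetic in isolation looks like the sketch below; the helper names are illustrative only.

#include <cassert>

// Round x up to the next multiple of 32, as used for kc_/mc_/nc_ below.
inline long round_up_32(long x) { return ((x + 31) / 32) * 32; }

// Columns handed to each thread in the ShardByCol case:
// (((n / num_threads) + 31) / 32) * 32
inline long cols_per_thread(long n, long num_threads) {
  return round_up_32(n / num_threads);
}

int main() {
  assert(round_up_32(1) == 32);
  assert(round_up_32(32) == 32);
  assert(round_up_32(33) == 64);
  assert(cols_per_thread(256, 3) == 96);   // 256/3 = 85, rounded up to 96
  return 0;
}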
+template +class TensorContractionBlocking, TensorContractionInputMapper, Index, ShardingType> { + public: + + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + + TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + kc_(k), mc_(m), nc_(n) + { + eigen_assert(m % 32 == 0); + eigen_assert(k % 32 == 0); + if (!k || !m || !n) { + return; + } + + if (ShardingType == ShardByCol) { + eigen_assert(n % 32 == 0); + nc_ = (((n / num_threads) + 31) / 32) * 32; + } + else { + eigen_assert(n % 32 == 0 || n == 1); + // Special case to avoid breaking the unimplemented matrix-vector case + if (n == 1) { + nc_ = 32; + } + mc_ = (((m / num_threads) + 31) / 32) * 32; + } + } + + EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } + EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } + EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } + + private: + Index kc_; + Index mc_; + Index nc_; +}; + +// Specialized blocking for quantized implementations. +// Used by TensorContraction and GeneralMatrixMatrix, inputs are padded to +// multiples of 32. +template +class gemm_blocking_space + : public level3_blocking { + DenseIndex m_sizeA; + DenseIndex m_sizeB; + + public: + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, + DenseIndex /*num_threads*/, bool /*l3_blocking*/) { + this->m_mc = ((rows + 31) / 32) * 32; + this->m_nc = ((cols + 31) / 32) * 32; + this->m_kc = ((depth + 31) / 32) * 32; + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } + void allocateA() { + if (this->m_blockA == 0) this->m_blockA = aligned_new(m_sizeA); + } + void allocateB() { + if (this->m_blockB == 0) this->m_blockB = aligned_new(m_sizeB); + } + void allocateAll() { + allocateA(); + allocateB(); + } + ~gemm_blocking_space() { + aligned_delete(this->m_blockA, m_sizeA); + aligned_delete(this->m_blockB, m_sizeB); + } +}; + + +template +class gemm_blocking_space + : public level3_blocking { + DenseIndex m_sizeA; + DenseIndex m_sizeB; + + public: + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, + DenseIndex /*num_threads*/, bool /*l3_blocking*/) { + this->m_mc = ((rows + 31) / 32) * 32; + this->m_nc = ((cols + 31) / 32) * 32; + this->m_kc = ((depth + 31) / 32) * 32; + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } + void allocateA() { + if (this->m_blockA == 0) this->m_blockA = aligned_new(m_sizeA); + } + void allocateB() { + if (this->m_blockB == 0) this->m_blockB = aligned_new(m_sizeB); + } + void allocateAll() { + allocateA(); + allocateB(); + } + ~gemm_blocking_space() { + aligned_delete(this->m_blockA, m_sizeA); + aligned_delete(this->m_blockB, m_sizeB); + } +}; + +// Alternate templates for any input sizes +template +struct gemm_pack_lhs_any; +template +struct gemm_pack_lhs_any { + EIGEN_DONT_INLINE void operator() + (QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0); +}; + +template +struct gemm_pack_rhs_any; +template +struct gemm_pack_rhs_any { + EIGEN_DONT_INLINE void operator() + (QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0); +}; + +template +struct gebp_kernel_any; +template +struct gebp_kernel_any +{ + typedef typename DataMapper::LinearMapper LinearMapper; + + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +// 
Alternate implementations for any input sizes +template +EIGEN_DONT_INLINE void gemm_pack_lhs_any:: +operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Get vector pointer + __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA); + + // Get even multiples of the dimensions + Index rows_32 = (rows / 32) * 32; + Index depth_8 = (depth / 8) * 8; + + // Get padding for when depth is not a multiple of 32 + int padding = 0; + if (depth % 32 != 0) { + int depth_32 = (depth / 32) * 32; + int extra_depth = depth - depth_32; + int extra_depth_8 = ((extra_depth + 7) / 8) * 8; + padding = 32 - extra_depth_8; + } + + // Pack rows in sets of 32 + for (Index m = 0; m < rows_32; m += 32) { + // Pack depth in sets of 8 + for (Index k = 0; k < depth_8; k += 8) { + // Load vectors + __m256i L_A = lhs.loadPacket(m, k); + __m256i L_B = lhs.loadPacket(m, k + 1); + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + + __m256i L_C = lhs.loadPacket(m, k + 2); + __m256i L_D = lhs.loadPacket(m, k + 3); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing for 32 x 8 block + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_E = lhs.loadPacket(m, k + 4); + __m256i L_F = lhs.loadPacket(m, k + 5); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_G = lhs.loadPacket(m, k + 6); + __m256i L_H = lhs.loadPacket(m, k + 7); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + + // Finish the k dimension, padding with zeros + if (depth_8 < depth) { + __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H; + switch (depth - depth_8) { + case 1: + L_A = lhs.loadPacket(m, depth_8); 
+ L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 2: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 3: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 4: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 5: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = lhs.loadPacket(m, depth_8 + 4); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 6: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = lhs.loadPacket(m, depth_8 + 4); + L_F = lhs.loadPacket(m, depth_8 + 5); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 7: + L_A = lhs.loadPacket(m, depth_8); + L_B = lhs.loadPacket(m, depth_8 + 1); + L_C = lhs.loadPacket(m, depth_8 + 2); + L_D = lhs.loadPacket(m, depth_8 + 3); + L_E = lhs.loadPacket(m, depth_8 + 4); + L_F = lhs.loadPacket(m, depth_8 + 5); + L_G = lhs.loadPacket(m, depth_8 + 6); + L_H = _mm256_setzero_si256(); + break; + } + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = 
_mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + blockA_256 += padding; + } + + // Finish the m dimension, padding with zeros + if (rows_32 < rows) { + // Pack depth in sets of 8 + for (Index k = 0; k < depth_8; k += 8) { + // Load vectors + __m256i L_A = _mm256_setzero_si256(); + __m256i L_B = _mm256_setzero_si256(); + __m256i L_C = _mm256_setzero_si256(); + __m256i L_D = _mm256_setzero_si256(); + __m256i L_E = _mm256_setzero_si256(); + __m256i L_F = _mm256_setzero_si256(); + __m256i L_G = _mm256_setzero_si256(); + __m256i L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + QInt8* ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, k); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, k + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, k + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, k + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, k + 4); + ptr = (QInt8*) &L_F; + ptr[m] = lhs(rows_32 + m, k + 5); + ptr = (QInt8*) &L_G; + ptr[m] = lhs(rows_32 + m, k + 6); + ptr = (QInt8*) &L_H; + ptr[m] = lhs(rows_32 + m, k + 7); + } + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing for 32 x 8 block + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + 
__m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + + // Finish the k dimension, padding with zeros + if (depth_8 < depth) { + __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H; + QInt8* ptr; + switch (depth - depth_8) { + case 1: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + QInt8* ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + } + break; + case 2: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + } + break; + case 3: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + } + break; + case 4: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + } + break; + case 5: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + } + break; + case 6: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + 
ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + ptr = (QInt8*) &L_F; + ptr[m] = lhs(rows_32 + m, depth_8 + 5); + } + break; + case 7: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*) &L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*) &L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*) &L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*) &L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*) &L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + ptr = (QInt8*) &L_F; + ptr[m] = lhs(rows_32 + m, depth_8 + 5); + ptr = (QInt8*) &L_G; + ptr[m] = lhs(rows_32 + m, depth_8 + 6); + } + break; + } + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + } +} + +template +EIGEN_DONT_INLINE void gemm_pack_rhs_any:: +operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Get vector pointer + __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB); + + // Get even multiples of the dimensions + Index cols_32 = (cols / 32) * 32; + Index depth_32 = (depth / 32) * 32; + + // Perform a step of 
the packing for 4 columns + __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24; +#define PACK_STEP \ + R_AB_L = _mm256_unpacklo_epi64(R_A, R_B); \ + R_CD_L = _mm256_unpacklo_epi64(R_C, R_D); \ + R_AB_H = _mm256_unpackhi_epi64(R_A, R_B); \ + R_CD_H = _mm256_unpackhi_epi64(R_C, R_D); \ + R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20); \ + R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \ + R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20); \ + R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \ + _mm256_store_si256(blockB_256, R_AD_0); \ + _mm256_store_si256(blockB_256 + 8, R_AD_8); \ + _mm256_store_si256(blockB_256 + 16, R_AD_16); \ + _mm256_store_si256(blockB_256 + 24, R_AD_24); \ + blockB_256++; + + // Pack cols in sets of 32 + for (Index n = 0; n < cols_32; n += 32) { + // Pack depth in sets of 32 + for (Index k = 0; k < depth_32; k += 32) { + __m256i R_A = rhs.loadPacket(k, n); + __m256i R_B = rhs.loadPacket(k, n + 1); + __m256i R_C = rhs.loadPacket(k, n + 2); + __m256i R_D = rhs.loadPacket(k, n + 3); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 4); + R_B = rhs.loadPacket(k, n + 5); + R_C = rhs.loadPacket(k, n + 6); + R_D = rhs.loadPacket(k, n + 7); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 8); + R_B = rhs.loadPacket(k, n + 9); + R_C = rhs.loadPacket(k, n + 10); + R_D = rhs.loadPacket(k, n + 11); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 12); + R_B = rhs.loadPacket(k, n + 13); + R_C = rhs.loadPacket(k, n + 14); + R_D = rhs.loadPacket(k, n + 15); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 16); + R_B = rhs.loadPacket(k, n + 17); + R_C = rhs.loadPacket(k, n + 18); + R_D = rhs.loadPacket(k, n + 19); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 20); + R_B = rhs.loadPacket(k, n + 21); + R_C = rhs.loadPacket(k, n + 22); + R_D = rhs.loadPacket(k, n + 23); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 24); + R_B = rhs.loadPacket(k, n + 25); + R_C = rhs.loadPacket(k, n + 26); + R_D = rhs.loadPacket(k, n + 27); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 28); + R_B = rhs.loadPacket(k, n + 29); + R_C = rhs.loadPacket(k, n + 30); + R_D = rhs.loadPacket(k, n + 31); + PACK_STEP; + + blockB_256 += 24; + } + + if (depth_32 < depth) { + QUInt8* ptr; + __m256i R_A = _mm256_setzero_si256(); + __m256i R_B = _mm256_setzero_si256(); + __m256i R_C = _mm256_setzero_si256(); + __m256i R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 2); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 3); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 4); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 5); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 6); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 7); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 8); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 9); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 10); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 
11); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 12); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 13); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 14); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 15); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 16); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 17); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 18); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 19); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 20); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 21); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 22); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 23); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 24); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 25); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 26); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 27); + } + PACK_STEP; + + R_A = _mm256_setzero_si256(); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n + 28); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 29); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 30); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 31); + } + PACK_STEP; + blockB_256 += 24; + } + } + + // Finish packing cols + if (cols_32 < cols) { + // Pack depth in sets of 32 + for (Index k = 0; k < depth_32; k += 32) { + __m256i R_A, R_B, R_C, R_D; + Index n; + for (n = cols_32; n < cols; n += 4) { + switch (cols - n) { + case 1: + R_A = rhs.loadPacket(k, n); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + case 2: + R_A = rhs.loadPacket(k, n); + R_B = rhs.loadPacket(k, n + 1); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + case 3: + R_A = rhs.loadPacket(k, n); + R_B = rhs.loadPacket(k, n + 1); + R_C = rhs.loadPacket(k, n + 2); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + default: + R_A = rhs.loadPacket(k, n); + R_B = rhs.loadPacket(k, n + 1); + R_C = rhs.loadPacket(k, n + 2); + R_D = rhs.loadPacket(k, n + 3); + PACK_STEP; + break; + } + } + + // Increment the block pointer. + // We must pad if cols is not a multiple of 32. 
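The increment on the line that follows keeps each 32-depth by 32-column tile of the packed rhs at a fixed stride of 32 vectors: after s PACK_STEPs for a ragged column tail, the pointer advances by 32 - s so the next tile starts exactly one full tile later. A small standalone check of that accounting (illustration only, assuming the +0/+8/+16/+24 store offsets of PACK_STEP):

#include <cassert>

int main() {
  for (int cols = 33; cols < 64; ++cols) {
    int cols_32 = (cols / 32) * 32;
    int steps = (cols - cols_32 + 3) / 4;   // PACK_STEPs used for the tail
    int highest_slot = (steps - 1) + 24;    // last store lands at +0/+8/+16/+24
    assert(highest_slot < 32);              // stays inside the 32-vector tile
    assert(steps + (32 - steps) == 32);     // total advance is one full tile
  }
  return 0;
}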
+ blockB_256 += 32 - (n - cols_32) / 4; + } + + if (depth_32 < depth) { + for (Index n = cols_32; n < cols; n += 4) { + QUInt8* ptr; + __m256i R_A = _mm256_setzero_si256(); + __m256i R_B = _mm256_setzero_si256(); + __m256i R_C = _mm256_setzero_si256(); + __m256i R_D = _mm256_setzero_si256(); + switch (cols - n) { + case 1: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + } + PACK_STEP; + break; + case 2: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + } + PACK_STEP; + break; + case 3: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 2); + } + PACK_STEP; + break; + default: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*) &R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*) &R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*) &R_C; + ptr[k - depth_32] = rhs(k, n + 2); + ptr = (QUInt8*) &R_D; + ptr[k - depth_32] = rhs(k, n + 3); + } + PACK_STEP; + break; + } + } + } + } +#undef PACK_STEP +} + +template +EIGEN_DONT_INLINE +void gebp_kernel_any +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + Index rows_32 = ((rows + 31) / 32) * 32; + Index cols_32 = ((cols + 31) / 32) * 32; + Index depth_32 = ((depth + 31) / 32) * 32; + + // Create result block + ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0); + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + + // Get vectorized pointers + __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO); + const __m256i* blockA_256 = reinterpret_cast(blockA); + const __m256i* blockB_256 = reinterpret_cast(blockB); + + // Loop over blocks of 32 columns + for (Index n = 0; n < cols_32; n += 32) { + // Reset index into blockA + Index indexL = 0; + // Loop over blocks of 32 rows + for (Index m = 0; m < rows_32; m += 32) { + // Reset index into blockB + Index indexR = n / 32 * depth_32; + // Loop over blocks of 8 on depth + for (Index k = 0; k < depth_32; k += 8) { + // Load inputs + __m256i L_AD0 = blockA_256[indexL++]; + __m256i L_AD8 = blockA_256[indexL++]; + __m256i L_AD16 = blockA_256[indexL++]; + __m256i L_AD24 = blockA_256[indexL++]; + __m256i L_EH0 = blockA_256[indexL++]; + __m256i L_EH8 = blockA_256[indexL++]; + __m256i L_EH16 = blockA_256[indexL++]; + __m256i L_EH24 = blockA_256[indexL++]; + __m256i R_AH0 = blockB_256[indexR++]; + __m256i R_AH4 = blockB_256[indexR++]; + __m256i R_AH8 = blockB_256[indexR++]; + __m256i R_AH12 = blockB_256[indexR++]; + __m256i R_AH16 = blockB_256[indexR++]; + __m256i R_AH20 = blockB_256[indexR++]; + __m256i R_AH24 = blockB_256[indexR++]; + __m256i R_AH28 = blockB_256[indexR++]; + + // This constant is used with madd to convert 16 bit to 32 bit + const __m256i ONE = 
_mm256_set1_epi32(0x00010001); + + // Declare variables used in COMPUTE_STEP + __m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32; + +#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET) \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 1, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 2, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 3, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32)); + + // Permute and shuffle to copy a single value across the entire vector + // Then compute the multiplication + __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00); + __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 0); + __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 1); + R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11); + __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 2); + __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 3); + + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 4); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 5); + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 6); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 7); + + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 8); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 9); + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = 
_mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 10); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 11); + + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 12); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 13); + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 14); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 15); + + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 16); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 17); + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 18); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 19); + + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 20); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 21); + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 22); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 23); + + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 24); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 25); + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 26); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 27); + + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 28); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 29); + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 30); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 31); + +#undef COMPUTE_STEP + } + + // Transfer the results to the result matrix. 
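Before the write-back below, it is worth spelling out what one maddubs/madd pair inside COMPUTE_STEP produces: _mm256_maddubs_epi16 multiplies unsigned rhs bytes with signed lhs bytes and adds adjacent pairs with 16-bit saturation, and _mm256_madd_epi16 with the 0x00010001 constant then sums adjacent 16-bit results into 32 bits. The scalar model below reproduces one 32-bit lane of that sequence (illustration only, names hypothetical).

#include <algorithm>
#include <cstdint>
#include <cstdio>

// One 32-bit lane of a maddubs/madd pair:
//   t0 = sat16(u0*s0 + u1*s1), t1 = sat16(u2*s2 + u3*s3), lane = t0 + t1
static int16_t sat16(int32_t v) {
  return static_cast<int16_t>(std::min(32767, std::max(-32768, v)));
}

int32_t maddubs_madd_lane(const uint8_t u[4], const int8_t s[4]) {
  int16_t t0 = sat16(int32_t(u[0]) * s[0] + int32_t(u[1]) * s[1]);
  int16_t t1 = sat16(int32_t(u[2]) * s[2] + int32_t(u[3]) * s[3]);
  return int32_t(t0) + int32_t(t1);
}

int main() {
  const uint8_t u[4] = {1, 2, 3, 4};        // packed rhs bytes (unsigned)
  const int8_t s[4] = {10, -20, 30, -40};   // packed lhs bytes (signed)
  // 1*10 + 2*(-20) + 3*30 + 4*(-40) = -100
  std::printf("%d\n", maddubs_madd_lane(u, s));
  return 0;
}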
+ if (m + 32 <= rows && n + 32 <= cols) { + Index i = 0; + for (Index j = n; j < n + 32; j++) { + LinearMapper r0 = res.getLinearMapper(m, j); + LinearMapper r1 = res.getLinearMapper(m + 8, j); + LinearMapper r2 = res.getLinearMapper(m + 16, j); + LinearMapper r3 = res.getLinearMapper(m + 24, j); + r0.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0))); + r1.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0))); + r2.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0))); + r3.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0))); + } + } + else { + for (Index j = n; j < cols; j++) { + for (Index i = m; i < rows; i++) { + res(i, j) = blockO[(j - n) * 32 + (i - m)]; + } + } + } + + // Zero the result block so it can be reused + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + } + } +} + +// Below are the fully optimized versions that are correct only for sizes that +// are multiple of 32. It is about a 10% performance benefit to keep these +// implementations separate. + +// Arrange a block of the left input matrix in contiguous memory. +// +// Given column major input (A0 beside A1 in memory): +// A0 B0 C0 D0 E0 F0 G0 H0 ... +// A1 B1 C1 D1 E1 F1 G1 H1 ... +// A2 B2 C2 D2 E2 F2 G2 H2 ... +// A3 B3 C3 D3 E3 F3 G3 H3 ... +// A4 B4 C4 D4 E4 F4 G4 H4 ... +// A5 B5 C5 D5 E5 F5 G5 H5 ... +// A6 B6 C6 D6 E6 F6 G6 H6 ... +// A7 B7 C7 D7 E7 F7 G7 H7 ... +// A8 ... +// ... +// +// Packing yields output (A0 beside B0 in memory): +// A0 B0 C0 D0 +// A1 B1 C1 D1 +// A2 B2 C2 D2 +// A3 B3 C3 D3 +// A4 B4 C4 D4 +// A5 B5 C5 D5 +// A6 B6 C6 D6 +// A7 B7 C7 D7 +// ... +// A31 B31 C31 D31 +// E0 F0 G0 H0 +// E1 F1 G1 H1 +// E2 F2 G2 H2 +// E3 F3 G3 H3 +// E4 F4 G4 H4 +// E5 F5 G5 H5 +// E6 F6 G6 H6 +// E7 F7 G7 H7 +// ... +// +// Four elements of the same row are arranged contiguously because maddubs and +// madd both perform an adjacent addition in the kernel. 
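The gemm_pack_lhs specialization that follows produces exactly this layout with AVX2 unpack/permute sequences. As a plain scalar reference of the documented ordering, the sketch below packs one 32-row by 8-depth block; it assumes a column-major lhs addressed through a raw pointer with leading dimension lhs_rows, and the function and parameter names are illustrative only.

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar reference (illustration only) of the documented 32x8 lhs block
// layout: rows 0..31 of depth columns A..D first, four adjacent depth
// entries per row, then rows 0..31 of depth columns E..H.
void pack_lhs_block_reference(int8_t* out, const int8_t* lhs, int lhs_rows,
                              int m0, int k0) {
  int idx = 0;
  for (int half = 0; half < 2; ++half) {      // columns A..D, then E..H
    for (int r = 0; r < 32; ++r) {            // the 32 rows of the block
      for (int c = 0; c < 4; ++c) {           // 4 adjacent depth entries
        out[idx++] = lhs[(k0 + half * 4 + c) * lhs_rows + (m0 + r)];
      }
    }
  }
}

int main() {
  const int rows = 32, depth = 8;
  std::vector<int8_t> lhs(rows * depth);
  for (int i = 0; i < rows * depth; ++i) lhs[i] = static_cast<int8_t>(i);
  std::vector<int8_t> packed(32 * 8);
  pack_lhs_block_reference(packed.data(), lhs.data(), rows, 0, 0);
  // The first packed row holds A0 B0 C0 D0, i.e. lhs(0, 0..3).
  std::printf("%d %d %d %d\n", packed[0], packed[1], packed[2], packed[3]);
  return 0;
}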
+template +struct gemm_pack_lhs { + EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs, + Index depth, Index rows, Index stride = 0, + Index offset = 0); +}; + +template +EIGEN_DONT_INLINE void gemm_pack_lhs:: +operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, + Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Use alternate function for weird sizes + if (rows % 32 != 0 || depth % 32 != 0) { + gemm_pack_lhs_any lhs_pack; + return lhs_pack(blockA, lhs, depth, rows, stride, offset); + } + + // Get vector pointer + __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA); + + // Pack rows in sets of 32 + for (Index m = 0; m < rows; m += 32) { + // Pack depth in sets of 8 + for (Index k = 0; k < depth; k += 8) { + // Load vectors + __m256i L_A = lhs.loadPacket(m, k); + __m256i L_B = lhs.loadPacket(m, k + 1); + + // Interleave 8-bit elements + __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); + __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); + + __m256i L_C = lhs.loadPacket(m, k + 2); + __m256i L_D = lhs.loadPacket(m, k + 3); + __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); + __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); + + // Interleave 16-bit elements + __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16); + __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16); + + // Use permute before we store to cross 128-bit lanes + __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20); + _mm256_store_si256(blockA_256++, L_AD0); + + // Complete packing for 32 x 8 block + __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31); + __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24); + __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20); + _mm256_store_si256(blockA_256++, L_AD8); + _mm256_store_si256(blockA_256++, L_AD16); + __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); + _mm256_store_si256(blockA_256++, L_AD24); + __m256i L_E = lhs.loadPacket(m, k + 4); + __m256i L_F = lhs.loadPacket(m, k + 5); + __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); + __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); + __m256i L_G = lhs.loadPacket(m, k + 6); + __m256i L_H = lhs.loadPacket(m, k + 7); + __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); + __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); + __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16); + __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20); + _mm256_store_si256(blockA_256++, L_EH0); + __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31); + __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24); + __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20); + _mm256_store_si256(blockA_256++, L_EH8); + _mm256_store_si256(blockA_256++, L_EH16); + __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31); + _mm256_store_si256(blockA_256++, L_EH24); + } + } +} + +// Arrange a block of the right input matrix in contiguous memory. +// +// Given column major input (A0 beside A1 in memory): +// A0 B0 C0 D0 E0 F0 G0 H0 ... +// A1 B1 C1 D1 E1 F1 G1 H1 ... +// A2 B2 C2 D2 E2 F2 G2 H2 ... +// A3 B3 C3 D3 E3 F3 G3 H3 ... 
+// A4 B4 C4 D4 E4 F4 G4 H4 ... +// A5 B5 C5 D5 E5 F5 G5 H5 ... +// A6 B6 C6 D6 E6 F6 G6 H6 ... +// A7 B7 C7 D7 E7 F7 G7 H7 ... +// A8 ... +// ... +// +// Packing yields row major output (A0 beside A1 in memory): +// A0 A1 A2 A3 A4 A5 A6 A7 +// B0 B1 B2 B3 B4 B5 B6 B7 +// ... +// +// At least four elements of the same col are arranged contiguously because +// maddubs and madd both perform an adjacent addition in the kernel. We can +// save work by leaving 8 adjacent elements because kr = 8. +template +struct gemm_pack_rhs { + EIGEN_DONT_INLINE void operator()(QUInt8* blockB, const DataMapper& rhs, + Index depth, Index cols, Index stride = 0, + Index offset = 0); +}; + +template +EIGEN_DONT_INLINE void gemm_pack_rhs:: +operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, + Index stride, Index offset) { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + // Use alternate function for weird sizes + if (cols % 32 != 0 || depth % 32 != 0) { + gemm_pack_rhs_any rhs_pack; + return rhs_pack(blockB, rhs, depth, cols, stride, offset); + } + + // Get vector pointer + __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB); + + // Perform a step of the packing for 4 columns + __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24; +#define PACK_STEP \ + R_AB_L = _mm256_unpacklo_epi64(R_A, R_B); \ + R_CD_L = _mm256_unpacklo_epi64(R_C, R_D); \ + R_AB_H = _mm256_unpackhi_epi64(R_A, R_B); \ + R_CD_H = _mm256_unpackhi_epi64(R_C, R_D); \ + R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20); \ + R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \ + R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20); \ + R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \ + _mm256_store_si256(blockB_256, R_AD_0); \ + _mm256_store_si256(blockB_256 + 8, R_AD_8); \ + _mm256_store_si256(blockB_256 + 16, R_AD_16); \ + _mm256_store_si256(blockB_256 + 24, R_AD_24); \ + blockB_256++; + + // Pack cols in sets of 32 + for (Index n = 0; n < cols; n += 32) { + // Pack depth in sets of 32 + for (Index k = 0; k < depth; k += 32) { + __m256i R_A = rhs.loadPacket(k, n); + __m256i R_B = rhs.loadPacket(k, n + 1); + __m256i R_C = rhs.loadPacket(k, n + 2); + __m256i R_D = rhs.loadPacket(k, n + 3); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 4); + R_B = rhs.loadPacket(k, n + 5); + R_C = rhs.loadPacket(k, n + 6); + R_D = rhs.loadPacket(k, n + 7); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 8); + R_B = rhs.loadPacket(k, n + 9); + R_C = rhs.loadPacket(k, n + 10); + R_D = rhs.loadPacket(k, n + 11); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 12); + R_B = rhs.loadPacket(k, n + 13); + R_C = rhs.loadPacket(k, n + 14); + R_D = rhs.loadPacket(k, n + 15); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 16); + R_B = rhs.loadPacket(k, n + 17); + R_C = rhs.loadPacket(k, n + 18); + R_D = rhs.loadPacket(k, n + 19); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 20); + R_B = rhs.loadPacket(k, n + 21); + R_C = rhs.loadPacket(k, n + 22); + R_D = rhs.loadPacket(k, n + 23); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 24); + R_B = rhs.loadPacket(k, n + 25); + R_C = rhs.loadPacket(k, n + 26); + R_D = rhs.loadPacket(k, n + 27); + PACK_STEP; + + R_A = rhs.loadPacket(k, n + 28); + R_B = rhs.loadPacket(k, n + 29); + R_C = rhs.loadPacket(k, n + 30); + R_D = rhs.loadPacket(k, n + 31); + PACK_STEP; + + blockB_256 += 24; + } + } +#undef PACK_STEP +} + +// Perform the actual multiplication on packed inputs +template +struct gebp_kernel +{ + typedef typename DataMapper::LinearMapper LinearMapper; + 
+ EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +EIGEN_DONT_INLINE +void gebp_kernel +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + // Use alternate function for weird sizes + if (rows % 32 != 0 || cols % 32 != 0 || depth % 32 != 0) { + gebp_kernel_any gebp; + return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + + // Create result block + QInt32* blockO = aligned_new(32 * 32); + // Allocating the result block is about 5-10% faster than declaring stack + // space. It is unclear why this is the case. + // ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0); + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + + // Get vectorized pointers + __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO); + const __m256i* blockA_256 = reinterpret_cast(blockA); + const __m256i* blockB_256 = reinterpret_cast(blockB); + + // Loop over blocks of 32 columns + for (Index n = 0; n < cols; n += 32) { + // Reset index into blockA + Index indexL = 0; + // Loop over blocks of 32 rows + for (Index m = 0; m < rows; m += 32) { + // Reset index into blockB + Index indexR = n / 32 * depth; + // Loop over blocks of 8 on depth + for (Index k = 0; k < depth; k += 8) { + // Load inputs + __m256i L_AD0 = blockA_256[indexL++]; + __m256i L_AD8 = blockA_256[indexL++]; + __m256i L_AD16 = blockA_256[indexL++]; + __m256i L_AD24 = blockA_256[indexL++]; + __m256i L_EH0 = blockA_256[indexL++]; + __m256i L_EH8 = blockA_256[indexL++]; + __m256i L_EH16 = blockA_256[indexL++]; + __m256i L_EH24 = blockA_256[indexL++]; + __m256i R_AH0 = blockB_256[indexR++]; + __m256i R_AH4 = blockB_256[indexR++]; + __m256i R_AH8 = blockB_256[indexR++]; + __m256i R_AH12 = blockB_256[indexR++]; + __m256i R_AH16 = blockB_256[indexR++]; + __m256i R_AH20 = blockB_256[indexR++]; + __m256i R_AH24 = blockB_256[indexR++]; + __m256i R_AH28 = blockB_256[indexR++]; + + // This constant is used with madd to convert 16 bit to 32 bit + const __m256i ONE = _mm256_set1_epi32(0x00010001); + + // Declare variables used in COMPUTE_STEP + __m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32; + +#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET) \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + 
_mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 1, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 2, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \ + \ + P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24); \ + P_32_A = _mm256_madd_epi16(P_16_A, ONE); \ + P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24); \ + P_32_B = _mm256_madd_epi16(P_16_B, ONE); \ + P_32 = _mm256_add_epi32(P_32_A, P_32_B); \ + _mm256_store_si256( \ + blockO_256 + 4 * OFFSET + 3, \ + _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32)); + + // Permute and shuffle to copy a single value across the entire vector + // Then compute the multiplication + __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00); + __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 0); + __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 1); + R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11); + __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 2); + __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 3); + + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 4); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 5); + R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 6); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 7); + + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 8); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 9); + R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 10); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 11); + + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 12); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 13); + R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 14); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 
15); + + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 16); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 17); + R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 18); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 19); + + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 20); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 21); + R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 22); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 23); + + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 24); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 25); + R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 26); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 27); + + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00); + R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD0, R_EH0, 28); + R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD1, R_EH1, 29); + R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11); + R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00); + R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55); + COMPUTE_STEP(R_AD2, R_EH2, 30); + R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA); + R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF); + COMPUTE_STEP(R_AD3, R_EH3, 31); + +#undef COMPUTE_STEP + } + + // Transfer the results to the result matrix + Index i = 0; + for (Index j = n; j < n + 32; j++) { + LinearMapper r0 = res.getLinearMapper(m, j); + LinearMapper r1 = res.getLinearMapper(m + 8, j); + LinearMapper r2 = res.getLinearMapper(m + 16, j); + LinearMapper r3 = res.getLinearMapper(m + 24, j); + r0.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0))); + r1.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0))); + r2.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0))); + r3.storePacket( + 0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0))); + } + + // Zero the result block so it can be reused + memset(blockO, 0, 32 * 32 * sizeof(QInt32)); + } + } + aligned_delete(blockO, 32 * 32); +} + +#endif // EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h 
b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h new file mode 100644 index 00000000..99894caf --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h @@ -0,0 +1,95 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// Copyright (C) 2015 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H +#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H + + +namespace Eigen { +namespace internal { + + +// AVX2 optimized implementation of the case where the lhs is encoded using signed 8bit +// integers and the rhs using unsigned 8bit integers. +#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT + +template +class gebp_traits +{ +public: + typedef QInt8 LhsScalar; + typedef QUInt8 RhsScalar; + typedef QInt32 ResScalar; + + enum { + // register block size along the M and N directions + // One for the current implementation + nr = 1, + mr = 1, + // Progress made at each iteration of the product loop + // also 1 for the current implementation + LhsProgress = 1, + RhsProgress = 1 + }; +}; + +// Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs +template +struct gebp_kernel +{ + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +EIGEN_DONT_INLINE +void gebp_kernel +::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(alpha.value == 1); + eigen_assert(strideA == -1); + eigen_assert(strideB == -1); + eigen_assert(offsetA == 0); + eigen_assert(offsetB == 0); + + eigen_assert(rows > 0); + eigen_assert(cols > 0); + eigen_assert(depth > 0); + eigen_assert(blockA); + eigen_assert(blockB); + + for (Index j = 0; j < cols; ++j) { + Index startB = j * depth; + + for (Index i = 0; i < rows; ++i) { + Index startA = i * depth; + + for (Index k = 0; k < depth; ++k) { + res(i, j) += blockA[startA + k] * blockB[startB + k]; + } + } + } +} +#endif + + +} // namespace internal +} // namespace Eigen + + + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h new file mode 100644 index 00000000..18b5085b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h @@ -0,0 +1,123 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H +#define EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H + + +namespace Eigen { +namespace internal { + +// Mat-Vec product +// Both lhs and rhs are encoded as 8bit signed integers +template +struct general_matrix_vector_product +{ +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha); +}; + +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha) +{ + eigen_assert(alpha.value == 1); + eigen_assert(resIncr == 1); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + + for (Index i = 0; i < rows; ++i) { + for (Index j = 0; j < cols; ++j) { + res[i] += lhs(i, j) * rhs(j, 0); + } + } +} + + +// Mat-Vec product +// The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned integers +template +struct general_matrix_vector_product +{ +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QUInt8 alpha); +}; + +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QUInt8 alpha) +{ + eigen_assert(alpha.value == 1); + eigen_assert(resIncr == 1); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + + for (Index i = 0; i < rows; ++i) { + for (Index j = 0; j < cols; ++j) { + res[i] += lhs(i, j) * rhs(j, 0); + } + } +} + + +// Mat-Vec product +// The lhs is encoded using bit unsigned integers, the rhs using 8bit signed integers +template +struct general_matrix_vector_product +{ +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha); +}; + +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + QInt32* res, Index resIncr, + QInt8 alpha) +{ + eigen_assert(alpha.value == 1); + eigen_assert(resIncr == 1); + eigen_assert(rows > 0); + eigen_assert(cols > 0); + + for (Index i = 0; i < rows; ++i) { + for (Index j = 0; j < cols; ++j) { + res[i] += lhs(i, j) * rhs(j, 0); + } + } +} + +} // namespace internal +} // namespace Eigen + + + +#endif // EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h new file mode 100644 index 00000000..078be83e --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h @@ -0,0 +1,476 @@ +#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ +#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ + +namespace Eigen { +namespace internal { + +typedef struct Packet32q8i { + __m256i val; + operator __m256i() const { return val; } + Packet32q8i(); + Packet32q8i(__m256i val) : val(val) {} +} Packet32q8i; + +typedef struct Packet16q16i { + __m256i val; + operator __m256i() const { return val; } + Packet16q16i(); + Packet16q16i(__m256i val) : val(val) {} +} Packet16q16i; + +typedef struct Packet32q8u { + __m256i val; + operator __m256i() const { return val; } + Packet32q8u(); + Packet32q8u(__m256i val) : val(val) {} +} Packet32q8u; + +typedef struct Packet16q8i { 
+ __m128i val; + operator __m128i() const { return val; } + Packet16q8i(); + Packet16q8i(__m128i val) : val(val) {} +} Packet16q8i; + +typedef struct Packet16q8u { + __m128i val; + operator __m128i() const { return val; } + Packet16q8u(); + Packet16q8u(__m128i val) : val(val) {} +} Packet16q8u; + +typedef struct Packet8q16i { + __m128i val; + operator __m128i() const { return val; } + Packet8q16i(); + Packet8q16i(__m128i val) : val(val) {} +} Packet8q16i; + +typedef struct Packet8q32i { + __m256i val; + operator __m256i() const { return val; } + Packet8q32i(); + Packet8q32i(__m256i val) : val(val) {} +} Packet8q32i; + +typedef struct Packet4q32i { + __m128i val; + operator __m128i() const { return val; } + Packet4q32i(); + Packet4q32i(__m128i val) : val(val) {} +} Packet4q32i; + +#ifndef EIGEN_VECTORIZE_AVX512 +template <> +struct packet_traits : default_packet_traits { + typedef Packet32q8i type; + typedef Packet16q8i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits : default_packet_traits { + typedef Packet32q8u type; + typedef Packet16q8u half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits : default_packet_traits { + typedef Packet16q16i type; + typedef Packet8q16i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits : default_packet_traits { + typedef Packet8q32i type; + typedef Packet4q32i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + }; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +#endif + +template <> +struct unpacket_traits { + typedef QInt8 type; + typedef Packet16q8i half; + enum { size = 32, alignment=Aligned32 }; +}; +template <> +struct unpacket_traits { + typedef QInt16 type; + typedef Packet8q16i half; + enum { size = 16, alignment=Aligned32 }; +}; +template <> +struct unpacket_traits { + typedef QUInt8 type; + typedef Packet16q8u half; + enum { size = 32, alignment=Aligned32 }; +}; +template <> +struct unpacket_traits { + typedef QInt32 type; + typedef Packet4q32i half; + enum { size = 8, alignment=Aligned32 }; +}; + +// Unaligned load +template <> +EIGEN_STRONG_INLINE Packet32q8i ploadu(const QInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u ploadu(const QUInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16q16i ploadu(const QInt16* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i ploadu(const QInt32* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast(from)); +} + +// Aligned load +template <> +EIGEN_STRONG_INLINE Packet32q8i pload(const QInt8* from) { 
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u pload(const QUInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16q16i pload(const QInt16* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pload(const QInt32* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast(from)); +} + +// Unaligned store +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet32q8i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QUInt8* to, const Packet32q8u& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt16* to, const Packet16q16i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt32* to, const Packet8q32i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} + +// Aligned store +template <> +EIGEN_STRONG_INLINE void pstore(QInt32* to, const Packet8q32i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QInt16* to, const Packet16q16i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QUInt8* to, const Packet32q8u& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet32q8i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} + +// Extract first element. +template <> +EIGEN_STRONG_INLINE QInt32 pfirst(const Packet8q32i& a) { + return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); +} +template <> +EIGEN_STRONG_INLINE QInt16 pfirst(const Packet16q16i& a) { + return _mm256_extract_epi16(a.val, 0); +} +template <> +EIGEN_STRONG_INLINE QUInt8 pfirst(const Packet32q8u& a) { + return static_cast(_mm256_extract_epi8(a.val, 0)); +} +template <> +EIGEN_STRONG_INLINE QInt8 pfirst(const Packet32q8i& a) { + return _mm256_extract_epi8(a.val, 0); +} + +// Initialize to constant value. +template <> +EIGEN_STRONG_INLINE Packet32q8i pset1(const QInt8& from) { + return _mm256_set1_epi8(from.value); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u pset1(const QUInt8& from) { + return _mm256_set1_epi8(static_cast(from.value)); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pset1(const QInt32& from) { + return _mm256_set1_epi32(from.value); +} + +// Basic arithmetic packet ops for QInt32. +template <> +EIGEN_STRONG_INLINE Packet8q32i padd(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_add_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet16q16i pset1(const QInt16& from) { + return _mm256_set1_epi16(from.value); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i psub(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_sub_epi32(a.val, b.val); +} +// Note: mullo truncates the result to 32 bits. 
+template <> +EIGEN_STRONG_INLINE Packet8q32i pmul(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_mullo_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pnegate(const Packet8q32i& a) { + return _mm256_sub_epi32(_mm256_setzero_si256(), a.val); +} + +// Min and max. +template <> +EIGEN_STRONG_INLINE Packet8q32i pmin(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_min_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet8q32i pmax(const Packet8q32i& a, + const Packet8q32i& b) { + return _mm256_max_epi32(a.val, b.val); +} + +template <> +EIGEN_STRONG_INLINE Packet16q16i pmin(const Packet16q16i& a, + const Packet16q16i& b) { + return _mm256_min_epi16(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet16q16i pmax(const Packet16q16i& a, + const Packet16q16i& b) { + return _mm256_max_epi16(a.val, b.val); +} + +template <> +EIGEN_STRONG_INLINE Packet32q8u pmin(const Packet32q8u& a, + const Packet32q8u& b) { + return _mm256_min_epu8(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8u pmax(const Packet32q8u& a, + const Packet32q8u& b) { + return _mm256_max_epu8(a.val, b.val); +} + +template <> +EIGEN_STRONG_INLINE Packet32q8i pmin(const Packet32q8i& a, + const Packet32q8i& b) { + return _mm256_min_epi8(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet32q8i pmax(const Packet32q8i& a, + const Packet32q8i& b) { + return _mm256_max_epi8(a.val, b.val); +} + +// Reductions. +template <> +EIGEN_STRONG_INLINE QInt32 predux_min(const Packet8q32i& a) { + __m256i tmp = _mm256_min_epi32(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return pfirst( + _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, 1))); +} +template <> +EIGEN_STRONG_INLINE QInt32 predux_max(const Packet8q32i& a) { + __m256i tmp = _mm256_max_epi32(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return pfirst( + _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1))); +} + +template <> +EIGEN_STRONG_INLINE QInt16 predux_min(const Packet16q16i& a) { + __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1)); + return std::min(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1)); +} +template <> +EIGEN_STRONG_INLINE QInt16 predux_max(const Packet16q16i& a) { + __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1)); + return std::max(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1)); +} + +template <> +EIGEN_STRONG_INLINE QUInt8 predux_min(const Packet32q8u& a) { + __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_min_epu8(tmp, + _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::min(static_cast(_mm256_extract_epi8(tmp, 0)), + static_cast(_mm256_extract_epi8(tmp, 1))); +} +template <> +EIGEN_STRONG_INLINE QUInt8 predux_max(const Packet32q8u& a) { + __m256i tmp = _mm256_max_epu8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 
_MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_max_epu8(tmp, + _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::max(static_cast(_mm256_extract_epi8(tmp, 0)), + static_cast(_mm256_extract_epi8(tmp, 1))); +} + +template <> +EIGEN_STRONG_INLINE QInt8 predux_min(const Packet32q8i& a) { + __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_min_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::min(_mm256_extract_epi8(tmp, 0), _mm256_extract_epi8(tmp, 1)); +} +template <> +EIGEN_STRONG_INLINE QInt8 predux_max(const Packet32q8i& a) { + __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1)); + tmp = _mm256_max_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + return std::max(_mm256_extract_epi8(tmp, 0), _mm256_extract_epi8(tmp, 1)); +} + +// Vectorized scaling of Packet32q8i by float. +template<> +struct scalar_product_op : binary_op_base { + typedef typename ScalarBinaryOpTraits::ReturnType result_type; +#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN + EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op) +#else + scalar_product_op() { + EIGEN_SCALAR_BINARY_OP_PLUGIN + } +#endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const QInt32& a, const double& b) const { return a * b; } + + EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a, const double& b) const { + __m256d scale = _mm256_set1_pd(b); + __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a)); + __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo)); + __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1)); + __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); + } +}; + +template <> +struct functor_traits> { + enum { Cost = 4 * NumTraits::MulCost, PacketAccess = true }; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h new file mode 100644 index 00000000..7a222fdd --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h @@ -0,0 +1,545 @@ +#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_ +#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_ + +#include "PacketMathAVX2.h" + +namespace Eigen { +namespace internal { + +typedef struct Packet64q8i { + __m512i val; + operator __m512i() const { return val; } + Packet64q8i(); + Packet64q8i(__m512i val) : val(val) {} +} Packet64q8i; + +typedef struct Packet32q16i { + __m512i val; + operator __m512i() const { return val; } + Packet32q16i(); + Packet32q16i(__m512i val) : val(val) {} +} Packet32q16i; + +typedef struct Packet64q8u { + __m512i val; + operator __m512i() const { return val; } + Packet64q8u(); + Packet64q8u(__m512i val) : val(val) {} +} Packet64q8u; + +typedef struct Packet16q32i { + __m512i val; + operator __m512i() const { return 
val; } + Packet16q32i(); + Packet16q32i(__m512i val) : val(val) {} +} Packet16q32i; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet64q8i type; + typedef Packet32q8i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 64, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits : default_packet_traits { + typedef Packet64q8u type; + typedef Packet32q8u half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 64, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits : default_packet_traits { + typedef Packet32q16i type; + typedef Packet16q16i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> +struct packet_traits : default_packet_traits { + typedef Packet16q32i type; + typedef Packet8q32i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + }; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef QInt8 type; + typedef Packet32q8i half; + enum { size = 64, alignment=Aligned64 }; +}; +template <> +struct unpacket_traits { + typedef QInt16 type; + typedef Packet16q16i half; + enum { size = 32, alignment=Aligned64 }; +}; +template <> +struct unpacket_traits { + typedef QUInt8 type; + typedef Packet32q8u half; + enum { size = 64, alignment=Aligned64 }; +}; +template <> +struct unpacket_traits { + typedef QInt32 type; + typedef Packet8q32i half; + enum { size = 16, alignment=Aligned64 }; +}; + +// Unaligned load +template <> +EIGEN_STRONG_INLINE Packet64q8i ploadu(const QInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet32q16i ploadu(const QInt16* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet64q8u ploadu(const QUInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16q32i ploadu(const QInt32* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( + reinterpret_cast(from)); +} + +// Aligned load +template <> +EIGEN_STRONG_INLINE Packet64q8i pload(const QInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet32q16i pload(const QInt16* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet64q8u pload(const QUInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16q32i pload(const QInt32* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( + reinterpret_cast(from)); +} + +// Unaligned store +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet64q8i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( + 
reinterpret_cast<__m512i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt16* to, const Packet32q16i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( + reinterpret_cast<__m512i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QUInt8* to, const Packet64q8u& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( + reinterpret_cast<__m512i*>(to), from.val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt32* to, const Packet16q32i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( + reinterpret_cast<__m512i*>(to), from.val); +} + +// Aligned store +template <> +EIGEN_STRONG_INLINE void pstore(QInt32* to, const Packet16q32i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QUInt8* to, const Packet64q8u& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet64q8i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), + from.val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QInt16* to, const Packet32q16i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), + from.val); +} + +// Extract first element. +template <> +EIGEN_STRONG_INLINE QInt32 pfirst(const Packet16q32i& a) { + return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a, 0)); +} +template <> +EIGEN_STRONG_INLINE QUInt8 pfirst(const Packet64q8u& a) { + return static_cast( + _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0)); +} +template <> +EIGEN_STRONG_INLINE QInt8 pfirst(const Packet64q8i& a) { + return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0); +} +template <> +EIGEN_STRONG_INLINE QInt16 pfirst(const Packet32q16i& a) { + return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.val, 0), 0); +} + +// Initialize to constant value. +template <> +EIGEN_STRONG_INLINE Packet64q8i pset1(const QInt8& from) { + return _mm512_set1_epi8(from.value); +} +template <> +EIGEN_STRONG_INLINE Packet32q16i pset1(const QInt16& from) { + return _mm512_set1_epi16(from.value); +} +template <> +EIGEN_STRONG_INLINE Packet64q8u pset1(const QUInt8& from) { + return _mm512_set1_epi8(static_cast(from.value)); +} +template <> +EIGEN_STRONG_INLINE Packet16q32i pset1(const QInt32& from) { + return _mm512_set1_epi32(from.value); +} + +// Basic arithmetic packet ops for QInt32. +template <> +EIGEN_STRONG_INLINE Packet16q32i padd(const Packet16q32i& a, + const Packet16q32i& b) { + return _mm512_add_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet16q32i psub(const Packet16q32i& a, + const Packet16q32i& b) { + return _mm512_sub_epi32(a.val, b.val); +} +// Note: mullo truncates the result to 32 bits. +template <> +EIGEN_STRONG_INLINE Packet16q32i pmul(const Packet16q32i& a, + const Packet16q32i& b) { + return _mm512_mullo_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet16q32i pnegate(const Packet16q32i& a) { + return _mm512_sub_epi32(_mm512_setzero_si512(), a.val); +} + +// Min and max. 
+template <> +EIGEN_STRONG_INLINE Packet16q32i pmin(const Packet16q32i& a, + const Packet16q32i& b) { + return _mm512_min_epi32(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet16q32i pmax(const Packet16q32i& a, + const Packet16q32i& b) { + return _mm512_max_epi32(a.val, b.val); +} + +template <> +EIGEN_STRONG_INLINE Packet64q8u pmin(const Packet64q8u& a, + const Packet64q8u& b) { +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_min_epu8(a.val, b.val); +#else + __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i r0 = _mm256_min_epu8(ap0, bp0); + __m256i r1 = _mm256_min_epu8(ap1, bp1); + return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); +#endif +} +template <> +EIGEN_STRONG_INLINE Packet64q8u pmax(const Packet64q8u& a, + const Packet64q8u& b) { +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_max_epu8(a.val, b.val); +#else + __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i r0 = _mm256_max_epu8(ap0, bp0); + __m256i r1 = _mm256_max_epu8(ap1, bp1); + return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet64q8i pmin(const Packet64q8i& a, + const Packet64q8i& b) { +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_min_epi8(a.val, b.val); +#else + __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i r0 = _mm256_min_epi8(ap0, bp0); + __m256i r1 = _mm256_min_epi8(ap1, bp1); + return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); +#endif +} +template <> +EIGEN_STRONG_INLINE Packet32q16i pmin(const Packet32q16i& a, + const Packet32q16i& b) { +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_min_epi16(a.val, b.val); +#else + __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i r0 = _mm256_min_epi16(ap0, bp0); + __m256i r1 = _mm256_min_epi16(ap1, bp1); + return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); +#endif +} +template <> +EIGEN_STRONG_INLINE Packet64q8i pmax(const Packet64q8i& a, + const Packet64q8i& b) { +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_max_epi8(a.val, b.val); +#else + __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i r0 = _mm256_max_epi8(ap0, bp0); + __m256i r1 = _mm256_max_epi8(ap1, bp1); + return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); +#endif +} +template <> +EIGEN_STRONG_INLINE Packet32q16i pmax(const Packet32q16i& a, + const Packet32q16i& b) { +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_max_epi16(a.val, b.val); +#else + __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i r0 = _mm256_max_epi16(ap0, bp0); + __m256i r1 = _mm256_max_epi16(ap1, bp1); + return 
_mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); +#endif +} + +// Reductions. +template <> +EIGEN_STRONG_INLINE QInt32 predux_min(const Packet16q32i& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3)); + res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst( + _mm_min_epi32( + res, + _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); +} +template <> +EIGEN_STRONG_INLINE QInt32 predux_max(const Packet16q32i& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3)); + res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst( + _mm_max_epi32( + res, + _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); +} +template <> +EIGEN_STRONG_INLINE QInt16 predux_min(const Packet32q16i& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3)); + res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + std::uint32_t w = + pfirst( + _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::min({ + static_cast(w >> 16), + static_cast(w) + }); +} +template <> +EIGEN_STRONG_INLINE QInt16 predux_max(const Packet32q16i& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3)); + res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + std::uint32_t w = + pfirst( + _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::max({ + static_cast(w >> 16), + static_cast(w) + }); +} +template <> +EIGEN_STRONG_INLINE QUInt8 predux_min(const Packet64q8u& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3)); + res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + std::uint32_t w = + pfirst( + _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::min({ + static_cast(w >> 24), + static_cast(w >> 16), + static_cast(w >> 8), + static_cast(w) + }); +} +template <> +EIGEN_STRONG_INLINE QUInt8 predux_max(const Packet64q8u& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_max_epu8(_mm_max_epu8(lane0, lane1), 
_mm_max_epu8(lane2, lane3)); + res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + std::uint32_t w = + pfirst( + _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::max({ + static_cast(w >> 24), + static_cast(w >> 16), + static_cast(w >> 8), + static_cast(w) + }); +} +template <> +EIGEN_STRONG_INLINE QInt8 predux_min(const Packet64q8i& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3)); + res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + std::uint32_t w = + pfirst( + _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::min({ + static_cast(w >> 24), + static_cast(w >> 16), + static_cast(w >> 8), + static_cast(w) + }); +} +template <> +EIGEN_STRONG_INLINE QInt8 predux_max(const Packet64q8i& a) { + Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i res = + _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3)); + res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); + std::uint32_t w = + pfirst( + _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::min({ + static_cast(w >> 24), + static_cast(w >> 16), + static_cast(w >> 8), + static_cast(w) + }); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h new file mode 100644 index 00000000..045384d7 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h @@ -0,0 +1,66 @@ +#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ +#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ + +namespace Eigen { +namespace internal { + +typedef __m256 Packet8f; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet8f pcast(const Packet8q32i& a) { + return _mm256_cvtepi32_ps(a.val); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet8q32i pcast(const Packet8f& a) { + return _mm256_cvtps_epi32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q8i +pcast(const Packet8q32i& a, const Packet8q32i& b, + const Packet8q32i& c, const Packet8q32i& d) { + __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.val, b.val), + _mm256_packs_epi32(c.val, d.val)); + // Since packs does not cross 128 bit lane boundaries, + // we have to permute to properly order the final result. 
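+  // After the two per-lane pack steps, the 32-bit chunks of `converted` hold
+  // a0..3, b0..3, c0..3, d0..3, a4..7, b4..7, c4..7, d4..7; gathering the
+  // chunks in the order 0,4,1,5,2,6,3,7 restores a0..7, b0..7, c0..7, d0..7.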
+ const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + return _mm256_permutevar8x32_epi32(converted, permute_mask); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q8u +pcast(const Packet8q32i& a, const Packet8q32i& b, + const Packet8q32i& c, const Packet8q32i& d) { + const __m256i converted = _mm256_packus_epi16( + _mm256_packs_epi32(a.val, b.val), _mm256_packs_epi32(c.val, d.val)); + // Since packus does not cross 128 bit lane boundaries, + // we have to permute to properly order the final result. + const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + return _mm256_permutevar8x32_epi32(converted, permute_mask); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h new file mode 100644 index 00000000..cd7120ec --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h @@ -0,0 +1,180 @@ +#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_ +#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_ + +namespace Eigen { +namespace internal { + +typedef __m512 Packet16f; +typedef __m512i Packet16i; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet16f pcast(const Packet16q32i& a) { + return _mm512_cvtepi32_ps(a.val); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet16q32i pcast(const Packet16f& a) { + return _mm512_cvtps_epi32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q16i +pcast(const Packet16f& a, const Packet16f& b) { + Packet16i a_int = _mm512_cvtps_epi32(a); + Packet16i b_int = _mm512_cvtps_epi32(b); +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_packs_epi32(a_int, b_int); +#else + Packet8i ab_int16_low = + _mm256_permute4x64_epi64( + _mm256_packs_epi32( + _mm512_castsi512_si256(a_int), + _mm512_castsi512_si256(b_int)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i ab_int16_high = + _mm256_permute4x64_epi64( + _mm256_packs_epi32( + _mm512_extracti32x8_epi32(a_int, 1), + _mm512_extracti32x8_epi32(b_int, 1)), + _MM_SHUFFLE(0, 2, 1, 3)); + return _mm512_inserti32x8( + _mm512_castsi256_si512(ab_int16_low), + ab_int16_high, 1); +#endif +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet64q8i +pcast(const Packet16f& a, + const Packet16f& b, + const Packet16f& c, + const Packet16f& d) { + Packet16i a_int = _mm512_cvtps_epi32(a); + Packet16i b_int = _mm512_cvtps_epi32(b); + Packet16i c_int = _mm512_cvtps_epi32(c); + Packet16i d_int = _mm512_cvtps_epi32(d); +#ifdef EIGEN_VECTORIZE_AVX512BW + return _mm512_packs_epi16( + _mm512_packs_epi32(a_int, b_int), + _mm512_packs_epi32(c_int, d_int)); +#else + Packet8i ab_int16_low = + _mm256_permute4x64_epi64( + _mm256_packs_epi32( + _mm512_castsi512_si256(a_int), + 
_mm512_castsi512_si256(b_int)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i cd_int16_low = + _mm256_permute4x64_epi64( + _mm256_packs_epi32( + _mm512_castsi512_si256(c_int), + _mm512_castsi512_si256(d_int)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i ab_int16_high = + _mm256_permute4x64_epi64( + _mm256_packs_epi32( + _mm512_extracti32x8_epi32(a_int, 1), + _mm512_extracti32x8_epi32(b_int, 1)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i cd_int16_high = + _mm256_permute4x64_epi64( + _mm256_packs_epi32( + _mm512_extracti32x8_epi32(c_int, 1), + _mm512_extracti32x8_epi32(d_int, 1)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i abcd_int8_low = + _mm256_permute4x64_epi64( + _mm256_packs_epi16(ab_int16_low, cd_int16_low), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i abcd_int8_high = + _mm256_permute4x64_epi64( + _mm256_packs_epi16(ab_int16_high, cd_int16_high), + _MM_SHUFFLE(0, 2, 1, 3)); + return _mm512_inserti32x8( + _mm512_castsi256_si512(abcd_int8_low), + abcd_int8_high, 1); +#endif +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet64q8i +pcast(const Packet16q32i& a, + const Packet16q32i& b, + const Packet16q32i& c, + const Packet16q32i& d) { + __m512i converted = _mm512_packs_epi16(_mm512_packs_epi32(a.val, b.val), + _mm512_packs_epi32(c.val, d.val)); + return converted; +} + +template <> +EIGEN_STRONG_INLINE Packet32q16i +pcast(const Packet16q32i& a, + const Packet16q32i& b) { + __m512i converted = _mm512_packs_epi32(a.val, b.val); + return converted; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet64q8u +pcast(const Packet16q32i& a, const Packet16q32i& b, + const Packet16q32i& c, const Packet16q32i& d) { + const __m512i converted = _mm512_packus_epi16( + _mm512_packus_epi32(a.val, b.val), _mm512_packus_epi32(c.val, d.val)); + return converted; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; + +#if 0 +template <> +EIGEN_STRONG_INLINE Packet32q16u +pcast(const Packet16q32i& a, + const Packet16q32i& b) { + const __m512i converted = _mm512_packus_epi32(a.val, b.val); + return converted; +} +#endif + +} // end namespace internal +} // end namespace Eigen + +#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h new file mode 100644 index 00000000..cbcce9e2 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h @@ -0,0 +1,116 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+#ifndef EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H + +namespace Eigen { + +/** scalar_sigmoid_fast_derivative_op + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a sigmoid + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative() + */ +template +struct scalar_sigmoid_fast_derivative_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_fast_derivative_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const { + const T one = T(1); + return (one - y) * y; + } + + template + inline Packet packetOp(const Packet& y) const { + const Packet one = internal::pset1(1); + return internal::pmul(internal::psub(one, y), y); + } +}; + +namespace internal { +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost * 2 + NumTraits::MulCost, + PacketAccess = packet_traits::HasAdd && packet_traits::HasMul && + packet_traits::HasNegate + }; +}; +} // namespace internal + +/** scalar_tanh_fast_derivative_op + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a tanh + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative() + */ +template +struct scalar_tanh_fast_derivative_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_fast_derivative_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const { + const T one = T(1); + return one - (y * y); + } + + template + inline Packet packetOp(const Packet& y) const { + const Packet one = internal::pset1(1); + return internal::psub(one, internal::pmul(y, y)); + } +}; + +namespace internal { +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost * 2 + NumTraits::MulCost * 1, + PacketAccess = packet_traits::HasAdd && packet_traits::HasMul && + packet_traits::HasNegate + }; +}; +} // namespace internal + +/** + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to clip the magnitude of the first scalar. + * + * \sa class CwiseBinaryOp, MatrixBase::Clip + */ +template +struct scalar_clip_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar + operator()(const Scalar& a, const Scalar& b) const { + return numext::mini(numext::maxi(a, -b), b); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& a, const Packet& b) const { + return internal::pmin(internal::pmax(a, internal::pnegate(b)), b); + } +}; + +namespace internal { +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost * 3, + PacketAccess = packet_traits::HasMax && + packet_traits::HasMin && + packet_traits::HasNegate + }; +}; +} // namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h new file mode 100644 index 00000000..d4bc7a35 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h @@ -0,0 +1,209 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H +#define EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H + +namespace Eigen { + +/** ExtractGlimpses + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Extract glimpses from an input tensor. + * + * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch). + * The width and height parameters specify the extension of the returned glimpses. + * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension. + * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension. + * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center. + * + * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch). + * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size. + */ +namespace { +template +struct GlimpseExtractionOp { + GlimpseExtractionOp(const Index width, const Index height, + const std::vector >& offsets, + const bool normalized, + const bool centered, + const bool uniform_noise) : + width_(width), height_(height), offsets_(offsets), + normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { } + + template + DSizes dimensions(const Input& input) const { + typedef typename internal::traits::Index IndexType; + typedef TensorRef::Scalar, 4, + internal::traits::Layout, IndexType> > Ref; + Ref in(input); + + DSizes dims = in.dimensions(); + + dims[0] = in.dimension(0); + dims[1] = width_; + dims[2] = height_; + dims[3] = in.dimension(3); + return dims; + } + + template + EIGEN_DEVICE_FUNC + void eval(const Input& input, Output& output, const Device& device) const + { + typedef typename internal::traits::Index IndexType; + typedef TensorRef::Scalar, 4, + internal::traits::Layout, IndexType> > Ref; + Ref in(input); + + const Index num_channels = in.dimension(0); + const Index input_width = in.dimension(1); + const Index input_height = in.dimension(2); + const Index batch_size = in.dimension(3); + eigen_assert(input_width > 0); + eigen_assert(input_height > 0); + + for (Index i = 0; i < batch_size; ++i) { + float x = offsets_[i].first, y = offsets_[i].second; + + // Un-normalize coordinates back to pixel space if normalized. + if (normalized_) { + x *= input_width; + y *= input_height; + } + // Un-center if coordinates are centered on the image center. + if (centered_) { + x /= 2.0f; + y /= 2.0f; + x += input_width / 2.0f; + y += input_height / 2.0f; + } + // Remove half of the glimpse window. 
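+        // After the two adjustments below, (x, y) holds the top-left corner of
+        // the requested glimpse in input pixel coordinates; it may be negative
+        // or extend past the image, which is handled by the partial-overlap
+        // logic further down.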
+ x -= width_ / 2.0f; + y -= height_ / 2.0f; + + const Index offset_x = (Index) x; + const Index offset_y = (Index) y; + Index glimpse_width = width_; + Index glimpse_height = height_; + bool partial_overlap = false; + DSizes slice_offset(0, offset_x, offset_y); + DSizes slice_extent(num_channels, width_, height_); + DSizes base_offset(0, 0, 0); + + if (offset_x < 0) { + slice_offset[1] = 0; + glimpse_width = (std::max)(0, width_ + offset_x); + slice_extent[1] = glimpse_width; + base_offset[1] = width_ - glimpse_width; + partial_overlap = true; + } else if (offset_x + width_ >= input_width) { + glimpse_width = (std::max)(0, input_width - offset_x); + slice_extent[1] = glimpse_width; + partial_overlap = true; + } + if (offset_y < 0) { + slice_offset[2] = 0; + glimpse_height = (std::max)(0, height_ + offset_y); + slice_extent[2] = glimpse_height; + base_offset[2] = height_ - glimpse_height; + partial_overlap = true; + } else if (offset_y + height_ >= input_height) { + glimpse_height = (std::max)(0, input_height - offset_y); + slice_extent[2] = glimpse_height; + partial_overlap = true; + } + slice_extent[1] = std::min(input_width, slice_extent[1]); + slice_extent[2] = std::min(input_height, slice_extent[2]); + + if (partial_overlap) { + if (uniform_noise_) { + // Initialize the glimpse with uniform noise. + typedef typename internal::remove_const< + typename internal::traits::Scalar>::type Scalar; + TensorFixedSize > mini; + mini.device(device) = input.template chip<3>(i).minimum(); + TensorFixedSize > range; + range.device(device) = + (input.template chip<3>(i).maximum() - mini).template cast(); + + DSizes glimpse_size(num_channels, width_, height_); + TensorMap > tmp(NULL, glimpse_size); + output.template chip<3>(i).device(device) = + mini.reshape(Sizes<1,1,1>()).broadcast(glimpse_size) + + (tmp.random() * range.reshape(Sizes<1,1,1>()).broadcast(glimpse_size)).template cast(); + } else { + // Initialize the glimpse with white noise: compute the mean and sigma + // of each channel, and use them to shape the gaussian. + DSizes glimpse_size(width_, height_); + DSizes input_size(input_width, input_height); + typedef typename internal::remove_const< + typename internal::traits::Scalar>::type Scalar; + + for (int j = 0; j < num_channels; ++j) { + TensorFixedSize > mean; + mean.device(device) = input.template chip<3>(i).template chip<0>(j).template cast().mean(); + TensorFixedSize > sigma; + sigma.device(device) = + (input.template chip<3>(i).template chip<0>(j).template cast() - mean.reshape(Sizes<1,1>()).broadcast(input_size)).square().mean().sqrt(); + TensorFixedSize > mini; + mini.device(device) = input.template chip<3>(i).template chip<0>(j).minimum(); + TensorFixedSize > maxi; + maxi.device(device) = input.template chip<3>(i).template chip<0>(j).maximum(); + + TensorMap > tmp(NULL, glimpse_size); + output.template chip<3>(i).template chip<0>(j).device(device) = + (mean.reshape(Sizes<1,1>()).broadcast(glimpse_size) + + (tmp.random(internal::NormalRandomGenerator()) * sigma.reshape(Sizes<1,1>()).broadcast(glimpse_size)).template cast()).cwiseMin(maxi.reshape(Sizes<1,1>()).broadcast(glimpse_size)).cwiseMax(mini.reshape(Sizes<1,1>()).broadcast(glimpse_size)); + } + } + + // Copy the part of the glimpse that cover the input image if any. 
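+        // When the glimpse only partially overlaps the image, the noise
+        // initialization above fills the whole glimpse; the slice copy below
+        // then overwrites just the overlapping sub-block (placed at
+        // base_offset inside the glimpse). A zero-sized overlap is skipped.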
+ if (glimpse_width == 0 || glimpse_height == 0) { + continue; + } + output.template chip<3>(i).slice(base_offset, slice_extent).device(device) = input.template chip<3>(i).slice(slice_offset, slice_extent); + } else { + output.template chip<3>(i).device(device) = input.template chip<3>(i).slice(slice_offset, slice_extent); + } + } + } + + private: + const Index width_; + const Index height_; + const std::vector > offsets_; + const bool normalized_; + const bool centered_; + const bool uniform_noise_; +}; +} + + +template +EIGEN_ALWAYS_INLINE +static const TensorCustomUnaryOp::Index>, const Input> +ExtractGlimpses(const Input& input, + const typename internal::traits::Index width, + const typename internal::traits::Index height, + const std::vector >& offsets, + const bool normalized = true, const bool centered = true, + const bool uniform_noise = true) +{ + EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits::Index Index; + const GlimpseExtractionOp op(width, height, offsets, normalized, + centered, uniform_noise); + return input.customOp(op); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h new file mode 100644 index 00000000..12ce2344 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h @@ -0,0 +1,523 @@ +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H + +#include "Patch3d.h" + +namespace Eigen { + +/** CuboidConvolutionBackwardInput + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the input of a 3D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others) + * The kernel parameter is expected to be a 5D tensor (filters, channels, kernel_depth, kernel_height, kernel_width) + * output_backward and kernel have to be in the same layout. + * + * The dimensions of the result will be filters, depth, height, width (and others if applicable). + * + * It is possible to swap the order of the depth, width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + * All dimension orders above are given for col-major, and should be reversed for row-major. 
+ */ + +template +EIGEN_ALWAYS_INLINE static const typename internal::conditional< + internal::traits::Layout == ColMajor, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array< IndexPair::Index>, 2>, + const TensorReshapingOp< + const DSizes< typename internal::traits::Index, 3>, + const TensorReverseOp, const Kernel> + >, + const TensorReshapingOp< + const DSizes< typename internal::traits::Index, 3>, + const TensorVolumePatchOp + > + > + >, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array< IndexPair::Index>, 2>, + const TensorReshapingOp< + const DSizes< typename internal::traits::Index, 3>, + const TensorVolumePatchOp + >, + const TensorReshapingOp< + const DSizes::Index, 3>, + const TensorReverseOp, const Kernel> + > + > + > +>::type +CuboidConvolutionBackwardInput( + const Kernel& kernel, const OutputBackward& output_backward, + typename internal::traits::Index inputPlanes, + typename internal::traits::Index inputRows, + typename internal::traits::Index inputCols, + const DenseIndex stridePlanes = 1, const DenseIndex strideRows = 1, + const DenseIndex strideCols = 1) { + typedef typename internal::traits::Index TensorIndex; + const TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > kern(kernel); + const TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[4]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[3]; + const TensorIndex kernelPlanes = isColMajor ? kern.dimensions()[2] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[3] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[4] : kern.dimensions()[0]; + + const TensorIndex outputPlanes = isColMajor ? out.dimensions()[1] : out.dimensions()[NumDims - 2]; + const TensorIndex outputRows = isColMajor ? out.dimensions()[2] : out.dimensions()[NumDims - 3]; + const TensorIndex outputCols = isColMajor ? out.dimensions()[3] : out.dimensions()[NumDims - 4]; + + TensorIndex forward_pad_z, forward_pad_y, forward_pad_x; + const TensorIndex size_z = ceil(inputPlanes / static_cast(stridePlanes)); + const TensorIndex size_y = ceil(inputRows / static_cast(strideRows)); + const TensorIndex size_x = ceil(inputCols / static_cast(strideCols)); + + // Infer padding type. + if (size_z == outputPlanes && size_y == outputRows && size_x == outputCols) { + // SAME padding. + const TensorIndex dz = size_z * stridePlanes + kernelPlanes - 1 - inputPlanes; + const TensorIndex dy = size_y * strideRows + kernelRows - 1 - inputRows; + const TensorIndex dx = size_x * strideCols + kernelCols - 1 - inputCols; + + forward_pad_z = dz - dz / 2; + forward_pad_y = dy - dy / 2; + forward_pad_x = dx - dx / 2; + } else { + // VALID padding. 
+ forward_pad_z = 0; + forward_pad_y = 0; + forward_pad_x = 0; + } + const TensorIndex padding_ztop = kernelPlanes - 1 - forward_pad_z; + const TensorIndex padding_top = kernelRows - 1 - forward_pad_y; + const TensorIndex padding_left = kernelCols - 1 - forward_pad_x; + + const TensorIndex padding_zbottom = inputPlanes + kernelPlanes - 1 - (outputPlanes - 1) * stridePlanes - 1 - padding_ztop; + const TensorIndex padding_bottom = inputRows + kernelRows - 1 - (outputRows - 1) * strideRows - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelCols - 1 - (outputCols - 1) * strideCols - 1 - padding_left; + + eigen_assert(padding_ztop >= 0); + eigen_assert(padding_zbottom >= 0); + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The kernel has dimensions filters X channels X patch_planes X patch_rows X patch_cols. + // We need to reverse the kernel along the spatial dimensions. + array kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + kernel_reverse[4] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = true; + kernel_reverse[3] = false; + kernel_reverse[4] = false; + } + + DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelRows * kernelCols * kernelPlanes; + } else { + kernel_dims[0] = kernelRows * kernelCols * kernelPlanes; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelFilters; + } + + // The output_backward has dimensions out_depth X out_planes X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward, it will have dimensions: + // out_depth X (patch_planes * patch_rows * patch_cols) X (input_planes * input_rows * input_cols * OTHERS) + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[2] = inputRows * inputCols * inputPlanes; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[2] *= out.dimension(i); + } + } else { + pre_contract_dims[2] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[0] = inputRows * inputCols * inputPlanes; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // We will contract along dimensions (0, 2) in kernel and (0, 1) in + // output_backward, if this is col-major, and + // dimensions (0, 2) in kernel and (1, 2) in output_backward, if this row-major. 
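+  // Concretely (col-major): the reshaped kernel is
+  //   (filters, channels, kernel_planes * kernel_rows * kernel_cols)
+  // and the reshaped patches are
+  //   (filters, kernel_planes * kernel_rows * kernel_cols, input positions * OTHERS),
+  // so contracting the filter dimension and the flattened-patch dimension
+  // leaves (channels, input positions * OTHERS), which is reshaped to
+  // (channels, planes, rows, cols, OTHERS) below.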
+ array, 2> contract_dims; + if (isColMajor) { + // col-major: kernel.contract(output.patches) + contract_dims[0] = IndexPair(0, 0); + contract_dims[1] = IndexPair(2, 1); + } else { + // row-major: output.patches.contract(kernel) + contract_dims[0] = IndexPair(1, 0); + contract_dims[1] = IndexPair(2, 2); + } + + // Post contraction, the dimensions of the input_backprop is + // channels X input_planes X input_rows X input_cols X OTHERS + DSizes post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelChannels; + post_contract_dims[1] = inputPlanes; + post_contract_dims[2] = inputRows; + post_contract_dims[3] = inputCols; + for (int i = 4; i < NumDims; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelChannels; + post_contract_dims[NumDims - 2] = inputPlanes; + post_contract_dims[NumDims - 3] = inputRows; + post_contract_dims[NumDims - 4] = inputCols; + for (int i = 0; i < NumDims - 4; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } + + DSizes strides; + for (int i = 0; i < NumDims; i++) { + strides[i] = 1; + } + if (isColMajor) { + strides[1] = stridePlanes; + strides[2] = strideRows; + strides[3] = strideCols; + } else { + strides[NumDims - 2] = stridePlanes; + strides[NumDims - 3] = strideRows; + strides[NumDims - 4] = strideCols; + } + + return choose( + Cond::Layout == ColMajor>(), + kernel.reverse(kernel_reverse) + .reshape(kernel_dims) + .contract( + output_backward.extract_volume_patches(kernelPlanes, kernelRows, kernelCols, + 1, 1, 1, stridePlanes, strideRows, strideCols, + padding_ztop, padding_zbottom, + padding_top, padding_bottom, + padding_left, padding_right) + .reshape(pre_contract_dims), + contract_dims) + .reshape(post_contract_dims), + output_backward.extract_volume_patches(kernelPlanes, kernelRows, kernelCols, + 1, 1, 1, stridePlanes, strideRows, strideCols, + padding_ztop, padding_zbottom, + padding_top, padding_bottom, + padding_left, padding_right) + .reshape(pre_contract_dims) + .contract(kernel.reverse(kernel_reverse).reshape(kernel_dims), + contract_dims) + .reshape(post_contract_dims)); +} + + +/** CuboidConvolutionBackwardKernel + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the filter of a 3D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_depth, kernel_height, kernel_width) + * output_backward and kernel have to be in the same layout. + * + * The dimensions of the result will be filters, depth, height, width (and others if applicable). + * + * It is possible to swap the order of the depth, width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + * All dimension orders above are given for col-major, and should be reversed for row-major. 
+ */ +template +EIGEN_ALWAYS_INLINE static const typename internal::conditional< + internal::traits::Layout == ColMajor, + const TensorShufflingOp< + const array::Index, 5>, + const TensorReverseOp< + const array, + const TensorReshapingOp< + const DSizes::Index, 5>, + const TensorContractionOp< + const array< IndexPair::Index>, 2>, + const TensorReshapingOp< + const DSizes::Index, 3>, + const Input>, + const TensorReshapingOp< + const DSizes< typename internal::traits::Index, 4>, + const TensorVolumePatchOp + > + > + > + > + >, + const TensorShufflingOp< + const array::Index, 5>, + const TensorReverseOp< + const array, + const TensorReshapingOp< + const DSizes::Index, 5>, + const TensorContractionOp< + const array< IndexPair::Index>, 2>, + const TensorReshapingOp< + const DSizes< typename internal::traits::Index, 4>, + const TensorVolumePatchOp + >, + const TensorReshapingOp< + const DSizes::Index, 3>, + const Input + > + > + > + > + > +>::type +CuboidConvolutionBackwardKernel( + const Input& input, const OutputBackward& output_backward, + typename internal::traits::Index kernelPlanes, + typename internal::traits::Index kernelRows, + typename internal::traits::Index kernelCols, + const DenseIndex stridePlanes = 1, + const DenseIndex strideRows = 1, + const DenseIndex strideCols = 1) { + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == internal::traits::NumDimensions, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4); + + const TensorIndex outputPlanes = isColMajor ? out.dimension(1) : out.dimension(NumDims - 2); + const TensorIndex outputRows = isColMajor ? out.dimension(2) : out.dimension(NumDims - 3); + const TensorIndex outputCols = isColMajor ? out.dimension(3) : out.dimension(NumDims - 4); + + const TensorIndex kernelFilters = isColMajor ? out.dimension(0) : out.dimension(NumDims - 1); + const TensorIndex kernelChannels = isColMajor ? in.dimension(0) : in.dimension(NumDims - 1); + + TensorIndex forward_pad_z, forward_pad_y, forward_pad_x; + const TensorIndex size_z = ceil(inputPlanes / static_cast(stridePlanes)); + const TensorIndex size_y = ceil(inputRows / static_cast(strideRows)); + const TensorIndex size_x = ceil(inputCols / static_cast(strideCols)); + + // Infer padding type. + if (size_z == outputPlanes && size_y == outputRows && size_x == outputCols) { + // SAME padding. + const TensorIndex dz = size_z * stridePlanes + kernelPlanes - 1 - inputPlanes; + const TensorIndex dy = size_y * strideRows + kernelRows - 1 - inputRows; + const TensorIndex dx = size_x * strideCols + kernelCols - 1 - inputCols; + + forward_pad_z = dz - dz / 2; + forward_pad_y = dy - dy / 2; + forward_pad_x = dx - dx / 2; + } else { + // VALID padding. 
+ forward_pad_z = 0; + forward_pad_y = 0; + forward_pad_x = 0; + } + + const TensorIndex padding_ztop = kernelPlanes - 1 - forward_pad_z; + const TensorIndex padding_top = kernelRows - 1 - forward_pad_y; + const TensorIndex padding_left = kernelCols - 1 - forward_pad_x; + + const TensorIndex padding_zbottom = inputPlanes + kernelPlanes - 1 - (outputPlanes - 1) * stridePlanes - 1 - padding_ztop; + const TensorIndex padding_bottom = inputRows + kernelRows - 1 - (outputRows - 1) * strideRows - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelCols - 1 - (outputCols - 1) * strideCols - 1 - padding_left; + + eigen_assert(padding_ztop >= 0); + eigen_assert(padding_zbottom >= 0); + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The output_backward has dimensions out_depth X out_plaens X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward (with input as the + // kernel), it will have dimensions + // (out_depth) X (input_planes * input_rows * input_cols) X (kernel_planes * kernel_rows * kernel_cols) X OTHERS + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = inputRows * inputCols * inputPlanes; + pre_contract_dims[2] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[3] = 1; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[3] *= out.dimension(i); + } + } else { + pre_contract_dims[3] = kernelFilters; + pre_contract_dims[2] = inputRows * inputCols * inputPlanes; + pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes; + pre_contract_dims[0] = 1; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // The input has dimensions in_depth X (input_planes * input_rows * input_cols) X OTHERS + DSizes input_dims; + if (isColMajor) { + input_dims[0] = kernelChannels; + input_dims[1] = inputRows * inputCols * inputPlanes; + input_dims[2] = 1; + for (int i = 4; i < NumDims; ++i) { + input_dims[2] *= in.dimension(i); + } + eigen_assert(input_dims[2] == pre_contract_dims[3]); + } else { + input_dims[2] = kernelChannels; + input_dims[1] = inputRows * inputCols * inputPlanes; + input_dims[0] = 1; + for (int i = 0; i < NumDims - 4; ++i) { + input_dims[0] *= in.dimension(i); + } + eigen_assert(input_dims[0] == pre_contract_dims[0]); + } + + // We will contract along dimensions (1, 2) in in and (1, 3) in out, if + // this is col-major. + // For row-major, it's dimensions (0, 1) in in and (0, 2) in out. + array, 2> contract_dims; + if (isColMajor) { + // col-major: in.contract(output.patches) + contract_dims[0] = IndexPair(1, 1); + contract_dims[1] = IndexPair(2, 3); + } else { + // row-major: output.patches.contract(in) + contract_dims[0] = IndexPair(0, 0); + contract_dims[1] = IndexPair(2, 1); + } + + // After the contraction, the kernel will have dimension + // in_depth X out_depth X kernel_patches X kernel_rows X kernel_cols + // We will need to shuffle the first two dimensions and reverse the spatial dimensions. + // The end shape is: + // out_depth X in_shape X kernel_planes X kernel_rows X kernel_cols + + // This is the shape of the kernel *before* the shuffling. 
+ DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelChannels; + kernel_dims[1] = kernelFilters; + kernel_dims[2] = kernelPlanes; + kernel_dims[3] = kernelRows; + kernel_dims[4] = kernelCols; + } else { + kernel_dims[0] = kernelCols; + kernel_dims[1] = kernelRows; + kernel_dims[2] = kernelPlanes; + kernel_dims[3] = kernelFilters; + kernel_dims[4] = kernelChannels; + } + + // Flip filters and channels. + array kernel_shuffle; + if (isColMajor) { + kernel_shuffle[0] = 1; + kernel_shuffle[1] = 0; + kernel_shuffle[2] = 2; + kernel_shuffle[3] = 3; + kernel_shuffle[4] = 4; + } else { + kernel_shuffle[0] = 0; + kernel_shuffle[1] = 1; + kernel_shuffle[2] = 2; + kernel_shuffle[3] = 4; + kernel_shuffle[4] = 3; + } + + // Reverse the spatial dimensions. + array kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + kernel_reverse[4] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = true; + kernel_reverse[3] = false; + kernel_reverse[4] = false; + } + + DSizes strides; + for (int i = 0; i < NumDims; i++) { + strides[i] = 1; + } + if (isColMajor) { + strides[1] = stridePlanes; + strides[2] = strideRows; + strides[3] = strideCols; + } else { + strides[NumDims - 2] = stridePlanes; + strides[NumDims - 3] = strideRows; + strides[NumDims - 4] = strideCols; + } + return choose( + Cond::Layout == ColMajor>(), + input.reshape(input_dims) + .contract( + output_backward.extract_volume_patches( + inputPlanes, inputRows, inputCols, 1, + 1, 1, stridePlanes, strideRows, strideCols, + + padding_ztop, padding_zbottom, padding_top, + padding_bottom, padding_left, padding_right) + .reshape(pre_contract_dims), + contract_dims) + .reshape(kernel_dims) + .reverse(kernel_reverse) + .shuffle(kernel_shuffle), + output_backward.extract_volume_patches( + inputPlanes, inputRows, inputCols, 1, 1, 1, + stridePlanes, strideRows, strideCols, padding_ztop, + padding_zbottom, padding_top, padding_bottom, + padding_left, padding_right) + .reshape(pre_contract_dims) + .contract(input.reshape(input_dims), contract_dims) + .reshape(kernel_dims) + .reverse(kernel_reverse) + .shuffle(kernel_shuffle)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h new file mode 100644 index 00000000..188dc75b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h @@ -0,0 +1,351 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Ke Yang +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H + +namespace Eigen { + +/** SpatialConvolutionBackwardInput + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the input of a 2D convolution. 
+ * + * The output_backward parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width) + * The output_backward and the kernel must both be in col-major layout. The result will also be in col-major layout. + * + * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the output_backward. The dimensions of the result will be filters, height, width (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + */ + +template +EIGEN_ALWAYS_INLINE +static const typename internal::conditional< + internal::traits::Layout == ColMajor, + TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorContractionOp::Index>, 2>, const TensorReshapingOp::Index, 3>, const TensorReverseOp, const Kernel> >, const TensorReshapingOp::Index, 3>, const TensorImagePatchOp > > >, + TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorContractionOp::Index>, 2>, const TensorReshapingOp::Index, 3>, const TensorImagePatchOp >, const TensorReshapingOp::Index, 3>, const TensorReverseOp, const Kernel> > > > >::type +SpatialConvolutionBackwardInput(const Kernel& kernel, const OutputBackward& output_backward, typename internal::traits::Index inputRows, typename internal::traits::Index inputCols, const DenseIndex stride = 1, const DenseIndex in_stride = 1) { + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > kern(kernel); + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[2] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0]; + + // This is the effective kernel size, taking into account the (in_stride - 1) zero-values + // inserted between consecutive kernel elements in atrous convolution + const TensorIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1); + const TensorIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1); + + const TensorIndex outputRows = isColMajor ? output_backward.dimension(1) : output_backward.dimension(NumDims - 2); + const TensorIndex outputCols = isColMajor ? 
output_backward.dimension(2) : output_backward.dimension(NumDims - 3); + + // Computing the forward padding + const TensorIndex forward_pad_top = ((outputRows - 1) * stride + kernelRowsEff - inputRows) / 2; + const TensorIndex forward_pad_left = ((outputCols - 1) * stride + kernelColsEff - inputCols) / 2; + + const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top; + const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left; + const TensorIndex padding_bottom = inputRows + kernelRowsEff - 1 - (outputRows - 1) * stride - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelColsEff - 1 - (outputCols - 1) * stride - 1 - padding_left; + + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The kernel has dimensions filters X channels X patch_rows X patch_cols + // We need to reverse the kernel along dimensions corresponding to rows and + // cols. + // TODO(yangke): we can make things slightly faster by collapsing the dimensions + // where we don't reverse. Try that once we have a faster compiler. + array kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = false; + kernel_reverse[3] = false; + } + + DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelRows * kernelCols; + } else { + kernel_dims[0] = kernelRows * kernelCols; + kernel_dims[1] = kernelChannels; + kernel_dims[2] = kernelFilters; + } + + // The output_backward has dimensions out_depth X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward, it will have dimensions + // out_depth X (patch_rows * patch_cols) X (input_rows * input_cols * OTHERS) + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols; + pre_contract_dims[2] = inputRows * inputCols; + for (int i = 3; i < NumDims; ++i) { + pre_contract_dims[2] *= out.dimension(i); + } + } else { + pre_contract_dims[2] = kernelFilters; + pre_contract_dims[1] = kernelRows * kernelCols; + pre_contract_dims[0] = inputRows * inputCols; + for (int i = 0; i < NumDims - 3; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // We will contract along dimensions (0, 2) in kernel and (0, 1) in + // output_backward, if this is col-major, and + // dimensions (0, 2) in kernel and (1, 2) in output_backward, if this row-major. 
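+  // Shape walkthrough with assumed example sizes (illustrative only): for 16
+  // filters, 3 input channels, a 5x5 kernel, a 32x32 input and batch size N
+  // (col-major, stride 1, in_stride 1), the reversed kernel reshapes to
+  // (16, 3, 25) and the extracted patches reshape to (16, 25, 32*32*N).
+  // Contracting the filter dimension and the flattened 25-element patch
+  // dimension yields (3, 32*32*N), which is reshaped to (3, 32, 32, N) below.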
+ array, 2> contract_dims; + if (isColMajor) { + // col-major: kernel.contract(output.patches) + contract_dims[0] = IndexPair(0, 0); + contract_dims[1] = IndexPair(2, 1); + } else { + // row-major: output.patches.contract(kernel) + contract_dims[0] = IndexPair(1, 0); + contract_dims[1] = IndexPair(2, 2); + } + + // Post contraction, the dimensions of the input_backprop is + // channels X input_rows X input_cols X OTHERS + DSizes post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelChannels; + post_contract_dims[1] = inputRows; + post_contract_dims[2] = inputCols; + for (int i = 3; i < NumDims; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelChannels; + post_contract_dims[NumDims - 2] = inputRows; + post_contract_dims[NumDims - 3] = inputCols; + for (int i = 0; i < NumDims - 3; ++i) { + post_contract_dims[i] = out.dimension(i); + } + } + + return choose(Cond::Layout == ColMajor>(), + kernel.reverse(kernel_reverse).reshape(kernel_dims).contract(output_backward.extract_image_patches(kernelRows, kernelCols, 1, 1, in_stride, in_stride, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims), contract_dims).reshape(post_contract_dims), + output_backward.extract_image_patches(kernelRows, kernelCols, 1, 1, in_stride, in_stride, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).contract(kernel.reverse(kernel_reverse).reshape(kernel_dims), contract_dims).reshape(post_contract_dims)); +} + + +/** SpatialConvolutionBackwardKernel + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the filter of a 2D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width) + * The output_backward and the kernel must both be in col-major layout. The result will also be in col-major layout. + * + * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the output_backward. The dimensions of the result will be filters, height, width (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output. + * + */ +// TODO(gpapan): Resolve a bug in TensorContractionInputMapper at SpatialConvolutions.h that yangke circumvented by using .reshape().reshape(). +// This can significantly accelerate SpatialConvolutionBackwardKernel. 
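+// Usage sketch (illustrative; the shapes, names and values below are assumed
+// examples, not taken from this header). With a col-major layout of
+// (channels, rows, cols, batch) and a forward convolution that used SAME
+// padding and stride 1:
+//
+//   Eigen::Tensor<float, 4> input(3, 32, 32, 8);
+//   Eigen::Tensor<float, 4> kernel(16, 3, 5, 5);            // (filters, channels, kRows, kCols)
+//   Eigen::Tensor<float, 4> output_backward(16, 32, 32, 8); // gradient w.r.t. the conv output
+//
+//   Eigen::Tensor<float, 4> input_grad =
+//       Eigen::SpatialConvolutionBackwardInput(kernel, output_backward,
+//                                              /*inputRows=*/32, /*inputCols=*/32);
+//   Eigen::Tensor<float, 4> kernel_grad =
+//       Eigen::SpatialConvolutionBackwardKernel(input, output_backward,
+//                                               /*kernelRows=*/5, /*kernelCols=*/5);
+//
+// input_grad has the shape of `input` and kernel_grad the shape of `kernel`.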
+ +template +EIGEN_ALWAYS_INLINE +static const typename internal::conditional< + internal::traits::Layout == ColMajor, + const TensorShufflingOp::Index, 4>, const TensorReverseOp, const TensorReshapingOp::Index, 4>, const TensorContractionOp::Index>, 2>, const TensorReshapingOp::Index, 3>, const Input>, const TensorReshapingOp::Index, 4>, const TensorReshapingOp::Index, 4>, const TensorImagePatchOp > > > > > >, + const TensorShufflingOp::Index, 4>, const TensorReverseOp, const TensorReshapingOp::Index, 4>, const TensorContractionOp::Index>, 2>, const TensorReshapingOp::Index, 4>, const TensorReshapingOp::Index, 4>, const TensorImagePatchOp > >, const TensorReshapingOp::Index, 3>, const Input> > > > > >::type +SpatialConvolutionBackwardKernel(const Input& input, const OutputBackward& output_backward, typename internal::traits::Index kernelRows, typename internal::traits::Index kernelCols, const DenseIndex stride = 1, const DenseIndex in_stride = 1) { + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > out(output_backward); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + + // stride and in_stride cannot both be larger than 1 + eigen_assert(!(stride > 1 && in_stride > 1)); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == internal::traits::NumDimensions, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const TensorIndex inputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputCols = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + + const TensorIndex outputRows = isColMajor ? output_backward.dimension(1) : output_backward.dimension(NumDims - 2); + const TensorIndex outputCols = isColMajor ? output_backward.dimension(2) : output_backward.dimension(NumDims - 3); + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? out.dimensions()[0] : out.dimensions()[NumDims - 1]; + + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? in.dimensions()[0] : in.dimensions()[NumDims - 1]; + + // This is the effective kernel size, taking into account the (in_stride - 1) zero-values + // inserted between consecutive kernel elements in atrous convolution + const TensorIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1); + const TensorIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1); + + // Computing the forward padding + const TensorIndex forward_pad_top = ((outputRows - 1) * stride + kernelRowsEff - inputRows) / 2; + const TensorIndex forward_pad_left = ((outputCols - 1) * stride + kernelColsEff - inputCols) / 2; + + // TODO: factor out the padding computation. 
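+  // Worked numeric example (assumed sizes, for illustration only): with
+  // inputRows = outputRows = 32, stride = 1, in_stride = 1 and kernelRows = 5
+  // (so kernelRowsEff = 5), forward_pad_top = (31 + 5 - 32) / 2 = 2, hence
+  // padding_top = 5 - 1 - 2 = 2 and padding_bottom = 32 + 5 - 1 - 31 - 1 - 2 = 2,
+  // i.e. the output_backward patches are extracted with a symmetric 2-pixel border.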
+ const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top; + const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left; + const TensorIndex padding_bottom = inputRows + kernelRowsEff - 1 - (outputRows - 1) * stride - 1 - padding_top; + const TensorIndex padding_right = inputCols + kernelColsEff - 1 - (outputCols - 1) * stride - 1 - padding_left; + + eigen_assert(padding_top >= 0); + eigen_assert(padding_left >= 0); + eigen_assert(padding_bottom >= 0); + eigen_assert(padding_right >= 0); + + // The output_backward has dimensions out_depth X out_rows X out_cols X OTHERS + // When we extract the image patches from output_backward (with input as the + // kernel), it will have dimensions + // (out_depth) X (input_rows * input_cols) X (kernel_rows * kernel_cols) X OTHERS + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelFilters; + pre_contract_dims[1] = inputRows * inputCols; + pre_contract_dims[2] = kernelRows * kernelCols; + pre_contract_dims[3] = 1; + for (int i = 3; i < NumDims; ++i) { + pre_contract_dims[3] *= out.dimension(i); + } + } else { + pre_contract_dims[3] = kernelFilters; + pre_contract_dims[2] = inputRows * inputCols; + pre_contract_dims[1] = kernelRows * kernelCols; + pre_contract_dims[0] = 1; + for (int i = 0; i < NumDims - 3; ++i) { + pre_contract_dims[0] *= out.dimension(i); + } + } + + // The input has dimensions in_depth X (input_rows * input_cols) X OTHERS + DSizes input_dims; + if (isColMajor) { + input_dims[0] = kernelChannels; + input_dims[1] = inputRows * inputCols; + input_dims[2] = 1; + for (int i = 3; i < NumDims; ++i) { + input_dims[2] *= in.dimension(i); + } + eigen_assert(input_dims[2] == pre_contract_dims[3]); + } else { + input_dims[2] = kernelChannels; + input_dims[1] = inputRows * inputCols; + input_dims[0] = 1; + for (int i = 0; i < NumDims - 3; ++i) { + input_dims[0] *= in.dimension(i); + } + eigen_assert(input_dims[0] == pre_contract_dims[0]); + } + + // We will contract along dimensions (1, 2) in in and (1, 3) in out, if + // this is col-major. + // For row-major, it's dimensions (0, 1) in in and (0, 2) in out. + array, 2> contract_dims; + if (isColMajor) { + // col-major: in.contract(output.patches) + contract_dims[0] = IndexPair(1, 1); + contract_dims[1] = IndexPair(2, 3); + } else { + // row-major: output.patches.contract(in) + contract_dims[0] = IndexPair(0, 0); + contract_dims[1] = IndexPair(2, 1); + } + + // After the contraction, the kernel will have dimension + // in_depth X out_depth X kernel_rows X kernel_cols + // We will need to shuffle the first two dimensions and reverse the latter + // two dimensions. + // The end shape is + // out_depth X in_shape X kernel_rows X kernel_cols + + // This is the shape of the kernel *before* the shuffling. 
+ DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelChannels; + kernel_dims[1] = kernelFilters; + kernel_dims[2] = kernelRows; + kernel_dims[3] = kernelCols; + } else { + kernel_dims[0] = kernelCols; + kernel_dims[1] = kernelRows; + kernel_dims[2] = kernelFilters; + kernel_dims[3] = kernelChannels; + } + + array kernel_shuffle; + if (isColMajor) { + kernel_shuffle[0] = 1; + kernel_shuffle[1] = 0; + kernel_shuffle[2] = 2; + kernel_shuffle[3] = 3; + } else { + kernel_shuffle[0] = 0; + kernel_shuffle[1] = 1; + kernel_shuffle[2] = 3; + kernel_shuffle[3] = 2; + } + + array kernel_reverse; + if (isColMajor) { + kernel_reverse[0] = false; + kernel_reverse[1] = false; + kernel_reverse[2] = true; + kernel_reverse[3] = true; + } else { + kernel_reverse[0] = true; + kernel_reverse[1] = true; + kernel_reverse[2] = false; + kernel_reverse[3] = false; + } + + return choose(Cond::Layout == ColMajor>(), + input.reshape(input_dims).contract(output_backward.extract_image_patches(inputRows, inputCols, in_stride, in_stride, 1, 1, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).reshape(pre_contract_dims), contract_dims).reshape(kernel_dims).reverse(kernel_reverse).shuffle(kernel_shuffle), + output_backward.extract_image_patches(inputRows, inputCols, in_stride, in_stride, 1, 1, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).reshape(pre_contract_dims).contract(input.reshape(input_dims), contract_dims).reshape(kernel_dims).reverse(kernel_reverse).shuffle(kernel_shuffle)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h new file mode 100644 index 00000000..dfb9dced --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h @@ -0,0 +1,179 @@ +#ifndef EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H +#define EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H + +#include "Patch3d.h" + +namespace Eigen { + +/** CuboidConvolution + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a 3D convolution over a multichannel input voxel block. + * + * The input parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others). + * The kernel parameter is expected to be a 5D tensor (filters, channels, kernel_depth, kernel_height, kernel_width). + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be filters, depth, height, width (and others if applicable). + * + * The input and kernel have to be in the same layout, and both row-major and + * col-major are supported. The shapes given above are for col-major layout. + * For row-major, all dimensions should be reversed. + * + * It is possible to swap the order of the depth, width, and height dimensions provided that the same order is used in the input, the kernel, and the output. 
+ */ +template +EIGEN_ALWAYS_INLINE +static const typename internal::conditional < + internal::traits::Layout == ColMajor, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array::Index>, 1>, + const TensorReshapingOp< + const DSizes::Index, 2>, + const Kernel>, + const TensorReshapingOp< + const DSizes::Index, 2>, + const TensorVolumePatchOp > > >, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array::Index>, 1>, + const TensorReshapingOp< + const DSizes::Index, 2>, + const TensorVolumePatchOp > , + const TensorReshapingOp< + const DSizes::Index, 2>, + const Kernel> > > >::type +CuboidConvolution(const Input& input, const Kernel& kernel, + const DenseIndex stridePlanes = 1, + const DenseIndex strideRows = 1, + const DenseIndex strideCols = 1, + const PaddingType padding_type = PADDING_SAME) { + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > kern(kernel); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits::Layout == ColMajor); + static const int NumDims = internal::traits::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result. + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[4]; + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[3]; + + // Spatial size of the kernel. + const TensorIndex kernelDepth = isColMajor ? kern.dimensions()[2] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[3] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[4] : kern.dimensions()[0]; + + if (isColMajor) { + eigen_assert(kernelChannels == in.dimension(0)); + } else { + eigen_assert(kernelChannels == in.dimension(NumDims - 1)); + } + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? 
in.dimension(3) : in.dimension(NumDims - 4); + + const float stride_planes_f = static_cast(stridePlanes); + const float stride_rows_f = static_cast(strideRows); + const float stride_cols_f = static_cast(strideCols); + TensorIndex out_depth; + TensorIndex out_height; + TensorIndex out_width; + switch (padding_type) { + case PADDING_VALID: + out_depth = ceil((inputPlanes - kernelDepth + 1.f) / stride_planes_f); + out_height = ceil((inputRows - kernelRows + 1.f) / stride_rows_f); + out_width = ceil((inputCols - kernelCols + 1.f) / stride_cols_f); + break; + case PADDING_SAME: + out_depth = ceil(inputPlanes / stride_planes_f); + out_height = ceil(inputRows / stride_rows_f); + out_width = ceil(inputCols / stride_cols_f); + break; + default: + eigen_assert(false && "unexpected padding"); + } + + DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels * kernelDepth * kernelRows * kernelCols; + } else { + kernel_dims[0] = kernelChannels * kernelDepth * kernelRows * kernelCols; + kernel_dims[1] = kernelFilters; + } + + // Molds the output of the patch extraction result into a 2D tensor: + // - the first dimension (dims[0]): the patch values to be multiplied with the kernels + // - the second dimension (dims[1]): everything else + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelChannels * kernelDepth * kernelRows * kernelCols; + pre_contract_dims[1] = out_depth * out_height * out_width; + for (int i = 4; i < NumDims; ++i) { + pre_contract_dims[1] *= in.dimension(i); + } + } else { + pre_contract_dims[1] = kernelChannels * kernelDepth * kernelRows * kernelCols; + pre_contract_dims[0] = out_depth * out_height * out_width; + for (int i = 0; i < NumDims - 4; ++i) { + pre_contract_dims[0] *= in.dimension(i); + } + } + + array, 1> contract_dims; + contract_dims[0] = IndexPair(1, 0); + + // Molds the output of the contraction into the shape expected by the user + // (assuming ColMajor): + // - 1st dim: kernel filters + // - 2nd dim: output depth + // - 3nd dim: output height + // - 4rd dim: output width + // - 5th dim and beyond: everything else including batch size + DSizes post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelFilters; + post_contract_dims[1] = out_depth; + post_contract_dims[2] = out_height; + post_contract_dims[3] = out_width; + for (int i = 4; i < NumDims; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelFilters; + post_contract_dims[NumDims - 2] = out_depth; + post_contract_dims[NumDims - 3] = out_height; + post_contract_dims[NumDims - 4] = out_width; + for (int i = 0; i < NumDims - 4; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } + + return choose( + Cond::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input.extract_volume_patches( + kernelDepth, kernelRows, kernelCols, stridePlanes, + strideRows, strideCols, padding_type) + .reshape(pre_contract_dims), + contract_dims) + .reshape(post_contract_dims), + input.extract_volume_patches(kernelDepth, kernelRows, kernelCols, + stridePlanes, strideRows, strideCols, + padding_type) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims) + .reshape(post_contract_dims)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h new file 
mode 100644 index 00000000..89190eb1 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h @@ -0,0 +1,240 @@ +#ifndef EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H +#define EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H + +#if not defined(__CUDACC__) +#include +#endif + +namespace Eigen { +namespace internal { + +/** Extract3DPatches + * \ingroup CXX11_NeuralNetworksModule + * + * \brief Extracts 3D patches from a multichannel input volume. + * + * The input parameter is expected to be a tensor with a rank of 4 or more + * (channels, depth, height, width, optional others in col-major, and the + * reverse order in row-major). + + * The return value will be a tensor of 3 more dimension than the input tensor. + * In col-major, the first 4 dimensions of the result are: channels, patch_depth, + * patch_height, patch_width. The next dimensions will identify the patch + * position on the 3D grid of extracted patches: z, y, x. The remaining + * dimensions, if any, will be the same as the 'other' dimensions of the input + * tensor. + */ + +template +EIGEN_ALWAYS_INLINE static const TensorStridingOp< + const array::Index, + internal::traits::NumDimensions + 3>, + const TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions + 3>, + const TensorPatchOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorPaddingOp< + const array::Index>, + internal::traits::NumDimensions>, + const Input> > > > +Extract3DPatches( + const Input& input, const DenseIndex patchPlanes, + const DenseIndex patchRows, const DenseIndex patchCols, + const DenseIndex stridePlanes, const DenseIndex strideRows, + const DenseIndex strideCols, + const DenseIndex paddingZTop, const DenseIndex paddingZBottom, + const DenseIndex paddingTop, const DenseIndex paddingBottom, + const DenseIndex paddingLeft, const DenseIndex paddingRight, + const typename internal::traits::Scalar padding_value = 0) { + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + static const int NumDims = internal::traits::NumDimensions; + static const int ExtDims = NumDims + 3; + + // Tensor size after patch extraction. We add three dimensions to unpack the + // linear patch index into a 3D grid over which stride() can work. + DSizes pre_stride_dims; + + if (isColMajor) { + pre_stride_dims[0] = in.dimension(0); + pre_stride_dims[1] = patchPlanes; + pre_stride_dims[2] = patchRows; + pre_stride_dims[3] = patchCols; + } else { + pre_stride_dims[ExtDims - 1] = in.dimension(NumDims - 1); + pre_stride_dims[ExtDims - 4] = patchCols; + pre_stride_dims[ExtDims - 3] = patchRows; + pre_stride_dims[ExtDims - 2] = patchPlanes; + } + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4); + + array, NumDims> paddings; + for (int i = 0; i < NumDims; ++i) { + paddings[i] = IndexPair(0, 0); + } + + paddings[isColMajor ? 1 : (NumDims - 2)] = IndexPair(paddingZTop, paddingZBottom); + paddings[isColMajor ? 2 : (NumDims - 3)] = IndexPair(paddingTop, paddingBottom); + paddings[isColMajor ? 
3 : (NumDims - 4)] = IndexPair(paddingLeft, paddingRight); + + pre_stride_dims[isColMajor ? 4 : (ExtDims - 5)] = inputPlanes + paddingZBottom + paddingZTop - patchPlanes + 1; + pre_stride_dims[isColMajor ? 5 : (ExtDims - 6)] = inputRows + paddingTop + paddingBottom - patchRows + 1; + pre_stride_dims[isColMajor ? 6 : (ExtDims - 7)] = inputCols + paddingLeft + paddingRight - patchCols + 1; + + if (isColMajor) { + for (int i = 7; i < NumDims + 3; ++i) { + pre_stride_dims[i] = in.dimension(i - 3); + } + } else { + for (int i = 0; i < NumDims - 4; ++i) { + pre_stride_dims[i] = in.dimension(i); + } + } + + DSizes patch_dims; + if (isColMajor) { + patch_dims[0] = in.dimension(0); + patch_dims[1] = patchPlanes; + patch_dims[2] = patchRows; + patch_dims[3] = patchCols; + for (int i = 4; i < NumDims; ++i) { + patch_dims[i] = 1; + } + } else { + patch_dims[NumDims - 1] = in.dimension(NumDims - 1); + patch_dims[NumDims - 4] = patchCols; + patch_dims[NumDims - 3] = patchRows; + patch_dims[NumDims - 2] = patchPlanes; + for (int i = 0; i < NumDims - 4; i++) { + patch_dims[i] = 1; + } + } + + array strides; + if (isColMajor) { + // No striding within the patches. + for (int i = 0; i < 4; ++i) { + strides[i] = 1; + } + // Apply striding in the spatial patch grid dimensions only. + strides[4] = stridePlanes; + strides[5] = strideRows; + strides[6] = strideCols; + // No striding in the remaining dimensions (batches, ...). + for (int i = 7; i < NumDims + 3; i++) { + strides[i] = 1; + } + } else { + // No striding within the patches. + for (int i = 1; i <= 4; ++i) { + strides[ExtDims - i] = 1; + } + // Apply striding in the spatial patch grid dimensions only. + strides[ExtDims - 7] = strideCols; + strides[ExtDims - 6] = strideRows; + strides[ExtDims - 5] = stridePlanes; + // No striding in the remaining dimensions (batches, ...). + for (int i = 0; i < NumDims - 4; i++) { + strides[i] = 1; + } + } + + // TODO(mjanusz): Consider getting rid of pad(), and stride() and extend + // extract_patches to take additional parameters for padding/striding, + // similarly to etract_image_patches. + return input.pad(paddings, padding_value).extract_patches(patch_dims).reshape(pre_stride_dims).stride(strides); +} + + +template +EIGEN_ALWAYS_INLINE static const TensorStridingOp< + const array::Index, + internal::traits::NumDimensions + 3>, + const TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions + 3>, + const TensorPatchOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorPaddingOp< + const array::Index>, + internal::traits::NumDimensions>, + const Input> > > > +Extract3DPatches( + const Input& input, const DenseIndex patchPlanes, + const DenseIndex patchRows, const DenseIndex patchCols, + const DenseIndex stridePlanes, const DenseIndex strideRows, + const DenseIndex strideCols, const PaddingType padding_type, + const typename internal::traits::Scalar padding_value = 0) { + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + static const int NumDims = internal::traits::NumDimensions; + + const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex inputRows = isColMajor ? 
in.dimension(2) : in.dimension(NumDims - 3); + const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4); + + switch (padding_type) { + case PADDING_VALID: + // No padding in any dimension. + return Extract3DPatches(input, patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + 0, 0, 0, 0, 0, 0, padding_value); + case PADDING_SAME: { + // The side of the tensor before striding should be just the expected + // output times the stride. + const TensorIndex size_z = ceil(inputPlanes / static_cast(stridePlanes)) * stridePlanes; + const TensorIndex size_y = ceil(inputRows / static_cast(strideRows)) * strideRows; + const TensorIndex size_x = ceil(inputCols / static_cast(strideCols)) * strideCols; + + // The size of the patch space is going to be: padded_input_size - patch_size + 1. + // This has to match the expected size before striding (pre_stride_dims). + // The deltas below extend the input to the expected size. + const TensorIndex dz = size_z + patchPlanes - 1 - inputPlanes; + const TensorIndex dy = size_y + patchRows - 1 - inputRows; + const TensorIndex dx = size_x + patchCols - 1 - inputCols; + + return Extract3DPatches(input, patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + dz - dz / 2, dz / 2, + dy - dy / 2, dy / 2, + dx - dx / 2, dx / 2, + padding_value); + } + default: + eigen_assert(false && "unexpected padding"); + // unreachable code to avoid missing return warning. + return Extract3DPatches(input, patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + 0, 0, 0, 0, 0, 0, padding_value); + } +} + +// TODO(mjanusz): Switch this to a 'using' alias once CUDA supports C++11. +template +struct Extract3DPatchesType { + typedef const TensorStridingOp< const array::Index, internal::traits::NumDimensions + 3>, + const TensorReshapingOp< const DSizes::Index, internal::traits::NumDimensions + 3>, + const TensorPatchOp< const DSizes::Index, internal::traits::NumDimensions>, + const TensorPaddingOp< const array< IndexPair::Index>, internal::traits::NumDimensions>, + const Input> > > > type; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h new file mode 100644 index 00000000..942b060b --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h @@ -0,0 +1,433 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H +#define EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H + +#include "Patch3d.h" + +namespace Eigen { + +/** SpatialMaxPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a max-pooling over a multichannel input image. + * + * The input parameter is expected to be a with a rank of 4 (channels, height, width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, height, width, and others (in col-major, and the reverse of that if the input was row-major). 
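 *
 * A minimal usage sketch (illustrative only; assumes the unsupported CXX11
 * Tensor and NeuralNetworks headers are included, and the dimension values
 * below are arbitrary):
 * \code
 * Eigen::Tensor<float, 4> input(3, 11, 11, 7);   // channels, rows, cols, batch (col-major)
 * input.setRandom();
 * Eigen::Tensor<float, 4> pooled =
 *     Eigen::SpatialMaxPooling(input, 2, 2,      // patchRows, patchCols
 *                              2, 2,             // strideRows, strideCols
 *                              Eigen::PADDING_VALID);
 * // pooled has dimensions (3, 5, 5, 7): ceil((11 - 2 + 1) / 2) = 5 per spatial dim.
 * \endcode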
+ * + * The order of the width and height dimensions can be swapped if needed. + * +*/ +#if !defined(EIGEN_HAS_INDEX_LIST) +template +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorReductionOp::Scalar>::type>, const Eigen::array, const TensorImagePatchOp > > +#else +template +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorReductionOp::Scalar>::type>, typename internal::conditional::Layout == ColMajor, const Eigen::IndexList, Eigen::type2index<2> >, const Eigen::IndexList, Eigen::type2index<3> > >::type, const TensorImagePatchOp > > +#endif +SpatialMaxPooling(const Input& input, DenseIndex patchRows, DenseIndex patchCols, + DenseIndex strideRows, DenseIndex strideCols, const PaddingType padding_type, + DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1) +{ + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + + const DenseIndex patchRowsEff = patchRows + (patchRows - 1) * (in_strideRows - 1); + const DenseIndex patchColsEff = patchCols + (patchCols - 1) * (in_strideCols - 1); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + static const int idxRows = isColMajor ? 1 : 2; + static const int idxCols = isColMajor ? 2 : 1; + + // Molds the output of the reduction into the shape expected by the user. + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + Eigen::DSizes::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRowsEff + 1.f) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchColsEff + 1.f) / static_cast(strideCols)); + } else { + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast(strideCols)); + } + post_reduce_dims[3] = in.dimension(3); + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array reduction_dims; + if (isColMajor) { + reduction_dims[0] = 1; + reduction_dims[1] = 2; + } else { + reduction_dims[0] = 2; + reduction_dims[1] = 3; + } +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + typename internal::conditional::Layout == ColMajor, const Eigen::IndexList, Eigen::type2index<2> >, const Eigen::IndexList, Eigen::type2index<3> > >::type reduction_dims; +#endif + + return input.extract_image_patches(patchRows, patchCols, strideRows, strideCols, in_strideRows, in_strideCols, padding_type, -Eigen::NumTraits::Scalar>::type>::highest()).maximum(reduction_dims).reshape(post_reduce_dims); +} + +/** CuboidMaxPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a max-pooling over a multichannel input volume. + * + * The input parameter is expected to be a tensor with a rank of 5 (channels, depth, height, width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. 
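 * For instance (a minimal sketch; the dimension values are arbitrary):
 * \code
 * Eigen::Tensor<float, 5> input(3, 8, 8, 8, 2);  // channels, depth, rows, cols, batch (col-major)
 * input.setRandom();
 * Eigen::Tensor<float, 5> pooled =
 *     Eigen::CuboidMaxPooling(input, 2, 2, 2,    // patchPlanes, patchRows, patchCols
 *                             2, 2, 2,           // stridePlanes, strideRows, strideCols
 *                             Eigen::PADDING_VALID);
 * // pooled has dimensions (3, 4, 4, 4, 2): ceil((8 - 2 + 1) / 2) = 4 per pooled dim.
 * \endcode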
The dimensions of the result will be channels, depth, height, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the depth, width and height dimensions can be swapped if needed. + * +*/ +#if !defined(EIGEN_HAS_INDEX_LIST) +template +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes::NumDimensions>, + const TensorReductionOp< + internal::MaxReducer, const Eigen::array, + const TensorReshapingOp< + const Eigen::DSizes, + const TensorVolumePatchOp > > > +#else +template +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes::NumDimensions>, + const TensorReductionOp< + internal::MaxReducer, + const Eigen::IndexList >, + const TensorReshapingOp< + const Eigen::DSizes, + const TensorVolumePatchOp > > > +#endif +CuboidMaxPooling(const Input& input, DenseIndex patchPlanes, + DenseIndex patchRows, DenseIndex patchCols, + DenseIndex stridePlanes, DenseIndex strideRows, + DenseIndex strideCols, const PaddingType padding_type) { + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 5, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + + static const int idxPlanes = isColMajor ? 1 : 3; + static const int idxRows = 2; + static const int idxCols = isColMajor ? 3 : 1; + + // Molds the output of the reduction into the shape expected by the used + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output depth + // - 3rd dim: output height + // - 4th dim: output width + // - 5th dim and beyond: everything else including batch size + Eigen::DSizes::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxPlanes] = numext::ceil((in.dimension(idxPlanes) - patchPlanes + 1.f) / static_cast(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRows + 1.f) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchCols + 1.f) / static_cast(strideCols)); + } else { + post_reduce_dims[idxPlanes] = numext::ceil(in.dimension(idxPlanes) / static_cast(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast(strideCols)); + } + post_reduce_dims[4] = in.dimension(4); + + Eigen::DSizes pre_reduce_dims; + pre_reduce_dims[1] = patchRows * patchCols * patchPlanes; + if (isColMajor) { + pre_reduce_dims[0] = post_reduce_dims[0]; + pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3] * post_reduce_dims[4]; + } else { + pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3]; + pre_reduce_dims[2] = post_reduce_dims[4]; + } + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array reduction_dims; + reduction_dims[0] = 1; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. 
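  // The return expression below works as follows: extract_volume_patches()
  // yields one patch per output location (padded entries are filled with
  // -highest, so they can never win the max), reshape(pre_reduce_dims)
  // flattens each patch into dimension 1, maximum(reduction_dims) collapses
  // that dimension, and the final reshape restores the col-major
  // (channels, depth, height, width, batch) output layout.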
+ Eigen::IndexList > reduction_dims; +#endif + return input.extract_volume_patches(patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + padding_type, -Eigen::NumTraits::highest()) + .reshape(pre_reduce_dims) + .maximum(reduction_dims) + .reshape(post_reduce_dims); +} + + +/** SpatialAvgPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies an average pooling over a multichannel input image. + * + * The input parameter is expected to be a tensor with a rank of 4 (channels, height, width, others in col-major, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, height, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the width and height dimensions can be swapped if needed. + * +*/ +namespace internal { + +template struct AvgPoolMeanReducer +{ +#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__) + // We only support packet access for floats. + static const bool PacketAccess = internal::is_same::value; +#else + static const bool PacketAccess = false; +#endif + static const bool IsStateful = true; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE AvgPoolMeanReducer() : scalarCount_(0) { + typedef typename packet_traits::type Packet; + packetCount_ = pset1(0.0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + if (t != -Eigen::NumTraits::highest()) { + (*accum) = (*accum) + t; + scalarCount_++; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast(0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + eigen_assert(scalarCount_ > 0); + return accum / scalarCount_; + } + +#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__) +#ifdef EIGEN_VECTORIZE_AVX +#define pequal(a,b) _mm256_cmp_ps(a,b,_CMP_EQ_UQ) +#define psel(a,b,false_mask) _mm256_blendv_ps(a,b,false_mask) +#else +#define pequal(a,b) _mm_cmpeq_ps(a,b) +#define psel(a,b,false_mask) _mm_or_ps(_mm_andnot_ps(false_mask, a), _mm_and_ps(false_mask, b)) +#endif + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + reducePacketWithType(static_cast(0), p, accum); + } + + template + void reducePacketWithType(T, const Packet& p, Packet* accum) { + Packet skip_mask = pequal(p, pset1(-Eigen::NumTraits::highest())); + (*accum) = padd(*accum, psel(p, pset1(0), skip_mask)); + packetCount_ = padd(packetCount_, psel(pset1(1), pset1(0), skip_mask)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(0); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, packetCount_); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (saccum + predux(vaccum)) / (scalarCount_ + predux(packetCount_)); + } +#endif + + protected: + typedef typename packet_traits::type Packet; + int scalarCount_; + Packet packetCount_; +}; + +} // namespace internal + +#if !defined(EIGEN_HAS_INDEX_LIST) +template +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorReductionOp::Scalar>::type>, const Eigen::array, const TensorImagePatchOp > > +#else +template +EIGEN_ALWAYS_INLINE +static const TensorReshapingOp::Index, 
internal::traits::NumDimensions>, const TensorReductionOp::Scalar>::type>, typename internal::conditional::Layout == ColMajor, const Eigen::IndexList, Eigen::type2index<2> >, const Eigen::IndexList, Eigen::type2index<3> > >::type, const TensorImagePatchOp > > +#endif +SpatialAvgPooling(const Input& input, DenseIndex patchRows, DenseIndex patchCols, + DenseIndex strideRows, DenseIndex strideCols, const PaddingType padding_type, + DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1) +{ + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + + const DenseIndex patchRowsEff = patchRows + (patchRows - 1) * (in_strideRows - 1); + const DenseIndex patchColsEff = patchCols + (patchCols - 1) * (in_strideCols - 1); + + static const bool isColMajor = (internal::traits::Layout == ColMajor); + static const int idxRows = isColMajor ? 1 : 2; + static const int idxCols = isColMajor ? 2 : 1; + + // Molds the output of the reduction into the shape expected by the user. + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + Eigen::DSizes::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRowsEff + 1.f) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchColsEff + 1.f) / static_cast(strideCols)); + } else { + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast(strideCols)); + } + post_reduce_dims[3] = in.dimension(3); + + typedef typename internal::remove_const::Scalar>::type CoeffReturnType; + internal::AvgPoolMeanReducer mean_with_nan; + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array reduction_dims; + if (isColMajor) { + reduction_dims[0] = 1; + reduction_dims[1] = 2; + } else { + reduction_dims[0] = 2; + reduction_dims[1] = 3; + } +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + typename internal::conditional::Layout == ColMajor, const Eigen::IndexList, Eigen::type2index<2> >, const Eigen::IndexList, Eigen::type2index<3> > >::type reduction_dims; +#endif + return input.extract_image_patches(patchRows, patchCols, strideRows, strideCols, in_strideRows, in_strideCols, padding_type, -Eigen::NumTraits::Scalar>::type>::highest()).reduce(reduction_dims, mean_with_nan).reshape(post_reduce_dims); +} + + +/** CuboidAvgPooling + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies an average pooling over a multichannel input volume. + * + * The input parameter is expected to be a tensor with a rank of 5 (channels, depth, height, width, others, and the reverse of that in row-major). + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, depth, width, and others (in col-major, and the reverse of that if the input was row-major). + * + * The order of the depth, width and height dimensions can be swapped if needed. 
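 *
 * A minimal usage sketch (illustrative only; the dimension values are arbitrary):
 * \code
 * Eigen::Tensor<float, 5> input(3, 8, 8, 8, 2);  // channels, depth, rows, cols, batch (col-major)
 * input.setRandom();
 * Eigen::Tensor<float, 5> averaged =
 *     Eigen::CuboidAvgPooling(input, 2, 2, 2,    // patchPlanes, patchRows, patchCols
 *                             2, 2, 2,           // stridePlanes, strideRows, strideCols
 *                             Eigen::PADDING_SAME);
 * // averaged has dimensions (3, 4, 4, 4, 2); with PADDING_SAME, border patches
 * // are averaged only over the entries that actually fall inside the input.
 * \endcode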
+ * +*/ +#if !defined(EIGEN_HAS_INDEX_LIST) +template +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes::NumDimensions>, + const TensorReductionOp< + internal::AvgPoolMeanReducer, const Eigen::array, + const TensorReshapingOp< + const Eigen::DSizes, + const TensorVolumePatchOp > > > +#else +template +EIGEN_ALWAYS_INLINE static const TensorReshapingOp< + const Eigen::DSizes::NumDimensions>, + const TensorReductionOp< + internal::AvgPoolMeanReducer, + const Eigen::IndexList >, + const TensorReshapingOp< + const Eigen::DSizes, + const TensorVolumePatchOp > > > +#endif +CuboidAvgPooling(const Input& input, DenseIndex patchPlanes, + DenseIndex patchRows, DenseIndex patchCols, + DenseIndex stridePlanes, DenseIndex strideRows, + DenseIndex strideCols, const PaddingType padding_type) { + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 5, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + + static const int idxPlanes = isColMajor ? 1 : 3; + static const int idxRows = 2; + static const int idxCols = isColMajor ? 3 : 1; + // Molds the output of the reduction into the shape expected by the used + // (assuming col-major): + // - 1st dim: channels + // - 2nd dim: outupt depth + // - 3rd dim: output height + // - 4th dim: output width + // - 5th dim and beyond: everything else including batch size + Eigen::DSizes::NumDimensions> post_reduce_dims; + post_reduce_dims[0] = in.dimension(0); + if (padding_type == PADDING_VALID) { + post_reduce_dims[idxPlanes] = numext::ceil((in.dimension(idxPlanes) - patchPlanes + 1.f) / static_cast(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRows + 1.f) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchCols + 1.f) / static_cast(strideCols)); + } else { + post_reduce_dims[idxPlanes] = numext::ceil(in.dimension(idxPlanes) / static_cast(stridePlanes)); + post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast(strideRows)); + post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast(strideCols)); + } + post_reduce_dims[4] = in.dimension(4); + + Eigen::DSizes pre_reduce_dims; + pre_reduce_dims[1] = patchRows * patchCols * patchPlanes; + if (isColMajor) { + pre_reduce_dims[0] = post_reduce_dims[0]; + pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3] * post_reduce_dims[4]; + } else { + pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3]; + pre_reduce_dims[2] = post_reduce_dims[4]; + } + + typedef typename internal::remove_const::Scalar>::type CoeffReturnType; + internal::AvgPoolMeanReducer mean_with_nan; + +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array reduction_dims; + reduction_dims[0] = 1; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. 
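  // As in CuboidMaxPooling, padded positions are filled with -highest; the
  // AvgPoolMeanReducer treats that value as a sentinel and excludes it from
  // both the running sum and the element count, so each output is the mean of
  // only the in-bounds entries of its patch.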
+ Eigen::IndexList > reduction_dims; +#endif + return input.extract_volume_patches(patchPlanes, patchRows, patchCols, + stridePlanes, strideRows, strideCols, + padding_type, -Eigen::NumTraits::highest()) + .reshape(pre_reduce_dims) + .reduce(reduction_dims, mean_with_nan) + .reshape(post_reduce_dims); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h new file mode 100644 index 00000000..f0e21ab9 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h @@ -0,0 +1,83 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H +#define EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H + +namespace Eigen { + +/** SoftMax + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a softmax + * + * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other). + * + * The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order. + * +*/ + +namespace { +class SoftmaxOp { + public: + EIGEN_ALWAYS_INLINE SoftmaxOp(const float beta) : beta_(beta) { } + + template EIGEN_ALWAYS_INLINE + typename Input::Dimensions dimensions(const Input& input) const { + return input.dimensions(); + } + + template + void eval(const Input& input, Output& output, const Device& device) const + { +#if !defined(EIGEN_HAS_INDEX_LIST) + // nvcc doesn't support cxx11 + Eigen::array::Index, 1> depth_dim; + depth_dim[0] = 0; + Eigen::array::Index, 2> bcast; + bcast[0] = dimensions(input)[0]; + bcast[1] = 1; + DSizes::Index, 2> dims2d; + dims2d[0] = 1; + dims2d[1] = dimensions(input)[1]; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. 
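      // The two device-assignments below compute, for every column j of the
      // rank-2 input (depth runs along dimension 0):
      //   output(i, j) = exp(beta * (input(i, j) - max_k input(k, j)))
      //                  / sum_k exp(beta * (input(k, j) - max_k input(k, j)))
      // i.e. a numerically stable softmax along the depth dimension, scaled by beta.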
+ Eigen::IndexList> depth_dim; + Eigen::IndexList> bcast; + bcast.set(0, dimensions(input)[0]); + Eigen::IndexList, typename internal::traits::Index> dims2d; + dims2d.set(1, dimensions(input)[1]); +#endif + + output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp(); + output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + } + + private: + const float beta_; +}; +} + + +template +EIGEN_ALWAYS_INLINE +static const TensorCustomUnaryOp +SoftMax(const Input& input, const float beta) +{ + EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const SoftmaxOp op(beta); + return input.customOp(op); +} + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h new file mode 100644 index 00000000..8e2ddca6 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h @@ -0,0 +1,775 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H +#define EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H + +namespace Eigen { + +namespace internal { + +// These optimizations require vector instructions +#ifdef EIGEN_VECTORIZE + +// TODO: Consolidate this part of the code with the image patch extraction code +// since they are both very similar. 
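// The TensorContractionInputMapper specialization below lets the contraction
// kernel read image-patch values on the fly from the original input tensor,
// rather than materializing the full patch tensor in memory. Linearized patch
// indices are decomposed into depth/row/column offsets with precomputed
// TensorIntDivisor helpers, and positions that fall into the padding region
// simply yield zero.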
+template +class TensorContractionInputMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> +{ + public: + typedef TensorContractionInputMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self; + typedef TensorContractionSubMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper; + typedef SubMapper VectorMapper; + typedef SubMapper LinearMapper; + typedef Scalar_ Scalar; + typedef typename packet_traits::type Packet; + + TensorContractionInputMapper(const TensorEvaluator >, Device>& tensor, + const nocontract_t&, const nocontract_t&, + const contract_t&, const contract_t&) + : m_impl(tensor.impl().impl()) + { + Index patch_rows; + Index patch_depth; + if (internal::traits::Layout == ColMajor) { + patch_depth = tensor.impl().dimensions()[0]; + patch_rows = tensor.impl().dimensions()[1]; + m_patch_cols = tensor.impl().dimensions()[2]; + m_num_patches = tensor.impl().dimensions()[3]; + } else { + static const int NumDims = tensor.impl().dimensions().size(); + patch_depth = tensor.impl().dimensions()[NumDims - 1]; + patch_rows = tensor.impl().dimensions()[NumDims - 2]; + m_patch_cols = tensor.impl().dimensions()[NumDims - 3]; + m_num_patches = tensor.impl().dimensions()[NumDims - 4]; + } + m_patch_row_inflate_strides = tensor.impl().rowInflateStride(); + m_patch_col_inflate_strides = tensor.impl().colInflateStride(); + + m_colStride = patch_rows; + + m_outputRows = tensor.impl().outputRows(); + m_row_strides = tensor.impl().userRowStride(); + m_col_strides = tensor.impl().userColStride(); + + m_in_row_strides = tensor.impl().userInRowStride(); + m_in_col_strides = tensor.impl().userInColStride(); + + if (internal::traits::Layout == ColMajor) { + m_inputRows = tensor.impl().impl().dimensions()[1]; + m_inputCols = tensor.impl().impl().dimensions()[2]; + } else { + static const int NumDims = tensor.impl().impl().dimensions().size(); + m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2]; + m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3]; + } + + m_rowInputStride = patch_depth; + m_colInputStride = patch_depth * m_inputRows; + m_patchInputStride = patch_depth * m_inputRows * m_inputCols; + + m_rowPaddingTop = tensor.impl().rowPaddingTop(); + m_colPaddingLeft = tensor.impl().colPaddingLeft(); + + m_fastInputRowStride = internal::TensorIntDivisor(m_patch_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor(m_patch_col_inflate_strides); + m_fastNumPatches = internal::TensorIntDivisor(m_num_patches); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); + m_fastDimZero = internal::TensorIntDivisor(patch_depth); + } + + TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper) : + m_impl(base_mapper.m_impl) { + m_patch_cols = base_mapper.m_patch_cols; + m_num_patches = base_mapper.m_num_patches; + m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides; + m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides; + + m_colStride = base_mapper.m_colStride; + + m_rowInputStride = base_mapper.m_rowInputStride; + m_colInputStride = base_mapper.m_colInputStride; + m_patchInputStride = base_mapper.m_patchInputStride; + + m_inputRows = base_mapper.m_inputRows; + m_inputCols = base_mapper.m_inputCols; + + m_outputRows = base_mapper.m_outputRows; + m_row_strides = 
base_mapper.m_row_strides; + m_col_strides = base_mapper.m_col_strides; + + m_in_row_strides = base_mapper.m_in_row_strides; + m_in_col_strides = base_mapper.m_in_col_strides; + + m_rowPaddingTop = base_mapper.m_rowPaddingTop; + m_colPaddingLeft = base_mapper.m_colPaddingLeft; + + m_fastInputRowStride = base_mapper.m_fastInputRowStride; + m_fastInputColStride = base_mapper.m_fastInputColStride; + m_fastNumPatches = base_mapper.m_fastNumPatches; + m_fastColStride = base_mapper.m_fastColStride; + m_fastOutputRows = base_mapper.m_fastOutputRows; + m_fastDimZero = base_mapper.m_fastDimZero; + } + + // If true, turns off some optimizations for loading packets since the image + // patches are "non-standard" such as there are non-trivial strides or + // inflations in the input. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool nonStandardPatches() const { + return m_in_row_strides != 1 || m_in_col_strides != 1 || m_patch_row_inflate_strides != 1 || m_patch_col_inflate_strides != 1; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const { + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(0, rowIndex, colIndex, otherIndex); + return loadCoeff(row, rowIndex, colIndex, otherIndex); + } + + // Load the coefficient at the patchIndex location instead of the usual m_rowIndex, + // m_colIndex, m_otherIndex. This is currently only used by the gpu code. EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const { + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex); + return loadCoeff(row, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const { + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(0, rowIndex, colIndex, otherIndex); + return loadPacket(row, rowIndex, colIndex, otherIndex); + } + + // Load the packet at the patchIndex location instead of the usual m_rowIndex, + // m_colIndex, m_otherIndex. This is currently only used by the gpu code. 
+ EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const { + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex); + return loadPacket(row, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE const TensorEvaluator& impl() const { return m_impl; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_rowInputStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRows() const { return m_colStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, const Index baseIndex) const { + const Index inputIndex = depth + baseIndex; + return m_impl.template packet(inputIndex); + } + + private: + friend class TensorContractionSubMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex + colOffset * m_in_col_strides; + const Index origInputCol = (m_patch_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex + rowOffset * m_in_row_strides; + const Index origInputRow = (m_patch_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (origInputCol < 0 | origInputRow < 0 | origInputCol >= m_inputCols | origInputRow >= m_inputRows | + (inputCol != origInputCol * m_patch_col_inflate_strides) | (inputRow != origInputRow * m_patch_row_inflate_strides)) { + return Scalar(0); + } + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + eigen_assert(!nonStandardPatches()); + + // Find the offset of the element wrt the location of the first element. 
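    // patchId enumerates the patch depth-major within each (row, col) position:
    //   patchId = depth + patchDepth() * (rowOffset + patchRows() * colOffset)
    // so dividing by patchDepth() (m_fastDimZero) and then by patchRows()
    // (m_fastColStride) recovers the column and row offsets inside the patch.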
+ const Index patchOffset = patchId / m_fastDimZero; + + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex + colOffset; + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex + rowOffset; + if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 || inputRow >= m_inputRows) { + return Scalar(0); + } + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + const Index packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < patchDepth()*patchRows()*m_patch_cols); + + if (nonStandardPatches()) { + return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); + } + return loadPacketStandard(patchId, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + const Index packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < patchDepth()*patchRows()*m_patch_cols); + + eigen_assert(!nonStandardPatches()); + + if ((patchDepth() % packetSize) == 0) { + return loadPacketFast(patchId, rowIndex, colIndex, otherIndex); + } + else { + const Index patchOffsets[2] = {patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero}; + + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + const Index inputCols[2] = {colIndex + colOffsets[0], colIndex + colOffsets[1]}; + if (inputCols[0] >= m_inputCols | inputCols[1] < 0) { + // all zeros + return internal::pset1(Scalar(0)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + const Index inputRows[2] = {rowIndex + rowOffsets[0], rowIndex + rowOffsets[1]}; + + if (inputRows[0] >= m_inputRows | inputRows[1] < 0) { + // all zeros + return internal::pset1(Scalar(0)); + } + + if (inputRows[0] >= 0 & inputRows[1] < m_inputRows) { + // no padding + const Index depth = patchId - patchOffsets[0] * patchDepth(); + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; + return m_impl.template packet(inputIndex); + } + } + } + return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { + const Index packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < patchDepth()*patchRows()*m_patch_cols); + + eigen_assert(!nonStandardPatches()); + eigen_assert((patchDepth() % packetSize) == 0); + // Find the offset of the element wrt the location of the first element. 
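    // Because patchDepth() is a multiple of the packet size (asserted above)
    // and the whole packet stays within a single (row, col) position of the
    // patch (asserted just below), one bounds check covers every lane of the
    // packet.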
+ const Index patchOffset = patchId / m_fastDimZero; + eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset); + + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex + colOffset; + const Index rowOffset = patchOffset - colOffset*m_colStride; + const Index inputRow = rowIndex + rowOffset; + if (inputCol < 0 | inputRow < 0 | inputCol >= m_inputCols | inputRow >= m_inputRows) { + // all zeros + return internal::pset1(Scalar(0)); + } + // no padding + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + return m_impl.template packet(inputIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetWithPossibleZero(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = loadCoeff(patchId+i, rowIndex, colIndex, otherIndex); + } + Packet rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void computeBaseIndices(Index patchIndex, Index& rowIndex, Index& colIndex, Index& otherIndex) const { + const int NumInputDims = array_size::Dimensions>::value; + otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches; + const Index patch2DIndex = (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches); + otherIndex *= m_patchInputStride; + colIndex = patch2DIndex / m_fastOutputRows; + rowIndex = patch2DIndex - colIndex * m_outputRows; + colIndex = colIndex * m_col_strides - m_colPaddingLeft; + rowIndex = rowIndex * m_row_strides - m_rowPaddingTop; + } + + Index m_patch_cols; // number of colums in the patch + Index m_num_patches; // number of patches to extract. + Index m_patch_row_inflate_strides; // the strides for row inflation in the image patch + Index m_patch_col_inflate_strides; // the strides for col inflation in the image patch + // Fast representation of inflation strides. 
+ internal::TensorIntDivisor m_fastInputRowStride; + internal::TensorIntDivisor m_fastInputColStride; + + Index m_otherStride; + Index m_colStride; + internal::TensorIntDivisor m_fastNumPatches; + internal::TensorIntDivisor m_fastColStride; + + Index m_rowInputStride; // row stride in the input tensor + Index m_colInputStride; // col stride in the input tensor + Index m_patchInputStride; // patch stride in the input tensor + + Index m_inputRows; // Number of rows in the input tensor + Index m_inputCols; // Number of cols in the input tensor + + Index m_outputRows; // Number of patch rows + + Index m_row_strides; // User specified row stride + Index m_col_strides; // User specified col stride + + Index m_in_row_strides; // User specified input row stride + Index m_in_col_strides; // User specified input col stride + + Index m_rowPaddingTop; // Row padding + Index m_colPaddingLeft; // Column padding + + internal::TensorIntDivisor m_fastOutputRows; + internal::TensorIntDivisor m_fastDimZero; + + const TensorEvaluator m_impl; +}; + + +template +class TensorContractionSubMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> +{ + public: + typedef Scalar_ Scalar; + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef TensorContractionInputMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper; + typedef TensorContractionSubMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_depth_offset(vert_offset), m_col_offset(horiz_offset) { + m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const Self& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper.m_base_mapper), m_depth_offset(vert_offset+base_mapper.m_depth_offset), m_col_offset(horiz_offset+base_mapper.m_col_offset) { + m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper.loadCoeff(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_depth_offset, j + m_col_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.loadPacket(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return m_base_mapper.template loadPacket(i + m_depth_offset, j + m_col_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar loadCoeffStandard(Index i) const { + return m_base_mapper.loadCoeffStandard(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index i) const { + return m_base_mapper.loadPacketFast(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index i) const { + return m_base_mapper.loadPacketStandard(i + m_depth_offset, 
m_rowIndex, m_colIndex, m_otherIndex); + } + template + EIGEN_DEVICE_FUNC bool aligned(Index) const { + return false; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool nonStandardPatches() const { + return m_base_mapper.nonStandardPatches(); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_base_mapper.m_rowInputStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRows() const { return m_base_mapper.m_colStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchCols() const { return m_base_mapper.m_patch_cols; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, const Index baseIndex) const { + const Index inputIndex = depth + baseIndex; + return m_base_mapper.m_impl.template packet(inputIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padRow(const Index row) const { + const Index r = m_rowIndex + row; + return r < 0 | r >= m_base_mapper.m_inputRows; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padCol(const Index col) const { + const Index c = m_colIndex + col; + return c < 0 | c >= m_base_mapper.m_inputCols; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index baseIndex(const Index row, const Index col) const { + const Index r = m_rowIndex + row; + const Index c = m_colIndex + col; + return r * m_base_mapper.m_rowInputStride + c * m_base_mapper.m_colInputStride + m_otherIndex; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index rowOffset() const { + const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero; + const Index colOffset = patchOffset / m_base_mapper.m_fastColStride; + return patchOffset-colOffset*m_base_mapper.m_colStride; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index colOffset() const { + const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero; + const Index colOffset = patchOffset / m_base_mapper.m_fastColStride; + return colOffset; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index depthOffset() const { + const Index patchOffset = m_depth_offset % m_base_mapper.patchDepth(); + return patchOffset; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_depth_offset, j + m_col_offset); + } + + private: + const ParentMapper& m_base_mapper; // that was a reference before + Index m_depth_offset; // First row in the input matrix + Index m_col_offset; // First col in the input matrix + + Index m_rowIndex; // precomputed row index corresponding to the col offset + Index m_colIndex; // precomputed col index corresponding to the col offset + Index m_otherIndex; // precomputed other index corresponding to the col offset + +}; + + +template +struct gemm_pack_rhs >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>, nr, ColMajor, false, false> { + + typedef TensorContractionSubMapper >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper; + typedef SubMapper DataMapper; + + static inline Index ceil_div(Index a, Index b) { + return (a + b - 1) / b; + } + + EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0) const { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + typedef typename DataMapper::LinearMapper LinearMapper; + typedef typename packet_traits::type Packet; + + const Index packet_cols4 = 
(cols/4) * 4; + const Index peeled_k = (depth/packet_size) * packet_size; + const bool non_standard_patches = rhs.nonStandardPatches(); + + for(Index j2=0; j2(ceil_div(peeled_k, patch_rows*patch_depth)+startCol, patch_cols); + + for (Index c = startCol; c < max_cols; ++c) { + eigen_assert(k < peeled_k); + const Index startRow = (c == startCol) ? rhs.rowOffset() : 0; + const Index max_rows = std::min(ceil_div(peeled_k-c*patch_rows*patch_depth, patch_depth)+startRow, patch_rows); + + const bool pad_col0 = dm0.padCol(c); + const bool pad_col1 = dm1.padCol(c); + const bool pad_col2 = dm2.padCol(c); + const bool pad_col3 = dm3.padCol(c); + for (Index r = startRow; r < max_rows; ++r) { + eigen_assert(k < peeled_k); + const bool pad0 = pad_col0 || dm0.padRow(r); + const bool pad1 = pad_col1 || dm1.padRow(r); + const bool pad2 = pad_col2 || dm2.padRow(r); + const bool pad3 = pad_col3 || dm3.padRow(r); + + const Index idx0 = dm0.baseIndex(r, c); + const Index idx1 = dm1.baseIndex(r, c); + const Index idx2 = dm2.baseIndex(r, c); + const Index idx3 = dm3.baseIndex(r, c); + + const Index startDepth = ((c == startCol) && (r == startRow)) ? rhs.depthOffset() : 0; + const Index max_depth = std::min(peeled_k-c*patch_rows*patch_depth-r*patch_depth+startDepth, patch_depth); + eigen_assert(max_depth % packet_size == 0); + for (Index d = startDepth; d < max_depth; d += packet_size) { + eigen_assert(k < peeled_k); + PacketBlock kernel; + kernel.packet[0] = pad0 ? pset1(0) : rhs.packetNoPadding(d, idx0); + kernel.packet[1] = pad1 ? pset1(0) : rhs.packetNoPadding(d, idx1); + kernel.packet[2] = pad2 ? pset1(0) : rhs.packetNoPadding(d, idx2); + kernel.packet[3] = pad3 ? pset1(0) : rhs.packetNoPadding(d, idx3); + ptranspose(kernel); + pstoreu(block+0*packet_size, kernel.packet[0]); + pstoreu(block+1*packet_size, kernel.packet[1]); + pstoreu(block+2*packet_size, kernel.packet[2]); + pstoreu(block+3*packet_size, kernel.packet[3]); + block+=4*packet_size; + k += packet_size; + } + } + } + + for(; k kernel; + kernel.packet[0] = dm0.loadPacketFast(k); + kernel.packet[1] = dm1.loadPacketFast(k); + kernel.packet[2] = dm2.loadPacketFast(k); + kernel.packet[3] = dm3.loadPacketFast(k); + ptranspose(kernel); + pstoreu(block+0*packet_size, kernel.packet[0]); + pstoreu(block+1*packet_size, kernel.packet[1]); + pstoreu(block+2*packet_size, kernel.packet[2]); + pstoreu(block+3*packet_size, kernel.packet[3]); + block+=4*packet_size; + } + } + else { + for(; k kernel; + kernel.packet[0] = dm0.loadPacketStandard(k); + kernel.packet[1] = dm1.loadPacketStandard(k); + kernel.packet[2] = dm2.loadPacketStandard(k); + kernel.packet[3] = dm3.loadPacketStandard(k); + ptranspose(kernel); + pstoreu(block+0*packet_size, kernel.packet[0]); + pstoreu(block+1*packet_size, kernel.packet[1]); + pstoreu(block+2*packet_size, kernel.packet[2]); + pstoreu(block+3*packet_size, kernel.packet[3]); + block+=4*packet_size; + } + } + } + if (!rhs.nonStandardPatches()) { + for(; k 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels. + * + * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be filters, height, width (and others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output. 
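 *
 * A minimal usage sketch (illustrative only; assumes the unsupported CXX11
 * Tensor and NeuralNetworks headers are included, and the dimension values
 * below are arbitrary):
 * \code
 * Eigen::Tensor<float, 4> input(3, 32, 32, 8);   // channels, rows, cols, batch (col-major)
 * Eigen::Tensor<float, 4> kernel(16, 3, 5, 5);   // filters, channels, kernel_rows, kernel_cols
 * input.setRandom();
 * kernel.setRandom();
 * Eigen::Tensor<float, 4> output =
 *     Eigen::SpatialConvolution(input, kernel, 1, Eigen::PADDING_VALID);
 * // output has dimensions (16, 28, 28, 8): 32 - 5 + 1 = 28 per spatial dim.
 * \endcode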
+ * + */ +template +EIGEN_ALWAYS_INLINE +static const typename internal::conditional< + internal::traits::Layout == ColMajor, + TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorContractionOp::Index>, 1>, const TensorReshapingOp::Index, 2>, const Kernel>, const TensorReshapingOp::Index, 2>, const TensorImagePatchOp > > >, + TensorReshapingOp::Index, internal::traits::NumDimensions>, const TensorContractionOp::Index>, 1>, const TensorReshapingOp::Index, 2>, const TensorImagePatchOp >, const TensorReshapingOp::Index, 2>, const Kernel> > > >::type +SpatialConvolution(const Input& input, const Kernel& kernel, const DenseIndex stride = 1, const PaddingType padding_type = PADDING_SAME, const DenseIndex in_stride = 1) { + + typedef typename internal::traits::Index TensorIndex; + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > in(input); + TensorRef::Scalar, internal::traits::NumDimensions, internal::traits::Layout, TensorIndex> > kern(kernel); + + EIGEN_STATIC_ASSERT(internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); + static const bool isColMajor = (internal::traits::Layout == ColMajor); + + static const int NumDims = internal::traits::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the result + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[2] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0]; + + const DenseIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1); + const DenseIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1); + + array, 1> contract_dims; + contract_dims[0] = IndexPair(1, 0); + + const TensorIndex InputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex InputCols = isColMajor ? 
in.dimension(2) : in.dimension(NumDims - 3); + + TensorIndex out_height; + TensorIndex out_width; + switch (padding_type) { + case PADDING_VALID: + out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) / static_cast(stride)); + out_width = numext::ceil((InputCols - kernelColsEff + 1.f) / static_cast(stride)); + break; + case PADDING_SAME: + out_height = numext::ceil(InputRows / static_cast(stride)); + out_width = numext::ceil(InputCols / static_cast(stride)); + break; + default: + eigen_assert(false && "unexpected padding"); + } + + // Molds the output of the patch extraction code into a 2d tensor: + // - the first dimension (dims[0]): the patch values to be multiplied with the kernels + // - the second dimension (dims[1]): everything else + DSizes pre_contract_dims; + if (isColMajor) { + pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols; + pre_contract_dims[1] = out_height * out_width; + for (int i = 3; i < NumDims; ++i) { + pre_contract_dims[1] *= in.dimension(i); + } + } else { + pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols; + pre_contract_dims[0] = out_height * out_width; + for (int i = 0; i < NumDims - 3; ++i) { + pre_contract_dims[0] *= in.dimension(i); + } + } + + // Molds the output of the contraction into the shape expected by the used + // (assuming this is ColMajor): + // - 1st dim: kernel filters + // - 2nd dim: output height + // - 3rd dim: output width + // - 4th dim and beyond: everything else including batch size + DSizes post_contract_dims; + if (isColMajor) { + post_contract_dims[0] = kernelFilters; + post_contract_dims[1] = out_height; + post_contract_dims[2] = out_width; + for (int i = 3; i < NumDims; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } else { + post_contract_dims[NumDims - 1] = kernelFilters; + post_contract_dims[NumDims - 2] = out_height; + post_contract_dims[NumDims - 3] = out_width; + for (int i = 0; i < NumDims - 3; ++i) { + post_contract_dims[i] = in.dimension(i); + } + } + + DSizes kernel_dims; + if (isColMajor) { + kernel_dims[0] = kernelFilters; + kernel_dims[1] = kernelChannels * kernelRows * kernelCols; + } else { + kernel_dims[0] = kernelChannels * kernelRows * kernelCols; + kernel_dims[1] = kernelFilters; + } + // TODO(yangke): choose() is defined in TensorContraction.h -- consider + // moving it to somewhere more "common". + return choose(Cond::Layout == ColMajor>(), + kernel.reshape(kernel_dims).contract(input.extract_image_patches(kernelRows, kernelCols, stride, stride, in_stride, in_stride, padding_type).reshape(pre_contract_dims), contract_dims).reshape(post_contract_dims), + input.extract_image_patches(kernelRows, kernelCols, stride, stride, in_stride, in_stride, padding_type).reshape(pre_contract_dims).contract(kernel.reshape(kernel_dims), contract_dims).reshape(post_contract_dims)); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h new file mode 100644 index 00000000..0e721735 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h @@ -0,0 +1,289 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2015 Jianwei Cui +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
+// If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H
+
+namespace Eigen {
+
+/** \class TensorConvolutionByFFT
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor convolution class.
+  *
+  *
+  */
+namespace internal {
+
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct traits<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> >
+{
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef typename promote_storage_type<typename InputXprType::Scalar,
+                                        typename KernelXprType::Scalar>::ret Scalar;
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
+                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<InputXprType>::Index,
+                                      typename traits<KernelXprType>::Index>::type Index;
+  typedef typename InputXprType::Nested LhsNested;
+  typedef typename KernelXprType::Nested RhsNested;
+  typedef typename remove_reference<LhsNested>::type _LhsNested;
+  typedef typename remove_reference<RhsNested>::type _RhsNested;
+  static const int NumDimensions = traits<InputXprType>::NumDimensions;
+  static const int Layout = traits<InputXprType>::Layout;
+
+  enum {
+    Flags = 0,
+  };
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct eval<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
+{
+  typedef const TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>& type;
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct nested<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> >::type>
+{
+  typedef TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename Indices, typename InputXprType, typename KernelXprType>
+class TensorConvolutionByFFTOp : public TensorBase<TensorConvolutionByFFTOp<Indices, InputXprType, KernelXprType> >
+{
+  public:
+    typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Scalar Scalar;
+    typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Packet Packet;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
+                                                    typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
+    typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType,
+                                                    typename KernelXprType::PacketReturnType>::ret PacketReturnType;
+    typedef typename Eigen::internal::nested<TensorConvolutionByFFTOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionByFFTOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
+        : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Indices& indices() const { return m_indices; }
+
+    /** \returns the nested expressions */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<typename InputXprType::Nested>::type&
+    inputExpression() const { return m_input_xpr; }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<typename KernelXprType::Nested>::type&
+    kernelExpression() const { return m_kernel_xpr; }
+
+  protected:
+    typename InputXprType::Nested m_input_xpr;
+    typename KernelXprType::Nested m_kernel_xpr;
+    const Indices m_indices;
+};
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
+struct TensorEvaluator<const TensorConvolutionByFFTOp<Indices, InputArgType, KernelArgType>, Device>
+{
+  typedef TensorConvolutionByFFTOp<Indices, InputArgType, KernelArgType> XprType;
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+
+  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
+  static const int NumKernelDims = internal::array_size<Indices>::value;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  enum {
+    IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned &
+                TensorEvaluator<KernelArgType, Device>::IsAligned,
+    PacketAccess = false,
+    BlockAccess = false,
+    Layout = TensorEvaluator<InputArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
+    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStride[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
+      }
+    } else {
+      m_inputStride[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
+      }
+    }
+
+    m_dimensions = m_inputImpl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < NumKernelDims; ++i) {
+        const Index index = op.indices()[i];
+        const Index input_dim = input_dims[index];
+        const Index kernel_dim = kernel_dims[i];
+        const Index result_dim = input_dim - kernel_dim + 1;
+        m_dimensions[index] = result_dim;
+        if (i > 0) {
+          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
+        } else {
+          m_kernelStride[0] = 1;
+        }
+        m_indexStride[i] = m_inputStride[index];
+      }
+
+      m_outputStride[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
+      }
+    } else {
+      for (int i = NumKernelDims - 1; i >= 0; --i) {
+        const Index index = op.indices()[i];
+        const Index input_dim = input_dims[index];
+        const Index kernel_dim = kernel_dims[i];
+        const Index result_dim = input_dim - kernel_dim + 1;
+        m_dimensions[index] = result_dim;
+        if (i < NumKernelDims - 1) {
+          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
+        } else {
+          m_kernelStride[NumKernelDims - 1] = 1;
+        }
+        m_indexStride[i] = m_inputStride[index];
+      }
+
+      m_outputStride[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    m_inputImpl.evalSubExprsIfNeeded(NULL);
+    m_kernelImpl.evalSubExprsIfNeeded(NULL);
+
+    typedef typename internal::traits<InputArgType>::Index TensorIndex;
+
+    Tensor<Scalar, NumDims, Layout, TensorIndex> input(m_inputImpl.dimensions());
+    for (int i = 0; i < m_inputImpl.dimensions().TotalSize(); ++i) {
+      input.data()[i] = m_inputImpl.coeff(i);
+    }
+
+    Tensor<Scalar, NumDims, Layout, TensorIndex> kernel(m_kernelImpl.dimensions());
+    for (int i = 0; i < m_kernelImpl.dimensions().TotalSize(); ++i) {
+      kernel.data()[i] = m_kernelImpl.coeff(i);
+    }
+
+    array<std::pair<TensorIndex, TensorIndex>, NumDims> paddings;
+    for (int i = 0; i < NumDims; ++i) {
+      paddings[i] = std::make_pair(0, m_inputImpl.dimensions()[i] - m_kernelImpl.dimensions()[i]);
+    }
+
+    Eigen::array<bool, NumKernelDims> reverse;
+    for (int i = 0; i < NumKernelDims; ++i) {
+      reverse[i] = true;
+    }
+
+    Eigen::array<int, NumDims> fft;
+    for (int i = 0; i < NumDims; ++i) {
+      fft[i] = i;
+    }
+
+    Eigen::DSizes<TensorIndex, NumDims> slice_offsets;
+    for (int i = 0; i < NumDims; ++i) {
+      slice_offsets[i] = m_kernelImpl.dimensions()[i] - 1;
+    }
+
+    Eigen::DSizes<TensorIndex, NumDims> slice_extents;
+    for (int i = 0; i < NumDims; ++i) {
+      slice_extents[i] = m_inputImpl.dimensions()[i] - m_kernelImpl.dimensions()[i] + 1;
+    }
+
+    Tensor<Scalar, NumDims, Layout, TensorIndex> kernel_variant = kernel.reverse(reverse).pad(paddings);
+    Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> kernel_fft = kernel_variant.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+    //Tensor<std::complex<Scalar>, NumDims, Layout|IndexType> kernel_fft = kernel.reverse(reverse).pad(paddings).template fft<2>(fft);
+    Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> input_fft = input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+    Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> prod = (input_fft * kernel_fft).template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+    Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> tensor_result = prod.slice(slice_offsets, slice_extents);
+
+    for (int i = 0; i < tensor_result.size(); ++i) {
+      data[i] = std::real(tensor_result.data()[i]);
+    }
+    return false;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_inputImpl.cleanup();
+    if (m_local_kernel) {
+      m_device.deallocate((void*)m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
+  }
+
+  void evalTo(typename XprType::Scalar* buffer) {
+    evalSubExprsIfNeeded(NULL);
+    for (int i = 0; i < dimensions().TotalSize(); ++i) {
+      buffer[i] += coeff(i);
+    }
+    cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    CoeffReturnType result = CoeffReturnType(0);
+    return result;
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ private:
+  array<Index, NumDims> m_inputStride;
+  array<Index, NumDims> m_outputStride;
+
+  array<Index, NumKernelDims> m_indexStride;
+  array<Index, NumKernelDims> m_kernelStride;
+  TensorEvaluator<InputArgType, Device> m_inputImpl;
+  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
+  Dimensions m_dimensions;
+
+  KernelArgType m_kernelArg;
+  const Scalar* m_kernel;
+  bool m_local_kernel;
+  const Device& m_device;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H
diff --git a/third_party/eigen3/unsupported/Eigen/SpecialFunctions b/third_party/eigen3/unsupported/Eigen/SpecialFunctions
new file mode 100644
index 00000000..ad13359a
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/SpecialFunctions
@@ -0,0 +1 @@
+#include "unsupported/Eigen/SpecialFunctions"
diff --git a/third_party/gemmlowp/LICENSE b/third_party/gemmlowp/LICENSE
new file mode 100644
index 00000000..d6456956
--- /dev/null
+++ b/third_party/gemmlowp/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--
GitLab
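
For completeness, here is a minimal usage sketch of the SpatialConvolution helper vendored above (illustrative only, not part of the patch). It assumes the vendored headers resolve on the include path as laid out under third_party/eigen3, and it includes the src/ header directly for brevity; the tensor sizes are arbitrary. Dimension ordering follows the ColMajor convention visible in SpatialConvolutions.h: input is (channels, rows, cols, batch) and kernel is (filters, channels, kernel_rows, kernel_cols).

    // Illustrative sketch only -- not part of the patch. Include paths are
    // assumed to match the third_party/eigen3 layout introduced above.
    #include <iostream>

    #include "unsupported/Eigen/CXX11/Tensor"
    #include "third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h"

    int main() {
      // ColMajor ordering, matching the isColMajor branch in SpatialConvolution:
      // input  = (channels, rows, cols, batch), kernel = (filters, channels, kRows, kCols).
      Eigen::Tensor<float, 4> input(3, 8, 8, 1);
      Eigen::Tensor<float, 4> kernel(16, 3, 3, 3);
      input.setRandom();
      kernel.setRandom();

      // stride = 1 with PADDING_SAME keeps the 8x8 spatial extent.
      Eigen::Tensor<float, 4> output =
          Eigen::SpatialConvolution(input, kernel, /*stride=*/1, Eigen::PADDING_SAME);

      // Expected dimensions: 16 x 8 x 8 x 1 (filters, out_height, out_width, batch).
      std::cout << output.dimension(0) << "x" << output.dimension(1) << "x"
                << output.dimension(2) << "x" << output.dimension(3) << std::endl;
      return 0;
    }

Switching PADDING_SAME to PADDING_VALID shrinks the output of this 3x3 kernel to 6x6, per the out_height/out_width arithmetic in the function.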