Commit fe0cdf27 authored by 李寅

Merge branch 'kernels_benchmark' into 'master'

Add memory access and matmul benchmark against eigen

See merge request !539
@@ -9,7 +9,7 @@ http_archive(
strip_prefix = "protobuf-3.4.0",
urls = [
"https://cnbj1.fds.api.xiaomi.com/mace/third-party/protobuf/protobuf-3.4.0.zip",
"https://github.com/google/protobuf/archive/v3.4.0.zip"
"https://github.com/google/protobuf/archive/v3.4.0.zip",
],
)
@@ -20,7 +20,7 @@ new_http_archive(
strip_prefix = "googletest-release-1.8.0",
urls = [
"https://cnbj1.fds.api.xiaomi.com/mace/third-party/googletest/googletest-release-1.8.0.zip",
"https://github.com/google/googletest/archive/release-1.8.0.zip"
"https://github.com/google/googletest/archive/release-1.8.0.zip",
],
)
@@ -31,7 +31,7 @@ new_http_archive(
strip_prefix = "OpenCL-Headers-master",
urls = [
"https://cnbj1.fds.api.xiaomi.com/mace/third-party/OpenCL-Headers/OpenCL-Headers-master.zip",
"https://github.com/KhronosGroup/OpenCL-Headers/archive/master.zip"
"https://github.com/KhronosGroup/OpenCL-Headers/archive/master.zip",
],
)
@@ -42,7 +42,7 @@ new_http_archive(
strip_prefix = "OpenCL-CLHPP-4c6f7d56271727e37fb19a9b47649dd175df2b12",
urls = [
"https://cnbj1.fds.api.xiaomi.com/mace/third-party/OpenCL-CLHPP/OpenCL-CLHPP-4c6f7d56271727e37fb19a9b47649dd175df2b12.zip",
"https://github.com/KhronosGroup/OpenCL-CLHPP/archive/4c6f7d56271727e37fb19a9b47649dd175df2b12.zip"
"https://github.com/KhronosGroup/OpenCL-CLHPP/archive/4c6f7d56271727e37fb19a9b47649dd175df2b12.zip",
],
)
@@ -53,7 +53,29 @@ new_http_archive(
strip_prefix = "half-code-356-trunk",
urls = [
"https://cnbj1.fds.api.xiaomi.com/mace/third-party/half/half-code-356-trunk.zip",
"https://sourceforge.net/code-snapshots/svn/h/ha/half/code/half-code-356-trunk.zip"
"https://sourceforge.net/code-snapshots/svn/h/ha/half/code/half-code-356-trunk.zip",
],
)
new_http_archive(
name = "eigen",
build_file = "third_party/eigen3/eigen.BUILD",
sha256 = "ca7beac153d4059c02c8fc59816c82d54ea47fe58365e8aded4082ded0b820c4",
strip_prefix = "eigen-eigen-f3a22f35b044",
urls = [
"http://cnbj1.fds.api.xiaomi.com/mace/third-party/eigen/f3a22f35b044.tar.gz",
"http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz",
"https://bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz",
],
)
http_archive(
name = "gemmlowp",
sha256 = "b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658",
strip_prefix = "gemmlowp-38ebac7b059e84692f53e5938f97a9943c120d98",
urls = [
"http://cnbj1.fds.api.xiaomi.com/mace/third-party/gemmlowp/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
"https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
],
)
@@ -81,7 +103,7 @@ http_archive(
strip_prefix = "gflags-30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e",
urls = [
"https://cnbj1.fds.api.xiaomi.com/mace/third-party/gflags/gflags-30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e.zip",
"https://github.com/gflags/gflags/archive/30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e.zip"
"https://github.com/gflags/gflags/archive/30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e.zip",
],
)
@@ -18,14 +18,17 @@ cc_library(
],
exclude = [
"*_test.cc",
"*_benchmark.cc",
"arm/*_test.cc",
],
) + if_android(glob([
) + if_android(glob(
[
"opencl/*.cc",
],
exclude = [
"opencl/*_test.cc",
])),
],
)),
hdrs = glob(
[
"*.h",
@@ -35,16 +38,26 @@ cc_library(
"buffer_to_image.h",
],
) + if_android(glob([
"opencl/*.h",
"buffer_to_image.h",
])),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] +
if_openmp_enabled(["-fopenmp"]) +
if_neon_enabled(["-DMACE_ENABLE_NEON"]) +
if_android_armv7(["-mfpu=neon"]) +
if_android_armv7(["-mfloat-abi=softfp"]) +
if_android(["-DMACE_ENABLE_OPENCL"]) +
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
"opencl/*.h",
"buffer_to_image.h",
])),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = if_android(["-lm"]),
deps = [
"//mace/core",
@@ -62,13 +75,22 @@ cc_test(
"opencl/*_test.cc",
],
),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"] +
if_openmp_enabled(["-fopenmp"]) +
if_neon_enabled(["-DMACE_ENABLE_NEON"]) +
if_android_armv7(["-mfpu=neon"]) +
if_android_armv7(["-mfloat-abi=softfp"]) +
if_android(["-DMACE_ENABLE_OPENCL"]) +
if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfloat-abi=softfp",
]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
@@ -77,3 +99,32 @@ cc_test(
"@gtest//:gtest_main",
],
)
cc_test(
name = "kernels_benchmark",
testonly = 1,
srcs = glob(["*_benchmark.cc"]),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfloat-abi=softfp",
]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
":kernels",
"//mace/core:test_benchmark_main",
"//third_party/eigen3",
],
)
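The kernels_benchmark target above globs every *_benchmark.cc in this package and links it against //mace/core:test_benchmark_main and //third_party/eigen3. As a rough sketch (not part of this merge request; ExampleBenchmark and MACE_BM_EXAMPLE are made-up names for illustration), any new benchmark source picked up by that glob follows the same pattern as the two files reproduced below: keep setup outside the timed region, loop over the iteration count, and register the function with MACE_BENCHMARK.

#include "mace/core/testing/test_benchmark.h"

namespace mace {
namespace kernels {
namespace test {
namespace {

// Hypothetical benchmark body: setup outside the timer, measured work inside.
void ExampleBenchmark(int iters) {
  mace::testing::StopTiming();   // exclude allocation/initialization
  // ... prepare inputs here ...
  mace::testing::StartTiming();
  while (iters--) {
    // ... code under measurement ...
  }
}

}  // namespace

static void MACE_BM_EXAMPLE(int iters) { ExampleBenchmark(iters); }
MACE_BENCHMARK(MACE_BM_EXAMPLE);

}  // namespace test
}  // namespace kernels
}  // namespace mace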
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <Eigen/Dense>
#include <algorithm>
#include <string>
#include <vector>
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/gemm.h"
#include "public/gemmlowp.h"
namespace mace {
namespace kernels {
namespace test {
// Compare matmul performance of the MACE Gemm kernel against Eigen
namespace {
// Matmul with (m, k) x (k, n)
void MatmulBenchmark_Mace(int iters, int m, int k, int n) {
mace::testing::StopTiming();
std::vector<float> lhs(m * k);
std::vector<float> rhs(k * n);
std::vector<float> result(m * n);
// warm up
Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data());
mace::testing::StartTiming();
while (iters--) {
Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data());
}
}
void MatmulBenchmark_Eigen(int iters, int m, int k, int n) {
mace::testing::StopTiming();
Eigen::MatrixXd lhs = Eigen::MatrixXd::Random(m, k);
Eigen::MatrixXd rhs = Eigen::MatrixXd::Random(k, n);
Eigen::MatrixXd result = Eigen::MatrixXd::Zero(m, n);
// warm up
result = lhs * rhs;
mace::testing::StartTiming();
while (iters--) {
result = lhs * rhs;
}
}
} // namespace
#define MACE_BM_MATMUL_FUNC(M, K, N, FUNC) \
static void MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC(int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * M * K * N; \
const int64_t tot = static_cast<int64_t>(iters) * (M + N) * K; \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot * sizeof(float)); \
MatmulBenchmark_##FUNC(iters, M, K, N); \
} \
MACE_BENCHMARK(MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC)
#define MACE_BM_MATMUL(M, K, N) \
MACE_BM_MATMUL_FUNC(M, K, N, Mace); \
MACE_BM_MATMUL_FUNC(M, K, N, Eigen);
// Embedding size 384
MACE_BM_MATMUL(7, 384, 384);
MACE_BM_MATMUL(7, 384, 1536);
MACE_BM_MATMUL(7, 1536, 384);
MACE_BM_MATMUL(15, 384, 384);
MACE_BM_MATMUL(15, 384, 1536);
MACE_BM_MATMUL(15, 1536, 384);
MACE_BM_MATMUL(1, 384, 384);
MACE_BM_MATMUL(1, 384, 1536);
MACE_BM_MATMUL(1, 1536, 384);
MACE_BM_MATMUL(1, 384, 44678);
// Embedding size 128
MACE_BM_MATMUL(1, 128, 1536);
MACE_BM_MATMUL(1, 128, 44678);
} // namespace test
} // namespace kernels
} // namespace mace
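For a concrete view of what the registration macros above generate, MACE_BM_MATMUL_FUNC(7, 384, 384, Mace); expands (modulo whitespace) to roughly the following, which reports the multiply-accumulate and byte counts before dispatching to the benchmark body; MACE_BM_MATMUL(7, 384, 384); emits this pair once for Mace and once for Eigen.

static void MACE_BM_MATMUL_7_384_384_Mace(int iters) {
  const int64_t macc = static_cast<int64_t>(iters) * 7 * 384 * 384;
  const int64_t tot = static_cast<int64_t>(iters) * (7 + 384) * 384;
  mace::testing::MaccProcessed(macc);
  mace::testing::BytesProcessed(tot * sizeof(float));
  MatmulBenchmark_Mace(iters, 7, 384, 384);
}
MACE_BENCHMARK(MACE_BM_MATMUL_7_384_384_Mace);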
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <string>
#include <vector>
#include "mace/core/testing/test_benchmark.h"
namespace mace {
namespace kernels {
namespace test {
// Test the speed of different access order of a NHWC buffer
namespace {
void MemoryAccessBenchmark_NHWC(
int iters, int batch, int height, int width, int channels) {
mace::testing::StopTiming();
std::vector<float> buffer(batch * height * width * channels);
std::fill_n(buffer.begin(), buffer.size(), 0.1);
mace::testing::StartTiming();
while (iters--) {
for (int n = 0; n < batch; ++n) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
buffer[n * height * width * channels + h * width * channels +
w * channels + c] = 1.0f;
}
}
}
}
}
}
void MemoryAccessBenchmark_NWCH(
int iters, int batch, int height, int width, int channels) {
mace::testing::StopTiming();
std::vector<float> buffer(batch * height * width * channels);
std::fill_n(buffer.begin(), buffer.size(), 0.1);
mace::testing::StartTiming();
while (iters--) {
for (int n = 0; n < batch; ++n) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
for (int h = 0; h < height; ++h) {
buffer[n * height * width * channels + h * width * channels +
w * channels + c] = 1.0f;
}
}
}
}
}
}
void MemoryAccessBenchmark_NHCW(
int iters, int batch, int height, int width, int channels) {
mace::testing::StopTiming();
std::vector<float> buffer(batch * height * width * channels);
std::fill_n(buffer.begin(), buffer.size(), 0.1);
mace::testing::StartTiming();
while (iters--) {
for (int n = 0; n < batch; ++n) {
for (int h = 0; h < height; ++h) {
for (int c = 0; c < channels; ++c) {
for (int w = 0; w < width; ++w) {
buffer[n * height * width * channels + h * width * channels +
w * channels + c] = 1.0f;
}
}
}
}
}
}
} // namespace
#define MACE_BM_MEMORY_ACCESS(N, H, W, C, ORDER) \
static void MACE_BM_MEMORY_ACCESS_##N##_##H##_##W##_##C##_##ORDER( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot * sizeof(float)); \
MemoryAccessBenchmark_##ORDER(iters, N, H, W, C); \
} \
MACE_BENCHMARK(MACE_BM_MEMORY_ACCESS_##N##_##H##_##W##_##C##_##ORDER)
MACE_BM_MEMORY_ACCESS(10, 64, 64, 1024, NHWC);
MACE_BM_MEMORY_ACCESS(10, 64, 64, 1024, NHCW);
MACE_BM_MEMORY_ACCESS(10, 64, 64, 1024, NWCH);
MACE_BM_MEMORY_ACCESS(10, 64, 1024, 64, NHCW);
MACE_BM_MEMORY_ACCESS(10, 64, 1024, 64, NWCH);
} // namespace test
} // namespace kernels
} // namespace mace
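All three loop orders in the file above write through the same flat NHWC offset; only the traversal order, and therefore the stride of the innermost loop, changes. A small helper equivalent to the index expression used in the loops makes the strides explicit (illustrative only, not part of the benchmark source):

#include <cstddef>

// Flat offset into an NHWC buffer: incrementing c moves by 1 element,
// w by channels, h by width * channels, and n by height * width * channels.
// NHWC iteration therefore touches memory contiguously, while NWCH's
// innermost loop over h strides by width * channels elements per step.
inline std::size_t OffsetNHWC(int n, int h, int w, int c,
                              int height, int width, int channels) {
  return ((static_cast<std::size_t>(n) * height + h) * width + w) * channels
         + c;
}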
# Description:
# Eigen is a C++ template library for linear algebra: vectors,
# matrices, and related algorithms.
# This file is mostly stolen from tensorflow.
licenses([
# Note: Eigen is an MPL2 library that includes GPL v3 and LGPL v2.1+ code.
# We've taken special care to not reference any restricted code.
"reciprocal", # MPL2
"notice", # Portions BSD
])
exports_files(["LICENSE"])
cc_library(
name = "eigen3",
hdrs = glob(["unsupported/Eigen/CXX11/src/FixedPoint/*.h"]) + [
"Eigen/Core",
"Eigen/LU",
"Eigen/Cholesky",
"Eigen/Eigenvalues",
"Eigen/QR",
"Eigen/SVD",
"unsupported/Eigen/SpecialFunctions",
"unsupported/Eigen/CXX11/ThreadPool",
"unsupported/Eigen/CXX11/Tensor",
"unsupported/Eigen/CXX11/FixedPoint",
],
visibility = ["//visibility:public"],
deps = [
"@eigen//:eigen",
],
)
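A minimal consumer of the //third_party/eigen3 target defined above might look like the sketch below (illustrative, not part of the merge request). Defining EIGEN_MPL2_ONLY before including Eigen, as described in the Eigen COPYING.README reproduced further down, turns any accidental inclusion of LGPL-licensed Eigen code into a compile-time error.

#define EIGEN_MPL2_ONLY  // fail the build if an LGPL-licensed Eigen header is pulled in
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd lhs = Eigen::MatrixXd::Random(4, 4);
  Eigen::MatrixXd rhs = Eigen::MatrixXd::Random(4, 4);
  Eigen::MatrixXd result = lhs * rhs;  // dense product, as in the matmul benchmark
  return result.size() == 16 ? 0 : 1;
}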
#include "Eigen/Eigenvalues"
Eigen is primarily MPL2 licensed. See COPYING.MPL2 and these links:
http://www.mozilla.org/MPL/2.0/
http://www.mozilla.org/MPL/2.0/FAQ.html
Some files contain third-party code under BSD or LGPL licenses, whence
the other COPYING.* files here.
All the LGPL code is either LGPL 2.1-only, or LGPL 2.1-or-later.
For this reason, the COPYING.LGPL file contains the LGPL 2.1 text.
If you want to guarantee that the Eigen code that you are #including
is licensed under the MPL2 and possibly more permissive licenses (like
BSD), #define this preprocessor symbol: EIGEN_MPL2_ONLY
For example, with most compilers, you could add this to your project
CXXFLAGS: -DEIGEN_MPL2_ONLY
This will cause a compilation error to be generated if you #include
any code that is LGPL licensed.
----------------------------------------------------------------------
Following applies to:
./test/mapstaticmethods.cpp
./test/schur_real.cpp
./test/prec_inverse_4x4.cpp
./test/smallvectors.cpp
./test/redux.cpp
./test/special_numbers.cpp
./test/adjoint.cpp
./test/resize.cpp
./test/mixingtypes.cpp
./test/product_trmv.cpp
./test/sparse_solvers.cpp
./test/cholesky.cpp
./test/geo_quaternion.cpp
./test/miscmatrices.cpp
./test/stddeque.cpp
./test/integer_types.cpp
./test/product_large.cpp
./test/eigensolver_generic.cpp
./test/householder.cpp
./test/geo_orthomethods.cpp
./test/array_for_matrix.cpp
./test/sparseLM.cpp
./test/upperbidiagonalization.cpp
./test/nomalloc.cpp
./test/packetmath.cpp
./test/jacobisvd.cpp
./test/geo_transformations.cpp
./test/swap.cpp
./test/eigensolver_selfadjoint.cpp
./test/inverse.cpp
./test/product_selfadjoint.cpp
./test/product_trsolve.cpp
./test/product_extra.cpp
./test/sparse_solver.h
./test/mapstride.cpp
./test/mapped_matrix.cpp
./test/geo_eulerangles.cpp
./test/eigen2support.cpp
./test/denseLM.cpp
./test/stdvector.cpp
./test/nesting_ops.cpp
./test/sparse_permutations.cpp
./test/zerosized.cpp
./test/exceptions.cpp
./test/vectorwiseop.cpp
./test/cwiseop.cpp
./test/basicstuff.cpp
./test/product_trmm.cpp
./test/linearstructure.cpp
./test/sparse_product.cpp
./test/stdvector_overload.cpp
./test/stable_norm.cpp
./test/umeyama.cpp
./test/unalignedcount.cpp
./test/triangular.cpp
./test/product_mmtr.cpp
./test/sparse_basic.cpp
./test/sparse_vector.cpp
./test/meta.cpp
./test/real_qz.cpp
./test/ref.cpp
./test/eigensolver_complex.cpp
./test/cholmod_support.cpp
./test/conjugate_gradient.cpp
./test/sparse.h
./test/simplicial_cholesky.cpp
./test/bicgstab.cpp
./test/dynalloc.cpp
./test/product_notemporary.cpp
./test/geo_hyperplane.cpp
./test/lu.cpp
./test/qr.cpp
./test/hessenberg.cpp
./test/sizeof.cpp
./test/main.h
./test/selfadjoint.cpp
./test/permutationmatrices.cpp
./test/superlu_support.cpp
./test/qtvector.cpp
./test/geo_homogeneous.cpp
./test/determinant.cpp
./test/array_reverse.cpp
./test/unalignedassert.cpp
./test/stdlist.cpp
./test/product_symm.cpp
./test/corners.cpp
./test/dontalign.cpp
./test/visitor.cpp
./test/geo_alignedbox.cpp
./test/diagonalmatrices.cpp
./test/product_small.cpp
./test/eigensolver_generalized_real.cpp
./test/umfpack_support.cpp
./test/first_aligned.cpp
./test/qr_fullpivoting.cpp
./test/array_replicate.cpp
./test/geo_parametrizedline.cpp
./test/eigen2/eigen2_unalignedassert.cpp
./test/eigen2/eigen2_prec_inverse_4x4.cpp
./test/eigen2/eigen2_alignedbox.cpp
./test/eigen2/eigen2_sparse_product.cpp
./test/eigen2/eigen2_meta.cpp
./test/eigen2/eigen2_nomalloc.cpp
./test/eigen2/eigen2_visitor.cpp
./test/eigen2/eigen2_packetmath.cpp
./test/eigen2/eigen2_svd.cpp
./test/eigen2/eigen2_mixingtypes.cpp
./test/eigen2/eigen2_qr.cpp
./test/eigen2/eigen2_cwiseop.cpp
./test/eigen2/eigen2_geometry_with_eigen2_prefix.cpp
./test/eigen2/eigen2_smallvectors.cpp
./test/eigen2/eigen2_commainitializer.cpp
./test/eigen2/eigen2_sparse_solvers.cpp
./test/eigen2/eigen2_hyperplane.cpp
./test/eigen2/eigen2_eigensolver.cpp
./test/eigen2/eigen2_linearstructure.cpp
./test/eigen2/eigen2_sizeof.cpp
./test/eigen2/eigen2_parametrizedline.cpp
./test/eigen2/eigen2_lu.cpp
./test/eigen2/eigen2_adjoint.cpp
./test/eigen2/eigen2_geometry.cpp
./test/eigen2/eigen2_stdvector.cpp
./test/eigen2/eigen2_newstdvector.cpp
./test/eigen2/eigen2_submatrices.cpp
./test/eigen2/sparse.h
./test/eigen2/eigen2_swap.cpp
./test/eigen2/eigen2_triangular.cpp
./test/eigen2/eigen2_basicstuff.cpp
./test/eigen2/gsl_helper.h
./test/eigen2/eigen2_dynalloc.cpp
./test/eigen2/eigen2_array.cpp
./test/eigen2/eigen2_map.cpp
./test/eigen2/main.h
./test/eigen2/eigen2_miscmatrices.cpp
./test/eigen2/eigen2_product_large.cpp
./test/eigen2/eigen2_first_aligned.cpp
./test/eigen2/eigen2_cholesky.cpp
./test/eigen2/eigen2_determinant.cpp
./test/eigen2/eigen2_sum.cpp
./test/eigen2/eigen2_inverse.cpp
./test/eigen2/eigen2_regression.cpp
./test/eigen2/eigen2_product_small.cpp
./test/eigen2/eigen2_qtvector.cpp
./test/eigen2/eigen2_sparse_vector.cpp
./test/eigen2/product.h
./test/eigen2/eigen2_sparse_basic.cpp
./test/eigen2/eigen2_bug_132.cpp
./test/array.cpp
./test/product_syrk.cpp
./test/commainitializer.cpp
./test/conservative_resize.cpp
./test/qr_colpivoting.cpp
./test/nullary.cpp
./test/bandmatrix.cpp
./test/pastix_support.cpp
./test/product.h
./test/block.cpp
./test/vectorization_logic.cpp
./test/jacobi.cpp
./test/diagonal.cpp
./test/schur_complex.cpp
./test/sizeoverflow.cpp
./bench/BenchTimer.h
./bench/benchFFT.cpp
./bench/eig33.cpp
./bench/spbench/spbenchsolver.h
./bench/spbench/spbenchstyle.h
./lapack/complex_double.cpp
./lapack/cholesky.cpp
./lapack/lapack_common.h
./lapack/eigenvalues.cpp
./lapack/single.cpp
./lapack/lu.cpp
./lapack/complex_single.cpp
./lapack/double.cpp
./demos/mix_eigen_and_c/binary_library.cpp
./demos/mix_eigen_and_c/binary_library.h
./demos/mix_eigen_and_c/example.c
./demos/mandelbrot/mandelbrot.cpp
./demos/mandelbrot/mandelbrot.h
./demos/opengl/icosphere.cpp
./demos/opengl/icosphere.h
./demos/opengl/camera.cpp
./demos/opengl/quaternion_demo.h
./demos/opengl/camera.h
./demos/opengl/trackball.h
./demos/opengl/gpuhelper.h
./demos/opengl/trackball.cpp
./demos/opengl/gpuhelper.cpp
./demos/opengl/quaternion_demo.cpp
./debug/gdb/printers.py
./unsupported/test/minres.cpp
./unsupported/test/openglsupport.cpp
./unsupported/test/jacobisvd.cpp
./unsupported/test/dgmres.cpp
./unsupported/test/matrix_square_root.cpp
./unsupported/test/bdcsvd.cpp
./unsupported/test/matrix_exponential.cpp
./unsupported/test/forward_adolc.cpp
./unsupported/test/polynomialsolver.cpp
./unsupported/test/matrix_function.cpp
./unsupported/test/sparse_extra.cpp
./unsupported/test/matrix_functions.h
./unsupported/test/svd_common.h
./unsupported/test/FFTW.cpp
./unsupported/test/alignedvector3.cpp
./unsupported/test/autodiff.cpp
./unsupported/test/gmres.cpp
./unsupported/test/BVH.cpp
./unsupported/test/levenberg_marquardt.cpp
./unsupported/test/matrix_power.cpp
./unsupported/test/kronecker_product.cpp
./unsupported/test/splines.cpp
./unsupported/test/polynomialutils.cpp
./unsupported/bench/bench_svd.cpp
./unsupported/Eigen/IterativeSolvers
./unsupported/Eigen/src/IterativeSolvers/DGMRES.h
./unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h
./unsupported/Eigen/src/IterativeSolvers/GMRES.h
./unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h
./unsupported/Eigen/src/IterativeSolvers/Scaling.h
./unsupported/Eigen/src/IterativeSolvers/MINRES.h
./unsupported/Eigen/src/SparseExtra/RandomSetter.h
./unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
./unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
./unsupported/Eigen/src/SparseExtra/MarketIO.h
./unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h
./unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
./unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
./unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
./unsupported/Eigen/src/BVH/BVAlgorithms.h
./unsupported/Eigen/src/BVH/KdBVH.h
./unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
./unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
./unsupported/Eigen/src/AutoDiff/AutoDiffVector.h
./unsupported/Eigen/src/Splines/Spline.h
./unsupported/Eigen/src/Splines/SplineFitting.h
./unsupported/Eigen/src/Splines/SplineFwd.h
./unsupported/Eigen/src/SVD/JacobiSVD.h
./unsupported/Eigen/src/SVD/BDCSVD.h
./unsupported/Eigen/src/SVD/SVDBase.h
./unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
./unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
./unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
./unsupported/Eigen/src/MatrixFunctions/StemFunction.h
./unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
./unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
./unsupported/Eigen/src/MatrixFunctions/MatrixFunctionAtomic.h
./unsupported/Eigen/src/MoreVectorization/MathFunctions.h
./unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
./unsupported/Eigen/src/FFT/ei_fftw_impl.h
./unsupported/Eigen/src/FFT/ei_kissfft_impl.h
./unsupported/Eigen/src/Polynomials/PolynomialSolver.h
./unsupported/Eigen/src/Polynomials/Companion.h
./unsupported/Eigen/src/Polynomials/PolynomialUtils.h
./unsupported/Eigen/src/NumericalDiff/NumericalDiff.h
./unsupported/Eigen/src/Skyline/SkylineProduct.h
./unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
./unsupported/Eigen/src/Skyline/SkylineStorage.h
./unsupported/Eigen/src/Skyline/SkylineUtil.h
./unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
./unsupported/Eigen/src/Skyline/SkylineMatrix.h
./unsupported/Eigen/SparseExtra
./unsupported/Eigen/AdolcForward
./unsupported/Eigen/KroneckerProduct
./unsupported/Eigen/NonLinearOptimization
./unsupported/Eigen/BVH
./unsupported/Eigen/OpenGLSupport
./unsupported/Eigen/ArpackSupport
./unsupported/Eigen/AutoDiff
./unsupported/Eigen/Splines
./unsupported/Eigen/MPRealSupport
./unsupported/Eigen/MatrixFunctions
./unsupported/Eigen/MoreVectorization
./unsupported/Eigen/LevenbergMarquardt
./unsupported/Eigen/AlignedVector3
./unsupported/Eigen/FFT
./unsupported/Eigen/Polynomials
./unsupported/Eigen/NumericalDiff
./unsupported/Eigen/Skyline
./COPYING.README
./COPYING.README
./LICENSE
./LICENSE
./LICENSE
./Eigen/Eigen2Support
./Eigen/src/Eigen2Support/VectorBlock.h
./Eigen/src/Eigen2Support/Cwise.h
./Eigen/src/Eigen2Support/Minor.h
./Eigen/src/Eigen2Support/Lazy.h
./Eigen/src/Eigen2Support/Memory.h
./Eigen/src/Eigen2Support/MathFunctions.h
./Eigen/src/Eigen2Support/Geometry/AlignedBox.h
./Eigen/src/Eigen2Support/Geometry/Hyperplane.h
./Eigen/src/Eigen2Support/Geometry/Quaternion.h
./Eigen/src/Eigen2Support/Geometry/Rotation2D.h
./Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h
./Eigen/src/Eigen2Support/Geometry/RotationBase.h
./Eigen/src/Eigen2Support/Geometry/Translation.h
./Eigen/src/Eigen2Support/Geometry/Scaling.h
./Eigen/src/Eigen2Support/Geometry/AngleAxis.h
./Eigen/src/Eigen2Support/Geometry/Transform.h
./Eigen/src/Eigen2Support/TriangularSolver.h
./Eigen/src/Eigen2Support/LU.h
./Eigen/src/Eigen2Support/QR.h
./Eigen/src/Eigen2Support/SVD.h
./Eigen/src/Eigen2Support/Meta.h
./Eigen/src/Eigen2Support/Block.h
./Eigen/src/Eigen2Support/Macros.h
./Eigen/src/Eigen2Support/LeastSquares.h
./Eigen/src/Eigen2Support/CwiseOperators.h
./Eigen/src/Jacobi/Jacobi.h
./Eigen/src/misc/Kernel.h
./Eigen/src/misc/SparseSolve.h
./Eigen/src/misc/Solve.h
./Eigen/src/misc/Image.h
./Eigen/src/SparseCore/SparseColEtree.h
./Eigen/src/SparseCore/SparseTranspose.h
./Eigen/src/SparseCore/SparseUtil.h
./Eigen/src/SparseCore/SparseCwiseBinaryOp.h
./Eigen/src/SparseCore/SparseDiagonalProduct.h
./Eigen/src/SparseCore/SparseProduct.h
./Eigen/src/SparseCore/SparseDot.h
./Eigen/src/SparseCore/SparseCwiseUnaryOp.h
./Eigen/src/SparseCore/SparseSparseProductWithPruning.h
./Eigen/src/SparseCore/SparseBlock.h
./Eigen/src/SparseCore/SparseDenseProduct.h
./Eigen/src/SparseCore/CompressedStorage.h
./Eigen/src/SparseCore/SparseMatrixBase.h
./Eigen/src/SparseCore/MappedSparseMatrix.h
./Eigen/src/SparseCore/SparseTriangularView.h
./Eigen/src/SparseCore/SparseView.h
./Eigen/src/SparseCore/SparseFuzzy.h
./Eigen/src/SparseCore/TriangularSolver.h
./Eigen/src/SparseCore/SparseSelfAdjointView.h
./Eigen/src/SparseCore/SparseMatrix.h
./Eigen/src/SparseCore/SparseVector.h
./Eigen/src/SparseCore/AmbiVector.h
./Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
./Eigen/src/SparseCore/SparseRedux.h
./Eigen/src/SparseCore/SparsePermutation.h
./Eigen/src/Eigenvalues/RealSchur.h
./Eigen/src/Eigenvalues/ComplexEigenSolver.h
./Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
./Eigen/src/Eigenvalues/ComplexSchur.h
./Eigen/src/Eigenvalues/RealQZ.h
./Eigen/src/Eigenvalues/EigenSolver.h
./Eigen/src/Eigenvalues/HessenbergDecomposition.h
./Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
./Eigen/src/Eigenvalues/Tridiagonalization.h
./Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
./Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
./Eigen/src/SuperLUSupport/SuperLUSupport.h
./Eigen/src/StlSupport/StdDeque.h
./Eigen/src/StlSupport/StdVector.h
./Eigen/src/StlSupport/StdList.h
./Eigen/src/StlSupport/details.h
./Eigen/src/SparseQR/SparseQR.h
./Eigen/src/LU/Inverse.h
./Eigen/src/LU/arch/Inverse_SSE.h
./Eigen/src/LU/Determinant.h
./Eigen/src/LU/PartialPivLU.h
./Eigen/src/LU/FullPivLU.h
./Eigen/src/UmfPackSupport/UmfPackSupport.h
./Eigen/src/OrderingMethods/Ordering.h
./Eigen/src/OrderingMethods/Eigen_Colamd.h
./Eigen/src/QR/HouseholderQR.h
./Eigen/src/QR/ColPivHouseholderQR.h
./Eigen/src/QR/FullPivHouseholderQR.h
./Eigen/src/SVD/JacobiSVD.h
./Eigen/src/SVD/UpperBidiagonalization.h
./Eigen/src/Geometry/OrthoMethods.h
./Eigen/src/Geometry/AlignedBox.h
./Eigen/src/Geometry/Hyperplane.h
./Eigen/src/Geometry/Quaternion.h
./Eigen/src/Geometry/EulerAngles.h
./Eigen/src/Geometry/Rotation2D.h
./Eigen/src/Geometry/ParametrizedLine.h
./Eigen/src/Geometry/RotationBase.h
./Eigen/src/Geometry/arch/Geometry_SSE.h
./Eigen/src/Geometry/Umeyama.h
./Eigen/src/Geometry/Homogeneous.h
./Eigen/src/Geometry/Translation.h
./Eigen/src/Geometry/Scaling.h
./Eigen/src/Geometry/AngleAxis.h
./Eigen/src/Geometry/Transform.h
./Eigen/src/plugins/BlockMethods.h
./Eigen/src/plugins/CommonCwiseUnaryOps.h
./Eigen/src/plugins/CommonCwiseBinaryOps.h
./Eigen/src/plugins/MatrixCwiseUnaryOps.h
./Eigen/src/plugins/MatrixCwiseBinaryOps.h
./Eigen/src/Householder/Householder.h
./Eigen/src/Householder/HouseholderSequence.h
./Eigen/src/Householder/BlockHouseholder.h
./Eigen/src/Core/VectorBlock.h
./Eigen/src/Core/Matrix.h
./Eigen/src/Core/Ref.h
./Eigen/src/Core/SelfAdjointView.h
./Eigen/src/Core/MathFunctions.h
./Eigen/src/Core/GlobalFunctions.h
./Eigen/src/Core/MapBase.h
./Eigen/src/Core/EigenBase.h
./Eigen/src/Core/GenericPacketMath.h
./Eigen/src/Core/NestByValue.h
./Eigen/src/Core/CwiseUnaryOp.h
./Eigen/src/Core/SolveTriangular.h
./Eigen/src/Core/Fuzzy.h
./Eigen/src/Core/Visitor.h
./Eigen/src/Core/Map.h
./Eigen/src/Core/NoAlias.h
./Eigen/src/Core/Diagonal.h
./Eigen/src/Core/StableNorm.h
./Eigen/src/Core/CoreIterators.h
./Eigen/src/Core/products/Parallelizer.h
./Eigen/src/Core/products/SelfadjointMatrixVector.h
./Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
./Eigen/src/Core/products/TriangularSolverMatrix.h
./Eigen/src/Core/products/GeneralMatrixMatrix.h
./Eigen/src/Core/products/SelfadjointProduct.h
./Eigen/src/Core/products/CoeffBasedProduct.h
./Eigen/src/Core/products/TriangularMatrixVector.h
./Eigen/src/Core/products/SelfadjointMatrixMatrix.h
./Eigen/src/Core/products/TriangularSolverVector.h
./Eigen/src/Core/products/SelfadjointRank2Update.h
./Eigen/src/Core/products/GeneralBlockPanelKernel.h
./Eigen/src/Core/products/GeneralMatrixVector.h
./Eigen/src/Core/products/TriangularMatrixMatrix.h
./Eigen/src/Core/Reverse.h
./Eigen/src/Core/BooleanRedux.h
./Eigen/src/Core/Replicate.h
./Eigen/src/Core/arch/AltiVec/PacketMath.h
./Eigen/src/Core/arch/AltiVec/Complex.h
./Eigen/src/Core/arch/SSE/PacketMath.h
./Eigen/src/Core/arch/SSE/Complex.h
./Eigen/src/Core/arch/SSE/MathFunctions.h
./Eigen/src/Core/arch/NEON/PacketMath.h
./Eigen/src/Core/arch/NEON/Complex.h
./Eigen/src/Core/arch/Default/Settings.h
./Eigen/src/Core/CwiseUnaryView.h
./Eigen/src/Core/Array.h
./Eigen/src/Core/ArrayWrapper.h
./Eigen/src/Core/Swap.h
./Eigen/src/Core/Transpositions.h
./Eigen/src/Core/Random.h
./Eigen/src/Core/IO.h
./Eigen/src/Core/SelfCwiseBinaryOp.h
./Eigen/src/Core/VectorwiseOp.h
./Eigen/src/Core/Select.h
./Eigen/src/Core/ArrayBase.h
./Eigen/src/Core/DenseCoeffsBase.h
./Eigen/src/Core/DiagonalProduct.h
./Eigen/src/Core/Assign.h
./Eigen/src/Core/Redux.h
./Eigen/src/Core/ForceAlignedAccess.h
./Eigen/src/Core/BandMatrix.h
./Eigen/src/Core/PlainObjectBase.h
./Eigen/src/Core/DenseBase.h
./Eigen/src/Core/Flagged.h
./Eigen/src/Core/CwiseBinaryOp.h
./Eigen/src/Core/ProductBase.h
./Eigen/src/Core/TriangularMatrix.h
./Eigen/src/Core/Transpose.h
./Eigen/src/Core/DiagonalMatrix.h
./Eigen/src/Core/Dot.h
./Eigen/src/Core/Functors.h
./Eigen/src/Core/PermutationMatrix.h
./Eigen/src/Core/NumTraits.h
./Eigen/src/Core/MatrixBase.h
./Eigen/src/Core/DenseStorage.h
./Eigen/src/Core/util/Memory.h
./Eigen/src/Core/util/StaticAssert.h
./Eigen/src/Core/util/BlasUtil.h
./Eigen/src/Core/util/MatrixMapper.h
./Eigen/src/Core/util/XprHelper.h
./Eigen/src/Core/util/ForwardDeclarations.h
./Eigen/src/Core/util/Meta.h
./Eigen/src/Core/util/Macros.h
./Eigen/src/Core/util/Constants.h
./Eigen/src/Core/CwiseNullaryOp.h
./Eigen/src/Core/Block.h
./Eigen/src/Core/GeneralProduct.h
./Eigen/src/Core/CommaInitializer.h
./Eigen/src/Core/ReturnByValue.h
./Eigen/src/Core/Stride.h
./Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
./Eigen/src/SparseLU/SparseLU_column_dfs.h
./Eigen/src/SparseLU/SparseLU_panel_dfs.h
./Eigen/src/SparseLU/SparseLU_relax_snode.h
./Eigen/src/SparseLU/SparseLU_panel_bmod.h
./Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
./Eigen/src/SparseLU/SparseLU_Utils.h
./Eigen/src/SparseLU/SparseLU_gemm_kernel.h
./Eigen/src/SparseLU/SparseLU_kernel_bmod.h
./Eigen/src/SparseLU/SparseLU_pivotL.h
./Eigen/src/SparseLU/SparseLU_Memory.h
./Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
./Eigen/src/SparseLU/SparseLUImpl.h
./Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
./Eigen/src/SparseLU/SparseLU_Structs.h
./Eigen/src/SparseLU/SparseLU.h
./Eigen/src/SparseLU/SparseLU_column_bmod.h
./Eigen/src/SparseLU/SparseLU_pruneL.h
./Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
./Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
./Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
./Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
./Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
./Eigen/src/SparseCholesky/SimplicialCholesky.h
./Eigen/src/Cholesky/LDLT.h
./Eigen/src/Cholesky/LLT.h
./Eigen/src/CholmodSupport/CholmodSupport.h
./Eigen/src/PaStiXSupport/PaStiXSupport.h
./Eigen/src/MetisSupport/MetisSupport.h
./Eigen/StdVector
./Eigen/Core
./Eigen/SparseLU
./Eigen/StdList
./Eigen/StdDeque
./Eigen/SparseCholesky
./scripts/relicense.py
./scripts/relicense.py
./blas/BandTriangularSolver.h
./blas/PackedTriangularMatrixVector.h
./blas/complex_double.cpp
./blas/level2_real_impl.h
./blas/level1_cplx_impl.h
./blas/level1_impl.h
./blas/level1_real_impl.h
./blas/level3_impl.h
./blas/single.cpp
./blas/level2_cplx_impl.h
./blas/PackedSelfadjointProduct.h
./blas/Rank2Update.h
./blas/complex_single.cpp
./blas/PackedTriangularSolverVector.h
./blas/double.cpp
./blas/common.h
./blas/level2_impl.h
./blas/GeneralRank1Update.h
Mozilla Public License Version 2.0
==================================
1. Definitions
--------------
1.1. "Contributor"
means each individual or legal entity that creates, contributes to
the creation of, or owns Covered Software.
1.2. "Contributor Version"
means the combination of the Contributions of others (if any) used
by a Contributor and that particular Contributor's Contribution.
1.3. "Contribution"
means Covered Software of a particular Contributor.
1.4. "Covered Software"
means Source Code Form to which the initial Contributor has attached
the notice in Exhibit A, the Executable Form of such Source Code
Form, and Modifications of such Source Code Form, in each case
including portions thereof.
1.5. "Incompatible With Secondary Licenses"
means
(a) that the initial Contributor has attached the notice described
in Exhibit B to the Covered Software; or
(b) that the Covered Software was made available under the terms of
version 1.1 or earlier of the License, but not also under the
terms of a Secondary License.
1.6. "Executable Form"
means any form of the work other than Source Code Form.
1.7. "Larger Work"
means a work that combines Covered Software with other material, in
a separate file or files, that is not Covered Software.
1.8. "License"
means this document.
1.9. "Licensable"
means having the right to grant, to the maximum extent possible,
whether at the time of the initial grant or subsequently, any and
all of the rights conveyed by this License.
1.10. "Modifications"
means any of the following:
(a) any file in Source Code Form that results from an addition to,
deletion from, or modification of the contents of Covered
Software; or
(b) any new file in Source Code Form that contains any Covered
Software.
1.11. "Patent Claims" of a Contributor
means any patent claim(s), including without limitation, method,
process, and apparatus claims, in any patent Licensable by such
Contributor that would be infringed, but for the grant of the
License, by the making, using, selling, offering for sale, having
made, import, or transfer of either its Contributions or its
Contributor Version.
1.12. "Secondary License"
means either the GNU General Public License, Version 2.0, the GNU
Lesser General Public License, Version 2.1, the GNU Affero General
Public License, Version 3.0, or any later versions of those
licenses.
1.13. "Source Code Form"
means the form of the work preferred for making modifications.
1.14. "You" (or "Your")
means an individual or a legal entity exercising rights under this
License. For legal entities, "You" includes any entity that
controls, is controlled by, or is under common control with You. For
purposes of this definition, "control" means (a) the power, direct
or indirect, to cause the direction or management of such entity,
whether by contract or otherwise, or (b) ownership of more than
fifty percent (50%) of the outstanding shares or beneficial
ownership of such entity.
2. License Grants and Conditions
--------------------------------
2.1. Grants
Each Contributor hereby grants You a world-wide, royalty-free,
non-exclusive license:
(a) under intellectual property rights (other than patent or trademark)
Licensable by such Contributor to use, reproduce, make available,
modify, display, perform, distribute, and otherwise exploit its
Contributions, either on an unmodified basis, with Modifications, or
as part of a Larger Work; and
(b) under Patent Claims of such Contributor to make, use, sell, offer
for sale, have made, import, and otherwise transfer either its
Contributions or its Contributor Version.
2.2. Effective Date
The licenses granted in Section 2.1 with respect to any Contribution
become effective for each Contribution on the date the Contributor first
distributes such Contribution.
2.3. Limitations on Grant Scope
The licenses granted in this Section 2 are the only rights granted under
this License. No additional rights or licenses will be implied from the
distribution or licensing of Covered Software under this License.
Notwithstanding Section 2.1(b) above, no patent license is granted by a
Contributor:
(a) for any code that a Contributor has removed from Covered Software;
or
(b) for infringements caused by: (i) Your and any other third party's
modifications of Covered Software, or (ii) the combination of its
Contributions with other software (except as part of its Contributor
Version); or
(c) under Patent Claims infringed by Covered Software in the absence of
its Contributions.
This License does not grant any rights in the trademarks, service marks,
or logos of any Contributor (except as may be necessary to comply with
the notice requirements in Section 3.4).
2.4. Subsequent Licenses
No Contributor makes additional grants as a result of Your choice to
distribute the Covered Software under a subsequent version of this
License (see Section 10.2) or under the terms of a Secondary License (if
permitted under the terms of Section 3.3).
2.5. Representation
Each Contributor represents that the Contributor believes its
Contributions are its original creation(s) or it has sufficient rights
to grant the rights to its Contributions conveyed by this License.
2.6. Fair Use
This License is not intended to limit any rights You have under
applicable copyright doctrines of fair use, fair dealing, or other
equivalents.
2.7. Conditions
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
in Section 2.1.
3. Responsibilities
-------------------
3.1. Distribution of Source Form
All distribution of Covered Software in Source Code Form, including any
Modifications that You create or to which You contribute, must be under
the terms of this License. You must inform recipients that the Source
Code Form of the Covered Software is governed by the terms of this
License, and how they can obtain a copy of this License. You may not
attempt to alter or restrict the recipients' rights in the Source Code
Form.
3.2. Distribution of Executable Form
If You distribute Covered Software in Executable Form then:
(a) such Covered Software must also be made available in Source Code
Form, as described in Section 3.1, and You must inform recipients of
the Executable Form how they can obtain a copy of such Source Code
Form by reasonable means in a timely manner, at a charge no more
than the cost of distribution to the recipient; and
(b) You may distribute such Executable Form under the terms of this
License, or sublicense it under different terms, provided that the
license for the Executable Form does not attempt to limit or alter
the recipients' rights in the Source Code Form under this License.
3.3. Distribution of a Larger Work
You may create and distribute a Larger Work under terms of Your choice,
provided that You also comply with the requirements of this License for
the Covered Software. If the Larger Work is a combination of Covered
Software with a work governed by one or more Secondary Licenses, and the
Covered Software is not Incompatible With Secondary Licenses, this
License permits You to additionally distribute such Covered Software
under the terms of such Secondary License(s), so that the recipient of
the Larger Work may, at their option, further distribute the Covered
Software under the terms of either this License or such Secondary
License(s).
3.4. Notices
You may not remove or alter the substance of any license notices
(including copyright notices, patent notices, disclaimers of warranty,
or limitations of liability) contained within the Source Code Form of
the Covered Software, except that You may alter any license notices to
the extent required to remedy known factual inaccuracies.
3.5. Application of Additional Terms
You may choose to offer, and to charge a fee for, warranty, support,
indemnity or liability obligations to one or more recipients of Covered
Software. However, You may do so only on Your own behalf, and not on
behalf of any Contributor. You must make it absolutely clear that any
such warranty, support, indemnity, or liability obligation is offered by
You alone, and You hereby agree to indemnify every Contributor for any
liability incurred by such Contributor as a result of warranty, support,
indemnity or liability terms You offer. You may include additional
disclaimers of warranty and limitations of liability specific to any
jurisdiction.
4. Inability to Comply Due to Statute or Regulation
---------------------------------------------------
If it is impossible for You to comply with any of the terms of this
License with respect to some or all of the Covered Software due to
statute, judicial order, or regulation then You must: (a) comply with
the terms of this License to the maximum extent possible; and (b)
describe the limitations and the code they affect. Such description must
be placed in a text file included with all distributions of the Covered
Software under this License. Except to the extent prohibited by statute
or regulation, such description must be sufficiently detailed for a
recipient of ordinary skill to be able to understand it.
5. Termination
--------------
5.1. The rights granted under this License will terminate automatically
if You fail to comply with any of its terms. However, if You become
compliant, then the rights granted under this License from a particular
Contributor are reinstated (a) provisionally, unless and until such
Contributor explicitly and finally terminates Your grants, and (b) on an
ongoing basis, if such Contributor fails to notify You of the
non-compliance by some reasonable means prior to 60 days after You have
come back into compliance. Moreover, Your grants from a particular
Contributor are reinstated on an ongoing basis if such Contributor
notifies You of the non-compliance by some reasonable means, this is the
first time You have received notice of non-compliance with this License
from such Contributor, and You become compliant prior to 30 days after
Your receipt of the notice.
5.2. If You initiate litigation against any entity by asserting a patent
infringement claim (excluding declaratory judgment actions,
counter-claims, and cross-claims) alleging that a Contributor Version
directly or indirectly infringes any patent, then the rights granted to
You by any and all Contributors for the Covered Software under Section
2.1 of this License shall terminate.
5.3. In the event of termination under Sections 5.1 or 5.2 above, all
end user license agreements (excluding distributors and resellers) which
have been validly granted by You or Your distributors under this License
prior to termination shall survive termination.
************************************************************************
* *
* 6. Disclaimer of Warranty *
* ------------------------- *
* *
* Covered Software is provided under this License on an "as is" *
* basis, without warranty of any kind, either expressed, implied, or *
* statutory, including, without limitation, warranties that the *
* Covered Software is free of defects, merchantable, fit for a *
* particular purpose or non-infringing. The entire risk as to the *
* quality and performance of the Covered Software is with You. *
* Should any Covered Software prove defective in any respect, You *
* (not any Contributor) assume the cost of any necessary servicing, *
* repair, or correction. This disclaimer of warranty constitutes an *
* essential part of this License. No use of any Covered Software is *
* authorized under this License except under this disclaimer. *
* *
************************************************************************
************************************************************************
* *
* 7. Limitation of Liability *
* -------------------------- *
* *
* Under no circumstances and under no legal theory, whether tort *
* (including negligence), contract, or otherwise, shall any *
* Contributor, or anyone who distributes Covered Software as *
* permitted above, be liable to You for any direct, indirect, *
* special, incidental, or consequential damages of any character *
* including, without limitation, damages for lost profits, loss of *
* goodwill, work stoppage, computer failure or malfunction, or any *
* and all other commercial damages or losses, even if such party *
* shall have been informed of the possibility of such damages. This *
* limitation of liability shall not apply to liability for death or *
* personal injury resulting from such party's negligence to the *
* extent applicable law prohibits such limitation. Some *
* jurisdictions do not allow the exclusion or limitation of *
* incidental or consequential damages, so this exclusion and *
* limitation may not apply to You. *
* *
************************************************************************
8. Litigation
-------------
Any litigation relating to this License may be brought only in the
courts of a jurisdiction where the defendant maintains its principal
place of business and such litigation shall be governed by laws of that
jurisdiction, without reference to its conflict-of-law provisions.
Nothing in this Section shall prevent a party's ability to bring
cross-claims or counter-claims.
9. Miscellaneous
----------------
This License represents the complete agreement concerning the subject
matter hereof. If any provision of this License is held to be
unenforceable, such provision shall be reformed only to the extent
necessary to make it enforceable. Any law or regulation which provides
that the language of a contract shall be construed against the drafter
shall not be used to construe this License against a Contributor.
10. Versions of the License
---------------------------
10.1. New Versions
Mozilla Foundation is the license steward. Except as provided in Section
10.3, no one other than the license steward has the right to modify or
publish new versions of this License. Each version will be given a
distinguishing version number.
10.2. Effect of New Versions
You may distribute the Covered Software under the terms of the version
of the License under which You originally received the Covered Software,
or under the terms of any subsequent version published by the license
steward.
10.3. Modified Versions
If you create software not governed by this License, and you want to
create a new license for such software, you may create and use a
modified version of this License if you rename the license and remove
any references to the name of the license steward (except to note that
such modified license differs from this License).
10.4. Distributing Source Code Form that is Incompatible With Secondary
Licenses
If You choose to distribute Source Code Form that is Incompatible With
Secondary Licenses under the terms of this version of the License, the
notice described in Exhibit B of this License must be attached.
Exhibit A - Source Code Form License Notice
-------------------------------------------
This Source Code Form is subject to the terms of the Mozilla Public
License, v. 2.0. If a copy of the MPL was not distributed with this
file, You can obtain one at http://mozilla.org/MPL/2.0/.
If it is not possible or desirable to put the notice in a particular
file, then You may include the notice in a location (such as a LICENSE
file in a relevant directory) where a recipient would be likely to look
for such a notice.
You may add additional accurate notices of copyright ownership.
Exhibit B - "Incompatible With Secondary Licenses" Notice
---------------------------------------------------------
This Source Code Form is "Incompatible With Secondary Licenses", as
defined by the Mozilla Public License, v. 2.0.
----------------------------------------------------------------------
Following applies to:
./doc/UsingIntelMKL.dox
./doc/UsingIntelMKL.dox
./Eigen/src/Eigenvalues/ComplexSchur_MKL.h
./Eigen/src/Eigenvalues/ComplexSchur_MKL.h
./Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
./Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
./Eigen/src/Eigenvalues/RealSchur_MKL.h
./Eigen/src/Eigenvalues/RealSchur_MKL.h
./Eigen/src/LU/arch/Inverse_SSE.h
./Eigen/src/LU/arch/Inverse_SSE.h
./Eigen/src/LU/PartialPivLU_MKL.h
./Eigen/src/LU/PartialPivLU_MKL.h
./Eigen/src/QR/HouseholderQR_MKL.h
./Eigen/src/QR/HouseholderQR_MKL.h
./Eigen/src/QR/ColPivHouseholderQR_MKL.h
./Eigen/src/QR/ColPivHouseholderQR_MKL.h
./Eigen/src/SVD/JacobiSVD_MKL.h
./Eigen/src/SVD/JacobiSVD_MKL.h
./Eigen/src/PardisoSupport/PardisoSupport.h
./Eigen/src/PardisoSupport/PardisoSupport.h
./Eigen/src/Core/Assign_MKL.h
./Eigen/src/Core/Assign_MKL.h
./Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
./Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
./Eigen/src/Core/products/GeneralMatrixVector_MKL.h
./Eigen/src/Core/products/GeneralMatrixVector_MKL.h
./Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
./Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
./Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
./Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
./Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
./Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
./Eigen/src/Core/products/TriangularMatrixVector_MKL.h
./Eigen/src/Core/products/TriangularMatrixVector_MKL.h
./Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
./Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
./Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
./Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
./Eigen/src/Core/util/MKL_support.h
./Eigen/src/Core/util/MKL_support.h
./Eigen/src/Cholesky/LLT_MKL.h
./Eigen/src/Cholesky/LLT_MKL.h
/*
Copyright (c) 2011, Intel Corporation. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
----------------------------------------------------------------------
Following applies to:
everything under ./bench/btl
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds
of works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal,
family, or household purposes, or (2) anything designed or sold for
incorporation into a dwelling. In determining whether a product is a
consumer product, doubtful cases shall be resolved in favor of
coverage. For a particular product received by a particular user,
"normally used" refers to a typical or common use of that class of
product, regardless of the status of the particular user or of the way
in which the particular user actually uses, or expects or is expected
to use, the product. A product is a consumer product regardless of
whether the product has substantial commercial, industrial or
non-consumer uses, unless such uses represent the only significant
mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to
install and execute modified versions of a covered work in that User
Product from a modified version of its Corresponding Source. The
information must suffice to ensure that the continued functioning of
the modified object code is in no case prevented or interfered with
solely because modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include
a requirement to continue to provide support service, warranty, or
updates for a work that has been modified or installed by the
recipient, or for the User Product in which it has been modified or
installed. Access to a network may be denied when the modification
itself materially and adversely affects the operation of the network
or violates the rules and protocols for communication across the
network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material
you add to a covered work, you may (if authorized by the copyright
holders of that material) supplement the terms of this License with
terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions
of it) with contractual assumptions of liability to the recipient,
for any liability that these contractual assumptions directly
impose on those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement
or otherwise) that contradict the conditions of this License, they do
not excuse you from the conditions of this License. If you cannot
convey a covered work so as to satisfy simultaneously your obligations
under this License and any other pertinent obligations, then as a
consequence you may not convey it at all. For example, if you agree
to terms that obligate you to collect a royalty for further conveying
from those to whom you convey the Program, the only way you could
satisfy both those terms and this License would be to refrain entirely
from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions
of the GNU General Public License from time to time. Such new
versions will be similar in spirit to the present version, but may
differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT
WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND
PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR
CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES
AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR
DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL
DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM
(INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED
INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF
THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER
OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these
terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it
does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see
<http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author> This program comes
with ABSOLUTELY NO WARRANTY; for details type `show w'. This is
free software, and you are welcome to redistribute it under
certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the
appropriate parts of the General Public License. Of course, your
program's commands might be different; for a GUI interface, you would
use an "about box".
You should also get your employer (if you work as a programmer) or
school, if any, to sign a "copyright disclaimer" for the program, if
necessary. For more information on this, and how to apply and follow
the GNU GPL, see <http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your
program into proprietary programs. If your program is a subroutine
library, you may consider it more useful to permit linking proprietary
applications with the library. If this is what you want to do, use
the GNU Lesser General Public License instead of this License. But
first, please read <http://www.gnu.org/philosophy/why-not-lgpl.html>.
----------------------------------------------------------------------
Following applies to:
./test/metis_support.cpp
./test/sparselu.cpp
./unsupported/test/mpreal/mpreal.h
./unsupported/Eigen/src/IterativeSolvers/IterationController.h
./unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
./unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
./Eigen/src/OrderingMethods/Amd.h
./Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
GNU LESSER GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
This version of the GNU Lesser General Public License incorporates
the terms and conditions of version 3 of the GNU General Public
License, supplemented by the additional permissions listed below.
0. Additional Definitions.
As used herein, "this License" refers to version 3 of the GNU Lesser
General Public License, and the "GNU GPL" refers to version 3 of the
GNU General Public License.
"The Library" refers to a covered work governed by this License,
other than an Application or a Combined Work as defined below.
An "Application" is any work that makes use of an interface provided
by the Library, but which is not otherwise based on the Library.
Defining a subclass of a class defined by the Library is deemed a mode
of using an interface provided by the Library.
A "Combined Work" is a work produced by combining or linking an
Application with the Library. The particular version of the Library
with which the Combined Work was made is also called the "Linked
Version".
The "Minimal Corresponding Source" for a Combined Work means the
Corresponding Source for the Combined Work, excluding any source code
for portions of the Combined Work that, considered in isolation, are
based on the Application, and not on the Linked Version.
The "Corresponding Application Code" for a Combined Work means the
object code and/or source code for the Application, including any data
and utility programs needed for reproducing the Combined Work from the
Application, but excluding the System Libraries of the Combined Work.
1. Exception to Section 3 of the GNU GPL.
You may convey a covered work under sections 3 and 4 of this License
without being bound by section 3 of the GNU GPL.
2. Conveying Modified Versions.
If you modify a copy of the Library, and, in your modifications, a
facility refers to a function or data to be supplied by an Application
that uses the facility (other than as an argument passed when the
facility is invoked), then you may convey a copy of the modified
version:
a) under this License, provided that you make a good faith effort to
ensure that, in the event an Application does not supply the
function or data, the facility still operates, and performs
whatever part of its purpose remains meaningful, or
b) under the GNU GPL, with none of the additional permissions of
this License applicable to that copy.
3. Object Code Incorporating Material from Library Header Files.
The object code form of an Application may incorporate material from
a header file that is part of the Library. You may convey such object
code under terms of your choice, provided that, if the incorporated
material is not limited to numerical parameters, data structure
layouts and accessors, or small macros, inline functions and templates
(ten or fewer lines in length), you do both of the following:
a) Give prominent notice with each copy of the object code that the
Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the object code with a copy of the GNU GPL and this
license document.
4. Combined Works.
You may convey a Combined Work under terms of your choice that,
taken together, effectively do not restrict modification of the
portions of the Library contained in the Combined Work and reverse
engineering for debugging such modifications, if you also do each of
the following:
a) Give prominent notice with each copy of the Combined Work that
the Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the Combined Work with a copy of the GNU GPL and this
license document.
c) For a Combined Work that displays copyright notices during
execution, include the copyright notice for the Library among
these notices, as well as a reference directing the user to the
copies of the GNU GPL and this license document.
d) Do one of the following:
0) Convey the Minimal Corresponding Source under the terms of
this License, and the Corresponding Application Code in a form
suitable for, and under terms that permit, the user to
recombine or relink the Application with a modified version of
the Linked Version to produce a modified Combined Work, in the
manner specified by section 6 of the GNU GPL for conveying
Corresponding Source.
1) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (a) uses at run time
a copy of the Library already present on the user's computer
system, and (b) will operate properly with a modified version
of the Library that is interface-compatible with the Linked
Version.
e) Provide Installation Information, but only if you would otherwise
be required to provide such information under section 6 of the
GNU GPL, and only to the extent that such information is
necessary to install and execute a modified version of the
Combined Work produced by recombining or relinking the
Application with a modified version of the Linked Version. (If
you use option 4d0, the Installation Information must accompany
the Minimal Corresponding Source and Corresponding Application
Code. If you use option 4d1, you must provide the Installation
Information in the manner specified by section 6 of the GNU GPL
for conveying Corresponding Source.)
5. Combined Libraries.
You may place library facilities that are a work based on the
Library side by side in a single library together with other library
facilities that are not Applications and are not covered by this
License, and convey such a combined library under terms of your
choice, if you do both of the following:
a) Accompany the combined library with a copy of the same work based
on the Library, uncombined with any other library facilities,
conveyed under the terms of this License.
b) Give prominent notice with the combined library that part of it
is a work based on the Library, and explaining where to find the
accompanying uncombined form of the same work.
6. Revised Versions of the GNU Lesser General Public License.
The Free Software Foundation may publish revised and/or new versions
of the GNU Lesser General Public License from time to time. Such new
versions will be similar in spirit to the present version, but may
differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the
Library as you received it specifies that a certain numbered version
of the GNU Lesser General Public License "or any later version"
applies to it, you have the option of following the terms and
conditions either of that published version or of any later version
published by the Free Software Foundation. If the Library as you
received it does not specify a version number of the GNU Lesser
General Public License, you may choose any version of the GNU Lesser
General Public License ever published by the Free Software Foundation.
If the Library as you received it specifies that a proxy can decide
whether future versions of the GNU Lesser General Public License shall
apply, that proxy's public statement of acceptance of any version is
permanent authorization for you to choose that version for the
Library.
----------------------------------------------------------------------
Following applies to:
./unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
./unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h
./unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h
./unsupported/Eigen/src/LevenbergMarquardt/LMpar.h
./unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
Minpack Copyright Notice (1999) University of Chicago. All rights
reserved
Redistribution and use in source and binary forms, with or
without modification, are permitted provided that the
following conditions are met:
1. Redistributions of source code must retain the above
copyright notice, this list of conditions and the following
disclaimer.
2. Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials
provided with the distribution.
3. The end-user documentation included with the
redistribution, if any, must include the following
acknowledgment:
"This product includes software developed by the
University of Chicago, as Operator of Argonne National
Laboratory.
Alternately, this acknowledgment may appear in the software
itself, if and wherever such third-party acknowledgments
normally appear.
4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS"
WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE
UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND
THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE
OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY
OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR
USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF
THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4)
DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION
UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL
BE CORRECTED.
5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT
HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF
ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT,
INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF
ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF
PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER
SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT
(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE,
EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE
POSSIBILITY OF SUCH LOSS OR DAMAGES.
# Description:
# Eigen is a C++ template library for linear algebra: vectors,
# matrices, and related algorithms.
# This file is adapted from TensorFlow.
licenses([
# Note: Eigen is an MPL2 library that includes GPL v3 and LGPL v2.1+ code.
# We've taken special care to not reference any restricted code.
"reciprocal", # MPL2
"notice", # Portions BSD
])
exports_files(["COPYING.MPL2"])
# License-restricted (i.e. not reciprocal or notice) files inside Eigen/...
EIGEN_RESTRICTED_FILES = [
"Eigen/src/OrderingMethods/Amd.h",
"Eigen/src/SparseCholesky/**",
]
# Notable transitive dependencies of restricted files inside Eigen/...
EIGEN_RESTRICTED_DEPS = [
"Eigen/Eigen",
"Eigen/IterativeLinearSolvers",
"Eigen/MetisSupport",
"Eigen/Sparse",
"Eigen/SparseCholesky",
"Eigen/SparseLU",
]
# Note: unsupported/Eigen is unsupported and might go away at any time.
EIGEN_FILES = [
"Eigen/**",
"unsupported/Eigen/CXX11/**",
"unsupported/Eigen/FFT",
"unsupported/Eigen/KroneckerProduct",
"unsupported/Eigen/src/FFT/**",
"unsupported/Eigen/src/KroneckerProduct/**",
"unsupported/Eigen/MatrixFunctions",
"unsupported/Eigen/SpecialFunctions",
"unsupported/Eigen/src/SpecialFunctions/**",
]
# List of files picked up by glob but actually part of another target.
EIGEN_EXCLUDE_FILES = [
"Eigen/src/Core/arch/AVX/PacketMathGoogleTest.cc",
]
# Files known to be under MPL2 license.
EIGEN_MPL2_HEADER_FILES = glob(
EIGEN_FILES,
exclude = EIGEN_EXCLUDE_FILES +
EIGEN_RESTRICTED_FILES +
EIGEN_RESTRICTED_DEPS + [
# Guarantees any file missed by excludes above will not compile.
"Eigen/src/Core/util/NonMPL2.h",
"Eigen/**/CMakeLists.txt",
],
)
cc_library(
name = "eigen",
hdrs = EIGEN_MPL2_HEADER_FILES,
defines = [
# This define (mostly) guarantees we don't link any problematic
# code. We use it, but we do not rely on it, as evidenced above.
"EIGEN_MPL2_ONLY",
],
includes = ["."],
visibility = ["//visibility:public"],
)
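The EIGEN_MPL2_ONLY define above acts as a compile-time tripwire: Eigen's
license-restricted headers either include Eigen/src/Core/util/NonMPL2.h or
carry their own "#error ... MPL2 only mode" guard, so any restricted file that
slips past the excludes fails to compile instead of silently being linked in.
A minimal, hedged sketch of that behavior (not part of this BUILD file; the
module named in the negative case is only illustrative):

// mpl2_only_check.cc -- hypothetical, for illustration only.
#define EIGEN_MPL2_ONLY            // same effect as defines = ["EIGEN_MPL2_ONLY"]
#include <Eigen/Core>              // MPL2-covered code: builds fine.
// #include <Eigen/SparseCholesky>  // LGPL-covered module: expected to trip an
//                                  // "#error ... MPL2 only mode" guard.

int main() {
  Eigen::Matrix2f m = Eigen::Matrix2f::Identity();
  return static_cast<int>(m.trace()) - 2;  // 0 on success
}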
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_CORE_MODULE
#define EIGEN_CXX11_CORE_MODULE
#include <Eigen/Core>
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
/** \defgroup CXX11_Core_Module C++11 Core Module
*
* This module provides common core features for all modules that
* explicitly depend on C++11. Currently, this is only the Tensor
* module. Note that at this stage, you should not need to include
* this module directly.
*
* It also provides a limited fallback for compilers that don't support
* CXX11 yet, such as nvcc.
*
* \code
* #include <Eigen/CXX11/Core>
* \endcode
*/
// Only a subset of C++11 is allowed at Google, so we default to emulating the
// C++11 functionality that we need.
#include "src/Core/util/FixedSizeVector.h"
#if 1
#include <vector>
#include "src/Core/util/EmulateCXX11Meta.h"
#else
#include "src/Core/util/CXX11Workarounds.h"
#include "src/Core/util/CXX11Meta.h"
#endif
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
#endif // EIGEN_CXX11_CORE_MODULE
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_FIXED_POINT_MODULE
#define EIGEN_CXX11_FIXED_POINT_MODULE
#include <Eigen/Core>
#include <stdint.h>
/** \defgroup CXX11_FixedPoint_Module Fixed Point Module
*
* This module provides fixed-point scalar types (QInt8, QUInt8, QInt16,
* QUInt16 and QInt32) together with the matrix-matrix and matrix-vector
* product kernels specialized for them.
*
* Optimized AVX2/AVX512/NEON implementations are pulled in below when the
* corresponding vectorization macros are defined; otherwise the generic
* implementations are used.
*
* \code
* #include <Eigen/CXX11/FixedPoint>
* \endcode
*/
#include "src/FixedPoint/FixedPointTypes.h"
// Use optimized implementations whenever available
#if defined (EIGEN_VECTORIZE_AVX512DQ) || defined (EIGEN_VECTORIZE_AVX512BW)
#include "src/FixedPoint/PacketMathAVX512.h"
#include "src/FixedPoint/TypeCastingAVX512.h"
#elif defined EIGEN_VECTORIZE_AVX2
#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
#define EIGEN_USE_OPTIMIZED_INT16_INT16_MAT_MAT_PRODUCT
#include "src/FixedPoint/PacketMathAVX2.h"
#include "src/FixedPoint/MatMatProductAVX2.h"
#include "src/FixedPoint/TypeCastingAVX2.h"
#elif defined EIGEN_VECTORIZE_NEON
#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
#include "src/FixedPoint/MatMatProductNEON.h"
#endif
// Use the default implementation when no optimized code is available
#include "src/FixedPoint/MatMatProduct.h"
#include "src/FixedPoint/MatVecProduct.h"
#endif // EIGEN_CXX11_FIXED_POINT_MODULE
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_NEURAL_NETWORKS_MODULE
#define EIGEN_CXX11_NEURAL_NETWORKS_MODULE
#include "unsupported/Eigen/CXX11/Tensor"
/** \defgroup CXX11_NeuralNetworks_Module Neural Networks Module
*
* This module provides an efficient implementation of the common primitives
* used by neural networks.
* The primitives are built on top of the tensor library.
*
* \code
* #include <Eigen/CXX11/NeuralNetworks>
* \endcode
*/
#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h"
#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h"
#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h"
#include "unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h"
#include "unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h"
#include "unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h"
#include "unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h"
#include "unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h"
#endif // EIGEN_CXX11_NEURAL_NETWORKS_MODULE
#include "unsupported/Eigen/CXX11/Tensor"
#ifdef _WIN32
#ifndef SLEEP_FUNC_HEADER_GUARD
#define SLEEP_FUNC_HEADER_GUARD
inline void sleep(unsigned int seconds) { Sleep(1000*seconds); }
#endif
// On Windows, Eigen will include Windows.h, which defines various
// macros that conflict with TensorFlow symbols. Undefine them here to
// prevent clashes.
#undef DeleteFile
#undef ERROR
#undef LoadLibrary
#endif // _WIN32
#include "unsupported/Eigen/CXX11/ThreadPool"
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_FIXED_POINT_TYPES_H
#define EIGEN_CXX11_FIXED_POINT_TYPES_H
#include <cmath>
#include <iostream>
namespace Eigen {
// The mantissa part of the fixed point representation. See
// go/tensorfixedpoint for details
struct QInt8;
struct QUInt8;
struct QInt16;
struct QUInt16;
struct QInt32;
template <>
struct NumTraits<QInt8> : GenericNumTraits<int8_t> {};
template <>
struct NumTraits<QUInt8> : GenericNumTraits<uint8_t> {};
template <>
struct NumTraits<QInt16> : GenericNumTraits<int16_t> {};
template <>
struct NumTraits<QUInt16> : GenericNumTraits<uint16_t> {};
template <>
struct NumTraits<QInt32> : GenericNumTraits<int32_t> {};
namespace internal {
template <>
struct scalar_product_traits<QInt32, double> {
enum {
// Cost = NumTraits<T>::MulCost,
Defined = 1
};
typedef QInt32 ReturnType;
};
}
// Wrap the 8-bit int in a QInt8 struct instead of using a typedef, to prevent
// the compiler from silently casting the mantissa to a wider or narrower
// representation.
struct QInt8 {
QInt8() {}
QInt8(const int8_t v) : value(v) {}
QInt8(const QInt32 v);
operator int() const { return static_cast<int>(value); }
int8_t value;
};
struct QUInt8 {
QUInt8() {}
QUInt8(const uint8_t v) : value(v) {}
QUInt8(const QInt32 v);
operator int() const { return static_cast<int>(value); }
uint8_t value;
};
struct QInt16 {
QInt16() {}
QInt16(const int16_t v) : value(v) {}
QInt16(const QInt32 v);
operator int() const { return static_cast<int>(value); }
int16_t value;
};
struct QUInt16 {
QUInt16() {}
QUInt16(const uint16_t v) : value(v) {}
QUInt16(const QInt32 v);
operator int() const { return static_cast<int>(value); }
uint16_t value;
};
struct QInt32 {
QInt32() {}
QInt32(const int8_t v) : value(v) {}
QInt32(const int32_t v) : value(v) {}
QInt32(const uint32_t v) : value(static_cast<int32_t>(v)) {}
QInt32(const QInt8 v) : value(v.value) {}
QInt32(const float v) : value(static_cast<int32_t>(lrint(v))) {}
#ifdef EIGEN_MAKING_DOCS
// Workaround to fix build on PPC.
QInt32(unsigned long v) : value(v) {}
#endif
operator float() const { return static_cast<float>(value); }
int32_t value;
};
EIGEN_STRONG_INLINE QInt8::QInt8(const QInt32 v)
: value(v.value > 127 ? 127 : (v.value < -128 ? -128 : v.value)) {}
EIGEN_STRONG_INLINE QUInt8::QUInt8(const QInt32 v)
: value(v.value > 255 ? 255 : (v.value < 0 ? 0 : v.value)) {}
EIGEN_STRONG_INLINE QInt16::QInt16(const QInt32 v)
: value(v.value > 32767 ? 32767 : (v.value < -32768 ? -32768 : v.value)) {}
EIGEN_STRONG_INLINE QUInt16::QUInt16(const QInt32 v)
: value(v.value > 65535 ? 65535 : (v.value < 0 ? 0 : v.value)) {}
// Basic widening 8-bit operations: This will be vectorized in future CLs.
EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt8 b) {
return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QUInt8 b) {
return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt8 b) {
return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt8 b) {
return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value));
}
// Basic widening 16-bit operations: This will be vectorized in future CLs.
EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt16 b) {
return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QUInt16 b) {
return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt16 b) {
return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt16 b) {
return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value));
}
// Mixed QInt32 op QInt8 operations. This will be vectorized in future CLs.
EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt8 b) {
return QInt32(a.value + static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt32 b) {
return QInt32(static_cast<int32_t>(a.value) + b.value);
}
EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt8 b) {
return QInt32(a.value - static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt32 b) {
return QInt32(static_cast<int32_t>(a.value) - b.value);
}
EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt8 b) {
return QInt32(a.value * static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt32 b) {
return QInt32(static_cast<int32_t>(a.value) * b.value);
}
// Mixed QInt32 op QInt16 operations. This will be vectorized in future CLs.
EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt16 b) {
return QInt32(a.value + static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt32 b) {
return QInt32(static_cast<int32_t>(a.value) + b.value);
}
EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt16 b) {
return QInt32(a.value - static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt32 b) {
return QInt32(static_cast<int32_t>(a.value) - b.value);
}
EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt16 b) {
return QInt32(a.value * static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt32 b) {
return QInt32(static_cast<int32_t>(a.value) * b.value);
}
// Mixed QInt32 op QUInt8 operations. This will be vectorized in future CLs.
EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt8 b) {
return QInt32(a.value + static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator+(const QUInt8 a, const QInt32 b) {
return QInt32(static_cast<int32_t>(a.value) + b.value);
}
EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt8 b) {
return QInt32(a.value - static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator-(const QUInt8 a, const QInt32 b) {
return QInt32(static_cast<int32_t>(a.value) - b.value);
}
EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt8 b) {
return QInt32(a.value * static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator*(const QUInt8 a, const QInt32 b) {
return QInt32(static_cast<int32_t>(a.value) * b.value);
}
// Mixed QInt32 op QUInt16 operations. This will be vectorized in future CLs.
EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt16 b) {
return QInt32(a.value + static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator+(const QUInt16 a, const QInt32 b) {
return QInt32(static_cast<int32_t>(a.value) + b.value);
}
EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt16 b) {
return QInt32(a.value - static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator-(const QUInt16 a, const QInt32 b) {
return QInt32(static_cast<int32_t>(a.value) - b.value);
}
EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt16 b) {
return QInt32(a.value * static_cast<int32_t>(b.value));
}
EIGEN_STRONG_INLINE QInt32 operator*(const QUInt16 a, const QInt32 b) {
return QInt32(static_cast<int32_t>(a.value) * b.value);
}
// Basic arithmetic operations on QInt32, which behaves like an int32_t.
EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt32 b) {
return a.value + b.value;
}
EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt32 b) {
return a.value - b.value;
}
EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt32 b) {
return a.value * b.value;
}
EIGEN_STRONG_INLINE QInt32 operator/(const QInt32 a, const QInt32 b) {
return a.value / b.value;
}
EIGEN_STRONG_INLINE QInt32& operator+=(QInt32& a, const QInt32 b) {
a.value += b.value;
return a;
}
EIGEN_STRONG_INLINE QInt32& operator-=(QInt32& a, const QInt32 b) {
a.value -= b.value;
return a;
}
EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const QInt32 b) {
a.value *= b.value;
return a;
}
EIGEN_STRONG_INLINE QInt32& operator/=(QInt32& a, const QInt32 b) {
a.value /= b.value;
return a;
}
EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a) {
return -a.value;
}
// Scaling QInt32 by double. We do the arithmetic in double because
// float only has 23 bits of mantissa, so casting QInt32 to float might reduce
// accuracy by discarding up to 7 (least significant) bits.
EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const double b) {
return static_cast<int32_t>(lrint(static_cast<double>(a.value) * b));
}
EIGEN_STRONG_INLINE QInt32 operator*(const double a, const QInt32 b) {
return static_cast<int32_t>(lrint(a * static_cast<double>(b.value)));
}
EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const double b) {
a.value = static_cast<int32_t>(lrint(static_cast<double>(a.value) * b));
return a;
}
// Comparisons
EIGEN_STRONG_INLINE bool operator==(const QInt8 a, const QInt8 b) {
return a.value == b.value;
}
EIGEN_STRONG_INLINE bool operator==(const QUInt8 a, const QUInt8 b) {
return a.value == b.value;
}
EIGEN_STRONG_INLINE bool operator==(const QInt16 a, const QInt16 b) {
return a.value == b.value;
}
EIGEN_STRONG_INLINE bool operator==(const QUInt16 a, const QUInt16 b) {
return a.value == b.value;
}
EIGEN_STRONG_INLINE bool operator==(const QInt32 a, const QInt32 b) {
return a.value == b.value;
}
EIGEN_STRONG_INLINE bool operator<(const QInt8 a, const QInt8 b) {
return a.value < b.value;
}
EIGEN_STRONG_INLINE bool operator<(const QUInt8 a, const QUInt8 b) {
return a.value < b.value;
}
EIGEN_STRONG_INLINE bool operator<(const QInt16 a, const QInt16 b) {
return a.value < b.value;
}
EIGEN_STRONG_INLINE bool operator<(const QUInt16 a, const QUInt16 b) {
return a.value < b.value;
}
EIGEN_STRONG_INLINE bool operator<(const QInt32 a, const QInt32 b) {
return a.value < b.value;
}
EIGEN_STRONG_INLINE bool operator>(const QInt8 a, const QInt8 b) {
return a.value > b.value;
}
EIGEN_STRONG_INLINE bool operator>(const QUInt8 a, const QUInt8 b) {
return a.value > b.value;
}
EIGEN_STRONG_INLINE bool operator>(const QInt16 a, const QInt16 b) {
return a.value > b.value;
}
EIGEN_STRONG_INLINE bool operator>(const QUInt16 a, const QUInt16 b) {
return a.value > b.value;
}
EIGEN_STRONG_INLINE bool operator>(const QInt32 a, const QInt32 b) {
return a.value > b.value;
}
EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt8 a) {
os << static_cast<int>(a.value);
return os;
}
EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QUInt8 a) {
os << static_cast<int>(a.value);
return os;
}
EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt16 a) {
os << static_cast<int>(a.value);
return os;
}
EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QUInt16 a) {
os << static_cast<int>(a.value);
return os;
}
EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt32 a) {
os << a.value;
return os;
}
} // namespace Eigen
#endif // EIGEN_CXX11_FIXED_POINT_TYPES_H
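A brief usage sketch of the scalars defined above (a standalone illustration,
assuming the FixedPoint module header shown earlier is reachable on the
include path): products widen to QInt32, narrowing constructors saturate, and
scaling by a double rounds through lrint.

// fixed_point_types_example.cc -- hypothetical, for illustration only.
#include <cassert>
#include "unsupported/Eigen/CXX11/FixedPoint"  // assumed include path

int main() {
  using Eigen::QInt8;
  using Eigen::QInt32;

  QInt8 a(100), b(50);
  QInt32 acc = a * b;         // widening product: 100 * 50 = 5000, no overflow
  acc += QInt32(1000);        // QInt32 arithmetic behaves like int32_t
  assert(acc.value == 6000);

  QInt8 narrowed(acc);        // narrowing constructor saturates to [-128, 127]
  assert(narrowed.value == 127);

  QInt32 scaled = acc * 0.5;  // scaling by double rounds via lrint
  assert(scaled.value == 3000);
  return 0;
}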
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H
#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H
namespace Eigen {
namespace internal {
// Accumulate the product of two QInt8 inputs in 32 bits to prevent
// overflow.
template<> struct scalar_product_traits<QInt8, QInt8>
{
enum {
Defined = 1
};
typedef QInt32 ReturnType;
};
// Accumulate the product of QInt8 inputs with QUInt8 inputs in 32 bits
// to prevent overflow.
template<> struct scalar_product_traits<QInt8, QUInt8>
{
enum {
Defined = 1
};
typedef QInt32 ReturnType;
};
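// Rough bound (illustrative): each QInt8 * QInt8 term has magnitude at most
// 128 * 128 = 2^14, so a QInt32 accumulator can absorb on the order of
// 2^31 / 2^14 = 2^17 (~131k) such terms before it can overflow, which is why
// the products below accumulate into QInt32 rather than a narrower type.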
// Description of the product implementation. It's pretty simple now since
// nothing is vectorized yet.
// This definition tackles the case where both the lhs and the rhs are encoded
// using signed 8-bit integers.
#ifndef EIGEN_USE_OPTIMIZED_INT8_INT8_MAT_MAT_PRODUCT
template<bool _ConjLhs, bool _ConjRhs>
class gebp_traits<QInt8, QInt8, _ConjLhs, _ConjRhs>
{
public:
typedef QInt8 LhsScalar;
typedef QInt8 RhsScalar;
typedef QInt32 ResScalar;
enum {
// register block size along the M and N directions
// One for the current implementation
nr = 1,
mr = 1,
// Progress made at each iteration of the product loop
// also 1 for the current implementation
LhsProgress = 1,
RhsProgress = 1
};
};
// The signed 8-bit Mat-Mat product itself.
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
EIGEN_DONT_INLINE
void operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB,
Index rows, Index depth, Index cols, QInt32 alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
::operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB,
Index rows, Index depth, Index cols, QInt32 alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(alpha.value == 1);
eigen_assert(strideA == -1);
eigen_assert(strideB == -1);
eigen_assert(offsetA == 0);
eigen_assert(offsetB == 0);
eigen_assert(rows > 0);
eigen_assert(cols > 0);
eigen_assert(depth > 0);
eigen_assert(blockA);
eigen_assert(blockB);
for (Index j = 0; j < cols; ++j) {
Index startB = j * depth;
for (Index i = 0; i < rows; ++i) {
Index startA = i * depth;
for (Index k = 0; k < depth; ++k) {
res(i, j) += blockA[startA + k] * blockB[startB + k];
}
}
}
}
#endif
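// Hypothetical usage sketch (an assumption for illustration, not code from
// this header): the traits above make a quantized matrix product accumulate
// into QInt32, e.g.
//   Eigen::Matrix<Eigen::QInt8, Eigen::Dynamic, Eigen::Dynamic> a(m, k), b(k, n);
//   Eigen::Matrix<Eigen::QInt32, Eigen::Dynamic, Eigen::Dynamic> c = a * b;
// where m, k and n are illustrative dimensions.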
// This definition tackles the case where the lhs is encoded using signed 8-bit
// integers and the rhs using unsigned 8-bit integers.
#ifndef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
template<bool _ConjLhs, bool _ConjRhs>
class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs>
{
public:
typedef QInt8 LhsScalar;
typedef QUInt8 RhsScalar;
typedef QInt32 ResScalar;
enum {
// register block size along the M and N directions
// One for the current implementation
nr = 1,
mr = 1,
// Progress made at each iteration of the product loop
// also 1 for the current implementation
LhsProgress = 1,
RhsProgress = 1
};
};
// Mat-Mat product of a signed 8-bit lhs with an unsigned 8-bit rhs
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
EIGEN_DONT_INLINE
void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
Index rows, Index depth, Index cols, QInt32 alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
Index rows, Index depth, Index cols, QInt32 alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(alpha.value == 1);
eigen_assert(strideA == -1);
eigen_assert(strideB == -1);
eigen_assert(offsetA == 0);
eigen_assert(offsetB == 0);
eigen_assert(rows > 0);
eigen_assert(cols > 0);
eigen_assert(depth > 0);
eigen_assert(blockA);
eigen_assert(blockB);
for (Index j = 0; j < cols; ++j) {
Index startB = j * depth;
for (Index i = 0; i < rows; ++i) {
Index startA = i * depth;
for (Index k = 0; k < depth; ++k) {
res(i, j) += blockA[startA + k] * blockB[startB + k];
}
}
}
}
#endif
// This definition tackles the case where the lhs is encoded using unsigned
// 8-bit integers and the rhs using signed 8-bit integers.
#ifndef EIGEN_USE_OPTIMIZED_UINT8_INT8_MAT_MAT_PRODUCT
template<bool _ConjLhs, bool _ConjRhs>
class gebp_traits<QUInt8, QInt8, _ConjLhs, _ConjRhs>
{
public:
typedef QUInt8 LhsScalar;
typedef QInt8 RhsScalar;
typedef QInt32 ResScalar;
enum {
// register block size along the M and N directions
// One for the current implementation
nr = 1,
mr = 1,
// Progress made at each iteration of the product loop
// also 1 for the current implementation
LhsProgress = 1,
RhsProgress = 1
};
};
// Mat-Mat product of an unsigned 8-bit lhs with a signed 8-bit rhs
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
EIGEN_DONT_INLINE
void operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB,
Index rows, Index depth, Index cols, QInt32 alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
::operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB,
Index rows, Index depth, Index cols, QInt32 alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(alpha.value == 1);
eigen_assert(strideA == -1);
eigen_assert(strideB == -1);
eigen_assert(offsetA == 0);
eigen_assert(offsetB == 0);
eigen_assert(rows > 0);
eigen_assert(cols > 0);
eigen_assert(depth > 0);
eigen_assert(blockA);
eigen_assert(blockB);
for (Index j = 0; j < cols; ++j) {
Index startB = j * depth;
for (Index i = 0; i < rows; ++i) {
Index startA = i * depth;
for (Index k = 0; k < depth; ++k) {
res(i, j) += blockA[startA + k] * blockB[startB + k];
}
}
}
}
#endif
} // namespace internal
} // namespace Eigen
#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2015 Matthew Sarett <msarett@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H
#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H
namespace Eigen {
namespace internal {
// AVX2 optimized implementation of Mat-Mat product.
// LHS is encoded using signed 8-bit integers.
// RHS is encoded using unsigned 8-bit integers.
#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
// Define quantized traits
template<bool _ConjLhs, bool _ConjRhs>
class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs>
{
public:
typedef QInt8 LhsScalar;
typedef QUInt8 RhsScalar;
typedef QInt32 ResScalar;
enum {
// Define register blocking scheme.
nr = 32,
mr = 32,
kr = 8,
// Ignore progress tracking per loop iteration.
LhsProgress = -1,
RhsProgress = -1
};
};
// Specialized blocking for quantized implementations.
// Used by TensorContractionThreadPool; inputs must have dimensions that are
// multiples of 32.
template<typename Index,
typename LeftTensor,
typename left_nocontract_t, typename left_contract_t,
bool left_inner_dim_contiguous, bool left_inner_dim_reordered, int LeftAlignment,
typename RightTensor,
typename right_nocontract_t, typename right_contract_t,
bool right_inner_dim_contiguous, bool right_inner_dim_reordered, int RightAlignment, int ShardingType>
class TensorContractionBlocking<TensorContractionInputMapper<QInt8, Index, Lhs, LeftTensor, left_nocontract_t, left_contract_t, 32, left_inner_dim_contiguous, left_inner_dim_reordered, LeftAlignment>, TensorContractionInputMapper<QUInt8, Index, Rhs, RightTensor, right_nocontract_t, right_contract_t, 32, right_inner_dim_contiguous, right_inner_dim_reordered, RightAlignment>, Index, ShardingType> {
public:
typedef QInt8 LhsScalar;
typedef QUInt8 RhsScalar;
TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
kc_(k), mc_(m), nc_(n)
{
eigen_assert(m % 32 == 0);
eigen_assert(k % 32 == 0);
if (!k || !m || !n) {
return;
}
if (ShardingType == ShardByCol) {
eigen_assert(n % 32 == 0);
nc_ = (((n / num_threads) + 31) / 32) * 32;
}
else {
eigen_assert(n % 32 == 0 || n == 1);
// Special case to avoid breaking the unimplemented matrix-vector case
if (n == 1) {
nc_ = 32;
}
mc_ = (((m / num_threads) + 31) / 32) * 32;
}
}
EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
private:
Index kc_;
Index mc_;
Index nc_;
};
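// Example of the rounding above (illustrative): for n = 96, num_threads = 4
// and ShardByCol, nc_ = (((96 / 4) + 31) / 32) * 32 = 32, so each thread
// still receives a column block that is a multiple of 32.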
// Specialized blocking for quantized implementations.
// Used by TensorContraction and GeneralMatrixMatrix; inputs are padded to
// multiples of 32.
template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
class gemm_blocking_space<ColMajor, QInt8, QInt8, MaxRows, MaxCols, MaxDepth,
KcFactor, false>
: public level3_blocking<QInt8, QInt8> {
DenseIndex m_sizeA;
DenseIndex m_sizeB;
public:
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth,
DenseIndex /*num_threads*/, bool /*l3_blocking*/) {
this->m_mc = ((rows + 31) / 32) * 32;
this->m_nc = ((cols + 31) / 32) * 32;
this->m_kc = ((depth + 31) / 32) * 32;
m_sizeA = this->m_mc * this->m_kc;
m_sizeB = this->m_kc * this->m_nc;
}
void allocateA() {
if (this->m_blockA == 0) this->m_blockA = aligned_new<QInt8>(m_sizeA);
}
void allocateB() {
if (this->m_blockB == 0) this->m_blockB = aligned_new<QInt8>(m_sizeB);
}
void allocateAll() {
allocateA();
allocateB();
}
~gemm_blocking_space() {
aligned_delete(this->m_blockA, m_sizeA);
aligned_delete(this->m_blockB, m_sizeB);
}
};
template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
class gemm_blocking_space<ColMajor, QInt8, QUInt8, MaxRows, MaxCols, MaxDepth,
KcFactor, false>
: public level3_blocking<QInt8, QUInt8> {
DenseIndex m_sizeA;
DenseIndex m_sizeB;
public:
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth,
DenseIndex /*num_threads*/, bool /*l3_blocking*/) {
this->m_mc = ((rows + 31) / 32) * 32;
this->m_nc = ((cols + 31) / 32) * 32;
this->m_kc = ((depth + 31) / 32) * 32;
m_sizeA = this->m_mc * this->m_kc;
m_sizeB = this->m_kc * this->m_nc;
}
void allocateA() {
if (this->m_blockA == 0) this->m_blockA = aligned_new<QInt8>(m_sizeA);
}
void allocateB() {
if (this->m_blockB == 0) this->m_blockB = aligned_new<QUInt8>(m_sizeB);
}
void allocateAll() {
allocateA();
allocateB();
}
~gemm_blocking_space() {
aligned_delete(this->m_blockA, m_sizeA);
aligned_delete(this->m_blockB, m_sizeB);
}
};
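// Example of the padding above (illustrative): rows = 100, cols = 50 and
// depth = 40 give m_mc = 128, m_nc = 64 and m_kc = 64, so
// m_sizeA = 128 * 64 = 8192 and m_sizeB = 64 * 64 = 4096 elements, with the
// extra space covered by the zero padding performed in the packing routines.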
// Alternate templates for any input sizes
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
struct gemm_pack_lhs_any;
template <typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> {
EIGEN_DONT_INLINE void operator()
(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
};
template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
struct gemm_pack_rhs_any;
template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
EIGEN_DONT_INLINE void operator()
(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
struct gebp_kernel_any;
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
typedef typename DataMapper::LinearMapper LinearMapper;
EIGEN_DONT_INLINE
void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
Index rows, Index depth, Index cols, QInt32 alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
// Alternate implementations for any input sizes
template <typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>::
operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
eigen_assert(stride == 0);
eigen_assert(offset == 0);
// Get vector pointer
__m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA);
// Get even multiples of the dimensions
Index rows_32 = (rows / 32) * 32;
Index depth_8 = (depth / 8) * 8;
// Get padding for when depth is not a multiple of 32
int padding = 0;
if (depth % 32 != 0) {
int depth_32 = (depth / 32) * 32;
int extra_depth = depth - depth_32;
int extra_depth_8 = ((extra_depth + 7) / 8) * 8;
padding = 32 - extra_depth_8;
}
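  // Example (illustrative): depth = 20 gives depth_8 = 16, so the main loop
  // writes 16 vectors, the remainder chunk writes 8 more, and
  // padding = 32 - 24 = 8 unused __m256i slots are skipped per 32-row block so
  // that the kernel can index blockA as if depth were a multiple of 32.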
// Pack rows in sets of 32
for (Index m = 0; m < rows_32; m += 32) {
// Pack depth in sets of 8
for (Index k = 0; k < depth_8; k += 8) {
// Load vectors
__m256i L_A = lhs.loadPacket(m, k);
__m256i L_B = lhs.loadPacket(m, k + 1);
// Interleave 8-bit elements
__m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
__m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
__m256i L_C = lhs.loadPacket(m, k + 2);
__m256i L_D = lhs.loadPacket(m, k + 3);
__m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
__m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
// Interleave 16-bit elements
__m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
__m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
// Use permute before we store to cross 128-bit lanes
__m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
_mm256_store_si256(blockA_256++, L_AD0);
// Complete packing for 32 x 8 block
__m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
__m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
__m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
__m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
_mm256_store_si256(blockA_256++, L_AD8);
_mm256_store_si256(blockA_256++, L_AD16);
__m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
_mm256_store_si256(blockA_256++, L_AD24);
__m256i L_E = lhs.loadPacket(m, k + 4);
__m256i L_F = lhs.loadPacket(m, k + 5);
__m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
__m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
__m256i L_G = lhs.loadPacket(m, k + 6);
__m256i L_H = lhs.loadPacket(m, k + 7);
__m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
__m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
__m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
__m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
__m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
_mm256_store_si256(blockA_256++, L_EH0);
__m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
__m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
__m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
__m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
_mm256_store_si256(blockA_256++, L_EH8);
_mm256_store_si256(blockA_256++, L_EH16);
__m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
_mm256_store_si256(blockA_256++, L_EH24);
}
// Finish the k dimension, padding with zeros
if (depth_8 < depth) {
__m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H;
switch (depth - depth_8) {
case 1:
L_A = lhs.loadPacket(m, depth_8);
L_B = _mm256_setzero_si256();
L_C = _mm256_setzero_si256();
L_D = _mm256_setzero_si256();
L_E = _mm256_setzero_si256();
L_F = _mm256_setzero_si256();
L_G = _mm256_setzero_si256();
L_H = _mm256_setzero_si256();
break;
case 2:
L_A = lhs.loadPacket(m, depth_8);
L_B = lhs.loadPacket(m, depth_8 + 1);
L_C = _mm256_setzero_si256();
L_D = _mm256_setzero_si256();
L_E = _mm256_setzero_si256();
L_F = _mm256_setzero_si256();
L_G = _mm256_setzero_si256();
L_H = _mm256_setzero_si256();
break;
case 3:
L_A = lhs.loadPacket(m, depth_8);
L_B = lhs.loadPacket(m, depth_8 + 1);
L_C = lhs.loadPacket(m, depth_8 + 2);
L_D = _mm256_setzero_si256();
L_E = _mm256_setzero_si256();
L_F = _mm256_setzero_si256();
L_G = _mm256_setzero_si256();
L_H = _mm256_setzero_si256();
break;
case 4:
L_A = lhs.loadPacket(m, depth_8);
L_B = lhs.loadPacket(m, depth_8 + 1);
L_C = lhs.loadPacket(m, depth_8 + 2);
L_D = lhs.loadPacket(m, depth_8 + 3);
L_E = _mm256_setzero_si256();
L_F = _mm256_setzero_si256();
L_G = _mm256_setzero_si256();
L_H = _mm256_setzero_si256();
break;
case 5:
L_A = lhs.loadPacket(m, depth_8);
L_B = lhs.loadPacket(m, depth_8 + 1);
L_C = lhs.loadPacket(m, depth_8 + 2);
L_D = lhs.loadPacket(m, depth_8 + 3);
L_E = lhs.loadPacket(m, depth_8 + 4);
L_F = _mm256_setzero_si256();
L_G = _mm256_setzero_si256();
L_H = _mm256_setzero_si256();
break;
case 6:
L_A = lhs.loadPacket(m, depth_8);
L_B = lhs.loadPacket(m, depth_8 + 1);
L_C = lhs.loadPacket(m, depth_8 + 2);
L_D = lhs.loadPacket(m, depth_8 + 3);
L_E = lhs.loadPacket(m, depth_8 + 4);
L_F = lhs.loadPacket(m, depth_8 + 5);
L_G = _mm256_setzero_si256();
L_H = _mm256_setzero_si256();
break;
case 7:
L_A = lhs.loadPacket(m, depth_8);
L_B = lhs.loadPacket(m, depth_8 + 1);
L_C = lhs.loadPacket(m, depth_8 + 2);
L_D = lhs.loadPacket(m, depth_8 + 3);
L_E = lhs.loadPacket(m, depth_8 + 4);
L_F = lhs.loadPacket(m, depth_8 + 5);
L_G = lhs.loadPacket(m, depth_8 + 6);
L_H = _mm256_setzero_si256();
break;
}
// Interleave 8-bit elements
__m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
__m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
__m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
__m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
// Interleave 16-bit elements
__m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
__m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
// Use permute before we store to cross 128-bit lanes
__m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
_mm256_store_si256(blockA_256++, L_AD0);
// Complete packing
__m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
__m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
__m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
__m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
_mm256_store_si256(blockA_256++, L_AD8);
_mm256_store_si256(blockA_256++, L_AD16);
__m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
_mm256_store_si256(blockA_256++, L_AD24);
__m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
__m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
__m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
__m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
__m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
__m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
__m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
_mm256_store_si256(blockA_256++, L_EH0);
__m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
__m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
__m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
__m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
_mm256_store_si256(blockA_256++, L_EH8);
_mm256_store_si256(blockA_256++, L_EH16);
__m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
_mm256_store_si256(blockA_256++, L_EH24);
}
blockA_256 += padding;
}
// Finish the m dimension, padding with zeros
if (rows_32 < rows) {
// Pack depth in sets of 8
for (Index k = 0; k < depth_8; k += 8) {
// Load vectors
__m256i L_A = _mm256_setzero_si256();
__m256i L_B = _mm256_setzero_si256();
__m256i L_C = _mm256_setzero_si256();
__m256i L_D = _mm256_setzero_si256();
__m256i L_E = _mm256_setzero_si256();
__m256i L_F = _mm256_setzero_si256();
__m256i L_G = _mm256_setzero_si256();
__m256i L_H = _mm256_setzero_si256();
for (Index m = 0; m < rows - rows_32; m++) {
QInt8* ptr = (QInt8*) &L_A;
ptr[m] = lhs(rows_32 + m, k);
ptr = (QInt8*) &L_B;
ptr[m] = lhs(rows_32 + m, k + 1);
ptr = (QInt8*) &L_C;
ptr[m] = lhs(rows_32 + m, k + 2);
ptr = (QInt8*) &L_D;
ptr[m] = lhs(rows_32 + m, k + 3);
ptr = (QInt8*) &L_E;
ptr[m] = lhs(rows_32 + m, k + 4);
ptr = (QInt8*) &L_F;
ptr[m] = lhs(rows_32 + m, k + 5);
ptr = (QInt8*) &L_G;
ptr[m] = lhs(rows_32 + m, k + 6);
ptr = (QInt8*) &L_H;
ptr[m] = lhs(rows_32 + m, k + 7);
}
// Interleave 8-bit elements
__m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
__m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
__m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
__m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
// Interleave 16-bit elements
__m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
__m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
// Use permute before we store to cross 128-bit lanes
__m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
_mm256_store_si256(blockA_256++, L_AD0);
// Complete packing for 32 x 8 block
__m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
__m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
__m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
__m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
_mm256_store_si256(blockA_256++, L_AD8);
_mm256_store_si256(blockA_256++, L_AD16);
__m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
_mm256_store_si256(blockA_256++, L_AD24);
__m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
__m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
__m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
__m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
__m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
__m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
__m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
_mm256_store_si256(blockA_256++, L_EH0);
__m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
__m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
__m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
__m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
_mm256_store_si256(blockA_256++, L_EH8);
_mm256_store_si256(blockA_256++, L_EH16);
__m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
_mm256_store_si256(blockA_256++, L_EH24);
}
// Finish the k dimension, padding with zeros
if (depth_8 < depth) {
__m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H;
QInt8* ptr;
switch (depth - depth_8) {
case 1:
L_A = _mm256_setzero_si256();
L_B = _mm256_setzero_si256();
L_C = _mm256_setzero_si256();
L_D = _mm256_setzero_si256();
L_E = _mm256_setzero_si256();
L_F = _mm256_setzero_si256();
L_G = _mm256_setzero_si256();
L_H = _mm256_setzero_si256();
for (Index m = 0; m < rows - rows_32; m++) {
          ptr = (QInt8*) &L_A;
ptr[m] = lhs(rows_32 + m, depth_8);
}
break;
case 2:
L_A = _mm256_setzero_si256();
L_B = _mm256_setzero_si256();
L_C = _mm256_setzero_si256();
L_D = _mm256_setzero_si256();
L_E = _mm256_setzero_si256();
L_F = _mm256_setzero_si256();
L_G = _mm256_setzero_si256();
L_H = _mm256_setzero_si256();
for (Index m = 0; m < rows - rows_32; m++) {
ptr = (QInt8*) &L_A;
ptr[m] = lhs(rows_32 + m, depth_8);
ptr = (QInt8*) &L_B;
ptr[m] = lhs(rows_32 + m, depth_8 + 1);
}
break;
case 3:
L_A = _mm256_setzero_si256();
L_B = _mm256_setzero_si256();
L_C = _mm256_setzero_si256();
L_D = _mm256_setzero_si256();
L_E = _mm256_setzero_si256();
L_F = _mm256_setzero_si256();
L_G = _mm256_setzero_si256();
L_H = _mm256_setzero_si256();
for (Index m = 0; m < rows - rows_32; m++) {
ptr = (QInt8*) &L_A;
ptr[m] = lhs(rows_32 + m, depth_8);
ptr = (QInt8*) &L_B;
ptr[m] = lhs(rows_32 + m, depth_8 + 1);
ptr = (QInt8*) &L_C;
ptr[m] = lhs(rows_32 + m, depth_8 + 2);
}
break;
case 4:
L_A = _mm256_setzero_si256();
L_B = _mm256_setzero_si256();
L_C = _mm256_setzero_si256();
L_D = _mm256_setzero_si256();
L_E = _mm256_setzero_si256();
L_F = _mm256_setzero_si256();
L_G = _mm256_setzero_si256();
L_H = _mm256_setzero_si256();
for (Index m = 0; m < rows - rows_32; m++) {
ptr = (QInt8*) &L_A;
ptr[m] = lhs(rows_32 + m, depth_8);
ptr = (QInt8*) &L_B;
ptr[m] = lhs(rows_32 + m, depth_8 + 1);
ptr = (QInt8*) &L_C;
ptr[m] = lhs(rows_32 + m, depth_8 + 2);
ptr = (QInt8*) &L_D;
ptr[m] = lhs(rows_32 + m, depth_8 + 3);
}
break;
case 5:
L_A = _mm256_setzero_si256();
L_B = _mm256_setzero_si256();
L_C = _mm256_setzero_si256();
L_D = _mm256_setzero_si256();
L_E = _mm256_setzero_si256();
L_F = _mm256_setzero_si256();
L_G = _mm256_setzero_si256();
L_H = _mm256_setzero_si256();
for (Index m = 0; m < rows - rows_32; m++) {
ptr = (QInt8*) &L_A;
ptr[m] = lhs(rows_32 + m, depth_8);
ptr = (QInt8*) &L_B;
ptr[m] = lhs(rows_32 + m, depth_8 + 1);
ptr = (QInt8*) &L_C;
ptr[m] = lhs(rows_32 + m, depth_8 + 2);
ptr = (QInt8*) &L_D;
ptr[m] = lhs(rows_32 + m, depth_8 + 3);
ptr = (QInt8*) &L_E;
ptr[m] = lhs(rows_32 + m, depth_8 + 4);
}
break;
case 6:
L_A = _mm256_setzero_si256();
L_B = _mm256_setzero_si256();
L_C = _mm256_setzero_si256();
L_D = _mm256_setzero_si256();
L_E = _mm256_setzero_si256();
L_F = _mm256_setzero_si256();
L_G = _mm256_setzero_si256();
L_H = _mm256_setzero_si256();
for (Index m = 0; m < rows - rows_32; m++) {
ptr = (QInt8*) &L_A;
ptr[m] = lhs(rows_32 + m, depth_8);
ptr = (QInt8*) &L_B;
ptr[m] = lhs(rows_32 + m, depth_8 + 1);
ptr = (QInt8*) &L_C;
ptr[m] = lhs(rows_32 + m, depth_8 + 2);
ptr = (QInt8*) &L_D;
ptr[m] = lhs(rows_32 + m, depth_8 + 3);
ptr = (QInt8*) &L_E;
ptr[m] = lhs(rows_32 + m, depth_8 + 4);
ptr = (QInt8*) &L_F;
ptr[m] = lhs(rows_32 + m, depth_8 + 5);
}
break;
case 7:
L_A = _mm256_setzero_si256();
L_B = _mm256_setzero_si256();
L_C = _mm256_setzero_si256();
L_D = _mm256_setzero_si256();
L_E = _mm256_setzero_si256();
L_F = _mm256_setzero_si256();
L_G = _mm256_setzero_si256();
L_H = _mm256_setzero_si256();
for (Index m = 0; m < rows - rows_32; m++) {
ptr = (QInt8*) &L_A;
ptr[m] = lhs(rows_32 + m, depth_8);
ptr = (QInt8*) &L_B;
ptr[m] = lhs(rows_32 + m, depth_8 + 1);
ptr = (QInt8*) &L_C;
ptr[m] = lhs(rows_32 + m, depth_8 + 2);
ptr = (QInt8*) &L_D;
ptr[m] = lhs(rows_32 + m, depth_8 + 3);
ptr = (QInt8*) &L_E;
ptr[m] = lhs(rows_32 + m, depth_8 + 4);
ptr = (QInt8*) &L_F;
ptr[m] = lhs(rows_32 + m, depth_8 + 5);
ptr = (QInt8*) &L_G;
ptr[m] = lhs(rows_32 + m, depth_8 + 6);
}
break;
}
// Interleave 8-bit elements
__m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
__m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
__m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
__m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
// Interleave 16-bit elements
__m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
__m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
// Use permute before we store to cross 128-bit lanes
__m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
_mm256_store_si256(blockA_256++, L_AD0);
// Complete packing
__m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
__m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
__m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
__m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
_mm256_store_si256(blockA_256++, L_AD8);
_mm256_store_si256(blockA_256++, L_AD16);
__m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
_mm256_store_si256(blockA_256++, L_AD24);
__m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
__m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
__m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
__m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
__m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
__m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
__m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
_mm256_store_si256(blockA_256++, L_EH0);
__m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
__m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
__m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
__m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
_mm256_store_si256(blockA_256++, L_EH8);
_mm256_store_si256(blockA_256++, L_EH16);
__m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
_mm256_store_si256(blockA_256++, L_EH24);
}
}
}
template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::
operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
eigen_assert(stride == 0);
eigen_assert(offset == 0);
// Get vector pointer
__m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB);
// Get even multiples of the dimensions
Index cols_32 = (cols / 32) * 32;
Index depth_32 = (depth / 32) * 32;
// Perform a step of the packing for 4 columns
__m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24;
#define PACK_STEP \
R_AB_L = _mm256_unpacklo_epi64(R_A, R_B); \
R_CD_L = _mm256_unpacklo_epi64(R_C, R_D); \
R_AB_H = _mm256_unpackhi_epi64(R_A, R_B); \
R_CD_H = _mm256_unpackhi_epi64(R_C, R_D); \
R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20); \
R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \
R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20); \
R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \
_mm256_store_si256(blockB_256, R_AD_0); \
_mm256_store_si256(blockB_256 + 8, R_AD_8); \
_mm256_store_si256(blockB_256 + 16, R_AD_16); \
_mm256_store_si256(blockB_256 + 24, R_AD_24); \
blockB_256++;
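  // Each PACK_STEP consumes four 32-deep columns and scatters them so that
  // blockB ends up grouped by depth: slots 0-7 hold depths 0-7 of all 32
  // columns, slots 8-15 hold depths 8-15, and so on. That is why the stores go
  // to offsets 0, 8, 16 and 24 while the pointer only advances by one, and why
  // the loops below add 24 afterwards to skip the slots already written.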
// Pack cols in sets of 32
for (Index n = 0; n < cols_32; n += 32) {
// Pack depth in sets of 32
for (Index k = 0; k < depth_32; k += 32) {
__m256i R_A = rhs.loadPacket(k, n);
__m256i R_B = rhs.loadPacket(k, n + 1);
__m256i R_C = rhs.loadPacket(k, n + 2);
__m256i R_D = rhs.loadPacket(k, n + 3);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 4);
R_B = rhs.loadPacket(k, n + 5);
R_C = rhs.loadPacket(k, n + 6);
R_D = rhs.loadPacket(k, n + 7);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 8);
R_B = rhs.loadPacket(k, n + 9);
R_C = rhs.loadPacket(k, n + 10);
R_D = rhs.loadPacket(k, n + 11);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 12);
R_B = rhs.loadPacket(k, n + 13);
R_C = rhs.loadPacket(k, n + 14);
R_D = rhs.loadPacket(k, n + 15);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 16);
R_B = rhs.loadPacket(k, n + 17);
R_C = rhs.loadPacket(k, n + 18);
R_D = rhs.loadPacket(k, n + 19);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 20);
R_B = rhs.loadPacket(k, n + 21);
R_C = rhs.loadPacket(k, n + 22);
R_D = rhs.loadPacket(k, n + 23);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 24);
R_B = rhs.loadPacket(k, n + 25);
R_C = rhs.loadPacket(k, n + 26);
R_D = rhs.loadPacket(k, n + 27);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 28);
R_B = rhs.loadPacket(k, n + 29);
R_C = rhs.loadPacket(k, n + 30);
R_D = rhs.loadPacket(k, n + 31);
PACK_STEP;
blockB_256 += 24;
}
if (depth_32 < depth) {
QUInt8* ptr;
__m256i R_A = _mm256_setzero_si256();
__m256i R_B = _mm256_setzero_si256();
__m256i R_C = _mm256_setzero_si256();
__m256i R_D = _mm256_setzero_si256();
for (Index k = depth_32; k < depth; k++) {
ptr = (QUInt8*) &R_A;
ptr[k - depth_32] = rhs(k, n);
ptr = (QUInt8*) &R_B;
ptr[k - depth_32] = rhs(k, n + 1);
ptr = (QUInt8*) &R_C;
ptr[k - depth_32] = rhs(k, n + 2);
ptr = (QUInt8*) &R_D;
ptr[k - depth_32] = rhs(k, n + 3);
}
PACK_STEP;
R_A = _mm256_setzero_si256();
R_B = _mm256_setzero_si256();
R_C = _mm256_setzero_si256();
R_D = _mm256_setzero_si256();
for (Index k = depth_32; k < depth; k++) {
ptr = (QUInt8*) &R_A;
ptr[k - depth_32] = rhs(k, n + 4);
ptr = (QUInt8*) &R_B;
ptr[k - depth_32] = rhs(k, n + 5);
ptr = (QUInt8*) &R_C;
ptr[k - depth_32] = rhs(k, n + 6);
ptr = (QUInt8*) &R_D;
ptr[k - depth_32] = rhs(k, n + 7);
}
PACK_STEP;
R_A = _mm256_setzero_si256();
R_B = _mm256_setzero_si256();
R_C = _mm256_setzero_si256();
R_D = _mm256_setzero_si256();
for (Index k = depth_32; k < depth; k++) {
ptr = (QUInt8*) &R_A;
ptr[k - depth_32] = rhs(k, n + 8);
ptr = (QUInt8*) &R_B;
ptr[k - depth_32] = rhs(k, n + 9);
ptr = (QUInt8*) &R_C;
ptr[k - depth_32] = rhs(k, n + 10);
ptr = (QUInt8*) &R_D;
ptr[k - depth_32] = rhs(k, n + 11);
}
PACK_STEP;
R_A = _mm256_setzero_si256();
R_B = _mm256_setzero_si256();
R_C = _mm256_setzero_si256();
R_D = _mm256_setzero_si256();
for (Index k = depth_32; k < depth; k++) {
ptr = (QUInt8*) &R_A;
ptr[k - depth_32] = rhs(k, n + 12);
ptr = (QUInt8*) &R_B;
ptr[k - depth_32] = rhs(k, n + 13);
ptr = (QUInt8*) &R_C;
ptr[k - depth_32] = rhs(k, n + 14);
ptr = (QUInt8*) &R_D;
ptr[k - depth_32] = rhs(k, n + 15);
}
PACK_STEP;
R_A = _mm256_setzero_si256();
R_B = _mm256_setzero_si256();
R_C = _mm256_setzero_si256();
R_D = _mm256_setzero_si256();
for (Index k = depth_32; k < depth; k++) {
ptr = (QUInt8*) &R_A;
ptr[k - depth_32] = rhs(k, n + 16);
ptr = (QUInt8*) &R_B;
ptr[k - depth_32] = rhs(k, n + 17);
ptr = (QUInt8*) &R_C;
ptr[k - depth_32] = rhs(k, n + 18);
ptr = (QUInt8*) &R_D;
ptr[k - depth_32] = rhs(k, n + 19);
}
PACK_STEP;
R_A = _mm256_setzero_si256();
R_B = _mm256_setzero_si256();
R_C = _mm256_setzero_si256();
R_D = _mm256_setzero_si256();
for (Index k = depth_32; k < depth; k++) {
ptr = (QUInt8*) &R_A;
ptr[k - depth_32] = rhs(k, n + 20);
ptr = (QUInt8*) &R_B;
ptr[k - depth_32] = rhs(k, n + 21);
ptr = (QUInt8*) &R_C;
ptr[k - depth_32] = rhs(k, n + 22);
ptr = (QUInt8*) &R_D;
ptr[k - depth_32] = rhs(k, n + 23);
}
PACK_STEP;
R_A = _mm256_setzero_si256();
R_B = _mm256_setzero_si256();
R_C = _mm256_setzero_si256();
R_D = _mm256_setzero_si256();
for (Index k = depth_32; k < depth; k++) {
ptr = (QUInt8*) &R_A;
ptr[k - depth_32] = rhs(k, n + 24);
ptr = (QUInt8*) &R_B;
ptr[k - depth_32] = rhs(k, n + 25);
ptr = (QUInt8*) &R_C;
ptr[k - depth_32] = rhs(k, n + 26);
ptr = (QUInt8*) &R_D;
ptr[k - depth_32] = rhs(k, n + 27);
}
PACK_STEP;
R_A = _mm256_setzero_si256();
R_B = _mm256_setzero_si256();
R_C = _mm256_setzero_si256();
R_D = _mm256_setzero_si256();
for (Index k = depth_32; k < depth; k++) {
ptr = (QUInt8*) &R_A;
ptr[k - depth_32] = rhs(k, n + 28);
ptr = (QUInt8*) &R_B;
ptr[k - depth_32] = rhs(k, n + 29);
ptr = (QUInt8*) &R_C;
ptr[k - depth_32] = rhs(k, n + 30);
ptr = (QUInt8*) &R_D;
ptr[k - depth_32] = rhs(k, n + 31);
}
PACK_STEP;
blockB_256 += 24;
}
}
// Finish packing cols
if (cols_32 < cols) {
// Pack depth in sets of 32
for (Index k = 0; k < depth_32; k += 32) {
__m256i R_A, R_B, R_C, R_D;
Index n;
for (n = cols_32; n < cols; n += 4) {
switch (cols - n) {
case 1:
R_A = rhs.loadPacket(k, n);
R_B = _mm256_setzero_si256();
R_C = _mm256_setzero_si256();
R_D = _mm256_setzero_si256();
PACK_STEP;
break;
case 2:
R_A = rhs.loadPacket(k, n);
R_B = rhs.loadPacket(k, n + 1);
R_C = _mm256_setzero_si256();
R_D = _mm256_setzero_si256();
PACK_STEP;
break;
case 3:
R_A = rhs.loadPacket(k, n);
R_B = rhs.loadPacket(k, n + 1);
R_C = rhs.loadPacket(k, n + 2);
R_D = _mm256_setzero_si256();
PACK_STEP;
break;
default:
R_A = rhs.loadPacket(k, n);
R_B = rhs.loadPacket(k, n + 1);
R_C = rhs.loadPacket(k, n + 2);
R_D = rhs.loadPacket(k, n + 3);
PACK_STEP;
break;
}
}
// Increment the block pointer.
// We must pad if cols is not a multiple of 32.
blockB_256 += 32 - (n - cols_32) / 4;
}
if (depth_32 < depth) {
for (Index n = cols_32; n < cols; n += 4) {
QUInt8* ptr;
__m256i R_A = _mm256_setzero_si256();
__m256i R_B = _mm256_setzero_si256();
__m256i R_C = _mm256_setzero_si256();
__m256i R_D = _mm256_setzero_si256();
switch (cols - n) {
case 1:
for (Index k = depth_32; k < depth; k++) {
ptr = (QUInt8*) &R_A;
ptr[k - depth_32] = rhs(k, n);
}
PACK_STEP;
break;
case 2:
for (Index k = depth_32; k < depth; k++) {
ptr = (QUInt8*) &R_A;
ptr[k - depth_32] = rhs(k, n);
ptr = (QUInt8*) &R_B;
ptr[k - depth_32] = rhs(k, n + 1);
}
PACK_STEP;
break;
case 3:
for (Index k = depth_32; k < depth; k++) {
ptr = (QUInt8*) &R_A;
ptr[k - depth_32] = rhs(k, n);
ptr = (QUInt8*) &R_B;
ptr[k - depth_32] = rhs(k, n + 1);
ptr = (QUInt8*) &R_C;
ptr[k - depth_32] = rhs(k, n + 2);
}
PACK_STEP;
break;
default:
for (Index k = depth_32; k < depth; k++) {
ptr = (QUInt8*) &R_A;
ptr[k - depth_32] = rhs(k, n);
ptr = (QUInt8*) &R_B;
ptr[k - depth_32] = rhs(k, n + 1);
ptr = (QUInt8*) &R_C;
ptr[k - depth_32] = rhs(k, n + 2);
ptr = (QUInt8*) &R_D;
ptr[k - depth_32] = rhs(k, n + 3);
}
PACK_STEP;
break;
}
}
}
}
#undef PACK_STEP
}
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
Index rows, Index depth, Index cols, QInt32 alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(alpha.value == 1);
eigen_assert(strideA == -1);
eigen_assert(strideB == -1);
eigen_assert(offsetA == 0);
eigen_assert(offsetB == 0);
eigen_assert(rows > 0);
eigen_assert(cols > 0);
eigen_assert(depth > 0);
eigen_assert(blockA);
eigen_assert(blockB);
Index rows_32 = ((rows + 31) / 32) * 32;
Index cols_32 = ((cols + 31) / 32) * 32;
Index depth_32 = ((depth + 31) / 32) * 32;
// Create result block
ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0);
memset(blockO, 0, 32 * 32 * sizeof(QInt32));
// Get vectorized pointers
__m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO);
const __m256i* blockA_256 = reinterpret_cast<const __m256i*>(blockA);
const __m256i* blockB_256 = reinterpret_cast<const __m256i*>(blockB);
// Loop over blocks of 32 columns
for (Index n = 0; n < cols_32; n += 32) {
// Reset index into blockA
Index indexL = 0;
// Loop over blocks of 32 rows
for (Index m = 0; m < rows_32; m += 32) {
// Reset index into blockB
Index indexR = n / 32 * depth_32;
// Loop over blocks of 8 on depth
for (Index k = 0; k < depth_32; k += 8) {
// Load inputs
__m256i L_AD0 = blockA_256[indexL++];
__m256i L_AD8 = blockA_256[indexL++];
__m256i L_AD16 = blockA_256[indexL++];
__m256i L_AD24 = blockA_256[indexL++];
__m256i L_EH0 = blockA_256[indexL++];
__m256i L_EH8 = blockA_256[indexL++];
__m256i L_EH16 = blockA_256[indexL++];
__m256i L_EH24 = blockA_256[indexL++];
__m256i R_AH0 = blockB_256[indexR++];
__m256i R_AH4 = blockB_256[indexR++];
__m256i R_AH8 = blockB_256[indexR++];
__m256i R_AH12 = blockB_256[indexR++];
__m256i R_AH16 = blockB_256[indexR++];
__m256i R_AH20 = blockB_256[indexR++];
__m256i R_AH24 = blockB_256[indexR++];
__m256i R_AH28 = blockB_256[indexR++];
// This constant is used with madd to convert 16 bit to 32 bit
const __m256i ONE = _mm256_set1_epi32(0x00010001);
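        // Worked detail (illustrative): _mm256_maddubs_epi16 multiplies
        // unsigned rhs bytes by signed lhs bytes and adds adjacent pairs into
        // 16-bit lanes; multiplying that result by ONE (0x0001 in every 16-bit
        // lane) with _mm256_madd_epi16 then adds adjacent 16-bit lanes into
        // 32-bit lanes, so each 32-bit lane holds a 4-term dot product.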
// Declare variables used in COMPUTE_STEP
__m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32;
#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET) \
P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0); \
P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0); \
P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
_mm256_store_si256( \
blockO_256 + 4 * OFFSET, \
_mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32)); \
\
P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8); \
P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8); \
P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
_mm256_store_si256( \
blockO_256 + 4 * OFFSET + 1, \
_mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \
\
P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16); \
P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16); \
P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
_mm256_store_si256( \
blockO_256 + 4 * OFFSET + 2, \
_mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \
\
P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24); \
P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24); \
P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
_mm256_store_si256( \
blockO_256 + 4 * OFFSET + 3, \
_mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32));
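        // Each COMPUTE_STEP(R_a, R_b, OFFSET) accumulates the contribution of
        // eight depth values to one output column: R_a broadcasts depths
        // k..k+3 of that column, R_b broadcasts depths k+4..k+7, and the four
        // stores update rows 0-7, 8-15, 16-23 and 24-31 of column OFFSET
        // inside the 32x32 blockO tile.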
// Permute and shuffle to copy a single value across the entire vector
// Then compute the multiplication
__m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00);
__m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
__m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 0);
__m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
__m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 1);
R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11);
__m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
__m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 2);
__m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
__m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 3);
R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 4);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 5);
R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 6);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 7);
R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 8);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 9);
R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 10);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 11);
R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 12);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 13);
R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 14);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 15);
R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 16);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 17);
R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 18);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 19);
R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 20);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 21);
R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 22);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 23);
R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 24);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 25);
R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 26);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 27);
R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 28);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 29);
R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 30);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 31);
#undef COMPUTE_STEP
}
// Transfer the results to the result matrix.
if (m + 32 <= rows && n + 32 <= cols) {
Index i = 0;
for (Index j = n; j < n + 32; j++) {
LinearMapper r0 = res.getLinearMapper(m, j);
LinearMapper r1 = res.getLinearMapper(m + 8, j);
LinearMapper r2 = res.getLinearMapper(m + 16, j);
LinearMapper r3 = res.getLinearMapper(m + 24, j);
r0.storePacket(
0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0)));
r1.storePacket(
0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0)));
r2.storePacket(
0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0)));
r3.storePacket(
0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0)));
}
}
else {
for (Index j = n; j < cols; j++) {
for (Index i = m; i < rows; i++) {
res(i, j) = blockO[(j - n) * 32 + (i - m)];
}
}
}
// Zero the result block so it can be reused
memset(blockO, 0, 32 * 32 * sizeof(QInt32));
}
}
}
// Below are the fully optimized versions that are correct only for sizes that
// are multiples of 32. Keeping these implementations separate yields roughly a
// 10% performance benefit.
// Arrange a block of the left input matrix in contiguous memory.
//
// Given column major input (A0 beside A1 in memory):
// A0 B0 C0 D0 E0 F0 G0 H0 ...
// A1 B1 C1 D1 E1 F1 G1 H1 ...
// A2 B2 C2 D2 E2 F2 G2 H2 ...
// A3 B3 C3 D3 E3 F3 G3 H3 ...
// A4 B4 C4 D4 E4 F4 G4 H4 ...
// A5 B5 C5 D5 E5 F5 G5 H5 ...
// A6 B6 C6 D6 E6 F6 G6 H6 ...
// A7 B7 C7 D7 E7 F7 G7 H7 ...
// A8 ...
// ...
//
// Packing yields output (A0 beside B0 in memory):
// A0 B0 C0 D0
// A1 B1 C1 D1
// A2 B2 C2 D2
// A3 B3 C3 D3
// A4 B4 C4 D4
// A5 B5 C5 D5
// A6 B6 C6 D6
// A7 B7 C7 D7
// ...
// A31 B31 C31 D31
// E0 F0 G0 H0
// E1 F1 G1 H1
// E2 F2 G2 H2
// E3 F3 G3 H3
// E4 F4 G4 H4
// E5 F5 G5 H5
// E6 F6 G6 H6
// E7 F7 G7 H7
// ...
//
// Four elements of the same row are arranged contiguously because maddubs and
// madd both perform an adjacent addition in the kernel.
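// In other words, each packed __m256i holds eight consecutive rows with four
// depth values each, which is exactly the signed operand consumed by one
// _mm256_maddubs_epi16 in the kernel below.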
template <typename Index, typename DataMapper, int Pack1, int Pack2,
bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor,
Conjugate, PanelMode> {
EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs,
Index depth, Index rows, Index stride = 0,
Index offset = 0);
};
template <typename Index, typename DataMapper, int Pack1, int Pack2,
bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2,
ColMajor, Conjugate, PanelMode>::
operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows,
Index stride, Index offset) {
eigen_assert(stride == 0);
eigen_assert(offset == 0);
  // Fall back to the generic implementation when the sizes are not multiples of 32
if (rows % 32 != 0 || depth % 32 != 0) {
gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> lhs_pack;
return lhs_pack(blockA, lhs, depth, rows, stride, offset);
}
// Get vector pointer
__m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA);
// Pack rows in sets of 32
for (Index m = 0; m < rows; m += 32) {
// Pack depth in sets of 8
for (Index k = 0; k < depth; k += 8) {
// Load vectors
__m256i L_A = lhs.loadPacket(m, k);
__m256i L_B = lhs.loadPacket(m, k + 1);
// Interleave 8-bit elements
__m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
__m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
__m256i L_C = lhs.loadPacket(m, k + 2);
__m256i L_D = lhs.loadPacket(m, k + 3);
__m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
__m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
// Interleave 16-bit elements
__m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
__m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
// Use permute before we store to cross 128-bit lanes
__m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
_mm256_store_si256(blockA_256++, L_AD0);
// Complete packing for 32 x 8 block
__m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
__m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
__m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
__m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
_mm256_store_si256(blockA_256++, L_AD8);
_mm256_store_si256(blockA_256++, L_AD16);
__m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
_mm256_store_si256(blockA_256++, L_AD24);
__m256i L_E = lhs.loadPacket(m, k + 4);
__m256i L_F = lhs.loadPacket(m, k + 5);
__m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
__m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
__m256i L_G = lhs.loadPacket(m, k + 6);
__m256i L_H = lhs.loadPacket(m, k + 7);
__m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
__m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
__m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
__m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
__m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
_mm256_store_si256(blockA_256++, L_EH0);
__m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
__m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
__m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
__m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
_mm256_store_si256(blockA_256++, L_EH8);
_mm256_store_si256(blockA_256++, L_EH16);
__m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
_mm256_store_si256(blockA_256++, L_EH24);
}
}
}
// Arrange a block of the right input matrix in contiguous memory.
//
// Given column major input (A0 beside A1 in memory):
// A0 B0 C0 D0 E0 F0 G0 H0 ...
// A1 B1 C1 D1 E1 F1 G1 H1 ...
// A2 B2 C2 D2 E2 F2 G2 H2 ...
// A3 B3 C3 D3 E3 F3 G3 H3 ...
// A4 B4 C4 D4 E4 F4 G4 H4 ...
// A5 B5 C5 D5 E5 F5 G5 H5 ...
// A6 B6 C6 D6 E6 F6 G6 H6 ...
// A7 B7 C7 D7 E7 F7 G7 H7 ...
// A8 ...
// ...
//
// Packing yields row major output (A0 beside A1 in memory):
// A0 A1 A2 A3 A4 A5 A6 A7
// B0 B1 B2 B3 B4 B5 B6 B7
// ...
//
// At least four elements of the same column are arranged contiguously because
// maddubs and madd both perform an adjacent addition in the kernel. We can
// save work by keeping 8 adjacent elements together because kr = 8.
template <typename Index, typename DataMapper, int nr, bool Conjugate,
bool PanelMode>
struct gemm_pack_rhs<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate,
PanelMode> {
EIGEN_DONT_INLINE void operator()(QUInt8* blockB, const DataMapper& rhs,
Index depth, Index cols, Index stride = 0,
Index offset = 0);
};
template <typename Index, typename DataMapper, int nr, bool Conjugate,
bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<QUInt8, Index, DataMapper, nr, ColMajor,
Conjugate, PanelMode>::
operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols,
Index stride, Index offset) {
eigen_assert(stride == 0);
eigen_assert(offset == 0);
  // Fall back to the generic implementation when the sizes are not multiples of 32
if (cols % 32 != 0 || depth % 32 != 0) {
gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> rhs_pack;
return rhs_pack(blockB, rhs, depth, cols, stride, offset);
}
// Get vector pointer
__m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB);
// Perform a step of the packing for 4 columns
__m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24;
#define PACK_STEP \
R_AB_L = _mm256_unpacklo_epi64(R_A, R_B); \
R_CD_L = _mm256_unpacklo_epi64(R_C, R_D); \
R_AB_H = _mm256_unpackhi_epi64(R_A, R_B); \
R_CD_H = _mm256_unpackhi_epi64(R_C, R_D); \
R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20); \
R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \
R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20); \
R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \
_mm256_store_si256(blockB_256, R_AD_0); \
_mm256_store_si256(blockB_256 + 8, R_AD_8); \
_mm256_store_si256(blockB_256 + 16, R_AD_16); \
_mm256_store_si256(blockB_256 + 24, R_AD_24); \
blockB_256++;
// Pack cols in sets of 32
for (Index n = 0; n < cols; n += 32) {
// Pack depth in sets of 32
for (Index k = 0; k < depth; k += 32) {
__m256i R_A = rhs.loadPacket(k, n);
__m256i R_B = rhs.loadPacket(k, n + 1);
__m256i R_C = rhs.loadPacket(k, n + 2);
__m256i R_D = rhs.loadPacket(k, n + 3);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 4);
R_B = rhs.loadPacket(k, n + 5);
R_C = rhs.loadPacket(k, n + 6);
R_D = rhs.loadPacket(k, n + 7);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 8);
R_B = rhs.loadPacket(k, n + 9);
R_C = rhs.loadPacket(k, n + 10);
R_D = rhs.loadPacket(k, n + 11);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 12);
R_B = rhs.loadPacket(k, n + 13);
R_C = rhs.loadPacket(k, n + 14);
R_D = rhs.loadPacket(k, n + 15);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 16);
R_B = rhs.loadPacket(k, n + 17);
R_C = rhs.loadPacket(k, n + 18);
R_D = rhs.loadPacket(k, n + 19);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 20);
R_B = rhs.loadPacket(k, n + 21);
R_C = rhs.loadPacket(k, n + 22);
R_D = rhs.loadPacket(k, n + 23);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 24);
R_B = rhs.loadPacket(k, n + 25);
R_C = rhs.loadPacket(k, n + 26);
R_D = rhs.loadPacket(k, n + 27);
PACK_STEP;
R_A = rhs.loadPacket(k, n + 28);
R_B = rhs.loadPacket(k, n + 29);
R_C = rhs.loadPacket(k, n + 30);
R_D = rhs.loadPacket(k, n + 31);
PACK_STEP;
blockB_256 += 24;
}
}
#undef PACK_STEP
}
// Perform the actual multiplication on packed inputs
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
typedef typename DataMapper::LinearMapper LinearMapper;
EIGEN_DONT_INLINE
void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
Index rows, Index depth, Index cols, QInt32 alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
Index rows, Index depth, Index cols, QInt32 alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(alpha.value == 1);
eigen_assert(strideA == -1);
eigen_assert(strideB == -1);
eigen_assert(offsetA == 0);
eigen_assert(offsetB == 0);
eigen_assert(rows > 0);
eigen_assert(cols > 0);
eigen_assert(depth > 0);
eigen_assert(blockA);
eigen_assert(blockB);
  // Fall back to the generic kernel when rows, cols, or depth is not a multiple of 32
if (rows % 32 != 0 || cols % 32 != 0 || depth % 32 != 0) {
gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> gebp;
return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
}
// Create result block
QInt32* blockO = aligned_new<QInt32>(32 * 32);
// Allocating the result block is about 5-10% faster than declaring stack
// space. It is unclear why this is the case.
// ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0);
memset(blockO, 0, 32 * 32 * sizeof(QInt32));
// Get vectorized pointers
__m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO);
const __m256i* blockA_256 = reinterpret_cast<const __m256i*>(blockA);
const __m256i* blockB_256 = reinterpret_cast<const __m256i*>(blockB);
// Loop over blocks of 32 columns
for (Index n = 0; n < cols; n += 32) {
// Reset index into blockA
Index indexL = 0;
// Loop over blocks of 32 rows
for (Index m = 0; m < rows; m += 32) {
// Reset index into blockB
Index indexR = n / 32 * depth;
// Loop over blocks of 8 on depth
for (Index k = 0; k < depth; k += 8) {
// Load inputs
__m256i L_AD0 = blockA_256[indexL++];
__m256i L_AD8 = blockA_256[indexL++];
__m256i L_AD16 = blockA_256[indexL++];
__m256i L_AD24 = blockA_256[indexL++];
__m256i L_EH0 = blockA_256[indexL++];
__m256i L_EH8 = blockA_256[indexL++];
__m256i L_EH16 = blockA_256[indexL++];
__m256i L_EH24 = blockA_256[indexL++];
__m256i R_AH0 = blockB_256[indexR++];
__m256i R_AH4 = blockB_256[indexR++];
__m256i R_AH8 = blockB_256[indexR++];
__m256i R_AH12 = blockB_256[indexR++];
__m256i R_AH16 = blockB_256[indexR++];
__m256i R_AH20 = blockB_256[indexR++];
__m256i R_AH24 = blockB_256[indexR++];
__m256i R_AH28 = blockB_256[indexR++];
        // This constant is used with madd to sum adjacent pairs of 16-bit products into 32-bit lanes
const __m256i ONE = _mm256_set1_epi32(0x00010001);
// Declare variables used in COMPUTE_STEP
__m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32;
#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET) \
P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0); \
P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0); \
P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
_mm256_store_si256( \
blockO_256 + 4 * OFFSET, \
_mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32)); \
\
P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8); \
P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8); \
P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
_mm256_store_si256( \
blockO_256 + 4 * OFFSET + 1, \
_mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \
\
P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16); \
P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16); \
P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
_mm256_store_si256( \
blockO_256 + 4 * OFFSET + 2, \
_mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \
\
P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24); \
P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24); \
P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
_mm256_store_si256( \
blockO_256 + 4 * OFFSET + 3, \
_mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32));
// Permute and shuffle to copy a single value across the entire vector
// Then compute the multiplication
__m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00);
__m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
__m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 0);
__m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
__m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 1);
R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11);
__m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
__m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 2);
__m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
__m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 3);
R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 4);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 5);
R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 6);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 7);
R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 8);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 9);
R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 10);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 11);
R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 12);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 13);
R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 14);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 15);
R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 16);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 17);
R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 18);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 19);
R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 20);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 21);
R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 22);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 23);
R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 24);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 25);
R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 26);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 27);
R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00);
R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD0, R_EH0, 28);
R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD1, R_EH1, 29);
R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11);
R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
COMPUTE_STEP(R_AD2, R_EH2, 30);
R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
COMPUTE_STEP(R_AD3, R_EH3, 31);
#undef COMPUTE_STEP
}
// Transfer the results to the result matrix
Index i = 0;
for (Index j = n; j < n + 32; j++) {
LinearMapper r0 = res.getLinearMapper(m, j);
LinearMapper r1 = res.getLinearMapper(m + 8, j);
LinearMapper r2 = res.getLinearMapper(m + 16, j);
LinearMapper r3 = res.getLinearMapper(m + 24, j);
r0.storePacket(
0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0)));
r1.storePacket(
0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0)));
r2.storePacket(
0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0)));
r3.storePacket(
0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0)));
}
// Zero the result block so it can be reused
memset(blockO, 0, 32 * 32 * sizeof(QInt32));
}
}
aligned_delete(blockO, 32 * 32);
}
#endif // EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
} // namespace internal
} // namespace Eigen
#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H
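The COMPUTE_STEP macro above leans on the _mm256_maddubs_epi16 / _mm256_madd_epi16 pair to widen unsigned-by-signed 8-bit products into 32-bit accumulator lanes. The following standalone sketch (illustrative only, not part of the vendored header; the constants are arbitrary and it must be built with AVX2 enabled, e.g. g++ -mavx2) shows the same widening on a single vector: four adjacent u8*s8 products end up summed into one 32-bit lane.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  // 32 unsigned rhs bytes and 32 signed lhs bytes (values chosen arbitrarily).
  const __m256i rhs = _mm256_set1_epi8(3);             // treated as uint8
  const __m256i lhs = _mm256_set1_epi8(-2);            // treated as int8
  // u8*s8 products, adjacent pairs summed with signed saturation into 16-bit
  // lanes: each lane = 3*(-2) + 3*(-2) = -12.
  const __m256i p16 = _mm256_maddubs_epi16(rhs, lhs);
  // Multiply by 1 and sum adjacent 16-bit lanes into 32-bit lanes: -24 each.
  const __m256i one = _mm256_set1_epi32(0x00010001);
  const __m256i p32 = _mm256_madd_epi16(p16, one);
  int32_t out[8];
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), p32);
  std::printf("%d\n", out[0]);  // -24 == sum of four 8-bit products
  return 0;
}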
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H
#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H
namespace Eigen {
namespace internal {
// Reference (scalar) implementation, used here in place of a NEON-optimized
// path, of the case where the lhs is encoded using signed 8bit integers and
// the rhs using unsigned 8bit integers.
#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
template<bool _ConjLhs, bool _ConjRhs>
class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs>
{
public:
typedef QInt8 LhsScalar;
typedef QUInt8 RhsScalar;
typedef QInt32 ResScalar;
enum {
    // Register block sizes along the M and N directions;
    // both are 1 in the current implementation
nr = 1,
mr = 1,
// Progress made at each iteration of the product loop
// also 1 for the current implementation
LhsProgress = 1,
RhsProgress = 1
};
};
// Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
EIGEN_DONT_INLINE
void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
Index rows, Index depth, Index cols, QInt32 alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
Index rows, Index depth, Index cols, QInt32 alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(alpha.value == 1);
eigen_assert(strideA == -1);
eigen_assert(strideB == -1);
eigen_assert(offsetA == 0);
eigen_assert(offsetB == 0);
eigen_assert(rows > 0);
eigen_assert(cols > 0);
eigen_assert(depth > 0);
eigen_assert(blockA);
eigen_assert(blockB);
for (Index j = 0; j < cols; ++j) {
Index startB = j * depth;
for (Index i = 0; i < rows; ++i) {
Index startA = i * depth;
for (Index k = 0; k < depth; ++k) {
res(i, j) += blockA[startA + k] * blockB[startB + k];
}
}
}
}
#endif  // EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
} // namespace internal
} // namespace Eigen
#endif // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H
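The kernel above is a plain scalar reference: it assumes blockA is packed with each row's depth values contiguous (blockA[i * depth + k]) and blockB with each column's depth values contiguous (blockB[j * depth + k]). A tiny standalone sketch of the same contraction on raw arrays (illustrative only; the column-major result indexing stands in for the DataMapper, and the names and sizes are made up):
#include <cstdint>
#include <cstdio>

// Reference contraction over packed 8-bit blocks, mirroring the scalar kernel
// above: res(i, j) += sum_k lhs(i, k) * rhs(k, j), with depth-contiguous packing.
void ReferenceGemm(const int8_t* blockA, const uint8_t* blockB,
                   int rows, int depth, int cols, int32_t* res /* rows x cols */) {
  for (int j = 0; j < cols; ++j) {
    const int startB = j * depth;
    for (int i = 0; i < rows; ++i) {
      const int startA = i * depth;
      for (int k = 0; k < depth; ++k) {
        res[i + j * rows] += blockA[startA + k] * blockB[startB + k];
      }
    }
  }
}

int main() {
  const int8_t a[2 * 2] = {1, -2, 3, 4};    // 2x2 lhs, each row packed along depth
  const uint8_t b[2 * 2] = {5, 6, 7, 8};    // 2x2 rhs, each column packed along depth
  int32_t c[4] = {0, 0, 0, 0};              // column-major 2x2 result
  ReferenceGemm(a, b, 2, 2, 2, c);
  std::printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]);  // -7 39 -9 53
  return 0;
}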
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H
#define EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H
namespace Eigen {
namespace internal {
// Mat-Vec product
// Both lhs and rhs are encoded as 8bit signed integers
template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>
{
EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsMapper& lhs,
const RhsMapper& rhs,
QInt32* res, Index resIncr,
QInt8 alpha);
};
template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
const LhsMapper& lhs,
const RhsMapper& rhs,
QInt32* res, Index resIncr,
QInt8 alpha)
{
eigen_assert(alpha.value == 1);
eigen_assert(resIncr == 1);
eigen_assert(rows > 0);
eigen_assert(cols > 0);
for (Index i = 0; i < rows; ++i) {
for (Index j = 0; j < cols; ++j) {
res[i] += lhs(i, j) * rhs(j, 0);
}
}
}
// Mat-Vec product
// The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned integers
template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QUInt8,RhsMapper,ConjugateRhs,Version>
{
EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsMapper& lhs,
const RhsMapper& rhs,
QInt32* res, Index resIncr,
QUInt8 alpha);
};
template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QUInt8,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
const LhsMapper& lhs,
const RhsMapper& rhs,
QInt32* res, Index resIncr,
QUInt8 alpha)
{
eigen_assert(alpha.value == 1);
eigen_assert(resIncr == 1);
eigen_assert(rows > 0);
eigen_assert(cols > 0);
for (Index i = 0; i < rows; ++i) {
for (Index j = 0; j < cols; ++j) {
res[i] += lhs(i, j) * rhs(j, 0);
}
}
}
// Mat-Vec product
// The lhs is encoded using 8bit unsigned integers, the rhs using 8bit signed integers
template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,QUInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>
{
EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsMapper& lhs,
const RhsMapper& rhs,
QInt32* res, Index resIncr,
QInt8 alpha);
};
template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QUInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
const LhsMapper& lhs,
const RhsMapper& rhs,
QInt32* res, Index resIncr,
QInt8 alpha)
{
eigen_assert(alpha.value == 1);
eigen_assert(resIncr == 1);
eigen_assert(rows > 0);
eigen_assert(cols > 0);
for (Index i = 0; i < rows; ++i) {
for (Index j = 0; j < cols; ++j) {
res[i] += lhs(i, j) * rhs(j, 0);
}
}
}
} // namespace internal
} // namespace Eigen
#endif // EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H
#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
namespace Eigen {
namespace internal {
typedef struct Packet32q8i {
__m256i val;
operator __m256i() const { return val; }
Packet32q8i();
Packet32q8i(__m256i val) : val(val) {}
} Packet32q8i;
typedef struct Packet16q16i {
__m256i val;
operator __m256i() const { return val; }
Packet16q16i();
Packet16q16i(__m256i val) : val(val) {}
} Packet16q16i;
typedef struct Packet32q8u {
__m256i val;
operator __m256i() const { return val; }
Packet32q8u();
Packet32q8u(__m256i val) : val(val) {}
} Packet32q8u;
typedef struct Packet16q8i {
__m128i val;
operator __m128i() const { return val; }
Packet16q8i();
Packet16q8i(__m128i val) : val(val) {}
} Packet16q8i;
typedef struct Packet16q8u {
__m128i val;
operator __m128i() const { return val; }
Packet16q8u();
Packet16q8u(__m128i val) : val(val) {}
} Packet16q8u;
typedef struct Packet8q16i {
__m128i val;
operator __m128i() const { return val; }
Packet8q16i();
Packet8q16i(__m128i val) : val(val) {}
} Packet8q16i;
typedef struct Packet8q32i {
__m256i val;
operator __m256i() const { return val; }
Packet8q32i();
Packet8q32i(__m256i val) : val(val) {}
} Packet8q32i;
typedef struct Packet4q32i {
__m128i val;
operator __m128i() const { return val; }
Packet4q32i();
Packet4q32i(__m128i val) : val(val) {}
} Packet4q32i;
#ifndef EIGEN_VECTORIZE_AVX512
template <>
struct packet_traits<QInt8> : default_packet_traits {
typedef Packet32q8i type;
typedef Packet16q8i half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 32,
};
enum {
HasAdd = 0,
HasSub = 0,
HasMul = 0,
HasNegate = 0,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 1,
HasMax = 1,
HasConj = 0,
HasSetLinear = 0
};
};
template <>
struct packet_traits<QUInt8> : default_packet_traits {
typedef Packet32q8u type;
typedef Packet16q8u half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 32,
};
enum {
HasAdd = 0,
HasSub = 0,
HasMul = 0,
HasNegate = 0,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 1,
HasMax = 1,
HasConj = 0,
HasSetLinear = 0
};
};
template <>
struct packet_traits<QInt16> : default_packet_traits {
typedef Packet16q16i type;
typedef Packet8q16i half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 16,
};
enum {
HasAdd = 0,
HasSub = 0,
HasMul = 0,
HasNegate = 0,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 1,
HasMax = 1,
HasConj = 0,
HasSetLinear = 0
};
};
template <>
struct packet_traits<QInt32> : default_packet_traits {
typedef Packet8q32i type;
typedef Packet4q32i half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 8,
};
enum {
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 1,
HasMax = 1,
HasConj = 0,
HasSetLinear = 0
};
};
#endif
template <>
struct unpacket_traits<Packet32q8i> {
typedef QInt8 type;
typedef Packet16q8i half;
enum { size = 32, alignment=Aligned32 };
};
template <>
struct unpacket_traits<Packet16q16i> {
typedef QInt16 type;
typedef Packet8q16i half;
enum { size = 16, alignment=Aligned32 };
};
template <>
struct unpacket_traits<Packet32q8u> {
typedef QUInt8 type;
typedef Packet16q8u half;
enum { size = 32, alignment=Aligned32 };
};
template <>
struct unpacket_traits<Packet8q32i> {
typedef QInt32 type;
typedef Packet4q32i half;
enum { size = 8, alignment=Aligned32 };
};
// Unaligned load
template <>
EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) {
EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) {
EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16q16i ploadu<Packet16q16i>(const QInt16* from) {
EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const QInt32* from) {
EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
reinterpret_cast<const __m256i*>(from));
}
// Aligned load
template <>
EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16q16i pload<Packet16q16i>(const QInt16* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
reinterpret_cast<const __m256i*>(from));
}
// Unaligned store
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
reinterpret_cast<__m256i*>(to), from.val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) {
EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
reinterpret_cast<__m256i*>(to), from.val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet16q16i& from) {
EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
reinterpret_cast<__m256i*>(to), from.val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) {
EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
reinterpret_cast<__m256i*>(to), from.val);
}
// Aligned store
template <>
EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet8q32i& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
from.val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet16q16i& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
from.val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
from.val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
from.val);
}
// Extract first element.
template <>
EIGEN_STRONG_INLINE QInt32 pfirst<Packet8q32i>(const Packet8q32i& a) {
return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}
template <>
EIGEN_STRONG_INLINE QInt16 pfirst<Packet16q16i>(const Packet16q16i& a) {
return _mm256_extract_epi16(a.val, 0);
}
template <>
EIGEN_STRONG_INLINE QUInt8 pfirst<Packet32q8u>(const Packet32q8u& a) {
return static_cast<uint8_t>(_mm256_extract_epi8(a.val, 0));
}
template <>
EIGEN_STRONG_INLINE QInt8 pfirst<Packet32q8i>(const Packet32q8i& a) {
return _mm256_extract_epi8(a.val, 0);
}
// Initialize to constant value.
template <>
EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) {
return _mm256_set1_epi8(from.value);
}
template <>
EIGEN_STRONG_INLINE Packet32q8u pset1<Packet32q8u>(const QUInt8& from) {
return _mm256_set1_epi8(static_cast<uint8_t>(from.value));
}
template <>
EIGEN_STRONG_INLINE Packet8q32i pset1<Packet8q32i>(const QInt32& from) {
return _mm256_set1_epi32(from.value);
}
// Basic arithmetic packet ops for QInt32.
template <>
EIGEN_STRONG_INLINE Packet8q32i padd<Packet8q32i>(const Packet8q32i& a,
const Packet8q32i& b) {
return _mm256_add_epi32(a.val, b.val);
}
template <>
EIGEN_STRONG_INLINE Packet16q16i pset1<Packet16q16i>(const QInt16& from) {
return _mm256_set1_epi16(from.value);
}
template <>
EIGEN_STRONG_INLINE Packet8q32i psub<Packet8q32i>(const Packet8q32i& a,
const Packet8q32i& b) {
return _mm256_sub_epi32(a.val, b.val);
}
// Note: mullo truncates the result to 32 bits.
template <>
EIGEN_STRONG_INLINE Packet8q32i pmul<Packet8q32i>(const Packet8q32i& a,
const Packet8q32i& b) {
return _mm256_mullo_epi32(a.val, b.val);
}
template <>
EIGEN_STRONG_INLINE Packet8q32i pnegate<Packet8q32i>(const Packet8q32i& a) {
return _mm256_sub_epi32(_mm256_setzero_si256(), a.val);
}
// Min and max.
template <>
EIGEN_STRONG_INLINE Packet8q32i pmin<Packet8q32i>(const Packet8q32i& a,
const Packet8q32i& b) {
return _mm256_min_epi32(a.val, b.val);
}
template <>
EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a,
const Packet8q32i& b) {
return _mm256_max_epi32(a.val, b.val);
}
template <>
EIGEN_STRONG_INLINE Packet16q16i pmin<Packet16q16i>(const Packet16q16i& a,
const Packet16q16i& b) {
return _mm256_min_epi16(a.val, b.val);
}
template <>
EIGEN_STRONG_INLINE Packet16q16i pmax<Packet16q16i>(const Packet16q16i& a,
const Packet16q16i& b) {
return _mm256_max_epi16(a.val, b.val);
}
template <>
EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a,
const Packet32q8u& b) {
return _mm256_min_epu8(a.val, b.val);
}
template <>
EIGEN_STRONG_INLINE Packet32q8u pmax<Packet32q8u>(const Packet32q8u& a,
const Packet32q8u& b) {
return _mm256_max_epu8(a.val, b.val);
}
template <>
EIGEN_STRONG_INLINE Packet32q8i pmin<Packet32q8i>(const Packet32q8i& a,
const Packet32q8i& b) {
return _mm256_min_epi8(a.val, b.val);
}
template <>
EIGEN_STRONG_INLINE Packet32q8i pmax<Packet32q8i>(const Packet32q8i& a,
const Packet32q8i& b) {
return _mm256_max_epi8(a.val, b.val);
}
// Reductions.
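// Note added for clarity: the reductions below all follow the same pattern:
// first min/max the vector against a copy with its 128-bit halves swapped
// (_mm256_permute2f128_si256), then keep folding with ever narrower shuffles
// until the answer sits in the lowest element(s), and finally compare the last
// one or two extracted scalars on the host side.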
template <>
EIGEN_STRONG_INLINE QInt32 predux_min<Packet8q32i>(const Packet8q32i& a) {
__m256i tmp = _mm256_min_epi32(a, _mm256_permute2f128_si256(a, a, 1));
tmp =
_mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
return pfirst<Packet8q32i>(
_mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE QInt32 predux_max<Packet8q32i>(const Packet8q32i& a) {
__m256i tmp = _mm256_max_epi32(a, _mm256_permute2f128_si256(a, a, 1));
tmp =
_mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
return pfirst<Packet8q32i>(
_mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE QInt16 predux_min<Packet16q16i>(const Packet16q16i& a) {
__m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1));
tmp =
_mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
return std::min(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1));
}
template <>
EIGEN_STRONG_INLINE QInt16 predux_max<Packet16q16i>(const Packet16q16i& a) {
__m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1));
tmp =
_mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
return std::max(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1));
}
template <>
EIGEN_STRONG_INLINE QUInt8 predux_min<Packet32q8u>(const Packet32q8u& a) {
__m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1));
tmp =
_mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
tmp = _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
tmp = _mm256_min_epu8(tmp,
_mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
return std::min(static_cast<uint8_t>(_mm256_extract_epi8(tmp, 0)),
static_cast<uint8_t>(_mm256_extract_epi8(tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE QUInt8 predux_max<Packet32q8u>(const Packet32q8u& a) {
__m256i tmp = _mm256_max_epu8(a, _mm256_permute2f128_si256(a, a, 1));
tmp =
_mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
tmp = _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
tmp = _mm256_max_epu8(tmp,
_mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
return std::max(static_cast<uint8_t>(_mm256_extract_epi8(tmp, 0)),
static_cast<uint8_t>(_mm256_extract_epi8(tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE QInt8 predux_min<Packet32q8i>(const Packet32q8i& a) {
__m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1));
tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
tmp = _mm256_min_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
return std::min(_mm256_extract_epi8(tmp, 0), _mm256_extract_epi8(tmp, 1));
}
template <>
EIGEN_STRONG_INLINE QInt8 predux_max<Packet32q8i>(const Packet32q8i& a) {
__m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1));
tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
tmp = _mm256_max_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
return std::max(_mm256_extract_epi8(tmp, 0), _mm256_extract_epi8(tmp, 1));
}
// Vectorized scaling of a QInt32 packet (Packet8q32i) by a double.
template<>
struct scalar_product_op<QInt32, double> : binary_op_base<QInt32, double> {
typedef typename ScalarBinaryOpTraits<QInt32, double>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
#else
scalar_product_op() {
EIGEN_SCALAR_BINARY_OP_PLUGIN
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const QInt32& a, const double& b) const { return a * b; }
EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a, const double& b) const {
__m256d scale = _mm256_set1_pd(b);
__m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a));
__m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo));
__m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1));
__m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi));
return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
}
};
template <>
struct functor_traits<scalar_product_op<QInt32, double>> {
enum { Cost = 4 * NumTraits<float>::MulCost, PacketAccess = true };
};
} // end namespace internal
} // end namespace Eigen
#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
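The scalar_product_op<QInt32, double> specialization above scales eight 32-bit lanes by a double by round-tripping through packed doubles in two 4-lane halves; _mm256_cvtpd_epi32 rounds using the current MXCSR mode, which is round-to-nearest-even by default. A standalone sketch of that intrinsic sequence on a raw __m256i (illustrative only; build with AVX2 enabled):
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// Scale eight int32 lanes by a double and round back, mirroring the packetOp
// above but on a raw __m256i instead of Packet8q32i.
static __m256i ScaleByDouble(__m256i a, double b) {
  const __m256d scale = _mm256_set1_pd(b);
  const __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a));
  const __m128i r_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo));
  const __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1));
  const __m128i r_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(r_lo), r_hi, 1);
}

int main() {
  const __m256i a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
  int32_t out[8];
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), ScaleByDouble(a, 2.5));
  for (int i = 0; i < 8; ++i) std::printf("%d ", out[i]);  // 2 5 8 10 12 15 18 20
  std::printf("\n");
  return 0;
}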
#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
#include "PacketMathAVX2.h"
namespace Eigen {
namespace internal {
typedef struct Packet64q8i {
__m512i val;
operator __m512i() const { return val; }
Packet64q8i();
Packet64q8i(__m512i val) : val(val) {}
} Packet64q8i;
typedef struct Packet32q16i {
__m512i val;
operator __m512i() const { return val; }
Packet32q16i();
Packet32q16i(__m512i val) : val(val) {}
} Packet32q16i;
typedef struct Packet64q8u {
__m512i val;
operator __m512i() const { return val; }
Packet64q8u();
Packet64q8u(__m512i val) : val(val) {}
} Packet64q8u;
typedef struct Packet16q32i {
__m512i val;
operator __m512i() const { return val; }
Packet16q32i();
Packet16q32i(__m512i val) : val(val) {}
} Packet16q32i;
template <>
struct packet_traits<QInt8> : default_packet_traits {
typedef Packet64q8i type;
typedef Packet32q8i half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 64,
};
enum {
HasAdd = 0,
HasSub = 0,
HasMul = 0,
HasNegate = 0,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 1,
HasMax = 1,
HasConj = 0,
HasSetLinear = 0
};
};
template <>
struct packet_traits<QUInt8> : default_packet_traits {
typedef Packet64q8u type;
typedef Packet32q8u half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 64,
};
enum {
HasAdd = 0,
HasSub = 0,
HasMul = 0,
HasNegate = 0,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 1,
HasMax = 1,
HasConj = 0,
HasSetLinear = 0
};
};
template <>
struct packet_traits<QInt16> : default_packet_traits {
typedef Packet32q16i type;
typedef Packet16q16i half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 32,
};
enum {
HasAdd = 0,
HasSub = 0,
HasMul = 0,
HasNegate = 0,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 1,
HasMax = 1,
HasConj = 0,
HasSetLinear = 0
};
};
template <>
struct packet_traits<QInt32> : default_packet_traits {
typedef Packet16q32i type;
typedef Packet8q32i half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 16,
};
enum {
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 1,
HasMax = 1,
HasConj = 0,
HasSetLinear = 0
};
};
template <>
struct unpacket_traits<Packet64q8i> {
typedef QInt8 type;
typedef Packet32q8i half;
enum { size = 64, alignment=Aligned64 };
};
template <>
struct unpacket_traits<Packet32q16i> {
typedef QInt16 type;
typedef Packet16q16i half;
enum { size = 32, alignment=Aligned64 };
};
template <>
struct unpacket_traits<Packet64q8u> {
typedef QUInt8 type;
typedef Packet32q8u half;
enum { size = 64, alignment=Aligned64 };
};
template <>
struct unpacket_traits<Packet16q32i> {
typedef QInt32 type;
typedef Packet8q32i half;
enum { size = 16, alignment=Aligned64 };
};
// Unaligned load
template <>
EIGEN_STRONG_INLINE Packet64q8i ploadu<Packet64q8i>(const QInt8* from) {
EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
reinterpret_cast<const __m512i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet32q16i ploadu<Packet32q16i>(const QInt16* from) {
EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
reinterpret_cast<const __m512i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet64q8u ploadu<Packet64q8u>(const QUInt8* from) {
EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
reinterpret_cast<const __m512i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16q32i ploadu<Packet16q32i>(const QInt32* from) {
EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
reinterpret_cast<const __m512i*>(from));
}
// Aligned load
template <>
EIGEN_STRONG_INLINE Packet64q8i pload<Packet64q8i>(const QInt8* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
reinterpret_cast<const __m512i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet32q16i pload<Packet32q16i>(const QInt16* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
reinterpret_cast<const __m512i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet64q8u pload<Packet64q8u>(const QUInt8* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
reinterpret_cast<const __m512i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16q32i pload<Packet16q32i>(const QInt32* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
reinterpret_cast<const __m512i*>(from));
}
// Unaligned store
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet64q8i& from) {
EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
reinterpret_cast<__m512i*>(to), from.val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet32q16i& from) {
EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
reinterpret_cast<__m512i*>(to), from.val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet64q8u& from) {
EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
reinterpret_cast<__m512i*>(to), from.val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet16q32i& from) {
EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
reinterpret_cast<__m512i*>(to), from.val);
}
// Aligned store
template <>
EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet16q32i& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
from.val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet64q8u& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
from.val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet64q8i& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
from.val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet32q16i& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
from.val);
}
// Extract first element.
template <>
EIGEN_STRONG_INLINE QInt32 pfirst<Packet16q32i>(const Packet16q32i& a) {
return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a, 0));
}
template <>
EIGEN_STRONG_INLINE QUInt8 pfirst<Packet64q8u>(const Packet64q8u& a) {
return static_cast<uint8_t>(
_mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0));
}
template <>
EIGEN_STRONG_INLINE QInt8 pfirst<Packet64q8i>(const Packet64q8i& a) {
return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0);
}
template <>
EIGEN_STRONG_INLINE QInt16 pfirst<Packet32q16i>(const Packet32q16i& a) {
return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.val, 0), 0);
}
// Initialize to constant value.
template <>
EIGEN_STRONG_INLINE Packet64q8i pset1<Packet64q8i>(const QInt8& from) {
return _mm512_set1_epi8(from.value);
}
template <>
EIGEN_STRONG_INLINE Packet32q16i pset1<Packet32q16i>(const QInt16& from) {
return _mm512_set1_epi16(from.value);
}
template <>
EIGEN_STRONG_INLINE Packet64q8u pset1<Packet64q8u>(const QUInt8& from) {
return _mm512_set1_epi8(static_cast<uint8_t>(from.value));
}
template <>
EIGEN_STRONG_INLINE Packet16q32i pset1<Packet16q32i>(const QInt32& from) {
return _mm512_set1_epi32(from.value);
}
// Basic arithmetic packet ops for QInt32.
template <>
EIGEN_STRONG_INLINE Packet16q32i padd<Packet16q32i>(const Packet16q32i& a,
const Packet16q32i& b) {
return _mm512_add_epi32(a.val, b.val);
}
template <>
EIGEN_STRONG_INLINE Packet16q32i psub<Packet16q32i>(const Packet16q32i& a,
const Packet16q32i& b) {
return _mm512_sub_epi32(a.val, b.val);
}
// Note: mullo truncates the result to 32 bits.
template <>
EIGEN_STRONG_INLINE Packet16q32i pmul<Packet16q32i>(const Packet16q32i& a,
const Packet16q32i& b) {
return _mm512_mullo_epi32(a.val, b.val);
}
template <>
EIGEN_STRONG_INLINE Packet16q32i pnegate<Packet16q32i>(const Packet16q32i& a) {
return _mm512_sub_epi32(_mm512_setzero_si512(), a.val);
}
// Min and max.
template <>
EIGEN_STRONG_INLINE Packet16q32i pmin<Packet16q32i>(const Packet16q32i& a,
const Packet16q32i& b) {
return _mm512_min_epi32(a.val, b.val);
}
template <>
EIGEN_STRONG_INLINE Packet16q32i pmax<Packet16q32i>(const Packet16q32i& a,
const Packet16q32i& b) {
return _mm512_max_epi32(a.val, b.val);
}
template <>
EIGEN_STRONG_INLINE Packet64q8u pmin<Packet64q8u>(const Packet64q8u& a,
const Packet64q8u& b) {
#ifdef EIGEN_VECTORIZE_AVX512BW
return _mm512_min_epu8(a.val, b.val);
#else
__m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
__m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
__m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
__m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
__m256i r0 = _mm256_min_epu8(ap0, bp0);
__m256i r1 = _mm256_min_epu8(ap1, bp1);
return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet64q8u pmax<Packet64q8u>(const Packet64q8u& a,
const Packet64q8u& b) {
#ifdef EIGEN_VECTORIZE_AVX512BW
return _mm512_max_epu8(a.val, b.val);
#else
__m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
__m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
__m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
__m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
__m256i r0 = _mm256_max_epu8(ap0, bp0);
__m256i r1 = _mm256_max_epu8(ap1, bp1);
return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet64q8i pmin<Packet64q8i>(const Packet64q8i& a,
const Packet64q8i& b) {
#ifdef EIGEN_VECTORIZE_AVX512BW
return _mm512_min_epi8(a.val, b.val);
#else
__m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
__m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
__m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
__m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
__m256i r0 = _mm256_min_epi8(ap0, bp0);
__m256i r1 = _mm256_min_epi8(ap1, bp1);
return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet32q16i pmin<Packet32q16i>(const Packet32q16i& a,
const Packet32q16i& b) {
#ifdef EIGEN_VECTORIZE_AVX512BW
return _mm512_min_epi16(a.val, b.val);
#else
__m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
__m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
__m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
__m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
__m256i r0 = _mm256_min_epi16(ap0, bp0);
__m256i r1 = _mm256_min_epi16(ap1, bp1);
return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet64q8i pmax<Packet64q8i>(const Packet64q8i& a,
const Packet64q8i& b) {
#ifdef EIGEN_VECTORIZE_AVX512BW
return _mm512_max_epi8(a.val, b.val);
#else
__m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
__m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
__m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
__m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
__m256i r0 = _mm256_max_epi8(ap0, bp0);
__m256i r1 = _mm256_max_epi8(ap1, bp1);
return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet32q16i pmax<Packet32q16i>(const Packet32q16i& a,
const Packet32q16i& b) {
#ifdef EIGEN_VECTORIZE_AVX512BW
return _mm512_max_epi16(a.val, b.val);
#else
__m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
__m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
__m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
__m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
__m256i r0 = _mm256_max_epi16(ap0, bp0);
__m256i r1 = _mm256_max_epi16(ap1, bp1);
return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
#endif
}
// Reductions.
template <>
EIGEN_STRONG_INLINE QInt32 predux_min<Packet16q32i>(const Packet16q32i& a) {
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
Packet4i res =
_mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3));
res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(
_mm_min_epi32(
res,
_mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE QInt32 predux_max<Packet16q32i>(const Packet16q32i& a) {
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
Packet4i res =
_mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3));
res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(
_mm_max_epi32(
res,
_mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE QInt16 predux_min<Packet32q16i>(const Packet32q16i& a) {
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
Packet4i res =
_mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3));
res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
std::uint32_t w =
pfirst(
_mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
return std::min({
static_cast<std::int16_t>(w >> 16),
static_cast<std::int16_t>(w)
});
}
template <>
EIGEN_STRONG_INLINE QInt16 predux_max<Packet32q16i>(const Packet32q16i& a) {
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
Packet4i res =
_mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3));
res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
std::uint32_t w =
pfirst(
_mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
return std::max({
static_cast<std::int16_t>(w >> 16),
static_cast<std::int16_t>(w)
});
}
template <>
EIGEN_STRONG_INLINE QUInt8 predux_min<Packet64q8u>(const Packet64q8u& a) {
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
Packet4i res =
_mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3));
res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
std::uint32_t w =
pfirst(
_mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
return std::min({
static_cast<std::uint8_t>(w >> 24),
static_cast<std::uint8_t>(w >> 16),
static_cast<std::uint8_t>(w >> 8),
static_cast<std::uint8_t>(w)
});
}
template <>
EIGEN_STRONG_INLINE QUInt8 predux_max<Packet64q8u>(const Packet64q8u& a) {
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
Packet4i res =
_mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3));
res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
std::uint32_t w =
pfirst(
_mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
return std::max({
static_cast<std::uint8_t>(w >> 24),
static_cast<std::uint8_t>(w >> 16),
static_cast<std::uint8_t>(w >> 8),
static_cast<std::uint8_t>(w)
});
}
template <>
EIGEN_STRONG_INLINE QInt8 predux_min<Packet64q8i>(const Packet64q8i& a) {
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
Packet4i res =
_mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3));
res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
std::uint32_t w =
pfirst(
_mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
return std::min({
static_cast<std::int8_t>(w >> 24),
static_cast<std::int8_t>(w >> 16),
static_cast<std::int8_t>(w >> 8),
static_cast<std::int8_t>(w)
});
}
template <>
EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
Packet4i res =
_mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3));
res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
std::uint32_t w =
pfirst(
_mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
  return std::max({
static_cast<std::int8_t>(w >> 24),
static_cast<std::int8_t>(w >> 16),
static_cast<std::int8_t>(w >> 8),
static_cast<std::int8_t>(w)
});
}
} // end namespace internal
} // end namespace Eigen
#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
namespace Eigen {
namespace internal {
typedef __m256 Packet8f;
template <>
struct type_casting_traits<QInt32, float> {
enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
};
template <>
EIGEN_STRONG_INLINE Packet8f pcast<Packet8q32i>(const Packet8q32i& a) {
return _mm256_cvtepi32_ps(a.val);
}
template <>
struct type_casting_traits<float, QInt32> {
enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
};
template <>
EIGEN_STRONG_INLINE Packet8q32i pcast<Packet8f>(const Packet8f& a) {
return _mm256_cvtps_epi32(a);
}
template <>
struct type_casting_traits<QInt32, QInt8> {
enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
};
template <>
EIGEN_STRONG_INLINE Packet32q8i
pcast<Packet8q32i, Packet32q8i>(const Packet8q32i& a, const Packet8q32i& b,
const Packet8q32i& c, const Packet8q32i& d) {
__m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.val, b.val),
_mm256_packs_epi32(c.val, d.val));
// Since packs does not cross 128 bit lane boundaries,
// we have to permute to properly order the final result.
const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
return _mm256_permutevar8x32_epi32(converted, permute_mask);
}
template <>
struct type_casting_traits<QInt32, QUInt8> {
enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
};
template <>
EIGEN_STRONG_INLINE Packet32q8u
pcast<Packet8q32i, Packet32q8u>(const Packet8q32i& a, const Packet8q32i& b,
const Packet8q32i& c, const Packet8q32i& d) {
const __m256i converted = _mm256_packus_epi16(
_mm256_packs_epi32(a.val, b.val), _mm256_packs_epi32(c.val, d.val));
// Since packus does not cross 128 bit lane boundaries,
// we have to permute to properly order the final result.
const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
return _mm256_permutevar8x32_epi32(converted, permute_mask);
}
} // end namespace internal
} // end namespace Eigen
#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
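The QInt32-to-8-bit casts above exploit the fact that _mm256_packs_epi32 and _mm256_packs_epi16 work within 128-bit lanes, so the packed bytes come out grouped as a0-3 b0-3 c0-3 d0-3 a4-7 b4-7 c4-7 d4-7, and the final _mm256_permutevar8x32_epi32 (index vector 0,4,1,5,2,6,3,7 from low to high dword) restores element order. A standalone check with small, non-saturating values (illustrative only; build with AVX2 enabled):
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  // Four packets of eight int32 each, holding 0..31 so no saturation occurs.
  const __m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
  const __m256i b = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
  const __m256i c = _mm256_setr_epi32(16, 17, 18, 19, 20, 21, 22, 23);
  const __m256i d = _mm256_setr_epi32(24, 25, 26, 27, 28, 29, 30, 31);
  // In-lane saturating packs, as in pcast<Packet8q32i, Packet32q8i> above.
  const __m256i packed = _mm256_packs_epi16(_mm256_packs_epi32(a, b),
                                            _mm256_packs_epi32(c, d));
  // Reorder the eight 4-byte groups back into 0..31 order.
  const __m256i mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
  const __m256i fixed = _mm256_permutevar8x32_epi32(packed, mask);
  int8_t out[32];
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), fixed);
  for (int i = 0; i < 32; ++i) std::printf("%d ", out[i]);  // prints 0 .. 31 in order
  std::printf("\n");
  return 0;
}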
#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
namespace Eigen {
namespace internal {
typedef __m512 Packet16f;
typedef __m512i Packet16i;
template <>
struct type_casting_traits<QInt32, float> {
enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
};
template <>
EIGEN_STRONG_INLINE Packet16f pcast<Packet16q32i>(const Packet16q32i& a) {
return _mm512_cvtepi32_ps(a.val);
}
template <>
struct type_casting_traits<float, QInt32> {
enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
};
template <>
EIGEN_STRONG_INLINE Packet16q32i pcast<Packet16f>(const Packet16f& a) {
return _mm512_cvtps_epi32(a);
}
template <>
struct type_casting_traits<float, QInt16> {
enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
};
template <>
EIGEN_STRONG_INLINE Packet32q16i
pcast<Packet16f>(const Packet16f& a, const Packet16f& b) {
Packet16i a_int = _mm512_cvtps_epi32(a);
Packet16i b_int = _mm512_cvtps_epi32(b);
#ifdef EIGEN_VECTORIZE_AVX512BW
return _mm512_packs_epi32(a_int, b_int);
#else
Packet8i ab_int16_low =
_mm256_permute4x64_epi64(
_mm256_packs_epi32(
_mm512_castsi512_si256(a_int),
_mm512_castsi512_si256(b_int)),
_MM_SHUFFLE(0, 2, 1, 3));
Packet8i ab_int16_high =
_mm256_permute4x64_epi64(
_mm256_packs_epi32(
_mm512_extracti32x8_epi32(a_int, 1),
_mm512_extracti32x8_epi32(b_int, 1)),
_MM_SHUFFLE(0, 2, 1, 3));
return _mm512_inserti32x8(
_mm512_castsi256_si512(ab_int16_low),
ab_int16_high, 1);
#endif
}
template <>
struct type_casting_traits<float, QInt8> {
enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
};
template <>
EIGEN_STRONG_INLINE Packet64q8i
pcast<Packet16f>(const Packet16f& a,
const Packet16f& b,
const Packet16f& c,
const Packet16f& d) {
Packet16i a_int = _mm512_cvtps_epi32(a);
Packet16i b_int = _mm512_cvtps_epi32(b);
Packet16i c_int = _mm512_cvtps_epi32(c);
Packet16i d_int = _mm512_cvtps_epi32(d);
#ifdef EIGEN_VECTORIZE_AVX512BW
return _mm512_packs_epi16(
_mm512_packs_epi32(a_int, b_int),
_mm512_packs_epi32(c_int, d_int));
#else
Packet8i ab_int16_low =
_mm256_permute4x64_epi64(
_mm256_packs_epi32(
_mm512_castsi512_si256(a_int),
_mm512_castsi512_si256(b_int)),
_MM_SHUFFLE(0, 2, 1, 3));
Packet8i cd_int16_low =
_mm256_permute4x64_epi64(
_mm256_packs_epi32(
_mm512_castsi512_si256(c_int),
_mm512_castsi512_si256(d_int)),
_MM_SHUFFLE(0, 2, 1, 3));
Packet8i ab_int16_high =
_mm256_permute4x64_epi64(
_mm256_packs_epi32(
_mm512_extracti32x8_epi32(a_int, 1),
_mm512_extracti32x8_epi32(b_int, 1)),
_MM_SHUFFLE(0, 2, 1, 3));
Packet8i cd_int16_high =
_mm256_permute4x64_epi64(
_mm256_packs_epi32(
_mm512_extracti32x8_epi32(c_int, 1),
_mm512_extracti32x8_epi32(d_int, 1)),
_MM_SHUFFLE(0, 2, 1, 3));
Packet8i abcd_int8_low =
_mm256_permute4x64_epi64(
_mm256_packs_epi16(ab_int16_low, cd_int16_low),
_MM_SHUFFLE(0, 2, 1, 3));
Packet8i abcd_int8_high =
_mm256_permute4x64_epi64(
_mm256_packs_epi16(ab_int16_high, cd_int16_high),
_MM_SHUFFLE(0, 2, 1, 3));
return _mm512_inserti32x8(
_mm512_castsi256_si512(abcd_int8_low),
abcd_int8_high, 1);
#endif
}
template <>
struct type_casting_traits<QInt32, QInt8> {
enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
};
template <>
struct type_casting_traits<QInt32, QInt16> {
enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
};
template <>
EIGEN_STRONG_INLINE Packet64q8i
pcast<Packet16q32i, Packet64q8i>(const Packet16q32i& a,
const Packet16q32i& b,
const Packet16q32i& c,
const Packet16q32i& d) {
__m512i converted = _mm512_packs_epi16(_mm512_packs_epi32(a.val, b.val),
_mm512_packs_epi32(c.val, d.val));
return converted;
}
template <>
EIGEN_STRONG_INLINE Packet32q16i
pcast<Packet16q32i, Packet32q16i>(const Packet16q32i& a,
const Packet16q32i& b) {
__m512i converted = _mm512_packs_epi32(a.val, b.val);
return converted;
}
template <>
struct type_casting_traits<QInt32, QUInt8> {
enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
};
template <>
EIGEN_STRONG_INLINE Packet64q8u
pcast<Packet16q32i, Packet64q8u>(const Packet16q32i& a, const Packet16q32i& b,
const Packet16q32i& c, const Packet16q32i& d) {
const __m512i converted = _mm512_packus_epi16(
_mm512_packus_epi32(a.val, b.val), _mm512_packus_epi32(c.val, d.val));
return converted;
}
template <>
struct type_casting_traits<QInt32, QUInt16> {
enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
};
#if 0
template <>
EIGEN_STRONG_INLINE Packet32q16u
pcast<Packet16q32i, Packet32q16u>(const Packet16q32i& a,
const Packet16q32i& b) {
const __m512i converted = _mm512_packus_epi32(a.val, b.val);
return converted;
}
#endif
} // end namespace internal
} // end namespace Eigen
#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H
#define EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H
namespace Eigen {
/** scalar_sigmoid_fast_derivative_op
* \ingroup CXX11_NeuralNetworks_Module
* \brief Template functor to compute the fast derivative of a sigmoid
*
* Input should be the backpropagated gradient.
*
* \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative()
*/
template <typename T>
struct scalar_sigmoid_fast_derivative_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_fast_derivative_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const {
const T one = T(1);
return (one - y) * y;
}
template <typename Packet>
inline Packet packetOp(const Packet& y) const {
const Packet one = internal::pset1<Packet>(1);
return internal::pmul(internal::psub(one, y), y);
}
};
namespace internal {
template <typename T>
struct functor_traits<scalar_sigmoid_fast_derivative_op<T> > {
enum {
Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost,
PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasMul &&
packet_traits<T>::HasNegate
};
};
} // namespace internal
/** scalar_tanh_fast_derivative_op
* \ingroup CXX11_NeuralNetworks_Module
* \brief Template functor to compute the fast derivative of a tanh
*
* Input should be the backpropagated gradient.
*
* \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative()
*/
template <typename T>
struct scalar_tanh_fast_derivative_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_fast_derivative_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const {
const T one = T(1);
return one - (y * y);
}
template <typename Packet>
inline Packet packetOp(const Packet& y) const {
const Packet one = internal::pset1<Packet>(1);
return internal::psub(one, internal::pmul(y, y));
}
};
namespace internal {
template <typename T>
struct functor_traits<scalar_tanh_fast_derivative_op<T> > {
enum {
Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 1,
PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasMul &&
packet_traits<T>::HasNegate
};
};
} // namespace internal
/**
* \ingroup CXX11_NeuralNetworks_Module
* \brief Template functor to clip the magnitude of the first scalar.
*
* \sa class CwiseBinaryOp, MatrixBase::Clip
*/
template <typename Scalar>
struct scalar_clip_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
operator()(const Scalar& a, const Scalar& b) const {
return numext::mini(numext::maxi(a, -b), b);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
packetOp(const Packet& a, const Packet& b) const {
return internal::pmin(internal::pmax(a, internal::pnegate(b)), b);
}
};
namespace internal {
template <typename Scalar>
struct functor_traits<scalar_clip_op<Scalar> > {
enum {
Cost = NumTraits<Scalar>::AddCost * 3,
PacketAccess = packet_traits<Scalar>::HasMax &&
packet_traits<Scalar>::HasMin &&
packet_traits<Scalar>::HasNegate
};
};
} // namespace internal
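// A minimal usage sketch (added comment, not part of the original header):
// scalar_clip_op is a binary functor, so it can be applied through
// TensorBase::binaryExpr; each element of `a` is clipped to the range
// [-b, b] given by the matching element of `b`.
//
//   Eigen::Tensor<float, 1> a(3), b(3);
//   a.setValues({-3.0f, 0.5f, 7.0f});
//   b.setValues({1.0f, 1.0f, 2.0f});
//   Eigen::Tensor<float, 1> clipped =
//       a.binaryExpr(b, Eigen::scalar_clip_op<float>());
//   // clipped == {-1.0f, 0.5f, 2.0f}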
} // end namespace Eigen
#endif // EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H
#define EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H
namespace Eigen {
/** ExtractGlimpses
* \ingroup CXX11_NeuralNetworks_Module
*
* \brief Extract glimpses from an input tensor.
*
* The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch).
* The width and height parameters specify the extension of the returned glimpses.
* The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension.
* The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension.
* The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center.
*
* The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch).
* The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size.
*/
namespace {
template <typename Index>
struct GlimpseExtractionOp {
GlimpseExtractionOp(const Index width, const Index height,
const std::vector<IndexPair<float> >& offsets,
const bool normalized,
const bool centered,
const bool uniform_noise) :
width_(width), height_(height), offsets_(offsets),
normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { }
template <typename Input>
DSizes<Index, 4> dimensions(const Input& input) const {
typedef typename internal::traits<Input>::Index IndexType;
typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
internal::traits<Input>::Layout, IndexType> > Ref;
Ref in(input);
DSizes<Index, 4> dims = in.dimensions();
dims[0] = in.dimension(0);
dims[1] = width_;
dims[2] = height_;
dims[3] = in.dimension(3);
return dims;
}
template <typename Input, typename Output, typename Device>
EIGEN_DEVICE_FUNC
void eval(const Input& input, Output& output, const Device& device) const
{
typedef typename internal::traits<Input>::Index IndexType;
typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
internal::traits<Input>::Layout, IndexType> > Ref;
Ref in(input);
const Index num_channels = in.dimension(0);
const Index input_width = in.dimension(1);
const Index input_height = in.dimension(2);
const Index batch_size = in.dimension(3);
eigen_assert(input_width > 0);
eigen_assert(input_height > 0);
for (Index i = 0; i < batch_size; ++i) {
float x = offsets_[i].first, y = offsets_[i].second;
// Un-normalize coordinates back to pixel space if normalized.
if (normalized_) {
x *= input_width;
y *= input_height;
}
// Un-center if coordinates are centered on the image center.
if (centered_) {
x /= 2.0f;
y /= 2.0f;
x += input_width / 2.0f;
y += input_height / 2.0f;
}
      // Shift from the glimpse center to its top-left corner by removing half of the glimpse window.
x -= width_ / 2.0f;
y -= height_ / 2.0f;
const Index offset_x = (Index) x;
const Index offset_y = (Index) y;
Index glimpse_width = width_;
Index glimpse_height = height_;
bool partial_overlap = false;
DSizes<Index, 3> slice_offset(0, offset_x, offset_y);
DSizes<Index, 3> slice_extent(num_channels, width_, height_);
DSizes<Index, 3> base_offset(0, 0, 0);
if (offset_x < 0) {
slice_offset[1] = 0;
glimpse_width = (std::max<Index>)(0, width_ + offset_x);
slice_extent[1] = glimpse_width;
base_offset[1] = width_ - glimpse_width;
partial_overlap = true;
} else if (offset_x + width_ >= input_width) {
glimpse_width = (std::max<Index>)(0, input_width - offset_x);
slice_extent[1] = glimpse_width;
partial_overlap = true;
}
if (offset_y < 0) {
slice_offset[2] = 0;
glimpse_height = (std::max<Index>)(0, height_ + offset_y);
slice_extent[2] = glimpse_height;
base_offset[2] = height_ - glimpse_height;
partial_overlap = true;
} else if (offset_y + height_ >= input_height) {
glimpse_height = (std::max<Index>)(0, input_height - offset_y);
slice_extent[2] = glimpse_height;
partial_overlap = true;
}
slice_extent[1] = std::min<Index>(input_width, slice_extent[1]);
slice_extent[2] = std::min<Index>(input_height, slice_extent[2]);
if (partial_overlap) {
if (uniform_noise_) {
// Initialize the glimpse with uniform noise.
typedef typename internal::remove_const<
typename internal::traits<Input>::Scalar>::type Scalar;
TensorFixedSize<Scalar, Sizes<> > mini;
mini.device(device) = input.template chip<3>(i).minimum();
TensorFixedSize<float, Sizes<> > range;
range.device(device) =
(input.template chip<3>(i).maximum() - mini).template cast<float>();
DSizes<Index, 3> glimpse_size(num_channels, width_, height_);
TensorMap<Tensor<float, 3> > tmp(NULL, glimpse_size);
output.template chip<3>(i).device(device) =
mini.reshape(Sizes<1,1,1>()).broadcast(glimpse_size) +
(tmp.random() * range.reshape(Sizes<1,1,1>()).broadcast(glimpse_size)).template cast<Scalar>();
} else {
// Initialize the glimpse with white noise: compute the mean and sigma
// of each channel, and use them to shape the gaussian.
DSizes<Index, 2> glimpse_size(width_, height_);
DSizes<Index, 2> input_size(input_width, input_height);
typedef typename internal::remove_const<
typename internal::traits<Input>::Scalar>::type Scalar;
for (int j = 0; j < num_channels; ++j) {
TensorFixedSize<Scalar, Sizes<> > mean;
mean.device(device) = input.template chip<3>(i).template chip<0>(j).template cast<float>().mean();
TensorFixedSize<float, Sizes<> > sigma;
sigma.device(device) =
(input.template chip<3>(i).template chip<0>(j).template cast<float>() - mean.reshape(Sizes<1,1>()).broadcast(input_size)).square().mean().sqrt();
TensorFixedSize<Scalar, Sizes<> > mini;
mini.device(device) = input.template chip<3>(i).template chip<0>(j).minimum();
TensorFixedSize<float, Sizes<> > maxi;
maxi.device(device) = input.template chip<3>(i).template chip<0>(j).maximum();
TensorMap<Tensor<float, 2> > tmp(NULL, glimpse_size);
output.template chip<3>(i).template chip<0>(j).device(device) =
(mean.reshape(Sizes<1,1>()).broadcast(glimpse_size) +
(tmp.random(internal::NormalRandomGenerator<float>()) * sigma.reshape(Sizes<1,1>()).broadcast(glimpse_size)).template cast<Scalar>()).cwiseMin(maxi.reshape(Sizes<1,1>()).broadcast(glimpse_size)).cwiseMax(mini.reshape(Sizes<1,1>()).broadcast(glimpse_size));
}
}
        // Copy the part of the glimpse that covers the input image, if any.
if (glimpse_width == 0 || glimpse_height == 0) {
continue;
}
output.template chip<3>(i).slice(base_offset, slice_extent).device(device) = input.template chip<3>(i).slice(slice_offset, slice_extent);
} else {
output.template chip<3>(i).device(device) = input.template chip<3>(i).slice(slice_offset, slice_extent);
}
}
}
private:
const Index width_;
const Index height_;
const std::vector<IndexPair<float> > offsets_;
const bool normalized_;
const bool centered_;
const bool uniform_noise_;
};
}
template <typename Input>
EIGEN_ALWAYS_INLINE
static const TensorCustomUnaryOp<const GlimpseExtractionOp<typename internal::traits<Input>::Index>, const Input>
ExtractGlimpses(const Input& input,
const typename internal::traits<Input>::Index width,
const typename internal::traits<Input>::Index height,
const std::vector<IndexPair<float> >& offsets,
const bool normalized = true, const bool centered = true,
const bool uniform_noise = true)
{
EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
typedef typename internal::traits<Input>::Index Index;
const GlimpseExtractionOp<Index> op(width, height, offsets, normalized,
centered, uniform_noise);
return input.customOp(op);
}
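// A minimal usage sketch (added comment, not part of the original header):
// extract one 8x8 glimpse per batch entry from a col-major
// (depth, x, y, batch) tensor, with offsets given relative to the image
// center (the defaults normalized = true and centered = true).
//
//   Eigen::Tensor<float, 4> input(3, 32, 32, 2);  // depth, x, y, batch
//   input.setRandom();
//   std::vector<Eigen::IndexPair<float> > offsets;
//   offsets.push_back(Eigen::IndexPair<float>(0.0f, 0.0f));    // centered
//   offsets.push_back(Eigen::IndexPair<float>(0.25f, -0.25f)); // shifted
//   Eigen::Tensor<float, 4> glimpses(3, 8, 8, 2);
//   glimpses = Eigen::ExtractGlimpses(input, 8, 8, offsets);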
} // end namespace Eigen
#endif // EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H
#ifndef EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H
#define EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H
#include "Patch3d.h"
namespace Eigen {
/** CuboidConvolutionBackwardInput
* \ingroup CXX11_NeuralNetworks_Module
*
* \brief Computes the backprop for the input of a 3D convolution.
*
 * The output_backward parameter is expected to be a tensor with a rank of 4 or more (filters, depth, height, width, and optionally others).
 * The kernel parameter is expected to be a 5D tensor (filters, channels, kernel_depth, kernel_height, kernel_width).
 * output_backward and kernel have to be in the same layout.
 *
 * The dimensions of the result will be channels, depth, height, width (and others if applicable).
*
* It is possible to swap the order of the depth, width and height dimensions provided that the same order is used in the input, the kernel, and the output.
*
* All dimension orders above are given for col-major, and should be reversed for row-major.
*/
template <typename OutputBackward, typename Kernel>
EIGEN_ALWAYS_INLINE static const typename internal::conditional<
internal::traits<OutputBackward>::Layout == ColMajor,
TensorReshapingOp<
const DSizes<typename internal::traits<OutputBackward>::Index,
internal::traits<OutputBackward>::NumDimensions>,
const TensorContractionOp<
const array< IndexPair<typename internal::traits<OutputBackward>::Index>, 2>,
const TensorReshapingOp<
const DSizes< typename internal::traits<OutputBackward>::Index, 3>,
const TensorReverseOp<const array<bool, 5>, const Kernel>
>,
const TensorReshapingOp<
const DSizes< typename internal::traits<OutputBackward>::Index, 3>,
const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward>
>
>
>,
TensorReshapingOp<
const DSizes<typename internal::traits<OutputBackward>::Index,
internal::traits<OutputBackward>::NumDimensions>,
const TensorContractionOp<
const array< IndexPair<typename internal::traits<OutputBackward>::Index>, 2>,
const TensorReshapingOp<
const DSizes< typename internal::traits<OutputBackward>::Index, 3>,
const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward>
>,
const TensorReshapingOp<
const DSizes<typename internal::traits<OutputBackward>::Index, 3>,
const TensorReverseOp<const array<bool, 5>, const Kernel>
>
>
>
>::type
CuboidConvolutionBackwardInput(
const Kernel& kernel, const OutputBackward& output_backward,
typename internal::traits<OutputBackward>::Index inputPlanes,
typename internal::traits<OutputBackward>::Index inputRows,
typename internal::traits<OutputBackward>::Index inputCols,
const DenseIndex stridePlanes = 1, const DenseIndex strideRows = 1,
const DenseIndex strideCols = 1) {
typedef typename internal::traits<OutputBackward>::Index TensorIndex;
const TensorRef<const Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel);
const TensorRef<const Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward);
EIGEN_STATIC_ASSERT(internal::traits<Kernel>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE);
static const bool isColMajor = (internal::traits<OutputBackward>::Layout == ColMajor);
static const int NumDims = internal::traits<OutputBackward>::NumDimensions;
// Number of filters to apply. This is the same as the output depth of the result
const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[4];
// Number of channels. This is the same as the input depth.
const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[3];
const TensorIndex kernelPlanes = isColMajor ? kern.dimensions()[2] : kern.dimensions()[2];
const TensorIndex kernelRows = isColMajor ? kern.dimensions()[3] : kern.dimensions()[1];
const TensorIndex kernelCols = isColMajor ? kern.dimensions()[4] : kern.dimensions()[0];
const TensorIndex outputPlanes = isColMajor ? out.dimensions()[1] : out.dimensions()[NumDims - 2];
const TensorIndex outputRows = isColMajor ? out.dimensions()[2] : out.dimensions()[NumDims - 3];
const TensorIndex outputCols = isColMajor ? out.dimensions()[3] : out.dimensions()[NumDims - 4];
TensorIndex forward_pad_z, forward_pad_y, forward_pad_x;
const TensorIndex size_z = ceil(inputPlanes / static_cast<float>(stridePlanes));
const TensorIndex size_y = ceil(inputRows / static_cast<float>(strideRows));
const TensorIndex size_x = ceil(inputCols / static_cast<float>(strideCols));
// Infer padding type.
if (size_z == outputPlanes && size_y == outputRows && size_x == outputCols) {
// SAME padding.
const TensorIndex dz = size_z * stridePlanes + kernelPlanes - 1 - inputPlanes;
const TensorIndex dy = size_y * strideRows + kernelRows - 1 - inputRows;
const TensorIndex dx = size_x * strideCols + kernelCols - 1 - inputCols;
forward_pad_z = dz - dz / 2;
forward_pad_y = dy - dy / 2;
forward_pad_x = dx - dx / 2;
} else {
// VALID padding.
forward_pad_z = 0;
forward_pad_y = 0;
forward_pad_x = 0;
}
const TensorIndex padding_ztop = kernelPlanes - 1 - forward_pad_z;
const TensorIndex padding_top = kernelRows - 1 - forward_pad_y;
const TensorIndex padding_left = kernelCols - 1 - forward_pad_x;
const TensorIndex padding_zbottom = inputPlanes + kernelPlanes - 1 - (outputPlanes - 1) * stridePlanes - 1 - padding_ztop;
const TensorIndex padding_bottom = inputRows + kernelRows - 1 - (outputRows - 1) * strideRows - 1 - padding_top;
const TensorIndex padding_right = inputCols + kernelCols - 1 - (outputCols - 1) * strideCols - 1 - padding_left;
eigen_assert(padding_ztop >= 0);
eigen_assert(padding_zbottom >= 0);
eigen_assert(padding_top >= 0);
eigen_assert(padding_left >= 0);
eigen_assert(padding_bottom >= 0);
eigen_assert(padding_right >= 0);
// The kernel has dimensions filters X channels X patch_planes X patch_rows X patch_cols.
// We need to reverse the kernel along the spatial dimensions.
array<bool, 5> kernel_reverse;
if (isColMajor) {
kernel_reverse[0] = false;
kernel_reverse[1] = false;
kernel_reverse[2] = true;
kernel_reverse[3] = true;
kernel_reverse[4] = true;
} else {
kernel_reverse[0] = true;
kernel_reverse[1] = true;
kernel_reverse[2] = true;
kernel_reverse[3] = false;
kernel_reverse[4] = false;
}
DSizes<TensorIndex, 3> kernel_dims;
if (isColMajor) {
kernel_dims[0] = kernelFilters;
kernel_dims[1] = kernelChannels;
kernel_dims[2] = kernelRows * kernelCols * kernelPlanes;
} else {
kernel_dims[0] = kernelRows * kernelCols * kernelPlanes;
kernel_dims[1] = kernelChannels;
kernel_dims[2] = kernelFilters;
}
// The output_backward has dimensions out_depth X out_planes X out_rows X out_cols X OTHERS
// When we extract the image patches from output_backward, it will have dimensions:
// out_depth X (patch_planes * patch_rows * patch_cols) X (input_planes * input_rows * input_cols * OTHERS)
DSizes<TensorIndex, 3> pre_contract_dims;
if (isColMajor) {
pre_contract_dims[0] = kernelFilters;
pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes;
pre_contract_dims[2] = inputRows * inputCols * inputPlanes;
for (int i = 4; i < NumDims; ++i) {
pre_contract_dims[2] *= out.dimension(i);
}
} else {
pre_contract_dims[2] = kernelFilters;
pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes;
pre_contract_dims[0] = inputRows * inputCols * inputPlanes;
for (int i = 0; i < NumDims - 4; ++i) {
pre_contract_dims[0] *= out.dimension(i);
}
}
// We will contract along dimensions (0, 2) in kernel and (0, 1) in
// output_backward, if this is col-major, and
  // dimensions (0, 2) in kernel and (1, 2) in output_backward, if this is row-major.
array<IndexPair<TensorIndex>, 2> contract_dims;
if (isColMajor) {
// col-major: kernel.contract(output.patches)
contract_dims[0] = IndexPair<TensorIndex>(0, 0);
contract_dims[1] = IndexPair<TensorIndex>(2, 1);
} else {
// row-major: output.patches.contract(kernel)
contract_dims[0] = IndexPair<TensorIndex>(1, 0);
contract_dims[1] = IndexPair<TensorIndex>(2, 2);
}
// Post contraction, the dimensions of the input_backprop is
// channels X input_planes X input_rows X input_cols X OTHERS
DSizes<TensorIndex, NumDims> post_contract_dims;
if (isColMajor) {
post_contract_dims[0] = kernelChannels;
post_contract_dims[1] = inputPlanes;
post_contract_dims[2] = inputRows;
post_contract_dims[3] = inputCols;
for (int i = 4; i < NumDims; ++i) {
post_contract_dims[i] = out.dimension(i);
}
} else {
post_contract_dims[NumDims - 1] = kernelChannels;
post_contract_dims[NumDims - 2] = inputPlanes;
post_contract_dims[NumDims - 3] = inputRows;
post_contract_dims[NumDims - 4] = inputCols;
for (int i = 0; i < NumDims - 4; ++i) {
post_contract_dims[i] = out.dimension(i);
}
}
DSizes<TensorIndex, NumDims> strides;
for (int i = 0; i < NumDims; i++) {
strides[i] = 1;
}
if (isColMajor) {
strides[1] = stridePlanes;
strides[2] = strideRows;
strides[3] = strideCols;
} else {
strides[NumDims - 2] = stridePlanes;
strides[NumDims - 3] = strideRows;
strides[NumDims - 4] = strideCols;
}
return choose(
Cond<internal::traits<OutputBackward>::Layout == ColMajor>(),
kernel.reverse(kernel_reverse)
.reshape(kernel_dims)
.contract(
output_backward.extract_volume_patches(kernelPlanes, kernelRows, kernelCols,
1, 1, 1, stridePlanes, strideRows, strideCols,
padding_ztop, padding_zbottom,
padding_top, padding_bottom,
padding_left, padding_right)
.reshape(pre_contract_dims),
contract_dims)
.reshape(post_contract_dims),
output_backward.extract_volume_patches(kernelPlanes, kernelRows, kernelCols,
1, 1, 1, stridePlanes, strideRows, strideCols,
padding_ztop, padding_zbottom,
padding_top, padding_bottom,
padding_left, padding_right)
.reshape(pre_contract_dims)
.contract(kernel.reverse(kernel_reverse).reshape(kernel_dims),
contract_dims)
.reshape(post_contract_dims));
}
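// A minimal usage sketch (added comment, not part of the original header),
// col-major layout: recover the gradient w.r.t. a
// (channels = 2, planes = 5, rows = 5, cols = 5, batch = 1) input from a
// 4-filter 3x3x3 kernel and the matching output gradient (VALID padding,
// unit strides).
//
//   Eigen::Tensor<float, 5> kernel(4, 2, 3, 3, 3);           // filters, channels, P, R, C
//   Eigen::Tensor<float, 5> output_backward(4, 3, 3, 3, 1);  // filters, P, R, C, batch
//   kernel.setRandom();
//   output_backward.setRandom();
//   Eigen::Tensor<float, 5> input_backward(2, 5, 5, 5, 1);
//   input_backward = Eigen::CuboidConvolutionBackwardInput(
//       kernel, output_backward, /*inputPlanes=*/5, /*inputRows=*/5,
//       /*inputCols=*/5);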
/** CuboidConvolutionBackwardKernel
* \ingroup CXX11_NeuralNetworks_Module
*
* \brief Computes the backprop for the filter of a 3D convolution.
*
 * The input parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others).
 * The output_backward parameter is expected to have the same rank (filters, depth, height, width, and optionally others).
 * input and output_backward have to be in the same layout.
 *
 * The result is the gradient of the kernel: a 5D tensor with dimensions filters, channels, kernel_depth, kernel_height, kernel_width.
*
* It is possible to swap the order of the depth, width and height dimensions provided that the same order is used in the input, the kernel, and the output.
*
* All dimension orders above are given for col-major, and should be reversed for row-major.
*/
template <typename OutputBackward, typename Input>
EIGEN_ALWAYS_INLINE static const typename internal::conditional<
internal::traits<OutputBackward>::Layout == ColMajor,
const TensorShufflingOp<
const array<typename internal::traits<OutputBackward>::Index, 5>,
const TensorReverseOp<
const array<bool, 5>,
const TensorReshapingOp<
const DSizes<typename internal::traits<OutputBackward>::Index, 5>,
const TensorContractionOp<
const array< IndexPair<typename internal::traits<Input>::Index>, 2>,
const TensorReshapingOp<
const DSizes<typename internal::traits<Input>::Index, 3>,
const Input>,
const TensorReshapingOp<
const DSizes< typename internal::traits<OutputBackward>::Index, 4>,
const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward>
>
>
>
>
>,
const TensorShufflingOp<
const array<typename internal::traits<OutputBackward>::Index, 5>,
const TensorReverseOp<
const array<bool, 5>,
const TensorReshapingOp<
const DSizes<typename internal::traits<OutputBackward>::Index, 5>,
const TensorContractionOp<
const array< IndexPair<typename internal::traits<Input>::Index>, 2>,
const TensorReshapingOp<
const DSizes< typename internal::traits<OutputBackward>::Index, 4>,
const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward>
>,
const TensorReshapingOp<
const DSizes<typename internal::traits<Input>::Index, 3>,
const Input
>
>
>
>
>
>::type
CuboidConvolutionBackwardKernel(
const Input& input, const OutputBackward& output_backward,
typename internal::traits<Input>::Index kernelPlanes,
typename internal::traits<Input>::Index kernelRows,
typename internal::traits<Input>::Index kernelCols,
const DenseIndex stridePlanes = 1,
const DenseIndex strideRows = 1,
const DenseIndex strideCols = 1) {
typedef typename internal::traits<Input>::Index TensorIndex;
TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
TensorRef<Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward);
EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE);
static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
static const int NumDims = internal::traits<Input>::NumDimensions;
EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == internal::traits<OutputBackward>::NumDimensions, YOU_MADE_A_PROGRAMMING_MISTAKE);
const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4);
const TensorIndex outputPlanes = isColMajor ? out.dimension(1) : out.dimension(NumDims - 2);
const TensorIndex outputRows = isColMajor ? out.dimension(2) : out.dimension(NumDims - 3);
const TensorIndex outputCols = isColMajor ? out.dimension(3) : out.dimension(NumDims - 4);
const TensorIndex kernelFilters = isColMajor ? out.dimension(0) : out.dimension(NumDims - 1);
const TensorIndex kernelChannels = isColMajor ? in.dimension(0) : in.dimension(NumDims - 1);
TensorIndex forward_pad_z, forward_pad_y, forward_pad_x;
const TensorIndex size_z = ceil(inputPlanes / static_cast<float>(stridePlanes));
const TensorIndex size_y = ceil(inputRows / static_cast<float>(strideRows));
const TensorIndex size_x = ceil(inputCols / static_cast<float>(strideCols));
// Infer padding type.
if (size_z == outputPlanes && size_y == outputRows && size_x == outputCols) {
// SAME padding.
const TensorIndex dz = size_z * stridePlanes + kernelPlanes - 1 - inputPlanes;
const TensorIndex dy = size_y * strideRows + kernelRows - 1 - inputRows;
const TensorIndex dx = size_x * strideCols + kernelCols - 1 - inputCols;
forward_pad_z = dz - dz / 2;
forward_pad_y = dy - dy / 2;
forward_pad_x = dx - dx / 2;
} else {
// VALID padding.
forward_pad_z = 0;
forward_pad_y = 0;
forward_pad_x = 0;
}
const TensorIndex padding_ztop = kernelPlanes - 1 - forward_pad_z;
const TensorIndex padding_top = kernelRows - 1 - forward_pad_y;
const TensorIndex padding_left = kernelCols - 1 - forward_pad_x;
const TensorIndex padding_zbottom = inputPlanes + kernelPlanes - 1 - (outputPlanes - 1) * stridePlanes - 1 - padding_ztop;
const TensorIndex padding_bottom = inputRows + kernelRows - 1 - (outputRows - 1) * strideRows - 1 - padding_top;
const TensorIndex padding_right = inputCols + kernelCols - 1 - (outputCols - 1) * strideCols - 1 - padding_left;
eigen_assert(padding_ztop >= 0);
eigen_assert(padding_zbottom >= 0);
eigen_assert(padding_top >= 0);
eigen_assert(padding_left >= 0);
eigen_assert(padding_bottom >= 0);
eigen_assert(padding_right >= 0);
  // The output_backward has dimensions out_depth X out_planes X out_rows X out_cols X OTHERS
// When we extract the image patches from output_backward (with input as the
// kernel), it will have dimensions
// (out_depth) X (input_planes * input_rows * input_cols) X (kernel_planes * kernel_rows * kernel_cols) X OTHERS
DSizes<TensorIndex, 4> pre_contract_dims;
if (isColMajor) {
pre_contract_dims[0] = kernelFilters;
pre_contract_dims[1] = inputRows * inputCols * inputPlanes;
pre_contract_dims[2] = kernelRows * kernelCols * kernelPlanes;
pre_contract_dims[3] = 1;
for (int i = 4; i < NumDims; ++i) {
pre_contract_dims[3] *= out.dimension(i);
}
} else {
pre_contract_dims[3] = kernelFilters;
pre_contract_dims[2] = inputRows * inputCols * inputPlanes;
pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes;
pre_contract_dims[0] = 1;
for (int i = 0; i < NumDims - 4; ++i) {
pre_contract_dims[0] *= out.dimension(i);
}
}
// The input has dimensions in_depth X (input_planes * input_rows * input_cols) X OTHERS
DSizes<TensorIndex, 3> input_dims;
if (isColMajor) {
input_dims[0] = kernelChannels;
input_dims[1] = inputRows * inputCols * inputPlanes;
input_dims[2] = 1;
for (int i = 4; i < NumDims; ++i) {
input_dims[2] *= in.dimension(i);
}
eigen_assert(input_dims[2] == pre_contract_dims[3]);
} else {
input_dims[2] = kernelChannels;
input_dims[1] = inputRows * inputCols * inputPlanes;
input_dims[0] = 1;
for (int i = 0; i < NumDims - 4; ++i) {
input_dims[0] *= in.dimension(i);
}
eigen_assert(input_dims[0] == pre_contract_dims[0]);
}
// We will contract along dimensions (1, 2) in in and (1, 3) in out, if
// this is col-major.
// For row-major, it's dimensions (0, 1) in in and (0, 2) in out.
array<IndexPair<TensorIndex>, 2> contract_dims;
if (isColMajor) {
// col-major: in.contract(output.patches)
contract_dims[0] = IndexPair<TensorIndex>(1, 1);
contract_dims[1] = IndexPair<TensorIndex>(2, 3);
} else {
// row-major: output.patches.contract(in)
contract_dims[0] = IndexPair<TensorIndex>(0, 0);
contract_dims[1] = IndexPair<TensorIndex>(2, 1);
}
  // After the contraction, the kernel will have dimension
  // in_depth X out_depth X kernel_planes X kernel_rows X kernel_cols.
  // We will need to shuffle the first two dimensions and reverse the spatial dimensions.
  // The end shape is:
  // out_depth X in_depth X kernel_planes X kernel_rows X kernel_cols.
  // kernel_dims below is the shape of the kernel *before* the shuffling.
DSizes<TensorIndex, 5> kernel_dims;
if (isColMajor) {
kernel_dims[0] = kernelChannels;
kernel_dims[1] = kernelFilters;
kernel_dims[2] = kernelPlanes;
kernel_dims[3] = kernelRows;
kernel_dims[4] = kernelCols;
} else {
kernel_dims[0] = kernelCols;
kernel_dims[1] = kernelRows;
kernel_dims[2] = kernelPlanes;
kernel_dims[3] = kernelFilters;
kernel_dims[4] = kernelChannels;
}
// Flip filters and channels.
array<TensorIndex, 5> kernel_shuffle;
if (isColMajor) {
kernel_shuffle[0] = 1;
kernel_shuffle[1] = 0;
kernel_shuffle[2] = 2;
kernel_shuffle[3] = 3;
kernel_shuffle[4] = 4;
} else {
kernel_shuffle[0] = 0;
kernel_shuffle[1] = 1;
kernel_shuffle[2] = 2;
kernel_shuffle[3] = 4;
kernel_shuffle[4] = 3;
}
// Reverse the spatial dimensions.
array<bool, 5> kernel_reverse;
if (isColMajor) {
kernel_reverse[0] = false;
kernel_reverse[1] = false;
kernel_reverse[2] = true;
kernel_reverse[3] = true;
kernel_reverse[4] = true;
} else {
kernel_reverse[0] = true;
kernel_reverse[1] = true;
kernel_reverse[2] = true;
kernel_reverse[3] = false;
kernel_reverse[4] = false;
}
DSizes<TensorIndex, NumDims> strides;
for (int i = 0; i < NumDims; i++) {
strides[i] = 1;
}
if (isColMajor) {
strides[1] = stridePlanes;
strides[2] = strideRows;
strides[3] = strideCols;
} else {
strides[NumDims - 2] = stridePlanes;
strides[NumDims - 3] = strideRows;
strides[NumDims - 4] = strideCols;
}
return choose(
Cond<internal::traits<Input>::Layout == ColMajor>(),
input.reshape(input_dims)
.contract(
output_backward.extract_volume_patches(
inputPlanes, inputRows, inputCols, 1,
1, 1, stridePlanes, strideRows, strideCols,
padding_ztop, padding_zbottom, padding_top,
padding_bottom, padding_left, padding_right)
.reshape(pre_contract_dims),
contract_dims)
.reshape(kernel_dims)
.reverse(kernel_reverse)
.shuffle(kernel_shuffle),
output_backward.extract_volume_patches(
inputPlanes, inputRows, inputCols, 1, 1, 1,
stridePlanes, strideRows, strideCols, padding_ztop,
padding_zbottom, padding_top, padding_bottom,
padding_left, padding_right)
.reshape(pre_contract_dims)
.contract(input.reshape(input_dims), contract_dims)
.reshape(kernel_dims)
.reverse(kernel_reverse)
.shuffle(kernel_shuffle));
}
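// A minimal usage sketch (added comment, not part of the original header),
// col-major layout: compute the kernel gradient of a 3x3x3 convolution from
// the forward input and the backpropagated output gradient (VALID padding,
// unit strides).
//
//   Eigen::Tensor<float, 5> input(2, 5, 5, 5, 1);            // channels, P, R, C, batch
//   Eigen::Tensor<float, 5> output_backward(4, 3, 3, 3, 1);  // filters, P, R, C, batch
//   input.setRandom();
//   output_backward.setRandom();
//   Eigen::Tensor<float, 5> kernel_backward(4, 2, 3, 3, 3);  // filters, channels, P, R, C
//   kernel_backward = Eigen::CuboidConvolutionBackwardKernel(
//       input, output_backward, /*kernelPlanes=*/3, /*kernelRows=*/3,
//       /*kernelCols=*/3);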
} // end namespace Eigen
#endif // EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Ke Yang <yangke@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H
#define EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H
namespace Eigen {
/** SpatialConvolutionBackwardInput
* \ingroup CXX11_NeuralNetworks_Module
*
* \brief Computes the backprop for the input of a 2D convolution.
*
 * The output_backward parameter is expected to be a tensor with a rank of 3 or more (filters, height, width, and optionally others).
 * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width).
 * The output_backward and the kernel have to be in the same layout. The result will be in that same layout.
 *
 * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels.
 *
 * The result can be assigned to a tensor of rank equal to the rank of the output_backward. The dimensions of the result will be channels, height, width (and others if applicable).
*
* It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output.
*
*/
template <typename OutputBackward, typename Kernel>
EIGEN_ALWAYS_INLINE
static const typename internal::conditional<
internal::traits<OutputBackward>::Layout == ColMajor,
TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, internal::traits<OutputBackward>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<OutputBackward>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorReverseOp<const array<bool, 4>, const Kernel> >, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> > > >,
TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, internal::traits<OutputBackward>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<OutputBackward>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> >, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorReverseOp<const array<bool, 4>, const Kernel> > > > >::type
SpatialConvolutionBackwardInput(const Kernel& kernel, const OutputBackward& output_backward, typename internal::traits<OutputBackward>::Index inputRows, typename internal::traits<OutputBackward>::Index inputCols, const DenseIndex stride = 1, const DenseIndex in_stride = 1) {
typedef typename internal::traits<OutputBackward>::Index TensorIndex;
TensorRef<Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel);
TensorRef<Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward);
EIGEN_STATIC_ASSERT(internal::traits<Kernel>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE);
static const bool isColMajor = (internal::traits<OutputBackward>::Layout == ColMajor);
static const int NumDims = internal::traits<OutputBackward>::NumDimensions;
// Number of filters to apply. This is the same as the output depth of the result
const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3];
// Number of channels. This is the same as the input depth.
const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2];
const TensorIndex kernelRows = isColMajor ? kern.dimensions()[2] : kern.dimensions()[1];
const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
// This is the effective kernel size, taking into account the (in_stride - 1) zero-values
// inserted between consecutive kernel elements in atrous convolution
const TensorIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1);
const TensorIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1);
const TensorIndex outputRows = isColMajor ? output_backward.dimension(1) : output_backward.dimension(NumDims - 2);
const TensorIndex outputCols = isColMajor ? output_backward.dimension(2) : output_backward.dimension(NumDims - 3);
// Computing the forward padding
const TensorIndex forward_pad_top = ((outputRows - 1) * stride + kernelRowsEff - inputRows) / 2;
const TensorIndex forward_pad_left = ((outputCols - 1) * stride + kernelColsEff - inputCols) / 2;
const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top;
const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left;
const TensorIndex padding_bottom = inputRows + kernelRowsEff - 1 - (outputRows - 1) * stride - 1 - padding_top;
const TensorIndex padding_right = inputCols + kernelColsEff - 1 - (outputCols - 1) * stride - 1 - padding_left;
eigen_assert(padding_top >= 0);
eigen_assert(padding_left >= 0);
eigen_assert(padding_bottom >= 0);
eigen_assert(padding_right >= 0);
// The kernel has dimensions filters X channels X patch_rows X patch_cols
// We need to reverse the kernel along dimensions corresponding to rows and
// cols.
// TODO(yangke): we can make things slightly faster by collapsing the dimensions
// where we don't reverse. Try that once we have a faster compiler.
array<bool, 4> kernel_reverse;
if (isColMajor) {
kernel_reverse[0] = false;
kernel_reverse[1] = false;
kernel_reverse[2] = true;
kernel_reverse[3] = true;
} else {
kernel_reverse[0] = true;
kernel_reverse[1] = true;
kernel_reverse[2] = false;
kernel_reverse[3] = false;
}
DSizes<TensorIndex, 3> kernel_dims;
if (isColMajor) {
kernel_dims[0] = kernelFilters;
kernel_dims[1] = kernelChannels;
kernel_dims[2] = kernelRows * kernelCols;
} else {
kernel_dims[0] = kernelRows * kernelCols;
kernel_dims[1] = kernelChannels;
kernel_dims[2] = kernelFilters;
}
// The output_backward has dimensions out_depth X out_rows X out_cols X OTHERS
// When we extract the image patches from output_backward, it will have dimensions
// out_depth X (patch_rows * patch_cols) X (input_rows * input_cols * OTHERS)
DSizes<TensorIndex, 3> pre_contract_dims;
if (isColMajor) {
pre_contract_dims[0] = kernelFilters;
pre_contract_dims[1] = kernelRows * kernelCols;
pre_contract_dims[2] = inputRows * inputCols;
for (int i = 3; i < NumDims; ++i) {
pre_contract_dims[2] *= out.dimension(i);
}
} else {
pre_contract_dims[2] = kernelFilters;
pre_contract_dims[1] = kernelRows * kernelCols;
pre_contract_dims[0] = inputRows * inputCols;
for (int i = 0; i < NumDims - 3; ++i) {
pre_contract_dims[0] *= out.dimension(i);
}
}
// We will contract along dimensions (0, 2) in kernel and (0, 1) in
// output_backward, if this is col-major, and
  // dimensions (0, 2) in kernel and (1, 2) in output_backward, if this is row-major.
array<IndexPair<TensorIndex>, 2> contract_dims;
if (isColMajor) {
// col-major: kernel.contract(output.patches)
contract_dims[0] = IndexPair<TensorIndex>(0, 0);
contract_dims[1] = IndexPair<TensorIndex>(2, 1);
} else {
// row-major: output.patches.contract(kernel)
contract_dims[0] = IndexPair<TensorIndex>(1, 0);
contract_dims[1] = IndexPair<TensorIndex>(2, 2);
}
// Post contraction, the dimensions of the input_backprop is
// channels X input_rows X input_cols X OTHERS
DSizes<TensorIndex, NumDims> post_contract_dims;
if (isColMajor) {
post_contract_dims[0] = kernelChannels;
post_contract_dims[1] = inputRows;
post_contract_dims[2] = inputCols;
for (int i = 3; i < NumDims; ++i) {
post_contract_dims[i] = out.dimension(i);
}
} else {
post_contract_dims[NumDims - 1] = kernelChannels;
post_contract_dims[NumDims - 2] = inputRows;
post_contract_dims[NumDims - 3] = inputCols;
for (int i = 0; i < NumDims - 3; ++i) {
post_contract_dims[i] = out.dimension(i);
}
}
return choose(Cond<internal::traits<OutputBackward>::Layout == ColMajor>(),
kernel.reverse(kernel_reverse).reshape(kernel_dims).contract(output_backward.extract_image_patches(kernelRows, kernelCols, 1, 1, in_stride, in_stride, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims), contract_dims).reshape(post_contract_dims),
output_backward.extract_image_patches(kernelRows, kernelCols, 1, 1, in_stride, in_stride, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).contract(kernel.reverse(kernel_reverse).reshape(kernel_dims), contract_dims).reshape(post_contract_dims));
}
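// A minimal usage sketch (added comment, not part of the original header),
// col-major layout, VALID padding and unit strides:
//
//   Eigen::Tensor<float, 4> kernel(8, 3, 3, 3);             // filters, channels, R, C
//   Eigen::Tensor<float, 4> output_backward(8, 30, 30, 1);  // filters, rows, cols, batch
//   kernel.setRandom();
//   output_backward.setRandom();
//   Eigen::Tensor<float, 4> input_backward(3, 32, 32, 1);
//   input_backward = Eigen::SpatialConvolutionBackwardInput(
//       kernel, output_backward, /*inputRows=*/32, /*inputCols=*/32);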
/** SpatialConvolutionBackwardKernel
* \ingroup CXX11_NeuralNetworks_Module
*
* \brief Computes the backprop for the filter of a 2D convolution.
*
 * The input parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others).
 * The output_backward parameter is expected to have the same rank (filters, height, width, and optionally others).
 * input and output_backward have to be in the same layout. The result will be in that same layout.
 *
 * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels.
 *
 * The result is the gradient of the kernel: a 4D tensor with dimensions filters, channels, kernel_height, kernel_width (for col-major; reversed for row-major).
*
* It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output.
*
*/
// TODO(gpapan): Resolve a bug in TensorContractionInputMapper at SpatialConvolutions.h that yangke circumvented by using .reshape().reshape().
// This can significantly accelerate SpatialConvolutionBackwardKernel.
template <typename OutputBackward, typename Input>
EIGEN_ALWAYS_INLINE
static const typename internal::conditional<
internal::traits<OutputBackward>::Layout == ColMajor,
const TensorShufflingOp<const array<typename internal::traits<OutputBackward>::Index, 4>, const TensorReverseOp<const array<bool, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 3>, const Input>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> > > > > > >,
const TensorShufflingOp<const array<typename internal::traits<OutputBackward>::Index, 4>, const TensorReverseOp<const array<bool, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> > >, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 3>, const Input> > > > > >::type
SpatialConvolutionBackwardKernel(const Input& input, const OutputBackward& output_backward, typename internal::traits<Input>::Index kernelRows, typename internal::traits<Input>::Index kernelCols, const DenseIndex stride = 1, const DenseIndex in_stride = 1) {
typedef typename internal::traits<Input>::Index TensorIndex;
TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
TensorRef<Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward);
EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE);
// stride and in_stride cannot both be larger than 1
eigen_assert(!(stride > 1 && in_stride > 1));
static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
static const int NumDims = internal::traits<Input>::NumDimensions;
EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == internal::traits<OutputBackward>::NumDimensions, YOU_MADE_A_PROGRAMMING_MISTAKE);
const TensorIndex inputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
const TensorIndex inputCols = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
const TensorIndex outputRows = isColMajor ? output_backward.dimension(1) : output_backward.dimension(NumDims - 2);
const TensorIndex outputCols = isColMajor ? output_backward.dimension(2) : output_backward.dimension(NumDims - 3);
// Number of filters to apply. This is the same as the output depth of the result
const TensorIndex kernelFilters = isColMajor ? out.dimensions()[0] : out.dimensions()[NumDims - 1];
// Number of channels. This is the same as the input depth.
const TensorIndex kernelChannels = isColMajor ? in.dimensions()[0] : in.dimensions()[NumDims - 1];
// This is the effective kernel size, taking into account the (in_stride - 1) zero-values
// inserted between consecutive kernel elements in atrous convolution
const TensorIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1);
const TensorIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1);
// Computing the forward padding
const TensorIndex forward_pad_top = ((outputRows - 1) * stride + kernelRowsEff - inputRows) / 2;
const TensorIndex forward_pad_left = ((outputCols - 1) * stride + kernelColsEff - inputCols) / 2;
// TODO: factor out the padding computation.
const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top;
const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left;
const TensorIndex padding_bottom = inputRows + kernelRowsEff - 1 - (outputRows - 1) * stride - 1 - padding_top;
const TensorIndex padding_right = inputCols + kernelColsEff - 1 - (outputCols - 1) * stride - 1 - padding_left;
eigen_assert(padding_top >= 0);
eigen_assert(padding_left >= 0);
eigen_assert(padding_bottom >= 0);
eigen_assert(padding_right >= 0);
// The output_backward has dimensions out_depth X out_rows X out_cols X OTHERS
// When we extract the image patches from output_backward (with input as the
// kernel), it will have dimensions
// (out_depth) X (input_rows * input_cols) X (kernel_rows * kernel_cols) X OTHERS
DSizes<TensorIndex, 4> pre_contract_dims;
if (isColMajor) {
pre_contract_dims[0] = kernelFilters;
pre_contract_dims[1] = inputRows * inputCols;
pre_contract_dims[2] = kernelRows * kernelCols;
pre_contract_dims[3] = 1;
for (int i = 3; i < NumDims; ++i) {
pre_contract_dims[3] *= out.dimension(i);
}
} else {
pre_contract_dims[3] = kernelFilters;
pre_contract_dims[2] = inputRows * inputCols;
pre_contract_dims[1] = kernelRows * kernelCols;
pre_contract_dims[0] = 1;
for (int i = 0; i < NumDims - 3; ++i) {
pre_contract_dims[0] *= out.dimension(i);
}
}
// The input has dimensions in_depth X (input_rows * input_cols) X OTHERS
DSizes<TensorIndex, 3> input_dims;
if (isColMajor) {
input_dims[0] = kernelChannels;
input_dims[1] = inputRows * inputCols;
input_dims[2] = 1;
for (int i = 3; i < NumDims; ++i) {
input_dims[2] *= in.dimension(i);
}
eigen_assert(input_dims[2] == pre_contract_dims[3]);
} else {
input_dims[2] = kernelChannels;
input_dims[1] = inputRows * inputCols;
input_dims[0] = 1;
for (int i = 0; i < NumDims - 3; ++i) {
input_dims[0] *= in.dimension(i);
}
eigen_assert(input_dims[0] == pre_contract_dims[0]);
}
// We will contract along dimensions (1, 2) in in and (1, 3) in out, if
// this is col-major.
// For row-major, it's dimensions (0, 1) in in and (0, 2) in out.
array<IndexPair<TensorIndex>, 2> contract_dims;
if (isColMajor) {
// col-major: in.contract(output.patches)
contract_dims[0] = IndexPair<TensorIndex>(1, 1);
contract_dims[1] = IndexPair<TensorIndex>(2, 3);
} else {
// row-major: output.patches.contract(in)
contract_dims[0] = IndexPair<TensorIndex>(0, 0);
contract_dims[1] = IndexPair<TensorIndex>(2, 1);
}
  // After the contraction, the kernel will have dimension
  // in_depth X out_depth X kernel_rows X kernel_cols.
  // We will need to shuffle the first two dimensions and reverse the latter
  // two dimensions.
  // The end shape is:
  // out_depth X in_depth X kernel_rows X kernel_cols.
  // kernel_dims below is the shape of the kernel *before* the shuffling.
DSizes<TensorIndex, 4> kernel_dims;
if (isColMajor) {
kernel_dims[0] = kernelChannels;
kernel_dims[1] = kernelFilters;
kernel_dims[2] = kernelRows;
kernel_dims[3] = kernelCols;
} else {
kernel_dims[0] = kernelCols;
kernel_dims[1] = kernelRows;
kernel_dims[2] = kernelFilters;
kernel_dims[3] = kernelChannels;
}
array<TensorIndex, 4> kernel_shuffle;
if (isColMajor) {
kernel_shuffle[0] = 1;
kernel_shuffle[1] = 0;
kernel_shuffle[2] = 2;
kernel_shuffle[3] = 3;
} else {
kernel_shuffle[0] = 0;
kernel_shuffle[1] = 1;
kernel_shuffle[2] = 3;
kernel_shuffle[3] = 2;
}
array<bool, 4> kernel_reverse;
if (isColMajor) {
kernel_reverse[0] = false;
kernel_reverse[1] = false;
kernel_reverse[2] = true;
kernel_reverse[3] = true;
} else {
kernel_reverse[0] = true;
kernel_reverse[1] = true;
kernel_reverse[2] = false;
kernel_reverse[3] = false;
}
return choose(Cond<internal::traits<Input>::Layout == ColMajor>(),
input.reshape(input_dims).contract(output_backward.extract_image_patches(inputRows, inputCols, in_stride, in_stride, 1, 1, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).reshape(pre_contract_dims), contract_dims).reshape(kernel_dims).reverse(kernel_reverse).shuffle(kernel_shuffle),
output_backward.extract_image_patches(inputRows, inputCols, in_stride, in_stride, 1, 1, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).reshape(pre_contract_dims).contract(input.reshape(input_dims), contract_dims).reshape(kernel_dims).reverse(kernel_reverse).shuffle(kernel_shuffle));
}
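// A minimal usage sketch (added comment, not part of the original header),
// col-major layout, VALID padding and unit strides:
//
//   Eigen::Tensor<float, 4> input(3, 32, 32, 1);            // channels, rows, cols, batch
//   Eigen::Tensor<float, 4> output_backward(8, 30, 30, 1);  // filters, rows, cols, batch
//   input.setRandom();
//   output_backward.setRandom();
//   Eigen::Tensor<float, 4> kernel_backward(8, 3, 3, 3);    // filters, channels, R, C
//   kernel_backward = Eigen::SpatialConvolutionBackwardKernel(
//       input, output_backward, /*kernelRows=*/3, /*kernelCols=*/3);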
} // end namespace Eigen
#endif // EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H
#ifndef EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H
#define EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H
#include "Patch3d.h"
namespace Eigen {
/** CuboidConvolution
* \ingroup CXX11_NeuralNetworks_Module
*
* \brief Applies a 3D convolution over a multichannel input voxel block.
*
* The input parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others).
* The kernel parameter is expected to be a 5D tensor (filters, channels, kernel_depth, kernel_height, kernel_width).
* The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be filters, depth, height, width (and others if applicable).
*
* The input and kernel have to be in the same layout, and both row-major and
* col-major are supported. The shapes given above are for col-major layout.
* For row-major, all dimensions should be reversed.
*
* It is possible to swap the order of the depth, width, and height dimensions provided that the same order is used in the input, the kernel, and the output.
*/
template <typename Input, typename Kernel>
EIGEN_ALWAYS_INLINE
static const typename internal::conditional <
internal::traits<Input>::Layout == ColMajor,
TensorReshapingOp<
const DSizes<typename internal::traits<Input>::Index,
internal::traits<Input>::NumDimensions>,
const TensorContractionOp<
const array<IndexPair<typename internal::traits<Input>::Index>, 1>,
const TensorReshapingOp<
const DSizes<typename internal::traits<Input>::Index, 2>,
const Kernel>,
const TensorReshapingOp<
const DSizes<typename internal::traits<Input>::Index, 2>,
const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic,
const Input> > > >,
TensorReshapingOp<
const DSizes<typename internal::traits<Input>::Index,
internal::traits<Input>::NumDimensions>,
const TensorContractionOp<
const array<IndexPair<typename internal::traits<Input>::Index>, 1>,
const TensorReshapingOp<
const DSizes<typename internal::traits<Input>::Index, 2>,
const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic,
const Input> > ,
const TensorReshapingOp<
const DSizes<typename internal::traits<Input>::Index, 2>,
const Kernel> > > >::type
CuboidConvolution(const Input& input, const Kernel& kernel,
const DenseIndex stridePlanes = 1,
const DenseIndex strideRows = 1,
const DenseIndex strideCols = 1,
const PaddingType padding_type = PADDING_SAME) {
typedef typename internal::traits<Input>::Index TensorIndex;
TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
TensorRef<Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel);
EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE);
static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
static const int NumDims = internal::traits<Input>::NumDimensions;
// Number of filters to apply. This is the same as the output depth of the result.
const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[4];
const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[3];
// Spatial size of the kernel.
const TensorIndex kernelDepth = isColMajor ? kern.dimensions()[2] : kern.dimensions()[2];
const TensorIndex kernelRows = isColMajor ? kern.dimensions()[3] : kern.dimensions()[1];
const TensorIndex kernelCols = isColMajor ? kern.dimensions()[4] : kern.dimensions()[0];
if (isColMajor) {
eigen_assert(kernelChannels == in.dimension(0));
} else {
eigen_assert(kernelChannels == in.dimension(NumDims - 1));
}
const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4);
const float stride_planes_f = static_cast<float>(stridePlanes);
const float stride_rows_f = static_cast<float>(strideRows);
const float stride_cols_f = static_cast<float>(strideCols);
TensorIndex out_depth;
TensorIndex out_height;
TensorIndex out_width;
switch (padding_type) {
case PADDING_VALID:
out_depth = ceil((inputPlanes - kernelDepth + 1.f) / stride_planes_f);
out_height = ceil((inputRows - kernelRows + 1.f) / stride_rows_f);
out_width = ceil((inputCols - kernelCols + 1.f) / stride_cols_f);
break;
case PADDING_SAME:
out_depth = ceil(inputPlanes / stride_planes_f);
out_height = ceil(inputRows / stride_rows_f);
out_width = ceil(inputCols / stride_cols_f);
break;
default:
eigen_assert(false && "unexpected padding");
}
DSizes<TensorIndex, 2> kernel_dims;
if (isColMajor) {
kernel_dims[0] = kernelFilters;
kernel_dims[1] = kernelChannels * kernelDepth * kernelRows * kernelCols;
} else {
kernel_dims[0] = kernelChannels * kernelDepth * kernelRows * kernelCols;
kernel_dims[1] = kernelFilters;
}
// Molds the output of the patch extraction result into a 2D tensor:
// - the first dimension (dims[0]): the patch values to be multiplied with the kernels
// - the second dimension (dims[1]): everything else
DSizes<TensorIndex, 2> pre_contract_dims;
if (isColMajor) {
pre_contract_dims[0] = kernelChannels * kernelDepth * kernelRows * kernelCols;
pre_contract_dims[1] = out_depth * out_height * out_width;
for (int i = 4; i < NumDims; ++i) {
pre_contract_dims[1] *= in.dimension(i);
}
} else {
pre_contract_dims[1] = kernelChannels * kernelDepth * kernelRows * kernelCols;
pre_contract_dims[0] = out_depth * out_height * out_width;
for (int i = 0; i < NumDims - 4; ++i) {
pre_contract_dims[0] *= in.dimension(i);
}
}
array<IndexPair<TensorIndex>, 1> contract_dims;
contract_dims[0] = IndexPair<TensorIndex>(1, 0);
// Molds the output of the contraction into the shape expected by the user
// (assuming ColMajor):
// - 1st dim: kernel filters
// - 2nd dim: output depth
  // - 3rd dim: output height
  // - 4th dim: output width
// - 5th dim and beyond: everything else including batch size
DSizes<TensorIndex, NumDims> post_contract_dims;
if (isColMajor) {
post_contract_dims[0] = kernelFilters;
post_contract_dims[1] = out_depth;
post_contract_dims[2] = out_height;
post_contract_dims[3] = out_width;
for (int i = 4; i < NumDims; ++i) {
post_contract_dims[i] = in.dimension(i);
}
} else {
post_contract_dims[NumDims - 1] = kernelFilters;
post_contract_dims[NumDims - 2] = out_depth;
post_contract_dims[NumDims - 3] = out_height;
post_contract_dims[NumDims - 4] = out_width;
for (int i = 0; i < NumDims - 4; ++i) {
post_contract_dims[i] = in.dimension(i);
}
}
return choose(
Cond<internal::traits<Input>::Layout == ColMajor>(),
kernel.reshape(kernel_dims)
.contract(input.extract_volume_patches(
kernelDepth, kernelRows, kernelCols, stridePlanes,
strideRows, strideCols, padding_type)
.reshape(pre_contract_dims),
contract_dims)
.reshape(post_contract_dims),
input.extract_volume_patches(kernelDepth, kernelRows, kernelCols,
stridePlanes, strideRows, strideCols,
padding_type)
.reshape(pre_contract_dims)
.contract(kernel.reshape(kernel_dims), contract_dims)
.reshape(post_contract_dims));
}
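// A minimal usage sketch (added comment, not part of the original header),
// col-major layout: a VALID 3x3x3 convolution mapping 2 input channels to
// 4 filters with unit strides.
//
//   Eigen::Tensor<float, 5> input(2, 5, 5, 5, 1);   // channels, P, R, C, batch
//   Eigen::Tensor<float, 5> kernel(4, 2, 3, 3, 3);  // filters, channels, P, R, C
//   input.setRandom();
//   kernel.setRandom();
//   Eigen::Tensor<float, 5> output(4, 3, 3, 3, 1);
//   output = Eigen::CuboidConvolution(input, kernel, 1, 1, 1,
//                                     Eigen::PADDING_VALID);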
} // end namespace Eigen
#endif // EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H
#ifndef EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H
#define EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H
#if not defined(__CUDACC__)
#include <type_traits>
#endif
namespace Eigen {
namespace internal {
/** Extract3DPatches
 * \ingroup CXX11_NeuralNetworks_Module
*
* \brief Extracts 3D patches from a multichannel input volume.
*
* The input parameter is expected to be a tensor with a rank of 4 or more
* (channels, depth, height, width, optional others in col-major, and the
* reverse order in row-major).
 * The return value will be a tensor with 3 more dimensions than the input tensor.
* In col-major, the first 4 dimensions of the result are: channels, patch_depth,
* patch_height, patch_width. The next dimensions will identify the patch
* position on the 3D grid of extracted patches: z, y, x. The remaining
* dimensions, if any, will be the same as the 'other' dimensions of the input
* tensor.
*/
template <typename Input>
EIGEN_ALWAYS_INLINE static const TensorStridingOp<
const array<typename internal::traits<Input>::Index,
internal::traits<Input>::NumDimensions + 3>,
const TensorReshapingOp<
const DSizes<typename internal::traits<Input>::Index,
internal::traits<Input>::NumDimensions + 3>,
const TensorPatchOp<
const DSizes<typename internal::traits<Input>::Index,
internal::traits<Input>::NumDimensions>,
const TensorPaddingOp<
const array<IndexPair<typename internal::traits<Input>::Index>,
internal::traits<Input>::NumDimensions>,
const Input> > > >
Extract3DPatches(
const Input& input, const DenseIndex patchPlanes,
const DenseIndex patchRows, const DenseIndex patchCols,
const DenseIndex stridePlanes, const DenseIndex strideRows,
const DenseIndex strideCols,
const DenseIndex paddingZTop, const DenseIndex paddingZBottom,
const DenseIndex paddingTop, const DenseIndex paddingBottom,
const DenseIndex paddingLeft, const DenseIndex paddingRight,
const typename internal::traits<Input>::Scalar padding_value = 0) {
typedef typename internal::traits<Input>::Index TensorIndex;
TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
static const int NumDims = internal::traits<Input>::NumDimensions;
static const int ExtDims = NumDims + 3;
// Tensor size after patch extraction. We add three dimensions to unpack the
// linear patch index into a 3D grid over which stride() can work.
DSizes<TensorIndex, ExtDims> pre_stride_dims;
if (isColMajor) {
pre_stride_dims[0] = in.dimension(0);
pre_stride_dims[1] = patchPlanes;
pre_stride_dims[2] = patchRows;
pre_stride_dims[3] = patchCols;
} else {
pre_stride_dims[ExtDims - 1] = in.dimension(NumDims - 1);
pre_stride_dims[ExtDims - 4] = patchCols;
pre_stride_dims[ExtDims - 3] = patchRows;
pre_stride_dims[ExtDims - 2] = patchPlanes;
}
const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4);
array<IndexPair<TensorIndex>, NumDims> paddings;
for (int i = 0; i < NumDims; ++i) {
paddings[i] = IndexPair<TensorIndex>(0, 0);
}
paddings[isColMajor ? 1 : (NumDims - 2)] = IndexPair<TensorIndex>(paddingZTop, paddingZBottom);
paddings[isColMajor ? 2 : (NumDims - 3)] = IndexPair<TensorIndex>(paddingTop, paddingBottom);
paddings[isColMajor ? 3 : (NumDims - 4)] = IndexPair<TensorIndex>(paddingLeft, paddingRight);
pre_stride_dims[isColMajor ? 4 : (ExtDims - 5)] = inputPlanes + paddingZBottom + paddingZTop - patchPlanes + 1;
pre_stride_dims[isColMajor ? 5 : (ExtDims - 6)] = inputRows + paddingTop + paddingBottom - patchRows + 1;
pre_stride_dims[isColMajor ? 6 : (ExtDims - 7)] = inputCols + paddingLeft + paddingRight - patchCols + 1;
if (isColMajor) {
for (int i = 7; i < NumDims + 3; ++i) {
pre_stride_dims[i] = in.dimension(i - 3);
}
} else {
for (int i = 0; i < NumDims - 4; ++i) {
pre_stride_dims[i] = in.dimension(i);
}
}
DSizes<TensorIndex, NumDims> patch_dims;
if (isColMajor) {
patch_dims[0] = in.dimension(0);
patch_dims[1] = patchPlanes;
patch_dims[2] = patchRows;
patch_dims[3] = patchCols;
for (int i = 4; i < NumDims; ++i) {
patch_dims[i] = 1;
}
} else {
patch_dims[NumDims - 1] = in.dimension(NumDims - 1);
patch_dims[NumDims - 4] = patchCols;
patch_dims[NumDims - 3] = patchRows;
patch_dims[NumDims - 2] = patchPlanes;
for (int i = 0; i < NumDims - 4; i++) {
patch_dims[i] = 1;
}
}
array<TensorIndex, NumDims + 3> strides;
if (isColMajor) {
// No striding within the patches.
for (int i = 0; i < 4; ++i) {
strides[i] = 1;
}
// Apply striding in the spatial patch grid dimensions only.
strides[4] = stridePlanes;
strides[5] = strideRows;
strides[6] = strideCols;
// No striding in the remaining dimensions (batches, ...).
for (int i = 7; i < NumDims + 3; i++) {
strides[i] = 1;
}
} else {
// No striding within the patches.
for (int i = 1; i <= 4; ++i) {
strides[ExtDims - i] = 1;
}
// Apply striding in the spatial patch grid dimensions only.
strides[ExtDims - 7] = strideCols;
strides[ExtDims - 6] = strideRows;
strides[ExtDims - 5] = stridePlanes;
// No striding in the remaining dimensions (batches, ...).
for (int i = 0; i < NumDims - 4; i++) {
strides[i] = 1;
}
}
  // TODO(mjanusz): Consider getting rid of pad() and stride(), and extend
  // extract_patches to take additional parameters for padding/striding,
  // similarly to extract_image_patches.
return input.pad(paddings, padding_value).extract_patches(patch_dims).reshape(pre_stride_dims).stride(strides);
}
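// Illustrative usage sketch for the explicit-padding overload above (the
// tensor and its sizes are hypothetical, chosen only for the example):
// extract 3x3x3 patches with unit strides and one element of padding on
// every side from a col-major rank-4 input of shape (channels, planes,
// rows, cols).
//
//   Eigen::Tensor<float, 4> volume(8, 16, 32, 32);    // (C, P, R, Q)
//   volume.setRandom();
//   Eigen::Tensor<float, 7> patches =
//       Eigen::internal::Extract3DPatches(volume, 3, 3, 3,    // patch size
//                                         1, 1, 1,            // strides
//                                         1, 1, 1, 1, 1, 1);  // per-side padding
//   // patches has shape (8, 3, 3, 3, 16, 32, 32): channels, the patch
//   // dimensions, then the z/y/x patch-grid positions, each equal to
//   // padded_input_size - patch_size + 1.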
template <typename Input>
EIGEN_ALWAYS_INLINE static const TensorStridingOp<
const array<typename internal::traits<Input>::Index,
internal::traits<Input>::NumDimensions + 3>,
const TensorReshapingOp<
const DSizes<typename internal::traits<Input>::Index,
internal::traits<Input>::NumDimensions + 3>,
const TensorPatchOp<
const DSizes<typename internal::traits<Input>::Index,
internal::traits<Input>::NumDimensions>,
const TensorPaddingOp<
const array<IndexPair<typename internal::traits<Input>::Index>,
internal::traits<Input>::NumDimensions>,
const Input> > > >
Extract3DPatches(
const Input& input, const DenseIndex patchPlanes,
const DenseIndex patchRows, const DenseIndex patchCols,
const DenseIndex stridePlanes, const DenseIndex strideRows,
const DenseIndex strideCols, const PaddingType padding_type,
const typename internal::traits<Input>::Scalar padding_value = 0) {
typedef typename internal::traits<Input>::Index TensorIndex;
TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
static const int NumDims = internal::traits<Input>::NumDimensions;
const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4);
switch (padding_type) {
case PADDING_VALID:
// No padding in any dimension.
return Extract3DPatches(input, patchPlanes, patchRows, patchCols,
stridePlanes, strideRows, strideCols,
0, 0, 0, 0, 0, 0, padding_value);
case PADDING_SAME: {
      // The size of the tensor before striding should be just the expected
      // output times the stride.
const TensorIndex size_z = ceil(inputPlanes / static_cast<float>(stridePlanes)) * stridePlanes;
const TensorIndex size_y = ceil(inputRows / static_cast<float>(strideRows)) * strideRows;
const TensorIndex size_x = ceil(inputCols / static_cast<float>(strideCols)) * strideCols;
// The size of the patch space is going to be: padded_input_size - patch_size + 1.
// This has to match the expected size before striding (pre_stride_dims).
// The deltas below extend the input to the expected size.
const TensorIndex dz = size_z + patchPlanes - 1 - inputPlanes;
const TensorIndex dy = size_y + patchRows - 1 - inputRows;
const TensorIndex dx = size_x + patchCols - 1 - inputCols;
return Extract3DPatches(input, patchPlanes, patchRows, patchCols,
stridePlanes, strideRows, strideCols,
dz - dz / 2, dz / 2,
dy - dy / 2, dy / 2,
dx - dx / 2, dx / 2,
padding_value);
}
default:
eigen_assert(false && "unexpected padding");
// unreachable code to avoid missing return warning.
return Extract3DPatches(input, patchPlanes, patchRows, patchCols,
stridePlanes, strideRows, strideCols,
0, 0, 0, 0, 0, 0, padding_value);
}
}
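// Worked example of the PADDING_SAME arithmetic above (illustrative only):
// with inputRows = 5, patchRows = 3 and strideRows = 2,
//   size_y = ceil(5 / 2) * 2 = 6,   dy = 6 + 3 - 1 - 5 = 3,
// so paddingTop = dy - dy/2 = 2 and paddingBottom = dy/2 = 1. The padded
// height is 5 + 3 = 8, the patch grid before striding has 8 - 3 + 1 = 6 rows,
// and after striding by 2 it has 3 rows, i.e. ceil(inputRows / strideRows),
// as expected for SAME padding.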
// TODO(mjanusz): Switch this to a 'using' alias once CUDA supports C++11.
template <typename Input>
struct Extract3DPatchesType {
typedef const TensorStridingOp< const array<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions + 3>,
const TensorReshapingOp< const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions + 3>,
const TensorPatchOp< const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>,
const TensorPaddingOp< const array< IndexPair<typename internal::traits<Input>::Index>, internal::traits<Input>::NumDimensions>,
const Input> > > > type;
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H
#define EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H
#include "Patch3d.h"
namespace Eigen {
/** SpatialMaxPooling
* \ingroup CXX11_NeuralNetworks_Module
*
* \brief Applies a max-pooling over a multichannel input image.
*
 * The input parameter is expected to be a tensor with a rank of 4 (channels, height, width, others in col-major, and the reverse of that in row-major).
*
* The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, height, width, and others (in col-major, and the reverse of that if the input was row-major).
*
* The order of the width and height dimensions can be swapped if needed.
*
*/
#if !defined(EIGEN_HAS_INDEX_LIST)
template <typename Input>
EIGEN_ALWAYS_INLINE
static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::MaxReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, const Eigen::array<int, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > >
#else
template <typename Input>
EIGEN_ALWAYS_INLINE
static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::MaxReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > >
#endif
SpatialMaxPooling(const Input& input, DenseIndex patchRows, DenseIndex patchCols,
DenseIndex strideRows, DenseIndex strideCols, const PaddingType padding_type,
DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1)
{
EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
typedef typename internal::traits<Input>::Index TensorIndex;
TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
const DenseIndex patchRowsEff = patchRows + (patchRows - 1) * (in_strideRows - 1);
const DenseIndex patchColsEff = patchCols + (patchCols - 1) * (in_strideCols - 1);
static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
static const int idxRows = isColMajor ? 1 : 2;
static const int idxCols = isColMajor ? 2 : 1;
// Molds the output of the reduction into the shape expected by the user.
// (assuming col-major):
// - 1st dim: channels
// - 2nd dim: output height
// - 3rd dim: output width
// - 4th dim and beyond: everything else including batch size
Eigen::DSizes<TensorIndex, internal::traits<Input>::NumDimensions> post_reduce_dims;
post_reduce_dims[0] = in.dimension(0);
if (padding_type == PADDING_VALID) {
post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRowsEff + 1.f) / static_cast<float>(strideRows));
post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchColsEff + 1.f) / static_cast<float>(strideCols));
} else {
post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows));
post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols));
}
post_reduce_dims[3] = in.dimension(3);
#if !defined(EIGEN_HAS_INDEX_LIST)
// nvcc doesn't support cxx11
Eigen::array<int, 2> reduction_dims;
if (isColMajor) {
reduction_dims[0] = 1;
reduction_dims[1] = 2;
} else {
reduction_dims[0] = 2;
reduction_dims[1] = 3;
}
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type reduction_dims;
#endif
return input.extract_image_patches(patchRows, patchCols, strideRows, strideCols, in_strideRows, in_strideCols, padding_type, -Eigen::NumTraits<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>::highest()).maximum(reduction_dims).reshape(post_reduce_dims);
}
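// Usage sketch (a minimal example; the tensor shapes are hypothetical):
// 2x2 max pooling with stride 2 and VALID padding on a col-major input of
// shape (channels, height, width, batch).
//
//   Eigen::Tensor<float, 4> images(16, 32, 32, 8);
//   images.setRandom();
//   Eigen::Tensor<float, 4> pooled =
//       Eigen::SpatialMaxPooling(images, /*patchRows=*/2, /*patchCols=*/2,
//                                /*strideRows=*/2, /*strideCols=*/2,
//                                Eigen::PADDING_VALID);
//   // pooled has shape (16, 16, 16, 8): ceil((32 - 2 + 1) / 2) = 16.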
/** CuboidMaxPooling
* \ingroup CXX11_NeuralNetworks_Module
*
* \brief Applies a max-pooling over a multichannel input volume.
*
* The input parameter is expected to be a tensor with a rank of 5 (channels, depth, height, width, others in col-major, and the reverse of that in row-major).
*
* The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, depth, height, width, and others (in col-major, and the reverse of that if the input was row-major).
*
* The order of the depth, width and height dimensions can be swapped if needed.
*
*/
#if !defined(EIGEN_HAS_INDEX_LIST)
template <typename Input>
EIGEN_ALWAYS_INLINE static const TensorReshapingOp<
const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>,
const TensorReductionOp<
internal::MaxReducer<float>, const Eigen::array<int, 1>,
const TensorReshapingOp<
const Eigen::DSizes<DenseIndex, 3>,
const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > >
#else
template <typename Input>
EIGEN_ALWAYS_INLINE static const TensorReshapingOp<
const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>,
const TensorReductionOp<
internal::MaxReducer<float>,
const Eigen::IndexList<Eigen::type2index<1> >,
const TensorReshapingOp<
const Eigen::DSizes<DenseIndex, 3>,
const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > >
#endif
CuboidMaxPooling(const Input& input, DenseIndex patchPlanes,
DenseIndex patchRows, DenseIndex patchCols,
DenseIndex stridePlanes, DenseIndex strideRows,
DenseIndex strideCols, const PaddingType padding_type) {
EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 5, YOU_MADE_A_PROGRAMMING_MISTAKE);
static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
typedef typename internal::traits<Input>::Index TensorIndex;
TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
static const int idxPlanes = isColMajor ? 1 : 3;
static const int idxRows = 2;
static const int idxCols = isColMajor ? 3 : 1;
  // Molds the output of the reduction into the shape expected by the user
// (assuming col-major):
// - 1st dim: channels
// - 2nd dim: output depth
// - 3rd dim: output height
// - 4th dim: output width
// - 5th dim and beyond: everything else including batch size
Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions> post_reduce_dims;
post_reduce_dims[0] = in.dimension(0);
if (padding_type == PADDING_VALID) {
post_reduce_dims[idxPlanes] = numext::ceil((in.dimension(idxPlanes) - patchPlanes + 1.f) / static_cast<float>(stridePlanes));
post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRows + 1.f) / static_cast<float>(strideRows));
post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchCols + 1.f) / static_cast<float>(strideCols));
} else {
post_reduce_dims[idxPlanes] = numext::ceil(in.dimension(idxPlanes) / static_cast<float>(stridePlanes));
post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows));
post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols));
}
post_reduce_dims[4] = in.dimension(4);
Eigen::DSizes<DenseIndex, 3> pre_reduce_dims;
pre_reduce_dims[1] = patchRows * patchCols * patchPlanes;
if (isColMajor) {
pre_reduce_dims[0] = post_reduce_dims[0];
pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3] * post_reduce_dims[4];
} else {
pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3];
pre_reduce_dims[2] = post_reduce_dims[4];
}
#if !defined(EIGEN_HAS_INDEX_LIST)
// nvcc doesn't support cxx11
Eigen::array<int, 1> reduction_dims;
reduction_dims[0] = 1;
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
Eigen::IndexList<Eigen::type2index<1> > reduction_dims;
#endif
return input.extract_volume_patches(patchPlanes, patchRows, patchCols,
stridePlanes, strideRows, strideCols,
padding_type, -Eigen::NumTraits<float>::highest())
.reshape(pre_reduce_dims)
.maximum(reduction_dims)
.reshape(post_reduce_dims);
}
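// Usage sketch (a minimal example; the tensor shapes are hypothetical):
// 2x2x2 max pooling with stride 2 and VALID padding on a col-major input of
// shape (channels, depth, height, width, batch).
//
//   Eigen::Tensor<float, 5> volumes(8, 16, 32, 32, 4);
//   volumes.setRandom();
//   Eigen::Tensor<float, 5> pooled =
//       Eigen::CuboidMaxPooling(volumes, /*patchPlanes=*/2, /*patchRows=*/2,
//                               /*patchCols=*/2, /*stridePlanes=*/2,
//                               /*strideRows=*/2, /*strideCols=*/2,
//                               Eigen::PADDING_VALID);
//   // pooled has shape (8, 8, 16, 16, 4).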
/** SpatialAvgPooling
* \ingroup CXX11_NeuralNetworks_Module
*
* \brief Applies an average pooling over a multichannel input image.
*
* The input parameter is expected to be a tensor with a rank of 4 (channels, height, width, others in col-major, and the reverse of that in row-major).
*
* The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, height, width, and others (in col-major, and the reverse of that if the input was row-major).
*
* The order of the width and height dimensions can be swapped if needed.
*
*/
namespace internal {
template <typename T> struct AvgPoolMeanReducer
{
#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__)
// We only support packet access for floats.
static const bool PacketAccess = internal::is_same<T, float>::value;
#else
static const bool PacketAccess = false;
#endif
static const bool IsStateful = true;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE AvgPoolMeanReducer() : scalarCount_(0) {
typedef typename packet_traits<T>::type Packet;
packetCount_ = pset1<Packet>(0.0);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
if (t != -Eigen::NumTraits<T>::highest()) {
(*accum) = (*accum) + t;
scalarCount_++;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
return static_cast<T>(0);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
eigen_assert(scalarCount_ > 0);
return accum / scalarCount_;
}
#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__)
#ifdef EIGEN_VECTORIZE_AVX
#define pequal(a,b) _mm256_cmp_ps(a,b,_CMP_EQ_UQ)
#define psel(a,b,false_mask) _mm256_blendv_ps(a,b,false_mask)
#else
#define pequal(a,b) _mm_cmpeq_ps(a,b)
#define psel(a,b,false_mask) _mm_or_ps(_mm_andnot_ps(false_mask, a), _mm_and_ps(false_mask, b))
#endif
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) {
reducePacketWithType(static_cast<T>(0), p, accum);
}
template <typename Packet>
void reducePacketWithType(T, const Packet& p, Packet* accum) {
Packet skip_mask = pequal(p, pset1<Packet>(-Eigen::NumTraits<T>::highest()));
(*accum) = padd<Packet>(*accum, psel(p, pset1<Packet>(0), skip_mask));
packetCount_ = padd<Packet>(packetCount_, psel(pset1<Packet>(1), pset1<Packet>(0), skip_mask));
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
return pset1<Packet>(0);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
return pdiv(vaccum, packetCount_);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
return (saccum + predux(vaccum)) / (scalarCount_ + predux(packetCount_));
}
#endif
protected:
typedef typename packet_traits<T>::type Packet;
int scalarCount_;
Packet packetCount_;
};
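// Behaviour sketch of AvgPoolMeanReducer (illustrative): values equal to
// -NumTraits<T>::highest() mark padded patch entries and are excluded from
// both the running sum and the element count. For example, reducing the
// scalars {1.0f, 3.0f, -highest} yields (1 + 3) / 2 = 2, whereas a plain mean
// over three entries would not. This is why the pooling functions below pass
// -highest as the padding value to extract_image_patches() /
// extract_volume_patches().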
} // namespace internal
#if !defined(EIGEN_HAS_INDEX_LIST)
template <typename Input>
EIGEN_ALWAYS_INLINE
static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::AvgPoolMeanReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, const Eigen::array<int, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > >
#else
template <typename Input>
EIGEN_ALWAYS_INLINE
static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::AvgPoolMeanReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > >
#endif
SpatialAvgPooling(const Input& input, DenseIndex patchRows, DenseIndex patchCols,
DenseIndex strideRows, DenseIndex strideCols, const PaddingType padding_type,
DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1)
{
EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
typedef typename internal::traits<Input>::Index TensorIndex;
TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
const DenseIndex patchRowsEff = patchRows + (patchRows - 1) * (in_strideRows - 1);
const DenseIndex patchColsEff = patchCols + (patchCols - 1) * (in_strideCols - 1);
static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
static const int idxRows = isColMajor ? 1 : 2;
static const int idxCols = isColMajor ? 2 : 1;
// Molds the output of the reduction into the shape expected by the user.
// (assuming col-major):
// - 1st dim: channels
// - 2nd dim: output height
// - 3rd dim: output width
// - 4th dim and beyond: everything else including batch size
Eigen::DSizes<TensorIndex, internal::traits<Input>::NumDimensions> post_reduce_dims;
post_reduce_dims[0] = in.dimension(0);
if (padding_type == PADDING_VALID) {
post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRowsEff + 1.f) / static_cast<float>(strideRows));
post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchColsEff + 1.f) / static_cast<float>(strideCols));
} else {
post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows));
post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols));
}
post_reduce_dims[3] = in.dimension(3);
typedef typename internal::remove_const<typename internal::traits<Input>::Scalar>::type CoeffReturnType;
internal::AvgPoolMeanReducer<CoeffReturnType> mean_with_nan;
#if !defined(EIGEN_HAS_INDEX_LIST)
// nvcc doesn't support cxx11
Eigen::array<int, 2> reduction_dims;
if (isColMajor) {
reduction_dims[0] = 1;
reduction_dims[1] = 2;
} else {
reduction_dims[0] = 2;
reduction_dims[1] = 3;
}
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type reduction_dims;
#endif
return input.extract_image_patches(patchRows, patchCols, strideRows, strideCols, in_strideRows, in_strideCols, padding_type, -Eigen::NumTraits<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>::highest()).reduce(reduction_dims, mean_with_nan).reshape(post_reduce_dims);
}
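// Usage sketch (a minimal example; the tensor shapes are hypothetical):
// 3x3 average pooling with unit strides and SAME padding on a col-major
// input of shape (channels, height, width, batch). Padded entries do not
// contribute to the averages thanks to AvgPoolMeanReducer above.
//
//   Eigen::Tensor<float, 4> images(16, 32, 32, 8);
//   images.setRandom();
//   Eigen::Tensor<float, 4> pooled =
//       Eigen::SpatialAvgPooling(images, /*patchRows=*/3, /*patchCols=*/3,
//                                /*strideRows=*/1, /*strideCols=*/1,
//                                Eigen::PADDING_SAME);
//   // With SAME padding and unit strides, pooled keeps the input shape:
//   // (16, 32, 32, 8).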
/** CuboidAvgPooling
* \ingroup CXX11_NeuralNetworks_Module
*
* \brief Applies an average pooling over a multichannel input volume.
*
 * The input parameter is expected to be a tensor with a rank of 5 (channels, depth, height, width, others in col-major, and the reverse of that in row-major).
 *
 * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, depth, height, width, and others (in col-major, and the reverse of that if the input was row-major).
*
* The order of the depth, width and height dimensions can be swapped if needed.
*
*/
#if !defined(EIGEN_HAS_INDEX_LIST)
template <typename Input>
EIGEN_ALWAYS_INLINE static const TensorReshapingOp<
const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>,
const TensorReductionOp<
internal::AvgPoolMeanReducer<float>, const Eigen::array<int, 1>,
const TensorReshapingOp<
const Eigen::DSizes<DenseIndex, 3>,
const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > >
#else
template <typename Input>
EIGEN_ALWAYS_INLINE static const TensorReshapingOp<
const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>,
const TensorReductionOp<
internal::AvgPoolMeanReducer<float>,
const Eigen::IndexList<Eigen::type2index<1> >,
const TensorReshapingOp<
const Eigen::DSizes<DenseIndex, 3>,
const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > >
#endif
CuboidAvgPooling(const Input& input, DenseIndex patchPlanes,
DenseIndex patchRows, DenseIndex patchCols,
DenseIndex stridePlanes, DenseIndex strideRows,
DenseIndex strideCols, const PaddingType padding_type) {
EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 5, YOU_MADE_A_PROGRAMMING_MISTAKE);
static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
typedef typename internal::traits<Input>::Index TensorIndex;
TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
static const int idxPlanes = isColMajor ? 1 : 3;
static const int idxRows = 2;
static const int idxCols = isColMajor ? 3 : 1;
  // Molds the output of the reduction into the shape expected by the user
  // (assuming col-major):
  // - 1st dim: channels
  // - 2nd dim: output depth
// - 3rd dim: output height
// - 4th dim: output width
// - 5th dim and beyond: everything else including batch size
Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions> post_reduce_dims;
post_reduce_dims[0] = in.dimension(0);
if (padding_type == PADDING_VALID) {
post_reduce_dims[idxPlanes] = numext::ceil((in.dimension(idxPlanes) - patchPlanes + 1.f) / static_cast<float>(stridePlanes));
post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRows + 1.f) / static_cast<float>(strideRows));
post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchCols + 1.f) / static_cast<float>(strideCols));
} else {
post_reduce_dims[idxPlanes] = numext::ceil(in.dimension(idxPlanes) / static_cast<float>(stridePlanes));
post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows));
post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols));
}
post_reduce_dims[4] = in.dimension(4);
Eigen::DSizes<DenseIndex, 3> pre_reduce_dims;
pre_reduce_dims[1] = patchRows * patchCols * patchPlanes;
if (isColMajor) {
pre_reduce_dims[0] = post_reduce_dims[0];
pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3] * post_reduce_dims[4];
} else {
pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3];
pre_reduce_dims[2] = post_reduce_dims[4];
}
typedef typename internal::remove_const<typename internal::traits<Input>::Scalar>::type CoeffReturnType;
internal::AvgPoolMeanReducer<CoeffReturnType> mean_with_nan;
#if !defined(EIGEN_HAS_INDEX_LIST)
// nvcc doesn't support cxx11
Eigen::array<int, 1> reduction_dims;
reduction_dims[0] = 1;
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
Eigen::IndexList<Eigen::type2index<1> > reduction_dims;
#endif
return input.extract_volume_patches(patchPlanes, patchRows, patchCols,
stridePlanes, strideRows, strideCols,
padding_type, -Eigen::NumTraits<float>::highest())
.reshape(pre_reduce_dims)
.reduce(reduction_dims, mean_with_nan)
.reshape(post_reduce_dims);
}
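// Usage sketch (a minimal example; the tensor shapes are hypothetical):
//
//   Eigen::Tensor<float, 5> volumes(8, 16, 32, 32, 4);
//   volumes.setRandom();
//   Eigen::Tensor<float, 5> pooled =
//       Eigen::CuboidAvgPooling(volumes, /*patchPlanes=*/3, /*patchRows=*/3,
//                               /*patchCols=*/3, /*stridePlanes=*/1,
//                               /*strideRows=*/1, /*strideCols=*/1,
//                               Eigen::PADDING_SAME);
//   // With SAME padding and unit strides the output keeps the input shape:
//   // (8, 16, 32, 32, 4).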
} // end namespace Eigen
#endif // EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H
#define EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H
namespace Eigen {
/** SoftMax
* \ingroup CXX11_NeuralNetworks_Module
*
* \brief Applies a softmax
*
* The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other).
*
* The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order.
*
*/
namespace {
class SoftmaxOp {
public:
EIGEN_ALWAYS_INLINE SoftmaxOp(const float beta) : beta_(beta) { }
template <typename Input> EIGEN_ALWAYS_INLINE
typename Input::Dimensions dimensions(const Input& input) const {
return input.dimensions();
}
template <typename Input, typename Output, typename Device>
void eval(const Input& input, Output& output, const Device& device) const
{
#if !defined(EIGEN_HAS_INDEX_LIST)
// nvcc doesn't support cxx11
Eigen::array<typename internal::traits<Input>::Index, 1> depth_dim;
depth_dim[0] = 0;
Eigen::array<typename internal::traits<Input>::Index, 2> bcast;
bcast[0] = dimensions(input)[0];
bcast[1] = 1;
DSizes<typename internal::traits<Input>::Index, 2> dims2d;
dims2d[0] = 1;
dims2d[1] = dimensions(input)[1];
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
Eigen::IndexList<Eigen::type2index<0>> depth_dim;
Eigen::IndexList<int, Eigen::type2index<1>> bcast;
bcast.set(0, dimensions(input)[0]);
Eigen::IndexList<Eigen::type2index<1>, typename internal::traits<Input>::Index> dims2d;
dims2d.set(1, dimensions(input)[1]);
#endif
output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp();
output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
}
private:
const float beta_;
};
}
template <typename Input>
EIGEN_ALWAYS_INLINE
static const TensorCustomUnaryOp<const SoftmaxOp, const Input>
SoftMax(const Input& input, const float beta)
{
EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
const SoftmaxOp op(beta);
return input.customOp(op);
}
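// Usage sketch (a minimal example; the tensor shapes are hypothetical):
// softmax over the depth dimension of a col-major (depth, other) tensor.
//
//   Eigen::Tensor<float, 2> logits(10, 32);   // 10 classes, 32 samples
//   logits.setRandom();
//   Eigen::Tensor<float, 2> probs = Eigen::SoftMax(logits, /*beta=*/1.0f);
//   // Each column of probs is non-negative and sums to 1; a larger beta
//   // sharpens the distribution, a smaller beta flattens it.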
} // end namespace Eigen
#endif // EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H
#define EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H
namespace Eigen {
namespace internal {
// These optimizations require vector instructions
#ifdef EIGEN_VECTORIZE
// TODO: Consolidate this part of the code with the image patch extraction code
// since they are both very similar.
template <typename NewDimension, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device,
typename Scalar_, typename Index,
typename nocontract_t, typename contract_t,
int Side, size_t packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper<Scalar_, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
{
public:
typedef TensorContractionInputMapper<Scalar_, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
typedef TensorContractionSubMapper<Scalar_, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
typedef SubMapper VectorMapper;
typedef SubMapper LinearMapper;
typedef Scalar_ Scalar;
typedef typename packet_traits<Scalar>::type Packet;
TensorContractionInputMapper(const TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>& tensor,
const nocontract_t&, const nocontract_t&,
const contract_t&, const contract_t&)
: m_impl(tensor.impl().impl())
{
Index patch_rows;
Index patch_depth;
if (internal::traits<ArgType>::Layout == ColMajor) {
patch_depth = tensor.impl().dimensions()[0];
patch_rows = tensor.impl().dimensions()[1];
m_patch_cols = tensor.impl().dimensions()[2];
m_num_patches = tensor.impl().dimensions()[3];
} else {
static const int NumDims = tensor.impl().dimensions().size();
patch_depth = tensor.impl().dimensions()[NumDims - 1];
patch_rows = tensor.impl().dimensions()[NumDims - 2];
m_patch_cols = tensor.impl().dimensions()[NumDims - 3];
m_num_patches = tensor.impl().dimensions()[NumDims - 4];
}
m_patch_row_inflate_strides = tensor.impl().rowInflateStride();
m_patch_col_inflate_strides = tensor.impl().colInflateStride();
m_colStride = patch_rows;
m_outputRows = tensor.impl().outputRows();
m_row_strides = tensor.impl().userRowStride();
m_col_strides = tensor.impl().userColStride();
m_in_row_strides = tensor.impl().userInRowStride();
m_in_col_strides = tensor.impl().userInColStride();
if (internal::traits<ArgType>::Layout == ColMajor) {
m_inputRows = tensor.impl().impl().dimensions()[1];
m_inputCols = tensor.impl().impl().dimensions()[2];
} else {
static const int NumDims = tensor.impl().impl().dimensions().size();
m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2];
m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3];
}
m_rowInputStride = patch_depth;
m_colInputStride = patch_depth * m_inputRows;
m_patchInputStride = patch_depth * m_inputRows * m_inputCols;
m_rowPaddingTop = tensor.impl().rowPaddingTop();
m_colPaddingLeft = tensor.impl().colPaddingLeft();
m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_patch_row_inflate_strides);
m_fastInputColStride = internal::TensorIntDivisor<Index>(m_patch_col_inflate_strides);
m_fastNumPatches = internal::TensorIntDivisor<Index>(m_num_patches);
m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
m_fastDimZero = internal::TensorIntDivisor<Index>(patch_depth);
}
TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper) :
m_impl(base_mapper.m_impl) {
m_patch_cols = base_mapper.m_patch_cols;
m_num_patches = base_mapper.m_num_patches;
m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides;
m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides;
m_colStride = base_mapper.m_colStride;
m_rowInputStride = base_mapper.m_rowInputStride;
m_colInputStride = base_mapper.m_colInputStride;
m_patchInputStride = base_mapper.m_patchInputStride;
m_inputRows = base_mapper.m_inputRows;
m_inputCols = base_mapper.m_inputCols;
m_outputRows = base_mapper.m_outputRows;
m_row_strides = base_mapper.m_row_strides;
m_col_strides = base_mapper.m_col_strides;
m_in_row_strides = base_mapper.m_in_row_strides;
m_in_col_strides = base_mapper.m_in_col_strides;
m_rowPaddingTop = base_mapper.m_rowPaddingTop;
m_colPaddingLeft = base_mapper.m_colPaddingLeft;
m_fastInputRowStride = base_mapper.m_fastInputRowStride;
m_fastInputColStride = base_mapper.m_fastInputColStride;
m_fastNumPatches = base_mapper.m_fastNumPatches;
m_fastColStride = base_mapper.m_fastColStride;
m_fastOutputRows = base_mapper.m_fastOutputRows;
m_fastDimZero = base_mapper.m_fastDimZero;
}
  // If true, turns off some optimizations for loading packets since the image
  // patches are "non-standard", e.g. there are non-trivial strides or
  // inflations in the input.
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
return m_in_row_strides != 1 || m_in_col_strides != 1 || m_patch_row_inflate_strides != 1 || m_patch_col_inflate_strides != 1;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
return SubMapper(*this, i, j);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
return LinearMapper(*this, i, j);
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const {
Index rowIndex, colIndex, otherIndex;
computeBaseIndices(0, rowIndex, colIndex, otherIndex);
return loadCoeff(row, rowIndex, colIndex, otherIndex);
}
// Load the coefficient at the patchIndex location instead of the usual m_rowIndex,
  // m_colIndex, m_otherIndex. This is currently only used by the gpu code.
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const {
Index rowIndex, colIndex, otherIndex;
computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
return loadCoeff(row, rowIndex, colIndex, otherIndex);
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const {
Index rowIndex, colIndex, otherIndex;
computeBaseIndices(0, rowIndex, colIndex, otherIndex);
return loadPacket(row, rowIndex, colIndex, otherIndex);
}
// Load the packet at the patchIndex location instead of the usual m_rowIndex,
// m_colIndex, m_otherIndex. This is currently only used by the gpu code.
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const {
Index rowIndex, colIndex, otherIndex;
computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
return loadPacket(row, rowIndex, colIndex, otherIndex);
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_rowInputStride; }
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Index patchRows() const { return m_colStride; }
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; }
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, const Index baseIndex) const {
const Index inputIndex = depth + baseIndex;
return m_impl.template packet<Unaligned>(inputIndex);
}
private:
friend class TensorContractionSubMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
// Find the offset of the element wrt the location of the first element.
const Index patchOffset = patchId / m_fastDimZero;
const Index colOffset = patchOffset / m_fastColStride;
const Index inputCol = colIndex + colOffset * m_in_col_strides;
const Index origInputCol = (m_patch_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
const Index rowOffset = patchOffset - colOffset * m_colStride;
const Index inputRow = rowIndex + rowOffset * m_in_row_strides;
const Index origInputRow = (m_patch_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
if (origInputCol < 0 | origInputRow < 0 | origInputCol >= m_inputCols | origInputRow >= m_inputRows |
(inputCol != origInputCol * m_patch_col_inflate_strides) | (inputRow != origInputRow * m_patch_row_inflate_strides)) {
return Scalar(0);
}
const Index depth = patchId - patchOffset * patchDepth();
const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex;
return m_impl.coeff(inputIndex);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
eigen_assert(!nonStandardPatches());
// Find the offset of the element wrt the location of the first element.
const Index patchOffset = patchId / m_fastDimZero;
const Index colOffset = patchOffset / m_fastColStride;
const Index inputCol = colIndex + colOffset;
const Index rowOffset = patchOffset - colOffset * m_colStride;
const Index inputRow = rowIndex + rowOffset;
if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 || inputRow >= m_inputRows) {
return Scalar(0);
}
const Index depth = patchId - patchOffset * patchDepth();
const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
return m_impl.coeff(inputIndex);
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
const Index packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(patchId < patchDepth()*patchRows()*m_patch_cols);
if (nonStandardPatches()) {
return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
}
return loadPacketStandard(patchId, rowIndex, colIndex, otherIndex);
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
const Index packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(patchId < patchDepth()*patchRows()*m_patch_cols);
eigen_assert(!nonStandardPatches());
if ((patchDepth() % packetSize) == 0) {
return loadPacketFast(patchId, rowIndex, colIndex, otherIndex);
}
else {
const Index patchOffsets[2] = {patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero};
const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride};
const Index inputCols[2] = {colIndex + colOffsets[0], colIndex + colOffsets[1]};
if (inputCols[0] >= m_inputCols | inputCols[1] < 0) {
// all zeros
return internal::pset1<Packet>(Scalar(0));
}
if (inputCols[0] == inputCols[1]) {
const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride};
eigen_assert(rowOffsets[0] <= rowOffsets[1]);
const Index inputRows[2] = {rowIndex + rowOffsets[0], rowIndex + rowOffsets[1]};
if (inputRows[0] >= m_inputRows | inputRows[1] < 0) {
// all zeros
return internal::pset1<Packet>(Scalar(0));
}
if (inputRows[0] >= 0 & inputRows[1] < m_inputRows) {
// no padding
const Index depth = patchId - patchOffsets[0] * patchDepth();
const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex;
return m_impl.template packet<Unaligned>(inputIndex);
}
}
}
return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
const Index packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(patchId < patchDepth()*patchRows()*m_patch_cols);
eigen_assert(!nonStandardPatches());
eigen_assert((patchDepth() % packetSize) == 0);
// Find the offset of the element wrt the location of the first element.
const Index patchOffset = patchId / m_fastDimZero;
eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset);
const Index colOffset = patchOffset / m_fastColStride;
const Index inputCol = colIndex + colOffset;
const Index rowOffset = patchOffset - colOffset*m_colStride;
const Index inputRow = rowIndex + rowOffset;
if (inputCol < 0 | inputRow < 0 | inputCol >= m_inputCols | inputRow >= m_inputRows) {
// all zeros
return internal::pset1<Packet>(Scalar(0));
}
// no padding
const Index depth = patchId - patchOffset * patchDepth();
const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
return m_impl.template packet<Unaligned>(inputIndex);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetWithPossibleZero(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const
{
const int packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX typename internal::remove_const<Scalar>::type values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = loadCoeff(patchId+i, rowIndex, colIndex, otherIndex);
}
Packet rslt = internal::pload<Packet>(values);
return rslt;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void computeBaseIndices(Index patchIndex, Index& rowIndex, Index& colIndex, Index& otherIndex) const {
const int NumInputDims = array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches;
const Index patch2DIndex = (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches);
otherIndex *= m_patchInputStride;
colIndex = patch2DIndex / m_fastOutputRows;
rowIndex = patch2DIndex - colIndex * m_outputRows;
colIndex = colIndex * m_col_strides - m_colPaddingLeft;
rowIndex = rowIndex * m_row_strides - m_rowPaddingTop;
}
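  // Worked example for computeBaseIndices() (illustrative only): assume an
  // image patch op with m_num_patches = 12 patches per image laid out on a
  // 4x3 grid (m_outputRows = 4), unit user strides and one pixel of padding.
  // For patchIndex = 17 on a rank-4 input:
  //   otherIndex   = 17 / 12 = 1 (second image), then scaled by m_patchInputStride
  //   patch2DIndex = 17 - 1 * 12 = 5
  //   colIndex     = 5 / 4 = 1,  rowIndex = 5 - 1 * 4 = 1
  //   after stride/padding: colIndex = 1 * 1 - 1 = 0, rowIndex = 1 * 1 - 1 = 0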
  Index m_patch_cols; // number of columns in the patch
Index m_num_patches; // number of patches to extract.
Index m_patch_row_inflate_strides; // the strides for row inflation in the image patch
Index m_patch_col_inflate_strides; // the strides for col inflation in the image patch
// Fast representation of inflation strides.
internal::TensorIntDivisor<Index> m_fastInputRowStride;
internal::TensorIntDivisor<Index> m_fastInputColStride;
Index m_otherStride;
Index m_colStride;
internal::TensorIntDivisor<Index> m_fastNumPatches;
internal::TensorIntDivisor<Index> m_fastColStride;
Index m_rowInputStride; // row stride in the input tensor
Index m_colInputStride; // col stride in the input tensor
Index m_patchInputStride; // patch stride in the input tensor
Index m_inputRows; // Number of rows in the input tensor
Index m_inputCols; // Number of cols in the input tensor
Index m_outputRows; // Number of patch rows
Index m_row_strides; // User specified row stride
Index m_col_strides; // User specified col stride
Index m_in_row_strides; // User specified input row stride
Index m_in_col_strides; // User specified input col stride
Index m_rowPaddingTop; // Row padding
Index m_colPaddingLeft; // Column padding
internal::TensorIntDivisor<Index> m_fastOutputRows;
internal::TensorIntDivisor<Index> m_fastDimZero;
const TensorEvaluator<ArgType, Device> m_impl;
};
template <typename NewDimension, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device,
typename Scalar_, typename Index,
typename nocontract_t, typename contract_t,
int Side, size_t packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionSubMapper<Scalar_, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
{
public:
typedef Scalar_ Scalar;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
typedef TensorContractionInputMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
typedef TensorContractionSubMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
typedef Self LinearMapper;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
: m_base_mapper(base_mapper), m_depth_offset(vert_offset), m_col_offset(horiz_offset) {
m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const Self& base_mapper, Index vert_offset, Index horiz_offset)
: m_base_mapper(base_mapper.m_base_mapper), m_depth_offset(vert_offset+base_mapper.m_depth_offset), m_col_offset(horiz_offset+base_mapper.m_col_offset) {
m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
return m_base_mapper.loadCoeff(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
return m_base_mapper(i + m_depth_offset, j + m_col_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
return m_base_mapper.loadPacket(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
return m_base_mapper.template loadPacket(i + m_depth_offset, j + m_col_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar loadCoeffStandard(Index i) const {
return m_base_mapper.loadCoeffStandard(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index i) const {
return m_base_mapper.loadPacketFast(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index i) const {
return m_base_mapper.loadPacketStandard(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
}
template <typename Packet>
EIGEN_DEVICE_FUNC bool aligned(Index) const {
return false;
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
return m_base_mapper.nonStandardPatches();
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_base_mapper.m_rowInputStride; }
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Index patchRows() const { return m_base_mapper.m_colStride; }
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Index patchCols() const { return m_base_mapper.m_patch_cols; }
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, const Index baseIndex) const {
const Index inputIndex = depth + baseIndex;
return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex);
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE bool padRow(const Index row) const {
const Index r = m_rowIndex + row;
return r < 0 | r >= m_base_mapper.m_inputRows;
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE bool padCol(const Index col) const {
const Index c = m_colIndex + col;
return c < 0 | c >= m_base_mapper.m_inputCols;
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Index baseIndex(const Index row, const Index col) const {
const Index r = m_rowIndex + row;
const Index c = m_colIndex + col;
return r * m_base_mapper.m_rowInputStride + c * m_base_mapper.m_colInputStride + m_otherIndex;
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Index rowOffset() const {
const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
return patchOffset-colOffset*m_base_mapper.m_colStride;
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Index colOffset() const {
const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
return colOffset;
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Index depthOffset() const {
const Index patchOffset = m_depth_offset % m_base_mapper.patchDepth();
return patchOffset;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
return LinearMapper(m_base_mapper, i + m_depth_offset, j + m_col_offset);
}
private:
const ParentMapper& m_base_mapper; // that was a reference before
Index m_depth_offset; // First row in the input matrix
Index m_col_offset; // First col in the input matrix
Index m_rowIndex; // precomputed row index corresponding to the col offset
Index m_colIndex; // precomputed col index corresponding to the col offset
Index m_otherIndex; // precomputed other index corresponding to the col offset
};
template <typename NewDimension, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device,
typename Scalar, typename Index,
typename nocontract_t, typename contract_t,
int Side, size_t packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr>
struct gemm_pack_rhs<Scalar, Index, TensorContractionSubMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>, nr, ColMajor, false, false> {
typedef TensorContractionSubMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
typedef SubMapper DataMapper;
static inline Index ceil_div(Index a, Index b) {
return (a + b - 1) / b;
}
EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0) const {
eigen_assert(stride == 0);
eigen_assert(offset == 0);
EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
typedef typename DataMapper::LinearMapper LinearMapper;
typedef typename packet_traits<Scalar>::type Packet;
const Index packet_cols4 = (cols/4) * 4;
const Index peeled_k = (depth/packet_size) * packet_size;
const bool non_standard_patches = rhs.nonStandardPatches();
for(Index j2=0; j2<packet_cols4; j2+=4)
{
const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
Index k=0;
if((packet_size%4)==0 && !non_standard_patches)
{
const Index patch_depth = rhs.patchDepth();
if ((patch_depth % packet_size) == 0) {
const Index patch_cols = rhs.patchCols();
const Index patch_rows = rhs.patchRows();
const Index startCol = rhs.colOffset();
const Index max_cols = std::min<Index>(ceil_div(peeled_k, patch_rows*patch_depth)+startCol, patch_cols);
for (Index c = startCol; c < max_cols; ++c) {
eigen_assert(k < peeled_k);
const Index startRow = (c == startCol) ? rhs.rowOffset() : 0;
const Index max_rows = std::min<Index>(ceil_div(peeled_k-c*patch_rows*patch_depth, patch_depth)+startRow, patch_rows);
const bool pad_col0 = dm0.padCol(c);
const bool pad_col1 = dm1.padCol(c);
const bool pad_col2 = dm2.padCol(c);
const bool pad_col3 = dm3.padCol(c);
for (Index r = startRow; r < max_rows; ++r) {
eigen_assert(k < peeled_k);
const bool pad0 = pad_col0 || dm0.padRow(r);
const bool pad1 = pad_col1 || dm1.padRow(r);
const bool pad2 = pad_col2 || dm2.padRow(r);
const bool pad3 = pad_col3 || dm3.padRow(r);
const Index idx0 = dm0.baseIndex(r, c);
const Index idx1 = dm1.baseIndex(r, c);
const Index idx2 = dm2.baseIndex(r, c);
const Index idx3 = dm3.baseIndex(r, c);
const Index startDepth = ((c == startCol) && (r == startRow)) ? rhs.depthOffset() : 0;
const Index max_depth = std::min<Index>(peeled_k-c*patch_rows*patch_depth-r*patch_depth+startDepth, patch_depth);
eigen_assert(max_depth % packet_size == 0);
for (Index d = startDepth; d < max_depth; d += packet_size) {
eigen_assert(k < peeled_k);
PacketBlock<Packet, 4> kernel;
kernel.packet[0] = pad0 ? pset1<Packet>(0) : rhs.packetNoPadding(d, idx0);
kernel.packet[1] = pad1 ? pset1<Packet>(0) : rhs.packetNoPadding(d, idx1);
kernel.packet[2] = pad2 ? pset1<Packet>(0) : rhs.packetNoPadding(d, idx2);
kernel.packet[3] = pad3 ? pset1<Packet>(0) : rhs.packetNoPadding(d, idx3);
ptranspose(kernel);
pstoreu(block+0*packet_size, kernel.packet[0]);
pstoreu(block+1*packet_size, kernel.packet[1]);
pstoreu(block+2*packet_size, kernel.packet[2]);
pstoreu(block+3*packet_size, kernel.packet[3]);
block+=4*packet_size;
k += packet_size;
}
}
}
for(; k<peeled_k; k+=packet_size) {
PacketBlock<Packet, 4> kernel;
kernel.packet[0] = dm0.loadPacketFast(k);
kernel.packet[1] = dm1.loadPacketFast(k);
kernel.packet[2] = dm2.loadPacketFast(k);
kernel.packet[3] = dm3.loadPacketFast(k);
ptranspose(kernel);
pstoreu(block+0*packet_size, kernel.packet[0]);
pstoreu(block+1*packet_size, kernel.packet[1]);
pstoreu(block+2*packet_size, kernel.packet[2]);
pstoreu(block+3*packet_size, kernel.packet[3]);
block+=4*packet_size;
}
}
else {
for(; k<peeled_k; k+=packet_size) {
PacketBlock<Packet, 4> kernel;
kernel.packet[0] = dm0.loadPacketStandard(k);
kernel.packet[1] = dm1.loadPacketStandard(k);
kernel.packet[2] = dm2.loadPacketStandard(k);
kernel.packet[3] = dm3.loadPacketStandard(k);
ptranspose(kernel);
pstoreu(block+0*packet_size, kernel.packet[0]);
pstoreu(block+1*packet_size, kernel.packet[1]);
pstoreu(block+2*packet_size, kernel.packet[2]);
pstoreu(block+3*packet_size, kernel.packet[3]);
block+=4*packet_size;
}
}
}
if (!rhs.nonStandardPatches()) {
for(; k<depth; k++)
{
block[0] = dm0.loadCoeffStandard(k);
block[1] = dm1.loadCoeffStandard(k);
block[2] = dm2.loadCoeffStandard(k);
block[3] = dm3.loadCoeffStandard(k);
block += 4;
}
}
else {
for(; k<depth; k++)
{
block[0] = dm0(k);
block[1] = dm1(k);
block[2] = dm2(k);
block[3] = dm3(k);
block += 4;
}
}
}
// copy the remaining columns one at a time (nr==1)
for(Index j2=packet_cols4; j2<cols; ++j2)
{
const SubMapper dm0 = rhs.getLinearMapper(0, j2);
for(Index k=0; k<depth; k++)
{
*block = dm0(k);
block += 1;
}
}
}
};
#endif // EIGEN_VECTORIZE
} // end namespace internal
/** SpatialConvolution
* \ingroup CXX11_NeuralNetworks_Module
*
* \brief Applies a 2D convolution over a multichannel input image.
*
* The input parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others)
* The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width)
* The input and the kernel must both be in col-major layout. The result will also be in col-major layout.
*
* If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels.
*
* The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be filters, height, width (and others if applicable).
*
* It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output.
*
*/
template <typename Input, typename Kernel>
EIGEN_ALWAYS_INLINE
static const typename internal::conditional<
internal::traits<Input>::Layout == ColMajor,
TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 1>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const Kernel>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > > >,
TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 1>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> >, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const Kernel> > > >::type
SpatialConvolution(const Input& input, const Kernel& kernel, const DenseIndex stride = 1, const PaddingType padding_type = PADDING_SAME, const DenseIndex in_stride = 1) {
typedef typename internal::traits<Input>::Index TensorIndex;
TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
TensorRef<Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel);
EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE);
static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
static const int NumDims = internal::traits<Input>::NumDimensions;
// Number of filters to apply. This is the same as the output depth of the result
const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3];
// Number of channels. This is the same as the input depth.
const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2];
const TensorIndex kernelRows = isColMajor ? kern.dimensions()[2] : kern.dimensions()[1];
const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
const DenseIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1);
const DenseIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1);
array<IndexPair<TensorIndex>, 1> contract_dims;
contract_dims[0] = IndexPair<TensorIndex>(1, 0);
const TensorIndex InputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
const TensorIndex InputCols = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
TensorIndex out_height;
TensorIndex out_width;
switch (padding_type) {
case PADDING_VALID:
out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) / static_cast<float>(stride));
out_width = numext::ceil((InputCols - kernelColsEff + 1.f) / static_cast<float>(stride));
break;
case PADDING_SAME:
out_height = numext::ceil(InputRows / static_cast<float>(stride));
out_width = numext::ceil(InputCols / static_cast<float>(stride));
break;
default:
eigen_assert(false && "unexpected padding");
}
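// Worked example of the two padding modes (illustrative numbers only):
// with InputRows = InputCols = 32, kernelRows = kernelCols = 5, stride = 2
// and in_stride = 1 (so kernelRowsEff = kernelColsEff = 5),
//   PADDING_VALID: out = ceil((32 - 5 + 1) / 2) = ceil(14.0) = 14
//   PADDING_SAME:  out = ceil(32 / 2)           = 16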
// Molds the output of the patch extraction code into a 2d tensor:
// - the first dimension (dims[0]): the patch values to be multiplied with the kernels
// - the second dimension (dims[1]): everything else
DSizes<TensorIndex, 2> pre_contract_dims;
if (isColMajor) {
pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols;
pre_contract_dims[1] = out_height * out_width;
for (int i = 3; i < NumDims; ++i) {
pre_contract_dims[1] *= in.dimension(i);
}
} else {
pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols;
pre_contract_dims[0] = out_height * out_width;
for (int i = 0; i < NumDims - 3; ++i) {
pre_contract_dims[0] *= in.dimension(i);
}
}
// Molds the output of the contraction into the shape expected by the user
// (assuming this is ColMajor):
// - 1st dim: kernel filters
// - 2nd dim: output height
// - 3rd dim: output width
// - 4th dim and beyond: everything else including batch size
DSizes<TensorIndex, NumDims> post_contract_dims;
if (isColMajor) {
post_contract_dims[0] = kernelFilters;
post_contract_dims[1] = out_height;
post_contract_dims[2] = out_width;
for (int i = 3; i < NumDims; ++i) {
post_contract_dims[i] = in.dimension(i);
}
} else {
post_contract_dims[NumDims - 1] = kernelFilters;
post_contract_dims[NumDims - 2] = out_height;
post_contract_dims[NumDims - 3] = out_width;
for (int i = 0; i < NumDims - 3; ++i) {
post_contract_dims[i] = in.dimension(i);
}
}
DSizes<TensorIndex, 2> kernel_dims;
if (isColMajor) {
kernel_dims[0] = kernelFilters;
kernel_dims[1] = kernelChannels * kernelRows * kernelCols;
} else {
kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
kernel_dims[1] = kernelFilters;
}
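// Continuing the illustrative numbers above (col-major, PADDING_SAME,
// stride 2, so out_height = out_width = 16) with kernelChannels = 3,
// kernelFilters = 16 and a single batch dimension of size 8:
//   pre_contract_dims  = (3 * 5 * 5, 16 * 16 * 8) = (75, 2048)
//   kernel_dims        = (16, 75)
//   contract_dims      = {(1, 0)}  -> contracts the 75-long patch axis,
//                        producing a (16, 2048) matrix
//   post_contract_dims = (16, 16, 16, 8)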
// TODO(yangke): choose() is defined in TensorContraction.h -- consider
// moving it to somewhere more "common".
return choose(Cond<internal::traits<Input>::Layout == ColMajor>(),
kernel.reshape(kernel_dims).contract(input.extract_image_patches(kernelRows, kernelCols, stride, stride, in_stride, in_stride, padding_type).reshape(pre_contract_dims), contract_dims).reshape(post_contract_dims),
input.extract_image_patches(kernelRows, kernelCols, stride, stride, in_stride, in_stride, padding_type).reshape(pre_contract_dims).contract(kernel.reshape(kernel_dims), contract_dims).reshape(post_contract_dims));
}
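// A minimal usage sketch of SpatialConvolution, assuming a col-major float
// input of shape (channels, rows, cols, batch) and a kernel of shape
// (filters, channels, kernel_rows, kernel_cols). The shapes, the function
// name and the preprocessor guard below are illustrative assumptions, not
// part of the library API.
#ifdef EIGEN_SPATIAL_CONVOLUTION_USAGE_SKETCH
inline Tensor<float, 4> SpatialConvolutionUsageSketch() {
  Tensor<float, 4> input(3, 32, 32, 8);   // channels, rows, cols, batch
  Tensor<float, 4> kernel(16, 3, 5, 5);   // filters, channels, k_rows, k_cols
  input.setRandom();
  kernel.setRandom();
  // Default stride = 1 and PADDING_SAME: the result has shape (16, 32, 32, 8).
  Tensor<float, 4> output = SpatialConvolution(input, kernel);
  return output;
}
#endif  // EIGEN_SPATIAL_CONVOLUTION_USAGE_SKETCH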
} // end namespace Eigen
#endif // EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2015 Jianwei Cui <thucjw@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H
namespace Eigen {
/** \class TensorConvolutionByFFT
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor convolution class that evaluates the convolution via the FFT
* (element-wise multiplication in the frequency domain).
*
*/
namespace internal {
template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename promote_storage_type<typename InputXprType::Scalar,
typename KernelXprType::Scalar>::ret Scalar;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
typename traits<KernelXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<InputXprType>::Index,
typename traits<KernelXprType>::Index>::type Index;
typedef typename InputXprType::Nested LhsNested;
typedef typename KernelXprType::Nested RhsNested;
typedef typename remove_reference<LhsNested>::type _LhsNested;
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const int NumDimensions = traits<InputXprType>::NumDimensions;
static const int Layout = traits<InputXprType>::Layout;
enum {
Flags = 0,
};
};
template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
{
typedef const TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>& type;
};
template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> >::type>
{
typedef TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> type;
};
} // end namespace internal
template<typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionByFFTOp : public TensorBase<TensorConvolutionByFFTOp<Indices, InputXprType, KernelXprType> >
{
public:
typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType,
typename KernelXprType::PacketReturnType>::ret PacketReturnType;
typedef typename Eigen::internal::nested<TensorConvolutionByFFTOp>::type Nested;
typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionByFFTOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
: m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Indices& indices() const { return m_indices; }
/** \returns the nested expressions */
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const typename internal::remove_all<typename InputXprType::Nested>::type&
inputExpression() const { return m_input_xpr; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const typename internal::remove_all<typename KernelXprType::Nested>::type&
kernelExpression() const { return m_kernel_xpr; }
protected:
typename InputXprType::Nested m_input_xpr;
typename KernelXprType::Nested m_kernel_xpr;
const Indices m_indices;
};
template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionByFFTOp<Indices, InputArgType, KernelArgType>, Device>
{
typedef TensorConvolutionByFFTOp<Indices, InputArgType, KernelArgType> XprType;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
static const int NumKernelDims = internal::array_size<Indices>::value;
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
enum {
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned &
TensorEvaluator<KernelArgType, Device>::IsAligned,
PacketAccess = false,
BlockAccess = false,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_inputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
}
} else {
m_inputStride[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
}
}
m_dimensions = m_inputImpl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < NumKernelDims; ++i) {
const Index index = op.indices()[i];
const Index input_dim = input_dims[index];
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
m_dimensions[index] = result_dim;
if (i > 0) {
m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
} else {
m_kernelStride[0] = 1;
}
m_indexStride[i] = m_inputStride[index];
}
m_outputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
}
} else {
for (int i = NumKernelDims - 1; i >= 0; --i) {
const Index index = op.indices()[i];
const Index input_dim = input_dims[index];
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
m_dimensions[index] = result_dim;
if (i < NumKernelDims - 1) {
m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
} else {
m_kernelStride[NumKernelDims - 1] = 1;
}
m_indexStride[i] = m_inputStride[index];
}
m_outputStride[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
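// evalSubExprsIfNeeded below evaluates the convolution with the convolution
// theorem: the kernel is reversed and zero-padded to the input size, both
// tensors are transformed with a forward FFT, multiplied element-wise in the
// frequency domain, transformed back with an inverse FFT, and the valid
// region (input extent minus kernel extent plus one along each dimension) is
// sliced out of the product, i.e.
//   result = slice(IFFT(FFT(input) * FFT(pad(reverse(kernel))))).
// The real part of the sliced result is written into the destination buffer.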
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
m_inputImpl.evalSubExprsIfNeeded(NULL);
m_kernelImpl.evalSubExprsIfNeeded(NULL);
typedef typename internal::traits<InputArgType>::Index TensorIndex;
Tensor<Scalar, NumDims, Layout, TensorIndex> input(m_inputImpl.dimensions());
for (int i = 0; i < m_inputImpl.dimensions().TotalSize(); ++i) {
input.data()[i] = m_inputImpl.coeff(i);
}
Tensor<Scalar, NumDims, Layout, TensorIndex> kernel(m_kernelImpl.dimensions());
for (int i = 0; i < m_kernelImpl.dimensions().TotalSize(); ++i) {
kernel.data()[i] = m_kernelImpl.coeff(i);
}
array<std::pair<ptrdiff_t, ptrdiff_t>, NumDims> paddings;
for (int i = 0; i < NumDims; ++i) {
paddings[i] = std::make_pair(0, m_inputImpl.dimensions()[i] - m_kernelImpl.dimensions()[i]);
}
Eigen::array<bool, NumKernelDims> reverse;
for (int i = 0; i < NumKernelDims; ++i) {
reverse[i] = true;
}
Eigen::array<int, NumDims> fft;  // indices of the dimensions to transform
for (int i = 0; i < NumDims; ++i) {
fft[i] = i;
}
Eigen::DSizes<TensorIndex, NumDims> slice_offsets;
for (int i = 0; i < NumDims; ++i) {
slice_offsets[i] = m_kernelImpl.dimensions()[i] - 1;
}
Eigen::DSizes<TensorIndex, NumDims> slice_extents;
for (int i = 0; i < NumDims; ++i) {
slice_extents[i] = m_inputImpl.dimensions()[i] - m_kernelImpl.dimensions()[i] + 1;
}
Tensor<Scalar, NumDims, Layout, TensorIndex> kernel_variant = kernel.reverse(reverse).pad(paddings);
Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> kernel_fft = kernel_variant.template fft<Eigen::BothParts, FFT_FORWARD>(fft);
//Tensor<std::complex<Scalar>, NumDims, Layout|IndexType> kernel_fft = kernel.reverse(reverse).pad(paddings).template fft<2>(fft);
Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> input_fft = input.template fft<Eigen::BothParts, FFT_FORWARD>(fft);
Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> prod = (input_fft * kernel_fft).template fft<Eigen::BothParts, FFT_REVERSE>(fft);
Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> tensor_result = prod.slice(slice_offsets, slice_extents);
for (int i = 0; i < tensor_result.size(); ++i) {
data[i] = std::real(tensor_result.data()[i]);
}
return false;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_inputImpl.cleanup();
if (m_local_kernel) {
m_device.deallocate((void*)m_kernel);
m_local_kernel = false;
}
m_kernel = NULL;
}
void evalTo(typename XprType::Scalar* buffer) {
evalSubExprsIfNeeded(NULL);
for (int i = 0; i < dimensions().TotalSize(); ++i) {
buffer[i] += coeff(i);
}
cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
CoeffReturnType result = CoeffReturnType(0);
return result;
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
private:
array<Index, NumDims> m_inputStride;
array<Index, NumDims> m_outputStride;
array<Index, NumKernelDims> m_indexStride;
array<Index, NumKernelDims> m_kernelStride;
TensorEvaluator<InputArgType, Device> m_inputImpl;
TensorEvaluator<KernelArgType, Device> m_kernelImpl;
Dimensions m_dimensions;
KernelArgType m_kernelArg;
const Scalar* m_kernel;
bool m_local_kernel;
const Device& m_device;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H
#include "unsupported/Eigen/SpecialFunctions"
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.