Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Commit 59a8ebc6
Authored on Aug 07, 2017 by caoying03

Merge branch 'develop' into kmax_score_layer

Parents: 98a83cd2, 50fe7abe
Showing 126 changed files with 4841 additions and 3682 deletions (+4841 -3682)
Changed files:

.pre-commit-config.yaml  +1 -1
CMakeLists.txt  +2 -2
Dockerfile  +4 -1
cmake/configure.cmake  +0 -2
cmake/cpplint.cmake  +10 -18
cmake/external/any.cmake  +1 -1
cmake/external/gflags.cmake  +8 -1
cmake/external/openblas.cmake  +7 -2
cmake/external/python.cmake  +0 -1
cmake/flags.cmake  +2 -1
cmake/generic.cmake  +13 -0
cmake/util.cmake  +8 -5
doc/design/mkldnn/README.MD  +110 -0
doc/design/mkldnn/image/overview.png  +0 -0
paddle/.set_python_path.sh  +12 -19
paddle/api/test/CMakeLists.txt  +6 -2
paddle/cuda/src/hl_batch_transpose.cu  +7 -9
paddle/cuda/src/hl_cuda_aggregate.cu  +61 -101
paddle/cuda/src/hl_cuda_cnn.cu  +275 -134
paddle/cuda/src/hl_cuda_lstm.cu  +331 -159
paddle/cuda/src/hl_cuda_matrix.cu  +147 -196
paddle/cuda/src/hl_cuda_sequence.cu  +96 -88
paddle/cuda/src/hl_cuda_sparse.cu  +475 -509
paddle/cuda/src/hl_perturbation_util.cu  +104 -45
paddle/cuda/src/hl_table_apply.cu  +35 -33
paddle/cuda/src/hl_top_k.cu  +127 -114
paddle/framework/attribute.proto  +7 -7
paddle/framework/op_desc.proto  +17 -17
paddle/framework/op_proto.proto  +72 -70
paddle/framework/operator.cc  +2 -2
paddle/framework/operator.h  +9 -5
paddle/framework/operator_test.cc  +4 -4
paddle/function/BlockExpandOpTest.cpp  +8 -8
paddle/function/BufferArgTest.cpp  +1 -1
paddle/function/ContextProjectionOpGpu.cu  +70 -56
paddle/function/CosSimOpGpu.cu  +34 -26
paddle/function/CropOpGpu.cu  +59 -25
paddle/function/CrossMapNormalOpGpu.cu  +46 -25
paddle/function/CrossMapNormalOpTest.cpp  +10 -10
paddle/function/DepthwiseConvOpGpu.cu  +253 -218
paddle/function/FunctionTest.cpp  +6 -6
paddle/function/Im2ColOpGpu.cu  +150 -106
paddle/function/MulOpGpu.cu  +1 -1
paddle/function/PadOpGpu.cu  +49 -15
paddle/function/RowConvOpGpu.cu  +87 -68
paddle/function/TensorShapeTest.cpp  +12 -12
paddle/function/TensorTypeTest.cpp  +7 -7
paddle/function/nnpack/NNPACKConvOp.cpp  +53 -47
paddle/gserver/activations/ActivationFunction.cpp  +7 -3
paddle/gserver/layers/ExpandConvLayer.cpp  +1 -2
paddle/gserver/layers/GruCompute.cu  +4 -3
paddle/gserver/layers/KmaxSeqScoreLayer.cpp  +10 -4
paddle/gserver/layers/LstmCompute.cu  +38 -17
paddle/gserver/layers/PrintLayer.cpp  +1 -1
paddle/gserver/tests/CMakeLists.txt  +0 -5
paddle/gserver/tests/test_ActivationGrad.cpp  +33 -0
paddle/math/BaseMatrix.cu  +619 -366
paddle/math/TrainingAlgorithmOp.cu  +32 -33
paddle/math/tests/test_Tensor.cu  +167 -170
paddle/math/tests/test_lazyAssign.cu  +40 -34
paddle/math/tests/test_matrixCompare.cpp  +1 -1
paddle/operators/.clang-format  +5 -0
paddle/operators/CMakeLists.txt  +2 -1
paddle/operators/add_op.cc  +5 -5
paddle/operators/add_op.h  +1 -1
paddle/operators/cross_entropy_op.cc  +18 -4
paddle/operators/cross_entropy_op.h  +34 -9
paddle/operators/fc_op.cc  +6 -8
paddle/operators/fill_zeros_like_op.cc  +3 -4
paddle/operators/fill_zeros_like_op.h  +1 -1
paddle/operators/mean_op.cc  +3 -3
paddle/operators/mean_op.h  +2 -2
paddle/operators/mul_op.cc  +12 -8
paddle/operators/mul_op.h  +1 -1
paddle/operators/net_op.h  +2 -2
paddle/operators/net_op_test.cc  +2 -2
paddle/operators/recurrent_op.cc  +56 -208
paddle/operators/recurrent_op.h  +11 -82
paddle/operators/recurrent_op_test.cc  +10 -13
paddle/operators/rnn/recurrent_op_utils.cc  +160 -0
paddle/operators/rnn/recurrent_op_utils.h  +93 -0
paddle/operators/rowwise_add_op.cc  +2 -2
paddle/operators/rowwise_add_op.h  +1 -1
paddle/operators/sgd_op.cc  +2 -2
paddle/operators/sgd_op.h  +1 -1
paddle/operators/sigmoid_op.cc  +7 -7
paddle/operators/sigmoid_op.cu  +2 -0
paddle/operators/sigmoid_op.h  +20 -1
paddle/operators/softmax_op.cc  +3 -3
paddle/operators/softmax_op.h  +2 -2
paddle/operators/type_alias.h  +4 -9
paddle/scripts/docker/build.sh  +32 -15
paddle/scripts/run_python_tests.sh  +0 -55
paddle/setup.py.in  +3 -1
paddle/trainer/tests/compare_sparse_data  +0 -0
paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data  +0 -0
paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist  +1 -1
paddle/trainer/tests/sample_trainer_config_compare_sparse.conf  +154 -0
paddle/trainer/tests/test_CompareSparse.cpp  +1 -1
paddle/trainer/tests/train_sparse.list  +1 -0
proto/DataConfig.proto  +27 -26
proto/DataFormat.proto  +22 -16
proto/ModelConfig.proto  +57 -57
proto/OptimizerConfig.proto  +36 -36
proto/ParameterConfig.proto  +23 -22
proto/ParameterServerConfig.proto  +10 -13
proto/ParameterService.proto  +37 -64
proto/TrainerConfig.proto  +43 -39
python/paddle/v2/framework/create_op_creation_methods.py  +15 -15
python/paddle/v2/framework/tests/CMakeLists.txt  +23 -17
python/paddle/v2/framework/tests/op_test_util.py  +14 -12
python/paddle/v2/framework/tests/test_add_two_op.py  +5 -3
python/paddle/v2/framework/tests/test_cross_entropy_op.py  +8 -4
python/paddle/v2/framework/tests/test_mean_op.py  +2 -2
python/paddle/v2/framework/tests/test_mul_op.py  +5 -3
python/paddle/v2/framework/tests/test_op_creation_methods.py  +17 -17
python/paddle/v2/framework/tests/test_protobuf.py  +2 -4
python/paddle/v2/framework/tests/test_recurrent_op.py  +48 -42
python/paddle/v2/framework/tests/test_rowwise_add_op.py  +5 -3
python/paddle/v2/framework/tests/test_sgd_op.py  +7 -4
python/paddle/v2/framework/tests/test_sigmoid_op.py  +5 -2
python/paddle/v2/framework/tests/test_softmax_op.py  +4 -2
python/paddle/v2/plot/tests/CMakeLists.txt  +1 -1
python/paddle/v2/reader/tests/CMakeLists.txt  +2 -1
python/paddle/v2/tests/CMakeLists.txt  +7 -2
python/setup.py.in  +1 -1
.pre-commit-config.yaml
@@ -24,7 +24,7 @@
         description: Format files with ClangFormat.
         entry: clang-format -i
         language: system
-        files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
     sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
CMakeLists.txt
@@ -36,8 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
Dockerfile
@@ -27,13 +27,16 @@ RUN apt-get update && \
     git python-pip python-dev openssh-server bison \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev \
-    python-numpy python-matplotlib gcc-4.8 g++-4.8 \
+    python-matplotlib gcc-4.8 g++-4.8 \
     automake locales clang-format-3.8 swig doxygen cmake \
     liblapack-dev liblapacke-dev libboost-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
     net-tools && \
     apt-get clean -y
 
+# paddle is using numpy.flip, which is introduced since 1.12.0
+RUN pip --no-cache-dir install 'numpy>=1.12.0'
+
 # Install Go and glide
 RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
     tar -C /usr/local -xzf go.tgz && \
cmake/configure.cmake
@@ -74,8 +74,6 @@ if(WITH_MKLDNN)
     set(OPENMP_FLAGS "-fopenmp")
     set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
     set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
 else()
cmake/cpplint.cmake
@@ -42,29 +42,21 @@ macro(add_style_check_target TARGET_NAME)
   if(WITH_STYLE_CHECK)
     set(SOURCES_LIST ${ARGN})
     list(REMOVE_DUPLICATES SOURCES_LIST)
-    list(SORT SOURCES_LIST)
     foreach(filename ${SOURCES_LIST})
-      set(LINT ON)
       foreach(pattern ${IGNORE_PATTERN})
         if(filename MATCHES ${pattern})
-          message(STATUS "DROP LINT ${filename}")
-          set(LINT OFF)
+          list(REMOVE_ITEM SOURCES_LIST ${filename})
         endif()
       endforeach()
-      if(LINT MATCHES ON)
-        # cpplint code style
-        get_filename_component(base_filename ${filename} NAME)
-        set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
-        add_custom_command(OUTPUT ${CUR_GEN} PRE_BUILD
-                           COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
-                                   "--filter=${STYLE_FILTER}"
-                                   "--write-success=${CUR_GEN}" ${filename}
-                           DEPENDS ${filename} ${PROJ_ROOT}/paddle/scripts/cpplint.py
-                           WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-        add_custom_target(${base_filename}.cpplint DEPENDS ${CUR_GEN})
-        add_dependencies(${TARGET_NAME} ${base_filename}.cpplint)
-      endif()
     endforeach()
+
+    if(SOURCES_LIST)
+      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+                         COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+                                 "--filter=${STYLE_FILTER}"
+                                 ${SOURCES_LIST}
+                         COMMENT "cpplint: Checking source code style"
+                         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+    endif()
   endif()
 endmacro()
cmake/external/any.cmake
@@ -7,7 +7,7 @@ INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
 ExternalProject_Add(
     extern_lib_any
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/thelink2012/any.git"
+    GIT_REPOSITORY  "https://github.com/PaddlePaddle/any.git"
     GIT_TAG         "8fef1e93710a0edf8d7658999e284a1142c4c020"
     PREFIX          ${ANY_SOURCE_DIR}
     UPDATE_COMMAND  ""
cmake/external/gflags.cmake
@@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
 ExternalProject_Add(
     extern_gflags
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    # TODO(yiwang): The annoying warnings mentioned in
+    # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
+    # gflags. I fired a PR https://github.com/gflags/gflags/pull/230
+    # to fix it. Before it gets accepted by the gflags team, we use
+    # my personal fork, which contains above fix, temporarily. Let's
+    # change this back to the official Github repo once my PR is
+    # merged.
+    GIT_REPOSITORY  "https://github.com/wangkuiyi/gflags.git"
     PREFIX          ${GFLAGS_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
cmake/external/openblas.cmake
@@ -69,8 +69,13 @@ ENDIF(NOT ${CBLAS_FOUND})
 MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
 INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 
-ADD_LIBRARY(cblas STATIC IMPORTED)
-SET_PROPERTY(TARGET cblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES})
+# FIXME(gangliao): generate cblas target to track all high performance
+# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
+SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
+FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+ADD_LIBRARY(cblas STATIC ${dummyfile})
+TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
 
 IF(NOT ${CBLAS_FOUND})
   ADD_DEPENDENCIES(cblas extern_openblas)
   LIST(APPEND external_project_dependencies cblas)
cmake/external/python.cmake
@@ -24,7 +24,6 @@ IF(WITH_PYTHON)
 ENDIF(WITH_PYTHON)
 
 SET(py_env "")
-SET(USE_VIRTUALENV_FOR_TEST 1)
 IF(PYTHONINTERP_FOUND)
   find_python_module(pip REQUIRED)
   find_python_module(numpy REQUIRED)
cmake/flags.cmake
@@ -115,7 +115,7 @@ set(COMMON_FLAGS
     -Wno-error=literal-suffix
     -Wno-error=sign-compare
     -Wno-error=unused-local-typedefs
-    -Wno-error=parentheses-equality # Warnings in Pybind11
+    -Wno-error=parentheses-equality # Warnings in pybind11
 )
 
 set(GPU_COMMON_FLAGS
@@ -195,6 +195,7 @@ endif()
 # Modern gpu architectures: Pascal
 if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
   list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
+  list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
 endif()
 
 # Custom gpu architecture
cmake/generic.cmake
@@ -403,3 +403,16 @@ function(py_proto_compile TARGET_NAME)
   protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
   add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
 endfunction()
+
+function(py_test TARGET_NAME)
+  if(WITH_TESTING)
+    set(options STATIC static SHARED shared)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    add_test(NAME ${TARGET_NAME}
+             COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
+             python2 ${py_test_SRCS}
+             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  endif()
+endfunction()
cmake/util.cmake
@@ -149,9 +149,12 @@ endfunction()
 # Create a python unittest using run_python_tests.sh,
 # which takes care of making correct running environment
 function(add_python_test TEST_NAME)
-  add_test(NAME ${TEST_NAME}
-    COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR}
-      bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh
-        ${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  foreach(arg ${ARGN})
+    get_filename_component(py_fn ${arg} NAME_WE)
+    set(TRG_NAME ${TEST_NAME}_${py_fn})
+    add_test(NAME ${TRG_NAME}
+      COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
+        python2 ${arg}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  endforeach()
 endfunction()
doc/design/mkldnn/README.MD (new file, mode 100644)

# Intel® MKL-DNN on PaddlePaddle: Design Doc

We plan to integrate the Intel deep-neural-network math library (**MKL-DNN** \[[1](#references)\]) into PaddlePaddle, to make full use of the strengths of Intel platforms and to effectively improve PaddlePaddle's performance on Intel architectures.

Our short-term goals are:
- Implement the commonly used layers with MKL-DNN.
- Provide MKL-DNN implementations of the common deep networks VGG, GoogLeNet and ResNet.

## Contents

- [Overview](#overview)
- [Actions](#actions)
  - [CMake](#cmake)
  - [Layers](#layers)
  - [Activations](#activations)
  - [Unit Tests](#unit-tests)
  - [Protobuf Messages](#protobuf-messages)
  - [Python API](#python-api)
  - [Demos](#demos)
  - [Benchmarking](#benchmarking)
  - [Others](#others)
- [Design Concerns](#design-concerns)

## Overview

MKL-DNN will be integrated into PaddlePaddle as a third-party library; the overall architecture is shown in Figure 1.

<div align="center">
<img src="image/overview.png" width=350><br/>
Figure 1. PaddlePaddle on IA.
</div>

## Actions

The integration plan is roughly divided into the following parts.

### CMake

We will add a `WITH_MKLDNN` option to `CMakeLists.txt`; setting it to `ON` enables the MKL-DNN build and automatically turns on OpenMP to improve MKL-DNN performance.

We will also introduce a `WITH_MKLML` option for choosing whether to use the MKLML package shipped with MKL-DNN. The package can be used independently of MKL-DNN, but we recommend enabling MKLML together with MKL-DNN to get the best performance.

Accordingly, `mkldnn.cmake` and `mklml.cmake` will be added under `cmake/external`; they download the corresponding packages while PaddlePaddle is being built and place them in PaddlePaddle's third-party directory.

**Note**: when `WITH_MKLML=ON`, this package is preferred as PaddlePaddle's CBLAS and LAPACK library, so the logic in `cmake/cblas.cmake` will be adjusted slightly.

### Layers

All MKL-DNN-related C++ layers will be placed in `paddle/gserver/layers`, following PaddlePaddle's directory layout, and their file names will all start with *Mkldnn*.

All MKL-DNN layers will inherit from a parent class called `MkldnnLayer`, which in turn inherits from PaddlePaddle's base class `Layer` (see the sketch below).
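
A minimal C++ sketch of the class relationship described above, assuming simplified stand-ins for the real `Layer`/`LayerConfig` interfaces; only the `MkldnnLayer` name and the idea of overriding `init` come from this document:

```cpp
// Illustrative only: hypothetical, simplified types standing in for the real
// PaddlePaddle gserver classes.
#include <string>

struct LayerConfig { std::string name; };      // stand-in for the real config type
constexpr int kMkldnnDeviceId = -2;            // "-2" is reserved for MKL-DNN layers

class Layer {                                  // stand-in for paddle::Layer
public:
  virtual ~Layer() = default;
  virtual bool init(const LayerConfig& config) {
    deviceId_ = 0;                             // ordinary CPU device by default
    return true;
  }
protected:
  int deviceId_ = 0;
};

// Proposed common parent of every Mkldnn* layer.
class MkldnnLayer : public Layer {
public:
  bool init(const LayerConfig& config) override {
    Layer::init(config);
    deviceId_ = kMkldnnDeviceId;               // mark this layer as an MKL-DNN layer
    return true;
  }
};
```

Concrete layers (for example a hypothetical `MkldnnFcLayer`) would then derive from `MkldnnLayer` and implement their forward/backward passes with MKL-DNN primitives.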
### Activations

Since activation functions are independent of the layer concept in PaddlePaddle, a `MkldnnActivation.h` file will be added under `paddle/gserver/activations` to define the interfaces needed by MKL-DNN; the implementations will stay in `ActivationFunction.cpp`.

### Unit Tests

`test_Mkldnn.cpp` and `MkldnnTester.*` will be added under `paddle/gserver/test` for MKL-DNN testing.

For activations, we plan to add new test types directly to PaddlePaddle's existing test files.

### Protobuf Messages

Depending on the needs of specific layers, necessary options may be added to `proto/ModelConfig.proto`.

### Python API

Only the **v1 API** is considered for now.

We plan to add a `use_mkldnn` option to `python/paddle/trainer/config_parser.py` so that users can conveniently select the MKL-DNN layers. A possible implementation:

```python
use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
if use_mkldnn:
    self.layer_type = mkldnn_*
```

All MKL-DNN layer types will be prefixed with *mkldnn_* to tell them apart.

Necessary MKL-DNN interfaces may also be added to `activations.py` and `layers.py` under `python/paddle/trainer_config_helper`.

### Demos

An `mkldnn` folder will be added under `v1_api_demo`, containing demo scripts for MKL-DNN testing.

### Benchmarking

We will consider adding logic to `benchmark/paddle/image/run.sh` to benchmark with MKL-DNN enabled.

### Others

1. When MKL-DNN is used, CPU buffers will be aligned to 64 bytes.
2. Look deeper into PaddlePaddle for further optimization opportunities, for example using OpenMP to speed up SGD parameter updates.

## Design Concerns

To conform to PaddlePaddle's coding style \[[2](#references)\] while sacrificing as little MKL-DNN performance as possible \[[3](#references)\], we summarize the points that need special attention:

1. Use **deviceId_**. To add as few variables or functions to the parent class `Layer` as possible, we reuse the existing `deviceId_` member to mark a layer's property, defining `-2` as the device ID specific to `MkldnnLayer`.
2. Override the parent `Layer`'s **init** function and set `deviceId_` to `-2`, meaning the layer runs in the MKL-DNN environment.
3. Create `MkldnnMatrix` to manage the memory functions, interfaces and format information that MKL-DNN uses.
4. Create `MkldnnBase` to define classes and functions beyond layers and memory, including the `MkldnnStream` and `CpuEngine` that MKL-DNN needs, and possibly `FPGAEngine` in the future.
5. Add two `MkldnnMatrixPtr` members to **Argument**, named `mkldnnValue` and `mkldnnGrad`, to hold the memory buffers used by `MkldnnLayer`, and add a cvt function (to be renamed to something more suitable) that converts memory between the "CPU device" and the "MKL-DNN device".
6. Add a piece of logic to the parent `Layer`'s `getOutput` function that checks `deviceId` and, when the device differs between MKL-DNN and CPU, does an up-front conversion, i.e. calls `Argument`'s cvt function to unify the output on the required device (a sketch follows this list).
7. Add a `use_mkldnn` flag to the existing `FLAGS` for choosing whether to enable the MKL-DNN functionality.
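
A hedged sketch of points 1, 2, 5 and 6 above. The names `Argument`, `MkldnnMatrixPtr` and cvt come from this document; every type and signature below is an assumption made for illustration, not the actual PaddlePaddle code:

```cpp
// Illustrative only: simplified stand-ins for the classes discussed above.
#include <memory>

struct MkldnnMatrix {};                            // placeholder for the proposed class
using MkldnnMatrixPtr = std::shared_ptr<MkldnnMatrix>;

constexpr int kMkldnnDeviceId = -2;                // device id reserved for MkldnnLayer

struct Argument {
  MkldnnMatrixPtr mkldnnValue;                     // proposed extra buffers (point 5)
  MkldnnMatrixPtr mkldnnGrad;
  // Convert this argument's memory to the requested device; a real
  // implementation would reorder between CPU and MKL-DNN formats.
  void cvt(int targetDeviceId) { (void)targetDeviceId; }
};

// Point 6: before handing the output to a consumer on a different "device",
// convert it up front via Argument::cvt.
Argument& getOutput(Argument& output, int producerDeviceId, int consumerDeviceId) {
  if (producerDeviceId != consumerDeviceId) {
    output.cvt(consumerDeviceId);
  }
  return output;
}
```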
## References

1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN")
2. [The earlier proposal](https://github.com/PaddlePaddle/Paddle/pull/3096) introduced **nextLayer** information; however, in PaddlePaddle neither the pre-refactor layers nor the post-refactor ops are supposed to know anything about the next layer/op.
3. MKL-DNN's high-performance memory format differs from PaddlePaddle's native `NCHW` layout (the cuDNN part of PaddlePaddle also uses `NCHW`, so it does not have this problem). A conversion method is therefore needed, and the format should be converted only when necessary to get the best performance out of MKL-DNN.
doc/design/mkldnn/image/overview.png (new file, mode 100644, 9.7 KB)
paddle/.set_python_path.sh
@@ -21,22 +21,15 @@
 #
 # It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
 #
 PYPATH=""
-if ! python -c "import paddle" > /dev/null 2>/dev/null; then
-  set -x
-  PYPATH=""
-  while getopts "d:" opt; do
-    case $opt in
-      d)
-        PYPATH=$OPTARG
-        ;;
-    esac
-  done
-  shift $(($OPTIND - 1))
-  export PYTHONPATH=$PYPATH:$PYTHONPATH
-  $@
-else
-  echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
-  echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'"
-  exit 1
-fi
+set -x
+while getopts "d:" opt; do
+  case $opt in
+    d)
+      PYPATH=$OPTARG
+      ;;
+  esac
+done
+shift $(($OPTIND - 1))
+export PYTHONPATH=$PYPATH:$PYTHONPATH
+$@
paddle/api/test/CMakeLists.txt
-add_python_test(test_swig_api
-  testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py)
+py_test(testTrain SRCS testTrain.py)
+py_test(testMatrix SRCS testMatrix.py)
+py_test(testVector SRCS testVector.py)
+py_test(testTrainer SRCS testTrainer.py)
+py_test(testArguments SRCS testArguments.py)
+py_test(testGradientMachine SRCS testGradientMachine.py)
paddle/cuda/src/hl_batch_transpose.cu
@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_batch_transpose.h"
 #include "hl_base.h"
+#include "hl_batch_transpose.h"
 
 const int TILE_DIM = 64;
 const int BLOCK_ROWS = 16;
 
 // No bank-conflict transpose for a batch of data.
-__global__ void batchTransposeNoBankConflicts(real* odata,
-                                              const real* idata,
-                                              int numSamples, int width,
-                                              int height) {
+__global__ void batchTransposeNoBankConflicts(
+    real* odata, const real* idata, int numSamples, int width, int height) {
   __shared__ float tile[TILE_DIM][TILE_DIM + 1];
 
   const int x = blockIdx.x * TILE_DIM + threadIdx.x;
@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
             newX] = tile[threadIdx.x][j];
 }
 
-void batchTranspose(const real* input, real* output, int width, int height,
-                    int batchSize) {
+void batchTranspose(
+    const real* input, real* output, int width, int height, int batchSize) {
   dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
   dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
-  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
-      (output, input, batchSize, width, height);
+  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+      output, input, batchSize, width, height);
   CHECK_SYNC("batchTranspose failed!");
 }
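
For context on the launch geometry kept by the reformatted `batchTranspose` above: the matrix is tiled into `TILE_DIM x TILE_DIM` blocks handled by `TILE_DIM x BLOCK_ROWS` threads. Below is a small host-side C++ sketch of the same rounding-up arithmetic; the `divup` helper and the example sizes are assumptions for illustration, only the two constants come from the file above.

```cpp
#include <cstdio>

// Same rounding-up division the DIVUP macro in the launch above performs.
static int divup(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int TILE_DIM = 64, BLOCK_ROWS = 16;       // constants from hl_batch_transpose.cu
  int width = 1000, height = 300, batchSize = 8;  // example matrix batch (made up)

  // One block per 64x64 tile per sample; each block runs 64x16 threads,
  // so every thread handles TILE_DIM / BLOCK_ROWS = 4 elements of its tile.
  int gridX = divup(width, TILE_DIM);   // 16
  int gridY = divup(height, TILE_DIM);  // 5
  std::printf("grid = (%d, %d, %d), block = (%d, %d, 1)\n",
              gridX, gridY, batchSize, TILE_DIM, BLOCK_ROWS);
  return 0;
}
```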
paddle/cuda/src/hl_cuda_aggregate.cu
@@ -12,27 +12,23 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "hl_aggregate.h"
 #include "hl_base.h"
 #include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_aggregate.h"
-#include "hl_thread.ph"
 #include "hl_matrix_base.cuh"
+#include "hl_thread.ph"
 #include "paddle/utils/Logging.h"
 
 /**
  * @brief matrix row operator.
  */
 template <class Agg, int blockSize>
 __global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
   __shared__ real sum_s[blockSize];
   int cnt = (dimN + blockSize - 1) / blockSize;
   int rowId = blockIdx.x + blockIdx.y * gridDim.x;
   int index = rowId * dimN;
   int tid = threadIdx.x;
   int lmt = tid;
@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
   sum_s[tid] = tmp;
   __syncthreads();
   for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
     }
@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
 }
 
 template <class Agg>
 void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
   int blocksX = dimM;
   int blocksY = 1;
   dim3 threads(128, 1);
   dim3 grid(blocksX, blocksY);
   KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
       agg, A_d, C_d, dimN);
 }
 
 void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_sum failed");
 }
@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_max failed");
 }
@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_min failed");
 }
 
 /**
  * @brief matrix column operator.
  */
 template <class Agg>
 __global__ void KeMatrixColumnOp(
     Agg agg, real *E, real *Sum, int dimM, int dimN) {
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   real tmp = agg.init();
   if (rowIdx < dimN) {
@@ -127,15 +104,12 @@ __global__ void KeMatrixColumnOp(Agg agg,
   }
 }
 
 template <class Agg, int blockDimX, int blockDimY>
 __global__ void KeMatrixColumnOp_S(
     Agg agg, real *E, real *Sum, int dimM, int dimN) {
   __shared__ real _sum[blockDimX * blockDimY];
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   int index = threadIdx.y;
   real tmp = agg.init();
   if (rowIdx < dimN) {
@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
       index += blockDimY;
     }
   }
   _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
   __syncthreads();
   if (rowIdx < dimN) {
     if (threadIdx.y == 0) {
       real tmp = agg.init();
       for (int i = 0; i < blockDimY; i++) {
         tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
       }
       Sum[rowIdx] = tmp;
     }
@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
 }
 
 template <class Agg>
 void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
   if (dimN >= 8192) {
     int blocksX = (dimN + 128 - 1) / 128;
     int blocksY = 1;
     dim3 threads(128, 1);
     dim3 grid(blocksX, blocksY);
     KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
         agg, A_d, C_d, dimM, dimN);
   } else {
     int blocksX = (dimN + 32 - 1) / 32;
     int blocksY = 1;
     dim3 threads(32, 32);
     dim3 grid(blocksX, blocksY);
     KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
         agg, A_d, C_d, dimM, dimN);
   }
   return;
@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_column_sum failed");
 }
@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_column_max failed");
 }
@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_column_min failed");
 }
@@ -226,16 +184,16 @@ template <int blockSize>
 __global__ void KeVectorSum(real *E, real *Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
   int index = blockIdx.y * blockDim.x + threadIdx.x;
   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += E[index];
     index += blockDim.x * gridDim.y;
   }
   __syncthreads();
   for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }
@@ -259,38 +217,39 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
   dim3 threads(blockSize, 1);
   dim3 grid(blocksX, blocksY);
 
   struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }
 
   KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
       A_d, t_resource.gpu_mem, dimM);
   KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
       t_resource.gpu_mem, t_resource.cpu_mem, 128);
   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
 
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
   CHECK_EQ(cudaSuccess, err) << "CUDA error: "
                              << hl_get_device_error_string((size_t)err);
 }
 
 template <int blockSize>
 __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
   int index = blockIdx.y * blockDim.x + threadIdx.x;
   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += abs(E[index]);
     index += blockDim.x * gridDim.y;
   }
   __syncthreads();
   for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }
@@ -314,20 +273,21 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
   dim3 threads(blockSize, 1);
   dim3 grid(blocksX, blocksY);
 
   struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }
 
   KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
       A_d, t_resource.gpu_mem, dimM);
   KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
       t_resource.gpu_mem, t_resource.cpu_mem, 128);
   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
 
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
   CHECK_EQ(cudaSuccess, err) << "CUDA error: "
                              << hl_get_device_error_string((size_t)err);
 }
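
The kernels in the file above (KeMatrixRowOp, KeVectorSum, KeVectorAbsSum) all finish with the same shared-memory tree reduction over `sum_s`. Below is a CPU-side C++ analogue of that loop, assuming a power-of-two block size as the kernels do; the data in the example is made up:

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Mirrors: for (stride = blockSize / 2; stride > 0; stride /= 2)
//            if (tid < stride) sum_s[tid] += sum_s[tid + stride];
float blockReduceSum(std::vector<float> sum_s) {
  assert(!sum_s.empty() && (sum_s.size() & (sum_s.size() - 1)) == 0);  // power-of-two size
  for (size_t stride = sum_s.size() / 2; stride > 0; stride /= 2) {
    for (size_t tid = 0; tid < stride; ++tid) {
      sum_s[tid] += sum_s[tid + stride];
    }
  }
  return sum_s[0];
}

int main() {
  std::vector<float> partials(128, 1.0f);  // e.g. 128 threads each contributed 1.0
  std::printf("block sum = %.1f\n", blockReduceSum(partials));  // prints 128.0
  return 0;
}
```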
paddle/cuda/src/hl_cuda_cnn.cu
@@ -12,21 +12,27 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <float.h>
 #include "hl_base.h"
 #include "hl_cnn.h"
 #include "hl_device_functions.cuh"
 
 __global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
                                  const int channels, const int height,
                                  const int width, const int pooledH,
                                  const int pooledW, const int ksizeW,
                                  const int ksizeH, const int strideH,
                                  const int strideW, const int offsetH,
                                  const int offsetW, real* tgtData,
                                  const int tgtStride) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int pw = index % pooledW;
     int ph = (index / pooledW) % pooledH;
@@ -46,44 +52,70 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
         maxval = inputData[h * width + w];
       }
     }
     int tgtIndex =
         index % (pooledW * pooledH * channels) + frameNum * tgtStride;
     tgtData[tgtIndex] = maxval;
   }
 }
 
 void hl_maxpool_forward(const int frameCnt, const real* inputData,
                         const int channels, const int height, const int width,
                         const int pooledH, const int pooledW,
                         const int sizeX, const int sizeY,
                         const int strideH, const int strideW,
                         const int paddingH, const int paddingW,
                         real* tgtData, const int tgtStride) {
   int num_kernels = pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   dim3 threads(1024, 1);
   dim3 grid(blocks, 1);
   KeMaxPoolForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
       num_kernels, inputData, channels, height, width, pooledH, pooledW,
       sizeX, sizeY, strideH, strideW, paddingH, paddingW, tgtData, tgtStride);
   CHECK_SYNC("hl_maxpool_forward failed");
 }
 
 __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData,
                                   const real* outData, const real* outGrad,
                                   const int channels, const int height,
                                   const int width, const int pooledH,
                                   const int pooledW, const int sizeX,
                                   const int sizeY, const int strideH,
                                   const int strideW, const int padH,
                                   const int padW, real scaleA, real scaleB,
                                   real* targetGrad, const int outStride) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     // find out the local index
     // find out the local offset
@@ -107,43 +139,69 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData,
       }
     }
     targetGrad[index] = scaleB * targetGrad[index] + scaleA * gradient;
   }
 }
 
 void hl_maxpool_backward(const int frameCnt, const real* inputData,
                          const real* outData, const real* outGrad,
                          const int channels, const int height, const int width,
                          const int pooledH, const int pooledW,
                          const int sizeX, const int sizeY,
                          const int strideH, const int strideW,
                          const int paddingH, const int paddingW,
                          real scaleA, real scaleB,
                          real* targetGrad, const int outStride) {
   int num_kernels = height * width * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   KeMaxPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
       num_kernels, inputData, outData, outGrad, channels, height, width,
       pooledH, pooledW, sizeX, sizeY, strideH, strideW, paddingH, paddingW,
       scaleA, scaleB, targetGrad, outStride);
   CHECK_SYNC("hl_maxpool_backward");
 }
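
The pooling kernels above assign one thread per output element and decode the output coordinates from the flat thread index; the hunks show `pw` and `ph` being recovered with modulo arithmetic. Below is a host-side C++ sketch of that decode, with made-up sizes; the channel/frame split shown here is the usual continuation of the same pattern and is an assumption, since the hunk does not show those lines:

```cpp
#include <cstdio>

int main() {
  // Example sizes (made up): 2 frames, 3 channels, 8x8 pooled output.
  int frameCnt = 2, channels = 3, pooledH = 8, pooledW = 8;
  int num_kernels = pooledH * pooledW * channels * frameCnt;  // one thread per output

  int index = 300;  // an example flat thread index (must be < num_kernels)
  if (index < num_kernels) {
    int pw = index % pooledW;                        // as in KeMaxPoolForward above
    int ph = (index / pooledW) % pooledH;            // as in KeMaxPoolForward above
    int c = (index / pooledW / pooledH) % channels;  // assumed continuation
    int frameNum = index / pooledW / pooledH / channels;
    std::printf("index %d -> frame %d, channel %d, (ph, pw) = (%d, %d)\n",
                index, frameNum, c, ph, pw);
  }
  return 0;
}
```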
__global__
void
KeAvgPoolForward
(
const
int
nthreads
,
const
real
*
inputData
,
__global__
void
KeAvgPoolForward
(
const
int
nthreads
,
const
real
*
inputData
,
const
int
channels
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
height
,
const
int
pooledH
,
const
int
pooledW
,
const
int
width
,
const
int
sizeX
,
const
int
sizeY
,
const
int
pooledH
,
const
int
strideH
,
const
int
strideW
,
const
int
pooledW
,
const
int
padH
,
const
int
padW
,
const
int
sizeX
,
real
*
tgtData
,
const
int
tgtStride
)
{
const
int
sizeY
,
const
int
strideH
,
const
int
strideW
,
const
int
padH
,
const
int
padW
,
real
*
tgtData
,
const
int
tgtStride
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
index
<
nthreads
)
{
if
(
index
<
nthreads
)
{
int
pw
=
index
%
pooledW
;
int
pw
=
index
%
pooledW
;
...
@@ -168,39 +226,64 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData,
...
@@ -168,39 +226,64 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData,
aveval
+=
inputData
[
h
*
width
+
w
];
aveval
+=
inputData
[
h
*
width
+
w
];
}
}
}
}
int
tgtIndex
=
index
%
(
pooledW
*
pooledH
*
channels
)
+
int
tgtIndex
=
frameNum
*
tgtStride
;
index
%
(
pooledW
*
pooledH
*
channels
)
+
frameNum
*
tgtStride
;
tgtData
[
tgtIndex
]
=
aveval
/
pool_size
;
tgtData
[
tgtIndex
]
=
aveval
/
pool_size
;
}
}
}
}
void
hl_avgpool_forward
(
const
int
frameCnt
,
const
real
*
inputData
,
void
hl_avgpool_forward
(
const
int
frameCnt
,
const
real
*
inputData
,
const
int
channels
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
height
,
const
int
pooledH
,
const
int
pooledW
,
const
int
width
,
const
int
sizeX
,
const
int
sizeY
,
const
int
pooledH
,
const
int
strideH
,
const
int
strideW
,
const
int
pooledW
,
const
int
paddingH
,
const
int
paddingW
,
const
int
sizeX
,
real
*
tgtData
,
const
int
tgtStride
)
{
const
int
sizeY
,
const
int
strideH
,
const
int
strideW
,
const
int
paddingH
,
const
int
paddingW
,
real
*
tgtData
,
const
int
tgtStride
)
{
int
num_kernels
=
pooledH
*
pooledW
*
channels
*
frameCnt
;
int
num_kernels
=
pooledH
*
pooledW
*
channels
*
frameCnt
;
int
blocks
=
(
num_kernels
+
1024
-
1
)
/
1024
;
int
blocks
=
(
num_kernels
+
1024
-
1
)
/
1024
;
KeAvgPoolForward
<<<
blocks
,
1024
,
0
,
STREAM_DEFAULT
>>>
KeAvgPoolForward
<<<
blocks
,
1024
,
0
,
STREAM_DEFAULT
>>>
(
num_kernels
,
(
num_kernels
,
inputData
,
channels
,
inputData
,
height
,
width
,
pooledH
,
pooledW
,
channels
,
sizeX
,
sizeY
,
strideH
,
strideW
,
height
,
paddingH
,
paddingW
,
tgtData
,
tgtStride
);
width
,
pooledH
,
pooledW
,
sizeX
,
sizeY
,
strideH
,
strideW
,
paddingH
,
paddingW
,
tgtData
,
tgtStride
);
CHECK_SYNC
(
"hl_avgpool_forward failed"
);
CHECK_SYNC
(
"hl_avgpool_forward failed"
);
}
}
__global__
void
KeAvgPoolBackward
(
const
int
nthreads
,
const
real
*
outGrad
,
__global__
void
KeAvgPoolBackward
(
const
int
nthreads
,
const
int
channels
,
const
int
height
,
const
real
*
outGrad
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
width
,
const
int
pooledH
,
const
int
pooledW
,
const
int
pooledH
,
const
int
sizeX
,
const
int
sizeY
,
const
int
pooledW
,
const
int
strideH
,
const
int
strideW
,
const
int
sizeX
,
const
int
padH
,
const
int
padW
,
const
int
sizeY
,
real
scaleA
,
real
scaleB
,
const
int
strideH
,
real
*
tgtGrad
,
const
int
outStride
)
{
const
int
strideW
,
const
int
padH
,
const
int
padW
,
real
scaleA
,
real
scaleB
,
real
*
tgtGrad
,
const
int
outStride
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
index
<
nthreads
)
{
if
(
index
<
nthreads
)
{
int
offsetW
=
index
%
width
+
padW
;
int
offsetW
=
index
%
width
+
padW
;
...
@@ -215,7 +298,6 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
...
@@ -215,7 +298,6 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
real
gradient
=
0
;
real
gradient
=
0
;
outGrad
+=
(
frameNum
*
outStride
+
offsetC
*
pooledH
*
pooledW
);
outGrad
+=
(
frameNum
*
outStride
+
offsetC
*
pooledH
*
pooledW
);
for
(
int
ph
=
phstart
;
ph
<
phend
;
++
ph
)
{
for
(
int
ph
=
phstart
;
ph
<
phend
;
++
ph
)
{
for
(
int
pw
=
pwstart
;
pw
<
pwend
;
++
pw
)
{
for
(
int
pw
=
pwstart
;
pw
<
pwend
;
++
pw
)
{
// figure out the pooling size
// figure out the pooling size
...
@@ -224,32 +306,50 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
...
@@ -224,32 +306,50 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
int
hend
=
min
(
hstart
+
sizeY
,
height
+
padH
);
int
hend
=
min
(
hstart
+
sizeY
,
height
+
padH
);
int
wend
=
min
(
wstart
+
sizeX
,
width
+
padW
);
int
wend
=
min
(
wstart
+
sizeX
,
width
+
padW
);
int
poolsize
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
int
poolsize
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
gradient
+=
outGrad
[
ph
*
pooledW
+
pw
]
/
poolsize
;
gradient
+=
outGrad
[
ph
*
pooledW
+
pw
]
/
poolsize
;
}
}
}
}
tgtGrad
[
index
]
=
scaleB
*
tgtGrad
[
index
]
+
scaleA
*
gradient
;
tgtGrad
[
index
]
=
scaleB
*
tgtGrad
[
index
]
+
scaleA
*
gradient
;
}
}
}
}
void hl_avgpool_backward(const int frameCnt,
                         const real* outGrad,
                         const int channels,
                         const int height,
                         const int width,
                         const int pooledH,
                         const int pooledW,
                         const int sizeX,
                         const int sizeY,
                         const int strideH,
                         const int strideW,
                         const int paddingH,
                         const int paddingW,
                         real scaleA,
                         real scaleB,
                         real* backGrad,
                         const int outStride) {
  int num_kernels = height * width * channels * frameCnt;
  int blocks = (num_kernels + 1024 - 1) / 1024;
  KeAvgPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
      num_kernels, outGrad, channels, height, width, pooledH, pooledW,
      sizeX, sizeY, strideH, strideW, paddingH, paddingW, scaleA, scaleB,
      backGrad, outStride);
  CHECK_SYNC("hl_avgpool_backward failed");
}
...
@@ -266,7 +366,7 @@ __global__ void KeBilinearInterpFw(const real* in,
                                   const size_t numChannels,
                                   const real ratioH,
                                   const real ratioW) {
  int nthreads = outputH * outputW;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < nthreads) {
    int outIdH = tid / outputW;
...
@@ -287,13 +387,14 @@ __global__ void KeBilinearInterpFw(const real* in,
    real w1lambda = ratioW * outImgIdx - inImgIdx;
    real w2lambda = 1.f - w1lambda;

    const real* inPos = &in[outIdH * inputW + channelId * inImgSize +
                            inImgIdy * inImgW + inImgIdx];

    // bilinear interpolation
    out[outIdH * outputW + outIdW] =
        h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) +
        h1lambda * (w2lambda * inPos[hId * inImgW] +
                    w1lambda * inPos[hId * inImgW + wId]);
  }
}
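To make the weight algebra above easier to check, here is a minimal CPU reference for one output pixel under the same definitions; the function and its parameters are illustrative, not part of this file:

// Reference sketch: blend the 2x2 neighbourhood with bilinear weights.
inline real bilinear_ref(const real* in, int inImgW, int y0, int x0,
                         real h1lambda, real w1lambda, int hId, int wId) {
  real h2lambda = 1.f - h1lambda;
  real w2lambda = 1.f - w1lambda;
  const real* p = &in[y0 * inImgW + x0];
  return h2lambda * (w2lambda * p[0] + w1lambda * p[wId]) +
         h1lambda * (w2lambda * p[hId * inImgW] + w1lambda * p[hId * inImgW + wId]);
}
// Note the four coefficients sum to (h1lambda + h2lambda) * (w1lambda + w2lambda) = 1.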
...
@@ -313,9 +414,19 @@ void hl_bilinear_forward(const real* inData,
  int threadNum = outputH * outputW;
  int blocks = (threadNum + 1024 - 1) / 1024;

  KeBilinearInterpFw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
      inData, inImgH, inImgW, inputH, inputW, outData, outImgH, outImgW,
      outputH, outputW, numChannels, ratioH, ratioW);
  CHECK_SYNC("hl_bilinear_forward failed");
}
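A host-side sketch of how the scaling ratios might be prepared before calling this routine; the formulas below are an assumption about the caller, not code from this commit:

// Sketch: map each output coordinate back into the input image.
real ratioH = (outImgH > 1) ? static_cast<real>(inImgH - 1) / (outImgH - 1) : 0.f;
real ratioW = (outImgW > 1) ? static_cast<real>(inImgW - 1) / (outImgW - 1) : 0.f;
hl_bilinear_forward(inData, inImgH, inImgW, inputH, inputW, outData,
                    outImgH, outImgW, outputH, outputW, numChannels,
                    ratioH, ratioW);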
...
@@ -353,13 +464,15 @@ __global__ void KeBilinearInterpBw(real* in,
    real w1lambda = ratioW * outImgIdx - inImgIdx;
    real w2lambda = 1.f - w1lambda;

    real* inPos = &in[outIdH * inputW + channelId * inImgSize +
                      inImgIdy * inImgW + inImgIdx];
    const real* outPos = &out[outIdH * outputW + outIdW];
    paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]);
    paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]);
    paddle::paddleAtomicAdd(&inPos[hId * inImgW], h1lambda * w2lambda * outPos[0]);
    paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId],
                            h1lambda * w1lambda * outPos[0]);
  }
}
...
@@ -379,22 +492,37 @@ void hl_bilinear_backward(real* inGrad,
  int threadNum = outputH * outputW;
  int blocks = (threadNum + 1024 - 1) / 1024;

  KeBilinearInterpBw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
      inGrad, inImgH, inImgW, inputH, inputW, outGrad, outImgH, outImgW,
      outputH, outputW, numChannels, ratioH, ratioW);
  CHECK_SYNC("hl_bilinear_backward failed");
}
__global__ void maxoutFpCompute(size_t nthreads,
                                const real* inData,
                                real* outData,
                                int* idData,
                                size_t size,
                                size_t featLen,
                                size_t groups) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    size_t batch_idx = index / size;
    size_t i = index % size;
    size_t channel_idx = i / featLen;
    size_t feat_idx = i % featLen;
    size_t data_idx =
        (batch_idx * size + channel_idx * featLen) * groups + feat_idx;
    real max = inData[data_idx];
    int maxId = 0;
    for (size_t g = 1; g < groups; ++g) {
...
@@ -409,37 +537,50 @@ __global__ void maxoutFpCompute(size_t nthreads, const real * inData,
  }
}
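The indexing above treats one sample's input as channels blocks, each holding groups contiguous feature maps of length featLen, and keeps only the per-position maximum of each group. A minimal CPU reference for a single output element, under that layout assumption, might look like this (illustrative only):

// Reference sketch for one maxout output element within one sample.
real maxout_ref(const real* in, int* winnerId, size_t channel_idx,
                size_t feat_idx, size_t featLen, size_t groups) {
  size_t base = channel_idx * groups * featLen + feat_idx;
  real best = in[base];
  int bestId = 0;
  for (size_t g = 1; g < groups; ++g) {
    real v = in[base + g * featLen];
    if (v > best) { best = v; bestId = static_cast<int>(g); }
  }
  *winnerId = bestId;  // recorded so the backward pass can route the gradient
  return best;
}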
void hl_maxout_forward(const real* inData,
                       real* outData,
                       int* idData,
                       size_t batchSize,
                       size_t size,
                       size_t featLen,
                       size_t groups) {
  int num_kernels = size * batchSize;
  int blocks = (num_kernels + 1024 - 1) / 1024;
  maxoutFpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
      num_kernels, inData, outData, idData, size, featLen, groups);
  CHECK_SYNC("hl_maxout_forward failed");
}
__global__ void maxoutBpCompute(size_t nthreads,
                                real* inGrad,
                                const real* outGrad,
                                const int* idData,
                                size_t size,
                                size_t featLen,
                                size_t groups) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    size_t batch_idx = index / size;
    size_t i = index % size;
    size_t channel_idx = i / featLen;
    size_t feat_idx = i % featLen;
    size_t newIndex = batch_idx * size;
    size_t gradIdx =
        (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx;
    (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i];
  }
}

void hl_maxout_backward(real* inGrad,
                        const real* outGrad,
                        const int* idData,
                        size_t batchSize,
                        size_t size,
                        size_t featLen,
                        size_t groups) {
  int num_kernels = size * batchSize;
  int blocks = (num_kernels + 1024 - 1) / 1024;
  maxoutBpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
      num_kernels, inGrad, outGrad, idData, size, featLen, groups);
  CHECK_SYNC("hl_maxout_backward failed");
}
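A sketch of how the two routines pair up inside a layer; the sizes below are illustrative assumptions:

// featLen = H * W of one output feature map, size = outChannels * featLen.
hl_maxout_forward(inData, outData, idData, batchSize, size, featLen, groups);
// ... and in the backward pass, route each gradient to the winning input:
hl_maxout_backward(inGrad, outGrad, idData, batchSize, size, featLen, groups);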
paddle/cuda/src/hl_cuda_lstm.cu
...
@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_activation_functions.h"
#include "hl_base.h"
#include "hl_cuda_cublas.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"

typedef hppl::Active<real>::forward t_forward;
typedef hppl::Active<real>::backward t_backward;

bool hl_lstm_sequence_parallel(int frameSize) {
...
@@ -42,9 +41,9 @@ public:
      value_ += (start + length - 1) * frameSize + idx;
    }
  }
  __device__ inline real* getPtr() const { return value_; }
  __device__ inline real getValue() { return *value_; }
  __device__ inline void setValue(real value) { *value_ = value; }
  template <int reversed, int frameSize>
  __device__ inline void nextFrame() {
    if (reversed == 0) {
...
@@ -55,28 +54,25 @@ public:
  }
};

__device__ __forceinline__ void ptx_sync(const int id, const int barriers) {
  asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory");
}

__device__ __forceinline__ void ptx_arrive(const int id, const int barriers) {
  asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory");
}
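These two wrappers expose PTX named barriers: bar.sync makes the calling threads wait at barrier id until the stated number of threads have arrived, while bar.arrive only signals arrival without waiting. A minimal producer/consumer pattern built on them could look like the sketch below; shBuf, produce and consume are placeholders, not names from this file:

// Sketch: frameSize producer threads publish to shared memory and signal
// barrier 1; frameSize consumer threads block on the same barrier first.
if (threadIdx.x < frameSize) {           // producers
  shBuf[threadIdx.x] = produce(threadIdx.x);
  ptx_arrive(1, frameSize * 2);          // barrier count covers both halves
} else {                                 // consumers
  ptx_sync(1, frameSize * 2);
  consume(shBuf[threadIdx.x - frameSize]);
}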
template <int valueSize, int frameSize>
__device__ __forceinline__ real forward_sequence(real value,
                                                 real* shValue,
                                                 real* state,
                                                 real* preOutput,
                                                 real* output,
                                                 real check,
                                                 int index,
                                                 t_forward activeNode,
                                                 t_forward activeGate,
                                                 t_forward activeState) {
  real out;
  real prevOut;
  real state_r;
...
@@ -112,17 +108,20 @@ forward_sequence(real value,
  if (idy == 0) {
    ptx_sync(2, frameSize * 2);
    prevOut = state[idx];
    prevOut = activeState(prevOut);
    preOutput[idx] = prevOut;
    ptx_arrive(3, frameSize * 2);
  }
  return value;
}

#define OUTPUT_BARRIER_ID 10
#define OUTPUT_BARRIER_ID2 11
template <int valueSize,
          int frameSize,
          int reversed,
          int computeThreads,
          int blockSize>
__global__ void KeLstmForward(real* gateValue,
                              real* state,
                              real* output,
...
@@ -184,10 +183,16 @@ __global__ void KeLstmForward(real *gateValue,
    }
  }
  value = forward_sequence<valueSize, frameSize>(
      value, shValue, shState, shPrevOutput, shOutput, check, index,
      hppl::gpu::forward[active_node], hppl::gpu::forward[active_gate],
      hppl::gpu::forward[active_state]);
  const int idx = index % frameSize;
  const int idy = index / frameSize;
  if (valueSize == 128) {
...
@@ -218,7 +223,7 @@ __global__ void KeLstmForward(real *gateValue,
    real B_r[frameSize];
    const int computeIdx = index - valueSize;
    if (i == 0) {
#pragma unroll
      for (int n = 0; n < frameSize; n++) {
        B_r[n] = weight[n * valueSize + computeIdx];
      }
...
@@ -230,7 +235,7 @@ __global__ void KeLstmForward(real *gateValue,
    }
    real sum = 0.0f;
    for (int n = 0; n < frameSize; n++) {
      sum += A_r[n] * B_r[n];
    }
    shValue[computeIdx] = sum;
    ptx_arrive(OUTPUT_BARRIER_ID2, blockSize);
...
@@ -239,14 +244,14 @@ __global__ void KeLstmForward(real *gateValue,
  if (valueSize == 256) {
    real B_r[frameSize];
    if (i == 0) {
#pragma unroll
      for (int n = 0; n < frameSize; n++) {
        B_r[n] = weight[n * valueSize + index];
      }
    }
    real sum = 0.0f;
    for (int n = 0; n < frameSize; n++) {
      sum += shOutput[n] * B_r[n];
    }
    value += sum;
  }
...
@@ -273,50 +278,81 @@ void hl_lstm_parallel_forward(real *gateValue,
  dim3 grid(numSequences, 1);
  if (!reversed) {
    if (frameSize == 32) {
      KeLstmForward<128, 32, 0, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, stateValue, outputValue, preOutputValue, checkIg,
          checkFg, checkOg, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 64) {
      KeLstmForward<256, 64, 0, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, stateValue, outputValue, preOutputValue, checkIg,
          checkFg, checkOg, weight, sequence, active_node, active_gate,
          active_state);
    }
  } else {
    if (frameSize == 32) {
      KeLstmForward<128, 32, 1, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, stateValue, outputValue, preOutputValue, checkIg,
          checkFg, checkOg, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 64) {
      KeLstmForward<256, 64, 1, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, stateValue, outputValue, preOutputValue, checkIg,
          checkFg, checkOg, weight, sequence, active_node, active_gate,
          active_state);
    }
  }
  CHECK_SYNC("hl_lstm_parallel_forward failed");
}
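A condensed view of the dispatch above, reconstructed from the calls themselves: only hidden sizes 32 and 64 are wired up in this path, and the first template argument is always four times the frame size (one slot per LSTM gate).

// frameSize -> KeLstmForward<valueSize, frameSize, reversed, computeThreads, blockSize>
//   32      -> <128, 32, r, 128, 256>, launched with 256 threads per sequence
//   64      -> <256, 64, r, 256, 256>, launched with 256 threads per sequence
// where valueSize = 4 * frameSize and r is 0 (forward) or 1 (reversed).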
__device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
  int addr = idx % 32;
#pragma unroll
  for (int k = 1; k < 32; k++) {
    // rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32);
    addr = __shfl(addr, (idx + 1) % 32, 32);
    a[k] = __shfl(a[k], addr, 32);
  }

#pragma unroll
  for (int tid = 0; tid < 31; tid++) {
    real tmp = (idx > tid) ? a[0] : a[1];
#pragma unroll
    for (int k = 31; k > 0; k--) {
      a[(k + 1) % 32] = (idx > tid) ? a[k] : a[(k + 1) % 32];
    }
...
@@ -324,29 +360,28 @@ void transpose_32x32(real a[], const int idx) {
  }
  addr = (32 - idx) % 32;
#pragma unroll
  for (int k = 0; k < 32; k++) {
    a[k] = __shfl(a[k], addr, 32);
    addr = __shfl(addr, (idx + 31) % 32, 32);
  }
}
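The routine above shuffles the 32 register values held by each lane so that, after its rotation passes, a lane ends up holding a column of the 32x32 tile rather than a row. As a smaller illustration of the same primitive (a sketch of __shfl usage, not the rotation scheme used here):

// Sketch: rotate one register value left by one lane within a warp.
int lane = threadIdx.x % 32;
real v = someValue;                    // someValue is a placeholder
v = __shfl(v, (lane + 1) % 32, 32);    // lane i now holds lane (i+1)'s value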
template <int valueSize, int frameSize>
__device__ void backward_sequence(real rGateValue,
                                  real rOutputGrad,
                                  real rPreOutputValue,
                                  real& rGateGrad,
                                  real& rStateGrad,
                                  real* shStateGrad,
                                  real* shStateValue,
                                  real* shGateValue,
                                  real rCheck,
                                  real& rGateValuePrev,
                                  int index,
                                  t_backward activeNode,
                                  t_backward activeGate,
                                  t_backward activeState) {
  const int frameIdx = index % frameSize;
  const int frameIdy = index / frameSize;
  if (frameIdy == 3) {
...
@@ -363,8 +398,8 @@ backward_sequence(real rGateValue,
    rStateGrad = rGateGrad * rCheck;
    shStateGrad[index] = rStateGrad;
    ptx_sync(3, valueSize);
    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
    rGateGrad = rStateGrad * shGateValue[frameIdx];
    rGateGrad = activeGate(rGateGrad, rGateValue);
  } else if (frameIdy == 2) {
...
@@ -373,7 +408,7 @@ backward_sequence(real rGateValue,
    shStateGrad[index] = rStateGrad;
    ptx_sync(3, valueSize);
    rStateGrad += shStateGrad[frameIdx + frameSize];
    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
    rGateValuePrev = rGateValue;
    rGateGrad = rStateGrad * shStateValue[frameIdx];
    rGateGrad = activeGate(rGateGrad, rGateValue);
...
@@ -381,43 +416,43 @@ backward_sequence(real rGateValue,
    shGateValue[frameIdx] = rGateValue;
    ptx_sync(3, valueSize);
    rStateGrad = shStateGrad[frameIdx + frameSize];
    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
    rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize];
    rGateGrad = activeNode(rGateGrad, rGateValue);
  }
}
template <int valueSize, int frameSize>
__device__ void load_weight(real rWeight[], real* weight, const int index) {
  if (valueSize == 128) {
    weight += index;
#pragma unroll
    for (int n = 0; n < frameSize; n++) {
      rWeight[n] = weight[n * valueSize];
    }
    transpose_32x32(rWeight, index % 32);
  }
  if (valueSize == 256) {
    int id = (index / 32) % 2;
    weight += index - id * 32 + id * 32 * valueSize;
#pragma unroll
    for (int n = 0; n < 32; n++) {
      rWeight[n] = weight[n * valueSize];
      rWeight[n + 32] = weight[n * valueSize + 32];
    }
    transpose_32x32(rWeight, index % 32);
    transpose_32x32(&rWeight[32], index % 32);
  }
}
template <int valueSize, int frameSize, int reversed>
__global__ void KeLstmBackward(real* gateValue,
                               real* gateGrad,
                               real* stateValue,
                               real* stateGrad, /* do not need save */
                               real* preOutputValue,
                               real* preOutputGrad, /* do not need save */
                               real* checkIg,
                               real* checkIgGrad,
                               real* checkFg,
...
@@ -484,20 +519,27 @@ __global__ void KeLstmBackward(real *gateValue,
  for (int i = 0; i < length; ++i) {
    if (frameIdy == 3) {
      if (i != length - 1) {
        frameStateValue.nextFrame<!reversed, frameSize>();
        shStateValue[frameIdx] = frameStateValue.getValue();
      } else {
        shStateValue[frameIdx] = 0.0;
      }
    }
    backward_sequence<valueSize, frameSize>(
        rGateValue, rOutputGrad, rPreOutputValue, rGateGrad, rStateGrad,
        shStateGrad, shStateValue, shGateValue, rCheck, rGateValuePrev, index,
        hppl::gpu::backward[active_node], hppl::gpu::backward[active_gate],
        hppl::gpu::backward[active_state]);
    if (frameIdy == 3) {
      rCheckGrad += rGateGrad * rStateValue;
      rStateValue = shStateValue[frameIdx];
...
@@ -523,9 +565,9 @@ __global__ void KeLstmBackward(real *gateValue,
    shGateGrad[frameIdy][frameIdx] = rGateGrad;
    if (valueSize == 128) {
      real sum = 0.0f;
#pragma unroll
      for (int n = 0; n < frameSize; n++) {
        sum += shGateGrad[frameIdy][n] * B_r[n];
      }
      if (frameIdy == 3) {
        rOutputGrad += sum;
...
@@ -541,7 +583,7 @@ __global__ void KeLstmBackward(real *gateValue,
      }
      real sum = 0.0f;
      for (int n = 0; n < frameSize; n++) {
        sum += A_r[n] * B_r[n];
      }
      if (frameIdy == 3) {
        rOutputGrad += sum;
...
@@ -552,8 +594,8 @@ __global__ void KeLstmBackward(real *gateValue,
    if (frameIdy == 3) {
      ptx_sync(6, valueSize);
#pragma unroll
      for (int i = 0; i < 3; i++) {
        rOutputGrad += shOutputGrad[i][frameIdx];
      }
    } else {
...
@@ -564,11 +606,14 @@ __global__ void KeLstmBackward(real *gateValue,
  /* TODO: Temporary save & merger in another kernel */
  if (frameIdy == 1) {
    if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad);
  } else if (frameIdy == 2) {
    if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad);
  } else if (frameIdy == 3) {
    if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad);
  }
}
...
@@ -593,68 +638,183 @@ void hl_lstm_parallel_backward_data(real *gateValue,
                                    hl_activation_mode_t active_node,
                                    hl_activation_mode_t active_gate,
                                    hl_activation_mode_t active_state) {
  CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 ||
        frameSize == 256);
  dim3 grid(numSequences, 1);
  if (!reversed) {
    if (frameSize == 32) {
      KeLstmBackward<128, 32, 0><<<grid, 128, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 64) {
      KeLstmBackward<256, 64, 0><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 128) {
      KeLstmBackward<512, 128, 0><<<grid, 512, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 256) {
      KeLstmBackward<1024, 256, 0><<<grid, 1024, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    }
  } else {
    if (frameSize == 32) {
      KeLstmBackward<128, 32, 1><<<grid, 128, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 64) {
      KeLstmBackward<256, 64, 1><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 128) {
      KeLstmBackward<512, 128, 1><<<grid, 512, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 256) {
      KeLstmBackward<1024, 256, 1><<<grid, 1024, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    }
  }
  CHECK_SYNC("hl_lstm_parallel_backward_data");
}
template <int B_X, int B_Y>
__global__ void KeSetGradZero(real* gateGrad,
                              const int* starts,
                              int valueSize,
                              int numSequences,
                              bool reversed) {
  // const int tid = threadIdx.x;
  const int frameIdx = blockIdx.x * B_X + threadIdx.x;
...
@@ -682,19 +842,31 @@ void hl_lstm_parallel_backward_weight(real *weightGrad,
  int valueSize = 4 * frameSize;
  dim3 threads(32, 32);
  dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32);
  KeSetGradZero<32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
      gateGrad, sequence, valueSize, numSequences, reversed);

  if (!reversed) {
    hl_matrix_mul(outputValue,
                  HPPL_OP_T,
                  gateGrad + valueSize,
                  HPPL_OP_N,
                  weightGrad,
                  frameSize,
                  valueSize,
                  batchSize - 1,
                  1.0,
                  1.0);
  } else {
    hl_matrix_mul(outputValue + frameSize,
                  HPPL_OP_T,
                  gateGrad,
                  HPPL_OP_N,
                  weightGrad,
                  frameSize,
                  valueSize,
                  batchSize - 1,
                  1.0,
                  1.0);
  }
  CHECK_SYNC("hl_lstm_parallel_backward_weight");
}
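Reading the hl_matrix_mul arguments above in the order they appear, (A, transA, B, transB, C, dimM, dimN, dimK, alpha, beta), the two calls appear to accumulate the weight gradient as a transposed GEMM; this interpretation is inferred from the call sites, not restated from the header:

// weightGrad(frameSize x valueSize) +=
//     outputValue^T(frameSize x (batchSize - 1)) * gateGrad((batchSize - 1) x valueSize)
// Non-reversed sequences shift the gate gradients by one step (gateGrad + valueSize);
// reversed sequences shift the outputs instead (outputValue + frameSize).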
paddle/cuda/src/hl_cuda_matrix.cu
...
@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
#include "hl_matrix.h"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_ops.cuh"
#include "hl_sequence.h"
#include "hl_sparse.ph"
#include "paddle/utils/Logging.h"

DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);

void hl_matrix_add(real* A_d,
                   real* B_d,
                   real* C_d,
                   int dimM,
                   int dimN,
                   real alpha,
...
@@ -36,33 +35,32 @@ void hl_matrix_add(real *A_d,
  CHECK_NOTNULL(B_d);
  CHECK_NOTNULL(C_d);

  hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
      ternary::_add<real>(alpha, beta),
      A_d,
      B_d,
      C_d,
      dimM,
      dimN,
      dimN,
      dimN,
      dimN);
  CHECK_SYNC("hl_matrix_add failed");
}

#ifdef PADDLE_TYPE_DOUBLE
#define THRESHOLD 128
#else
#define THRESHOLD 64
#endif
__device__ __forceinline__ void findMax(real* I,
                                        real* dfMax_s,
                                        int blockSize,
                                        int base,
                                        int curIdx,
                                        int nextIdx,
                                        int dimN,
                                        real* max) {
  dfMax_s[base] = -1.0e20;
  while (curIdx < dimN) {
    if (dfMax_s[base] < I[nextIdx]) {
...
@@ -78,25 +76,24 @@ void findMax(real* I,
    if (base < stride) {
      nextIdx = base + stride;
      if (dfMax_s[base] < dfMax_s[nextIdx]) {
        dfMax_s[base] = dfMax_s[nextIdx];
      }
    }
  }
  if (0 == base) {
    max[0] = dfMax_s[0];
  }
  __syncthreads();
}

__device__ __forceinline__ void subMaxAndExp(real* I,
                                             real* O,
                                             int curIdx,
                                             int nextIdx,
                                             int blockSize,
                                             int dimN,
                                             real max) {
  real val;
  while (curIdx < dimN) {
    val = I[nextIdx] - max;
...
@@ -115,14 +112,13 @@ void subMaxAndExp(real* I,
  __syncthreads();
}

__device__ __forceinline__ void valueSum(real* O,
                                         real* dfMax_s,
                                         int blockSize,
                                         int base,
                                         int curIdx,
                                         int nextIdx,
                                         int dimN) {
  dfMax_s[base] = 0;
  while (curIdx < dimN) {
    dfMax_s[base] += O[nextIdx];
...
@@ -141,13 +137,8 @@ void valueSum(real* O,
  __syncthreads();
}

__device__ __forceinline__ void divSum(
    real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
  while (curIdx < dimN) {
    O[nextIdx] /= sum;
    nextIdx += blockSize;
...
@@ -155,20 +146,18 @@ void divSum(real* O,
  }
}

__device__ __forceinline__ void softmax(real* I,
                                        real* O,
                                        real* dfMax_s,
                                        int blockSize,
                                        int base,
                                        int curIdx,
                                        int nextIdx,
                                        int dimN) {
  __shared__ real max;

  // find the max number
  findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);

  // sub max Value and do Exp operation
  subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
...
@@ -181,8 +170,8 @@ void softmax(real* I,
  divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
}

template <int blockSize>
__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
  int base = threadIdx.x;
  __shared__ real dfMax_s[blockSize];
  int nextIdx = blockIdx.x * dimN + base;
...
@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}

void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);

  dim3 block(512, 1);
  dim3 grid(dimM, 1);
  KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
  CHECK_SYNC("hl_matrix_softmax failed");
}
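A minimal sketch of how this row-wise softmax is driven: each of the dimM rows of length dimN is normalized independently by one 512-thread block, so a caller only passes the raw activations and a destination buffer of the same shape (buffer names below are illustrative):

// Illustrative call: C_d[i][:] = softmax(A_d[i][:]) for every row i < dimM.
hl_matrix_softmax(A_d, C_d, dimM, dimN);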
template <int blockSize>
__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
  int base = threadIdx.x;
  int bid = blockIdx.x;
  __shared__ real dfMax_s[blockSize];
...
@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}

void hl_sequence_softmax_forward(real* A_d,
                                 real* C_d,
                                 const int* index,
                                 int numSequence) {
  CHECK_NOTNULL(A_d);
...
@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d,
  dim3 block(512, 1);
  dim3 grid(numSequence, 1);
  KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
  CHECK_SYNC("hl_sequence_softmax_forward failed");
}
__global__ void KeMatrixDerivative(
    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
  int index;

  if (rowIdx < dimM && colIdx < dimN) {
    index = rowIdx * dimN + colIdx;
    grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
  }
}

void hl_matrix_softmax_derivative(
    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
  CHECK_NOTNULL(grad_d);
  CHECK_NOTNULL(output_d);
  CHECK_NOTNULL(sftmaxSum_d);

  int blocksX = (dimM + 0) / 1;
  int blocksY = (dimN + 1024 - 1) / 1024;
  dim3 threads(1, 1024);
  dim3 grid(blocksX, blocksY);

  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_d, output_d, sftmaxSum_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_softmax_derivative failed");
}
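The kernel above is the usual softmax backward rule: with y = softmax(x) and incoming gradient g, dL/dx_j = y_j * (g_j - sum_k g_k * y_k). Judging by the argument name, sftmaxSum_d is presumably the per-row dot product sum_k g_k * y_k precomputed by the caller; that reading is an inference from this diff, not a statement from the header. In formula form:

// Per row i:   grad_d[i][j] = output_d[i][j] * (grad_d[i][j] - sftmaxSum_d[i])
// assuming     sftmaxSum_d[i] = sum_j grad_in[i][j] * output_d[i][j]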
__global__ void KeMatrixMultiBinaryCrossEntropy(
    real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < dimM) {
    for (int i = 0; i < dimN; i++) {
      entropy[index] -= log(1 - output[index * dimN + i]);
    }
    int* row_col = col + row[index];
    int col_num = row[index + 1] - row[index];
    for (int i = 0; i < col_num; i++) {
      real o = output[index * dimN + row_col[i]];
      entropy[index] -= log(o / (1 - o));
    }
...
@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output,
  dim3 threads(n_threads);
  dim3 grid(blocks);
  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
  KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
}

__global__ void KeMatrixMultiBinaryCrossEntropyBp(
    real* output, real* grad, int* row, int* col, int dimM, int dimN) {
  int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (row_idx < dimM) {
    for (int i = 0; i < dimN; i++) {
      int index = row_idx * dimN + i;
      grad[index] += 1.0 / (1 - output[index]);
    }
    int col_num = row[row_idx + 1] - row[row_idx];
    int* row_col = col + row[row_idx];
    for (int i = 0; i < col_num; i++) {
      int index = row_idx * dimN + row_col[i];
      grad[index] -= 1.0 / (output[index] * (1 - output[index]));
    }
  }
}

void hl_matrix_multi_binary_cross_entropy_bp(
    real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
  CHECK_NOTNULL(output);
  CHECK_NOTNULL(grad);
  CHECK_NOTNULL(csr_mat);
...
@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output,
  dim3 threads(n_threads);
  dim3 grid(blocks);
  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
  KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
}
__global__ void KeMatrixCrossEntropy(
    real* O, real* E, int* label, int dimM, int dimN) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int newBase;
  if (index < dimM) {
...
@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O,
  }
}

void hl_matrix_cross_entropy(
    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);

  int blocks = (dimM + 1024 - 1) / 1024;
  dim3 threads(1024, 1);
  dim3 grid(blocks, 1);
  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
      A_d, C_d, label_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_cross_entropy failed");
}

__global__ void KeMatrixCrossEntropyBp(
    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
  int index;
  if (rowIdx < dimM && colIdx < dimN) {
    index = rowIdx * dimN + colIdx;
    if (label_d[rowIdx] == colIdx) {
      grad_d[index] -= 1.0f / output_d[index];
    }
  }
}

void hl_matrix_cross_entropy_bp(
    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
  CHECK_NOTNULL(grad_d);
  CHECK_NOTNULL(output_d);
  CHECK_NOTNULL(label_d);

  int blocksX = (dimM + 0) / 1;
  int blocksY = (dimN + 1024 - 1) / 1024;
  dim3 threads(1, 1024);
  dim3 grid(blocksX, blocksY);
  KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_d, output_d, label_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
}
void hl_matrix_zero_mem(real* data, int num) {
  hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
}
__global__ void KeParamReluForward(real* output,
...
@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output,
  int ty = blockIdx.y * blockDim.y + threadIdx.y;
  if (tx < width && ty < height) {
    int index = ty * width + tx;
    output[index] =
        input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
  }
}
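For reference, the element-wise rule implemented above is the parametric ReLU, with one learned slope shared by each group of partial_sum consecutive columns; restated compactly from the kernel body:

// output[index] = input[index]                          if input[index] > 0
//               = w[tx / partial_sum] * input[index]    otherwise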
...
@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output,
  CHECK_NOTNULL(w);
  dim3 threads(16, 16);
  int blockX = (width + 16 - 1) / 16;
  int blockY = (height + 16 - 1) / 16;
  dim3 grid(blockX, blockY);
  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, input, w, width, height, partial_sum);
  CHECK_SYNC("hl_param_relu_forward failed");
}
template <int blockSize>
__global__ void KeParamReluBackWardW(real* grad_w,
                                     real* grad_o,
                                     real* input,
...
@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w,
  int grid_num = width / partial_sum;
  dim3 threads(blockSize, 1);
  dim3 grid(grid_num, 1);
  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_w, grad_o, input, width, height, partial_sum);
  CHECK_SYNC("hl_param_relu_backward_w failed");
}
...
@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o,
  CHECK_NOTNULL(diff);
  dim3 threads(16, 16);
  int blockX = (width + 16 - 1) / 16;
  int blockY = (height + 16 - 1) / 16;
  dim3 grid(blockX, blockY);
  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_o, data, w, diff, width, height, partial_sum);
  CHECK_SYNC("hl_param_relu_backward_diff failed");
}
__global__ void KeMatrixAddSharedBias(real* A,
                                      real* B,
                                      const int channel,
                                      const int M,
                                      const int N,
                                      real scale) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int dim = N / channel;
  if (index < M * N) {
...
@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d,
                               real scale) {
  const int blocks = 512;
  const int grids = DIVUP(dimM * dimN, blocks);
  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
      A_d, B_d, channel, dimM, dimN, scale);
  CHECK_SYNC("hl_matrix_add_shared_bias failed");
}
template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real* B,
                                          real* A,
                                          const int channel,
                                          const int M,
                                          const int N,
...
@@ -589,7 +541,7 @@ __global__ void KeMatrixCollectSharedBias(real *B,
      int n = j * blockSize + tid;
      int m = n / dim;
      int w = n % dim;
      smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
      __syncthreads();
      simpleReduce(smem, tid, blockSize);
      sum += smem[0];
...
@@ -611,33 +563,32 @@ void hl_matrix_collect_shared_bias(real* B_d,
  const int limit = 64;
  int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;

  KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
      B_d, A_d, channel, dimM, dimN, dim, limit, scale);
  CHECK_SYNC("hl_matrix_collect_shared_bias failed");
}
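The two shared-bias routines above treat each N-wide row as `channel` blocks of dim = N / channel columns, with one bias value per channel. A scalar reference of the add direction, given here only as a sketch with assumed names (not part of this commit), is:

// Sketch (not in the commit): scalar equivalent of KeMatrixAddSharedBias.
void add_shared_bias_cpu(real* A, const real* B, int channel, int M, int N, real scale) {
  int dim = N / channel;
  for (int index = 0; index < M * N; ++index) {
    // one bias entry per dim-wide channel block of each row
    A[index] += scale * B[(index % N) / dim];
  }
}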
__global__ void keMatrixRotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < dimM * dimN) {
    int i = idx / dimN;
    int j = idx % dimN;
    if (clockWise) {
      matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
    } else {
      matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
    }
  }
}
void hl_matrix_rotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
  CHECK_NOTNULL(mat);
  CHECK_NOTNULL(matRot);
  const int threads = 512;
  const int blocks = DIVUP(dimM * dimN, threads);
  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
      mat, matRot, dimM, dimN, clockWise);
  CHECK_SYNC("hl_matrix_rotate failed");
}
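The kernel reads the source as a row-major dimM x dimN matrix and writes a dimN x dimM result rotated by 90 degrees. A CPU reference of the same index mapping, included only as an illustrative sketch (loop structure and names assumed, not part of this commit), would be:

// Sketch (not in the commit): CPU reference of the rotation index mapping.
void rotate_cpu(const real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
  for (int i = 0; i < dimM; ++i) {
    for (int j = 0; j < dimN; ++j) {
      if (clockWise) {
        // clockwise: source row i becomes destination column dimM - 1 - i
        matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
      } else {
        // counter-clockwise rotation
        matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
      }
    }
  }
}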
paddle/cuda/src/hl_cuda_sequence.cu  View file @ 59a8ebc6
...
@@ -16,36 +16,36 @@ limitations under the License. */
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"
__global__ void KeMaxSequenceForward(real* input,
                                     const int* sequence,
                                     real* output,
                                     int* index,
                                     int numSequences,
                                     int dim) {
  int dimIdx = threadIdx.x;
  int sequenceId = blockIdx.x;
  if (sequenceId >= numSequences) return;
  int start = sequence[sequenceId];
  int end = sequence[sequenceId + 1];

  for (int i = dimIdx; i < dim; i += blockDim.x) {
    real tmp = -HL_FLOAT_MAX;
    int tmpId = -1;
    for (int insId = start; insId < end; insId++) {
      if (tmp < input[insId * dim + i]) {
        tmp = input[insId * dim + i];
        tmpId = insId;
      }
    }
    output[sequenceId * dim + i] = tmp;
    index[sequenceId * dim + i] = tmpId;
  }
}
void hl_max_sequence_forward(real* input,
                             const int* sequence,
                             real* output,
                             int* index,
                             int numSequences,
                             int dim) {
  CHECK_NOTNULL(input);
...
@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
  dim3 threads(256, 1);
  dim3 grid(numSequences, 1);
  KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
      input, sequence, output, index, numSequences, dim);
  CHECK_SYNC("hl_max_sequence_forward failed");
}
__global__ void KeMaxSequenceBackward(
    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  int colIdx = idx % dim;
  if (idx < numSequences * dim) {
    int insId = index[idx];
    inputGrad[insId * dim + colIdx] += outputGrad[idx];
  }
}
void hl_max_sequence_backward(
    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
  CHECK_NOTNULL(outputGrad);
  CHECK_NOTNULL(index);
  CHECK_NOTNULL(inputGrad);
...
@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
  unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
  dim3 threads(128, 1);
  dim3 grid(blocks, 1);
  KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
      outputGrad, index, inputGrad, numSequences, dim);
  CHECK_SYNC("hl_max_sequence_backward failed");
}
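In both max-sequence kernels `sequence` holds start offsets, so sequence[k + 1] - sequence[k] is the length of sequence k; the forward pass records, per column, which row produced the maximum so that the backward pass can route the gradient back to it. For example (values assumed purely for illustration, not from the commit):

// Illustrative only: three sequences of lengths 2, 3 and 1 packed into a
// 6-row input matrix; the offsets array has numSequences + 1 entries.
int numSequences = 3;
int sequence_h[] = {0, 2, 5, 6};  // sequence k occupies rows [sequence_h[k], sequence_h[k+1])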
template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
                                real* table,
                                int* ids,
...
@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
  while (sampleId < numSamples) {
    int tableId = ids[sampleId];
    if ((0 <= tableId) && (tableId < tableSize)) {
      real* outputData = output + sampleId * dim;
      real* tableData = table + tableId * dim;
      for (int i = idx; i < dim; i += blockDimX) {
        if (AddRow == 0) {
          outputData[i] += tableData[i];
...
@@ -114,24 +108,27 @@ __global__ void KeMatrixAddRows(real* output,
        }
      }
    }
    sampleId += blockDimY * gridDimX;
  }
}
template <int blockDimX,
          int blockDimY,
          int gridDimX,
          bool seq2batch,
          bool isAdd>
__global__ void KeSequence2Batch(real* batch,
                                 real* sequence,
                                 const int* batchIndex,
                                 int seqWidth,
                                 int batchCount) {
  int idx = threadIdx.x;
  int idy = threadIdx.y;
  int id = blockIdx.x + idy * gridDimX;
  while (id < batchCount) {
    int seqId = batchIndex[id];
    real* batchData = batch + id * seqWidth;
    real* seqData = sequence + seqId * seqWidth;
    for (int i = idx; i < seqWidth; i += blockDimX) {
      if (seq2batch) {
        if (isAdd) {
...
@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
        }
      }
    }
    id += blockDimY * gridDimX;
  }
}
void hl_sequence2batch_copy(real* batch,
                            real* sequence,
                            const int* batchIndex,
                            int seqWidth,
                            int batchCount,
                            bool seq2batch) {
...
@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  if (seq2batch) {
    KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  } else {
    KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  }
  CHECK_SYNC("hl_sequence2batch_copy failed");
}
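For readers following the template flags: seq2batch selects the copy direction and isAdd selects copy versus accumulate. A CPU reference of the gather performed in the sequence-to-batch copy direction, given only as a sketch with assumed names (not part of this commit), is:

// Sketch (not in the commit): row id of `batch` is copied from row
// batchIndex[id] of `sequence` (the seq2batch = 1, isAdd = 0 case).
void sequence2batch_copy_cpu(real* batch,
                             const real* sequence,
                             const int* batchIndex,
                             int seqWidth,
                             int batchCount) {
  for (int id = 0; id < batchCount; ++id) {
    const real* src = sequence + batchIndex[id] * seqWidth;
    real* dst = batch + id * seqWidth;
    for (int i = 0; i < seqWidth; ++i) dst[i] = src[i];
  }
}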
void hl_sequence2batch_add(real* batch,
                           real* sequence,
                           int* batchIndex,
                           int seqWidth,
                           int batchCount,
                           bool seq2batch) {
...
@@ -186,23 +183,22 @@ void hl_sequence2batch_add(real *batch,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  if (seq2batch) {
    KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  } else {
    KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  }
  CHECK_SYNC("hl_sequence2batch_add failed");
}
template <bool normByTimes, bool seq2batch>
__global__ void KeSequence2BatchPadding(real* batch,
                                        real* sequence,
                                        const int* sequenceStartPositions,
                                        const size_t sequenceWidth,
                                        const size_t maxSequenceLength,
                                        const size_t numSequences) {
  int batchIdx = blockIdx.y;
  int sequenceStart = sequenceStartPositions[batchIdx];
  int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;
...
@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch,
  if (seq2batch) {
    /* sequence -> batch */
    if (normByTimes) {
      KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    } else {
      KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    }
  } else {
    /* batch -> sequence */
    if (normByTimes) {
      KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    } else {
      KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    }
  }
  CHECK_SYNC("hl_sequence2batch_copy_padding failed");
}
__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }

__device__ inline double my_rsqrt(double x) { return rsqrt(x); }
__global__ void KeSequenceAvgForward(real* dst,
                                     real* src,
...
@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
    for (int i = start; i < end; i++) {
      sum += src[i * width + col];
    }
    sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
                                       : sum * my_rsqrt((real)seqLength));
    dst[gid] += sum;
  }
}
...
@@ -347,10 +355,10 @@ void hl_sequence_avg_forward(real* dst,
  int grid = DIVUP(width * height, 512);

  CHECK(mode == 0 || mode == 1 || mode == 2)
      << "mode error in hl_sequence_avg_forward!";

  KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
      dst, src, starts, height, width, mode);
  CHECK_SYNC("hl_sequence_avg_forward failed");
}
...
@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
    int seqLength = end - start;
    if (seqLength == 0) return;
    real grad = src[gid];
    grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
                                         : grad * my_rsqrt((real)seqLength));
    for (int i = start; i < end; i++) {
      dst[i * width + col] += grad;
    }
...
@@ -392,9 +400,9 @@ void hl_sequence_avg_backward(real* dst,
  int grid = DIVUP(width * height, 512);

  CHECK(mode == 0 || mode == 1 || mode == 2)
      << "mode error in hl_sequence_avg_backward!";

  KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
      dst, src, starts, height, width, mode);
  CHECK_SYNC("hl_sequence_avg_backward failed");
}
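For clarity, the three `mode` values used by the average-pooling pair above select a plain sum (mode 1), a mean (mode 0) and a sum scaled by 1/sqrt(length) (mode 2). A scalar reference of the forward reduction for one column, given only as a sketch with assumed names (not part of this commit), is:

#include <cmath>
// Sketch (not in the commit): scalar equivalent of one column of KeSequenceAvgForward.
real seq_avg_column(const real* col, int seqLength, int mode) {
  real sum = 0;
  for (int i = 0; i < seqLength; ++i) sum += col[i];
  if (mode == 1) return sum;                 // sum
  if (mode == 0) return sum / seqLength;     // average
  return sum / std::sqrt((real)seqLength);   // mode == 2: sqrt-normalised sum
}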
paddle/cuda/src/hl_cuda_sparse.cu  View file @ 59a8ebc6
...
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_cuda.h"
#include "hl_cuda_sparse.cuh"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_ops.cuh"
#include "hl_sparse.h"
#include "hl_sparse.ph"
#include "paddle/utils/Logging.h"

DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p);
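The mul_scalar parameterised unary op defined here is what this file later uses (in _beta_mul_c) to realise C = beta * C before accumulating a sparse product. Its elementwise effect, written out as a sketch with assumed names (not part of this commit), is simply:

// Sketch (not in the commit): elementwise effect of unary::mul_scalar<real>(p)
// applied to an m x n buffer with leading dimension lda.
void mul_scalar_cpu(real* a, int m, int n, int lda, real p) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      a[i * lda + j] = a[i * lda + j] * p;  // a = a * p
    }
  }
}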
...
@@ -34,15 +33,15 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
...
@@ -34,15 +33,15 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
CHECK
(
A_d
->
format
==
HL_SPARSE_CSR
)
<<
"matrix format error!"
;
CHECK
(
A_d
->
format
==
HL_SPARSE_CSR
)
<<
"matrix format error!"
;
if
(
A_d
->
nnz
==
0
)
{
if
(
A_d
->
nnz
==
0
)
{
hl_gpu_apply_unary_op
(
hl_gpu_apply_unary_op
(
unary
::
Zero
<
real
>
(),
C_d
,
dimM
,
dimN
,
dimN
);
unary
::
Zero
<
real
>
(),
C_d
,
dimM
,
dimN
,
dimN
);
return
;
return
;
}
}
/* nnz != 0 */
/* nnz != 0 */
hl_csr_matrix
A_d2
=
(
hl_csr_matrix
)(
A_d
->
matrix
);
hl_csr_matrix
A_d2
=
(
hl_csr_matrix
)(
A_d
->
matrix
);
CHECK
((
A_d2
->
csr_val
||
A_d
->
type
==
HL_NO_VALUE
)
&&
CHECK
((
A_d2
->
csr_val
||
A_d
->
type
==
HL_NO_VALUE
)
&&
A_d2
->
csr_row
&&
A_d2
->
csr_row
&&
A_d2
->
csr_col
)
<<
"parameter transa error!"
;
A_d2
->
csr_col
)
<<
"parameter transa error!"
;
int
blocksX
=
(
dimN
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
int
blocksX
=
(
dimN
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
int
blocksY
=
(
dimM
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
int
blocksY
=
(
dimM
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
...
@@ -50,21 +49,11 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
...
@@ -50,21 +49,11 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
dim3
grid
(
blocksX
,
blocksY
);
dim3
grid
(
blocksX
,
blocksY
);
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
KeSMatrixCsr2Dense
<
0
>
KeSMatrixCsr2Dense
<
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
A_d2
->
csr_val
,
A_d2
->
csr_val
,
A_d2
->
csr_row
,
A_d2
->
csr_col
,
C_d
,
dimM
,
dimN
);
A_d2
->
csr_row
,
A_d2
->
csr_col
,
C_d
,
dimM
,
dimN
);
}
else
if
(
A_d
->
type
==
HL_FLOAT_VALUE
)
{
}
else
if
(
A_d
->
type
==
HL_FLOAT_VALUE
)
{
KeSMatrixCsr2Dense
<
1
>
KeSMatrixCsr2Dense
<
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
A_d2
->
csr_val
,
A_d2
->
csr_val
,
A_d2
->
csr_row
,
A_d2
->
csr_col
,
C_d
,
dimM
,
dimN
);
A_d2
->
csr_row
,
A_d2
->
csr_col
,
C_d
,
dimM
,
dimN
);
}
else
{
}
else
{
}
}
CHECK_SYNC
(
"hl_matrix_csr2dense failed"
);
CHECK_SYNC
(
"hl_matrix_csr2dense failed"
);
...
@@ -80,15 +69,15 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
...
@@ -80,15 +69,15 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
CHECK
(
A_d
->
format
==
HL_SPARSE_CSC
)
<<
"matrix format error!"
;
CHECK
(
A_d
->
format
==
HL_SPARSE_CSC
)
<<
"matrix format error!"
;
if
(
A_d
->
nnz
==
0
)
{
if
(
A_d
->
nnz
==
0
)
{
hl_gpu_apply_unary_op
(
hl_gpu_apply_unary_op
(
unary
::
Zero
<
real
>
(),
C_d
,
dimM
,
dimN
,
dimN
);
unary
::
Zero
<
real
>
(),
C_d
,
dimM
,
dimN
,
dimN
);
return
;
return
;
}
}
/* nnz != 0 */
/* nnz != 0 */
hl_csc_matrix
A_d2
=
(
hl_csc_matrix
)(
A_d
->
matrix
);
hl_csc_matrix
A_d2
=
(
hl_csc_matrix
)(
A_d
->
matrix
);
CHECK
((
A_d2
->
csc_val
||
A_d
->
type
==
HL_NO_VALUE
)
&&
CHECK
((
A_d2
->
csc_val
||
A_d
->
type
==
HL_NO_VALUE
)
&&
A_d2
->
csc_row
&&
A_d2
->
csc_row
&&
A_d2
->
csc_col
)
<<
"parameter transa error!"
;
A_d2
->
csc_col
)
<<
"parameter transa error!"
;
int
blocksX
=
(
dimN
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
int
blocksX
=
(
dimN
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
int
blocksY
=
(
dimM
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
int
blocksY
=
(
dimM
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
...
@@ -96,21 +85,11 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
...
@@ -96,21 +85,11 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
dim3
grid
(
blocksX
,
blocksY
);
dim3
grid
(
blocksX
,
blocksY
);
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
KeSMatrixCsc2Dense
<
0
>
KeSMatrixCsc2Dense
<
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
A_d2
->
csc_val
,
A_d2
->
csc_val
,
A_d2
->
csc_row
,
A_d2
->
csc_col
,
C_d
,
dimM
,
dimN
);
A_d2
->
csc_row
,
A_d2
->
csc_col
,
C_d
,
dimM
,
dimN
);
}
else
if
(
A_d
->
type
==
HL_FLOAT_VALUE
)
{
}
else
if
(
A_d
->
type
==
HL_FLOAT_VALUE
)
{
KeSMatrixCsc2Dense
<
1
>
KeSMatrixCsc2Dense
<
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
A_d2
->
csc_val
,
A_d2
->
csc_val
,
A_d2
->
csc_row
,
A_d2
->
csc_col
,
C_d
,
dimM
,
dimN
);
A_d2
->
csc_row
,
A_d2
->
csc_col
,
C_d
,
dimM
,
dimN
);
}
else
{
}
else
{
}
}
CHECK_SYNC
(
"hl_matrix_csc2dense failed"
);
CHECK_SYNC
(
"hl_matrix_csc2dense failed"
);
...
@@ -118,43 +97,43 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
...
@@ -118,43 +97,43 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
void
hl_malloc_sparse_matrix
(
hl_sparse_matrix_s
*
A_d
,
void
hl_malloc_sparse_matrix
(
hl_sparse_matrix_s
*
A_d
,
hl_matrix_format_t
format
,
hl_matrix_format_t
format
,
hl_matrix_value_t
value_type
,
hl_matrix_value_t
value_type
,
int
dimM
,
int
dimM
,
int
dimN
,
int
dimN
,
int
nnz
)
{
int
nnz
)
{
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
A_d
);
CHECK
(
format
==
HL_SPARSE_CSR
||
format
==
HL_SPARSE_CSC
)
CHECK
(
format
==
HL_SPARSE_CSR
||
format
==
HL_SPARSE_CSC
)
<<
"sparse matrix format error!"
;
<<
"sparse matrix format error!"
;
CHECK
(
value_type
==
HL_FLOAT_VALUE
||
value_type
==
HL_NO_VALUE
)
CHECK
(
value_type
==
HL_FLOAT_VALUE
||
value_type
==
HL_NO_VALUE
)
<<
"sparse matrix value type error!"
;
<<
"sparse matrix value type error!"
;
/* avoid malloc 0 bytes */
/* avoid malloc 0 bytes */
int
nnz_s
=
(
nnz
==
0
?
1
:
nnz
);
int
nnz_s
=
(
nnz
==
0
?
1
:
nnz
);
if
(
format
==
HL_SPARSE_CSR
)
{
if
(
format
==
HL_SPARSE_CSR
)
{
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
char
*
tmp
=
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
char
*
tmp
=
+
sizeof
(
_hl_csr_matrix
));
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
+
sizeof
(
_hl_csr_matrix
));
CHECK_NOTNULL
(
tmp
);
CHECK_NOTNULL
(
tmp
);
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
csr
->
sparsity
=
-
1.0
;
csr
->
sparsity
=
-
1.0
;
if
(
value_type
==
HL_NO_VALUE
)
{
if
(
value_type
==
HL_NO_VALUE
)
{
csr
->
csr_val
=
NULL
;
csr
->
csr_val
=
NULL
;
csr
->
nnz_s
=
nnz_s
;
csr
->
nnz_s
=
nnz_s
;
csr
->
row_s
=
dimM
+
1
;
csr
->
row_s
=
dimM
+
1
;
csr
->
csr_row
=
(
int
*
)
hl_malloc_device
((
dimM
+
1
)
*
sizeof
(
int
));
csr
->
csr_row
=
(
int
*
)
hl_malloc_device
((
dimM
+
1
)
*
sizeof
(
int
));
csr
->
csr_col
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
csr
->
csr_col
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
}
else
if
(
value_type
==
HL_FLOAT_VALUE
)
{
}
else
if
(
value_type
==
HL_FLOAT_VALUE
)
{
csr
->
nnz_s
=
nnz_s
;
csr
->
nnz_s
=
nnz_s
;
csr
->
row_s
=
dimM
+
1
;
csr
->
row_s
=
dimM
+
1
;
csr
->
csr_val
=
(
real
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
real
));
csr
->
csr_val
=
(
real
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
real
));
csr
->
csr_row
=
(
int
*
)
hl_malloc_device
((
dimM
+
1
)
*
sizeof
(
int
));
csr
->
csr_row
=
(
int
*
)
hl_malloc_device
((
dimM
+
1
)
*
sizeof
(
int
));
csr
->
csr_col
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
csr
->
csr_col
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
...
@@ -162,28 +141,28 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
...
@@ -162,28 +141,28 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
}
else
if
(
format
==
HL_SPARSE_CSC
)
{
}
else
if
(
format
==
HL_SPARSE_CSC
)
{
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
char
*
tmp
=
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
char
*
tmp
=
+
sizeof
(
_hl_csc_matrix
));
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
+
sizeof
(
_hl_csc_matrix
));
CHECK_NOTNULL
(
tmp
);
CHECK_NOTNULL
(
tmp
);
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
csc
->
sparsity
=
-
1.0
f
;
csc
->
sparsity
=
-
1.0
f
;
if
(
value_type
==
HL_NO_VALUE
)
{
if
(
value_type
==
HL_NO_VALUE
)
{
csc
->
csc_val
=
NULL
;
csc
->
csc_val
=
NULL
;
csc
->
nnz_s
=
nnz_s
;
csc
->
nnz_s
=
nnz_s
;
csc
->
col_s
=
dimN
+
1
;
csc
->
col_s
=
dimN
+
1
;
csc
->
csc_row
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
csc
->
csc_row
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
csc
->
csc_col
=
(
int
*
)
hl_malloc_device
((
dimN
+
1
)
*
sizeof
(
int
));
csc
->
csc_col
=
(
int
*
)
hl_malloc_device
((
dimN
+
1
)
*
sizeof
(
int
));
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csc
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csc
;
}
else
if
(
value_type
==
HL_FLOAT_VALUE
)
{
}
else
if
(
value_type
==
HL_FLOAT_VALUE
)
{
csc
->
nnz_s
=
nnz_s
;
csc
->
nnz_s
=
nnz_s
;
csc
->
col_s
=
dimN
+
1
;
csc
->
col_s
=
dimN
+
1
;
csc
->
csc_val
=
(
real
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
real
));
csc
->
csc_val
=
(
real
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
real
));
csc
->
csc_row
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
csc
->
csc_row
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
csc
->
csc_col
=
(
int
*
)
hl_malloc_device
((
dimN
+
1
)
*
sizeof
(
int
));
csc
->
csc_col
=
(
int
*
)
hl_malloc_device
((
dimN
+
1
)
*
sizeof
(
int
));
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csc
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csc
;
...
@@ -200,7 +179,7 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
...
@@ -200,7 +179,7 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
void
hl_free_sparse_matrix
(
hl_sparse_matrix_s
A_d
)
{
void
hl_free_sparse_matrix
(
hl_sparse_matrix_s
A_d
)
{
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
A_d
);
CHECK
(
A_d
->
format
==
HL_SPARSE_CSR
||
A_d
->
format
==
HL_SPARSE_CSC
)
CHECK
(
A_d
->
format
==
HL_SPARSE_CSR
||
A_d
->
format
==
HL_SPARSE_CSC
)
<<
"sparse matrix format error!"
;
<<
"sparse matrix format error!"
;
if
(
A_d
->
matrix
==
NULL
)
{
if
(
A_d
->
matrix
==
NULL
)
{
free
(
A_d
);
free
(
A_d
);
...
@@ -249,77 +228,77 @@ void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
...
@@ -249,77 +228,77 @@ void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
}
}
void
hl_construct_sparse_matrix
(
hl_sparse_matrix_s
*
A_d
,
void
hl_construct_sparse_matrix
(
hl_sparse_matrix_s
*
A_d
,
void
*
dest_d
,
void
*
dest_d
,
size_t
size
,
size_t
size
,
hl_matrix_format_t
format
,
hl_matrix_format_t
format
,
hl_matrix_value_t
value_type
,
hl_matrix_value_t
value_type
,
int
dimM
,
int
dimM
,
int
dimN
,
int
dimN
,
int
nnz
)
{
int
nnz
)
{
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
A_d
);
CHECK
(
format
==
HL_SPARSE_CSR
||
format
==
HL_SPARSE_CSC
)
CHECK
(
format
==
HL_SPARSE_CSR
||
format
==
HL_SPARSE_CSC
)
<<
"sparse matrix format error!"
;
<<
"sparse matrix format error!"
;
if
(
format
==
HL_SPARSE_CSR
)
{
if
(
format
==
HL_SPARSE_CSR
)
{
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
size_t
size_
=
(
dimM
+
1
)
*
sizeof
(
int
)
+
nnz
*
sizeof
(
int
);
size_t
size_
=
(
dimM
+
1
)
*
sizeof
(
int
)
+
nnz
*
sizeof
(
int
);
if
(
value_type
!=
HL_NO_VALUE
)
{
if
(
value_type
!=
HL_NO_VALUE
)
{
size_
+=
nnz
*
sizeof
(
real
);
size_
+=
nnz
*
sizeof
(
real
);
}
}
CHECK_LE
(
size_
,
size
)
<<
"dest_d size("
<<
size
CHECK_LE
(
size_
,
size
)
<<
"dest_d size("
<<
size
<<
") too small, should bigger than("
<<
size_
<<
")!"
;
<<
") too small, should bigger than("
<<
size_
<<
")!"
;
char
*
tmp
=
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
char
*
tmp
=
+
sizeof
(
_hl_csr_matrix
));
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
+
sizeof
(
_hl_csr_matrix
));
CHECK_NOTNULL
(
tmp
);
CHECK_NOTNULL
(
tmp
);
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
if
(
value_type
==
HL_NO_VALUE
)
{
if
(
value_type
==
HL_NO_VALUE
)
{
csr
->
csr_val
=
NULL
;
csr
->
csr_val
=
NULL
;
csr
->
csr_row
=
(
int
*
)
dest_d
;
csr
->
csr_row
=
(
int
*
)
dest_d
;
csr
->
csr_col
=
(
int
*
)((
char
*
)
dest_d
+
(
dimM
+
1
)
*
sizeof
(
int
));
csr
->
csr_col
=
(
int
*
)((
char
*
)
dest_d
+
(
dimM
+
1
)
*
sizeof
(
int
));
}
else
{
}
else
{
csr
->
csr_val
=
(
real
*
)
dest_d
;
csr
->
csr_val
=
(
real
*
)
dest_d
;
csr
->
csr_row
=
(
int
*
)((
char
*
)
dest_d
+
nnz
*
sizeof
(
real
));
csr
->
csr_row
=
(
int
*
)((
char
*
)
dest_d
+
nnz
*
sizeof
(
real
));
csr
->
csr_col
=
(
int
*
)((
char
*
)
dest_d
+
csr
->
csr_col
=
(
int
*
)((
char
*
)
dest_d
+
nnz
*
sizeof
(
real
)
+
nnz
*
sizeof
(
real
)
+
(
dimM
+
1
)
*
sizeof
(
int
));
(
dimM
+
1
)
*
sizeof
(
int
));
}
}
csr
->
nnz_s
=
nnz
;
csr
->
nnz_s
=
nnz
;
csr
->
row_s
=
dimM
+
1
;
csr
->
row_s
=
dimM
+
1
;
csr
->
sparsity
=
-
1.0
;
csr
->
sparsity
=
-
1.0
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
}
else
if
(
format
==
HL_SPARSE_CSC
)
{
}
else
if
(
format
==
HL_SPARSE_CSC
)
{
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
size_t
size_
=
(
dimN
+
1
)
*
sizeof
(
int
)
+
nnz
*
sizeof
(
int
);
size_t
size_
=
(
dimN
+
1
)
*
sizeof
(
int
)
+
nnz
*
sizeof
(
int
);
if
(
value_type
!=
HL_NO_VALUE
)
{
if
(
value_type
!=
HL_NO_VALUE
)
{
size_
+=
nnz
*
sizeof
(
real
);
size_
+=
nnz
*
sizeof
(
real
);
}
}
CHECK_LE
(
size_
,
size
)
<<
"dest_d size("
<<
size
CHECK_LE
(
size_
,
size
)
<<
"dest_d size("
<<
size
<<
") too small, should bigger than("
<<
size_
<<
")!"
;
<<
") too small, should bigger than("
<<
size_
<<
")!"
;
char
*
tmp
=
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
char
*
tmp
=
+
sizeof
(
_hl_csc_matrix
));
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
+
sizeof
(
_hl_csc_matrix
));
CHECK_NOTNULL
(
tmp
);
CHECK_NOTNULL
(
tmp
);
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
if
(
value_type
==
HL_NO_VALUE
)
{
if
(
value_type
==
HL_NO_VALUE
)
{
csc
->
csc_val
=
NULL
;
csc
->
csc_val
=
NULL
;
csc
->
csc_col
=
(
int
*
)
dest_d
;
csc
->
csc_col
=
(
int
*
)
dest_d
;
csc
->
csc_row
=
(
int
*
)((
char
*
)
dest_d
+
(
dimN
+
1
)
*
sizeof
(
int
));
csc
->
csc_row
=
(
int
*
)((
char
*
)
dest_d
+
(
dimN
+
1
)
*
sizeof
(
int
));
}
else
{
}
else
{
csc
->
csc_val
=
(
real
*
)
dest_d
;
csc
->
csc_val
=
(
real
*
)
dest_d
;
csc
->
csc_col
=
(
int
*
)((
char
*
)
dest_d
+
nnz
*
sizeof
(
real
));
csc
->
csc_col
=
(
int
*
)((
char
*
)
dest_d
+
nnz
*
sizeof
(
real
));
csc
->
csc_row
=
(
int
*
)((
char
*
)
dest_d
+
csc
->
csc_row
=
(
int
*
)((
char
*
)
dest_d
+
nnz
*
sizeof
(
real
)
+
nnz
*
sizeof
(
real
)
+
(
dimN
+
1
)
*
sizeof
(
int
));
(
dimN
+
1
)
*
sizeof
(
int
));
}
}
csc
->
nnz_s
=
nnz
;
csc
->
nnz_s
=
nnz
;
csc
->
col_s
=
dimN
+
1
;
csc
->
col_s
=
dimN
+
1
;
csc
->
sparsity
=
-
1.0
f
;
csc
->
sparsity
=
-
1.0
f
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csc
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csc
;
...
@@ -333,11 +312,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
...
@@ -333,11 +312,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
}
}
void
hl_construct_sparse_matrix
(
hl_sparse_matrix_s
*
A_d
,
void
hl_construct_sparse_matrix
(
hl_sparse_matrix_s
*
A_d
,
real
*
value_d
,
real
*
value_d
,
int
*
rows_d
,
int
*
rows_d
,
int
*
cols_d
,
int
*
cols_d
,
hl_matrix_format_t
format
,
hl_matrix_format_t
format
,
hl_matrix_value_t
value_type
,
hl_matrix_value_t
value_type
,
int
dimM
,
int
dimM
,
int
dimN
,
int
dimN
,
int
nnz
)
{
int
nnz
)
{
...
@@ -345,11 +324,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
...
@@ -345,11 +324,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
CHECK
(
format
==
HL_SPARSE_CSR
||
format
==
HL_SPARSE_CSC
)
CHECK
(
format
==
HL_SPARSE_CSR
||
format
==
HL_SPARSE_CSC
)
<<
"sparse matrix format error!"
;
<<
"sparse matrix format error!"
;
if
(
format
==
HL_SPARSE_CSR
)
{
if
(
format
==
HL_SPARSE_CSR
)
{
char
*
tmp
=
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
char
*
tmp
=
+
sizeof
(
_hl_csr_matrix
));
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
+
sizeof
(
_hl_csr_matrix
));
CHECK_NOTNULL
(
tmp
);
CHECK_NOTNULL
(
tmp
);
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
...
@@ -362,8 +341,8 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
...
@@ -362,8 +341,8 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
}
else
if
(
format
==
HL_SPARSE_CSC
)
{
}
else
if
(
format
==
HL_SPARSE_CSC
)
{
char
*
tmp
=
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
char
*
tmp
=
+
sizeof
(
_hl_csc_matrix
));
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
+
sizeof
(
_hl_csc_matrix
));
CHECK_NOTNULL
(
tmp
);
CHECK_NOTNULL
(
tmp
);
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
...
@@ -396,35 +375,30 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
...
@@ -396,35 +375,30 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
hl_stream_t
stream
)
{
hl_stream_t
stream
)
{
CHECK_NOTNULL
(
csr_matrix
);
CHECK_NOTNULL
(
csr_matrix
);
CHECK_EQ
(
csr_matrix
->
format
,
HL_SPARSE_CSR
)
CHECK_EQ
(
csr_matrix
->
format
,
HL_SPARSE_CSR
)
<<
"csr_matrix is not csr format!"
;
<<
"csr_matrix is not csr format!"
;
CHECK_NOTNULL
(
csr_matrix
->
matrix
);
CHECK_NOTNULL
(
csr_matrix
->
matrix
);
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
csr_matrix
->
matrix
);
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
csr_matrix
->
matrix
);
CHECK_LE
(
csr_matrix
->
nnz
,
csr
->
nnz_s
)
CHECK_LE
(
csr_matrix
->
nnz
,
csr
->
nnz_s
)
<<
"copy size "
<<
csr_matrix
->
nnz
<<
"copy size "
<<
csr_matrix
->
nnz
<<
" is big than alloc size "
<<
" is big than alloc size "
<<
csr
->
nnz_s
;
<<
csr
->
nnz_s
;
CHECK_LE
((
csr_matrix
->
rows
+
1
),
csr
->
row_s
)
CHECK_LE
((
csr_matrix
->
rows
+
1
),
csr
->
row_s
)
<<
"copy size "
<<
(
csr_matrix
->
rows
+
1
)
<<
"copy size "
<<
(
csr_matrix
->
rows
+
1
)
<<
" is big than alloc size "
<<
" is big than alloc size "
<<
csr
->
row_s
;
<<
csr
->
row_s
;
CHECK
(
csr_matrix
->
type
==
HL_FLOAT_VALUE
||
CHECK
(
csr_matrix
->
type
==
HL_FLOAT_VALUE
||
csr_matrix
->
type
==
HL_NO_VALUE
)
csr_matrix
->
type
==
HL_NO_VALUE
)
<<
"sparse matrix value type error!"
;
<<
"sparse matrix value type error!"
;
if
(
csr_matrix
->
type
==
HL_NO_VALUE
)
{
if
(
csr_matrix
->
type
==
HL_NO_VALUE
)
{
if
(
csr_row
==
NULL
&&
csr_col
==
NULL
)
{
if
(
csr_row
==
NULL
&&
csr_col
==
NULL
)
{
return
;
return
;
}
else
if
(
csr_row
!=
NULL
&&
csr_col
!=
NULL
)
{
}
else
if
(
csr_row
!=
NULL
&&
csr_col
!=
NULL
)
{
hl_memcpy_async
(
csr
->
csr_row
,
hl_memcpy_async
(
csr_row
,
csr
->
csr_row
,
csr_row
,
(
csr_matrix
->
rows
+
1
)
*
sizeof
(
int
),
stream
);
(
csr_matrix
->
rows
+
1
)
*
sizeof
(
int
),
stream
);
hl_memcpy_async
(
csr
->
csr_col
,
hl_memcpy_async
(
csr_col
,
csr
->
csr_col
,
csr_col
,
(
csr_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
(
csr_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"parameter csr_row or csr_col is null pointer!"
;
LOG
(
FATAL
)
<<
"parameter csr_row or csr_col is null pointer!"
;
}
}
...
@@ -432,30 +406,21 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
...
@@ -432,30 +406,21 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
if
(
csr_val
==
NULL
&&
csr_row
==
NULL
&&
csr_col
==
NULL
)
{
if
(
csr_val
==
NULL
&&
csr_row
==
NULL
&&
csr_col
==
NULL
)
{
return
;
return
;
}
else
if
(
csr_val
!=
NULL
&&
csr_row
==
NULL
&&
csr_col
==
NULL
)
{
}
else
if
(
csr_val
!=
NULL
&&
csr_row
==
NULL
&&
csr_col
==
NULL
)
{
hl_memcpy_async
(
csr
->
csr_val
,
hl_memcpy_async
(
csr_val
,
csr
->
csr_val
,
csr_val
,
(
csr_matrix
->
nnz
)
*
sizeof
(
real
),
stream
);
(
csr_matrix
->
nnz
)
*
sizeof
(
real
),
stream
);
}
else
if
(
csr_val
!=
NULL
&&
csr_row
!=
NULL
&&
csr_col
!=
NULL
)
{
}
else
if
(
csr_val
!=
NULL
&&
csr_row
!=
NULL
&&
csr_col
!=
NULL
)
{
hl_memcpy_async
(
csr
->
csr_val
,
hl_memcpy_async
(
csr_val
,
csr
->
csr_val
,
csr_val
,
(
csr_matrix
->
nnz
)
*
sizeof
(
real
),
stream
);
(
csr_matrix
->
nnz
)
*
sizeof
(
real
),
hl_memcpy_async
(
stream
);
csr
->
csr_row
,
csr_row
,
(
csr_matrix
->
rows
+
1
)
*
sizeof
(
int
),
stream
);
hl_memcpy_async
(
csr
->
csr_row
,
hl_memcpy_async
(
csr_row
,
csr
->
csr_col
,
csr_col
,
(
csr_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
(
csr_matrix
->
rows
+
1
)
*
sizeof
(
int
),
stream
);
hl_memcpy_async
(
csr
->
csr_col
,
csr_col
,
(
csr_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"parameter csr_row or csr_col is null pointer!"
;
LOG
(
FATAL
)
<<
"parameter csr_row or csr_col is null pointer!"
;
}
}
}
}
csr
->
sparsity
=
((
float
)
csr_matrix
->
nnz
)
/
csr
->
sparsity
=
((
float
)
csr_matrix
->
nnz
)
/
((
float
)
csr_matrix
->
rows
)
/
((
float
)
csr_matrix
->
rows
)
/
((
float
)
csr_matrix
->
cols
);
((
float
)
csr_matrix
->
cols
);
}
}
...
@@ -466,33 +431,28 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
...
@@ -466,33 +431,28 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
hl_stream_t
stream
)
{
hl_stream_t
stream
)
{
CHECK_NOTNULL
(
csc_matrix
);
CHECK_NOTNULL
(
csc_matrix
);
CHECK_EQ
(
csc_matrix
->
format
,
HL_SPARSE_CSC
)
CHECK_EQ
(
csc_matrix
->
format
,
HL_SPARSE_CSC
)
<<
"csc_matrix is not csc format error!"
;
<<
"csc_matrix is not csc format error!"
;
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
csc_matrix
->
matrix
);
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
csc_matrix
->
matrix
);
CHECK_LE
(
csc_matrix
->
nnz
,
csc
->
nnz_s
)
CHECK_LE
(
csc_matrix
->
nnz
,
csc
->
nnz_s
)
<<
"copy size "
<<
csc_matrix
->
nnz
<<
"copy size "
<<
csc_matrix
->
nnz
<<
" is big than alloc size "
<<
" is big than alloc size "
<<
csc
->
nnz_s
;
<<
csc
->
nnz_s
;
CHECK_LE
((
csc_matrix
->
cols
+
1
),
csc
->
col_s
)
CHECK_LE
((
csc_matrix
->
cols
+
1
),
csc
->
col_s
)
<<
"copy size "
<<
(
csc_matrix
->
cols
+
1
)
<<
"copy size "
<<
(
csc_matrix
->
cols
+
1
)
<<
" is big than alloc size "
<<
" is big than alloc size "
<<
csc
->
col_s
;
<<
csc
->
col_s
;
CHECK
(
csc_matrix
->
type
==
HL_FLOAT_VALUE
||
CHECK
(
csc_matrix
->
type
==
HL_FLOAT_VALUE
||
csc_matrix
->
type
==
HL_NO_VALUE
)
csc_matrix
->
type
==
HL_NO_VALUE
)
<<
"sparse matrix value type error!"
;
<<
"sparse matrix value type error!"
;
if
(
csc_matrix
->
type
==
HL_NO_VALUE
)
{
if
(
csc_matrix
->
type
==
HL_NO_VALUE
)
{
if
(
csc_row
==
NULL
&&
csc_col
==
NULL
)
{
if
(
csc_row
==
NULL
&&
csc_col
==
NULL
)
{
return
;
return
;
}
else
if
(
csc_row
!=
NULL
&&
csc_col
!=
NULL
)
{
}
else
if
(
csc_row
!=
NULL
&&
csc_col
!=
NULL
)
{
hl_memcpy_async
(
csc
->
csc_row
,
hl_memcpy_async
(
csc_row
,
csc
->
csc_row
,
csc_row
,
(
csc_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
(
csc_matrix
->
nnz
)
*
sizeof
(
int
),
hl_memcpy_async
(
stream
);
csc
->
csc_col
,
csc_col
,
(
csc_matrix
->
cols
+
1
)
*
sizeof
(
int
),
stream
);
hl_memcpy_async
(
csc
->
csc_col
,
csc_col
,
(
csc_matrix
->
cols
+
1
)
*
sizeof
(
int
),
stream
);
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"parameter csc_row or csc_col is null pointer!"
;
LOG
(
FATAL
)
<<
"parameter csc_row or csc_col is null pointer!"
;
}
}
...
@@ -500,30 +460,21 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
...
@@ -500,30 +460,21 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
if
(
csc_val
==
NULL
&&
csc_row
==
NULL
&&
csc_col
==
NULL
)
{
if
(
csc_val
==
NULL
&&
csc_row
==
NULL
&&
csc_col
==
NULL
)
{
return
;
return
;
}
else
if
(
csc_val
!=
NULL
&&
csc_row
==
NULL
&&
csc_col
==
NULL
)
{
}
else
if
(
csc_val
!=
NULL
&&
csc_row
==
NULL
&&
csc_col
==
NULL
)
{
hl_memcpy_async
(
csc
->
csc_val
,
hl_memcpy_async
(
csc_val
,
csc
->
csc_val
,
csc_val
,
(
csc_matrix
->
nnz
)
*
sizeof
(
real
),
stream
);
(
csc_matrix
->
nnz
)
*
sizeof
(
real
),
stream
);
}
else
if
(
csc_val
!=
NULL
&&
csc_row
!=
NULL
&&
csc_col
!=
NULL
)
{
}
else
if
(
csc_val
!=
NULL
&&
csc_row
!=
NULL
&&
csc_col
!=
NULL
)
{
hl_memcpy_async
(
csc
->
csc_val
,
hl_memcpy_async
(
csc_val
,
csc
->
csc_val
,
csc_val
,
(
csc_matrix
->
nnz
)
*
sizeof
(
real
),
stream
);
(
csc_matrix
->
nnz
)
*
sizeof
(
real
),
hl_memcpy_async
(
stream
);
csc
->
csc_row
,
csc_row
,
(
csc_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
hl_memcpy_async
(
csc
->
csc_row
,
hl_memcpy_async
(
csc_row
,
csc
->
csc_col
,
csc_col
,
(
csc_matrix
->
cols
+
1
)
*
sizeof
(
int
),
stream
);
(
csc_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
hl_memcpy_async
(
csc
->
csc_col
,
csc_col
,
(
csc_matrix
->
cols
+
1
)
*
sizeof
(
int
),
stream
);
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"parameter csc_row or csc_col is null pointer!"
;
LOG
(
FATAL
)
<<
"parameter csc_row or csc_col is null pointer!"
;
}
}
}
}
csc
->
sparsity
=
((
float
)
csc_matrix
->
nnz
)
/
csc
->
sparsity
=
((
float
)
csc_matrix
->
nnz
)
/
((
float
)
csc_matrix
->
rows
)
/
((
float
)
csc_matrix
->
rows
)
/
((
float
)
csc_matrix
->
cols
);
((
float
)
csc_matrix
->
cols
);
}
}
...
@@ -531,32 +482,23 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
...
@@ -531,32 +482,23 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
hl_sparse_matrix_s
src
,
hl_sparse_matrix_s
src
,
hl_stream_t
stream
)
{
hl_stream_t
stream
)
{
CHECK
(
dst
&&
src
&&
dst
->
matrix
&&
src
->
matrix
)
CHECK
(
dst
&&
src
&&
dst
->
matrix
&&
src
->
matrix
)
<<
"parameter dst or src is null pointer!"
;
<<
"parameter dst or src is null pointer!"
;
CHECK_EQ
(
dst
->
format
,
src
->
format
)
CHECK_EQ
(
dst
->
format
,
src
->
format
)
<<
"sparse matrix format does not match!"
;
<<
"sparse matrix format does not match!"
;
CHECK
(
dst
->
type
!=
HL_FLOAT_VALUE
||
src
->
type
!=
HL_NO_VALUE
)
CHECK
(
dst
->
type
!=
HL_FLOAT_VALUE
||
src
->
type
!=
HL_NO_VALUE
)
<<
"src sparse matrix is no value, dst sparse matrix has value!"
;
<<
"src sparse matrix is no value, dst sparse matrix has value!"
;
if
(
dst
->
format
==
HL_SPARSE_CSR
)
{
if
(
dst
->
format
==
HL_SPARSE_CSR
)
{
dst
->
rows
=
src
->
rows
;
dst
->
rows
=
src
->
rows
;
dst
->
cols
=
src
->
cols
;
dst
->
cols
=
src
->
cols
;
dst
->
nnz
=
src
->
nnz
;
dst
->
nnz
=
src
->
nnz
;
hl_csr_matrix
csr
=
(
hl_csr_matrix
)
src
->
matrix
;
hl_csr_matrix
csr
=
(
hl_csr_matrix
)
src
->
matrix
;
hl_memcpy_csr_matrix
(
dst
,
hl_memcpy_csr_matrix
(
dst
,
csr
->
csr_val
,
csr
->
csr_row
,
csr
->
csr_col
,
stream
);
csr
->
csr_val
,
csr
->
csr_row
,
csr
->
csr_col
,
stream
);
}
else
if
(
dst
->
format
==
HL_SPARSE_CSC
)
{
}
else
if
(
dst
->
format
==
HL_SPARSE_CSC
)
{
dst
->
rows
=
src
->
rows
;
dst
->
rows
=
src
->
rows
;
dst
->
cols
=
src
->
cols
;
dst
->
cols
=
src
->
cols
;
dst
->
nnz
=
src
->
nnz
;
dst
->
nnz
=
src
->
nnz
;
hl_csc_matrix
csc
=
(
hl_csc_matrix
)
src
->
matrix
;
hl_csc_matrix
csc
=
(
hl_csc_matrix
)
src
->
matrix
;
hl_memcpy_csc_matrix
(
dst
,
hl_memcpy_csc_matrix
(
dst
,
csc
->
csc_val
,
csc
->
csc_row
,
csc
->
csc_col
,
stream
);
csc
->
csc_val
,
csc
->
csc_row
,
csc
->
csc_col
,
stream
);
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"sparse matrix format error!"
;
LOG
(
FATAL
)
<<
"sparse matrix format error!"
;
}
}
...
@@ -569,20 +511,24 @@ static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
...
@@ -569,20 +511,24 @@ static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
if
(
beta
==
0.0
)
{
if
(
beta
==
0.0
)
{
hl_gpu_apply_unary_op
(
unary
::
Zero
<
real
>
(),
c
,
dimM
,
dimN
,
dimN
);
hl_gpu_apply_unary_op
(
unary
::
Zero
<
real
>
(),
c
,
dimM
,
dimN
,
dimN
);
}
else
{
}
else
{
if
(
beta
!=
1.0
){
if
(
beta
!=
1.0
)
{
hl_gpu_apply_unary_op
(
hl_gpu_apply_unary_op
(
unary
::
mul_scalar
<
real
>
(
beta
),
c
,
dimM
,
dimN
,
dimN
);
unary
::
mul_scalar
<
real
>
(
beta
),
c
,
dimM
,
dimN
,
dimN
);
}
}
}
}
return
;
return
;
}
}
void
hl_matrix_csr_mul_dense
(
hl_sparse_matrix_s
A_d
,
hl_trans_op_t
transa
,
void
hl_matrix_csr_mul_dense
(
hl_sparse_matrix_s
A_d
,
real
*
B_d
,
hl_trans_op_t
transb
,
hl_trans_op_t
transa
,
real
*
B_d
,
hl_trans_op_t
transb
,
real
*
C_d
,
real
*
C_d
,
int
dimM
,
int
dimN
,
int
dimK
,
int
dimM
,
real
alpha
,
real
beta
)
{
int
dimN
,
int
dimK
,
real
alpha
,
real
beta
)
{
CHECK_EQ
(
transb
,
HPPL_OP_N
);
CHECK_EQ
(
transb
,
HPPL_OP_N
);
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
B_d
);
CHECK_NOTNULL
(
B_d
);
...
@@ -592,7 +538,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
...
@@ -592,7 +538,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
if
((
HPPL_OP_N
==
transa
&&
(
A_d
->
rows
!=
dimM
||
A_d
->
cols
!=
dimK
))
||
if
((
HPPL_OP_N
==
transa
&&
(
A_d
->
rows
!=
dimM
||
A_d
->
cols
!=
dimK
))
||
(
HPPL_OP_T
==
transa
&&
(
A_d
->
rows
!=
dimK
||
A_d
->
cols
!=
dimM
)))
{
(
HPPL_OP_T
==
transa
&&
(
A_d
->
rows
!=
dimK
||
A_d
->
cols
!=
dimM
)))
{
LOG
(
FATAL
)
<<
"parameter error!"
;
LOG
(
FATAL
)
<<
"parameter error!"
;
}
}
if
(
A_d
->
nnz
==
0
)
{
if
(
A_d
->
nnz
==
0
)
{
...
@@ -603,8 +549,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
...
@@ -603,8 +549,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
/* nnz != 0 */
/* nnz != 0 */
hl_csr_matrix
A_d2
=
(
hl_csr_matrix
)(
A_d
->
matrix
);
hl_csr_matrix
A_d2
=
(
hl_csr_matrix
)(
A_d
->
matrix
);
if
((
A_d2
->
csr_val
==
NULL
&&
A_d
->
type
!=
HL_NO_VALUE
)
||
if
((
A_d2
->
csr_val
==
NULL
&&
A_d
->
type
!=
HL_NO_VALUE
)
||
A_d2
->
csr_row
==
NULL
||
A_d2
->
csr_row
==
NULL
||
A_d2
->
csr_col
==
NULL
)
{
A_d2
->
csr_col
==
NULL
)
{
LOG
(
FATAL
)
<<
"parameter error!"
;
LOG
(
FATAL
)
<<
"parameter error!"
;
}
}
...
@@ -617,63 +562,63 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
...
@@ -617,63 +562,63 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
/* sparsity pattern */
/* sparsity pattern */
// A_d->sparsity;
// A_d->sparsity;
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
KeSMatrixCsrMulDense
<
0
>
KeSMatrixCsrMulDense
<
0
>
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
C_d
,
C_d
,
A_d2
->
csr_val
,
A_d2
->
csr_val
,
A_d2
->
csr_col
,
A_d2
->
csr_col
,
A_d2
->
csr_row
,
A_d2
->
csr_row
,
B_d
,
B_d
,
dimM
,
dimM
,
dimN
,
dimN
,
dimK
,
dimK
,
alpha
,
alpha
,
beta
);
beta
);
}
else
{
}
else
{
KeSMatrixCsrMulDense
<
1
>
KeSMatrixCsrMulDense
<
1
>
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
C_d
,
C_d
,
A_d2
->
csr_val
,
A_d2
->
csr_val
,
A_d2
->
csr_col
,
A_d2
->
csr_col
,
A_d2
->
csr_row
,
A_d2
->
csr_row
,
B_d
,
B_d
,
dimM
,
dimM
,
dimN
,
dimN
,
dimK
,
dimK
,
alpha
,
alpha
,
beta
);
beta
);
}
}
}
else
if
(
HPPL_OP_T
==
transa
)
{
}
else
if
(
HPPL_OP_T
==
transa
)
{
_beta_mul_c
(
C_d
,
dimM
,
dimN
,
beta
);
_beta_mul_c
(
C_d
,
dimM
,
dimN
,
beta
);
int
blocksX
=
(
dimN
+
CU_CSC_MUL_DENSE_BLOCK_N
-
1
)
/
int
blocksX
=
CU_CSC_MUL_DENSE_BLOCK_N
;
(
dimN
+
CU_CSC_MUL_DENSE_BLOCK_N
-
1
)
/
CU_CSC_MUL_DENSE_BLOCK_N
;
int
blocksY
=
(
dimK
+
CU_CSC_MUL_DENSE_BLOCK_K
-
1
)
/
int
blocksY
=
CU_CSC_MUL_DENSE_BLOCK_K
;
(
dimK
+
CU_CSC_MUL_DENSE_BLOCK_K
-
1
)
/
CU_CSC_MUL_DENSE_BLOCK_K
;
dim3
threads
(
CU_CSC_MUL_DENSE_THREAD_X
,
CU_CSC_MUL_DENSE_THREAD_Y
);
dim3
threads
(
CU_CSC_MUL_DENSE_THREAD_X
,
CU_CSC_MUL_DENSE_THREAD_Y
);
dim3
grid
(
blocksX
,
blocksY
);
dim3
grid
(
blocksX
,
blocksY
);
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
KeSMatrixCscMulDense
<
0
>
KeSMatrixCscMulDense
<
0
>
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
C_d
,
C_d
,
A_d2
->
csr_val
,
A_d2
->
csr_val
,
A_d2
->
csr_col
,
A_d2
->
csr_col
,
A_d2
->
csr_row
,
A_d2
->
csr_row
,
B_d
,
B_d
,
dimM
,
dimM
,
dimN
,
dimN
,
dimK
,
dimK
,
alpha
,
alpha
,
beta
);
beta
);
}
else
{
}
else
{
KeSMatrixCscMulDense
<
1
>
KeSMatrixCscMulDense
<
1
>
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
C_d
,
C_d
,
A_d2
->
csr_val
,
A_d2
->
csr_val
,
A_d2
->
csr_col
,
A_d2
->
csr_col
,
A_d2
->
csr_row
,
A_d2
->
csr_row
,
B_d
,
B_d
,
dimM
,
dimM
,
dimN
,
dimN
,
dimK
,
dimK
,
alpha
,
alpha
,
beta
);
beta
);
}
}
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"parameter transa error!"
;
LOG
(
FATAL
)
<<
"parameter transa error!"
;
...
@@ -682,11 +627,16 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
...
@@ -682,11 +627,16 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
  CHECK_SYNC("hl_matrix_csr_mul_dense failed");
}

void hl_matrix_dense_mul_csc(real *A_d,
                             hl_trans_op_t transa,
                             hl_sparse_matrix_s B_d,
                             hl_trans_op_t transb,
                             real *C_d,
                             int dimM,
                             int dimN,
                             int dimK,
                             real alpha,
                             real beta) {
  CHECK_EQ(transa, HPPL_OP_N);
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -698,8 +648,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
    LOG(FATAL) << "parameter dims error!";
  }
  CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!";

  if (B_d->nnz == 0) {
    _beta_mul_c(C_d, dimM, dimN, beta);
...
@@ -709,8 +658,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
  /* nnz != 0 */
  hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix);
  if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) ||
      B_d2->csc_row == NULL || B_d2->csc_col == NULL) {
    LOG(FATAL) << "parameter B is null!";
  }
...
@@ -721,60 +669,60 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
    dim3 grid(blocksX, blocksY);
    if (B_d->type == HL_NO_VALUE) {
      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csc_val, B_d2->csc_row, B_d2->csc_col,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csc_val, B_d2->csc_row, B_d2->csc_col,
          dimM, dimN, dimK, alpha, beta);
    }
  } else if (transb == HPPL_OP_T) {
    _beta_mul_c(C_d, dimM, dimN, beta);
    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
    dim3 grid(blocksX, blocksY);
    if (B_d->type == HL_NO_VALUE) {
      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csc_val, B_d2->csc_col, B_d2->csc_row,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csc_val, B_d2->csc_col, B_d2->csc_row,
          dimM, dimN, dimK, alpha, beta);
    }
  } else {
    LOG(FATAL) << "parameter transb error!";
...
@@ -783,24 +731,28 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
  CHECK_SYNC("hl_matrix_dense_mul_csc failed");
}

void hl_matrix_dense_mul_csr(real *A_d,
                             hl_trans_op_t transa,
                             hl_sparse_matrix_s B_d,
                             hl_trans_op_t transb,
                             real *C_d,
                             int dimM,
                             int dimN,
                             int dimK,
                             real alpha,
                             real beta) {
  CHECK_EQ(transa, HPPL_OP_N);
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
  CHECK_NOTNULL(C_d);

  if (dimM <= 0 || dimN <= 0 || dimK <= 0 ||
      (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) ||
      (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) {
    LOG(FATAL) << "parameter dims error!";
  }

  CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!";

  if (B_d->nnz == 0) {
    _beta_mul_c(C_d, dimM, dimN, beta);
...
@@ -810,41 +762,40 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
  /* nnz != 0 */
  hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix);
  if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) ||
      B_d2->csr_row == NULL || B_d2->csr_col == NULL) {
    LOG(FATAL) << "parameter transa error!";
  }

  if (transb == HPPL_OP_N) {
    _beta_mul_c(C_d, dimM, dimN, beta);
    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
    dim3 grid(blocksX, blocksY);
    if (B_d->type == HL_NO_VALUE) {
      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csr_val, B_d2->csr_row, B_d2->csr_col,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csr_val, B_d2->csr_row, B_d2->csr_col,
          dimM, dimN, dimK, alpha, beta);
    }
  } else if (transb == HPPL_OP_T) {
    int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST;
...
@@ -852,29 +803,29 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
    dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST);
    dim3 grid(blocksX, blocksY);
    if (B_d->type == HL_NO_VALUE) {
      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csr_val, B_d2->csr_col, B_d2->csr_row,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csr_val, B_d2->csr_col, B_d2->csr_row,
          dimM, dimN, dimK, alpha, beta);
    }
  } else {
    LOG(FATAL) << "parameter transb error!";
...
@@ -883,11 +834,16 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
  CHECK_SYNC("hl_matrix_dense_mul_csr failed");
}

void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
                             hl_trans_op_t transa,
                             real *B_d,
                             hl_trans_op_t transb,
                             real *C_d,
                             int dimM,
                             int dimN,
                             int dimK,
                             real alpha,
                             real beta) {
  CHECK_EQ(transb, HPPL_OP_N);
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -908,42 +864,43 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
  /* nnz != 0 */
  hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
  if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) ||
      A_d2->csc_row == NULL || A_d2->csc_col == NULL) {
    LOG(FATAL) << "parameter error!";
  }

  if (HPPL_OP_N == transa) {
    _beta_mul_c(C_d, dimM, dimN, beta);
    int blocksX =
        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
    int blocksY =
        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
    dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
    dim3 grid(blocksX, blocksY);
    if (A_d->type == HL_NO_VALUE) {
      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, B_d,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, B_d,
          dimM, dimN, dimK, alpha, beta);
    }
  } else if (HPPL_OP_T == transa) {
    int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N;
...
@@ -954,29 +911,29 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
    /* sparsity pattern */
    // A_d->sparsity;
    if (A_d->type == HL_NO_VALUE) {
      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, B_d,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, B_d,
          dimM, dimN, dimK, alpha, beta);
    }
  } else {
    LOG(FATAL) << "parameter transa error!";
...
@@ -985,11 +942,16 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
  CHECK_SYNC("hl_matrix_csc_mul_dense failed");
}

void hl_sparse_matrix_mul(real *A_d,
                          hl_trans_op_t transa,
                          real *B_d,
                          hl_trans_op_t transb,
                          hl_sparse_matrix_s C_d,
                          int dimM,
                          int dimN,
                          int dimK,
                          real alpha,
                          real beta) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
  CHECK_NOTNULL(C_d);
...
@@ -1000,18 +962,14 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
  if (C_d->format == HL_SPARSE_CSC) {
    hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix);
    if (C_d2->csc_val == NULL ||
        C_d2->csc_row == NULL ||
        C_d2->csc_col == NULL) {
      LOG(FATAL) << "parameter error!";
    }

    if (beta != 1.0) {
      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
                            C_d2->csc_val,
                            1,
                            C_d->nnz,
                            C_d->nnz);
    }

    int blocksX = dimN;
...
@@ -1020,34 +978,30 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
    dim3 grid(blocksX, blocksY);
    bool transA = transa == HPPL_OP_T ? 1 : 0;
    bool transB = transb == HPPL_OP_T ? 1 : 0;
    KeSMatrixDenseMulDense2CSC<<<grid, threads, 0, STREAM_DEFAULT>>>(
        C_d2->csc_val, C_d2->csc_row, C_d2->csc_col, A_d, B_d,
        transA, transB, dimM, dimN, dimK, alpha, beta);
    CHECK_SYNC("hl_sparse_matrix_mul failed");
  } else {
    hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix);
    if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) ||
        C_d2->csr_row == NULL ||
        C_d2->csr_col == NULL) {
      LOG(FATAL) << "parameter error!";
    }

    if (beta != 1.0) {
      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
                            C_d2->csr_val,
                            1,
                            C_d->nnz,
                            C_d->nnz);
    }

    bool transA = transa == HPPL_OP_T ? 1 : 0;
...
@@ -1058,20 +1012,20 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
      dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1);
      dim3 grid(blocksX, blocksY);
      KeSMatrixDenseMulDense2CSR<<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d2->csr_val, C_d2->csr_row, C_d2->csr_col, A_d, B_d,
          transA, transB, dimM, dimN, dimK, alpha, beta);
      CHECK_SYNC("hl_sparse_matrix_mul failed");
    } else {
      CHECK(!transA) << "Not supported A is trans and B is not trans!";
...
@@ -1080,21 +1034,21 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
      avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1;
      int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE);
      dim3 grid(gridx, dimM);
      KeSMatrixDenseMulDenseTrans2CSR<<<grid, block, 0, STREAM_DEFAULT>>>(
          C_d2->csr_val, C_d2->csr_row, C_d2->csr_col, A_d, B_d,
          transA, transB, dimM, dimN, dimK, alpha, beta);
      CHECK_SYNC("hl_sparse_matrix_mul failed");
    }
  }
}
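// Reference sketch (reviewer note, not the library implementation): as I read
// the kernels above, hl_sparse_matrix_mul computes C = alpha * op(A) * op(B)
// + beta * C with dense A, B and only updates the positions already present in
// the sparse output C. A minimal host-side sketch of that semantics for a
// CSR-formatted C, assuming row-major A (dimM x dimK) and B (dimK x dimN) and
// no transposes, looks like this:
void sparse_output_gemm_ref(const float* A, const float* B, float* csr_val,
                            const int* csr_row, const int* csr_col,
                            int dimM, int dimN, int dimK,
                            float alpha, float beta) {
  for (int r = 0; r < dimM; ++r) {
    for (int p = csr_row[r]; p < csr_row[r + 1]; ++p) {
      int c = csr_col[p];  // only existing nonzeros of C are written
      float dot = 0.f;
      for (int k = 0; k < dimK; ++k) dot += A[r * dimK + k] * B[k * dimN + c];
      csr_val[p] = alpha * dot + beta * csr_val[p];
    }
  }
}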
...
@@ -1111,7 +1065,7 @@ void hl_memcpy_from_csc_matrix(real *csc_val,
  CHECK_NOTNULL(csc_col);
  CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
      << "csc_matrix is not csc format error!";

  if (csc_matrix->nnz > row_size ||
      csc_matrix->cols + 1 > static_cast<int>(col_size)) {
...
@@ -1119,20 +1073,20 @@ void hl_memcpy_from_csc_matrix(real *csc_val,
  }

  hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
  hl_memcpy_async((void *)csc_row,
                  (void *)csc->csc_row,
                  (csc_matrix->nnz) * sizeof(int),
                  stream);
  hl_memcpy_async((void *)csc_col,
                  (void *)csc->csc_col,
                  (csc_matrix->cols + 1) * sizeof(int),
                  stream);

  if (csc_matrix->type == HL_FLOAT_VALUE) {
    if (csc_val != NULL) {
      CHECK_LE(csc_matrix->nnz, val_size) << "size not match!";
      hl_memcpy_async((void *)csc_val,
                      (void *)csc->csc_val,
                      (csc_matrix->nnz) * sizeof(real),
                      stream);
    } else {
      LOG(FATAL) << "parameter csr_val is null pointer!";
...
@@ -1152,7 +1106,7 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
  CHECK_NOTNULL(csr_row);
  CHECK_NOTNULL(csr_col);
  CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
      << "csr_matrix is not csr format error!";

  if (csr_matrix->nnz > col_size ||
      csr_matrix->rows + 1 > static_cast<int>(row_size)) {
...
@@ -1160,20 +1114,20 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
  }

  hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
  hl_memcpy_async((void *)csr_row,
                  (void *)csr->csr_row,
                  (csr_matrix->rows + 1) * sizeof(int),
                  stream);
  hl_memcpy_async((void *)csr_col,
                  (void *)csr->csr_col,
                  (csr_matrix->nnz) * sizeof(int),
                  stream);

  if (csr_matrix->type == HL_FLOAT_VALUE) {
    if (csr_val != NULL) {
      CHECK_LE(csr_matrix->nnz, val_size) << "size not match!";
      hl_memcpy_async((void *)csr_val,
                      (void *)csr->csr_val,
                      (csr_matrix->nnz) * sizeof(real),
                      stream);
    } else {
      LOG(FATAL) << "parameter csr_val is null pointer!";
...
@@ -1181,8 +1135,8 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
  }
}

void hl_sparse_matrix_column_sum(
    real* A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
  if (B_d->format == HL_SPARSE_CSR) {
    hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale);
  } else {
...
@@ -1190,8 +1144,8 @@ void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM,
  }
}

void hl_matrix_csr_column_sum(
    real* A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -1216,8 +1170,7 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d,
  CHECK_SYNC("hl_matrix_csr_column_sum failed");
}

void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real* B_d, real scale) {
  if (A_d->format == HL_SPARSE_CSR) {
    hl_matrix_csr_add_bias(A_d, B_d, scale);
  } else {
...
@@ -1225,8 +1178,7 @@ void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
  }
}

void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d, real scale) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -1247,8 +1199,12 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d,
  CHECK_SYNC("hl_sparse_matrix_add_bias failed");
}

void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
                                real *B_d,
                                int dimM,
                                int dimN,
                                real alpha,
                                real beta) {
  if (A_d->format == HL_SPARSE_CSR) {
    hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta);
  } else {
...
@@ -1256,8 +1212,12 @@ void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM,
  }
}

void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
                             real* B_d,
                             int dimM,
                             int dimN,
                             real alpha,
                             real beta) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -1277,20 +1237,26 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM,
  gridX = gridX > 0 ? gridX : 1;
  dim3 block(512, 1);
  dim3 grid(gridX, dimM);
  KeSMatrixCsrAddDense<<<grid, block, 0, STREAM_DEFAULT>>>(
      A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, B_d,
      alpha, beta, dimM, dimN);

  CHECK_SYNC("hl_sparse_matrix_add_dense failed");
}

int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
  __sparse_get_return__(sMat, row);
}

int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
  __sparse_get_return__(sMat, col);
}

real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
  __sparse_get_return__(sMat, val);
}
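// Reference sketch (reviewer note, not the PaddlePaddle API): the
// KeSMatrixCsrMulDense / KeSMatrixCscMulDense launches in this file compute
// C = alpha * A * B + beta * C with a sparse A and dense B, C. A minimal
// single-threaded sketch of the CSR case, assuming row-major dense buffers
// and treating HL_NO_VALUE entries as 1 (my reading of the <0>/<1> template
// parameter), is shown only to make the indexing concrete:
void csr_mul_dense_ref(float* C, const float* csr_val, const int* csr_col,
                       const int* csr_row, const float* B,
                       int dimM, int dimN, int dimK, float alpha, float beta) {
  for (int i = 0; i < dimM * dimN; ++i) C[i] *= beta;  // the _beta_mul_c step
  for (int r = 0; r < dimM; ++r) {
    for (int p = csr_row[r]; p < csr_row[r + 1]; ++p) {
      int k = csr_col[p];                     // column of A == row of B
      float a = csr_val ? csr_val[p] : 1.f;   // assumed HL_NO_VALUE semantics
      for (int c = 0; c < dimN; ++c) {
        C[r * dimN + c] += alpha * a * B[k * dimN + c];
      }
    }
  }
}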
paddle/cuda/src/hl_perturbation_util.cu
...
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <stdlib.h>
#include <cmath>

#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_perturbation_util.cuh"
#include "hl_time.h"

#define _USE_MATH_DEFINES
...
@@ -30,10 +29,16 @@ limitations under the License. */
 * centerX, centerY: translation.
 * sourceX, sourceY: output coordinates in the original image.
 */
__device__ void getTranformCoord(int x,
                                 int y,
                                 real theta,
                                 real scale,
                                 real tgtCenter,
                                 real imgCenter,
                                 real centerR,
                                 real centerC,
                                 int* sourceX,
                                 int* sourceY) {
  real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};

  // compute coordinates in the rotated and scaled image
...
@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
 * created by Wei Xu (genome), converted by Jiang Wang
 */
__global__ void kSamplingPatches(const real* imgs,
                                 real* targets,
                                 int imgSize,
                                 int tgtSize,
                                 const int channels,
                                 int samplingRate,
                                 const real* thetas,
                                 const real* scales,
                                 const int* centerRs,
                                 const int* centerCs,
                                 const real padValue,
                                 const int numImages) {
  const int caseIdx = blockIdx.x * 4 + threadIdx.x;
  const int pxIdx = blockIdx.y * 128 + threadIdx.y;
...
@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
  const int pxY = pxIdx / tgtSize;

  int srcPxX, srcPxY;
  getTranformCoord(pxX,
                   pxY,
                   thetas[imgIdx],
                   scales[imgIdx],
                   tgtCenter,
                   imgCenter,
                   centerCs[caseIdx],
                   centerRs[caseIdx],
                   &srcPxX,
                   &srcPxY);

  imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
...
@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
 *
 * created by Wei Xu
 */
void hl_generate_disturb_params(real*& gpuAngle,
                                real*& gpuScaleRatio,
                                int*& gpuCenterR,
                                int*& gpuCenterC,
                                int numImages,
                                int imgSize,
                                real rotateAngle,
                                real scaleRatio,
                                int samplingRate,
                                bool isTrain) {
  // The number of output samples.
  int numPatches = numImages * samplingRate;
...
@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
  for (int i = 0; i < numImages; i++) {
    r_angle[i] =
        (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
                                        - 0.5);
    s_ratio[i] =
        1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
  }
...
@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
    int pxY =
        (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT

    const real H[4] = {cos(-r_angle[i]),
                       -sin(-r_angle[i]),
                       sin(-r_angle[i]),
                       cos(-r_angle[i])};
    real x = pxX - imgCenter;
    real y = pxY - imgCenter;
    real xx = H[0] * x + H[1] * y;
...
@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
  delete[] center_c;
}

void hl_conv_random_disturb_with_params(const real* images,
                                        int imgSize,
                                        int tgtSize,
                                        int channels,
                                        int numImages,
                                        int samplingRate,
                                        const real* gpuRotationAngle,
                                        const real* gpuScaleRatio,
                                        const int* gpuCenterR,
...
@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
  dim3 threadsPerBlock(4, 128);
  dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));

  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
                                                   target,
                                                   imgSize,
                                                   tgtSize,
                                                   channels,
                                                   samplingRate,
                                                   gpuRotationAngle,
                                                   gpuScaleRatio,
                                                   gpuCenterR,
                                                   gpuCenterC,
                                                   paddingValue,
                                                   numImages);

  hl_device_synchronize();
}

void hl_conv_random_disturb(const real* images,
                            int imgSize,
                            int tgtSize,
                            int channels,
                            int numImages,
                            real scaleRatio,
                            real rotateAngle,
                            int samplingRate,
                            real* gpu_r_angle,
                            real* gpu_s_ratio,
                            int* gpu_center_r,
                            int* gpu_center_c,
                            int paddingValue,
                            bool isTrain,
                            real* targets) {
  // generate the random disturbance sequence and the sampling locations
  hl_generate_disturb_params(gpu_r_angle,
                             gpu_s_ratio,
                             gpu_center_r,
                             gpu_center_c,
                             numImages,
                             imgSize,
                             rotateAngle,
                             scaleRatio,
                             samplingRate,
                             isTrain);
  hl_conv_random_disturb_with_params(images,
                                     imgSize,
                                     tgtSize,
                                     channels,
                                     numImages,
                                     samplingRate,
                                     gpu_r_angle,
                                     gpu_s_ratio,
                                     gpu_center_r,
                                     gpu_center_r,
                                     paddingValue,
                                     targets);
}
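// Reference sketch (reviewer note): getTranformCoord maps a target-patch pixel
// back to a source pixel of the original image with the inverse rotation
// H = [cos(-theta), -sin(-theta); sin(-theta), cos(-theta)], the scale, and the
// sampled center, mirroring the host-side math visible in
// hl_generate_disturb_params. The body of the device function is collapsed in
// this hunk, so the exact centering and rounding conventions below are
// assumptions, not the actual implementation:
void transform_coord_ref(int x, int y, float theta, float scale,
                         float tgtCenter, float imgCenter,
                         float centerR, float centerC,
                         int* srcX, int* srcY) {
  float H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
  // coordinates relative to the target-patch center
  float dx = x - tgtCenter, dy = y - tgtCenter;
  // rotate, undo the scale, then translate to the sampled center in the image
  float rx = (H[0] * dx + H[1] * dy) / scale;
  float ry = (H[2] * dx + H[3] * dy) / scale;
  *srcX = (int)(rx + centerC);  // assumed column/row offset convention
  *srcY = (int)(ry + centerR);
}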
paddle/cuda/src/hl_table_apply.cu
...
@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"
template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
                                int ldo,
                                real* table,
                                int ldt,
                                int* ids,
                                int numSamples,
                                int tableSize,
...
@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  while (idy < numSamples) {
    int tableId = ids[idy];
    if ((0 <= tableId) && (tableId < tableSize)) {
      real* out = output + idy * ldo;
      real* tab = table + tableId * ldt;
      for (int i = idx; i < dim; i += blockDimX) {
        if (AddRow) {
          paddle::paddleAtomicAdd(&tab[i], out[i]);
...
@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  }
}

void hl_matrix_select_rows(real* output,
                           int ldo,
                           real* table,
                           int ldt,
                           int* ids,
                           int numSamples,
                           int tableSize,
...
@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
threads
(
128
,
8
);
dim3
threads
(
128
,
8
);
dim3
grid
(
8
,
1
);
dim3
grid
(
8
,
1
);
KeMatrixAddRows
<
128
,
8
,
8
,
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
KeMatrixAddRows
<
128
,
8
,
8
,
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
(
output
,
ldo
,
table
,
ldt
,
ids
,
numSamples
,
tableSize
,
dim
);
output
,
ldo
,
table
,
ldt
,
ids
,
numSamples
,
tableSize
,
dim
);
CHECK_SYNC
(
"hl_matrix_select_rows failed"
);
CHECK_SYNC
(
"hl_matrix_select_rows failed"
);
}
}
void
hl_matrix_add_to_rows
(
real
*
table
,
int
ldt
,
void
hl_matrix_add_to_rows
(
real
*
table
,
real
*
input
,
int
ldi
,
int
ldt
,
real
*
input
,
int
ldi
,
int
*
ids
,
int
*
ids
,
int
numSamples
,
int
numSamples
,
int
tableSize
,
int
tableSize
,
...
@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
...
@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
dim3
threads
(
128
,
8
);
dim3
threads
(
128
,
8
);
dim3
grid
(
8
,
1
);
dim3
grid
(
8
,
1
);
KeMatrixAddRows
<
128
,
8
,
8
,
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
KeMatrixAddRows
<
128
,
8
,
8
,
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
(
input
,
ldi
,
table
,
ldt
,
ids
,
numSamples
,
tableSize
,
dim
);
input
,
ldi
,
table
,
ldt
,
ids
,
numSamples
,
tableSize
,
dim
);
CHECK_SYNC
(
"hl_matrix_add_to_rows failed"
);
CHECK_SYNC
(
"hl_matrix_add_to_rows failed"
);
}
}
template
<
class
T
,
int
blockDimX
,
int
gridDimX
>
template
<
class
T
,
int
blockDimX
,
int
gridDimX
>
__global__
void
KeVectorSelect
(
T
*
dst
,
int
sized
,
__global__
void
KeVectorSelect
(
const
T
*
src
,
int
sizes
,
T
*
dst
,
int
sized
,
const
T
*
src
,
int
sizes
,
const
int
*
ids
,
int
sizei
)
{
const
int
*
ids
,
int
sizei
)
{
int
idx
=
threadIdx
.
x
+
blockDimX
*
blockIdx
.
x
;
int
idx
=
threadIdx
.
x
+
blockDimX
*
blockIdx
.
x
;
while
(
idx
<
sizei
)
{
while
(
idx
<
sizei
)
{
int
index
=
ids
[
idx
];
int
index
=
ids
[
idx
];
...
@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
...
@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
}

template <class T>
void hl_vector_select_from(
    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
  CHECK_NOTNULL(dst);
  CHECK_NOTNULL(src);
  CHECK_NOTNULL(ids);
...
@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
  dim3 threads(512, 1);
  dim3 grid(8, 1);
  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
      dst, sized, src, sizes, ids, sizei);

  CHECK_SYNC("hl_vector_select_from failed");
}

template void hl_vector_select_from(real* dst,
                                    int sized,
                                    const real* src,
                                    int sizes,
                                    const int* ids,
                                    int sizei);
template void hl_vector_select_from(int* dst,
                                    int sized,
                                    const int* src,
                                    int sizes,
                                    const int* ids,
                                    int sizei);
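// Reference sketch (reviewer note): KeMatrixAddRows serves two entry points.
// With AddRow = 0 (hl_matrix_select_rows) it reads rows of `table` selected by
// `ids` into `output`; with AddRow = 1 (hl_matrix_add_to_rows) it scatters the
// rows of the input back into `table` with atomic adds. Only the AddRow branch
// is visible in this hunk, so the non-atomic branch below (accumulating into
// `output`) is an assumption. A single-threaded sketch with `dim` columns per
// row:
void add_rows_ref(float* output, int ldo, float* table, int ldt,
                  const int* ids, int numSamples, int tableSize, int dim,
                  bool addRow) {
  for (int s = 0; s < numSamples; ++s) {
    int id = ids[s];
    if (id < 0 || id >= tableSize) continue;  // out-of-range ids are skipped
    float* out = output + s * ldo;
    float* tab = table + id * ldt;
    for (int i = 0; i < dim; ++i) {
      if (addRow) tab[i] += out[i];  // hl_matrix_add_to_rows
      else        out[i] += tab[i];  // hl_matrix_select_rows (assumed branch)
    }
  }
}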
paddle/cuda/src/hl_top_k.cu
...
@@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_sparse.ph"
#include "hl_top_k.h"
#include "paddle/utils/Logging.h"

// using namespace hppl;

struct Pair {
  __device__ __forceinline__ Pair() {}

  __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}

  __device__ __forceinline__ void set(real value, int id) {
    v_ = value;
    id_ = id;
  }

  __device__ __forceinline__ void operator=(const Pair& in) {
    v_ = in.v_;
    id_ = in.id_;
  }

  __device__ __forceinline__ bool operator<(const real value) const {
    return (v_ < value);
  }

  __device__ __forceinline__ bool operator<(const Pair& in) const {
    return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
  }

  __device__ __forceinline__ bool operator>(const Pair& in) const {
    return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
  }
...
@@ -58,8 +50,9 @@ struct Pair {
  int id_;
};

__device__ __forceinline__ void addTo(Pair topK[],
                                      const Pair& p,
                                      int beamSize) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
...
@@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) {
  topK[0] = p;
}

template <int beamSize>
__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
...
@@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) {
  topK[0] = p;
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* src, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
...
@@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
...
@@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim,
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
...
@@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col,
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(Pair topK[],
                                        real* val,
                                        int* col,
                                        int idx,
                                        int dim,
                                        const Pair& max,
                                        int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
...
@@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
                                              int& beam,
                                              int beamSize,
                                              real* src,
                                              bool& firstStep,
                                              bool& isEmpty,
                                              Pair& max,
                                              int dim,
                                              const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
...
@@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(
          topK + maxLength - beam, src, tid, dim, max, length);
    }
  }
...
@@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
                                              int& beam,
                                              int beamSize,
                                              real* val,
                                              int* col,
                                              bool& firstStep,
                                              bool& isEmpty,
                                              Pair& max,
                                              int dim,
                                              const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
...
@@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(
          topK + maxLength - beam, val, col, tid, dim, max, length);
    }
  }
...
@@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void blockReduce(Pair* shTopK,
                                            int* maxId,
                                            Pair topK[],
                                            real** topVal,
                                            int** topIds,
                                            int& beam,
                                            int& beamSize,
                                            const int tid,
                                            const int warp) {
  while (true) {
    __syncthreads();
    if (tid < blockSize / 2) {
...
@@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
      }
    }
    __syncthreads();
    for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
      if (tid < stride) {
        if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
          maxId[tid] = maxId[tid + stride];
...
@@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go to the first step, until get the topK value.
 */
template <int maxLength, int blockSize>
__global__ void KeMatrixTopK(real* topVal,
                             int ldv,
                             int* topIds,
                             real* src,
                             int lds,
                             int dim,
                             int beamSize) {
  __shared__ Pair shTopK[blockSize];
...
@@ -271,7 +279,7 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
  topVal += blockIdx.x * ldv;
  topIds += blockIdx.x * beamSize;

  Pair topK[maxLength];  // NOLINT
  int beam = maxLength;
  Pair max;
  bool isEmpty = false;
...
@@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}
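// Reference sketch (reviewer note): KeMatrixTopK assigns one thread block per
// row of `src` and repeatedly merges per-thread candidate lists until beamSize
// values have been emitted per row. The host-side sketch below only describes
// the result it produces per row (top-k values with their column indices, ties
// broken toward the smaller index, matching the Pair ordering above); it is
// not the kernel itself.
#include <algorithm>
#include <utility>
#include <vector>
void matrix_top_k_ref(float* topVal, int ldv, int* topIds,
                      const float* src, int lds, int dim,
                      int beamSize, int numSamples) {
  for (int r = 0; r < numSamples; ++r) {
    std::vector<std::pair<float, int>> row(dim);
    for (int c = 0; c < dim; ++c) row[c] = {src[r * lds + c], c};
    std::partial_sort(row.begin(), row.begin() + beamSize, row.end(),
                      [](const std::pair<float, int>& a,
                         const std::pair<float, int>& b) {
                        return a.first > b.first ||
                               (a.first == b.first && a.second < b.second);
                      });
    for (int k = 0; k < beamSize; ++k) {
      topVal[r * ldv + k] = row[k].first;
      topIds[r * beamSize + k] = row[k].second;
    }
  }
}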
template <int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real* topVal,
                              int ldv,
                              int* topIds,
                              real* val,
                              int* row,
                              int* col,
...
@@ -304,7 +313,7 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
  topVal += blockIdx.x * ldv;
  topIds += blockIdx.x * beamSize;

  Pair topK[maxLength];  // NOLINT
  int beam = maxLength;
  Pair max;
  bool isEmpty = false;
...
@@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}

void hl_matrix_top_k(real* topVal,
                     int ldv,
                     int* topIds,
                     real* src,
                     int lds,
                     int dim,
                     int beamSize,
                     int numSamples) {
...
@@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv,
  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, src, lds, dim, beamSize);

  CHECK_SYNC("hl_matrix_top_k failed");
}

void hl_sparse_matrix_top_k(real* topVal,
                            int ldv,
                            int* topIds,
                            hl_sparse_matrix_s src,
                            int beamSize,
                            int numSamples) {
  CHECK_NOTNULL(topVal);
  CHECK_NOTNULL(topIds);
  CHECK_NOTNULL(src);
  CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";

  hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
  if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
    LOG(FATAL) << "parameter src is null!";
  }

  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);

  CHECK_SYNC("hl_sparse_matrix_top_k failed");
}
...
@@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
* 3. go to the second setp, until one thread's topK value is null;
* 3. go to the second setp, until one thread's topK value is null;
* 4. go to the first setp, until get the topK value.
* 4. go to the first setp, until get the topK value.
*/
*/
template
<
int
maxLength
,
int
blockSize
>
template
<
int
maxLength
,
int
blockSize
>
__global__
void
KeMatrixTopKClassificationError
(
real
*
topVal
,
int
ldv
,
__global__
void
KeMatrixTopKClassificationError
(
real
*
topVal
,
int
*
topIds
,
int
ldv
,
real
*
src
,
int
lds
,
int
*
topIds
,
real
*
src
,
int
lds
,
int
dim
,
int
dim
,
int
beamSize
,
int
beamSize
,
int
*
label
,
int
*
label
,
...
@@ -408,7 +420,7 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
...
@@ -408,7 +420,7 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
topVal
+=
blockIdx
.
x
*
ldv
;
topVal
+=
blockIdx
.
x
*
ldv
;
topIds
+=
blockIdx
.
x
*
beamSize
;
topIds
+=
blockIdx
.
x
*
beamSize
;
Pair
topK
[
maxLength
];
// NOLINT
Pair
topK
[
maxLength
];
// NOLINT
int
beam
=
maxLength
;
int
beam
=
maxLength
;
Pair
max
;
Pair
max
;
bool
isEmpty
=
false
;
bool
isEmpty
=
false
;
...
@@ -420,34 +432,36 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
...
@@ -420,34 +432,36 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
}
}
while
(
beamSize
)
{
while
(
beamSize
)
{
threadGetTopK
<
maxLength
,
blockSize
>
threadGetTopK
<
maxLength
,
blockSize
>
(
(
topK
,
beam
,
beamSize
,
src
,
firstStep
,
isEmpty
,
max
,
dim
,
tid
);
topK
,
beam
,
beamSize
,
src
,
firstStep
,
isEmpty
,
max
,
dim
,
tid
);
shTopK
[
tid
]
=
topK
[
0
];
shTopK
[
tid
]
=
topK
[
0
];
blockReduce
<
maxLength
,
blockSize
>
blockReduce
<
maxLength
,
blockSize
>
(
(
shTopK
,
maxId
,
topK
,
&
topVal
,
&
topIds
,
beam
,
beamSize
,
tid
,
warp
);
shTopK
,
maxId
,
topK
,
&
topVal
,
&
topIds
,
beam
,
beamSize
,
tid
,
warp
);
}
}
__syncthreads
();
__syncthreads
();
if
(
tid
==
0
)
{
if
(
tid
==
0
)
{
for
(
int
i
=
0
;
i
<
topkSize
;
i
++
)
{
for
(
int
i
=
0
;
i
<
topkSize
;
i
++
)
{
if
(
*--
topIds
==
label
[
blockIdx
.
x
])
{
if
(
*--
topIds
==
label
[
blockIdx
.
x
])
{
recResult
[
blockIdx
.
x
]
=
0
;
recResult
[
blockIdx
.
x
]
=
0
;
break
;
break
;
}
}
recResult
[
blockIdx
.
x
]
=
1.0
f
;
recResult
[
blockIdx
.
x
]
=
1.0
f
;
}
}
}
}
}
}
void
hl_matrix_classification_error
(
real
*
topVal
,
int
ldv
,
void
hl_matrix_classification_error
(
real
*
topVal
,
int
*
topIds
,
int
ldv
,
real
*
src
,
int
lds
,
int
*
topIds
,
int
dim
,
real
*
src
,
int
topkSize
,
int
lds
,
int
numSamples
,
int
dim
,
int
*
label
,
int
topkSize
,
real
*
recResult
)
{
int
numSamples
,
int
*
label
,
real
*
recResult
)
{
CHECK_NOTNULL
(
topVal
);
CHECK_NOTNULL
(
topVal
);
CHECK_NOTNULL
(
topIds
);
CHECK_NOTNULL
(
topIds
);
CHECK_NOTNULL
(
src
);
CHECK_NOTNULL
(
src
);
...
@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
...
@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
dim3
threads
(
256
,
1
);
dim3
threads
(
256
,
1
);
dim3
grid
(
numSamples
,
1
);
dim3
grid
(
numSamples
,
1
);
KeMatrixTopKClassificationError
<
5
,
256
>
KeMatrixTopKClassificationError
<
5
,
256
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
topVal
,
ldv
,
topIds
,
src
,
lds
,
dim
,
topkSize
,
label
,
recResult
);
(
topVal
,
ldv
,
topIds
,
src
,
lds
,
dim
,
topkSize
,
label
,
recResult
);
CHECK_SYNC
(
"hl_matrix_top_k classification error failed"
);
CHECK_SYNC
(
"hl_matrix_top_k classification error failed"
);
}
}
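For orientation, here is a minimal host-side sketch of calling the dense entry point above. It is not part of this commit: it assumes Paddle is built with `real` defined as float, declares the prototype locally instead of including hl_matrix.h, and uses plain CUDA runtime allocation; all buffer names are invented.

#include <cuda_runtime.h>

typedef float real;  // assumption: a float build of Paddle

// Prototype as it appears in this diff (normally pulled in from hl_matrix.h).
void hl_matrix_top_k(real* topVal, int ldv, int* topIds, real* src, int lds,
                     int dim, int beamSize, int numSamples);

void topk_example(const real* h_src, int numSamples, int dim, int beamSize) {
  real *d_src = nullptr, *d_topVal = nullptr;
  int* d_topIds = nullptr;
  cudaMalloc(&d_src, numSamples * dim * sizeof(real));
  cudaMalloc(&d_topVal, numSamples * beamSize * sizeof(real));
  cudaMalloc(&d_topIds, numSamples * beamSize * sizeof(int));
  cudaMemcpy(d_src, h_src, numSamples * dim * sizeof(real),
             cudaMemcpyHostToDevice);

  // ldv/lds are the row strides of the output and input matrices; with
  // densely packed rows they equal beamSize and dim respectively.
  hl_matrix_top_k(d_topVal, beamSize, d_topIds, d_src, dim, dim, beamSize,
                  numSamples);
  // Each row of d_topVal now holds the beamSize largest values of the
  // corresponding row of d_src; d_topIds holds the matching column indices.
}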
paddle/framework/attribute.proto
浏览文件 @
59a8ebc6
...
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

syntax = "proto2";
package paddle.framework;

// Attribute Type for paddle's Op.
// Op contains many attributes. Each type of attributes could be different.
// The AttrType will be shared between AttrDesc and AttrProto.
enum AttrType {
  INT = 0;
  FLOAT = 1;
  STRING = 2;
  INTS = 3;
  FLOATS = 4;
  STRINGS = 5;
}
\ No newline at end of file
paddle/framework/op_desc.proto
浏览文件 @
59a8ebc6
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

syntax = "proto2";
package paddle.framework;

import "attribute.proto";
...
@@ -22,14 +22,14 @@ import "attribute.proto";
//
// e.g, for scale=3.0: name=scala, type=AttrType.FLOAT, value=3.0
message AttrDesc {
  required string name = 1;
  required AttrType type = 2;
  optional int32 i = 3;
  optional float f = 4;
  optional string s = 5;
  repeated int32 ints = 6;
  repeated float floats = 7;
  repeated string strings = 8;
};

// Protocol Message to describe an Operator.
...
@@ -42,15 +42,15 @@ message AttrDesc {
// 3rd-party language can build this proto message and call
// AddOp(const OpDesc& op_desc) of Paddle core to create an Operator.
message OpDesc {
  // input names of this Operator.
  repeated string inputs = 1;

  // output names of this Operator.
  repeated string outputs = 2;

  // type of this Operator, such as "add", "sub", "fc".
  required string type = 3;

  // Attributes of this Operator. e.g., scale=3.0 in cosine op.
  repeated AttrDesc attrs = 4;
};
\ No newline at end of file
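As a rough sketch of how a binding might fill this message (not taken from this commit), the following uses the C++ accessors that protoc generates for op_desc.proto to describe a cosine op with scale=3.0, matching the AttrDesc example in the comment above; the op type, variable names, and the generated-header path are assumptions for illustration.

#include "paddle/framework/op_desc.pb.h"  // assumed path of the generated header

paddle::framework::OpDesc MakeCosineOpDesc() {
  paddle::framework::OpDesc desc;
  desc.set_type("cos_sim");  // illustrative op type
  desc.add_inputs("X");
  desc.add_inputs("Y");
  desc.add_outputs("Out");

  // scale=3.0: the name, AttrType tag and the matching value field must agree.
  paddle::framework::AttrDesc* attr = desc.add_attrs();
  attr->set_name("scale");
  attr->set_type(paddle::framework::FLOAT);
  attr->set_f(3.0f);
  return desc;
}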
paddle/framework/op_proto.proto
浏览文件 @
59a8ebc6
...
@@ -15,10 +15,11 @@ limitations under the License. */
// Protocol Message for 3rd-party language binding.
//
// Paddle Python package will use `OpProto` to generate op creation methods.
// The op creation methods take user's input and generate `OpDesc` proto
// message,
// then pass `OpDesc` to C++ side and create Op pointer.
//
syntax = "proto2";
package paddle.framework;

import "attribute.proto";
...
@@ -26,89 +27,90 @@ import "attribute.proto";
// Attribute protocol message for 3rd-party language binding.
// It will store the Op support what attribute and what type.
message AttrProto {
  // Supported attribute name. e.g. `scale` for cosine op.
  required string name = 1;

  // Supported attribute type.
  required AttrType type = 2;

  // Supported attribute comments. It helps 3rd-party language generate
  // doc-string.
  required string comment = 3;

  // If that attribute is generated, it means the Paddle third language
  // binding has responsibility to fill that attribute. End-User should
  // not set that attribute.
  optional bool generated = 4 [default = false];
}

// Input or output message for 3rd-party language binding.
// It contains parameter name and its comments.
message VarProto {
  // Input or output name in that op creation function.
  // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
  required string name = 1;

  // The comment for that input. It helps 3rd-party language generate
  // doc-string.
  required string comment = 2;

  // Is that input/output could be a list or not.
  // If so, that Op should write a attributed named `input_format` or
  // `output_format`.
  //
  // e.g.
  // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W`
  // could be multiple, so the multiple of `X` and `W` is True, and OpDesc
  // will hold a attribute of them.
  //
  // The Op desc of same fc could be
  // {
  //     "type": "fc",
  //     "input": ["X1", "X2", "W1", "W2", "b"],
  //     "output": "fc.out",
  //     "attrs" : {
  //         "input_format": [0, 2, 4, 5]
  //     }
  // }
  //
  optional bool multiple = 3 [default = false];

  // It marks that output is a temporary output. That output is not used by
  // user, but used by other op internally as input. If other op is not use
  // that output, it could be optimized early.
  //
  // Attribute temporary_index will be set in OpDesc if there is some
  // outputs are temporary.
  //
  // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"],
  // attrs = {
  //     "temporary_index": [1]
  // }
  optional bool temporary = 4 [default = false];

  // The gradient of operator can be ignored immediately
  // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
  // can be ignored for the future optimized on graph.
  optional bool ignore_gradient = 6;
}

// Op protocol message for 3rd-party language binding.
// It contains all information for generating op creation method.
message OpProto {
  // The input information to generate op creation method.
  repeated VarProto inputs = 1;

  // The output information to generate op creation method.
  repeated VarProto outputs = 2;

  // The attribute information to generate op creation method.
  repeated AttrProto attrs = 3;

  // The comments for that Op. It helps 3rd-party language generate
  // doc-string. The whole documentation of that Op is generated by comment,
  // inputs, outputs, attrs together.
  required string comment = 4;

  // The type of that Op.
  required string type = 5;
}
paddle/framework/operator.cc
浏览文件 @
59a8ebc6
...
@@ -22,14 +22,14 @@ namespace framework {
template <>
Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
    platform::CPUPlace, Eigen::DefaultDevice>() const {
  return *device_context_->get_eigen_device<Eigen::DefaultDevice>();
}

#ifndef PADDLE_ONLY_CPU
template <>
Eigen::GpuDevice&
ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
  return *device_context_->get_eigen_device<Eigen::GpuDevice>();
}
#endif
...
paddle/framework/operator.h
浏览文件 @
59a8ebc6
...
@@ -174,7 +174,11 @@ class OperatorContext {
  template <typename T>
  T* Output(const size_t index) const {
    auto var = OutputVar(index);
    PADDLE_ENFORCE(
        var != nullptr,
        "Output(%d) not be nullptr, which means variable [%s] does not "
        "exist in scope",
        index, op_.outputs_[index]);
    return var->GetMutable<T>();
  }
...
@@ -252,7 +256,7 @@ struct EigenDeviceConverter<platform::GPUPlace> {
class ExecutionContext : public OperatorContext {
 public:
  ExecutionContext(const OperatorBase* op, const Scope& scope,
                   const platform::DeviceContext* device_context)
      : OperatorContext(op, scope), device_context_(device_context) {}

  template <typename PlaceType,
...
@@ -260,9 +264,9 @@ class ExecutionContext : public OperatorContext {
            typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
  DeviceType& GetEigenDevice() const;

  platform::Place GetPlace() const { return device_context_->GetPlace(); }

  const platform::DeviceContext* device_context_;
};

class OpKernel {
...
@@ -311,7 +315,7 @@ class OperatorWithKernel : public OperatorBase {
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const final {
    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
    opKernel->Compute(ExecutionContext(this, scope, &dev_ctx));
  }

  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
...
浏览文件 @
59a8ebc6
...
@@ -157,22 +157,22 @@ class CPUKernalMultiInputsTest : public OpKernel {
...
@@ -157,22 +157,22 @@ class CPUKernalMultiInputsTest : public OpKernel {
ASSERT_EQ
(
xs
[
2
],
"x2"
);
ASSERT_EQ
(
xs
[
2
],
"x2"
);
auto
inVar0
=
ctx
.
MultiInputVar
(
"xs"
);
auto
inVar0
=
ctx
.
MultiInputVar
(
"xs"
);
ASSERT_EQ
(
inVar0
.
size
(),
3
);
ASSERT_EQ
(
inVar0
.
size
(),
3
U
);
auto
intVar1
=
ctx
.
InputVar
(
"k"
);
auto
intVar1
=
ctx
.
InputVar
(
"k"
);
ASSERT_NE
(
intVar1
,
nullptr
);
ASSERT_NE
(
intVar1
,
nullptr
);
auto
outVar0
=
ctx
.
MultiOutputVar
(
"ys"
);
auto
outVar0
=
ctx
.
MultiOutputVar
(
"ys"
);
ASSERT_EQ
(
outVar0
.
size
(),
2
);
ASSERT_EQ
(
outVar0
.
size
(),
2
U
);
auto
inTensor0
=
ctx
.
MultiInput
<
Tensor
>
(
"xs"
);
auto
inTensor0
=
ctx
.
MultiInput
<
Tensor
>
(
"xs"
);
ASSERT_EQ
(
inTensor0
.
size
(),
3
);
ASSERT_EQ
(
inTensor0
.
size
(),
3
U
);
auto
intTensor1
=
ctx
.
Input
<
Tensor
>
(
"k"
);
auto
intTensor1
=
ctx
.
Input
<
Tensor
>
(
"k"
);
ASSERT_NE
(
intTensor1
,
nullptr
);
ASSERT_NE
(
intTensor1
,
nullptr
);
auto
outTensor0
=
ctx
.
MultiOutput
<
Tensor
>
(
"ys"
);
auto
outTensor0
=
ctx
.
MultiOutput
<
Tensor
>
(
"ys"
);
ASSERT_EQ
(
outTensor0
.
size
(),
2
);
ASSERT_EQ
(
outTensor0
.
size
(),
2
U
);
auto
k
=
ctx
.
op_
.
Input
(
"k"
);
auto
k
=
ctx
.
op_
.
Input
(
"k"
);
ASSERT_EQ
(
k
,
"k0"
);
ASSERT_EQ
(
k
,
"k0"
);
...
...
paddle/function/BlockExpandOpTest.cpp
浏览文件 @
59a8ebc6
...
@@ -18,10 +18,10 @@ limitations under the License. */
namespace paddle {

TEST(BlockExpandForward, real) {
  for (size_t batchSize : {5}) {
    for (size_t channels : {1, 5}) {
      for (size_t inputHeight : {5, 33}) {
        for (size_t inputWidth : {5, 32}) {
          for (size_t block : {1, 3, 5}) {
            for (size_t stride : {1, 2}) {
              for (size_t padding : {0, 1}) {
...
@@ -61,10 +61,10 @@ TEST(BlockExpandForward, real) {
}

TEST(BlockExpandBackward, real) {
  for (size_t batchSize : {5}) {
    for (size_t channels : {1, 5}) {
      for (size_t inputHeight : {5, 33}) {
        for (size_t inputWidth : {5, 32}) {
          for (size_t block : {1, 3, 5}) {
            for (size_t stride : {1, 2}) {
              for (size_t padding : {0, 1}) {
...
paddle/function/BufferArgTest.cpp
浏览文件 @
59a8ebc6
...
@@ -32,7 +32,7 @@ TEST(BufferTest, SequenceIdArg) {
                      sizeOfValuType(VALUE_TYPE_INT32));
  SequenceIdArg buffer(memory.getBuf(), shape);
  EXPECT_EQ(buffer.data(), memory.getBuf());
  EXPECT_EQ(buffer.numSeqs(), 9U);
}

}  // namespace paddle
paddle/function/ContextProjectionOpGpu.cu
浏览文件 @
59a8ebc6
...
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "ContextProjectionOp.h"
#include "hl_base.h"

namespace paddle {
...
@@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input,
  int block_size = blockDim.x;
  int sequenceId = blockIdx.x;
  int seq_start = sequence[sequenceId];
  int seq_end = sequence[sequenceId + 1];
  real value = 0;

  int instances = seq_end - seq_start + context_length - 1;
...
@@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input,
      } else if ((i + context_start) >= (seq_end - seq_start)) {
        if (padding) {
          value =
              weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
                         input_dim +
                     idx];
        } else {
          continue;
        }
...
@@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input,
      int outx = (i - context_length) < 0 ? i : (context_length - 1);
      int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
      real* output_r =
          output + outy * input_dim * context_length + outx * input_dim;
      for (int j = outy; j < seq_end - seq_start; j++) {
        output_r[idx] += value;
        if (j - outy == outx) break;
...
@@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input,
  dim3 grid(blocks_x, blocks_y);

  if (weight) {
    KeContextProjectionForward<true><<<grid, threads, 0, STREAM_DEFAULT>>>(
        input, sequence, weight, output, input_dim,
        context_length, context_start, begin_pad);
  } else {
    KeContextProjectionForward<false><<<grid, threads, 0, STREAM_DEFAULT>>>(
        input, sequence, weight, output, input_dim,
        context_length, context_start, begin_pad);
  }
  CHECK_SYNC("hl_context_projection_forward failed");
}
...
@@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
  int block_size = blockDim.x;
  int sequenceId = blockIdx.x;
  int seq_start = sequence[sequenceId];
  int seq_end = sequence[sequenceId + 1];
  real value = 0;

  int instances = seq_end - seq_start + context_length - 1;
...
@@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
      int outx = (i - context_length) < 0 ? i : (context_length - 1);
      int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
      real* output_r =
          out + outy * input_dim * context_length + outx * input_dim;
      for (int j = outy; j < seq_end - seq_start; j++) {
        value += output_r[idx];
        if (j - outy == outx) break;
...
@@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad,
  int blocks_y = 1;
  dim3 threads(block_size, 1);
  dim3 grid(blocks_x, blocks_y);
  KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad, sequence, input_grad, input_dim, context_length, context_start);
  CHECK_SYNC("hl_context_projection_backward_data failed");
}
...
@@ -231,7 +244,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                     context_start);
}

template <int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                  const int* sequence,
                                                  real* w_grad,
...
@@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
  if (weight_idx < w_dim) {
    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
      int seq_start = sequence[seqId];
      int seq_end = sequence[seqId + 1];
      output_r =
          const_cast<real*>(out_grad) + seq_start * w_dim * context_length;

      if (context_start < 0) {
        if (padId + context_start < 0) {
          instanceId = padId;
        } else {
          // begin_pad > 0;
          instanceId =
              (padId - begin_pad) + (seq_end - seq_start) - context_start;
        }
      } else {
        if (padId + (seq_end - seq_start) < context_start) {
...
@@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
        }
      }

      int outx =
          (instanceId - context_length) < 0 ? instanceId : (context_length - 1);
      int outy = (instanceId - context_length) < 0
                     ? 0
                     : (instanceId - (context_length - 1));
      output_r += outy * w_dim * context_length + outx * w_dim;
      for (int j = outy; j < seq_end - seq_start; j++) {
        value += output_r[weight_idx];
...
@@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
  }
  __syncthreads();

  for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
    if (idy < stride) {
      sum_s[idy][idx] += sum_s[idy + stride][idx];
    }
...
@@ -339,22 +353,27 @@ void hl_context_projection_backward_weight(const real* out_grad,
  dim3 threads(threads_x, threads_y);
  dim3 grid(blocks_x, 1);

  KeContextProjectionBackwardWeight<32,
                                    32><<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad, sequence, w_grad, num_sequences, w_dim,
      context_length, context_start, begin_pad);
  CHECK_SYNC("hl_context_projection_backward_weight failed");
}

template <>
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
    const GpuMatrix& out_grad,
    GpuMatrix& w_grad,
    const GpuIVector& seq_vec,
    size_t context_length,
    int context_start,
    size_t total_pad,
    size_t begin_pad) {
  hl_context_projection_backward_weight(out_grad.getData(),
                                        seq_vec.getData(),
                                        w_grad.getData(),
...
@@ -376,23 +395,18 @@ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                size_t begin_pad,
                                                bool is_padding,
                                                size_t total_pad) {
  if (in_grad) {
    ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
        out_grad, in_grad, sequence, context_length, context_start);
  }
  if (is_padding && w_grad) {
    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
                                                     w_grad,
                                                     sequence,
                                                     context_length,
                                                     context_start,
                                                     total_pad,
                                                     begin_pad);
  }
}
...
paddle/function/CosSimOpGpu.cu
浏览文件 @
59a8ebc6
...
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CosSimOp.h"
#include "hl_base.h"
#include "hl_device_functions.cuh"

namespace paddle {

template <int block_size>
__global__ void KeCosSim(real* output,
                         const real* input1,
                         const real* input2,
...
@@ -78,8 +78,8 @@ void hlCossim(real* output,
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);

  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, input1, input2, width, input1_height, input2_height, scale);
  CHECK_SYNC("hlCossim failed");
}
...
@@ -99,7 +99,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
  hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
}

template <int block_size>
__global__ void KeCosSimDerivative(const real* grad,
                                   const real* output,
                                   const real* prev_out_x,
...
@@ -148,14 +148,13 @@ __global__ void KeCosSimDerivative(const real* grad,
  if (xy[0] == 0) {
    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
    for (int index = tid; index < width; index += block_size) {
      prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal;
      if (input2_height > 1) {
        prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal;
      } else {
        paddle::paddleAtomicAdd(
            prev_grad_y + index,
            scale * grad[ty] * prev_out_x[index] * reciprocal);
      }
    }
  } else {
...
@@ -163,17 +162,18 @@ __global__ void KeCosSimDerivative(const real* grad,
    real reciprocalSquareSumX = 1.0 / xx[0];
    real reciprocalSquareSumY = 1.0 / yy[0];
    for (int index = tid; index < width; index += block_size) {
      prev_grad_x[index] +=
          output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY -
                                   prev_out_x[index] * reciprocalSquareSumX);
      if (input2_height > 1) {
        prev_grad_y[index] +=
            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
                                     prev_out_y[index] * reciprocalSquareSumY);
      } else {
        paddle::paddleAtomicAdd(
            prev_grad_y + index,
            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
                                     prev_out_y[index] * reciprocalSquareSumY));
      }
    }
  }
...
@@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad,
  const int block_size = 256;
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);
  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y,
      width, input1_height, input2_height, scale);
  CHECK_SYNC("hlCossimDerivate failed");
}
...
@@ -214,9 +222,9 @@ void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                     real scale) {
  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ &&
        in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
      << "Matrix types are not equally GPU";

  size_t dim = in1_val.getWidth();
  const real* grad = out_grad.getData();
...
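For reference, a plain-CPU restatement of what the forward path above computes for one row pair. This is a sketch inferred from the xx/yy/xy terms used by the derivative kernel, under the assumption that `scale` multiplies the ratio; it is not code from this commit.

#include <cmath>
#include <vector>

// Row-wise cosine similarity, scaled:
//   out = scale * dot(x, y) / (sqrt(dot(x, x)) * sqrt(dot(y, y)))
float CosSimRow(const std::vector<float>& x, const std::vector<float>& y,
                float scale) {
  float xy = 0, xx = 0, yy = 0;
  for (size_t i = 0; i < x.size(); ++i) {
    xy += x[i] * y[i];
    xx += x[i] * x[i];
    yy += y[i] * y[i];
  }
  return scale * xy / (std::sqrt(xx) * std::sqrt(yy));
}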
paddle/function/CropOpGpu.cu
浏览文件 @
59a8ebc6
...
@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CropOp.h"
#include "hl_base.h"

namespace paddle {

__global__ void KeCrop(real* outputs,
                       const real* inputs,
                       int inC,
                       int inH,
                       int inW,
                       int cropC,
                       int cropH,
                       int cropW,
                       int outC,
                       int outH,
                       int outW,
                       int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % outW;
...
@@ -35,12 +43,12 @@ __global__ void KeCrop(real* outputs, const real* inputs,
template <>
void Crop<DEVICE_TYPE_GPU>(real* outputs,
                           const real* inputs,
                           const TensorShape inShape,
                           const TensorShape outShape,
                           const FuncConfig& conf) {
  std::vector<uint32_t> crop_corner =
      conf.get<std::vector<uint32_t>>("crop_corner");
  int cropC = crop_corner[1];
  int cropH = crop_corner[2];
  int cropW = crop_corner[3];
...
@@ -58,16 +66,33 @@ void Crop<DEVICE_TYPE_GPU>(real* outputs,
  int blockSize = 1024;
  int gridSize = (nth + blockSize - 1) / blockSize;
  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      outputs, inputs, inC, inH, inW, cropC, cropH, cropW,
      outC, outH, outW, nth);
  CHECK_SYNC("Crop");
}

__global__ void KeCropDiff(const real* inGrad,
                           real* outGrad,
                           int inC,
                           int inH,
                           int inW,
                           int cropC,
                           int cropH,
                           int cropW,
                           int outC,
                           int outH,
                           int outW,
                           int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % inW;
...
@@ -84,12 +109,12 @@ __global__ void KeCropDiff(const real* inGrad, real* outGrad,
template <>
void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
                               real* outGrad,
                               const TensorShape inShape,
                               const TensorShape outShape,
                               const FuncConfig& conf) {
  std::vector<uint32_t> crop_corner =
      conf.get<std::vector<uint32_t>>("crop_corner");
  int cropC = crop_corner[1];
  int cropH = crop_corner[2];
  int cropW = crop_corner[3];
...
@@ -107,9 +132,18 @@ void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
  int blockSize = 1024;
  int gridSize = (nth + blockSize - 1) / blockSize;
  KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      inGrad, outGrad, inC, inH, inW, cropC, cropH, cropW,
      outC, outH, outW, nth);
  CHECK_SYNC("CropGrad");
}
...
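The index arithmetic in KeCrop is easier to read in serial form. Below is a rough CPU equivalent for a single sample in CHW layout, inferred from the parameter names only (the kernel body is elided in this hunk, so the exact mapping is an assumption, and the function name is invented): each output element (c, h, w) is copied from the input at (c + cropC, h + cropH, w + cropW).

// Serial sketch of the crop mapping, single sample, CHW layout.
void CropCpu(float* out, const float* in, int inH, int inW, int cropC,
             int cropH, int cropW, int outC, int outH, int outW) {
  for (int c = 0; c < outC; ++c)
    for (int h = 0; h < outH; ++h)
      for (int w = 0; w < outW; ++w)
        out[(c * outH + h) * outW + w] =
            in[((c + cropC) * inH + (h + cropH)) * inW + (w + cropW)];
}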
paddle/function/CrossMapNormalOpGpu.cu
浏览文件 @
59a8ebc6
...
@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CrossMapNormalOp.h"
#include "hl_base.h"

namespace paddle {

__global__ void KeCMRNormFillScale(size_t imageSize,
                                   const real* in,
                                   real* scale,
                                   size_t channels,
                                   size_t height,
                                   size_t width,
                                   size_t size,
                                   real alpha) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < imageSize) {
...
@@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
  }
}

__global__ void KeCMRNormOutput(size_t inputSize,
                                const real* in,
                                const real* scale,
                                real negative_beta,
                                real* out) {
  const int index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index < inputSize) {
...
@@ -74,24 +80,30 @@ void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
  size_t imageSize = numSamples * height * width;
  int blockSize = 1024;
  int gridSize = (imageSize + 1024 - 1) / 1024;
  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      imageSize, inputs, denoms, channels, height, width, size, scale);

  size_t inputSize = numSamples * height * width * channels;
  blockSize = 1024;
  gridSize = (inputSize + 1024 - 1) / 1024;
  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      inputSize, inputs, denoms, -pow, outputs);

  CHECK_SYNC("CrossMapNormal");
}

__global__ void KeCMRNormDiff(size_t imageSize,
                              const real* bottom_data,
                              const real* top_data,
                              const real* scale,
                              const real* top_diff,
                              size_t channels,
                              size_t height,
                              size_t width,
                              size_t size,
                              real negative_beta,
                              real cache_ratio,
                              real* bottom_diff) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < imageSize) {
    const int w = idx % width;
...
@@ -113,17 +125,17 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
    while (index < channels + post_pad) {
      if (index < channels) {
        accum += top_diff[index * step] * top_data[index * step] /
                 scale[index * step];
      }
      if (index >= size) {
        accum -= top_diff[(index - size) * step] *
                 top_data[(index - size) * step] / scale[(index - size) * step];
      }
      if (index >= post_pad) {
        bottom_diff[(index - post_pad) * step] +=
            top_diff[(index - post_pad) * step] *
                pow(scale[(index - post_pad) * step], negative_beta) -
            cache_ratio * bottom_data[(index - post_pad) * step] * accum;
      }
      ++index;
    }
...
@@ -147,9 +159,18 @@ void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad,
  int blockSize = 1024;
  int gridSize = (imageSize + 1024 - 1) / 1024;
  KeCMRNormDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels,
      height, width, size, -pow, 2.0f * pow * scale, inputsGrad);
  CHECK_SYNC("CrossMapNormalGrad");
}
...
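To connect the two forward kernels above: KeCMRNormFillScale writes the per-element denominator into `denoms`, and KeCMRNormOutput applies it with the negative exponent (-pow). A scalar sketch of the usual cross-map (local response) normalization they implement, under the assumption that the denominator is 1 + alpha times the windowed sum of squares and that the channel values are passed contiguously here for simplicity:

#include <cmath>

// For one spatial position (n, h, w) and channel c, with a window of `size`
// channels around c:
//   scale(c) = 1 + alpha * sum_{k in window} in(k)^2
//   out(c)   = in(c) * pow(scale(c), -beta)
float CrossMapNormalizeOne(const float* in, int channels, int c, int size,
                           float alpha, float beta) {
  int start = c - size / 2;
  float sum = 0;
  for (int k = 0; k < size; ++k) {
    int ch = start + k;
    if (ch >= 0 && ch < channels) sum += in[ch] * in[ch];
  }
  float scale = 1.0f + alpha * sum;
  return in[c] * std::pow(scale, -beta);
}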
paddle/function/CrossMapNormalOpTest.cpp
浏览文件 @
59a8ebc6
...
@@ -18,11 +18,11 @@ limitations under the License. */
namespace paddle {

TEST(CrossMapNormal, real) {
  for (size_t numSamples : {5}) {
    for (size_t channels : {1, 5}) {
      for (size_t imgSizeH : {5, 33}) {
        for (size_t imgSizeW : {5, 32}) {
          for (size_t size : {1, 3}) {
            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                    << " size=" << size;
...
@@ -48,11 +48,11 @@ TEST(CrossMapNormal, real) {
}

TEST(CrossMapNormalGrad, real) {
  for (size_t numSamples : {5}) {
    for (size_t channels : {1, 5}) {
      for (size_t imgSizeH : {5, 33}) {
        for (size_t imgSizeW : {5, 32}) {
          for (size_t size : {1, 3}) {
            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                    << " size=" << size;
...
paddle/function/DepthwiseConvOpGpu.cu
浏览文件 @
59a8ebc6
...
@@ -20,17 +20,25 @@ namespace paddle {
// CUDA kernel to compute the depthwise convolution forward pass
template <class T>
__global__ void ConvolutionDepthwiseForward(const int nthreads,
                                            const T* const inputData,
                                            const T* const filterData,
                                            const int batchSize,
                                            const int outputChannels,
                                            const int outputHeight,
                                            const int outputWidth,
                                            const int inputChannels,
                                            const int inputHeight,
                                            const int inputWidth,
                                            const int filterMultiplier,
                                            const int filterHeight,
                                            const int filterWidth,
                                            const int strideH,
                                            const int strideW,
                                            const int paddingH,
                                            const int paddingW,
                                            T* const outputData) {
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if (index < nthreads) {
    const int batch = index / outputChannels / outputHeight / outputWidth;
...
@@ -45,32 +53,36 @@ void ConvolutionDepthwiseForward(const int nthreads,
    const int w_in_start = -paddingW + w_out * strideW;
    const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
    const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
    if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) &&
        (w_in_end < inputWidth)) {
      for (int kh = 0; kh < filterHeight; ++kh) {
        for (int kw = 0; kw < filterWidth; ++kw) {
          const int h_in = -paddingH + h_out * strideH + kh;
          const int w_in = -paddingW + w_out * strideW + kw;
          const int offset =
              ((batch * inputChannels + c_in) * inputHeight + h_in) *
                  inputWidth +
              w_in;
          value += (*weight) * inputData[offset];
          ++weight;
        }
      }
    } else {
      for (int kh = 0; kh < filterHeight; ++kh) {
        for (int kw = 0; kw < filterWidth; ++kw) {
          const int h_in = -paddingH + h_out * strideH + kh;
          const int w_in = -paddingW + w_out * strideW + kw;
          if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
              (w_in < inputWidth)) {
            const int offset =
                ((batch * inputChannels + c_in) * inputHeight + h_in) *
                    inputWidth +
                w_in;
            value += (*weight) * inputData[offset];
          }
          ++weight;
        }
      }
    }
    outputData[index] = value;
  }
...
@@ -78,16 +90,25 @@ void ConvolutionDepthwiseForward(const int nthreads,
// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <class T>
__global__ void ConvolutionDepthwiseInputBackward(const int nthreads,
                                                  const T* const top_diff,
                                                  const T* const weight_data,
                                                  const int num,
                                                  const int outputChannels,
                                                  const int outputHeight,
                                                  const int outputWidth,
                                                  const int inputChannels,
                                                  const int inputHeight,
                                                  const int inputWidth,
                                                  const int filterMultiplier,
                                                  const int filterHeight,
                                                  const int filterWidth,
                                                  const int strideH,
                                                  const int strideW,
                                                  const int paddingH,
                                                  const int paddingW,
                                                  T* const bottom_diff) {
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    const int batch = index / inputChannels / inputHeight / inputWidth;
    const int c_in = (index / inputHeight / inputWidth) % inputChannels;
...
@@ -96,65 +117,80 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
    const int c_out_start = c_in * filterMultiplier;

    int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH;
    h_out_start = 0 > h_out_start ? 0 : h_out_start;
    int h_out_end = (h_in + paddingH) / strideH;
    h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end;
    int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW;
    w_out_start = 0 > w_out_start ? 0 : w_out_start;
    int w_out_end = (w_in + paddingW) / strideW;
    w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end;

    T value = 0;

    for
(
int
c_out
=
c_out_start
;
for
(
int
c_out
=
c_out_start
;
c_out
<
c_out_start
+
filterMultiplier
;
c_out
<
c_out_start
+
filterMultiplier
;
c_out
++
)
{
c_out
++
)
{
for
(
int
h_out
=
h_out_start
;
h_out
<=
h_out_end
;
++
h_out
)
{
for
(
int
h_out
=
h_out_start
;
h_out
<=
h_out_end
;
++
h_out
)
{
const
int
filter_h
=
h_in
+
paddingH
-
h_out
*
strideH
;
const
int
filter_h
=
h_in
+
paddingH
-
h_out
*
strideH
;
for
(
int
w_out
=
w_out_start
;
w_out
<=
w_out_end
;
++
w_out
)
{
for
(
int
w_out
=
w_out_start
;
w_out
<=
w_out_end
;
++
w_out
)
{
const
int
filter_w
=
w_in
+
paddingW
-
w_out
*
strideW
;
const
int
filter_w
=
w_in
+
paddingW
-
w_out
*
strideW
;
const
int
filter_offset
=
c_out
*
filterHeight
*
filterWidth
const
int
filter_offset
=
c_out
*
filterHeight
*
filterWidth
+
+
filter_h
*
filterWidth
+
filter_w
;
filter_h
*
filterWidth
+
filter_w
;
const
int
top_diff_offset
=
((
batch
*
outputChannels
+
c_out
)
*
const
int
top_diff_offset
=
outputHeight
+
h_out
)
*
outputWidth
+
w_out
;
((
batch
*
outputChannels
+
c_out
)
*
outputHeight
+
h_out
)
*
value
+=
top_diff
[
top_diff_offset
]
*
weight_data
[
filter_offset
];
outputWidth
+
}
w_out
;
value
+=
top_diff
[
top_diff_offset
]
*
weight_data
[
filter_offset
];
}
}
}
}
}
bottom_diff
[
index
]
+=
value
;
bottom_diff
[
index
]
+=
value
;
}
}
}
}
// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
template
<
class
T
>
template
<
class
T
>
__global__
__global__
void
ConvolutionDepthwiseFilterBackward
(
const
int
num_i
,
void
ConvolutionDepthwiseFilterBackward
(
const
int
num_i
,
const
int
nthreads
,
const
int
nthreads
,
const
T
*
const
top_diff
,
const
T
*
const
inputData
,
const
T
*
const
top_diff
,
const
int
num
,
const
int
outputChannels
,
const
int
outputHeight
,
const
T
*
const
inputData
,
const
int
outputWidth
,
const
int
inputChannels
,
const
int
inputHeight
,
const
int
num
,
const
int
inputWidth
,
const
int
filterMultiplier
,
const
int
filterHeight
,
const
int
outputChannels
,
const
int
filterWidth
,
const
int
strideH
,
const
int
strideW
,
const
int
outputHeight
,
const
int
paddingH
,
const
int
paddingW
,
T
*
const
buffer_data
)
{
const
int
outputWidth
,
int
index
=
const
int
inputChannels
,
(
blockIdx
.
x
*
gridDim
.
y
+
blockIdx
.
y
)
*
blockDim
.
x
+
threadIdx
.
x
;
const
int
inputHeight
,
const
int
inputWidth
,
const
int
filterMultiplier
,
const
int
filterHeight
,
const
int
filterWidth
,
const
int
strideH
,
const
int
strideW
,
const
int
paddingH
,
const
int
paddingW
,
T
*
const
buffer_data
)
{
int
index
=
(
blockIdx
.
x
*
gridDim
.
y
+
blockIdx
.
y
)
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
index
<
nthreads
)
{
if
(
index
<
nthreads
)
{
const
int
h_out
=
(
index
/
outputWidth
)
%
outputHeight
;
const
int
h_out
=
(
index
/
outputWidth
)
%
outputHeight
;
const
int
w_out
=
index
%
outputWidth
;
const
int
w_out
=
index
%
outputWidth
;
const
int
kh
=
(
index
/
filterWidth
/
outputHeight
/
outputWidth
)
const
int
kh
=
%
filterHeight
;
(
index
/
filterWidth
/
outputHeight
/
outputWidth
)
%
filterHeight
;
const
int
kw
=
(
index
/
outputHeight
/
outputWidth
)
%
filterWidth
;
const
int
kw
=
(
index
/
outputHeight
/
outputWidth
)
%
filterWidth
;
const
int
h_in
=
-
paddingH
+
h_out
*
strideH
+
kh
;
const
int
h_in
=
-
paddingH
+
h_out
*
strideH
+
kh
;
const
int
w_in
=
-
paddingW
+
w_out
*
strideW
+
kw
;
const
int
w_in
=
-
paddingW
+
w_out
*
strideW
+
kw
;
if
((
h_in
>=
0
)
&&
(
h_in
<
inputHeight
)
if
((
h_in
>=
0
)
&&
(
h_in
<
inputHeight
)
&&
(
w_in
>=
0
)
&&
&&
(
w_in
>=
0
)
&&
(
w_in
<
inputWidth
))
{
(
w_in
<
inputWidth
))
{
const
int
c_out
=
index
/
const
int
c_out
=
(
filterHeight
*
filterWidth
*
outputHeight
*
outputWidth
);
index
/
(
filterHeight
*
filterWidth
*
outputHeight
*
outputWidth
);
const
int
c_in
=
c_out
/
filterMultiplier
;
const
int
c_in
=
c_out
/
filterMultiplier
;
const
int
batch
=
num_i
;
const
int
batch
=
num_i
;
const
int
top_offset
=
((
batch
*
outputChannels
+
c_out
)
*
const
int
top_offset
=
outputHeight
+
h_out
)
*
outputWidth
+
w_out
;
((
batch
*
outputChannels
+
c_out
)
*
outputHeight
+
h_out
)
*
const
int
bottom_offset
=
((
batch
*
inputChannels
+
c_in
)
outputWidth
+
*
inputHeight
+
h_in
)
*
inputWidth
+
w_in
;
w_out
;
const
int
bottom_offset
=
((
batch
*
inputChannels
+
c_in
)
*
inputHeight
+
h_in
)
*
inputWidth
+
w_in
;
buffer_data
[
index
]
=
top_diff
[
top_offset
]
*
inputData
[
bottom_offset
];
buffer_data
[
index
]
=
top_diff
[
top_offset
]
*
inputData
[
bottom_offset
];
}
else
{
}
else
{
buffer_data
[
index
]
=
0
;
buffer_data
[
index
]
=
0
;
...
@@ -163,170 +199,169 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
...
@@ -163,170 +199,169 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
}
}
template
<
class
T
>
template
<
class
T
>
class
DepthwiseConvFunctor
<
DEVICE_TYPE_GPU
,
T
>
{
class
DepthwiseConvFunctor
<
DEVICE_TYPE_GPU
,
T
>
{
public:
public:
void
operator
()(
const
T
*
inputData
,
void
operator
()(
const
T
*
inputData
,
const
T
*
filterData
,
const
T
*
filterData
,
int
batchSize
,
int
batchSize
,
int
outputChannels
,
int
outputChannels
,
int
outputHeight
,
int
outputHeight
,
int
outputWidth
,
int
outputWidth
,
int
inputChannels
,
int
inputChannels
,
int
inputHeight
,
int
inputHeight
,
int
inputWidth
,
int
inputWidth
,
int
filterMultiplier
,
int
filterMultiplier
,
int
filterHeight
,
int
filterHeight
,
int
filterWidth
,
int
filterWidth
,
int
strideH
,
int
strideH
,
int
strideW
,
int
strideW
,
int
paddingH
,
int
paddingH
,
int
paddingW
,
int
paddingW
,
T
*
outputData
)
{
T
*
outputData
)
{
int
outputSize
=
batchSize
*
outputChannels
*
outputHeight
*
outputWidth
;
int
outputSize
=
batchSize
*
outputChannels
*
outputHeight
*
outputWidth
;
size_t
blocks
=
(
outputSize
+
1024
-
1
)
/
1024
;
size_t
blocks
=
(
outputSize
+
1024
-
1
)
/
1024
;
size_t
blockX
=
512
;
size_t
blockX
=
512
;
size_t
blockY
=
(
blocks
+
512
-
1
)
/
512
;
size_t
blockY
=
(
blocks
+
512
-
1
)
/
512
;
dim3
threads
(
1024
,
1
);
dim3
threads
(
1024
,
1
);
dim3
grid
(
blockX
,
blockY
);
dim3
grid
(
blockX
,
blockY
);
ConvolutionDepthwiseForward
<
T
>
ConvolutionDepthwiseForward
<
T
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
outputSize
,
outputSize
,
inputData
,
inputData
,
filterData
,
filterData
,
batchSize
,
batchSize
,
outputChannels
,
outputChannels
,
outputHeight
,
outputHeight
,
outputWidth
,
outputWidth
,
inputChannels
,
inputChannels
,
inputHeight
,
inputHeight
,
inputWidth
,
inputWidth
,
filterMultiplier
,
filterMultiplier
,
filterHeight
,
filterHeight
,
filterWidth
,
filterWidth
,
strideH
,
strideH
,
strideW
,
strideW
,
paddingH
,
paddingH
,
paddingW
,
paddingW
,
outputData
);
outputData
);
}
}
};
};
template
<
class
T
>
template
<
class
T
>
class
DepthwiseConvGradInputFunctor
<
DEVICE_TYPE_GPU
,
T
>
{
class
DepthwiseConvGradInputFunctor
<
DEVICE_TYPE_GPU
,
T
>
{
public:
public:
void
operator
()(
const
T
*
outputGrad
,
void
operator
()(
const
T
*
outputGrad
,
const
T
*
filterData
,
const
T
*
filterData
,
int
batchSize
,
int
batchSize
,
int
outputChannels
,
int
outputChannels
,
int
outputHeight
,
int
outputHeight
,
int
outputWidth
,
int
outputWidth
,
int
inputChannels
,
int
inputChannels
,
int
inputHeight
,
int
inputHeight
,
int
inputWidth
,
int
inputWidth
,
int
filterMultiplier
,
int
filterMultiplier
,
int
filterHeight
,
int
filterHeight
,
int
filterWidth
,
int
filterWidth
,
int
strideH
,
int
strideH
,
int
strideW
,
int
strideW
,
int
paddingH
,
int
paddingH
,
int
paddingW
,
int
paddingW
,
T
*
inputGrad
)
{
T
*
inputGrad
)
{
int
inputSize
=
batchSize
*
inputChannels
*
inputHeight
*
inputWidth
;
int
inputSize
=
batchSize
*
inputChannels
*
inputHeight
*
inputWidth
;
size_t
blocks
=
(
inputSize
+
1024
-
1
)
/
1024
;
size_t
blocks
=
(
inputSize
+
1024
-
1
)
/
1024
;
size_t
blockX
=
512
;
size_t
blockX
=
512
;
size_t
blockY
=
(
blocks
+
512
-
1
)
/
512
;
size_t
blockY
=
(
blocks
+
512
-
1
)
/
512
;
dim3
threads
(
1024
,
1
);
dim3
threads
(
1024
,
1
);
dim3
grid
(
blockX
,
blockY
);
dim3
grid
(
blockX
,
blockY
);
ConvolutionDepthwiseInputBackward
<
T
>
ConvolutionDepthwiseInputBackward
<
T
>
// NOLINT_NEXT_LINE(whitespace/operators)
// NOLINT_NEXT_LINE(whitespace/operators)
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
inputSize
,
inputSize
,
outputGrad
,
outputGrad
,
filterData
,
filterData
,
batchSize
,
batchSize
,
outputChannels
,
outputChannels
,
outputHeight
,
outputHeight
,
outputWidth
,
outputWidth
,
inputChannels
,
inputChannels
,
inputHeight
,
inputHeight
,
inputWidth
,
inputWidth
,
filterMultiplier
,
filterMultiplier
,
filterHeight
,
filterHeight
,
filterWidth
,
filterWidth
,
strideH
,
strideH
,
strideW
,
strideW
,
paddingH
,
paddingH
,
paddingW
,
paddingW
,
inputGrad
);
inputGrad
);
}
}
};
};
template
<
class
T
>
template
<
class
T
>
class
DepthwiseConvGradFilterFunctor
<
DEVICE_TYPE_GPU
,
T
>
{
class
DepthwiseConvGradFilterFunctor
<
DEVICE_TYPE_GPU
,
T
>
{
public:
public:
void
operator
()(
const
T
*
outputGrad
,
void
operator
()(
const
T
*
outputGrad
,
const
T
*
inputData
,
const
T
*
inputData
,
int
batchSize
,
int
batchSize
,
int
outputChannels
,
int
outputChannels
,
int
outputHeight
,
int
outputHeight
,
int
outputWidth
,
int
outputWidth
,
int
inputChannels
,
int
inputChannels
,
int
inputHeight
,
int
inputHeight
,
int
inputWidth
,
int
inputWidth
,
int
filterMultiplier
,
int
filterMultiplier
,
int
filterHeight
,
int
filterHeight
,
int
filterWidth
,
int
filterWidth
,
int
strideH
,
int
strideH
,
int
strideW
,
int
strideW
,
int
paddingH
,
int
paddingH
,
int
paddingW
,
int
paddingW
,
T
*
colData
,
T
*
colData
,
T
*
filterGrad
)
{
T
*
filterGrad
)
{
int
colDataSize
=
outputChannels
*
filterHeight
*
filterWidth
int
colDataSize
=
outputChannels
*
filterHeight
*
filterWidth
*
*
outputHeight
*
outputWidth
;
outputHeight
*
outputWidth
;
size_t
blocks
=
(
colDataSize
+
1024
-
1
)
/
1024
;
size_t
blocks
=
(
colDataSize
+
1024
-
1
)
/
1024
;
size_t
blockX
=
512
;
size_t
blockX
=
512
;
size_t
blockY
=
(
blocks
+
512
-
1
)
/
512
;
size_t
blockY
=
(
blocks
+
512
-
1
)
/
512
;
dim3
threads
(
1024
,
1
);
dim3
threads
(
1024
,
1
);
dim3
grid
(
blockX
,
blockY
);
dim3
grid
(
blockX
,
blockY
);
BaseMatrix
filterGradMatrix
(
outputChannels
*
filterHeight
*
filterWidth
,
BaseMatrix
filterGradMatrix
(
outputChannels
*
filterHeight
*
filterWidth
,
1
,
filterGrad
,
false
,
true
);
1
,
filterGrad
,
false
,
true
);
for
(
int
i
=
0
;
i
<
batchSize
;
i
++
)
{
for
(
int
i
=
0
;
i
<
batchSize
;
i
++
)
{
ConvolutionDepthwiseFilterBackward
<
T
>
ConvolutionDepthwiseFilterBackward
<
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
T
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
i
,
i
,
colDataSize
,
colDataSize
,
outputGrad
,
outputGrad
,
inputData
,
inputData
,
batchSize
,
batchSize
,
outputChannels
,
outputChannels
,
outputHeight
,
outputHeight
,
outputWidth
,
outputWidth
,
inputChannels
,
inputChannels
,
inputHeight
,
inputHeight
,
inputWidth
,
inputWidth
,
filterMultiplier
,
filterMultiplier
,
filterHeight
,
filterHeight
,
filterWidth
,
filterWidth
,
strideH
,
strideH
,
strideW
,
strideW
,
paddingH
,
paddingH
,
paddingW
,
paddingW
,
colData
);
colData
);
int
K
=
outputHeight
*
outputWidth
;
int
K
=
outputHeight
*
outputWidth
;
int
M
=
colDataSize
/
K
;
int
M
=
colDataSize
/
K
;
BaseMatrix
colMatrix
(
M
,
K
,
colData
,
false
,
true
);
BaseMatrix
colMatrix
(
M
,
K
,
colData
,
false
,
true
);
filterGradMatrix
.
sumRows
(
colMatrix
,
(
T
)
1.0
,
(
T
)
1.0
);
filterGradMatrix
.
sumRows
(
colMatrix
,
(
T
)
1.0
,
(
T
)
1.0
);
}
}
}
}
};
};
#ifdef PADDLE_TYPE_DOUBLE
#ifdef PADDLE_TYPE_DOUBLE
...
...
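Note on the launch configuration used by all three functors above: the flat element count is spread over a two-dimensional grid (blockX x blockY blocks of 1024 threads), and each thread recovers its logical position with (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x. The sketch below, which is not part of the commit and uses a hypothetical helper name, only illustrates how that flat index decomposes back into NCHW coordinates for the forward kernel:

// Host-side illustration of the index decomposition assumed by
// ConvolutionDepthwiseForward (NCHW layout, w fastest-varying).
#include <cstdio>

void decodeOutputIndex(int index, int outputChannels, int outputHeight, int outputWidth) {
  int w_out = index % outputWidth;
  int h_out = (index / outputWidth) % outputHeight;
  int c_out = (index / outputWidth / outputHeight) % outputChannels;
  int batch = index / outputWidth / outputHeight / outputChannels;
  std::printf("n=%d c=%d h=%d w=%d\n", batch, c_out, h_out, w_out);
}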
paddle/function/FunctionTest.cpp  View file @ 59a8ebc6
...
@@ -24,14 +24,14 @@ void FunctionApi(typename Tensor<real, DType>::Matrix& output,
 template <>
 void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 100);
-  EXPECT_EQ(output.getWidth(), 200);
+  EXPECT_EQ(output.getHeight(), 100U);
+  EXPECT_EQ(output.getWidth(), 200U);
 }

 template <>
 void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 10);
-  EXPECT_EQ(output.getWidth(), 20);
+  EXPECT_EQ(output.getHeight(), 10U);
+  EXPECT_EQ(output.getWidth(), 20U);
 }

 template <DeviceType DType>
...
@@ -85,14 +85,14 @@ void testBufferArgs(const BufferArgs& inputs,
 }

 void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
-  EXPECT_EQ(inputs.size(), 1);
+  EXPECT_EQ(inputs.size(), 1U);
   check(inputs[0]);
 }

 TEST(Arguments, Matrix) {
   MatrixPtr matrix = Matrix::create(100, 200);
   CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape().ndims(), 2U);
     EXPECT_EQ(arg.shape()[0], 100);
     EXPECT_EQ(arg.shape()[1], 200);
     EXPECT_EQ(arg.data(), matrix->getData());
...
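The only change in these tests is the unsigned suffix on the expected literals. EXPECT_EQ compares its two arguments with operator==, and because getHeight(), getWidth(), size() and ndims() return unsigned types, comparing them against a plain int literal trips -Wsign-compare on strict builds. A minimal standalone sketch of the pattern (not Paddle code, just an illustration):

#include <gtest/gtest.h>
#include <vector>

TEST(SignCompare, UnsignedLiteral) {
  std::vector<int> v(100);
  // v.size() is size_t; the U suffix keeps both operands unsigned,
  // so the comparison compiles cleanly when warnings are treated as errors.
  EXPECT_EQ(v.size(), 100U);
}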
paddle/function/Im2ColOpGpu.cu  View file @ 59a8ebc6
...
@@ -17,16 +17,21 @@ limitations under the License. */
namespace paddle {

template <class T>
__global__ void im2col(const T* data_im, int numOuts, int height, int width,
                       int blockH, int blockW, int strideH, int strideW,
                       int paddingH, int paddingW, int height_col,
                       int width_col, T* data_col) {
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < numOuts) {
    int w_out = index % width_col;
    index /= width_col;
...
@@ -39,17 +44,17 @@ void im2col(const T* data_im, int numOuts, int height, int width,
    data_col += (channel_out * height_col + h_out) * width_col + w_out;
    for (int i = 0; i < blockH; ++i) {
      for (int j = 0; j < blockW; ++j) {
        int rIdx = int(h_in + i);
        int cIdx = int(w_in + j);
        if ((rIdx - (int)paddingH) >= (int)height ||
            (rIdx - (int)paddingH) < 0 ||
            (cIdx - (int)paddingW) >= (int)width ||
            (cIdx - (int)paddingW) < 0) {
          *data_col = 0;
        } else {
          rIdx = rIdx + channel_in * height - paddingH;
          cIdx = cIdx - paddingW;
          *data_col = data_im[rIdx * width + cIdx];
        }
        data_col += height_col * width_col;
      }
    }
...
@@ -82,60 +87,73 @@ public:
    int outputWidth = colShape[4];
    int numKernels = inputChannels * outputHeight * outputWidth;
    int blocks = (numKernels + 1024 - 1) / 1024;
    int blockX = 512;
    int blockY = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(blockX, blockY);
    im2col<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
        imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth,
        strideHeight, strideWidth, paddingHeight, paddingWidth,
        outputHeight, outputWidth, colData);
    CHECK_SYNC("Im2ColFunctor GPU failed");
  }
};

template <class T>
__global__ void col2im(size_t n, const T* data_col, size_t height, size_t width,
                       size_t channels, size_t blockH, size_t blockW,
                       size_t strideH, size_t strideW, size_t paddingH,
                       size_t paddingW, size_t height_col, size_t width_col,
                       T* data_im) {
  size_t index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < n) {
    T val = 0;
    int w = int(index % width);
    int h = int((index / width) % height);
    int c = int(index / (width * height));
    if ((w - (int)paddingW) >= 0 &&
        (w - (int)paddingW) < (width - 2 * paddingW) &&
        (h - (int)paddingH) >= 0 &&
        (h - paddingH) < (height - 2 * paddingH)) {
      // compute the start and end of the output
      int w_col_start =
          (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
      int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
      int h_col_start =
          (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
      int h_col_end = min(int(h / strideH + 1), int(height_col));
      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
          // the col location: [c * width * height + h_out, w_out]
          int c_col = int(c * blockH * blockW) +
                      (h - h_col * (int)strideH) * (int)blockW +
                      (w - w_col * (int)strideW);
          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
        }
      }
      h -= paddingH;
      w -= paddingW;
      data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) +
              h * (width - 2 * paddingW) + w] += val;
    }
  }
}
...
@@ -164,32 +182,32 @@ public:
    int outputHeight = colShape[3];
    int outputWidth = colShape[4];
    size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) *
                        (inputWidth + 2 * paddingWidth);
    size_t blocks = (numKernels + 1024 - 1) / 1024;
    size_t blockX = 512;
    size_t blockY = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(blockX, blockY);
    // To avoid involving atomic operations, we will launch one kernel per
    // bottom dimension, and then in the kernel add up the top dimensions.
    col2im<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
        numKernels, colData, inputHeight + 2 * paddingHeight,
        inputWidth + 2 * paddingWidth, inputChannels, filterHeight, filterWidth,
        strideHeight, strideWidth, paddingHeight, paddingWidth,
        outputHeight, outputWidth, imData);
    CHECK_SYNC("Col2ImFunctor GPU failed");
  }
};
...
@@ -199,31 +217,35 @@ template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;

template <class T>
__global__ void im2colOCF(const T* imData, T* colData, int inputChannels,
                          int inputHeight, int inputWidth, int filterHeight,
                          int filterWidth, int strideHeight, int strideWidth,
                          int paddingHeight, int paddingWidth,
                          int outputHeight, int outputWidth) {
  int swId = blockIdx.x;
  int shId = blockIdx.y;
  for (int channelId = threadIdx.z; channelId < inputChannels;
       channelId += blockDim.z) {
    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
        int widthOffset = idx + swId * strideWidth - paddingWidth;
        int heightOffset = idy + shId * strideHeight - paddingHeight;
        int imOffset = widthOffset + heightOffset * inputWidth +
                       channelId * inputHeight * inputWidth;
        int colOffset = idx + idy * filterWidth +
                        channelId * filterHeight * filterWidth +
                        (shId * outputWidth + swId) *
                            (inputChannels * filterHeight * filterWidth);
        if (heightOffset >= inputHeight || heightOffset < 0 ||
            widthOffset >= inputWidth || widthOffset < 0) {
...
@@ -279,39 +301,52 @@ public:
    int blockDimZ = 1024 / blockDimX / blockDimY;
    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
    dim3 grid(outputWidth, outputHeight);
    im2colOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
        imData, colData, inputChannels, inputHeight, inputWidth,
        filterHeight, filterWidth, strideHeight, strideWidth,
        paddingHeight, paddingWidth, outputHeight, outputWidth);
    CHECK_SYNC("Im2ColFunctor GPU failed");
  }
};

template <class T>
__global__ void col2imOCF(T* imData, const T* colData, int inputChannels,
                          int inputHeight, int inputWidth, int filterHeight,
                          int filterWidth, int strideHeight, int strideWidth,
                          int paddingHeight, int paddingWidth,
                          int outputHeight, int outputWidth) {
  int swId = blockIdx.x;
  int shId = blockIdx.y;
  for (int channelId = threadIdx.z; channelId < inputChannels;
       channelId += blockDim.z) {
    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
        int widthOffset = idx + swId * strideWidth - paddingWidth;
        int heightOffset = idy + shId * strideHeight - paddingHeight;
        int imOffset = widthOffset + heightOffset * inputWidth +
                       channelId * inputHeight * inputWidth;
        int colOffset = idx + idy * filterWidth +
                        channelId * filterHeight * filterWidth +
                        (shId * outputWidth + swId) *
                            (inputChannels * filterHeight * filterWidth);
        if (heightOffset >= 0 && heightOffset < inputHeight &&
            widthOffset >= 0 && widthOffset < inputWidth) {
...
@@ -365,10 +400,19 @@ public:
    int blockDimZ = 1024 / blockDimX / blockDimY;
    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
    dim3 grid(outputWidth, outputHeight);
    col2imOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
        imData, colData, inputChannels, inputHeight, inputWidth,
        filterHeight, filterWidth, strideHeight, strideWidth,
        paddingHeight, paddingWidth, outputHeight, outputWidth);
    CHECK_SYNC("Col2ImFunctor GPU failed");
  }
};
...
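For readers following the index arithmetic above: im2col lays each filter-sized patch of the (padded) input out as one column so that the convolution becomes a matrix multiplication, and col2im scatters the columns back, summing overlapping contributions. A rough CPU reference of the idea, simplified to a single channel with stride 1 and no padding (written here purely for illustration, not taken from the patch):

// Minimal single-channel im2col sketch, stride 1, no padding.
// col is (filterH * filterW) rows by (outH * outW) columns.
void im2colRef(const float* im, int height, int width,
               int filterH, int filterW, float* col) {
  int outH = height - filterH + 1;
  int outW = width - filterW + 1;
  for (int kh = 0; kh < filterH; ++kh)
    for (int kw = 0; kw < filterW; ++kw)
      for (int h = 0; h < outH; ++h)
        for (int w = 0; w < outW; ++w)
          // row index picks the filter element, column index the output pixel
          col[((kh * filterW + kw) * outH + h) * outW + w] =
              im[(h + kh) * width + (w + kw)];
}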
paddle/function/MulOpGpu.cu  View file @ 59a8ebc6
...
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "MulOp.h"
+#include "hl_base.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
...
paddle/function/PadOpGpu.cu  View file @ 59a8ebc6
...
@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "PadOp.h"
+#include "hl_base.h"

 namespace paddle {

__global__ void KePad(real* outputs, const real* inputs, int inC, int inH,
                      int inW, int padc, int padh, int padw, int outC,
                      int outH, int outW, int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % inW;
...
@@ -50,16 +58,33 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs,
  int outC = inC + cstart + cend;
  int outH = inH + hstart + hend;
  int outW = inW + wstart + wend;
  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      outputs, inputs, inC, inH, inW, cstart, hstart, wstart,
      outC, outH, outW, nth);
  CHECK_SYNC("Pad");
}

__global__ void KePadDiff(real* inGrad, const real* outGrad, int inC, int inH,
                          int inW, int padc, int padh, int padw, int outC,
                          int outH, int outW, int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % inW;
...
@@ -89,9 +114,18 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
  int outC = inC + cstart + cend;
  int outH = inH + hstart + hend;
  int outW = inW + wstart + wend;
  KePadDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      inGrad, outGrad, inC, inH, inW, cstart, hstart, wstart,
      outC, outH, outW, nth);
  CHECK_SYNC("PadGrad");
}
...
paddle/function/RowConvOpGpu.cu  View file @ 59a8ebc6
...
@@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "RowConvOp.h"
+#include "hl_base.h"

 namespace paddle {

template <int BLOCK_H, int BLOCK_W>
__global__ void KeRowConv(real* y, const real* x, const real* w,
                          const int* starts, const int height, const int width,
                          const int numSeq, const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -30,7 +34,7 @@ __global__ void KeRowConv(real* y, const real* x, const real* w,
  __shared__ real sw[BLOCK_H][BLOCK_W];

  for (int i = tidy; i < context; i += blky) {
    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
  }

  __syncthreads();
...
@@ -56,9 +60,14 @@ __global__ void KeRowConv(real* y, const real* x, const real* w,
  }
}

__global__ void KeRowConv2(real* y, const real* x, const real* w,
                           const int* starts, const int height,
                           const int width, const int numSeq,
                           const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -84,8 +93,6 @@ __global__ void KeRowConv2(real* y, const real* x, const real* w,
  }
}

template <>
void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
                              const GpuMatrix& in,
...
@@ -105,21 +112,24 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);

  if (contextLength <= 32) {
    KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
        y, x, w, starts, height, width, numSeq, contextLength);
  } else {
    KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
        y, x, w, starts, height, width, numSeq, contextLength);
  }
  CHECK_SYNC("RowConv");
}

template <int BLOCK_H, int BLOCK_W, int CONTEXT>
__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
                                  const int* starts, const int height,
                                  const int width, const int numSeq,
                                  const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -138,21 +148,21 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
    const int start = starts[i];
    const int end = starts[i + 1];
    const int steps = end - start;
    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
    for (int j = tidy; j < size; j += BLOCK_H) {
      int xoff = gidx + tidx;
      int yoff = start + j;
      // transpose
      sh_x[tidx][tidy] =
          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
      sh_dy[tidx][tidy + context - 1] =
          (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
      __syncthreads();
      if (tidy < (context - 1)) {
        yoff = yoff - context + 1;
        sh_dy[tidx][tidy] =
            (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
      }
      __syncthreads();
...
@@ -179,11 +189,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
  }
}

template <int BLOCK_H, int BLOCK_W>
__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
                                   const int* starts, const int height,
                                   const int width, const int numSeq,
                                   const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int gidx = blockIdx.x * blockDim.x;
...
@@ -196,19 +210,21 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
    const int end = starts[i + 1];
    const int steps = end - start;
    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
    for (int j = tidy; j < size; j += BLOCK_H) {
      int xoff = gidx + tidx;
      int yoff = start + j;
      // transpose
      sh_x[tidx][tidy] =
          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
      __syncthreads();

      for (int t = 0; t < context; t++) {
        sh_dy[tidx][tidy] =
            (xoff < width && (yoff - t) >= start && yoff - t < end)
                ? dy[(yoff - t) * width + xoff] : 0.0;
        __syncthreads();

        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
...
@@ -222,18 +238,22 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
        __syncthreads();

        if (tidx == 0 && (gidx + tidy) < width) {
          dw[t * width + gidx + tidy] += val;
        }
      }
    }
  }
}

template <int BLOCK_H, int BLOCK_W>
__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
                                const int* starts, const int height,
                                const int width, const int numSeq,
                                const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -242,7 +262,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
  __shared__ real sw[BLOCK_H][BLOCK_W];

  for (int i = tidy; i < context; i += blky) {
    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
  }

  __syncthreads();
...
@@ -266,10 +286,14 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
  }
}

__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy,
                                 const int* starts, const int height,
                                 const int width, const int numSeq,
                                 const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -295,14 +319,13 @@ __global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy,
  }
}

template <>
void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
                                  const GpuMatrix& in,
                                  const GpuMatrix& filter,
                                  GpuMatrix& inG,
                                  GpuMatrix& filterG,
                                  const GpuIVector& seq) {
  const size_t numSeq = seq.getSize() - 1;
  const size_t contextLength = filter.getHeight();
  const size_t height = in.getHeight();
...
@@ -318,13 +341,11 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
  real* dw = filterG.getData();
  if (contextLength <= 32) {
    KeRowConvBwWeight<32, 32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
        dw, x, dy, starts, height, width, numSeq, contextLength);
  } else {
    KeRowConvBwWeight2<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
        dw, x, dy, starts, height, width, numSeq, contextLength);
  }
}
...
@@ -333,13 +354,11 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
  dim3 dimBlock2(32, 32);
  dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
  if (contextLength <= 64) {
    KeRowConvBwData<32, 64><<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
        dx, w, dy, starts, height, width, numSeq, contextLength);
  } else {
    KeRowConvBwData2<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
        dx, w, dy, starts, height, width, numSeq, contextLength);
  }
}
...
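The kernels above implement row (lookahead) convolution: within each sequence, output row i is a column-wise weighted sum of the current row and the following context - 1 rows, with the sequence boundaries given by starts[]. A plain CPU reference of that forward definition, given here only for orientation (the GPU kernels tile the same computation through shared memory):

// Reference row convolution: y[i][c] = sum over t of w[t][c] * x[i + t][c],
// applied independently inside each sequence delimited by starts[].
void rowConvRef(float* y, const float* x, const float* w, const int* starts,
                int numSeq, int width, int context) {
  for (int s = 0; s < numSeq; ++s) {
    for (int i = starts[s]; i < starts[s + 1]; ++i) {
      for (int c = 0; c < width; ++c) {
        float sum = 0;
        for (int t = 0; t < context && i + t < starts[s + 1]; ++t) {
          sum += w[t * width + c] * x[(i + t) * width + c];
        }
        y[i * width + c] = sum;
      }
    }
  }
}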
paddle/function/TensorShapeTest.cpp  View file @ 59a8ebc6
...
@@ -19,35 +19,35 @@ namespace paddle {
 TEST(TensorShape, Constructor) {
   TensorShape t1;
-  EXPECT_EQ(t1.ndims(), 0);
-  EXPECT_EQ(t1.getElements(), 0);
+  EXPECT_EQ(t1.ndims(), 0U);
+  EXPECT_EQ(t1.getElements(), 0U);

   TensorShape t2(3);
-  EXPECT_EQ(t2.ndims(), 3);
-  EXPECT_EQ(t2.getElements(), 1);
+  EXPECT_EQ(t2.ndims(), 3U);
+  EXPECT_EQ(t2.getElements(), 1U);

   TensorShape t3({8, 10});
-  EXPECT_EQ(t3.ndims(), 2);
-  EXPECT_EQ(t3.getElements(), 80);
+  EXPECT_EQ(t3.ndims(), 2U);
+  EXPECT_EQ(t3.getElements(), 80U);

   TensorShape t4(t3);
   EXPECT_EQ(t4.ndims(), t3.ndims());
   EXPECT_EQ(t4.getElements(), t3.getElements());

   TensorShape t5({1, 2, 3, 4, 5});
-  EXPECT_EQ(t5.ndims(), 5);
-  EXPECT_EQ(t5.getElements(), 120);
+  EXPECT_EQ(t5.ndims(), 5U);
+  EXPECT_EQ(t5.getElements(), 120U);
 }

 TEST(TensorShape, GetAndSet) {
   TensorShape t({1, 2, 3});
-  EXPECT_EQ(t.ndims(), 3);
-  EXPECT_EQ(t.getElements(), 6);
+  EXPECT_EQ(t.ndims(), 3U);
+  EXPECT_EQ(t.getElements(), 6U);
   EXPECT_EQ(t[1], 2);

   t.setDim(1, 100);
-  EXPECT_EQ(t.getElements(), 300);
-  EXPECT_EQ(t[1], 100);
+  EXPECT_EQ(t.getElements(), 300U);
+  EXPECT_EQ(t[1], 100U);
 }

 }  // namespace paddle
paddle/function/TensorTypeTest.cpp  View file @ 59a8ebc6
...
@@ -19,9 +19,9 @@ namespace paddle {
 TEST(TensorType, Matrix) {
   Tensor<real, DEVICE_TYPE_CPU>::Matrix matrix(100, 200);
-  EXPECT_EQ(matrix.getHeight(), 100);
-  EXPECT_EQ(matrix.getWidth(), 200);
-  EXPECT_EQ(matrix.getElementCnt(), 100 * 200);
+  EXPECT_EQ(matrix.getHeight(), 100U);
+  EXPECT_EQ(matrix.getWidth(), 200U);
+  EXPECT_EQ(matrix.getElementCnt(), 100U * 200U);
   EXPECT_EQ(matrix.useGpu(), false);

   Tensor<real, DEVICE_TYPE_GPU>::Matrix testGpu(100, 200);
...
@@ -33,15 +33,15 @@ TEST(TensorType, Vector) {
   Tensor<real, DEVICE_TYPE_GPU>::Vector gpuVector(100);
   EXPECT_EQ(cpuVector.useGpu(), false);
   EXPECT_EQ(gpuVector.useGpu(), true);
-  EXPECT_EQ(cpuVector.getSize(), 100);
-  EXPECT_EQ(gpuVector.getSize(), 100);
+  EXPECT_EQ(cpuVector.getSize(), 100U);
+  EXPECT_EQ(gpuVector.getSize(), 100U);

   Tensor<int, DEVICE_TYPE_CPU>::Vector cpuIVector(100);
   Tensor<int, DEVICE_TYPE_GPU>::Vector gpuIVector(100);
   EXPECT_EQ(cpuIVector.useGpu(), false);
   EXPECT_EQ(gpuIVector.useGpu(), true);
-  EXPECT_EQ(cpuIVector.getSize(), 100);
-  EXPECT_EQ(gpuIVector.getSize(), 100);
+  EXPECT_EQ(cpuIVector.getSize(), 100U);
+  EXPECT_EQ(gpuIVector.getSize(), 100U);
 }

 TEST(TensorType, EmptyMatrix) {
...
paddle/function/nnpack/NNPACKConvOp.cpp

@@ -49,9 +49,7 @@ class NNPACKConvFunction : public ConvFunctionBase {
 public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
-    CHECK_EQ(groups_, (size_t)1);
     algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
-    // algorithm_ = nnp_convolution_algorithm_auto;
     transform_strategy_ = nnp_convolution_transform_strategy_compute;
     nnp_status status = nnp_initialize();
     CHECK_EQ(status, nnp_status_success);

@@ -67,8 +65,7 @@ public:
   }
 
-  virtual void check(const BufferArgs& inputs,
-                     const BufferArgs& outputs) override {
+  void check(const BufferArgs& inputs,
+             const BufferArgs& outputs) override {
     const TensorShape& input = inputs[0].shape();
     const TensorShape& filter = inputs[1].shape();
     const TensorShape& output = outputs[0].shape();

@@ -91,8 +88,8 @@ public:
     size_t filterHeight = getFilterHeight(filter);
     size_t filterWidth = getFilterWidth(filter);
     size_t outputChannels = output[1];
-    // size_t outputHeight = output[2];
-    // size_t outputWidth = output[3];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
     nnp_size inputSize = {.width = inputWidth, .height = inputHeight};
     nnp_padding padding = {.top = (size_t)paddingH(),

@@ -171,49 +168,58 @@ public:
     }
 
+    size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth;
+    size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
     if (batchSize == 1) {
-      nnp_status status =
-          nnp_convolution_inference(algorithm_,
-                                    transform_strategy_,
-                                    inputChannels,
-                                    outputChannels,
-                                    inputSize,
-                                    padding,
-                                    kernelSize,
-                                    outputSubsampling,
-                                    inputData,
-                                    filterData,
-                                    nullptr, /* bias */
-                                    outputData,
-                                    bufferPtr,
-                                    sizePtr,
-                                    nnp_activation_identity,
-                                    nullptr,
-                                    threadpool_, /* threadpool */
-                                    nullptr);
-      CHECK_EQ(status, nnp_status_success);
+      for (size_t g = 0; g < groups_; g++) {
+        nnp_status status =
+            nnp_convolution_inference(algorithm_,
+                                      transform_strategy_,
+                                      inputChannels / groups_,
+                                      outputChannels / groups_,
+                                      inputSize,
+                                      padding,
+                                      kernelSize,
+                                      outputSubsampling,
+                                      inputData + inputOffset * g,
+                                      filterData + filterOffset * g,
+                                      nullptr, /* bias */
+                                      outputData + outputOffset * g,
+                                      bufferPtr,
+                                      sizePtr,
+                                      nnp_activation_identity,
+                                      nullptr,
+                                      threadpool_, /* threadpool */
+                                      nullptr);
+        CHECK_EQ(status, nnp_status_success);
+      }
     } else {
-      // only supports stride = 1
-      CHECK_EQ(strideH(), 1);
-      CHECK_EQ(strideW(), 1);
-      nnp_status status = nnp_convolution_output(algorithm_,
-                                                 batchSize,
-                                                 inputChannels,
-                                                 outputChannels,
-                                                 inputSize,
-                                                 padding,
-                                                 kernelSize,
-                                                 inputData,
-                                                 filterData,
-                                                 nullptr, /* bias */
-                                                 outputData,
-                                                 bufferPtr,
-                                                 sizePtr,
-                                                 nnp_activation_identity,
-                                                 nullptr,
-                                                 threadpool_, /* threadpool */
-                                                 nullptr);
-      CHECK_EQ(status, nnp_status_success);
+      for (size_t g = 0; g < groups_; g++) {
+        // only supports stride = 1
+        CHECK_EQ(strideH(), 1);
+        CHECK_EQ(strideW(), 1);
+        nnp_status status =
+            nnp_convolution_output(algorithm_,
+                                   batchSize,
+                                   inputChannels / groups_,
+                                   outputChannels / groups_,
+                                   inputSize,
+                                   padding,
+                                   kernelSize,
+                                   inputData + inputOffset * g,
+                                   filterData + filterOffset * g,
+                                   nullptr, /* bias */
+                                   outputData + outputOffset * g,
+                                   bufferPtr,
+                                   sizePtr,
+                                   nnp_activation_identity,
+                                   nullptr,
+                                   threadpool_, /* threadpool */
+                                   nullptr);
+        CHECK_EQ(status, nnp_status_success);
+      }
     }
   }
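The per-group offsets introduced above simply slice the input, filter, and output buffers along the channel dimension before each NNPACK call. A minimal standalone sketch of that pointer arithmetic (plain C++ with made-up sizes, not the NNPACK API itself):

```cpp
#include <cstddef>
#include <vector>

// Sketch: split channels into `groups` slices and locate slice g, mirroring
// inputOffset / filterOffset above. All sizes here are illustrative assumptions.
int main() {
  const size_t groups = 2, inputChannels = 4, outputChannels = 6;
  const size_t inputH = 8, inputW = 8, filterH = 3, filterW = 3;

  std::vector<float> input(inputChannels * inputH * inputW);
  std::vector<float> filter(outputChannels * (inputChannels / groups) * filterH * filterW);

  const size_t inputOffset = inputChannels / groups * inputH * inputW;
  const size_t filterOffset = filter.size() / groups;

  for (size_t g = 0; g < groups; ++g) {
    // channels [g * C/G, (g+1) * C/G) of the input, and the filters of group g
    const float* inputSlice = input.data() + inputOffset * g;
    const float* filterSlice = filter.data() + filterOffset * g;
    (void)inputSlice;
    (void)filterSlice;
    // each group would be convolved independently here
  }
  return 0;
}
```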
paddle/gserver/activations/ActivationFunction.cpp

@@ -186,7 +186,10 @@ Error __must_check forward(Argument& act) {
                          useGpu(act.deviceId));
   }
 
-  auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId));
+  auto starts =
+      act.hasSubseq()
+          ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId))
+          : act.sequenceStartPositions->getVector(useGpu(act.deviceId));
   act.value->sequenceSoftmax(*act.value, *starts);
   return Error();
 }

@@ -197,8 +200,9 @@ Error __must_check backward(Argument& act) {
         "Input width for each timestep of sequence softmax should be 1");
   }
 
-  size_t numSequences = act.getNumSequences();
-  const int* starts = act.sequenceStartPositions->getData(false);
+  size_t numSequences =
+      act.hasSubseq() ? act.getNumSubSequences() : act.getNumSequences();
+  const int* starts = act.getCpuStartPositions();
 
   for (size_t i = 0; i < numSequences; ++i) {
     // TODO(Dangqingqing) optimization for GPU
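For reference, `starts` partitions the activation rows into (sub)sequences, and the softmax is computed over each range independently. A minimal CPU sketch of that idea (hypothetical standalone code, not the Paddle kernel):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Softmax over each range [starts[i], starts[i+1]) of a flat value array,
// which is what sequenceSoftmax does per (sub)sequence.
void sequenceSoftmax(std::vector<float>& value, const std::vector<int>& starts) {
  for (size_t i = 0; i + 1 < starts.size(); ++i) {
    const int begin = starts[i], end = starts[i + 1];
    const float maxV = *std::max_element(value.begin() + begin, value.begin() + end);
    float sum = 0.f;
    for (int j = begin; j < end; ++j) {
      value[j] = std::exp(value[j] - maxV);  // subtract max for stability
      sum += value[j];
    }
    for (int j = begin; j < end; ++j) value[j] /= sum;
  }
}
```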
paddle/gserver/layers/ExpandConvLayer.cpp

@@ -57,8 +57,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
     convGradFilterType = "GemmConvGradFilter";
   }
 
-  if (FLAGS_use_nnpack) {
-    CHECK_EQ(isDeconv_, false);
+  if (FLAGS_use_nnpack && !isDeconv_) {
     createFunction(forward_,
                    "NNPACKConv",
                    FuncConfig()
paddle/gserver/layers/GruCompute.cu

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "GruCompute.h"
 #include "hl_recurrent_apply.cuh"

@@ -31,8 +30,10 @@ void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) {
 }
 
 template <>
-void GruCompute::backward<1>(hl_gru_value value, hl_gru_grad grad,
-                             int frameSize, int batchSize) {
+void GruCompute::backward<1>(hl_gru_value value,
+                             hl_gru_grad grad,
+                             int frameSize,
+                             int batchSize) {
   hl_gpu_gru_backward(hppl::backward::gru_stateGrad(),
                       hppl::backward::gru_resetGrad(),
                       value,
paddle/gserver/layers/KmaxSeqScoreLayer.cpp

@@ -97,13 +97,19 @@ void KmaxSeqScoreLayer::forward(PassType passType) {
     scores_ = inputScore;
   }
 
-  int seqNum =
-      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences();
-  Matrix::resizeOrCreate(output_.value, seqNum, beamSize_, false, false);
+  Matrix::resizeOrCreate(
+      output_.value,
+      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
+      beamSize_,
+      false,
+      false);
   output_.value->one();
   output_.value->mulScalar(-1.);
 
-  kmaxScorePerSeq(scores_->getData(), output_.value->getData(), seqNum);
+  kmaxScorePerSeq(scores_->getData(),
+                  output_.value->getData(),
+                  input.hasSubseq() ? input.subSequenceStartPositions
+                                    : input.sequenceStartPositions);
 }
 
 void KmaxSeqScoreLayer::backward(const UpdateCallback& callback) {}
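The forward pass keeps the `beamSize_` largest scores of every (sub)sequence, with the output pre-filled to -1 (the `one()` / `mulScalar(-1.)` calls above) as padding for short sequences. A rough standalone sketch of that selection (a hypothetical helper, not the layer's code):

```cpp
#include <algorithm>
#include <vector>

// For each range [starts[i], starts[i+1]) pick the indices of the k largest
// scores, padding with -1 when a sequence is shorter than k.
std::vector<int> kmaxPerSeq(const std::vector<float>& scores,
                            const std::vector<int>& starts, int k) {
  std::vector<int> result;
  for (size_t i = 0; i + 1 < starts.size(); ++i) {
    std::vector<int> idx;
    for (int j = starts[i]; j < starts[i + 1]; ++j) idx.push_back(j);
    std::sort(idx.begin(), idx.end(),
              [&](int a, int b) { return scores[a] > scores[b]; });
    for (int j = 0; j < k; ++j)
      result.push_back(j < static_cast<int>(idx.size()) ? idx[j] : -1);
  }
  return result;
}
```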
paddle/gserver/layers/LstmCompute.cu

@@ -12,41 +12,62 @@ — the change here is a formatting-only re-wrap (one argument per line); the resulting code:

See the License for the specific language governing permissions and
limitations under the License. */

#include "LstmCompute.h"
#include "hl_recurrent_apply.cuh"

namespace paddle {

template <>
void LstmCompute::forwardBatch<1>(hl_lstm_value value,
                                  int frameSize,
                                  int batchSize) {
  hl_gpu_lstm_forward(hppl::forward::lstm(),
                      value,
                      frameSize,
                      batchSize,
                      activeNode_,
                      activeGate_,
                      activeState_);
}

template <>
void LstmCompute::backwardBatch<1>(hl_lstm_value value,
                                   hl_lstm_grad grad,
                                   int frameSize,
                                   int batchSize) {
  hl_gpu_lstm_backward(hppl::backward::lstm(),
                       value,
                       grad,
                       frameSize,
                       batchSize,
                       activeNode_,
                       activeGate_,
                       activeState_);
}

template <>
void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) {
  hl_gpu_lstm_forward(hppl::forward::lstm(),
                      value,
                      frameSize,
                      /* batchSize */ 1,
                      activeNode_,
                      activeGate_,
                      activeState_);
}

template <>
void LstmCompute::backwardOneSequence<1>(hl_lstm_value value,
                                         hl_lstm_grad grad,
                                         int frameSize) {
  hl_gpu_lstm_backward(hppl::backward::lstm(),
                       value,
                       grad,
                       frameSize,
                       /* batchSize */ 1,
                       activeNode_,
                       activeGate_,
                       activeState_);
}

}  // namespace paddle
paddle/gserver/layers/PrintLayer.cpp

@@ -29,7 +29,7 @@ public:
     vals.push_back(s.str());
   }
   size_t pos = 0;
-  int i = 0;
+  size_t i = 0;
   std::ostringstream s;
   const std::string& format = config_.user_arg();
   while (true) {
paddle/gserver/tests/CMakeLists.txt

 # gserver pacakge unittests
+file(GLOB_RECURSE GSERVER_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h")
+file(GLOB_RECURSE GSERVER_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cpp")
+add_style_check_target(paddle_gserver ${GSERVER_SOURCES})
+add_style_check_target(paddle_gserver ${GSERVER_HEADER})
 
 ################### test_ProtoDataProvider ############
 add_unittest_without_exec(test_ProtoDataProvider
                           test_ProtoDataProvider.cpp)
paddle/gserver/tests/test_ActivationGrad.cpp

@@ -57,6 +57,39 @@ TEST(Activation, activation) {
   }
 }
 
+void testSequenceSoftmaxAct(bool hasSubseq) {
+  LOG(INFO) << "test activation: sequence softmax";
+
+  const size_t size = 1;
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("addto");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sequence_softmax");
+  config.inputDefs.push_back(
+      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
+       "layer_0",
+       1,
+       0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "sequence_softmax",
+                  100,
+                  /* trans= */ false,
+                  useGpu,
+                  /* useWeight */ true);
+  }
+}
+
+TEST(SequenceSoftmaxActivation, activation) {
+  for (auto hasSubseq : {false, true}) {
+    LOG(INFO) << "hasSubseq = " << hasSubseq;
+    testSequenceSoftmaxAct(hasSubseq);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
paddle/math/BaseMatrix.cu

The changes in this file appear to be formatting only (the includes are reordered and long argument lists are re-wrapped); the affected code reads:

@@ -12,21 +12,21 @@
See the License for the specific language governing permissions and
limitations under the License. */

#include <paddle/utils/Logging.h>
#include <string.h>
#include <cmath>
#include "BaseMatrix.h"
#include "MathFunctions.h"
#include "SIMDFunctions.h"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_base.cuh"
#include "hl_matrix_ops.cuh"

namespace paddle {

const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported.";

template <class T>
template <class Op>
int BaseMatrixT<T>::applyUnary(Op op) {
  MatrixOffset offset(0, 0);

@@ -34,9 +34,11 @@
  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyUnary(Op op,
                               int numRows,
                               int numCols,
                               MatrixOffset& offset) {
  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
  int dimM = numRows;

@@ -56,7 +58,7 @@
  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) {
  CHECK(height_ == b.height_ && width_ == b.width_)

@@ -67,18 +69,23 @@
  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyBinary(
    Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) {
  applyBinary(op, b, numRows, numCols, offset, false_type(), false_type());
  return 0;
}

template <class T>
template <class Op, class bAsRowVector, class bAsColVector>
int BaseMatrixT<T>::applyBinary(Op op,
                                BaseMatrixT& b,
                                int numRows,
                                int numCols,
                                MatrixOffset& offset,
                                bAsRowVector,
                                bAsColVector) {
  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch";

@@ -91,8 +98,8 @@
  T* A = data_;
  T* B = b.data_;
  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
  CAL_MATRIX_START_ADDRESS(
      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
  CHECK_LE(dimM + offset.aRow_, this->height_);
  CHECK_LE(dimN + offset.aCol_, this->width_);
  if (!bAsRowVector::value && !bAsColVector::value) {

@@ -115,7 +122,7 @@
  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) {
  CHECK_EQ(height_, b.height_);

@@ -129,21 +136,29 @@
  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyTernary(Op op,
                                 BaseMatrixT& b,
                                 BaseMatrixT& c,
                                 int numRows,
                                 int numCols,
                                 MatrixOffset& offset) {
  applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type());
  return 0;
}
template <class T>
template <class Op, class cAsRowVector, class cAsColVector>
int BaseMatrixT<T>::applyTernary(Op op,
                                 BaseMatrixT& b,
                                 BaseMatrixT& c,
                                 int numRows,
                                 int numCols,
                                 MatrixOffset& offset,
                                 cAsRowVector,
                                 cAsColVector) {
  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;

@@ -160,10 +175,10 @@
  T* B = b.data_;
  T* C = c.data_;
  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
  CAL_MATRIX_START_ADDRESS(
      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
  CAL_MATRIX_START_ADDRESS(
      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
  CHECK_LE(dimM + offset.aRow_, this->height_);
  CHECK_LE(dimN + offset.aCol_, this->width_);

@@ -180,21 +195,21 @@
  }

  if (true == useGpu_) {
    hl_gpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
        op, A, B, C, dimM, dimN, lda, ldb, ldc);
  } else {
    hl_cpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
        op, A, B, C, dimM, dimN, lda, ldb, ldc);
  }

  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
                                    BaseMatrixT& d) {
  CHECK_EQ(height_, b.height_);
  CHECK_EQ(width_, b.width_);

@@ -209,10 +224,14 @@
  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyQuaternary(Op op,
                                    BaseMatrixT& b,
                                    BaseMatrixT& c,
                                    BaseMatrixT& d,
                                    int numRows,
                                    int numCols,
                                    MatrixOffset& offset) {
  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;

@@ -234,12 +253,12 @@
  T* C = c.data_;
  T* D = d.data_;
  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
  CAL_MATRIX_START_ADDRESS(
      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
  CAL_MATRIX_START_ADDRESS(
      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
  CAL_MATRIX_START_ADDRESS(
      D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_);
  CHECK_LE(dimM + offset.aRow_, this->height_);
  CHECK_LE(dimN + offset.aCol_, this->width_);

@@ -250,22 +269,29 @@
  CHECK_LE(dimM + offset.dRow_, d.height_);
  CHECK_LE(dimN + offset.dCol_, d.width_);
  if (true == useGpu_) {
    hl_gpu_apply_quaternary_op(
        op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
  } else {
    hl_cpu_apply_quaternary_op(
        op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
  }

  return 0;
}
template <class T>
template <class Agg, class Op, class Saver, class aAsRowVector,
          class aAsColVector>
int BaseMatrixT<T>::aggregate(Agg agg,
                              Op op,
                              Saver sv,
                              BaseMatrixT& b,
                              int numRows,
                              int numCols,
                              MatrixOffset& offset,
                              aAsRowVector,
                              aAsColVector) {
  CHECK_EQ(useGpu_, b.useGpu_);

  int ld = stride_;

@@ -273,10 +299,10 @@
  T* dst = data_;
  T* B = b.data_;
  CAL_MATRIX_START_ADDRESS(
      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
  CAL_MATRIX_START_ADDRESS(
      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);

  if (aAsRowVector::value && !aAsColVector::value) {
    if (useGpu_) {

@@ -297,12 +323,21 @@
  return 0;
}

template <class T>
template <class Agg, class Op, class Saver, class aAsRowVector,
          class aAsColVector>
int BaseMatrixT<T>::aggregate(Agg agg,
                              Op op,
                              Saver sv,
                              BaseMatrixT& b,
                              BaseMatrixT& c,
                              int numRows,
                              int numCols,
                              MatrixOffset& offset,
                              aAsRowVector,
                              aAsColVector) {
  CHECK_EQ(useGpu_, b.useGpu_);
  CHECK_EQ(useGpu_, c.useGpu_);

@@ -314,28 +349,28 @@
  T* dst = data_;
  T* B = b.data_;
  T* C = c.data_;
  CAL_MATRIX_START_ADDRESS(
      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
  CAL_MATRIX_START_ADDRESS(
      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
  CAL_MATRIX_START_ADDRESS(
      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);

  if (aAsRowVector::value && !aAsColVector::value) {
    if (useGpu_) {
      hl_gpu_matrix_column_op(
          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
    } else {
      hl_cpu_matrix_column_op(
          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
    }
  } else if (!aAsRowVector::value && aAsColVector::value) {
    if (useGpu_) {
      hl_gpu_matrix_row_op(
          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
    } else {
      hl_cpu_matrix_row_op(
          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
    }
  } else {
    LOG(FATAL) << "not supported";
@@ -350,15 +385,19 @@
 */
DEFINE_MATRIX_UNARY_OP(Neg, a = -a);
template <class T>
void BaseMatrixT<T>::neg() { applyUnary(unary::Neg<T>()); }

DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
template <>
void BaseMatrixT<real>::exp2() { applyUnary(unary::Exp<real>()); }

DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
template <>
void BaseMatrixT<real>::log2() {
  if (useGpu_) {
    applyUnary(unary::Log<real>());

@@ -368,30 +407,42 @@
}

DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
template <>
void BaseMatrixT<real>::sqrt2() { applyUnary(unary::Sqrt<real>()); }

DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
template <class T>
void BaseMatrixT<T>::square2() { applyUnary(unary::Square<T>()); }

DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
template <class T>
void BaseMatrixT<T>::reciprocal2() { applyUnary(unary::Reciprocal<T>()); }

DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
template <class T>
void BaseMatrixT<T>::abs2() { applyUnary(unary::Abs<T>()); }

DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
template <class T>
void BaseMatrixT<T>::sign2() { applyUnary(unary::Sign<T>()); }

DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
template <class T>
void BaseMatrixT<T>::zero() { applyUnary(unary::Zero<T>()); }

template <class T>
void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) {
  int numRows = height_;
  int numCols = numColumns;

@@ -400,11 +451,13 @@
}

DEFINE_MATRIX_UNARY_OP(One, a = 1);
template <class T>
void BaseMatrixT<T>::one() { applyUnary(unary::One<T>()); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
template <>
void BaseMatrixT<real>::pow2(real p) {
  if (useGpu_) {
    applyUnary(unary::Pow<real>(p));
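Each DEFINE_MATRIX_UNARY_OP above expands into a small functor whose operator() rewrites one element, and applyUnary sweeps it over the matrix (on CPU or GPU). A stripped-down CPU-only illustration of that pattern (an assumed simplification, not the actual Paddle macros):

```cpp
#include <cmath>
#include <cstddef>

// Functors in the spirit of unary::Neg / unary::Pow: one element, in place.
struct Neg {
  void operator()(float& a) const { a = -a; }
};
struct Pow {
  float p;
  explicit Pow(float p_) : p(p_) {}
  void operator()(float& a) const { a = std::pow(a, p); }
};

// CPU analogue of BaseMatrixT::applyUnary over a dense row-major block
// with leading dimension ld.
template <class Op>
void applyUnary(Op op, float* data, size_t rows, size_t cols, size_t ld) {
  for (size_t r = 0; r < rows; ++r)
    for (size_t c = 0; c < cols; ++c) op(data[r * ld + c]);
}
```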
@@ -414,51 +467,67 @@
}

DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p);
template <class T>
void BaseMatrixT<T>::subScalar(T p) { applyUnary(unary::SubScalar<T>(p)); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p);
template <class T>
void BaseMatrixT<T>::mulScalar(T p) { applyUnary(unary::MulScalar<T>(p)); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p);
template <class T>
void BaseMatrixT<T>::divScalar(T p) { applyUnary(unary::DivScalar<T>(p)); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p);
template <class T>
void BaseMatrixT<T>::assign(T p) { applyUnary(unary::Assign<T>(p)); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p);
template <class T>
void BaseMatrixT<T>::add(T p) { applyUnary(unary::Add<T>(p)); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2);
template <class T>
void BaseMatrixT<T>::add(T p1, T p2) { applyUnary(unary::Add2<T>(p1, p2)); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER,
                                 a = a < p1 ? p1 : (a > p2 ? p2 : a));
template <class T>
void BaseMatrixT<T>::clip(T p1, T p2) { applyUnary(unary::Clip<T>(p1, p2)); }

DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, TWO_PARAMETER,
                                  a = b < p1 ? 0 : (b > p2 ? 0 : 1));
template <class T>
void BaseMatrixT<T>::clipDerivative(BaseMatrixT& b, T p1, T p2) {
  applyBinary(binary::ClipDerivative<T>(p1, p2), b);
}

DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER,
                                 a = a > p ? 1.0f : 0.0f);
template <class T>
void BaseMatrixT<T>::biggerThanScalar(T p) {
  applyUnary(unary::BiggerThanScalar<T>(p));
}

DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p);
template <class T>
void BaseMatrixT<T>::downClip(T p) {
  applyUnary(unary::DownClip<T>(p));
}
@@ -469,12 +538,12 @@
 */
DEFINE_MATRIX_BINARY_OP(Add, a += b);
template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b) { applyBinary(binary::Add<T>(), b); }

template <>
void BaseMatrixT<real>::add(BaseMatrixT& b) {
  if (useGpu_) {
    applyBinary(binary::Add<real>(), b);

@@ -485,7 +554,7 @@
  }
}

template <class T>
void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) {
  if (columnOffset + b.width_ <= width_) {
    int numRows = height_;

@@ -504,43 +573,53 @@
  }
}

template <class T>
void BaseMatrixT<T>::addP2P(BaseMatrixT& b) {
  T* A = data_;
  T* B = b.data_;
  int dimM = height_;
  int dimN = width_;

  hl_gpu_apply_binary_op<T, binary::Add<T>, 0, 0>(
      binary::Add<T>(), A, B, dimM, dimN, dimN, dimN);
}

template <class T>
void BaseMatrixT<T>::addColVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::Add<T>(),
              b,
              numRows,
              numCols,
              offset,
              false_type(),
              true_type() /* bAsColVector */);
}

template <class T>
void BaseMatrixT<T>::addRowVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::Add<T>(),
              b,
              numRows,
              numCols,
              offset,
              true_type() /* bAsRowVector */,
              false_type());
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p);
template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, T p) {
  applyBinary(binary::Add1<T>(p), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p));
template <>
void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
  if (useGpu_) {
    applyBinary(binary::Pow<real>(p), b);

@@ -550,36 +629,45 @@
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b);
template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, T p2) {
  applyBinary(binary::Add2<T>(p1, p2), b);
}

template <class T>
void BaseMatrixT<T>::addBias(BaseMatrixT& b, T scale) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::Add1<T>(scale),
              b,
              numRows,
              numCols,
              offset,
              true_type() /* bAsRowVector */,
              false_type());
}

DEFINE_MATRIX_BINARY_OP(Sub, a -= b);
template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b) { applyBinary(binary::Sub<T>(), b); }

DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p);
template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b, T p) {
  applyBinary(binary::Sub1<T>(p), b);
}

DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f);
template <class T>
void BaseMatrixT<T>::relu(BaseMatrixT& b) { applyBinary(binary::Relu<T>(), b); }

DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f));
template <class T>
void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) {
  applyBinary(binary::ReluDerivative<T>(), b);
}
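The bAsRowVector / bAsColVector tags used by addRowVector, addColVector, and addBias above select a broadcast pattern: b is read either as a single row reused for every row of a, or as a single column reused for every column. A compact CPU sketch of the two cases (illustrative only, not the Paddle kernels):

```cpp
#include <cstddef>

// Row broadcast: a(r, c) += b(0, c)   -- the addRowVector / addBias pattern.
void addRowVector(float* a, const float* b, size_t rows, size_t cols, size_t lda) {
  for (size_t r = 0; r < rows; ++r)
    for (size_t c = 0; c < cols; ++c) a[r * lda + c] += b[c];
}

// Column broadcast: a(r, c) += b(r, 0) -- the addColVector pattern.
void addColVector(float* a, const float* b, size_t rows, size_t cols, size_t lda) {
  for (size_t r = 0; r < rows; ++r)
    for (size_t c = 0; c < cols; ++c) a[r * lda + c] += b[r];
}
```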
@@ -589,7 +677,7 @@ DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0;
                            ? THRESHOLD
                            : ((a < -THRESHOLD) ? (-THRESHOLD) : a))));
template <>
void BaseMatrixT<real>::softrelu(BaseMatrixT& b) {
  applyBinary(binary::Softrelu<real>(), b);
}

@@ -599,97 +687,100 @@
DEFINE_MATRIX_BINARY_OP(SoftreluDerivative,
                        a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
                                                    ? THRESHOLD
                                                    : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
template <>
void BaseMatrixT<real>::softreluDerivative(BaseMatrixT& b) {
  applyBinary(binary::SoftreluDerivative<real>(), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1;
                                  b = b < p2 ? b : p2);
template <class T>
void BaseMatrixT<T>::brelu(BaseMatrixT& b) {
  int p1 = 0, p2 = 24;  //! TODO(yuyang18): Make p1,p2 configuable.
  applyBinary(binary::Brelu<T>(p1, p2), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, TWO_PARAMETER,
                                  a *= (b > p1 && b < p2) ? 1.0 : 0.0);
template <class T>
void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) {
  int p1 = 0, p2 = 24;
  applyBinary(binary::BreluDerivative<T>(p1, p2), b);
}

DEFINE_MATRIX_BINARY_OP(Square, b = a * a);
template <class T>
void BaseMatrixT<T>::square2(BaseMatrixT& b) {
  applyBinary(binary::Square<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b);
template <class T>
void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
  applyBinary(binary::SquareDerivative<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a;
                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
template <>
void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
  applyBinary(binary::Tanh<real>(), b);
}

DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b);
template <class T>
void BaseMatrixT<T>::tanhDerivative(BaseMatrixT& b) {
  applyBinary(binary::TanhDerivative<T>(), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(
    ScaledTanh, TWO_PARAMETER,
    b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0));
template <>
void BaseMatrixT<real>::scaledTanh(BaseMatrixT& b, real p1, real p2) {
  applyBinary(binary::ScaledTanh<real>(p1, p2), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, TWO_PARAMETER,
                                  a *= p2 * (p1 - b * b));
template <class T>
void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) {
  applyBinary(binary::ScaledTanhDerivative<T>(p1 * p1, p2 / p1), b);
}

DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
template <class T>
void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
  applyBinary(binary::Reciprocal<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b);
template <class T>
void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
  applyBinary(binary::ReciprocalDerivative<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
template <class T>
void BaseMatrixT<T>::abs2(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); }

DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
template <class T>
void BaseMatrixT<T>::absDerivative(BaseMatrixT& b) {
  applyBinary(binary::AbsDerivative<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(
    Sigmoid, const T THRESHOLD_MIN = -40.0; const T THRESHOLD_MAX = 13.0;
    T tmp = (a < THRESHOLD_MIN)
                ? THRESHOLD_MIN
                : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
    b = 1.0f / (1.0f + exp(-tmp)));
template <>
void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
  if (useGpu_) {
    applyBinary(binary::Sigmoid<real>(), b);
...
@@ -723,31 +814,31 @@ void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
...
@@ -723,31 +814,31 @@ void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
}
}
DEFINE_MATRIX_BINARY_OP
(
SigmoidDerivative
,
a
*=
b
*
(
1
-
b
));
DEFINE_MATRIX_BINARY_OP
(
SigmoidDerivative
,
a
*=
b
*
(
1
-
b
));
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
sigmoidDerivative
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
sigmoidDerivative
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
SigmoidDerivative
<
T
>
(),
b
);
applyBinary
(
binary
::
SigmoidDerivative
<
T
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
ExpDerivative
,
a
*=
b
);
DEFINE_MATRIX_BINARY_OP
(
ExpDerivative
,
a
*=
b
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
expDerivative
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
expDerivative
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
ExpDerivative
<
T
>
(),
b
);
applyBinary
(
binary
::
ExpDerivative
<
T
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
Sign
,
b
=
a
>
0.0
f
?
1.0
f
:
-
1.0
f
);
DEFINE_MATRIX_BINARY_OP
(
Sign
,
b
=
a
>
0.0
f
?
1.0
f
:
-
1.0
f
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
sign2
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
sign2
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Sign
<
T
>
(),
b
);
applyBinary
(
binary
::
Sign
<
T
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
Exp
,
a
=
exp
(
b
));
DEFINE_MATRIX_BINARY_OP
(
Exp
,
a
=
exp
(
b
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
exp2
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
real
>::
exp2
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Exp
<
real
>
(),
b
);
applyBinary
(
binary
::
Exp
<
real
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
Log
,
a
=
log
(
b
));
DEFINE_MATRIX_BINARY_OP
(
Log
,
a
=
log
(
b
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
log2
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
real
>::
log2
(
BaseMatrixT
&
b
)
{
if
(
useGpu_
)
{
if
(
useGpu_
)
{
applyBinary
(
binary
::
Log
<
real
>
(),
b
);
applyBinary
(
binary
::
Log
<
real
>
(),
b
);
...
@@ -757,13 +848,13 @@ void BaseMatrixT<real>::log2(BaseMatrixT& b) {
...
@@ -757,13 +848,13 @@ void BaseMatrixT<real>::log2(BaseMatrixT& b) {
}
}
DEFINE_MATRIX_BINARY_OP
(
Sqrt
,
a
=
sqrt
(
b
));
DEFINE_MATRIX_BINARY_OP
(
Sqrt
,
a
=
sqrt
(
b
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
sqrt2
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
real
>::
sqrt2
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Sqrt
<
real
>
(),
b
);
applyBinary
(
binary
::
Sqrt
<
real
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
InvSqrt
,
a
=
1.0
f
/
sqrt
(
b
));
DEFINE_MATRIX_BINARY_OP
(
InvSqrt
,
a
=
1.0
f
/
sqrt
(
b
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
invSqrt
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
real
>::
invSqrt
(
BaseMatrixT
&
b
)
{
if
(
useGpu_
)
{
if
(
useGpu_
)
{
applyBinary
(
binary
::
InvSqrt
<
real
>
(),
b
);
applyBinary
(
binary
::
InvSqrt
<
real
>
(),
b
);
...
@@ -775,37 +866,37 @@ void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
...
@@ -775,37 +866,37 @@ void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
IsEqual
,
ONE_PARAMETER
,
a
=
(
b
==
p
));
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
IsEqual
,
ONE_PARAMETER
,
a
=
(
b
==
p
));
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
isEqualTo
(
BaseMatrixT
&
b
,
T
value
)
{
void
BaseMatrixT
<
T
>::
isEqualTo
(
BaseMatrixT
&
b
,
T
value
)
{
applyBinary
(
binary
::
IsEqual
<
T
>
(
value
),
b
);
applyBinary
(
binary
::
IsEqual
<
T
>
(
value
),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
AddScalar
,
ONE_PARAMETER
,
a
=
b
+
p
);
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
AddScalar
,
ONE_PARAMETER
,
a
=
b
+
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
addScalar
(
BaseMatrixT
&
b
,
T
p
)
{
void
BaseMatrixT
<
T
>::
addScalar
(
BaseMatrixT
&
b
,
T
p
)
{
applyBinary
(
binary
::
AddScalar
<
T
>
(
p
),
b
);
applyBinary
(
binary
::
AddScalar
<
T
>
(
p
),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
SubScalar
,
ONE_PARAMETER
,
a
=
b
-
p
);
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
SubScalar
,
ONE_PARAMETER
,
a
=
b
-
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
subScalar
(
BaseMatrixT
&
b
,
T
p
)
{
void
BaseMatrixT
<
T
>::
subScalar
(
BaseMatrixT
&
b
,
T
p
)
{
applyBinary
(
binary
::
SubScalar
<
T
>
(
p
),
b
);
applyBinary
(
binary
::
SubScalar
<
T
>
(
p
),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
MulScalar
,
ONE_PARAMETER
,
a
=
b
*
p
);
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
MulScalar
,
ONE_PARAMETER
,
a
=
b
*
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
mulScalar
(
BaseMatrixT
&
b
,
T
p
)
{
void
BaseMatrixT
<
T
>::
mulScalar
(
BaseMatrixT
&
b
,
T
p
)
{
applyBinary
(
binary
::
MulScalar
<
T
>
(
p
),
b
);
applyBinary
(
binary
::
MulScalar
<
T
>
(
p
),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
DivScalar
,
ONE_PARAMETER
,
a
=
b
/
p
);
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
DivScalar
,
ONE_PARAMETER
,
a
=
b
/
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
divScalar
(
BaseMatrixT
&
b
,
T
p
)
{
void
BaseMatrixT
<
T
>::
divScalar
(
BaseMatrixT
&
b
,
T
p
)
{
applyBinary
(
binary
::
DivScalar
<
T
>
(
p
),
b
);
applyBinary
(
binary
::
DivScalar
<
T
>
(
p
),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
ScalarDiv
,
ONE_PARAMETER
,
a
=
p
/
b
);
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
ScalarDiv
,
ONE_PARAMETER
,
a
=
p
/
b
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
scalarDiv
(
BaseMatrixT
&
b
,
T
p
)
{
void
BaseMatrixT
<
T
>::
scalarDiv
(
BaseMatrixT
&
b
,
T
p
)
{
applyBinary
(
binary
::
ScalarDiv
<
T
>
(
p
),
b
);
applyBinary
(
binary
::
ScalarDiv
<
T
>
(
p
),
b
);
}
}
...
@@ -817,20 +908,20 @@ void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) {
DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy,
                         a = -c * log(b) - (1 - c) * log(1 - b));
template <>
void BaseMatrixT<real>::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::SoftCrossEntropy<real>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b)));
template <class T>
void BaseMatrixT<T>::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::SoftCrossEntropyBp<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy,
                         a = c > 0.5 ? -log(b) : -log(1.0 - b));
template <>
void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,
                                                BaseMatrixT& c) {
  if (useGpu_) {
...
@@ -858,70 +949,73 @@ void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,
DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp,
                         a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b));
template <class T>
void BaseMatrixT<T>::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::BinaryCrossEntropyBp<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(Add, a = b + c);
template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::Add<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c);
template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
  applyTernary(ternary::Add1<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c);
template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::Sub<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c);
template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
  applyTernary(ternary::Sub1<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c);
template <class T>
void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::Add2<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, THREE_PARAMETER,
                                   a = p1 * a + p2 * b + p3 * c);
template <class T>
void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
  applyTernary(ternary::Add3<T>(p1, p2, p3), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER,
                                   c = p2 * c - p1 * (b + p3 * a);
                                   a = a + c);
template <class T>
void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad
                               BaseMatrixT& c,  // mom
                               T p1,            // learningRate,
                               T p2,            // momentum,
                               T p3) {          // decayRate
  applyTernary(ternary::SgdUpdate<T>(p1, p2, p3), b, c);
}

DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER,
                                      c = p2 * c - p1 * d * (b + p3 * a);
                                      a += c);
template <class T>
void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad,
                               BaseMatrixT& c,  // mom,
                               BaseMatrixT& d,  // lr,
                               T p1,            // learningRate,
                               T p2,            // momentum,
                               T p3) {          // decayRate
  applyQuaternary(quaternary::SgdUpdate<T>(p1, p2, p3), b, c, d);
}
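In optimizer terms, the two SgdUpdate operators above are momentum SGD with weight decay. Writing the value matrix as w (operand a), the gradient as g (b), the momentum buffer as v (c), and p1/p2/p3 as the learning rate, momentum and decay rate, the ternary form computes

  v \leftarrow \mu v - \eta\,(g + \lambda w), \qquad w \leftarrow w + v

and the quaternary form additionally scales the step by a per-element learning-rate matrix d:

  v \leftarrow \mu v - \eta\, d \odot (g + \lambda w), \qquad w \leftarrow w + v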
...
@@ -929,19 +1023,22 @@ DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b;
                                  a = (a > lambda)
                                          ? (a - lambda)
                                          : (a < -lambda) ? (a + lambda) : 0);
template <class T>
void BaseMatrixT<T>::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) {
  applyBinary(binary::ApplyL1<T>(learningRate * decayRate), lr);
}

template <>
void BaseMatrixT<real>::applyL1(BaseMatrixT& lr,
                                real learningRate,
                                real decayRate) {
  if (useGpu_) {
    applyBinary(binary::ApplyL1<real>(learningRate * decayRate), lr);
  } else {
    simd::decayL1(this->data_,
                  this->data_,
                  lr.data_,
                  learningRate * decayRate,
                  height_ * width_);
  }
}

...
@@ -950,24 +1047,25 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p;
                                 a = (a > lambda)
                                         ? (a - lambda)
                                         : (a < -lambda) ? (a + lambda) : 0);
template <class T>
void BaseMatrixT<T>::applyL1(T learningRate, T decayRate) {
  applyUnary(unary::ApplyL1<T>(learningRate * decayRate));
}

template <>
void BaseMatrixT<real>::applyL1(real learningRate, real decayRate) {
  if (useGpu_) {
    applyUnary(unary::ApplyL1<real>(learningRate * decayRate));
  } else {
    simd::decayL1(this->data_,
                  this->data_,
                  learningRate * decayRate,
                  height_ * width_);
  }
}
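The ApplyL1 operators are the per-element soft-threshold used for L1 weight decay: with lambda = learningRate * decayRate (additionally scaled by the per-element learning rate in the overload that takes an lr matrix), each entry is shrunk toward zero,

  a \leftarrow \operatorname{sign}(a)\,\max(|a| - \lambda,\ 0),

which is exactly the three-way conditional in the macro; the CPU branch delegates the same computation to simd::decayL1.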
DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, ONE_PARAMETER,
                                  a *= (1.0f / (1.0f + p * b)));
template <class T>
void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
  if (useGpu_) {
    applyBinary(binary::ApplyL2<T>(learningRate * decayRate), lr);
...
@@ -980,32 +1078,33 @@ void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
  }
}

template <class T>
void BaseMatrixT<T>::applyL2(T learningRate, T decayRate) {
  BaseMatrixT<T>::mulScalar(1.0f / (1.0f + learningRate * decayRate));
}

DEFINE_MATRIX_BINARY_OP(DotMul, a *= b);
template <class T>
void BaseMatrixT<T>::dotMul(BaseMatrixT& b) {
  applyBinary(binary::DotMul<T>(), b);
}

DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c);
template <class T>
void BaseMatrixT<T>::dotMul(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::DotMul<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c);
template <class T>
void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::DotDiv<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, TWO_PARAMETER,
                                   a = (b + p1) / (c + p2));
template <class T>
void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
  applyTernary(ternary::DotDiv2P<T>(p1, p2), b, c);
}
...
@@ -1015,7 +1114,7 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c;
                                ? THRESHOLD
                                : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
                            a = log(1 + exp(a)) - a * d);
template <>
void BaseMatrixT<real>::rankLoss(BaseMatrixT& b,
                                 BaseMatrixT& c,
                                 BaseMatrixT& d) {
...
@@ -1026,8 +1125,9 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c;
                            a = (a > THRESHOLD)
                                ? THRESHOLD
                                : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
                            a = exp(a);
                            a = (a / (1 + a) - d));
template <>
void BaseMatrixT<real>::rankLossBp(BaseMatrixT& b,
                                   BaseMatrixT& c,
                                   BaseMatrixT& d) {
...
@@ -1040,7 +1140,7 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0;
                             ? -THRESHOLD
                             : b;
                         a = log(1 + exp(x)) - c * x);
template <>
void BaseMatrixT<real>::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::LogisticRegressionLoss<real>(), b, c);
}
...
@@ -1050,22 +1150,23 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0;
                         T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
                                                                 ? -THRESHOLD
                                                                 : b;
                         x = exp(x);
                         a = x / (1 + x) - c);
template <>
void BaseMatrixT<real>::logisticRegressionLossBp(BaseMatrixT& b,
                                                 BaseMatrixT& c) {
  applyTernary(ternary::LogisticRegressionLossBp<real>(), b, c);
}
DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f);
template <class T>
void BaseMatrixT<T>::biggerThan(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::BiggerThan<T>(), b, c);
}

DEFINE_MATRIX_QUATERNARY_OP(
    BiggerThan,
    a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
template <class T>
void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
                                BaseMatrixT& c,
                                BaseMatrixT& d) {
...
@@ -1073,25 +1174,34 @@ void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
}

DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
template <class T>
void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::Max<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, ONE_PARAMETER,
                                   c += ((a > p) == (b > p)) ? 0.0f : 1.0f);
template <class T>
void BaseMatrixT<T>::binaryClassificationError2(size_t destCol,
                                                BaseMatrixT& b,
                                                BaseMatrixT& c,
                                                T p) {
  CHECK(!useGpu_) << "do not support gpu";
  MatrixOffset offset(0, 0, 0, 0, destCol, 0);
  int numRows = b.height_;
  int numCols = b.width_;
  b.applyTernary(ternary::BinaryClassificationError<T>(p),
                 c,
                 *this,
                 numRows,
                 numCols,
                 offset,
                 false_type(),
                 true_type() /*cAsColVector*/);
}

template <>
void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
                                                  BaseMatrixT& b,
                                                  BaseMatrixT& c,
...
@@ -1099,127 +1209,148 @@ void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
  int numRows = b.height_;
  int numCols = b.width_;
  aggregate(aggregate::sum(),
            base::binary::classificationError(p),
            base::binary::add(),
            b,
            c,
            numRows,
            numCols,
            offset,
            false_type(),
            true_type() /*aAsColVector*/);
}
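A reading of this aggregate call (a sketch inferred from the operands, not verified against the classificationError functor): with the aAsColVector flag set, row i of b and c is combined elementwise by base::binary::classificationError(p) and summed, and the result lands in column destCol of this matrix, roughly

  \text{this}(i, \text{destCol}) = \sum_j \mathbf{1}\big[(b_{ij} > p) \ne (c_{ij} > p)\big],

mirroring the CPU-only binaryClassificationError2 path above, which counts the positions where the output and label fall on opposite sides of the threshold p.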
DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, THREE_PARAMETER,
                                      a = p1 * b + p2 * c + p3 * d);
template <class T>
void BaseMatrixT<T>::add3(BaseMatrixT& b,
                          BaseMatrixT& c,
                          BaseMatrixT& d,
                          T p1,
                          T p2,
                          T p3) {
  applyQuaternary(quaternary::Add3<T>(p1, p2, p3), b, c, d);
}

DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c);
template <class T>
void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::DotMulSquare<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c);
template <class T>
void BaseMatrixT<T>::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::DotSquareSquare<T>(), b, c);
}

DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b);
template <class T>
void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b) {
  applyBinary(binary::DotMulSquare<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b);
template <class T>
void BaseMatrixT<T>::dotSquareMul(BaseMatrixT& b) {
  applyBinary(binary::DotSquareMul<T>(), b);
}

DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, THREE_PARAMETER,
                                      T tmp = p1 * b + p2 * c + p3 * d;
                                      a += tmp * tmp);
template <class T>
void BaseMatrixT<T>::addSquareSum(
    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) {
  applyQuaternary(quaternary::AddSquareSum<T>(p1, p2, p3), b, c, d);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b);
template <class T>
void BaseMatrixT<T>::addSquare(BaseMatrixT& b, T p) {
  applyBinary(binary::AddSquare<T>(p), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, TWO_PARAMETER,
                                  a = p1 * a + p2 * b * b);
template <class T>
void BaseMatrixT<T>::decayAddSquare(BaseMatrixT& b, T p1, T p2) {
  applyBinary(binary::DecayAddSquare<T>(p1, p2), b);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, TWO_PARAMETER,
                                   a = p1 * a + p2 * b * b * c * c);
template <class T>
void BaseMatrixT<T>::decayAddSquareMul(BaseMatrixT& b,
                                       BaseMatrixT& c,
                                       T p1,
                                       T p2) {
  applyTernary(ternary::DecayAddSquareMul<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, THREE_PARAMETER,
                                   a = 1 / (p1 * b + p2 * c + p3));
template <class T>
void BaseMatrixT<T>::reciprocalSum(
    BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
  applyTernary(ternary::ReciprocalSum<T>(p1, p2, p3), b, c);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER,
                                  a = 1 / (p1 * b + p2));
template <class T>
void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
  applyBinary(binary::Reciprocal2<T>(p1, p2), b);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, TWO_PARAMETER,
                                   T tmp = p1 * b + p2 * c;
                                   a *= tmp * tmp);
template <class T>
void BaseMatrixT<T>::dotMulSquareSum(BaseMatrixT& b,
                                     BaseMatrixT& c,
                                     T p1,
                                     T p2) {
  applyTernary(ternary::DotMulSquareSum<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, TWO_PARAMETER,
                                   T tmp = p1 * b + p2 * c;
                                   a = tmp * tmp);
template <class T>
void BaseMatrixT<T>::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
  applyTernary(ternary::DotSquareSum<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, TWO_PARAMETER,
                                   a *= p1 * b + p2 * c);
template <class T>
void BaseMatrixT<T>::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
  applyTernary(ternary::DotMulSum<T>(p1, p2), b, c);
}

DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0);
template <class T>
void BaseMatrixT<T>::copyAndClear(BaseMatrixT& b) {
  applyBinary(binary::CopyAndClear<T>(), b);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, TWO_PARAMETER,
                                   a = p1 * a + p2 * b * c);
template <class T>
void BaseMatrixT<T>::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
  applyTernary(ternary::AddDotMul<T>(p1, p2), b, c);
}

DEFINE_MATRIX_BINARY_OP(Assign, a = b;);
template <class T>
void BaseMatrixT<T>::assign(BaseMatrixT& b) {
  if (useGpu_) {
    applyBinary(binary::Assign<T>(), b);
...
@@ -1230,7 +1361,7 @@ void BaseMatrixT<T>::assign(BaseMatrixT& b) {
  }
}

template <class T>
void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
  if (columnOffset + b.width_ <= width_) {
    int numRows = height_;
...
@@ -1250,24 +1381,31 @@ void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
}

DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp);
template <class T>
void BaseMatrixT<T>::deepSwap(BaseMatrixT& b) {
  applyBinary(binary::DeepSwap<T>(), b);
}

template <>
void BaseMatrixT<real>::rowDotMul(size_t destCol,
                                  BaseMatrixT& b,
                                  BaseMatrixT& c) {
  int numRows = b.height_;
  int numCols = b.width_;
  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
  aggregate(aggregate::sum(),
            base::binary::mul(),
            base::binary::add(),
            b,
            c,
            numRows,
            numCols,
            offset,
            false_type(),
            true_type() /*aAsColVector*/);
}

template <class T>
void BaseMatrixT<T>::rowDotMul2(size_t destCol,
                                BaseMatrixT& b,
                                BaseMatrixT& c) {
...
@@ -1290,17 +1428,24 @@ void BaseMatrixT<T>::rowDotMul2(size_t destCol,
  }
}

template <>
void BaseMatrixT<real>::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  int numRows = b.height_;
  int numCols = b.width_;
  aggregate(aggregate::sum(),
            base::binary::mul(),
            base::binary::add(),
            b,
            c,
            numRows,
            numCols,
            offset,
            true_type() /*aAsRowVector*/,
            false_type());
}

template <class T>
void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
  CHECK(!useGpu_) << "do not support gpu";
...
@@ -1321,16 +1466,22 @@ void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
}

DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c);
template <class T>
void BaseMatrixT<T>::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::addDotMulMMV<T>(),
               b,
               c,
               numRows,
               numCols,
               offset,
               true_type() /*cAsRowVector*/,
               false_type());
}

template <class T>
void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
  CHECK(!useGpu_) << "do not support gpu";
...
@@ -1350,16 +1501,22 @@ void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
  }
}

template <class T>
void BaseMatrixT<T>::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::DotMul<T>(),
               b,
               c,
               numRows,
               numCols,
               offset,
               false_type(),
               true_type() /*cAsColVector*/);
}

template <class T>
void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  CHECK(!useGpu_) << "do not support gpu";
...
@@ -1379,52 +1536,82 @@ void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  }
}

template <class T>
void BaseMatrixT<T>::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::DotMul<T>(),
               b,
               c,
               numRows,
               numCols,
               offset,
               true_type() /* cAsRowVector */,
               false_type() /* cAsColVector */);
}

template <class T>
void BaseMatrixT<T>::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::addDotMulMMV<T>(),
               b,
               c,
               numRows,
               numCols,
               offset,
               true_type() /* cAsRowVector */,
               false_type() /* cAsColVector */);
}

template <class T>
void BaseMatrixT<T>::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::addDotMulMMV<T>(),
               b,
               c,
               numRows,
               numCols,
               offset,
               false_type(),
               true_type() /*cAsColVector*/);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c);
template <class T>
void BaseMatrixT<T>::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) {
  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::RowAdd<T>(p),
               b,
               c,
               numRows,
               numCols,
               offset,
               false_type(),
               true_type() /*cAsColVector*/);
}
DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c));
template <>
void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  if (useGpu_) {
    MatrixOffset offset(0, 0, 0, 0, cCol, 0);
    int numRows = height_;
    int numCols = width_;
    applyTernary(ternary::RowPow<real>(),
                 b,
                 c,
                 numRows,
                 numCols,
                 offset,
                 false_type(),
                 true_type() /*cAsColVector*/);
  } else {
    size_t height = this->height_;
    size_t width = this->width_;
...
@@ -1441,44 +1628,64 @@ void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  }
}

template <class T>
void BaseMatrixT<T>::mulRowVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::DotMul<T>(),
              b,
              numRows,
              numCols,
              offset,
              true_type() /* bAsRowVector */,
              false_type());
}

DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b);
template <class T>
void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::DotDiv<T>(),
              b,
              numRows,
              numCols,
              offset,
              true_type() /* bAsRowVector */,
              false_type());
}

template <class T>
void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::DotMul<T>(),
              b,
              numRows,
              numCols,
              offset,
              false_type(),
              true_type() /* bAsColVector */);
}

template <class T>
void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::DotDiv<T>(),
              b,
              numRows,
              numCols,
              offset,
              false_type(),
              true_type() /* bAsColVector */);
}
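These four helpers broadcast a vector over the matrix: with bAsRowVector the argument is read as a 1 x width row applied to every row, and with bAsColVector as a height x 1 column applied across each row. A small sketch (names and shapes are illustrative only):

  // A is h x w, rowV is 1 x w, colV is h x 1 (all BaseMatrixT<real>).
  A.mulRowVector(rowV);  // A[i][j] *= rowV[0][j]
  A.divRowVector(rowV);  // A[i][j] /= rowV[0][j]
  A.mulColVector(colV);  // A[i][j] *= colV[i][0]
  A.divColVector(colV);  // A[i][j] /= colV[i][0]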
template <>
template <class Agg>
int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
...
@@ -1486,13 +1693,20 @@ int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
  size_t numCols = b.width_;
  CHECK_EQ(height_, numRows);
  CHECK_EQ(width_, 1UL);
  aggregate(agg,
            base::unary::identity(),
            base::binary::second(),
            b,
            numRows,
            numCols,
            offset,
            false_type(),
            true_type() /*aAsColVector*/);
  return 0;
}

template <>
template <class Agg, class Saver>
int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
...
@@ -1500,16 +1714,25 @@ int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
  size_t numCols = b.width_;
  CHECK_EQ(height_, numRows);
  CHECK_EQ(width_, 1UL);
  aggregate(agg,
            base::unary::identity(),
            sv,
            b,
            numRows,
            numCols,
            offset,
            false_type(),
            true_type() /*aAsColVector*/);
  return 0;
}
template <>
template <class Agg>
int BaseMatrixT<real>::applyRow(
    Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
  if (scaleDest != 0) {
    applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b);
  } else {
...
@@ -1521,10 +1744,10 @@ int BaseMatrixT<real>::applyRow(
  return 0;
}

template <>
template <class Agg, class Op, class Saver>
int BaseMatrixT<real>::applyRow(
    Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  size_t numRows = b.height_;
  size_t numCols = b.width_;
...
@@ -1532,16 +1755,27 @@ int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv,
  CHECK_EQ(width_, 1UL);
  CHECK_EQ(c.height_, numRows);
  CHECK_EQ(c.width_, numCols);
  aggregate(agg,
            op,
            sv,
            b,
            c,
            numRows,
            numCols,
            offset,
            false_type(),
            true_type() /*aAsColVector*/);
  return 0;
}
template <>
template <class Agg, class Op>
int BaseMatrixT<real>::applyRow(Agg agg,
                                Op op,
                                real scaleDest,
                                real scaleAgg,
                                BaseMatrixT& b,
                                BaseMatrixT& c) {
  if (scaleDest != 0) {
    applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c);
  } else {
...
@@ -1553,7 +1787,7 @@ int BaseMatrixT<real>::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg,
  return 0;
}

template <>
template <class Agg>
int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
...
@@ -1561,13 +1795,20 @@ int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
  size_t numCols = b.width_;
  CHECK_EQ(width_, numCols);
  CHECK_EQ(height_, 1UL);
  aggregate(agg,
            base::unary::identity(),
            base::binary::second(),
            b,
            numRows,
            numCols,
            offset,
            true_type() /*aAsRowVector*/,
            false_type());
  return 0;
}
template <>
template <class Agg, class Saver>
int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
...
@@ -1575,16 +1816,25 @@ int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
  size_t numCols = b.width_;
  CHECK_EQ(width_, numCols);
  CHECK_EQ(height_, 1UL);
  aggregate(agg,
            base::unary::identity(),
            sv,
            b,
            numRows,
            numCols,
            offset,
            true_type() /*aAsRowVector*/,
            false_type());
  return 0;
}
template <>
template <class Agg>
int BaseMatrixT<real>::applyCol(
    Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
  if (scaleDest != 0) {
    applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b);
  } else {
...
@@ -1596,48 +1846,51 @@ int BaseMatrixT<real>::applyCol(
  return 0;
}

template <>
void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) {
  applyRow(aggregate::sum(), scaleDest, scaleSum, b);
}

template <>
void BaseMatrixT<real>::maxRows(BaseMatrixT& b) {
  applyRow(aggregate::max(), b);
}

template <>
void BaseMatrixT<real>::minRows(BaseMatrixT& b) {
  applyRow(aggregate::min(), b);
}

template <>
void BaseMatrixT<real>::maxCols(BaseMatrixT& b) {
  applyCol(aggregate::max(), b);
}

template <>
void BaseMatrixT<real>::minCols(BaseMatrixT& b) {
  applyCol(aggregate::min(), b);
}

template <>
void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) {
  applyCol(aggregate::sum(), scaleDest, scaleSum, b);
}

template <>
void BaseMatrixT<real>::sumOfSquaredDiffs(BaseMatrixT& b,
                                          BaseMatrixT& c,
                                          real scaleSum,
                                          real scaleDest) {
  applyRow(
      aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c);
}

template <>
void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
                                      BaseMatrixT& c,
                                      real scaleSum,
                                      real scaleDest) {
  applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c);
}

template class BaseMatrixT<real>;
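Shape convention implied by the applyRow/applyCol checks above: the destination of a row aggregation must be a height x 1 column vector, and the destination of a column aggregation a 1 x width row vector. As a sketch of the semantics that follow from the code (not a separate specification), sumRows(b, scaleSum, scaleDest) computes

  a_i \leftarrow \text{scaleDest}\cdot a_i + \text{scaleSum}\cdot \sum_j b_{ij},

with maxRows/minRows replacing the sum by a max/min (and no rescaling), and sumCols/maxCols/minCols being the transposed, per-column versions.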
...
paddle/math/TrainingAlgorithmOp.cu
View file @ 59a8ebc6
...
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "BaseMatrix.h"
#include "TrainingAlgorithmOp.h"
#include "paddle/utils/Logging.h"

#if __cplusplus > 199711L
...
@@ -32,10 +32,10 @@ void sparseMomentumApply(BaseMatrix& value,
                         real tau,
                         real learningRate) {
  auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad);
  auto expr2 =
      momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad);
  auto expr3 = value.lazyAssign(
      (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV);
  AssignEvaluate(expr1, expr2, expr3);
}
...
@@ -52,12 +52,12 @@ void adadeltaApply(BaseMatrix& value,
                   real momentum,
                   real decayRate) {
  auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square());
  auto expr2 =
      lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt());
  auto expr3 = accum_update.lazyAssign(
      rou * accum_update + ((real)1 - rou) * (grad * lr).square());
  auto expr4 = mom.lazyAssign(
      mom * momentum - learningRate * lr * (grad + value * decayRate));
  auto expr5 = value.lazyAssign(value + mom);
  AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
...
@@ -74,10 +74,10 @@ void adagradApply(BaseMatrix& value,
                  real momentum,
                  real decayRate) {
  auto expr1 = accum.lazyAssign(accum + grad.square());
  auto expr2 =
      lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal());
  auto expr3 = mom.lazyAssign(
      mom * momentum - learningRate * lr * (grad + value * decayRate));
  auto expr4 = value.lazyAssign(value + mom);
  AssignEvaluate(expr1, expr2, expr3, expr4);
...
@@ -98,8 +98,8 @@ void rmspropApply(BaseMatrix& value,
                  bool firstTime) {
  auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad);
  auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
  auto expr4 = mom.lazyAssign(
      mom * momentum - learningRate * lr * (grad + value * decayRate));
  auto expr5 = value.lazyAssign(value + mom);
  if (firstTime) {
...
@@ -107,8 +107,8 @@ void rmspropApply(BaseMatrix& value,
    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
  } else {
    auto expr1 = g.lazyAssign(
        accumulatedRou * g + ((real)1 - rou) * grad.square());
    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
  }
...
@@ -127,8 +127,8 @@ void decayedAdagradApply(BaseMatrix& value,
                         real decayRate,
                         bool firstTime) {
  auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
  auto expr3 = mom.lazyAssign(
      mom * momentum - learningRate * lr * (grad + value * decayRate));
  auto expr4 = value.lazyAssign(value + mom);
  if (firstTime) {
...
@@ -136,8 +136,8 @@ void decayedAdagradApply(BaseMatrix& value,
    AssignEvaluate(expr1, expr2, expr3, expr4);
  } else {
    auto expr1 = accum.lazyAssign(
        accumulatedRou * accum + ((real)1 - rou) * grad.square());
    AssignEvaluate(expr1, expr2, expr3, expr4);
  }
...
@@ -153,13 +153,12 @@ void adamApply(BaseMatrix& value,
               real beta2_power,
               real epsilon,
               real learningRate) {
  real alpha =
      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
  auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square());
  auto expr3 = value.lazyAssign(
      value - (mom * alpha) / (v.sqrt() + epsilon));
  AssignEvaluate(expr1, expr2, expr3);
}
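For reference, this is the standard Adam step written out; the alpha computed above folds the bias corrections into the learning rate:

  m_t = \beta_1 m_{t-1} + (1-\beta_1)\, g_t
  v_t = \beta_2 v_{t-1} + (1-\beta_2)\, g_t^2
  \theta_t = \theta_{t-1} - \eta\,\frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t}\cdot\frac{m_t}{\sqrt{v_t}+\epsilon}

with beta1_power and beta2_power playing the roles of \beta_1^t and \beta_2^t.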
...
@@ -173,10 +172,10 @@ void adamaxApply(BaseMatrix& value,
                 int64_t step,
                 real alpha) {
  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
  auto expr2 = u.lazyAssign(
      (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
  auto expr3 = value.lazyAssign(
      value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
  AssignEvaluate(expr1, expr2, expr3);
}
...
@@ -322,8 +321,8 @@ void adamApply(BaseMatrix& value,
               real beta2_power,
               real epsilon,
               real learningRate) {
  real alpha =
      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);

  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
  mom = beta1 * mom + ((real)1 - beta1) * grad;
...
@@ -331,7 +330,7 @@ void adamApply(BaseMatrix& value,
  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
  v = beta2 * v + ((real)1 - beta2) * grad.square();

  value -= (mom * alpha) / (v.sqrt() + epsilon);
}

void adamaxApply(BaseMatrix& value,
...
...
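Spelled out, the Adam step that both the lazyAssign and the plain BaseMatrix hunks above implement is, writing m for mom, v for v, g for grad, \theta for value, \eta for learningRate, and beta1_power, beta2_power for \beta_1^t, \beta_2^t (this is only a restatement of the code above):

    m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
    v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
    \alpha_t = \eta \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)
    \theta_t = \theta_{t-1} - \alpha_t m_t / (\sqrt{v_t} + \epsilon)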
paddle/math/tests/test_Tensor.cu
View file @
59a8ebc6
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include "TensorCheck.h"
#include "paddle/math/Matrix.h"

using paddle::Matrix;
using paddle::CpuMatrix;
...
@@ -26,25 +26,25 @@ using paddle::GpuIVector;
using autotest::TensorCheckEqual;
using autotest::TensorCheckErr;

#define INIT_UNARY(A1, A2)  \
  Tensor A1(height, width); \
  Tensor A2(height, width); \
  A1.randomizeUniform();    \
  A2.copyFrom(A1)
#define INIT_BINARY(A1, A2, B) \
  INIT_UNARY(A1, A2);          \
  Tensor B(height, width);     \
  B.randomizeUniform()
#define INIT_TERNARY(A1, A2, B, C) \
  INIT_BINARY(A1, A2, B);          \
  Tensor C(height, width);         \
  C.randomizeUniform()
#define INIT_QUATERNARY(A1, A2, B, C, D) \
  INIT_TERNARY(A1, A2, B, C);            \
  Tensor D(height, width);               \
  D.randomizeUniform()

template <typename Tensor>
struct TestUnaryMatrix {
  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
...
@@ -59,7 +59,7 @@ struct TestUnaryMatrix {
  }
};

template <typename Tensor>
struct TestBinaryMatrix {
  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B)> BinaryFunc;
...
@@ -74,10 +74,10 @@ struct TestBinaryMatrix {
  }
};

template <typename Tensor>
struct TestTernaryMatrix {
  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)>
      TernaryFunc;

  explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) {
    for (auto height : {1, 11, 73, 128, 200, 330}) {
...
@@ -90,10 +90,11 @@ struct TestTernaryMatrix {
  }
};

template <typename Tensor>
struct TestQuaternaryMatrix {
  typedef std::function<void(
      Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)>
      QuaternaryFunc;

  explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) {
    for (auto height : {1, 11, 73, 128, 200, 330}) {
...
@@ -106,7 +107,7 @@ struct TestQuaternaryMatrix {
  }
};

template <typename Tensor, class T>
struct TestUnaryVectorT {
  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
...
@@ -142,11 +143,11 @@ void SetTensorValue(Matrix& matrix, real value) {
  }
}

template <typename Tensor>
void testTensorAddScalar(Tensor& A1, Tensor& A2) {
  real p1 = 2.5;
  real p2 = 3.0;
  A1.add(p1);  // a += p
  A2 += p1;
  TensorCheckEqual(A1, A2);
...
@@ -155,7 +156,7 @@ void testTensorAddScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSubScalar(Tensor& A1, Tensor& A2) {
  real p = 2.5;
  A1.subScalar(p);  // a -= p
...
@@ -163,7 +164,7 @@ void testTensorSubScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorMulScalar(Tensor& A1, Tensor& A2) {
  real p = 2.5;
  A1.mulScalar(p);  // a *= p
...
@@ -177,7 +178,7 @@ void testTensorMulScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorDivScalar(Tensor& A1, Tensor& A2) {
  real p = 2.5;
  A1.divScalar(p);  // a /= p
...
@@ -185,44 +186,44 @@ void testTensorDivScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorNeg(Tensor& A1, Tensor& A2) {
  A1.neg();  // a = -a
  A2 = -A2;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAbs(Tensor& A1, Tensor& A2) {
  A1.abs2();  // a = a > 0 ? a : -a
  A2 = A2.abs();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSquare(Tensor& A1, Tensor& A2) {
  A1.square2();  // a = a * a
  A2 = A2.square();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2) {
  A1.reciprocal2();  // a = 1.0f / a
  A2 = A2.reciprocal();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSign(Tensor& A1, Tensor& A2) {
  A1.sign2();  // a = (a > 0) - (a < 0)
  A2 = A2.sign();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAssign(Tensor& A1, Tensor& A2) {
  A1.assign(1.5);  // a = p
  A2 = A2.constant(1.5);
  TensorCheckEqual(A1, A2);
...
@@ -235,7 +236,7 @@ void testTensorAssign(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
  testTensorAddScalar(A1, A2);
  testTensorSubScalar(A1, A2);
...
@@ -249,9 +250,9 @@ void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
  testTensorAssign(A1, A2);
}

template <typename Tensor>
void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
  A1.add(2);  // a += p
  A2 += 2;
  TensorCheckEqual(A1, A2);
...
@@ -266,46 +267,46 @@ void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
TEST(Unary, BaseOp) {
  TestUnaryMatrix<CpuMatrix> testCpuMatrix(testUnaryBaseOp<CpuMatrix>);
  TestUnaryVectorT<CpuVector, real> testCpuVector(testUnaryBaseOp<CpuVector>);
  TestUnaryVectorT<CpuIVector, int> testCpuIVector(
      testUnaryBaseOpInt<CpuIVector>);

#ifndef PADDLE_ONLY_CPU
  TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
  TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
  TestUnaryVectorT<GpuIVector, int> testGpuIVector(
      testUnaryBaseOpInt<GpuIVector>);
#endif
}

template <typename Tensor>
void testTensorExp(Tensor& A1, Tensor& A2) {
  A1.exp2();  // a = exp(a)
  A2 = A2.exp();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorLog(Tensor& A1, Tensor& A2) {
  A1.log2();  // a = log(a)
  A2 = A2.log();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSqrt(Tensor& A1, Tensor& A2) {
  A1.sqrt2();  // a = sqrt(a)
  A2 = A2.sqrt();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorPow(Tensor& A1, Tensor& A2) {
  A1.pow2(3.2);  // a = pow(a, p)
  A2 = A2.pow(3.2);
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testUnayrMathOp(Tensor& A1, Tensor& A2) {
  testTensorExp(A1, A2);
  testTensorLog(A1, A2);
...
@@ -321,7 +322,7 @@ TEST(Unary, MathOp) {
#endif
}

template <typename Tensor>
void testTensorClip(Tensor& A1, Tensor& A2) {
  real p1 = 0.003f;
  real p2 = 0.877f;
...
@@ -331,7 +332,7 @@ void testTensorClip(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
  real p = 0.5f;
  A1.biggerThanScalar(p);  // a = a > p ? 1.0f : 0.0f
...
@@ -339,7 +340,7 @@ void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorapplyL1(Tensor& A1, Tensor& A2) {
  /**
   * T lambda = p;
...
@@ -351,14 +352,15 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2) {
  real learningRate = 0.7f;
  real decayRate = 0.6f;
  A1.applyL1(learningRate, decayRate);
  A2 = (A2 > (learningRate * decayRate))
           .condition(
               (A2 - (learningRate * decayRate)),
               (A2 < -(learningRate * decayRate))
                   .condition((A2 + (learningRate * decayRate)), (real)0.0));
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
  testTensorClip(A1, A2);
  testTensorBiggerThanScalar(A1, A2);
...
@@ -377,7 +379,7 @@ TEST(Unary, CompareOp) {
#endif
}

template <typename Tensor>
void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
  real p1 = 2.5;
  real p2 = 3.2;
...
@@ -406,7 +408,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
  real p = 2.5;
  A1.sub(B);  // a -= b
...
@@ -422,7 +424,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
  real p = 2.5;
  A1.mulScalar(B, p);  // a = b * p
...
@@ -442,7 +444,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
  real p = 2.5;
  A1.divScalar(B, p);  // a = b / p
...
@@ -454,28 +456,28 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.assign(B);  // a = b
  A2 = B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) {
  B.square2(A1);  // b = a * a
  A2 = B.square();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.squareDerivative(B);  // a *= 2.0 * b
  A2 = A2 * (real)2.0 * B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
  B.reciprocal2(A1);  // b = 1.0f / a
  A2 = B.reciprocal();
...
@@ -490,33 +492,33 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
  real learningRate = 0.7f;
  real decayRate = 1.2f;
  A1.applyL2(B, learningRate, decayRate);  // a *= (1.0f / (1.0f + p * b))
  A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B)
            .reciprocal();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.reciprocalDerivative(B);  // a *= -b * b
  A2 *= (-B) * B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) {
  B.sign2(A1);  // b = a > 0.0f ? 1.0f : -1.0f
  A2 = B.sign();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) {
  B.abs2(A1);  // b = a > 0.0f ? a : -a
  A2 = B.abs();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
  testTensorAdd(A1, A2, B);
  testTensorSub(A1, A2, B);
...
@@ -539,7 +541,7 @@ TEST(Binary, BaseOp) {
#endif
}

template <typename Tensor>
void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = exp(b)
  A1.exp2(B);
...
@@ -547,14 +549,14 @@ void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.expDerivative(B);  // a *= b
  A2 *= B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = log(b)
  A1.log2(B);
...
@@ -562,7 +564,7 @@ void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = sqrt(b)
  A1.sqrt2(B);
...
@@ -570,7 +572,7 @@ void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = 1.0f / sqrt(b)
  A1.invSqrt(B);
...
@@ -578,14 +580,14 @@ void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.pow2(B, 2.5f);  // a = pow(b, p)
  A2 = B.pow(2.5f);
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
   * const T THRESHOLD = 40.0;
...
@@ -597,12 +599,14 @@ void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  real THRESHOLD = 40.0;
  A2 = (B.constant(1.0f) +
        (B > THRESHOLD)
            .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))
            .exp())
           .log();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
   * const T THRESHOLD = 40.0;
...
@@ -612,14 +616,16 @@ void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   */
  A1.softreluDerivative(B);
  real THRESHOLD = 40.0;
  A2 = A2 *
       (B.constant(1.0f) -
        (B.constant(-1.0f) *
         (B > THRESHOLD)
             .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)))
            .exp());
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
  const T THRESHOLD_MIN = -40.0;
...
@@ -632,46 +638,47 @@ void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
  const real THRESHOLD_MIN = -40.0;
  const real THRESHOLD_MAX = 13.0;
  auto tmp = (B < THRESHOLD_MIN)
                 .condition(THRESHOLD_MIN,
                            (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B));
  A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.sigmoidDerivative(B);  // a *= b * (1 - b)
  A2 *= B * (B.constant(1.0f) - B);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) {
  B.tanh(A1);  // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
  A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f;
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.tanhDerivative(B);  // a *= 1 - b * b
  A2 *= B.constant(1.0f) - B * B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) {
  real p1 = 2.5;
  real p2 = 3.1;
  // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)
  B.scaledTanh(A1, p1, p2);
  A2 = B.constant(p1) *
       (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) -
        (real)1.0);
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  real p1 = 2.5;
  real p2 = 3.1;
...
@@ -681,7 +688,7 @@ void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
  testTensorTanhDerivative(A1, A2, B);
  testTensorScaledTanhDerivative(A1, A2, B);
...
@@ -708,21 +715,21 @@ TEST(Binary, MathOp) {
#endif
}

template <typename Tensor>
void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) {
  B.relu(A1);  // b = a > 0.0f ? a : 0.0f
  A2 = (B > (real)0.0f).condition(B, (real)0.0f);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.reluDerivative(B);  // a *= (b > 0.0f ? 1.0f : 0.0f)
  A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
   * b = a > p1 ? a : p1
...
@@ -736,7 +743,7 @@ void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  SetTensorValue(B, 32.0f);
  /*
...
@@ -748,15 +755,15 @@ void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.absDerivative(B);  // a = (b > 0) ? a : (b < 0) ? -a : 0
  A2 = (B > (real)0.0f)
           .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f));
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
  real p = 0.613;
  SetTensorValue(B, p);
...
@@ -765,7 +772,7 @@ void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
  /**
   * T lambda = p * b;
...
@@ -778,12 +785,13 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
  real decayRate = 0.6f;
  A1.applyL1(B, learningRate, decayRate);
  auto lambda = B.constant(learningRate * decayRate) * B;
  A2 = (A2 > lambda)
           .condition((A2 - lambda),
                      (A2 < -lambda).condition((A2 + lambda), (real)0.0f));
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
  B.subScalar(0.5f);
  SetTensorValue(B, 0.0f);
...
@@ -807,7 +815,7 @@ TEST(Binary, CompareOp) {
#endif
}

template <typename Tensor>
void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.add(B, C);  // a = b + c
  A2 = B + C;
...
@@ -833,7 +841,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.sub(B, C);  // a = b - c
  A2 = B - C;
...
@@ -846,7 +854,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.dotMul(B, C);  // a = b * c
  A2 = B * C;
...
@@ -892,7 +900,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.dotDiv(B, C);  // a = (b == 0.0) ? 0.0 : b / c
  A2 = (B == (real)0.0).condition((real)0.0, B / C);
...
@@ -905,7 +913,7 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  real p1 = 1.5;
  real p2 = 2.5;
...
@@ -915,14 +923,14 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.softCrossEntropy(B, C);  // a = -c * log(b) - (1 - c) * log(1 - b)
  A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSoftCrossEntropyBp(Tensor& A1,
                                  Tensor& A2,
                                  Tensor& B,
...
@@ -932,7 +940,7 @@ void testTensorSoftCrossEntropyBp(Tensor& A1,
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  testTensorAdd(A1, A2, B, C);
  testTensorSub(A1, A2, B, C);
...
@@ -952,30 +960,30 @@ TEST(Ternary, BaseOp) {
#endif
}

template <typename Tensor>
void testTensorBinaryLabelCrossEntropy(Tensor& A1,
                                       Tensor& A2,
                                       Tensor& B,
                                       Tensor& C) {
  A1.binaryLabelCrossEntropy(B, C);  // a = c > 0.5 ? -log(b) : -log(1.0 - b)
  A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log()));
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorBinaryLabelCrossEntropyBp(Tensor& A1,
                                         Tensor& A2,
                                         Tensor& B,
                                         Tensor& C) {
  // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)
  A1.binaryLabelCrossEntropyBp(B, C);
  A2 += (C > (real)0.5)
            .condition((B.constant(-1.0f) / B),
                       (B.constant(1.0f) - B).reciprocal());
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorLogisticRegressionLoss(Tensor& A1,
                                      Tensor& A2,
                                      Tensor& B,
...
@@ -991,13 +999,14 @@ void testTensorLogisticRegressionLoss(Tensor& A1,
   */
  A1.logisticRegressionLoss(B, C);
  real THRESHOLD = 40.0;
  auto tmp =
      (B > THRESHOLD)
          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
  A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp;
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorLogisticRegressionLossBp(Tensor& A1,
                                        Tensor& A2,
                                        Tensor& B,
...
@@ -1013,28 +1022,29 @@ void testTensorLogisticRegressionLossBp(Tensor& A1,
   */
  A1.logisticRegressionLossBp(B, C);
  real THRESHOLD = 40.0;
  auto tmp =
      (B > THRESHOLD)
          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
  auto tmp2 = tmp.exp();
  A2 = tmp2 / (C.constant(1.0) + tmp2) - C;
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.biggerThan(B, C);  // a = (b > c) ? 1.0f : 0.0f
  A2 = (B > C).condition((real)1.0f, (real)0.0f);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.max2(B, C);  // a = (b > c) ? b : c
  A2 = (B > C).condition(B, C);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C);
  testTensorBinaryLabelCrossEntropy(A1, A2, B, C);
...
@@ -1053,12 +1063,9 @@ TEST(Ternary, CompareOp) {
#endif
}

template <typename Tensor>
void testQuaternaryAdd(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f);  // a = p1 * b + p2 * c + p3 * d
  // A2 = B * 1.5f + C * 2.5f + D * 3.5f;
  // TensorCheckEqual(A1, A2);
...
@@ -1084,25 +1091,19 @@ TEST(Quaternary, BaseOp) {
#endif
}

template <typename Tensor>
void testTensorBiggerThan(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
  A1.biggerThan(B, C, D);
  A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5))
           .condition((real)1.0, (real)0.0);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorRankLoss(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  /**
   * const T THRESHOLD = 40.0; a = b - c;
   * a = (a > THRESHOLD)
...
@@ -1114,19 +1115,17 @@ void testTensorRankLoss(Tensor& A1,
  real THRESHOLD = 40.0;
  auto tmp = B - C;
  auto tmp2 =
      (tmp > THRESHOLD)
          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
  A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D;

  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorRankLossBp(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  /**
   * const T THRESHOLD = 40.0; a = b - c;
   * a = (a > THRESHOLD)
...
@@ -1137,20 +1136,18 @@ void testTensorRankLossBp(Tensor& A1,
  A1.rankLossBp(B, C, D);
  real THRESHOLD = 40.0;
  auto tmp = B - C;
  auto tmp2 =
      (tmp > THRESHOLD)
          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
  auto tmp3 = tmp2.exp();
  A2 = tmp3 / (D.constant(1.0f) + tmp3) - D;
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testQuaternaryCompareOp(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  testTensorBiggerThan(A1, A2, B, C, D);
  testTensorRankLoss(A1, A2, B, C, D);
  testTensorRankLossBp(A1, A2, B, C, D);
...
paddle/math/tests/test_lazyAssign.cu
View file @
59a8ebc6
...
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include "PerfUtils.h"
#include "TensorCheck.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/TensorAssign.h"

using paddle::BaseMatrix;
using paddle::CpuMatrix;
...
@@ -27,14 +27,28 @@ using autotest::TensorCheckErr;
typedef std::function<void(int height, int width)> testMatrixFunc;

void testMatrixCase(testMatrixFunc matrixFunc) {
  for (auto height : {1}) {
    for (auto width : {1,       32,      64,      128,     512,     1024,
                       4096,    32768,   65536,   131072,  262144,  524288,
                       1048576, 2097152, 4194304, 8388608}) {
      matrixFunc(height, width);
    }
  }
}

template <typename Tensor>
void testLazyAssign(int height, int width) {
  Tensor A1(height, width);
  Tensor A2(height, width);
...
@@ -49,40 +63,39 @@ void testLazyAssign(int height, int width) {
  EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);

  EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C);
                         auto expr2 = A2.lazyAssign(A2 * D);
                         AssignEvaluate(expr1, expr2););

  TensorCheckErr(A1, A2);
}

TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }

#ifndef PADDLE_ONLY_CPU
TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
#endif

template <typename Tensor>
void sgdUpdateTensor(
    Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) {
  C = C * p2 - D * (B + A * p3) * p1;
  A += C;
}

void sgdUpdateLazyAssign(BaseMatrix& A,
                         BaseMatrix& B,
                         BaseMatrix& C,
                         BaseMatrix& D,
                         real p1,
                         real p2,
                         real p3) {
  auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
  auto expr2 = A.lazyAssign(A + C);
  AssignEvaluate(expr1, expr2);
}

template <typename Tensor>
void testSgdUpdate(int height, int width) {
  Tensor A1(height, width);
  Tensor A2(height, width);
...
@@ -113,16 +126,13 @@ void testSgdUpdate(int height, int width) {
   * a = a + c;
   */
  // BaseMatrix API
  EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3););

  // Tensor expression
  EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));

  // lazyAssign
  EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));

  TensorCheckErr(A1, A2);
  TensorCheckErr(A1, A3);
...
@@ -130,12 +140,8 @@ void testSgdUpdate(int height, int width) {
  TensorCheckErr(C1, C3);
}

TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }

#ifndef PADDLE_ONLY_CPU
TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
#endif
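A minimal sketch of the pattern this test benchmarks, using only the CpuMatrix, lazyAssign and AssignEvaluate calls that already appear above (the function name and the sizes are arbitrary, chosen for illustration):

#include "paddle/math/Matrix.h"
#include "paddle/math/TensorAssign.h"

void lazyAssignSketch() {
  const int height = 16, width = 64;  // arbitrary illustration sizes
  paddle::CpuMatrix A(height, width), B(height, width), C(height, width), D(height, width);
  B.randomizeUniform();
  C.randomizeUniform();
  D.randomizeUniform();

  // Eager tensor expressions: each assignment is evaluated on its own.
  A = B + C;
  A = A * D;

  // Lazy form: build both expressions first, then evaluate them together.
  auto expr1 = A.lazyAssign(B + C);
  auto expr2 = A.lazyAssign(A * D);
  AssignEvaluate(expr1, expr2);
}

Judging from the EXPRESSION_PERFORMANCE comparison above, the intent of the lazy form is that AssignEvaluate evaluates the queued expressions together rather than one assignment at a time.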
paddle/math/tests/test_matrixCompare.cpp
View file @
59a8ebc6
...
@@ -1146,7 +1146,7 @@ void testBatch2seqPadding(int batchSize, int inputDim) {
  IVectorPtr cpuSequence;
  generateSequenceStartPositions(batchSize, cpuSequence);
-  for (int i = 0; i < cpuSequence->getSize(); ++i) {
+  for (int i = 0; i < int(cpuSequence->getSize()); ++i) {
    (cpuSequence->getData())[i] += 1;  // so no way that maxSeqLen is 0;
  }
...
paddle/operators/.clang-format
0 → 100644
View file @
59a8ebc6
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
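This is a Google-based clang-format configuration for C++11; running `clang-format -i -style=file <file>` from a directory under paddle/operators picks it up automatically, which is presumably what drives the re-wrapping seen in the test files above.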
paddle/operators/CMakeLists.txt
View file @
59a8ebc6
...
@@ -63,5 +63,6 @@ op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
op_library(fc_op
    SRCS fc_op.cc
    DEPS mul_op rowwise_add_op sigmoid_op softmax_op net_op)
-op_library(recurrent_op SRCS recurrent_op.cc DEPS op_desc tensor op_registry operator net_op)
+op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
+           DEPS op_desc tensor op_registry operator net_op)
cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op)
paddle/operators/add_op.cc
View file @
59a8ebc6
...
@@ -18,10 +18,10 @@ namespace paddle {
namespace operators {

class AddOp : public OperatorWithKernel {
 protected:
  void InferShape(const InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of AddOp must be two");
-    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one");
+    PADDLE_ENFORCE_EQ(ctx.InputSize(), 2);
+    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1);
    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr,
                   "Inputs of AddOp must all be set");
    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr,
...
@@ -33,7 +33,7 @@ protected:
};

class AddOpMaker : public OpProtoAndCheckerMaker {
 public:
  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of add op");
...
@@ -48,7 +48,7 @@ The equation is: Out = X + Y
};

class AddOpGrad : public OperatorWithKernel {
 protected:
  void InferShape(const InferShapeContext &ctx) const override {}
};
...
paddle/operators/add_op.h
View file @
59a8ebc6
...
@@ -20,7 +20,7 @@ namespace operators {
template <typename Place, typename T>
class AddKernel : public OpKernel {
 public:
  void Compute(const ExecutionContext &context) const override {
    auto input0 = context.Input<Tensor>(0);
    auto input1 = context.Input<Tensor>(1);
...
paddle/operators/cross_entropy_op.cc
@@ -18,7 +18,7 @@ namespace paddle {
 namespace operators {

 class OnehotCrossEntropyOp : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputSize() == 2,
                    "Input size of OnehotCrossEntropyOp must be two");
@@ -36,8 +36,19 @@ protected:
   }
 };

+class OnehotCrossEntropyGradientOp : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {
+    auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto X = ctx.Input<Tensor>("X");
+    // TODO(superjom) add enforce here after helper functions ready
+    X_grad->Resize(X->dims());
+  }
+};
+
 class OnehotCrossEntropyOpMaker : public OpProtoAndCheckerMaker {
 public:
   OnehotCrossEntropyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The first input of OnehotCrossEntropyOp");
@@ -54,8 +65,11 @@ OnehotCrossEntropy Operator.
 }  // namespace operators
 }  // namespace paddle
 REGISTER_OP(onehot_cross_entropy,
             ops::OnehotCrossEntropyOp,
             ops::OnehotCrossEntropyOpMaker);
 REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
                        ops::OnehotCrossEntropyOpKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    onehot_cross_entropy_grad,
+    ops::OnehotCrossEntropyGradientOpKernel<ops::CPUPlace, float>);
paddle/operators/cross_entropy_op.h
@@ -18,28 +18,53 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+static const float kCrossEntropyLogThreshold{1e-20};
+
 template <typename Place, typename T>
 class OnehotCrossEntropyOpKernel : public OpKernel {
 public:
-  constexpr T LOG_THRESHOLD() const { return static_cast<T>(1e-20); }
-
   void Compute(const ExecutionContext &ctx) const override {
-    auto X = ctx.Input<Tensor>(0);
-    const T *X_data = X->data<T>();
+    auto X = ctx.Input<Tensor>("X");
+    const T *Xdata = X->data<T>();
     const int *label_data = ctx.Input<Tensor>(1)->data<int>();
-    auto Y = ctx.Output<Tensor>(0);
+    auto Y = ctx.Output<Tensor>("Y");

     Y->mutable_data<T>(ctx.GetPlace());
-    T *Y_data = Y->data<T>();
+    T *Ydata = Y->data<T>();

     int batch_size = X->dims()[0];
     int class_num = X->dims()[1];

     // Y[i] = -log(X[i][j])
     for (int i = 0; i < batch_size; ++i) {
-      Y_data[i] = -std::log(std::max(X_data[i * class_num + label_data[i]],
-                                     LOG_THRESHOLD()));
+      Ydata[i] = -std::log(std::max(Xdata[i * class_num + label_data[i]],
+                                    kCrossEntropyLogThreshold));
     }
   }
 };

+template <typename Place, typename T>
+class OnehotCrossEntropyGradientOpKernel : public OpKernel {
+public:
+  void Compute(const ExecutionContext &ctx) const override {
+    auto X = ctx.Input<Tensor>("X");
+    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto label = ctx.Input<Tensor>("label");
+
+    auto *dXdata = dX->template mutable_data<T>(ctx.GetPlace());
+    auto *dYdata = dY->template data<T>();
+    auto *Xdata = X->template data<T>();
+    auto *label_data = label->data<int>();
+
+    const int batch_size = X->dims()[0];
+    const int class_num = X->dims()[1];
+
+    for (int i = 0; i < batch_size; ++i) {
+      dXdata[i * class_num + label_data[i]] =
+          -dYdata[i] / std::max(Xdata[i * class_num + label_data[i]],
+                                kCrossEntropyLogThreshold);
+    }
+  }
+};
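For readers checking the two kernels above, the relation they implement is the standard one-hot cross entropy; the following is only a worked restatement of the code (notation is ours, not the patch's), where l_i is the label of sample i and eps = kCrossEntropyLogThreshold:

    Y_i = -\log\big(\max(X_{i,\,l_i},\ \varepsilon)\big)

    \frac{\partial L}{\partial X_{i,j}} =
      \begin{cases}
        -\,dY_i \,/\, \max(X_{i,\,l_i},\ \varepsilon) & j = l_i \\
        0 & \text{otherwise}
      \end{cases}

The gradient kernel writes only the labelled column of dX, which matches the second case being zero.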
paddle/operators/fc_op.cc
@@ -18,31 +18,29 @@ namespace paddle {
 namespace operators {

 class FullyConnectedOp : public NetOp {
 public:
   void Init() override {
     AddOp(OpRegistry::CreateOp("mul",
                                {
                                    Input("X"), Input("W"),
                                },
                                {Output("before_act")},
                                {}));
     auto b = Input("b");
     if (b != framework::kEmptyVarName) {
       AddOp(OpRegistry::CreateOp("rowwise_add",
                                  {Output("before_act"), Input("b")},
                                  {Output("before_act")},
                                  {}));
     }

     auto activation = GetAttr<std::string>("activation");
     AddOp(OpRegistry::CreateOp(activation,
                                {Output("before_act")},
                                {Output("Y")},
                                {}));
     CompleteAddOp(false);
   }
 };

 class FullyConnectedOpMaker : public OpProtoAndCheckerMaker {
 public:
   FullyConnectedOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "the input of fc operator");
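FullyConnectedOp above has no kernel of its own; it is a NetOp that chains three existing operators. As a worked restatement (sigma here just stands for whatever operator the "activation" attribute names; the notation is ours, not the patch's):

    \text{before\_act} = X W
    \text{before\_act} \mathrel{{+}{=}} \mathbf{1}\, b^{\top} \quad \text{(only when input "b" is set)}
    Y = \sigma(\text{before\_act})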
paddle/operators/fill_zeros_like_op.cc
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {

 class FillZerosLikeOp : public framework::OperatorWithKernel {
 protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputSize() == 1UL,
                    "Input size of FillZerosLikeOp must be one.");
@@ -36,7 +36,7 @@ protected:
 };

 class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
   FillZerosLikeOpMaker(framework::OpProto *proto,
                        framework::OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
@@ -52,8 +52,7 @@ The output will have the same size with input.
 }  // namespace operators
 }  // namespace paddle

 REGISTER_OP(fill_zeros_like,
             paddle::operators::FillZerosLikeOp,
             paddle::operators::FillZerosLikeOpMaker);
 REGISTER_OP_CPU_KERNEL(
     fill_zeros_like,
paddle/operators/fill_zeros_like_op.h
@@ -22,7 +22,7 @@ namespace operators {
 template <typename Place, typename T>
 class FillZerosLikeKernel : public framework::OpKernel {
 public:
   void Compute(const framework::ExecutionContext &context) const override {
     auto *output = context.Output<framework::Tensor>(0);
     output->mutable_data<T>(context.GetPlace());
paddle/operators/mean_op.cc
@@ -18,7 +18,7 @@ namespace paddle {
 namespace operators {

 class MeanOp : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputSize() == 1, "Input size of AddOp must be one");
     PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one");
@@ -29,7 +29,7 @@ protected:
 };

 class MeanOpMaker : public OpProtoAndCheckerMaker {
 public:
   MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of mean op");
@@ -39,7 +39,7 @@ public:
 };

 class MeanGradOp : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
     ctx.Output<Tensor>("X" + framework::kGradVarSuffix)
         ->Resize(ctx.Input<Tensor>("X")->dims());
paddle/operators/mean_op.h
@@ -20,7 +20,7 @@ namespace operators {
 template <typename Place, typename T>
 class MeanKernel : public OpKernel {
 public:
   void Compute(const ExecutionContext &context) const override {
     auto input = context.Input<Tensor>(0);
     auto output = context.Output<Tensor>(0);
@@ -37,7 +37,7 @@ public:
 template <typename Place, typename T>
 class MeanGradKernel : public OpKernel {
 public:
   void Compute(const ExecutionContext &context) const override {
     auto OG = context.Input<Tensor>("Out" + framework::kGradVarSuffix);
     PADDLE_ENFORCE(framework::product(OG->dims()) == 1,
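The PADDLE_ENFORCE in MeanGradKernel above (the product of OG's dims must be 1) matches the math of the operator: mean reduces X to a single scalar, so its incoming gradient is also a single scalar. As a worked check, not part of the patch:

    \text{Out} = \frac{1}{N}\sum_{i=1}^{N} X_i
    \quad\Longrightarrow\quad
    \frac{\partial \text{Out}}{\partial X_i} = \frac{1}{N},
    \qquad dX_i = \frac{d\text{Out}}{N} \ \text{for every element } i,

where N is the number of elements of X.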
paddle/operators/mul_op.cc
@@ -18,23 +18,27 @@ namespace paddle {
 namespace operators {

 class MulOp : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputSize() == 2, "The mul op must take two inputs");
     auto dim0 = ctx.Input<Tensor>(0)->dims();
     auto dim1 = ctx.Input<Tensor>(1)->dims();
-    PADDLE_ENFORCE(dim0.size() == 2 && dim1.size() == 2,
-                   "The input of mul op must be matrix");
-    PADDLE_ENFORCE(dim0[1] == dim1[0],
-                   "First matrix's width must be equal with second matrix's height.");
-    PADDLE_ENFORCE(ctx.OutputSize() == 1, "The mul op must take one output");
+    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+                      "input X(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("X"));
+    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("Y"));
+    PADDLE_ENFORCE_EQ(dim0[1], dim1[0],
+                      "First matrix's width must be equal with second matrix's height.");
+    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1, "The mul op takes only one output");
     ctx.Output<Tensor>(0)->Resize({dim0[0], dim1[1]});
   }
 };

 class MulOpMaker : public OpProtoAndCheckerMaker {
 public:
   MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The first input of mul op");
@@ -49,7 +53,7 @@ The equation is: Out = X * Y
 };

 class MulOpGrad : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {}
   std::string DebugString() const override {
     LOG(INFO) << "MulGrad";
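The PADDLE_ENFORCE_EQ checks added to MulOp::InferShape above are exactly the shape algebra of a matrix product; restated in our notation (a check of the code, not new behaviour): X must be m×k, Y must be k×n (both rank 2), the inner sizes dim0[1] and dim1[0] must agree, and the output is resized to {dim0[0], dim1[1]}, i.e.

    (m \times k)\,(k \times n) \;\rightarrow\; (m \times n),
    \qquad\text{e.g. } (32 \times 64)\,(64 \times 10) \;\rightarrow\; (32 \times 10).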
paddle/operators/mul_op.h
@@ -21,7 +21,7 @@ namespace operators {
 template <typename Place, typename T>
 class MulKernel : public OpKernel {
 public:
   void Compute(const ExecutionContext &context) const override {
     Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
         {Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
paddle/operators/net_op.h
@@ -40,7 +40,7 @@ namespace operators {
  * it defines.
  */
 class NetOp : public framework::OperatorBase {
 public:
   /**
    * Infer all the operators' input and output variables' shapes, will be called
    * before every mini-batch
@@ -90,7 +90,7 @@ public:
   std::vector<std::shared_ptr<OperatorBase>> ops_;

 private:
   bool add_op_done_{false};

   template <typename T, typename KeyType>
paddle/operators/net_op_test.cc
@@ -12,7 +12,7 @@ static int infer_shape_cnt = 0;
 static int run_cnt = 0;

 class TestOp : public OperatorBase {
 public:
   void InferShape(const framework::Scope& scope) const override {
     ++infer_shape_cnt;
   }
@@ -23,7 +23,7 @@ public:
 };

 class EmptyOp : public OperatorBase {
 public:
   void InferShape(const Scope& scope) const override {}
   void Run(const Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {}
paddle/operators/recurrent_op.cc
@@ -25,214 +25,75 @@
 namespace paddle {
 namespace operators {

-[removed: the rnn:: helper functions SegmentInputs(), ConcatOutputs(),
- LinkMemories() and InitArgument() that were defined inline here; they move,
- with small changes, into the new file paddle/operators/rnn/recurrent_op_utils.cc
- shown below]
-
 void RecurrentAlgorithm::InferShape(const Scope& scope) const {
   seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
                  ->GetMutable<Tensor>()
                  ->dims()[0];
   CreateScopes(scope);
   auto step_scopes = GetStepScopes(scope);
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
                      true /*infer_shape_mode*/);
   InitMemories(step_scopes[0], true /*infer_shape_mode*/);
   Variable* net = scope.FindVar(arg_->step_net);
   PADDLE_ENFORCE(net != nullptr, "failed to get step net");
   for (size_t i = 0; i < seq_len_; i++) {
     if (i > 0) {
       rnn::LinkMemories(step_scopes, arg_->memories, i, -1,
                         true /*infer_shape_mode*/);
     }
     net->GetMutable<NetOp>()->InferShape(*step_scopes[i]);
   }
   rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
                      true /*infer_shape_mode*/);
 }

 void RecurrentAlgorithm::Run(const Scope& scope,
                              const platform::DeviceContext& dev_ctx) const {
   auto step_scopes = GetStepScopes(scope);
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
                      false /*infer_shape_mode*/);
   InitMemories(step_scopes[0], false /*infer_shape_mode*/);
   Variable* net = scope.FindVar(arg_->step_net);
   for (size_t step_id = 0; step_id < seq_len_; step_id++) {
     if (step_id > 0) {
       rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1,
                         false /*infer_shape_mode*/);
     }
     net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx);
   }
   rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
                      false /*infer_shape_mode*/);
 }

 void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
-  // TODO(xxx) Only two scopes are needed for inference, this case will be
+  // TODO(superjom) Only two scopes are needed for inference, this case will be
   // supported later.
-  auto step_scopes =
-      scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>();
+  auto step_scopes_var = scope.FindVar(arg_->step_scopes);
+  PADDLE_ENFORCE(step_scopes_var != nullptr, "");
+  auto step_scopes = step_scopes_var->GetMutable<std::vector<Scope*>>();
+
+  // Now all variables in scope must be created outside of op.
+  auto net_var = scope.FindVar(arg_->step_net);
+  PADDLE_ENFORCE(net_var != nullptr, "no stepnet called %s in scope",
+                 arg_->step_net);
+  auto net_op = net_var->GetMutable<NetOp>();
+  PADDLE_ENFORCE(!net_op->outputs_.empty(), "net_op has no outputs");

   if (seq_len_ > step_scopes->size()) {
     for (size_t i = step_scopes->size(); i < seq_len_; ++i) {
       auto& step_scope = scope.NewScope();
-      // Now all variables in scope must be created outside of op.
-      auto net_op = scope.FindVar(arg_->step_net)->GetMutable<NetOp>();
+      // create step net's temp inputs
       for (auto& input : net_op->inputs_) {
         // the weight are located in parent scope
-        if (!step_scope.FindVar(input)) step_scope.NewVar(input);
+        if (!step_scope.FindVar(input))
+          step_scope.NewVar(input)->GetMutable<Tensor>();
       }
-      for (auto& output : net_op->outputs_) {
+      // create stepnet's outputs
+      for (const auto& output : net_op->outputs_) {
         step_scope.NewVar(output);
       }
       step_scopes->emplace_back(&step_scope);
@@ -245,37 +106,27 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope,
   for (auto& attr : arg_->memories) {
     Tensor* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<Tensor>();
     PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                    "memory [%s]'s boot variable [%s] not exists",
                    attr.var,
                    attr.boot_var);
     Tensor* boot_mem = step_scope->FindVar(attr.boot_var)->GetMutable<Tensor>();
     if (infer_shape_mode) {
       pre_mem->Resize(boot_mem->dims());
+      PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
     } else {
       pre_mem->ShareDataWith<float>(*boot_mem);
     }
   }
 }

 const rnn::ArgumentName RecurrentOp::kArgName{
     "step_net",     "step_scopes",   "inlinks",      "outlinks",
     "inlink_alias", "outlink_alias", "memories",     "pre_memories",
     "boot_memories"};

 const rnn::ArgumentName RecurrentGradientOp::kArgName{
     "step_net",     "step_scopes",   "outlink@grad", "inlink@grad",
     "inlink_alias", "outlink_alias", "memories",     "pre_memories",
     "boot_memories@grad"};

 void RecurrentOp::Init() {
   OperatorBase::Init();
@@ -285,7 +136,7 @@ void RecurrentOp::Init() {
 }

 class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 public:
   RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto,
                                          OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
@@ -316,31 +167,29 @@ public:
 void RecurrentGradientAlgorithm::Run(
     const Scope& scope, const platform::DeviceContext& dev_ctx) const {
   auto step_scopes = GetStepScopes(scope);
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
                      false /*infer_shape_mode*/);
   Variable* net = scope.FindVar(arg_->step_net);
   PADDLE_ENFORCE(net != nullptr, "failed to get step net");
   for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
     if (static_cast<size_t>(step_id) != seq_len_ - 1) {
       rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1,
                         false /*infer_shape_mode*/);
     }
     net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx);
   }
   LinkBootMemoryGradients(step_scopes[0], false);
   rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
                      false /*infer_shape_mode*/);
 }

 void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
     Scope* step_scope, bool infer_shape_mode) const {
   for (auto& attr : arg_->memories) {
     PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr,
                    "memory variable [%s] does not exists", attr.var);
     PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                    "boot variable [%s] does not exists", attr.boot_var);
     Tensor* mem_grad = step_scope->NewVar(attr.var)->GetMutable<Tensor>();
     Tensor* boot_mem_grad =
         step_scope->NewVar(attr.boot_var)->GetMutable<Tensor>();
@@ -357,19 +206,19 @@ void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const {
                  ->GetMutable<Tensor>()
                  ->dims()[0];
   auto step_scopes = GetStepScopes(scope);
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
                      true /*infer_shape_mode*/);
   Variable* net = scope.FindVar(arg_->step_net);
   PADDLE_ENFORCE(net != nullptr, "failed to get step net");
   for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
     if (static_cast<size_t>(step_id) != seq_len_ - 1) {
       rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1,
                         true /*infer_shape_mode*/);
     }
     net->GetMutable<NetOp>()->InferShape(*step_scopes[step_id]);
   }
   rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
                      true /*infer_shape_mode*/);
   LinkBootMemoryGradients(step_scopes[0], true /*infer_shape_mode*/);
 }
@@ -383,6 +232,5 @@ void RecurrentGradientOp::Init() {
 }  // namespace operators
 }  // namespace paddle

 REGISTER_OP(recurrent_op,
             paddle::operators::RecurrentOp,
             paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker);
paddle/operators/recurrent_op.h
@@ -15,82 +15,11 @@
 #pragma once

 #include "paddle/framework/operator.h"
+#include "paddle/operators/rnn/recurrent_op_utils.h"

 namespace paddle {
 namespace operators {

-[removed: the declarations of rnn::MemoryAttr, rnn::Link, rnn::Argument,
- rnn::ArgumentName and of SegmentInputs(), ConcatOutputs(), LinkMemories() and
- InitArgument(); they now live in the new header
- paddle/operators/rnn/recurrent_op_utils.h shown below]
-
 // The sequence format in RecurrentOp is Tensor<seq_len, batch_size, dim> now.
 // TODO(Yan Chunwei):
 // 1. No-padding computing for sequences with indifinite length in one batch.
@@ -100,7 +29,7 @@
 // Refer to: https://arxiv.org/pdf/1502.02367.pdf

 class RecurrentAlgorithm {
 public:
   void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const;
@@ -111,7 +40,7 @@ public:
   */
   void InferShape(const framework::Scope& scope) const;

 protected:
   /*
    * The step scopes will be stored in the father scope as a variable.
    *
@@ -128,7 +57,7 @@ protected:
   void InitMemories(framework::Scope* step_scopes, bool infer_shape_mode) const;

 private:
   std::unique_ptr<rnn::Argument> arg_;
   mutable size_t seq_len_;
 };
@@ -144,7 +73,7 @@ class RecurrentGradientAlgorithm {
  * lot, and the latter is a wrapper acts like an dapter for it to make RNN an
  * operator.
  */
 public:
   void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }

   void Run(const framework::Scope& scope,
@@ -158,20 +87,20 @@ public:
   */
   void InferShape(const framework::Scope& scope) const;

 protected:
   inline const std::vector<framework::Scope*>& GetStepScopes(
       const framework::Scope& scope) const {
     return *scope.FindVar(arg_->step_scopes)
                 ->GetMutable<std::vector<framework::Scope*>>();
   }

 private:
   std::unique_ptr<rnn::Argument> arg_;
   mutable size_t seq_len_;
 };

 class RecurrentOp final : public framework::OperatorBase {
 public:
   void Init() override;

   /**
@@ -188,12 +117,12 @@ public:
   static const rnn::ArgumentName kArgName;

 private:
   RecurrentAlgorithm alg_;
 };

 class RecurrentGradientOp final : public framework::OperatorBase {
 public:
   void Init() override;

   /**
@@ -210,7 +139,7 @@ public:
   static const rnn::ArgumentName kArgName;

 private:
   RecurrentGradientAlgorithm alg_;
 };
paddle/operators/recurrent_op_test.cc
@@ -29,7 +29,7 @@ using framework::make_ddim;
 using framework::DDim;

 class RecurrentOpTest : public ::testing::Test {
 protected:
   virtual void SetUp() override {
     CreateGlobalVariables();
     CreateStepNet();
@@ -174,7 +174,7 @@ TEST_F(RecurrentOpTest, Run) {
 }

 class RecurrentGradientAlgorithmTest : public ::testing::Test {
 protected:
   virtual void SetUp() override {
     CreateGlobalVariables();
     CreateStepScopes();
@@ -277,13 +277,11 @@ protected:
     LOG(INFO) << "create variable step_net";
     Variable* var = scope_.NewVar("step_net");
     auto net = var->GetMutable<NetOp>();
     net->AddOp(OpRegistry::CreateOp("mul",
                                     {"rnn/h_pre", "rnn/w", "rnn/s_grad"},
                                     {"rnn/h_pre_grad", "rnn/w_grad"}, {}));
     net->AddOp(OpRegistry::CreateOp("add_two", {"rnn/h_grad"},
                                     {"rnn/x_grad", "rnn/s_grad"}, {}));
     net->CompleteAddOp();
   }
@@ -297,9 +295,7 @@ protected:
     inlink.internal = "rnn/x";
     auto step_scopes =
         scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
     rnn::SegmentInputs(*step_scopes, std::vector<rnn::Link>{inlink}, 10,
                        true /*infer_shape_mode*/);
   }
@@ -314,8 +310,8 @@ protected:
     auto step_scopes =
         scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
     for (int i = 1; i < 10; ++i) {
       rnn::LinkMemories(*step_scopes, memories, i, -1,
                         true /*infer_shape_mode*/);
     }
   }
@@ -395,3 +391,4 @@ TEST(RecurrentOp, LinkMemories) {
 USE_OP(add_two);
 USE_OP(mul);
+USE_OP_WITHOUT_KERNEL(recurrent_op);
paddle/operators/rnn/recurrent_op_utils.cc (new file, 0 → 100644)

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/rnn/recurrent_op_utils.h"

namespace paddle {
namespace operators {
namespace rnn {

namespace fmw = paddle::framework;

void SegmentInputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& inlinks,
                   const size_t seq_len,
                   bool infer_shape_mode) {
  PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided.");
  for (size_t i = 0; i < inlinks.size(); ++i) {
    auto input_var = step_scopes[0]->FindVar(inlinks[i].external);
    PADDLE_ENFORCE(input_var != nullptr, "input link [%s] is not in scope.",
                   inlinks[i].external);
    Tensor* input = input_var->GetMutable<Tensor>();
    fmw::DDim dims = input->dims();
    PADDLE_ENFORCE(static_cast<size_t>(dims[0]) == seq_len,
                   "all the inlinks must have same length");
    fmw::DDim step_dims = slice_ddim(dims, 1, dims.size());
    for (size_t j = 0; j < seq_len; j++) {
      Tensor* step_input =
          step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable<Tensor>();
      if (!infer_shape_mode) {
        *step_input = input->Slice<float>(j, j + 1);
      }
      step_input->Resize(step_dims);
    }
  }
}

void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& outlinks,
                   const size_t seq_len,
                   bool infer_shape_mode) {
  for (size_t i = 0; i < outlinks.size(); i++) {
    auto output_var = step_scopes[0]->FindVar(outlinks[i].external);
    PADDLE_ENFORCE(output_var != nullptr, "output link [%s] is not in scope.",
                   outlinks[i].external);
    Tensor* output = output_var->GetMutable<Tensor>();
    if (infer_shape_mode) {
      auto step_scope_var = step_scopes[0]->FindVar(outlinks[i].internal);
      PADDLE_ENFORCE(step_scope_var != nullptr, "%s not in scope",
                     outlinks[i].internal);
      fmw::DDim step_dims =
          step_scope_var->template GetMutable<Tensor>()->dims();
      std::vector<int> dims_vec = vectorize(step_dims);
      dims_vec.insert(dims_vec.begin(), seq_len);
      output->Resize(fmw::make_ddim(dims_vec));
    } else {
      output->mutable_data<float>(platform::CPUPlace());
      for (size_t j = 0; j < seq_len; j++) {
        Tensor* step_output =
            step_scopes[j]->FindVar(outlinks[i].internal)->GetMutable<Tensor>();
        // TODO(luotao02) data type and platform::DeviceContext() should set
        // correctly
        (output->Slice<float>(j, j + 1))
            .CopyFrom<float>(*step_output, platform::CPUPlace());
      }
    }
  }
}

void LinkMemories(const std::vector<Scope*>& scopes,
                  const std::vector<rnn::MemoryAttr>& memories,
                  const size_t step_id,
                  const int offset,
                  bool infer_shape_mode) {
  PADDLE_ENFORCE_LT(step_id, scopes.size(),
                    "step [%d] is out of range of step scopes' size [%d]",
                    step_id, scopes.size());
  PADDLE_ENFORCE_GE(static_cast<int>(step_id) + offset, 0,
                    "offset [%d] must be large than -[%d]", offset, step_id);
  PADDLE_ENFORCE_LT(step_id + offset, scopes.size(),
                    "offset [%d] is out of range, it must be less than (%d - %d)",
                    offset, scopes.size(), step_id);
  auto scope = scopes[step_id];
  auto linked_scope = scopes[step_id + offset];
  for (auto& attr : memories) {
    auto mem = scope->FindVar(attr.pre_var)->GetMutable<Tensor>();
    auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<Tensor>();
    if (infer_shape_mode) {
      mem->Resize(linked_mem->dims());
    } else {
      mem->ShareDataWith<float>(*linked_mem);
    }
  }
}

void InitArgument(const ArgumentName& name, Argument* arg,
                  const OperatorBase& op) {
  arg->step_net = op.Input(name.step_net);
  arg->step_scopes = op.Output(name.step_scopes);

  auto inlinks = op.Inputs(name.inlinks);
  auto inlink_alias = op.GetAttr<std::vector<std::string>>(name.inlink_alias);
  PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(),
                 "the size of inlinks and inlink_alias don't match:%d,%d",
                 inlinks.size(), inlink_alias.size());
  for (size_t i = 0; i < inlinks.size(); ++i) {
    rnn::Link link;
    link.external = inlinks[i];
    link.internal = inlink_alias[i];
    (arg->inlinks).push_back(link);
  }

  auto outlinks = op.Outputs(name.outlinks);
  auto outlink_alias = op.GetAttr<std::vector<std::string>>(name.outlink_alias);
  PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(),
                 "the size of outlinks and outlink_alias don't match:%d,%d",
                 outlinks.size(), outlink_alias.size());
  for (size_t i = 0; i < outlinks.size(); ++i) {
    rnn::Link link;
    link.external = outlinks[i];
    link.internal = outlink_alias[i];
    (arg->outlinks).push_back(link);
  }

  auto boot_memories = op.Inputs(name.boot_memories);

  // attributes
  auto memories = op.GetAttr<std::vector<std::string>>(name.memories);
  auto pre_memories = op.GetAttr<std::vector<std::string>>(name.pre_memories);

  PADDLE_ENFORCE(memories.size() == boot_memories.size(),
                 "the size of memories, boot_memories don't match:%d,%d",
                 memories.size(), boot_memories.size());
  PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(),
                 "the size of pre_memories, boot_memories don't match:%d,%d",
                 pre_memories.size(), boot_memories.size());
  PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set");

  for (size_t i = 0; i < memories.size(); ++i) {
    rnn::MemoryAttr mem_attr;
    mem_attr.var = memories[i];
    mem_attr.pre_var = pre_memories[i];
    mem_attr.boot_var = boot_memories[i];
    (arg->memories).push_back(mem_attr);
  }
}

}  // namespace rnn
}  // namespace operators
}  // namespace paddle
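To see what SegmentInputs and ConcatOutputs do to the sequence layout, here is a minimal standalone sketch in plain C++ (no Paddle types; the sizes and the trivial "step net" are made up purely for illustration): it cuts a [seq_len, batch, dim] buffer into per-step blocks, runs a step computation on each block, and stitches the per-step outputs back into one [seq_len, batch, dim] buffer.

#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>

int main() {
  const int seq_len = 3, batch = 2, dim = 4;  // made-up sizes
  std::vector<float> input(seq_len * batch * dim);
  for (size_t i = 0; i < input.size(); ++i) input[i] = static_cast<float>(i);

  // "SegmentInputs": step j sees the contiguous [batch, dim] block of step j.
  std::vector<const float*> step_inputs(seq_len);
  for (int j = 0; j < seq_len; ++j) step_inputs[j] = &input[j * batch * dim];

  // A trivial "step net": copy the step input to the step output.
  std::vector<std::vector<float>> step_outputs(
      seq_len, std::vector<float>(batch * dim));
  for (int j = 0; j < seq_len; ++j)
    std::copy(step_inputs[j], step_inputs[j] + batch * dim,
              step_outputs[j].begin());

  // "ConcatOutputs": stack per-step outputs back into [seq_len, batch, dim].
  std::vector<float> output;
  for (int j = 0; j < seq_len; ++j)
    output.insert(output.end(), step_outputs[j].begin(), step_outputs[j].end());

  assert(output == input);  // identity step net, so the round trip is exact
  std::cout << "concatenated " << output.size() << " elements\n";
  return 0;
}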
paddle/operators/rnn/recurrent_op_utils.h (new file, 0 → 100644)

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0; same header as the .cc file above. */

#pragma once

#include <string>

#include "paddle/framework/operator.h"
#include "paddle/operators/type_alias.h"

namespace paddle {
namespace operators {
namespace rnn {

/**
 * Memory of a RNN (same as the role of `Momory` in PaddlePaddle).
 *
 * Memory attributes cached by this op, dims will be infered from
 * boot memories in father scope. Other attributes are copied from Op's proto
 * attributes.
 */
struct MemoryAttr {
  // name of current state variable
  std::string var;
  // name of previous step's state variable
  std::string pre_var;
  // name of the variables to init this memory (same role of `boot_layer` in
  // PaddlePaddle), which is store in father's scope.
  std::string boot_var;
};

struct Link {
  // input or output links name.
  std::string internal;
  // alias to avoid duplicate keys in scopes.
  std::string external;
};

struct Argument {
  std::string step_net;
  std::string step_scopes;
  std::vector<Link> inlinks;
  std::vector<Link> outlinks;
  std::vector<rnn::MemoryAttr> memories;
};

struct ArgumentName {
  std::string step_net;
  std::string step_scopes;
  std::string inlinks;
  std::string outlinks;
  std::string inlink_alias;   // the alias of inlinks in step net.
  std::string outlink_alias;  // the alias of outlinks in step net.
  std::string memories;       // the memory name
  std::string pre_memories;   // the previous memory name
  std::string boot_memories;  // the boot memory name
};

/**
 * Prepare inputs for each step net.
 */
void SegmentInputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& inlinks,
                   const size_t seq_len,
                   bool infer_shape_mode);

/**
 * Process outputs of step nets and merge to variables.
 */
void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& outlinks,
                   const size_t seq_len,
                   bool infer_shape_mode);

void LinkMemories(const std::vector<Scope*>& step_scopes,
                  const std::vector<MemoryAttr>& memories,
                  const size_t step_id,
                  const int offset,
                  bool infer_shape_mode);

void InitArgument(const ArgumentName& name, Argument* arg,
                  const OperatorBase& op);

}  // namespace rnn
}  // namespace operators
}  // namespace paddle
paddle/operators/rowwise_add_op.cc
@@ -17,7 +17,7 @@ namespace paddle {
 namespace operators {

 class RowWiseAddOp : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputSize() == 2UL,
                    "Two inputs is needed by rowwise add");
@@ -33,7 +33,7 @@ protected:
 };

 class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
 public:
   RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The left input of row-wise add op, must be matrix");
paddle/operators/rowwise_add_op.h
@@ -20,7 +20,7 @@ namespace operators {
 template <typename Place, typename T>
 class RowWiseAddKernel : public OpKernel {
 public:
   void Compute(const ExecutionContext &context) const override {
     auto out = context.Output<Tensor>(0);
     out->mutable_data<T>(context.GetPlace());
paddle/operators/sgd_op.cc

@@ -18,7 +18,7 @@ namespace paddle {
namespace operators {

class SGDOp : public OperatorWithKernel {
 protected:
  void InferShape(const InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of SGDOp must be two");
    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of SGDOp must be one");
...
@@ -32,7 +32,7 @@ protected:
};

class SGDOpMaker : public OpProtoAndCheckerMaker {
 public:
  SGDOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("param", "input parameter");
...
paddle/operators/sgd_op.h

@@ -20,7 +20,7 @@ namespace operators {
template <typename Place, typename T>
class SGDOpKernel : public OpKernel {
 public:
  void Compute(const ExecutionContext &ctx) const override {
    auto param = ctx.Input<Tensor>("param");
    auto grad = ctx.Input<Tensor>("grad");
...
paddle/operators/sigmoid_op.cc

@@ -17,7 +17,7 @@ namespace paddle {
namespace operators {

class SigmoidOp : public OperatorWithKernel {
 protected:
  void InferShape(const InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 1, "Sigmoid Op only have one input");
    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Sigmoid Op only have one output");
...
@@ -26,7 +26,7 @@ protected:
};

class SigmoidOpMaker : public OpProtoAndCheckerMaker {
 public:
  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "sigmoid input");
...
@@ -36,11 +36,9 @@ public:
 };

 class SigmoidOpGrad : public OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {}
-  std::string DebugString() const override {
-    LOG(INFO) << "SigmoidGrad";
-    return "";
+  void InferShape(const InferShapeContext &ctx) const override {
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
   }
 };
...
@@ -51,3 +49,5 @@ REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker);
 REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, ops::SigmoidOpGrad);
 REGISTER_OP_CPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(sigmoid_grad,
+                       ops::SigmoidGradKernel<ops::CPUPlace, float>);
paddle/operators/sigmoid_op.cu

@@ -16,3 +16,5 @@
 #include "paddle/operators/sigmoid_op.h"

 REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(sigmoid_grad,
+                       ops::SigmoidGradKernel<ops::GPUPlace, float>);
paddle/operators/sigmoid_op.h

@@ -21,12 +21,13 @@ namespace operators {
 template <typename Place, typename T>
 class SigmoidKernel : public OpKernel {
  public:
   void Compute(const ExecutionContext &context) const override {
     auto input = context.Input<Tensor>(0);
     auto output = context.Output<Tensor>(0);
     output->mutable_data<T>(context.GetPlace());

+    // The clipping is used in Paddle's raw implenmention
     auto X = EigenVector<T>::Flatten(*input);
     auto Y = EigenVector<T>::Flatten(*output);
     auto place = context.GetEigenDevice<Place>();
...
@@ -34,5 +35,23 @@ public:
     Y.device(place) = 1.0 / (1.0 + (-1.0 * X).exp());
   }
 };
+
+template <typename Place, typename T>
+class SigmoidGradKernel : public OpKernel {
+ public:
+  void Compute(const ExecutionContext &context) const override {
+    auto Y_t = context.Input<Tensor>("Y");
+    auto dY_t = context.Input<Tensor>(framework::GradVarName("Y"));
+    auto dX_t = context.Output<Tensor>(framework::GradVarName("X"));
+
+    dX_t->mutable_data<T>(context.GetPlace());
+
+    auto dX = EigenVector<T>::Flatten(*dX_t);
+    auto Y = EigenVector<T>::Flatten(*Y_t);
+    auto dY = EigenVector<T>::Flatten(*dY_t);
+    dX.device(context.GetEigenDevice<Place>()) = dY * Y * (1. - Y);
+  }
+};

 }  // namespace operators
 }  // namespace paddle
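For reference, the forward and backward expressions in these kernels are the standard sigmoid identities; this is just the math behind the two Eigen expressions above, not new behaviour:

y = \sigma(x) = \frac{1}{1 + e^{-x}}, \qquad
\frac{dy}{dx} = y\,(1 - y), \qquad
\frac{\partial L}{\partial x} = \frac{\partial L}{\partial y}\; y\,(1 - y)

which corresponds to `Y = 1.0 / (1.0 + (-1.0 * X).exp())` in SigmoidKernel and `dX = dY * Y * (1. - Y)` in SigmoidGradKernel.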
paddle/operators/softmax_op.cc

@@ -18,7 +18,7 @@ namespace paddle {
namespace operators {

class SoftmaxOp : public OperatorWithKernel {
 protected:
  void InferShape(const InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 1UL,
                   "Only one input is need for softmax");
...
@@ -31,7 +31,7 @@ protected:
};

class SoftmaxOpMaker : public OpProtoAndCheckerMaker {
 public:
  SoftmaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "input of softmax");
...
@@ -41,7 +41,7 @@ public:
};

class SoftmaxOpGrad : public OperatorWithKernel {
 protected:
  void InferShape(const InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 3UL,
                   "Input of SoftmaxOpGrad should be 3, X, Y, YG");
...
paddle/operators/softmax_op.h

@@ -24,7 +24,7 @@ namespace operators {
template <typename Place, typename T>
class SoftmaxKernel : public OpKernel {
 public:
  void Compute(const ExecutionContext &context) const override {
    auto input = context.Input<Tensor>("X");
    auto output = context.Output<Tensor>("Y");
...
@@ -63,7 +63,7 @@ public:
template <typename Place, typename T>
class SoftmaxGradKernel : public OpKernel {
 public:
  void Compute(const ExecutionContext &context) const override {
    std::shared_ptr<Tensor> scale_ = std::make_shared<Tensor>();
...
paddle/operators/type_alias.h

@@ -26,21 +26,16 @@ using OperatorBase = framework::OperatorBase;
using InferShapeContext = framework::InferShapeContext;
using ExecutionContext = framework::ExecutionContext;
using Variable = framework::Variable;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using Tensor = framework::Tensor;
...
paddle/scripts/docker/build.sh

@@ -39,6 +39,10 @@ Configuring cmake in /paddle/build ...
     -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 ========================================
 EOF
+
+# Disable UNITTEST_USE_VIRTUALENV in docker because
+# docker environment is fully controlled by this script.
+# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
 cmake .. \
     -DCMAKE_BUILD_TYPE=Release \
     -DWITH_DOC=OFF \
...
@@ -52,39 +56,43 @@ cmake .. \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=ON

 cat <<EOF
 ========================================
 Building in /paddle/build ...
    Build unit tests: ${WITH_TESTING:-OFF}
 ========================================
 EOF
 make -j `nproc`
 if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
-    pip uninstall -y py-paddle paddle || true
-    ctest --output-on-failure
-fi
-
-cat <<EOF
-========================================
-Installing ...
-========================================
-EOF
-make install -j `nproc`
-pip install /usr/local/opt/paddle/share/wheels/*.whl
-paddle version
+    cat <<EOF
+    ========================================
+    Running unit tests ...
+    ========================================
+EOF
+    # make install should also be test when unittest
+    make install -j `nproc`
+    pip install /usr/local/opt/paddle/share/wheels/*.whl
+    paddle version
+    ctest --output-on-failure
+fi

 # To build documentation, we need to run cmake again after installing
 # PaddlePaddle. This awkwardness is due to
 # https://github.com/PaddlePaddle/Paddle/issues/1854. It also
 # describes a solution.
-if [[ ${WITH_DOC} == "ON" ]]; then
+if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
     cat <<EOF
     ========================================
     Building documentation ...
     In /paddle/build_doc
     ========================================
 EOF
+    # build documentation need install Paddle before
+    make install -j `nproc`
+    pip install /usr/local/opt/paddle/share/wheels/*.whl
+    paddle version
     mkdir -p /paddle/build_doc
     pushd /paddle/build_doc
     cmake .. \
...
@@ -117,13 +125,22 @@ fi
 # generate deb package for current build
 # FIXME(typhoonzero): should we remove paddle/scripts/deb ?
-cat <<EOF
+if [[ ${WITH_DEB:-OFF} == "ON" ]]; then
+  cat <<EOF
 ========================================
 Generating .deb package ...
 ========================================
 EOF
-cpack -D CPACK_GENERATOR='DEB' -j `nproc` ..
+  set +e
+  cpack -D CPACK_GENERATOR='DEB' -j `nproc` ..
+  err_code=$?
+  if [ ${err_code} -ne 0 ]; then
+    # cat error logs if cpack failed.
+    cat /paddle/build/_CPack_Packages/Linux/DEB/PreinstallOutput.log
+    exit ${err_code}
+  fi
+  set -e
+fi

 cat <<EOF
 ========================================
...
paddle/scripts/run_python_tests.sh (deleted, 100755 → 0)

#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

pushd `dirname $0` > /dev/null
SCRIPTPATH=$PWD
popd > /dev/null

USE_VIRTUALENV_FOR_TEST=$1; shift
PYTHON=$1; shift

if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then
   rm -rf .test_env
   virtualenv .test_env
   unset PYTHONHOME
   unset PYTHONPATH
   source .test_env/bin/activate
   PYTHON=python
fi

$PYTHON -m pip install $SCRIPTPATH/../dist/*.whl

if [ "X${PADDLE_PACKAGE_DIR}" != "X" ]; then
   $PYTHON -m pip install ${PADDLE_PACKAGE_DIR}/*.whl
else
   export PYTHONPATH=$SCRIPTPATH/../../python/
fi

$PYTHON -m pip install ipython==5.3

for fn in "$@"
do
  echo "test $fn"
  $PYTHON $fn
  if [ $? -ne 0 ]; then
    exit 1
  fi
done

if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then
   deactivate
   rm -rf .test_env
fi
paddle/setup.py.in

@@ -22,7 +22,9 @@ setup(name="py_paddle",
       package_data={'py_paddle':['*.py','_swig_paddle.so']},
       install_requires = [
         'nltk>=3.2.2',
-        'numpy>=1.8.0',      # The numpy is required.
+        # We use `numpy.flip` in `test_image.py`.
+        # `numpy.flip` is introduced in `1.12.0`
+        'numpy>=1.12.0',     # The numpy is required.
         'protobuf==${PROTOBUF_VERSION}' # The paddle protobuf version
       ],
       url='http://www.paddlepaddle.org/',
...
paddle/trainer/tests/compare_sparse_data (new file, 0 → 100644)

File added (binary/data file, no diff shown).
paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto → paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data

File moved.
paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist

-./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto
+./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
paddle/trainer/tests/sample_trainer_config_compare_sparse.conf (new file, 0 → 100644)

#edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.

# Note: when making change to this file, please make sure
# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest
# for comparing these two nets can pass (test_CompareTwoNets)

default_initial_std(0.1)
default_device(0)

word_dim = 999
l1 = 0
l2 = 0

model_type("nn")

sparse_update = get_config_arg("sparse_update", bool, False)

TrainData(ProtoData(
    type = "proto_sequence",
    files = ('trainer/tests/train_sparse.list'),
))

Settings(
    algorithm='sgd',
    batch_size=100,
    learning_rate=0.0001,
    learning_rate_decay_a=4e-08,
    learning_rate_decay_b=0.0,
    learning_rate_schedule='poly',
)

wordvec_dim = 32
layer2_dim = 16
layer3_dim = 16
hidden_dim = 32

slot_names = ["qb", "qw", "tb", "tw"]

def ltr_network(network_name,
                word_dim=word_dim,
                wordvec_dim=wordvec_dim,
                layer2_dim=layer2_dim,
                layer3_dim=layer3_dim,
                hidden_dim=hidden_dim,
                slot_names=slot_names,
                l1=l1,
                l2=l2):

    slotnum = len(slot_names)
    for i in xrange(slotnum):
        Inputs(slot_names[i] + network_name)
    for i in xrange(slotnum):
        Layer(
            name = slot_names[i] + network_name,
            type = "data",
            size = word_dim,
            device = -1,
        )
        Layer(
            name = slot_names[i] + "_embedding_" + network_name,
            type = "mixed",
            size = wordvec_dim,
            bias = False,
            device = -1,
            inputs = TableProjection(slot_names[i] + network_name,
                                     parameter_name = "embedding.w0",
                                     decay_rate_l1 = l1,
                                     sparse_remote_update = True,
                                     sparse_update = sparse_update,
                                     ),
        )
        Layer(
            name = slot_names[i] + "_rnn1_" + network_name,
            type = "recurrent",
            active_type = "tanh",
            bias = Bias(initial_std = 0, parameter_name = "rnn1.bias"),
            inputs = Input(slot_names[i] + "_embedding_" + network_name,
                           parameter_name = "rnn1.w0")
        )
        Layer(
            name = slot_names[i] + "_rnnlast_" + network_name,
            type = "seqlastins",
            inputs = [
                slot_names[i] + "_rnn1_" + network_name,
            ],
        )

    Layer(
        name = "layer2_" + network_name,
        type = "fc",
        active_type = "tanh",
        size = layer2_dim,
        bias = Bias(parameter_name = "layer2.bias"),
        inputs = [
            Input(slot_name + "_rnnlast_" + network_name,
                  parameter_name = "_layer2_" + slot_name + ".w",
                  decay_rate = l2,
                  initial_smart = True) for slot_name in slot_names
        ]
    )
    Layer(
        name = "layer3_" + network_name,
        type = "fc",
        active_type = "tanh",
        size = layer3_dim,
        bias = Bias(parameter_name = "layer3.bias"),
        inputs = [
            Input("layer2_" + network_name,
                  parameter_name = "_layer3.w",
                  decay_rate = l2,
                  initial_smart = True),
        ]
    )
    Layer(
        name = "output_" + network_name,
        type = "fc",
        size = 1,
        bias = False,
        inputs = [
            Input("layer3_" + network_name, parameter_name = "_layerO.w"),
        ],
    )


ltr_network("left")
ltr_network("right")
Inputs("label")
Layer(
    name = "label",
    type = "data",
    size = 1,
)
Outputs("cost", "qb_rnnlast_left")
Layer(
    name = "cost",
    type = "rank-cost",
    inputs = ["output_left", "output_right", "label"],
)
浏览文件 @
59a8ebc6
...
@@ -23,7 +23,7 @@ using namespace paddle; // NOLINT
...
@@ -23,7 +23,7 @@ using namespace paddle; // NOLINT
using
namespace
std
;
// NOLINT
using
namespace
std
;
// NOLINT
static
const
string
&
configFile1
=
static
const
string
&
configFile1
=
"trainer/tests/sample_trainer_config_
qb_rnn
.conf"
;
"trainer/tests/sample_trainer_config_
compare_sparse
.conf"
;
DECLARE_bool
(
use_gpu
);
DECLARE_bool
(
use_gpu
);
DECLARE_string
(
config
);
DECLARE_string
(
config
);
...
...
paddle/trainer/tests/train_sparse.list (new file, 0 → 100644)

trainer/tests/compare_sparse_data
proto/DataConfig.proto

@@ -15,14 +15,13 @@ syntax = "proto2";
package paddle;

message FileGroupConf {
  optional uint32 queue_capacity = 1 [default = 1];
  // how many files to load for a load file thread
  optional int32 load_file_count = 2 [default = 1];
  // how many threads to load files
  // Setting to be 5~10 is appropriate when loading files by hadoop vfs
  optional int32 load_thread_num = 3 [default = 1];
};

message DataConfig {
...
@@ -32,26 +31,28 @@ message DataConfig {
  // name of a text file which contains a list of file names at each line
  optional string files = 3;

  optional int32 feat_dim = 4;     // feature dimension of one frame
  repeated int32 slot_dims = 5;    // feature slot dims
  optional int32 context_len = 6;  // max neibour frame numbers
  optional uint64 buffer_capacity = 7;  // the number of samples

  // part of data used in training
  // if not -1, part of train data is used in training
  optional int64 train_sample_num = 8 [default = -1];

  // The number of documents processed once
  optional int32 file_load_num = 9 [default = -1];
  optional bool async_load_data = 12 [default = false];
  /// Note the field number 10, 11 and 13 have been deprecated.
  optional bool for_test = 14 [default = false];  // whether this data is for test
  optional FileGroupConf file_group_conf = 15;
  repeated int32 float_slot_dims = 16;

  /// Note the field number 17, 18 and 19 have been deprecated.

  // a list of values which will be used to create additional one dimensional
  // float
  // values slots. These one dimensional slots can be used as the weight input
  // for cost layers.
  // Currently this is only supported by ProtoDataProvider.
...
@@ -65,21 +66,21 @@ message DataConfig {
  // for MultiDataProvider
  repeated DataConfig sub_data_configs = 24;  // sub dataproviders
  /*
   * the ratio of each sub dataproviders:
   * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100,
   * then each mini-batch is combined by 10 instance from A and 90 instances
   * from B.
   */
  optional int32 data_ratio = 25;

  /*
   * if one of the sub dataproviders is running out of data, then
   * (1) it is "main data", then finish current pass.
   * (2) it is not "main data", then reset it, and try getNextBatch again.
   */
  optional bool is_main_data = 26 [default = true];

  // the usage ratio of instances. Setting to 1.0 means the use of all
  // instances.
  optional double usage_ratio = 27 [default = 1.0];
};
proto/DataFormat.proto

@@ -17,27 +17,32 @@ package paddle;
/*
 If values is not empty and ids is empty, this is a dense vector.
 If values is not empty and ids is not empty, this is a sparse vector. The
 position of each value is specified by ids.
 If values is empty and ids is not empty, this is a sparse vector whose non-zero
 values are 1. The position of each 1 is specified by ids.
*/
message VectorSlot {
  repeated float values = 1 [packed = true];
  repeated uint32 ids = 2 [packed = true];
  /* For multidimensional data, for example "image width height depth" */
  repeated uint32 dims = 3 [packed = true];
  repeated string strs = 4;
};

/*
 SubseqSlot use to record whether VectorSlot or any other slot in future has
 subseq.
 If not all VectorSlot have subseq, we only store the one who has subseq, and
 use *slot_id* to record it.
 One vector_slots has one sequence, and it may have N subseq, thus the number of
 *lens* will be N too.
*/
message SubseqSlot {
  required uint32 slot_id = 1;  // the id of slot who has subseq
  repeated uint32 lens = 2;     // lengths of sub-sequence in the slot
};

message SlotDef {
...
@@ -45,13 +50,14 @@ message SlotDef {
    VECTOR_DENSE = 0;
    VECTOR_SPARSE_NON_VALUE = 1;
    VECTOR_SPARSE_VALUE = 2;
    INDEX = 3;  // This can be used as label, or word id, etc.
    VAR_MDIM_DENSE = 4;
    VAR_MDIM_INDEX = 5;
    STRING = 6;
  }
  required SlotType type = 1;
  required uint32 dim = 2;  // For INDEX slots, this means the maximal index plus 1.
};

message DataHeader {
...
@@ -60,11 +66,11 @@ message DataHeader {
};

message DataSample {
  optional bool is_beginning = 1 [default = true];  // is the beginning of a sequence
  repeated VectorSlot vector_slots = 2;
  repeated uint32 id_slots = 3 [packed = true];
  /* use ids of VectorSlot */
  repeated VectorSlot var_id_slots = 4;
  repeated SubseqSlot subseq_slots = 5;
};
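To make the three VectorSlot encodings described in the comment concrete, here is a small sketch using the C++ classes protoc would generate for this file; the include path and the standard proto2 accessor names (`add_values`, `add_ids`) are assumptions about the generated code, not part of this commit:

#include "DataFormat.pb.h"  // assumed: generated by protoc from proto/DataFormat.proto

int main() {
  // Dense vector: values filled, ids left empty.
  paddle::VectorSlot dense;
  dense.add_values(0.1f);
  dense.add_values(0.2f);
  dense.add_values(0.3f);

  // Sparse vector with explicit values: x[2] = 0.5 and x[7] = 1.25.
  paddle::VectorSlot sparse;
  sparse.add_ids(2);
  sparse.add_values(0.5f);
  sparse.add_ids(7);
  sparse.add_values(1.25f);

  // Sparse 0/1 vector: only the positions of the ones are stored.
  paddle::VectorSlot sparse_binary;
  sparse_binary.add_ids(3);
  sparse_binary.add_ids(9);
  return 0;
}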
proto/ModelConfig.proto

@@ -21,7 +21,6 @@ package paddle;
 * Various structs for the configuration of a neural network
 */

message ExternalConfig {
  repeated string layer_names = 1;
  repeated string input_layer_names = 2;
...
@@ -68,7 +67,7 @@ message ConvConfig {
  required uint32 img_size = 8;

  // caffe mode for output size coherence
  required bool caffe_mode = 9 [default = true];

  // if filter_size_y is set , this convolutional layer will use
  // filters of size filter_size * filter_size_y pixels.
...
@@ -99,7 +98,7 @@ message PoolConfig {
  optional uint32 start = 4;

  // Defines the stride size between successive pooling squares.
  required uint32 stride = 5 [default = 1];

  // The size of output feature map.
  required uint32 output_x = 6;
...
@@ -109,7 +108,7 @@ message PoolConfig {
  // padding = 4, instructs the net to implicitly
  // pad the images with a 4-pixel border of zeros.
  optional uint32 padding = 8 [default = 0];

  // if not set, use size_x
  optional uint32 size_y = 9;
...
@@ -194,9 +193,7 @@ message MaxOutConfig {
  required uint32 groups = 2;
}

message RowConvConfig {
  required uint32 context_length = 1;
}

message SliceConfig {
  required uint32 start = 1;
...
@@ -212,14 +209,14 @@ message ProjectionConfig {
  // For ShiftProjection
  optional int32 context_start = 5;
  optional int32 context_length = 6;
  optional bool trainable_padding = 7 [default = false];

  // For convolution
  optional ConvConfig conv_conf = 8;
  optional int32 num_filters = 9;

  // For IdentityOffsetProjection
  optional uint64 offset = 11 [default = 0];

  // For pool
  optional PoolConfig pool_conf = 12;
...
@@ -236,7 +233,7 @@ message OperatorConfig {
  required uint64 output_size = 4;

  // For DotMulOperator
  optional double dotmul_scale = 5 [default = 1.0];

  // For ConvOperator
  optional ConvConfig conv_conf = 6;
...
@@ -282,8 +279,8 @@ message MultiBoxLossConfig {
  required float neg_overlap = 4;
  required uint32 background_id = 5;
  required uint32 input_num = 6;
  optional uint32 height = 7 [default = 1];
  optional uint32 width = 8 [default = 1];
}

message DetectionOutputConfig {
...
@@ -294,8 +291,8 @@ message DetectionOutputConfig {
  required uint32 input_num = 5;
  required uint32 keep_top_k = 6;
  required float confidence_threshold = 7;
  optional uint32 height = 8 [default = 1];
  optional uint32 width = 9 [default = 1];
}

message ClipConfig {
...
@@ -331,7 +328,7 @@ message LayerConfig {
  required string name = 1;
  required string type = 2;
  optional uint64 size = 3;
  // optional ActivationConfig activation = 4;
  optional string active_type = 4;
  repeated LayerInputConfig inputs = 5;
  optional string bias_parameter_name = 6;
...
@@ -344,7 +341,7 @@ message LayerConfig {
  // (which is how convnets are usually trained). Setting this to
  // false will untie the biases, yielding a separate bias for
  // every location at which the filter is applied.
  optional bool shared_biases = 8 [default = false];

  // Valid values are ones that divide the area of the output
  // grid in this convolutional layer. For example if this layer
...
@@ -362,33 +359,35 @@ message LayerConfig {
  // the gpu device which the Layer's data in.
  // Only used by ParallelNeuralNetork. Ignored otherwise.
  optional int32 device = 12 [default = -1];

  // for recurrent layer. If true, the recurrence runs from the end to the
  // beginning.
  optional bool reversed = 13 [default = false];

  // for lstmemory layer. Different types of nodes have different activation
  // type.
  optional string active_gate_type = 14;
  optional string active_state_type = 15;

  // For NCELayer
  // The number of random negative labels for each sample
  optional int32 num_neg_samples = 16 [default = 10];

  // For NCELayer
  // The distribution for generating the random negative labels.
  // A uniform distribution will be used if not provided
  repeated double neg_sampling_dist = 17 [packed = true];

  // For MaxLayer
  // default: output VALUE of MaxLayer. set this flag to true for output INDEX
  // INDEX will be put in Argument::value as double values.
  optional bool output_max_index = 19 [default = false];

  /// The filed number 20 have been deprecated.

  // For self-normalized estimation
  optional double softmax_selfnorm_alpha = 21 [default = 0.1];

  /// The filed numbers 22 and 23 have been deprecated.
...
@@ -399,14 +398,14 @@ message LayerConfig {
  optional bool norm_by_times = 25;

  // for CostLayers
  optional double coeff = 26 [default = 1.0];

  // for AverageLayer
  // can be set to: 'average', 'sum' or 'squarerootn'
  optional string average_strategy = 27;

  // for error clipping
  optional double error_clipping_threshold = 28 [default = 0.0];

  // for operators used by mixed layer
  repeated OperatorConfig operator_confs = 29;
...
@@ -434,43 +433,44 @@ message LayerConfig {
  optional uint32 beam_size = 39;

  // for seqlastins layer, whether select first instead last
  optional bool select_first = 40 [default = false];

  // for seqlastins layer, AverageLayer, MaxLayer and ExpandLayer
  // can be set to: 'non-seq','seq'
  optional string trans_type = 41 [default = 'non-seq'];

  // to indicate whether selective_fc layer
  // is used in sequence generation or not
  optional bool selective_fc_pass_generation = 42 [default = false];

  // to indicate whether selective_fc layer take its last input to
  // selected several columns and only compute the multiplications
  // between the input matrices and the selected columns of
  // the parameter matrices of this layer.
  // if set false, selective_fc degrades into fc.
  optional bool has_selected_colums = 43 [default = true];

  // this parameter is for speed consideration.
  // if number of the selected columns is less than
  // sample number * selective_fc output size * selective_fc_mull_mull_ratio
  // sparse multiplication is used, otherwise, using full multiplication.
  optional double selective_fc_full_mul_ratio = 44 [default = 0.02];

  // to indicate how many threads selective_fc use to to accelate
  // the plain_mul period
  // leave empty or set to 0 to disable multi-thread accleleration
  optional uint32 selective_fc_parallel_plain_mul_thread_num = 45 [default = 0];

  // for batch normalization layer
  // if set use_global_stats true, will use the loaded mean and variance.
  optional bool use_global_stats = 46;

  // use to compute moving mean and variance.
  optional double moving_average_fraction = 47 [default = 0.9];

  // bias size
  optional uint32 bias_size = 48 [default = 0];

  // this parameter can be used as a user-defined parameter when necessary,
  // without changing the proto file.
...
@@ -485,18 +485,17 @@ message LayerConfig {
  optional uint64 width = 51;

  // blank label used in ctc loss
  optional uint32 blank = 52 [default = 0];

  // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which
  // controls the scope of pooling operation. can be set > 0.
  // leave empty or set to -1 to disable this stride pooling.
  optional int32 seq_pool_stride = 53 [default = -1];

  // for crop layer
  optional int32 axis = 54 [default = 2];
  repeated uint32 offset = 55;
  repeated uint32 shape = 56;
}

message EvaluatorConfig {
...
@@ -512,9 +511,9 @@ message EvaluatorConfig {
  // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator
  // For multi binary labels: true if output > classification_threshold
  optional double classification_threshold = 6 [default = 0.5];

  // The positive label. -1 means average precision and recall
  optional int32 positive_label = 7 [default = -1];

  // load dict from this file
  optional string dict_file = 8;
...
@@ -523,10 +522,10 @@ message EvaluatorConfig {
  optional string result_file = 9;

  // top # results for max id printer
  optional int32 num_results = 10 [default = 1];

  // whether to delimit the sequence in the seq_text_printer
  optional bool delimited = 11 [default = true];

  // Used by ChunkEvaluator
  // chunk of these types are not counted
...
@@ -534,23 +533,23 @@ message EvaluatorConfig {
  // Used by ClassificationErrorEvaluator
  // top # classification error
  optional int32 top_k = 13 [default = 1];

  // Used by DetectionMAPEvaluator
  optional double overlap_threshold = 14 [default = 0.5];

  optional int32 background_id = 15 [default = 0];

  optional bool evaluate_difficult = 16 [default = false];

  optional string ap_type = 17 [default = "11point"];
}

message LinkConfig {
  required string layer_name = 1;
  required string link_name = 2;

  // If true, this link has sub-sequence
  optional bool has_subseq = 3 [default = false];
}

message MemoryConfig {
...
@@ -563,18 +562,18 @@ message MemoryConfig {
  optional uint32 boot_with_const_id = 7;

  // memory is a sequence, initailized by a sequence boot layer
  optional bool is_sequence = 6 [default = false];
}

message GeneratorConfig {
  required uint32 max_num_frames = 1;
  required string eos_layer_name = 2;
  optional int32 num_results_per_sample = 3 [default = 1];

  // for beam search
  optional int32 beam_size = 4 [default = 1];

  optional bool log_prob = 5 [default = true];
}

message SubModelConfig {
...
@@ -584,10 +583,10 @@ message SubModelConfig {
  repeated string output_layer_names = 4;
  repeated string evaluator_names = 5;

  optional bool is_recurrent_layer_group = 6 [default = false];

  // If true, the recurrence runs from the end to the beginning.
  optional bool reversed = 7 [default = false];

  // name and link name of memory
  repeated MemoryConfig memories = 8;
...
@@ -601,14 +600,15 @@ message SubModelConfig {
  optional GeneratorConfig generator = 11;

  // the id of inlink which share info with outlinks, used in recurrent layer
  // group
  optional int32 target_inlinkid = 12;
}

message ModelConfig {
  // type of the model.
  // Currently, "nn", "recurrent_nn" and "recursive_nn" are supported
  required string type = 1 [default = "nn"];

  // layers should be ordered in such a way that the forward propagation
  // can be correctly executed by going from the first layer to the last layer
...
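The comment on `selective_fc_full_mul_ratio` above is easier to read as a formula; per the stated rule, sparse multiplication is chosen exactly when

n_{\text{selected columns}} < n_{\text{samples}} \times d_{\text{selective\_fc output}} \times \text{selective\_fc\_full\_mul\_ratio}

and full multiplication is used otherwise.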
proto/OptimizerConfig.proto
浏览文件 @
59a8ebc6
syntax
=
"proto2"
;
syntax
=
"proto2"
;
option
optimize_for
=
LITE_RUNTIME
;
option
optimize_for
=
LITE_RUNTIME
;
package
paddle
;
package
paddle
;
...
@@ -9,13 +9,11 @@ message SGDConfig {
...
@@ -9,13 +9,11 @@ message SGDConfig {
// momentum: float >= 0. Parameter updates momentum.
// momentum: float >= 0. Parameter updates momentum.
// decay: float >= 0. Learning rate decay over each update.
// decay: float >= 0. Learning rate decay over each update.
// nesterov: boolean. Whether to apply Nesterov momentum.
// nesterov: boolean. Whether to apply Nesterov momentum.
optional
double
momentum
=
21
[
default
=
0.0
];
optional
double
momentum
=
21
[
default
=
0.0
];
optional
double
decay
=
23
[
default
=
0.0
];
optional
double
decay
=
23
[
default
=
0.0
];
optional
bool
nesterov
=
24
[
default
=
false
];
optional
bool
nesterov
=
24
[
default
=
false
];
}
}
message
AdadeltaConfig
{
message
AdadeltaConfig
{
// Adadelta
// Adadelta
// It is recommended to leave it at the default value.
// It is recommended to leave it at the default value.
...
@@ -23,21 +21,23 @@ message AdadeltaConfig {
...
@@ -23,21 +21,23 @@ message AdadeltaConfig {
// epsilon: float >= 0. Fuzz factor.
// epsilon: float >= 0. Fuzz factor.
// decay: float >= 0. Learning rate decay over each update.
// decay: float >= 0. Learning rate decay over each update.
// reference : [Adadelta - an adaptive learning rate
method](http://arxiv.org/abs/1212.5701)
// reference : [Adadelta - an adaptive learning rate
optional
double
rho
=
33
[
default
=
0.90
];
// method](http://arxiv.org/abs/1212.5701)
optional
double
epsilon
=
31
[
default
=
1e-5
];
optional
double
rho
=
33
[
default
=
0.90
];
optional
double
decay
=
32
[
default
=
0.0
];
optional
double
epsilon
=
31
[
default
=
1e-5
];
optional
double
decay
=
32
[
default
=
0.0
];
}
}
message
AdagradConfig
{
message
AdagradConfig
{
// Adagrad
// Adagrad
// epsilon: float >= 0.
// epsilon: float >= 0.
// decay: float >= 0. Learning rate decay over each update.
// decay: float >= 0. Learning rate decay over each update.
// reference : [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
// reference : [Adaptive Subgradient Methods for Online Learning and
optional
double
epsilon
=
41
[
default
=
1e-5
];
// Stochastic
optional
double
decay
=
42
[
default
=
0.0
];
// Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
optional
double
epsilon
=
41
[
default
=
1e-5
];
optional
double
decay
=
42
[
default
=
0.0
];
}
}
message
message AdamConfig {
...
@@ -46,7 +46,8 @@ message AdamConfig {
  // beta_2: float, 0 < beta < 1. Generally close to 1.
  // epsilon: float >= 0. Fuzz factor.
  // decay: float >= 0. Learning rate decay over each update.
- // reference : [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
+ // reference : [Adam - A Method for Stochastic
+ // Optimization](http://arxiv.org/abs/1412.6980v8)
  optional double beta_1 = 41;
  optional double beta_2 = 42;
  optional double epsilon = 43;
...
@@ -55,32 +56,32 @@ message AdamConfig {
message ConstLrConfig {
  // learninRate Policy
  optional double learning_rate = 1 [ default = 1.0 ];
}

message LinearLrConfig {
  // learninRate Policy
  optional double learning_rate = 1 [ default = 1.0 ];
  optional double lr_decay_a = 2;
  optional double lr_decay_b = 3;
}

message TensorProto {
  enum DataType {
    PADDLE_ELEMENT_TYPE_INT32 = 0;
    PADDLE_ELEMENT_TYPE_UINT32 = 1;
    PADDLE_ELEMENT_TYPE_INT64 = 2;
    PADDLE_ELEMENT_TYPE_UINT64 = 3;
    PADDLE_ELEMENT_TYPE_FLOAT32 = 4;
    PADDLE_ELEMENT_TYPE_FLOAT64 = 5;
  }
  optional DataType data_type = 1;
  repeated bytes content = 2;
}

message LrPolicyState {
  // learninRate Policy
  optional double learning_rate = 1 [ default = 1.0 ];
  optional double lr_decay_a = 2;
  optional double lr_decay_b = 3;
}
...
@@ -104,7 +105,6 @@ message AdadeltaOptimizerState {
  optional TensorProto update_delta = 4;
}

message AdagradOptimizerState {
  optional LrPolicyState lr_state = 101;
  optional double num_sample_passed = 104;
...
@@ -124,10 +124,10 @@ message AdamOptimizerState {
message OptimizerConfig {
  enum Optimizer {
    SGD = 1;
    Adadelta = 2;
    Adagrad = 3;
    Adam = 4;
  }
  optional Optimizer optimizer = 1;
  optional SGDConfig sgd = 3;
...
@@ -136,8 +136,8 @@ message OptimizerConfig {
  optional AdamConfig adam = 6;
  enum LrPolicy {
    Const = 0;
    Linear = 1;
  }
  optional LrPolicy lr_policy = 11;
  optional ConstLrConfig const_lr = 12;
...
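The OptimizerConfig schema above can be exercised from Python once the .proto files are compiled. Below is a minimal sketch, assuming the generated module is importable as OptimizerConfig_pb2 (the module name follows protoc's filename convention and is not part of this diff); the hyper-parameter values are illustrative.

# Minimal sketch: select Adam with a constant learning-rate policy.
# OptimizerConfig_pb2 is an assumed module name, not shown in this diff.
import OptimizerConfig_pb2 as opt_pb2

config = opt_pb2.OptimizerConfig()
config.optimizer = opt_pb2.OptimizerConfig.Adam
config.adam.beta_1 = 0.9        # generally close to 1
config.adam.beta_2 = 0.999
config.adam.epsilon = 1e-8      # fuzz factor, >= 0
config.lr_policy = opt_pb2.OptimizerConfig.Const
config.const_lr.learning_rate = 0.01

# Round-trip the message the way a trainer/pserver pair would exchange it.
restored = opt_pb2.OptimizerConfig.FromString(config.SerializeToString())
assert restored.adam.beta_1 == 0.9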
proto/ParameterConfig.proto
...
@@ -27,56 +27,57 @@ enum ParameterInitStrategy {
message ParameterUpdaterHookConfig {
  // hook type such as 'pruning'
  required string type = 1;
  // this represents the ratio of zero element to be set by the Parameter
  optional double sparsity_ratio = 2 [ default = 0.6 ];
}

message ParameterConfig {
  required string name = 1;
  required uint64 size = 2;
  optional double learning_rate = 3 [ default = 1.0 ];
  optional double momentum = 4 [ default = 0.0 ];
  optional double initial_mean = 5 [ default = 0.0 ];
  optional double initial_std = 6 [ default = 0.01 ];
  // use L2-regularization if decay_rate set and decay_rate_l1 not set
  optional double decay_rate = 7 [ default = 0.0 ];
  // use L1-regularization if decay_rate_l1 set
  optional double decay_rate_l1 = 8 [ default = 0.0 ];
  // dims of Parameter, e.g. dims[0] as height, dims[1] as width..
  repeated uint64 dims = 9;
  // the gpu device which the parameter in.
  // Only used by ParallelNeuralNetork. Ignored otherwise.
  optional int32 device = 10 [ default = -1 ];
  // how to init the parameter: 0 -> normal, 1 -> uniform
  // 0: treat initial_mean as mean, intial_std as standard deviation
  // 1: range is (initial_mean - initial_std) to (initial_mean + initial_std)
  optional int32 initial_strategy = 11 [ default = 0 ];
  // define the variance when init the parameter, by height of the Matrix
  optional bool initial_smart = 12 [ default = false ];
  // apply regularization every # batches
  optional int32 num_batches_regularization = 13 [ default = 1 ];
  // if is_sparse is true, para is sparse, else para is dense
  optional bool is_sparse = 14 [ default = false ];
- // if para is sparse, format should be "csc" or "csr", empty means is not sparse
+ // if para is sparse, format should be "csc" or "csr", empty means is not
+ // sparse
  optional string format = 15 [ default = "" ];
  // sparse remote update or not
  optional bool sparse_remote_update = 16 [ default = false ];
  // gradient clipping threshold, no clipping by default
  optional double gradient_clipping_threshold = 17 [ default = 0.0 ];
  // static parameters are fixed when training
  optional bool is_static = 18 [ default = false ];
  // para_id should NOT be set by config_parser. It is for
  // internal use.
  optional uint64 para_id = 19;
  repeated ParameterUpdaterHookConfig update_hooks = 20;
  // setup load mat -> csr
  optional bool need_compact = 21 [ default = false ];
  // whether to do sparse update for this parameter
  optional bool sparse_update = 22 [ default = false ];
  // whether this parameter is shared or not.
  optional bool is_shared = 23 [ default = false ];
  // parameter block size
  optional uint64 parameter_block_size = 24 [ default = 0 ];
}
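A minimal sketch of filling in one dense ParameterConfig from Python, assuming the generated module is importable as ParameterConfig_pb2 (that module name and the parameter name used here are illustrative assumptions, not part of this diff):

# Sketch only: describe one dense parameter with L2 decay and gradient clipping.
import ParameterConfig_pb2 as param_pb2

p = param_pb2.ParameterConfig()
p.name = "fc.w0"                  # hypothetical parameter name
p.size = 32 * 84
p.dims.extend([32, 84])           # dims[0] height, dims[1] width
p.learning_rate = 1.0
p.decay_rate = 1e-4               # L2, because decay_rate_l1 is left unset
p.gradient_clipping_threshold = 5.0
p.initial_strategy = 0            # 0 -> normal(initial_mean, initial_std)
p.initial_std = 0.01

print(p.IsInitialized())          # True: both required fields (name, size) are set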
proto/ParameterServerConfig.proto
...
@@ -15,13 +15,10 @@ syntax = "proto2";
package paddle;

/**
 * Configuration structure for ParameterClient2.
 */
-message ParameterClientConfig {
-  required int32 trainer_id = 1;
-}
+message ParameterClientConfig { required int32 trainer_id = 1; }

/**
 * Configuration structure for ParameterServer2.
...
@@ -30,24 +27,24 @@ message ParameterServerConfig {
  // Number of ports for sending dense parameter,
  // following ports on parameter server will be visited
  // for sending dense parameter: [port, port+ports_num-1]
  required int32 ports_num = 1 [ default = 1 ];
  // Number of ports for sending sparse parameter,
  // following ports on parameter server will be visited
  // for sending sparse parameter:
  // [port+ports_num, port+ports_num+ports_num_for_sparse-1]
  required int32 ports_num_for_sparse = 2 [ default = 0 ];
  // network device name for pservers
  required string nics = 3 [ default = "xgbe0,xgbe1" ];
  required string rdma_tcp = 4 [ default = "tcp" ];
  // Listening port for pserver
  required int32 port = 5 [ default = 20134 ];
  // number of gradient servers
  required int32 num_gradient_servers = 6 [ default = 1 ];
  // number of threads for sync op exec
  required int32 pserver_num_threads = 7 [ default = 1 ];
  // control config_.async_lagged_grad_discard_ratio() min value
  required double async_lagged_ratio_min = 8 [ default = 1.0 ];
  // if async_lagged_grad_discard_ratio is not set in trainer_config.conf
  // use it as defalut value
  required double async_lagged_ratio_default = 9 [ default = 1.5 ];
}
\ No newline at end of file
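The comments above spell out how the port block is laid out on a parameter server. A small Python check of that arithmetic follows; only port's default of 20134 comes from the config, while the counts 2 and 3 are chosen purely for the example.

# Worked example of the port ranges described in the comments above.
port, ports_num, ports_num_for_sparse = 20134, 2, 3

dense_ports = list(range(port, port + ports_num))
sparse_ports = list(range(port + ports_num,
                          port + ports_num + ports_num_for_sparse))

print(dense_ports)   # [20134, 20135]
print(sparse_ports)  # [20136, 20137, 20138]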
proto/ParameterService.proto
...
@@ -23,8 +23,8 @@ package paddle;
 */
enum ParameterUpdateMode {
  // Set parameter
  PSERVER_UPDATE_MODE_SET_PARAM = 0;      // use local param
  PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1; // set zero param
  // Update parameter once a gradient is received
  PSERVER_UPDATE_MODE_ASYNC_SGD = 2;
...
@@ -37,7 +37,7 @@ enum ParameterUpdateMode {
  // No update. Only get parameters back.
  PSERVER_UPDATE_MODE_GET_PARAM = 5;
  PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6; // only get sparse rows
};

message ParameterBlock {
...
@@ -80,42 +80,34 @@ message SendParameterRequest {
  optional int32 trainer_id = 7;
  // send back parameter type on pserver, PARAMETER_VALUE by default
  optional int32 send_back_parameter_type = 8 [ default = 0 ];
  // forwardbackward time in usec
  optional uint64 forwardbackward_time = 9;
}

-message WaitPassStartRequest {
-}
+message WaitPassStartRequest {}

-message WaitPassStartResponse {
-}
+message WaitPassStartResponse {}

-message WaitPassFinishRequest {
-}
+message WaitPassFinishRequest {}

-message WaitPassFinishResponse {
-}
+message WaitPassFinishResponse {}

enum SyncObject {
  SYNC_DEFAULT = 0; // wait for the synchronizeBarrier_
  SYNC_DATA = 1;    // wait for the synchronizeDataBarrier_
}

message SynchronizeRequest {
  required SyncObject sync_object_id = 1 [ default = SYNC_DEFAULT ];
  optional int32 trainer_id = 2;
}

-message SynchronizeResponse {
-}
+message SynchronizeResponse {}

-message SendParameterResponse {
-  repeated ParameterBlock blocks = 1;
-}
+message SendParameterResponse { repeated ParameterBlock blocks = 1; }

message SetConfigRequest {
  repeated ParameterConfig param_configs = 1;
...
@@ -125,26 +117,18 @@ message SetConfigRequest {
  required bool is_sparse_server = 6;
}

-message SetConfigResponse {
-}
+message SetConfigResponse {}

-message GetStatusRequest {
-}
+message GetStatusRequest {}

-message GetStatusResponse {
-  required PServerStatus status = 1;
-}
+message GetStatusResponse { required PServerStatus status = 1; }

-message SetStatusRequest {
-  required PServerStatus status = 1;
-}
+message SetStatusRequest { required PServerStatus status = 1; }

-message SetStatusResponse {
-}
+message SetStatusResponse {}

// create a column vector. The size is the dimension of parameter
-message CreateVectorRequest {
-}
+message CreateVectorRequest {}

message CreateVectorResponse {
  // error message. Empty if success
...
@@ -153,9 +137,7 @@ message CreateVectorResponse {
  required int64 handle = 2;
}

-message ReleaseVectorRequest {
-  required int64 handle = 1;
-}
+message ReleaseVectorRequest { required int64 handle = 1; }

message ReleaseVectorResponse {
  // error message. Empty if success
...
@@ -164,9 +146,7 @@ message ReleaseVectorResponse {
// Create a column major matrix. The number of rows is the dimension
// of parameter. The number of columns is specifed by num_cols
-message CreateMatrixRequest {
-  required int32 num_cols = 1;
-}
+message CreateMatrixRequest { required int32 num_cols = 1; }

message CreateMatrixResponse {
  // error message. Empty if success
...
@@ -175,16 +155,13 @@ message CreateMatrixResponse {
  required int64 handle = 2;
}

-message ReleaseMatrixRequest {
-  required int64 handle = 1;
-}
+message ReleaseMatrixRequest { required int64 handle = 1; }

message ReleaseMatrixResponse {
  // error message. Empty if success
  optional string return_message = 1;
}

/**
 * The operations are defined using the variables commented at Operation
 * and OperationResult
...
@@ -245,36 +222,36 @@ enum MatrixVectorOperation {
message ProtoVector {
  required int64 dim = 1;
  repeated double values = 2 [ packed = true ];
}

message ProtoMatrix {
  required int64 num_rows = 1;
  required int64 num_cols = 2;
  repeated double values = 3 [ packed = true ];
}

message Operation {
  required MatrixVectorOperation operation = 1;
  // vector handles created on the pserver
  repeated int64 pvectors = 2; // u, v, w
  // matrix handles created on the pserver
  repeated int64 pmatrices = 3;      // A, B, C
  repeated double scalars = 4;       // a, b, c
  repeated ProtoVector vectors = 5;  // x, y, z
  repeated ProtoMatrix matrices = 6; // X, Y, Z
}

message OperationResult {
  // error message. Empty if success
  optional string return_message = 1;
  //
  repeated double scalars = 2;       // d, e, f
  repeated ProtoVector vectors = 3;  // p, q, r
  repeated ProtoMatrix matrices = 4; // P, Q, R
}

message DoOperationRequest {
...
@@ -301,18 +278,14 @@ message DoOperationResponse {
  required bool pass_finish = 3;
}

-message LoadValueRequest {
-  required string dir_name = 1;
-}
+message LoadValueRequest { required string dir_name = 1; }

message LoadValueResponse {
  // error message. Empty if success
  optional string return_message = 1;
}

-message SaveValueRequest {
-  required string dir_name = 1;
-}
+message SaveValueRequest { required string dir_name = 1; }

message SaveValueResponse {
  // error message. Empty if success
...
@@ -331,11 +304,11 @@ enum DataUpdateMode {
  // Client send it's own ref label to pserver
  DATA_UPDATE_MODE_SET_REF_LABEL = 4;
  // Client get all ref labels from all pservers
  DATA_UPDATE_MODE_GET_REF_LABEL = 5;
  // Client send it's own ref grad to pserver
  DATA_UPDATE_MODE_SET_REF_GRAD = 6;
  // Client get all ref grad from all pservers
  DATA_UPDATE_MODE_GET_REF_GRAD = 7;
}

enum SendDataType {
...
@@ -360,7 +333,7 @@ message DataBlock {
  // byte size of one data type
  required int32 data_size = 2;
  // data_type
  optional TransDataType data_type = 3 [ default = TRANS_DOUBLE ];
}

message SendDataRequest {
...
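The Operation / OperationResult pair above is a small calculator protocol: handles refer to vectors and matrices already created on the pserver, while literal scalars, vectors, and matrices ride along in the request. A minimal sketch of filling an Operation follows; the generated module name ParameterService_pb2 is an assumption, and the concrete MatrixVectorOperation values are elided from this hunk, so none is named here.

# Sketch only: touches messages whose full definitions appear in the hunk above.
import ParameterService_pb2 as ps_pb2

# An inline vector argument (one of the "x, y, z" slots of Operation.vectors).
vec = ps_pb2.ProtoVector(dim=3, values=[1.0, 2.0, 3.0])

# An inline 2x2 column-major matrix (one of the "X, Y, Z" slots of Operation.matrices).
mat = ps_pb2.ProtoMatrix(num_rows=2, num_cols=2, values=[1.0, 0.0, 0.0, 1.0])

op = ps_pb2.Operation()
op.pvectors.extend([101])        # u: a handle returned in CreateVectorResponse.handle
op.scalars.append(0.5)           # a: a literal scalar operand
op.vectors.add().CopyFrom(vec)   # x
op.matrices.add().CopyFrom(mat)  # X
# Operation.operation (a MatrixVectorOperation value) must still be chosen;
# its members are not shown in this diff, so it is left unset in this sketch.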
proto/TrainerConfig.proto
...
@@ -20,14 +20,14 @@ package paddle;
message OptimizationConfig {
  required int32 batch_size = 3;
  required string algorithm = 4 [ default = "async_sgd" ];
  optional int32 num_batches_per_send_parameter = 5 [ default = 1 ];
  optional int32 num_batches_per_get_parameter = 6 [ default = 1 ];

  required double learning_rate = 7;
  optional double learning_rate_decay_a = 8 [ default = 0 ];
  optional double learning_rate_decay_b = 9 [ default = 0 ];
  optional string learning_rate_schedule = 27 [ default = "constant" ];
  // learning rate will be scaled according to learning_rate_schedule
  // 1), constant:
  // lr = learning_rate
...
@@ -49,88 +49,92 @@ message OptimizationConfig {
  // owlqn related
  // L1-regularization
  optional double l1weight = 10 [ default = 0.1 ];
  // L2-regularization
  optional double l2weight = 11 [ default = 0 ];
  // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step)
  // then accept the step
  optional double c1 = 12 [ default = 0.0001 ];
  // multiply the step with "backoff", when wolfe condition doesn't satisfy
  optional double backoff = 13 [ default = 0.5 ];
  // how many "s"s and "y"s are kept in owlqn
  optional int32 owlqn_steps = 14 [ default = 10 ];
  // accept the step if encountered "max_backoff" times of "reduce the step"
  optional int32 max_backoff = 15 [ default = 5 ];
  // L2-regularization coefficient is reduced linearly from iteration 0 to
  // "l2weight_zero_iter", and set to 0 after "l2weight_zero_iter"
  // iterations. set "l2weight_zero_iter" to 0 to disable this strategy.
  optional int32 l2weight_zero_iter = 17 [ default = 0 ];

  // averaged sgd
  // About average_window * numBatchProcessed parameter are used
  // for average. To be accurate, between average_window * numBatchProcessed
  // and 2 * average_window * numBatchProcessed parameters are used for
  // average.
  optional double average_window = 18 [ default = 0 ];
  optional int64 max_average_window = 19 [ default = 0x7fffffffffffffff ];

  //////////////////////////
  // Options Adaptive SGD //
  //////////////////////////

- // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta", "rmsprop"
- // default learning method("momentum") use global decayed learning rate with momentum.
+ // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta",
+ // "rmsprop"
+ // default learning method("momentum") use global decayed learning rate with
+ // momentum.
  // "adagrad", "adadelta" and "rmsprop" can set momentum too.
  optional string learning_method = 23 [ default = "momentum" ];
  optional double ada_epsilon = 24 [ default = 1e-6 ];
  optional double ada_rou = 26 [ default = 0.95 ];

  // Force to do average in cpu in order to save gpu memory usage
  optional bool do_average_in_cpu = 25 [ default = false ];

  // delta add rate in pserver, used while num_batches_per_send_parameter>1
  // will be divided by #machines automatically.
  optional double delta_add_rate = 28 [ default = 1.0 ];

  // We split a large size into smaller mini-batches, whose sizes are
  // determined by mini_batch_size. It only takes effect when there is
  // an ExternalMachine.
  optional int32 mini_batch_size = 29 [ default = 128 ];

  // automatically set if any one of parameters set sparse remote update flag
  optional bool use_sparse_remote_updater = 30 [ default = false ];

  // how to update center parameter and feedback to local parameter,
  // when use local sgd update in cluster training.
- // A option is elastic_average, proposed by the paper: Deep learning with elastic averaging SGD.
- // If use elastic_average method, every trainer node should sample from whole data sets.
+ // A option is elastic_average, proposed by the paper: Deep learning with
+ // elastic averaging SGD.
+ // If use elastic_average method, every trainer node should sample from whole
+ // data sets.
  optional string center_parameter_update_method = 31 [ default = "average" ];

  // shrink sparse parameter value
  // only works if parameter is remote sparse update and has L1 decay rate
  optional double shrink_parameter_value = 32 [ default = 0 ];

  ////////////////////////////
  // Options Adam Optimizer //
  ////////////////////////////
  optional double adam_beta1 = 33 [ default = 0.9 ];
  optional double adam_beta2 = 34 [ default = 0.999 ];
  optional double adam_epsilon = 35 [ default = 1e-8 ];

  // arguments for learning rate scheduler
  // Format: num1:rate1,num2:rate2,...,numK:rateK
  // For learning_rate_schedule="manual", num is the number of samples,
  // For learning_rate_schedule="pass_manual",
  // num is the number of passes (starting from 0)
  optional string learning_rate_args = 36 [ default = "" ];

  // for async sgd gradient commit control.
  // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed,
  // current async gradient will be discard silently.
  optional double async_lagged_grad_discard_ratio = 37 [ default = 1.5 ];

  // global threshold for gradient clipping
  optional double gradient_clipping_threshold = 38 [ default = 0.0 ];
};

message TrainerConfig {
...
@@ -141,7 +145,7 @@ message TrainerConfig {
  repeated string config_files = 5;

  // the directory to save/load model files for each training path
  optional string save_dir = 6 [ default = "./output/model" ];

  // Path of the initial model parameters.
  // If it was set, start_pass will be ignored.
...
@@ -149,7 +153,7 @@ message TrainerConfig {
  // Start training from this pass.
  // Will load parameter from the previous pass.
  optional int32 start_pass = 8 [ default = 0 ];

  // file path to the trainer config file
  optional string config_file = 9;
...
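The learning_rate_args comment above specifies a "num1:rate1,num2:rate2,..." schedule string. Below is an illustrative Python sketch of parsing such a string and looking up the scale for a given pass under learning_rate_schedule="pass_manual"; the helper functions are made up for the example, and the exact boundary semantics of the real scheduler are not shown in this diff.

# Illustrative only: parse a "num:rate" schedule and apply it to a pass index.
def parse_lr_args(learning_rate_args):
    pairs = [item.split(":") for item in learning_rate_args.split(",") if item]
    return [(int(num), float(rate)) for num, rate in pairs]

def scheduled_rate(schedule, pass_id):
    # Use the rate of the first segment whose pass threshold has not been exceeded.
    for threshold, rate in schedule:
        if pass_id <= threshold:
            return rate
    return schedule[-1][1]

schedule = parse_lr_args("2:1.0,5:0.5,8:0.1")
base_lr = 0.01
print(base_lr * scheduled_rate(schedule, pass_id=0))  # 0.01
print(base_lr * scheduled_rate(schedule, pass_id=6))  # 0.001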
python/paddle/v2/framework/create_op_creation_methods.py
import paddle.v2.framework.core as core
import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
-import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2
+import paddle.v2.framework.proto.attribute_pb2 as attribute_pb2
import cStringIO
...
@@ -57,7 +57,7 @@ class OpDescCreationMethod(object):
            op_desc.attrs.extend([out_format])
        if len(tmp_index) != 0:
            tmp_index_attr = op_desc.attrs.add()
-           tmp_index_attr.type = attr_type_pb2.INTS
+           tmp_index_attr.type = attribute_pb2.INTS
            tmp_index_attr.name = "temporary_index"
            tmp_index_attr.ints.extend(tmp_index)
...
@@ -73,17 +73,17 @@ class OpDescCreationMethod(object):
                new_attr = op_desc.attrs.add()
                new_attr.name = attr.name
                new_attr.type = attr.type
-               if attr.type == attr_type_pb2.INT:
+               if attr.type == attribute_pb2.INT:
                    new_attr.i = user_defined_attr
-               elif attr.type == attr_type_pb2.FLOAT:
+               elif attr.type == attribute_pb2.FLOAT:
                    new_attr.f = user_defined_attr
-               elif attr.type == attr_type_pb2.STRING:
+               elif attr.type == attribute_pb2.STRING:
                    new_attr.s = user_defined_attr
-               elif attr.type == attr_type_pb2.INTS:
+               elif attr.type == attribute_pb2.INTS:
                    new_attr.ints.extend(user_defined_attr)
-               elif attr.type == attr_type_pb2.FLOATS:
+               elif attr.type == attribute_pb2.FLOATS:
                    new_attr.floats.extend(user_defined_attr)
-               elif attr.type == attr_type_pb2.STRINGS:
+               elif attr.type == attribute_pb2.STRINGS:
                    new_attr.strings.extend(user_defined_attr)
                else:
                    raise NotImplementedError("Not support attribute type " +
...
@@ -109,7 +109,7 @@ class OpDescCreationMethod(object):
        retv = []
        if multiple:
            var_format = op_desc_pb2.AttrDesc()
-           var_format.type = attr_type_pb2.INTS
+           var_format.type = attribute_pb2.INTS
            var_format.name = "%s_format" % in_out
            var_format.ints.append(0)
...
@@ -185,17 +185,17 @@ def get_docstring_from_op_proto(op_proto):
    for attr in op_proto.attrs:
        attr_type = None
-       if attr.type == attr_type_pb2.INT:
+       if attr.type == attribute_pb2.INT:
            attr_type = "int"
-       elif attr.type == attr_type_pb2.FLOAT:
+       elif attr.type == attribute_pb2.FLOAT:
            attr_type = "float"
-       elif attr.type == attr_type_pb2.STRING:
+       elif attr.type == attribute_pb2.STRING:
            attr_type = "basestr"
-       elif attr.type == attr_type_pb2.INTS:
+       elif attr.type == attribute_pb2.INTS:
            attr_type = "list of int"
-       elif attr.type == attr_type_pb2.FLOATS:
+       elif attr.type == attribute_pb2.FLOATS:
            attr_type = "list of float"
-       elif attr.type == attr_type_pb2.STRINGS:
+       elif attr.type == attribute_pb2.STRINGS:
            attr_type = "list of basestr"
        if attr_type is None:
...
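The renamed attribute_pb2 constants drive the if/elif dispatch above when user-supplied attribute values are copied into an op description. A minimal sketch of that dispatch, using only names that appear in this file (the "scale" attribute itself is hypothetical):

# Sketch: fill one AttrDesc according to its declared type, mirroring
# OpDescCreationMethod above.
import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
import paddle.v2.framework.proto.attribute_pb2 as attribute_pb2

new_attr = op_desc_pb2.AttrDesc()
new_attr.name = "scale"                 # hypothetical attribute name
new_attr.type = attribute_pb2.FLOAT
if new_attr.type == attribute_pb2.FLOAT:
    new_attr.f = 0.5                    # scalar floats go into the .f slot
elif new_attr.type == attribute_pb2.INTS:
    new_attr.ints.extend([0, 1, 2])     # repeated ints use .extend

print(new_attr)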
python/paddle/v2/framework/tests/CMakeLists.txt
-add_python_test(test_framework
-    test_protobuf.py test_scope.py
-    test_default_scope_funcs.py test_op_creation_methods.py
-    test_net.py test_tensor.py
-    test_fc_op.py test_add_two_op.py
-    test_sgd_op.py test_mul_op.py
-    test_mean_op.py test_sigmoid_op.py
-    test_softmax_op.py test_rowwise_add_op.py
-    test_network.py gradient_checker.py)
+py_test(test_net SRCS test_net.py)
+py_test(test_fc_op SRCS test_fc_op.py)
+py_test(test_scope SRCS test_scope.py)
+py_test(test_tensor SRCS test_tensor.py)
+py_test(test_mul_op SRCS test_mul_op.py)
+py_test(test_network SRCS test_network.py)
+py_test(test_mean_op SRCS test_mean_op.py)
+py_test(test_protobuf SRCS test_protobuf.py)
+py_test(test_add_two_op SRCS test_add_two_op.py)
+py_test(test_sigmoid_op SRCS test_sigmoid_op.py)
+py_test(test_softmax_op SRCS test_softmax_op.py)
+py_test(gradient_checker SRCS gradient_checker.py)
+py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py)
+py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py)
+py_test(test_op_creation_methods SRCS test_op_creation_methods.py)
python/paddle/v2/framework/tests/op_test_util.py
...
@@ -33,23 +33,28 @@ class OpTestMeta(type):
            for place in places:
                for in_name in func.all_input_args:
-                   if hasattr(self, in_name):
+                   if hasattr(self, "inputs") and in_name in self.inputs:
                        kwargs[in_name] = in_name
                        var = scope.new_var(in_name).get_tensor()
-                       arr = getattr(self, in_name)
+                       arr = self.inputs[in_name]
                        var.set_dims(arr.shape)
                        var.set(arr, place)
                    else:
                        kwargs[in_name] = "@EMPTY@"

                for out_name in func.all_output_args:
-                   if hasattr(self, out_name):
-                       kwargs[out_name] = out_name
-                       scope.new_var(out_name).get_tensor()
+                   if not hasattr(self, "outputs"):
+                       raise ValueError(
+                           "The test op must set self.outputs dict.")
+                   if out_name not in self.outputs:
+                       raise ValueError("The %s is not in self.outputs dict." %
+                                        (out_name))
+                   kwargs[out_name] = out_name
+                   scope.new_var(out_name).get_tensor()

                for attr_name in func.all_attr_args:
-                   if hasattr(self, attr_name):
-                       kwargs[attr_name] = getattr(self, attr_name)
+                   if hasattr(self, "attrs") and attr_name in self.attrs:
+                       kwargs[attr_name] = self.attrs[attr_name]

                op = func(**kwargs)
...
@@ -60,11 +65,8 @@ class OpTestMeta(type):
                for out_name in func.all_output_args:
                    actual = numpy.array(scope.find_var(out_name).get_tensor())
-                   expect = getattr(self, out_name)
-                   numpy.isclose(actual, expect)
+                   expect = self.outputs[out_name]
+                   # TODO(qijun) The default decimal is 7, but numpy.dot and eigen.mul
+                   # has some diff, and could not pass unittest. So I set decimal 3 here.
+                   # And I will check this in future.
+                   numpy.testing.assert_almost_equal(actual, expect, decimal=3)

        obj.test_all = test_all
        return obj
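With this change, a test driven by OpTestMeta supplies its data through plain dicts named self.inputs, self.outputs and, optionally, self.attrs. A minimal sketch of the new convention follows; the "scale" op and its arguments are made up for illustration, and the import line mirrors the layout of the other tests in this directory (real examples appear right below, e.g. test_add_two_op.py and test_sgd_op.py).

# Sketch of the new test convention with a hypothetical "scale" op.
import unittest
import numpy as np
from op_test_util import OpTestMeta


class TestScaleOp(unittest.TestCase):
    __metaclass__ = OpTestMeta

    def setUp(self):
        self.type = "scale"                      # hypothetical op name
        self.inputs = {'X': np.random.random((4, 8)).astype("float32")}
        self.attrs = {'scale': 2.0}
        self.outputs = {'Out': self.inputs['X'] * self.attrs['scale']}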
python/paddle/v2/framework/tests/test_add_two_op.py
...
@@ -12,9 +12,11 @@ class TestAddOp(unittest.TestCase):
    def setUp(self):
        self.type = "add_two"
-       self.X = numpy.random.random((102, 105)).astype("float32")
-       self.Y = numpy.random.random((102, 105)).astype("float32")
-       self.Out = self.X + self.Y
+       self.inputs = {
+           'X': numpy.random.random((102, 105)).astype("float32"),
+           'Y': numpy.random.random((102, 105)).astype("float32")
+       }
+       self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}


class TestAddGradOp(unittest.TestCase):
...
python/paddle/v2/framework/tests/test_cross_entropy_op.py
...
@@ -7,16 +7,20 @@ class TestSGD(unittest.TestCase):
    __metaclass__ = OpTestMeta

    def setUp(self):
-       # TODO this unit test is not passed
        self.type = "onehot_cross_entropy"
        batch_size = 100
        class_num = 10
-       self.X = numpy.random.random((batch_size, class_num)).astype("float32")
-       self.label = 5 * numpy.ones(batch_size).astype("int32")
+       X = numpy.random.random((batch_size, class_num)).astype("float32")
+       label = 5 * numpy.ones(batch_size).astype("int32")
+       self.inputs = {'X': X, 'label': label}
        Y = []
        for i in range(0, batch_size):
-           Y.append(-numpy.log(self.X[i][self.label[i]]))
-       self.Y = numpy.array(Y).astype("float32")
+           Y.append(-numpy.log(X[i][label[i]]))
+       self.outputs = {'Y': numpy.array(Y).astype("float32")}
+       # TODO(superjom) add gradient check


if __name__ == "__main__":
    unittest.main()
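The expected output above is the one-hot cross entropy Y[i] = -log(X[i][label[i]]). A tiny numeric check, with values chosen only for illustration:

# Tiny illustration of Y[i] = -log(X[i][label[i]]) as computed in the test above.
import numpy

X = numpy.array([[0.1, 0.2, 0.7]], dtype="float32")  # one sample, 3 classes
label = numpy.array([2], dtype="int32")               # true class index

Y = -numpy.log(X[0][label[0]])
print(Y)  # ~0.3567, i.e. -log(0.7)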
python/paddle/v2/framework/tests/test_mean_op.py
...
@@ -8,8 +8,8 @@ class TestMeanOp(unittest.TestCase):
    def setUp(self):
        self.type = "mean"
-       self.X = np.random.random((32, 784)).astype("float32")
-       self.Out = np.mean(self.X)
+       self.inputs = {'X': np.random.random((32, 784)).astype("float32")}
+       self.outputs = {'Out': np.mean(self.inputs['X'])}


if __name__ == '__main__':
...
python/paddle/v2/framework/tests/test_mul_op.py
...
@@ -8,9 +8,11 @@ class TestMulOp(unittest.TestCase):
    def setUp(self):
        self.type = "mul"
-       self.X = np.random.random((32, 84)).astype("float32")
-       self.Y = np.random.random((84, 100)).astype("float32")
-       self.Out = np.dot(self.X, self.Y)
+       self.inputs = {
+           'X': np.random.random((32, 84)).astype("float32"),
+           'Y': np.random.random((84, 100)).astype("float32")
+       }
+       self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}


if __name__ == '__main__':
...
python/paddle/v2/framework/tests/test_op_creation_methods.py
...
@@ -3,7 +3,7 @@ import paddle.v2.framework.create_op_creation_methods as creation
import paddle.v2.framework.core as core
import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
-import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2
+import paddle.v2.framework.proto.attribute_pb2 as attribute_pb2


class TestGetAllProtos(unittest.TestCase):
...
@@ -76,7 +76,7 @@ class TestOpDescCreationMethod(unittest.TestCase):
        expected1.type = 'fc'
        attr = expected1.attrs.add()
        attr.name = 'input_format'
-       attr.type = attr_type_pb2.INTS
+       attr.type = attribute_pb2.INTS
        attr.ints.extend([0, 1, 2, 3])
        self.assertEqual(expected1, generated1)
...
@@ -88,7 +88,7 @@ class TestOpDescCreationMethod(unittest.TestCase):
        expected2.type = 'fc'
        attr = expected2.attrs.add()
        attr.name = 'input_format'
-       attr.type = attr_type_pb2.INTS
+       attr.type = attribute_pb2.INTS
        attr.ints.extend([0, 3, 6, 7])
        self.assertEqual(expected2, generated2)
...
@@ -105,12 +105,12 @@ class TestOpDescCreationMethod(unittest.TestCase):
            attr.comment = ""
            attr.type = type

-       __add_attr__("int_attr", attr_type_pb2.INT)
-       __add_attr__("float_attr", attr_type_pb2.FLOAT)
-       __add_attr__("string_attr", attr_type_pb2.STRING)
-       __add_attr__("ints_attr", attr_type_pb2.INTS)
-       __add_attr__("floats_attr", attr_type_pb2.FLOATS)
-       __add_attr__("strings_attr", attr_type_pb2.STRINGS)
+       __add_attr__("int_attr", attribute_pb2.INT)
+       __add_attr__("float_attr", attribute_pb2.FLOAT)
+       __add_attr__("string_attr", attribute_pb2.STRING)
+       __add_attr__("ints_attr", attribute_pb2.INTS)
+       __add_attr__("floats_attr", attribute_pb2.FLOATS)
+       __add_attr__("strings_attr", attribute_pb2.STRINGS)

        op.comment = ""
        self.assertTrue(op.IsInitialized())
...
@@ -131,32 +131,32 @@ class TestOpDescCreationMethod(unittest.TestCase):
        expected.inputs.extend(['a'])
        attr = expected.attrs.add()
        attr.name = "int_attr"
-       attr.type = attr_type_pb2.INT
+       attr.type = attribute_pb2.INT
        attr.i = 10

        attr = expected.attrs.add()
        attr.name = "float_attr"
-       attr.type = attr_type_pb2.FLOAT
+       attr.type = attribute_pb2.FLOAT
        attr.f = 3.2

        attr = expected.attrs.add()
        attr.name = "string_attr"
-       attr.type = attr_type_pb2.STRING
+       attr.type = attribute_pb2.STRING
        attr.s = "test_str"

        attr = expected.attrs.add()
        attr.name = "ints_attr"
-       attr.type = attr_type_pb2.INTS
+       attr.type = attribute_pb2.INTS
        attr.ints.extend([0, 1, 2, 3, 4])

        attr = expected.attrs.add()
        attr.name = 'floats_attr'
-       attr.type = attr_type_pb2.FLOATS
+       attr.type = attribute_pb2.FLOATS
        attr.floats.extend([0.2, 3.2, 4.5])

        attr = expected.attrs.add()
        attr.name = 'strings_attr'
-       attr.type = attr_type_pb2.STRINGS
+       attr.type = attribute_pb2.STRINGS
        attr.strings.extend(['a', 'b', 'c'])

        self.assertEqual(expected, generated)
...
@@ -185,7 +185,7 @@ class TestOpDescCreationMethod(unittest.TestCase):
        desc.type = "test"
        attr = desc.attrs.add()
        attr.name = "temporary_index"
-       attr.type = attr_type_pb2.INTS
+       attr.type = attribute_pb2.INTS
        attr.ints.append(2)
        self.assertEqual(generated, desc)
...
@@ -219,7 +219,7 @@ This op is used for unit test, not a real op.
        test_str = op.attrs.add()
        test_str.name = "str_attr"
-       test_str.type = attr_type_pb2.STRING
+       test_str.type = attribute_pb2.STRING
        test_str.comment = "A string attribute for test op"

        actual = creation.get_docstring_from_op_proto(op)
...
python/paddle/v2/framework/tests/test_protobuf.py
-import paddle.v2.framework.proto.op_proto_pb2
-import paddle.v2.framework.proto.attr_type_pb2
+import paddle.v2.framework.proto.op_proto_pb2 as op_proto_lib
+import paddle.v2.framework.proto.attribute_pb2 as attr_type_lib
import unittest


class TestFrameworkProto(unittest.TestCase):
    def test_all(self):
-       op_proto_lib = paddle.v2.framework.proto.op_proto_pb2
-       attr_type_lib = paddle.v2.framework.proto.attr_type_pb2
        op_proto = op_proto_lib.OpProto()
        ipt0 = op_proto.inputs.add()
        ipt0.name = "a"
...
python/paddle/v2/framework/tests/test_recurrent_op.py
+import logging
import paddle.v2.framework.core as core
import unittest
import numpy as np
...
@@ -7,10 +8,9 @@ ops = creation.op_creations
def create_tensor(scope, name, shape):
-   tensor = scope.create_var(name).get_tensor()
+   tensor = scope.new_var(name).get_tensor()
    tensor.set_dims(shape)
-   tensor.alloc_float()
-   tensor.set(np.random.random(shape))
+   tensor.set(np.random.random(shape), core.CPUPlace())
    return tensor
...
@@ -31,40 +31,36 @@ class TestRNN(unittest.TestCase):
    - h
    '''

+   input_dim = 30
+   batch_size = 50
+   weight_dim = 15
+   sent_len = 11
+
    def init(self):
-       input_dim = 30
-       batch_size = 50
-       weight_dim = 15
-
-       self.scope = core.Scope(None)
-
-       # create vars
-       create_tensor(self.scope, "x", [batch_size, input_dim])
-       create_tensor(self.scope, "W", [input_dim, weight_dim])
-       create_tensor(self.scope, "U", [weight_dim, weight_dim])
-       create_tensor(self.scope, "h_boot", [batch_size, weight_dim])
-
-       x_alias = "x@alias"
-       y_alias = "y@alias"
-       memory = "h@alias"
-       prememory = "h@pre"
-       output = "rnn_out"
-       output_alias = "rnn_out@alias"
-
-       # create step net
-       stepnet_var = self.scope.create_var("stepnet")
-       stepnet = stepnet_var.get_net()
-       # stepnet = core.Net.create()
-       x_fc_op = ops.fc(X=x_alias, W="W", Y="Wx")
-       h_fc_op = ops.fc(X=prememory, W="U", Y="Uh")
-       sum_op = ops.add_two(X="Wx", Y="Uh", Out="sum")
-       sig_op = ops.sigmoid(X="sum", Y=memory)
-       stepnet.add_op(x_fc_op)
-       stepnet.add_op(h_fc_op)
-       stepnet.add_op(sum_op)
-       stepnet.add_op(sig_op)
-       stepnet.complete_add_op(True)
+       self.scope = core.Scope()
+
+       self.create_global_variables()
+       self.create_step_net()
+       rnn_op = self.create_rnn_op()
+       ctx = core.DeviceContext.create(core.CPUPlace())
+       print 'infer_shape'
+       rnn_op.infer_shape(self.scope)
+       rnn_op.run(self.scope, ctx)
+
+   def create_global_variables(self):
+       # create inlink
+       create_tensor(self.scope, "x",
+                     [self.sent_len, self.batch_size, self.input_dim])
+       create_tensor(self.scope, "W", [self.input_dim, self.input_dim])
+       create_tensor(self.scope, "U", [self.input_dim, self.input_dim])
+       create_tensor(self.scope, "h_boot", [self.batch_size, self.input_dim])
+       self.scope.new_var("step_scopes")
+       self.scope.new_var("h@alias")
+       self.scope.new_var("h")
+
+   def create_rnn_op(self):
        # create RNNOp
        rnnop = ops.recurrent_op(
            # inputs
...
@@ -72,17 +68,27 @@ class TestRNN(unittest.TestCase):
            boot_memories=["h_boot"],
            step_net="stepnet",
            # outputs
-           outlinks=[output],
+           outlinks=["h"],
            step_scopes="step_scopes",
            # attributes
            inlink_alias=["x@alias"],
-           outlink_alias=[output_alias],
-           pre_memories=[prememory],
-           memories=[memory])
-
-       ctx = core.DeviceContext.cpu_context()
-       rnnop.infer_shape(self.scope)
-       rnnop.run(self.scope, ctx)
+           outlink_alias=["h@alias"],
+           pre_memories=["h@pre"],
+           memories=["h@alias"])
+       return rnnop
+
+   def create_step_net(self):
+       var = self.scope.new_var("stepnet")
+       stepnet = var.get_net()
+
+       x_fc_op = ops.fc(X="x@alias", W="W", Y="Wx")
+       h_fc_op = ops.fc(X="h@pre", W="U", Y="Uh")
+       sum_op = ops.add_two(X="Wx", Y="Uh", Out="sum")
+       sig_op = ops.sigmoid(X="sum", Y="h@alias")
+
+       for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
+           stepnet.add_op(op)
+       stepnet.complete_add_op(True)

    def test_recurrent(self):
        self.init()
...
python/paddle/v2/framework/tests/test_rowwise_add_op.py
...
@@ -8,9 +8,11 @@ class TestRowwiseAddOp(unittest.TestCase):
    def setUp(self):
        self.type = "rowwise_add"
-       self.X = np.random.random((32, 84)).astype("float32")
-       self.b = np.random.random(84).astype("float32")
-       self.Out = np.add(self.X, self.b)
+       self.inputs = {
+           'X': np.random.random((32, 84)).astype("float32"),
+           'b': np.random.random(84).astype("float32")
+       }
+       self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}


if __name__ == '__main__':
...
python/paddle/v2/framework/tests/test_sgd_op.py
...
@@ -8,10 +8,13 @@ class TestSGD(unittest.TestCase):
    def setUp(self):
        self.type = "sgd"
-       self.param = numpy.random.random((102, 105)).astype("float32")
-       self.grad = numpy.random.random((102, 105)).astype("float32")
-       self.learning_rate = 0.1
-       self.param_out = self.param - self.learning_rate * self.grad
+       w = numpy.random.random((102, 105)).astype("float32")
+       g = numpy.random.random((102, 105)).astype("float32")
+       lr = 0.1
+       self.inputs = {'param': w, 'grad': g}
+       self.attrs = {'learning_rate': lr}
+       self.outputs = {'param_out': w - lr * g}


if __name__ == "__main__":
...
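The expected output above is the plain SGD update param_out = param - learning_rate * grad. A scalar-sized check, with numbers chosen only for illustration:

# Tiny illustration of param_out = param - learning_rate * grad.
import numpy

w = numpy.array([0.5, -1.0], dtype="float32")
g = numpy.array([0.2, 0.4], dtype="float32")
lr = 0.1

print(w - lr * g)  # approximately [0.48, -1.04]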
python/paddle/v2/framework/tests/test_sigmoid_op.py
...
@@ -8,9 +8,12 @@ class TestSigmoidOp(unittest.TestCase):
    def setUp(self):
        self.type = "sigmoid"
-       self.X = np.random.random((32, 100)).astype("float32")
-       self.Y = 1 / (1 + np.exp(-self.X))
+       self.inputs = {'X': np.random.random((32, 100)).astype("float32")}
+       self.outputs = {'Y': 1 / (1 + np.exp(-self.inputs['X']))}


+#class TestSigmoidGradOp(unittest.TestCase):
+#TODO(qingqing) add unit test

if __name__ == '__main__':
    unittest.main()
python/paddle/v2/framework/tests/test_softmax_op.py
...
@@ -19,8 +19,10 @@ class TestSoftmaxOp(unittest.TestCase):
    def setUp(self):
        self.type = "softmax"
-       self.X = np.random.random((32, 100)).astype("float32")
-       self.Y = np.apply_along_axis(stable_softmax, 1, self.X)
+       self.inputs = {'X': np.random.random((32, 100)).astype("float32")}
+       self.outputs = {
+           'Y': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
+       }


class TestSoftmaxGradOp(unittest.TestCase):
...
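The helper stable_softmax referenced above is defined earlier in this file, outside the hunk shown. For context, a common formulation of a numerically stable softmax is sketched below; it is given here as an assumption about what such a helper typically does, not as the file's exact definition.

# Sketch of a numerically stable softmax: shifting by the row maximum leaves
# the result unchanged but avoids overflow in exp.
import numpy as np

def stable_softmax(x):
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / np.sum(exps)

row = np.array([1.0, 2.0, 3.0], dtype="float32")
print(stable_softmax(row))  # ~[0.0900, 0.2447, 0.6652], sums to 1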
python/paddle/v2/plot/tests/CMakeLists.txt
if (NOT APPLE)
  # The Mac OS X backend will not be able to function correctly if Python is
  # not installed as a framework.
-  add_python_test(test_ploter test_ploter.py)
+  py_test(test_ploter SRCS test_ploter.py)
endif()
python/paddle/v2/reader/tests/CMakeLists.txt
-add_python_test(reader_tests creator_test.py decorator_test.py)
+py_test(creator_test SRCS creator_test.py)
+py_test(decorator_test SRCS decorator_test.py)
python/paddle/v2/tests/CMakeLists.txt
-add_python_test(test_v2_api test_data_feeder.py test_op.py test_parameters.py
-    test_layer.py test_rnn_layer.py test_topology.py test_image.py)
+py_test(test_op SRCS test_op.py)
+py_test(test_image SRCS test_image.py)
+py_test(test_layer SRCS test_layer.py)
+py_test(test_topology SRCS test_topology.py)
+py_test(test_rnn_layer SRCS test_rnn_layer.py)
+py_test(test_parameters SRCS test_parameters.py)
+py_test(test_data_feeder SRCS test_data_feeder.py)
python/setup.py.in
...
@@ -14,7 +14,7 @@ packages=['paddle',
          'paddle.v2.framework.proto']

setup_requires=["requests",
-               "numpy",
+               "numpy>=1.12",
                "protobuf==3.1",
                "recordio",
                "matplotlib",
...