Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
ef6ea790
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
ef6ea790
编写于
5月 04, 2018
作者:
Y
Yu Yang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Clean and extract blas
上级
d0785ce9
变更
22
显示空白变更内容
内联
并排
Showing
22 changed file
with
398 addition
and
400 deletion
+398
-400
paddle/fluid/operators/bilinear_tensor_product_op.h
paddle/fluid/operators/bilinear_tensor_product_op.h
+1
-1
paddle/fluid/operators/conv_op.h
paddle/fluid/operators/conv_op.h
+1
-1
paddle/fluid/operators/conv_transpose_op.h
paddle/fluid/operators/conv_transpose_op.h
+1
-1
paddle/fluid/operators/gru_unit_op.h
paddle/fluid/operators/gru_unit_op.h
+2
-3
paddle/fluid/operators/layer_norm_op.h
paddle/fluid/operators/layer_norm_op.h
+7
-7
paddle/fluid/operators/lstm_op.h
paddle/fluid/operators/lstm_op.h
+1
-1
paddle/fluid/operators/lstmp_op.h
paddle/fluid/operators/lstmp_op.h
+3
-4
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+2
-1
paddle/fluid/operators/math/blas.cc
paddle/fluid/operators/math/blas.cc
+22
-0
paddle/fluid/operators/math/blas.h
paddle/fluid/operators/math/blas.h
+152
-0
paddle/fluid/operators/math/blas_impl.cu.h
paddle/fluid/operators/math/blas_impl.cu.h
+87
-0
paddle/fluid/operators/math/blas_impl.h
paddle/fluid/operators/math/blas_impl.h
+88
-1
paddle/fluid/operators/math/context_project.h
paddle/fluid/operators/math/context_project.h
+6
-5
paddle/fluid/operators/math/gru_compute.cc
paddle/fluid/operators/math/gru_compute.cc
+1
-1
paddle/fluid/operators/math/gru_compute.cu
paddle/fluid/operators/math/gru_compute.cu
+1
-1
paddle/fluid/operators/math/math_function.cc
paddle/fluid/operators/math/math_function.cc
+0
-127
paddle/fluid/operators/math/math_function.cu
paddle/fluid/operators/math/math_function.cu
+7
-138
paddle/fluid/operators/math/math_function.h
paddle/fluid/operators/math/math_function.h
+0
-93
paddle/fluid/operators/math/math_function_test.cc
paddle/fluid/operators/math/math_function_test.cc
+3
-3
paddle/fluid/operators/math/math_function_test.cu
paddle/fluid/operators/math/math_function_test.cu
+3
-3
paddle/fluid/operators/math/matmul.h
paddle/fluid/operators/math/matmul.h
+8
-7
paddle/fluid/operators/mul_op.h
paddle/fluid/operators/mul_op.h
+2
-2
未找到文件。
paddle/fluid/operators/bilinear_tensor_product_op.h
浏览文件 @
ef6ea790
...
...
@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/
math_function
.h"
#include "paddle/fluid/operators/math/
blas
.h"
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/conv_op.h
浏览文件 @
ef6ea790
...
...
@@ -17,9 +17,9 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/vol2col.h"
namespace
paddle
{
...
...
paddle/fluid/operators/conv_transpose_op.h
浏览文件 @
ef6ea790
...
...
@@ -16,8 +16,8 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/vol2col.h"
namespace
paddle
{
...
...
paddle/fluid/operators/gru_unit_op.h
浏览文件 @
ef6ea790
...
...
@@ -14,11 +14,10 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/blas.h"
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/layer_norm_op.h
浏览文件 @
ef6ea790
...
...
@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
...
...
@@ -46,9 +46,9 @@ class RowwiseMean2D<platform::CUDADeviceContext, T> {
}
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
framework
::
Tensor
*
out
)
{
math
::
gemv
<
platform
::
CUDADeviceContext
,
T
>
(
context
,
false
,
left_
,
right_
,
1.
,
input
.
data
<
T
>
(),
divisor_
.
data
<
T
>
()
,
0.
,
out
->
data
<
T
>
());
math
::
GetBlas
<
platform
::
CUDADeviceContext
,
T
>
(
context
).
GEMV
(
false
,
left_
,
right_
,
1.
,
input
.
data
<
T
>
(),
divisor_
.
data
<
T
>
(),
0.
,
out
->
data
<
T
>
());
}
private:
...
...
@@ -93,9 +93,9 @@ class ColwiseSum2D<platform::CUDADeviceContext, T> {
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
framework
::
Tensor
*
out
)
{
math
::
gemv
<
platform
::
CUDADeviceContext
,
T
>
(
context
,
true
,
left_
,
right_
,
1.
,
input
.
data
<
T
>
(),
divisor_
.
data
<
T
>
()
,
0.
,
out
->
data
<
T
>
());
math
::
GetBlas
<
platform
::
CUDADeviceContext
,
T
>
(
context
).
GEMV
(
true
,
left_
,
right_
,
1.
,
input
.
data
<
T
>
(),
divisor_
.
data
<
T
>
(),
0.
,
out
->
data
<
T
>
());
}
private:
...
...
paddle/fluid/operators/lstm_op.h
浏览文件 @
ef6ea790
...
...
@@ -15,9 +15,9 @@ limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
namespace
paddle
{
...
...
paddle/fluid/operators/lstmp_op.h
浏览文件 @
ef6ea790
...
...
@@ -14,15 +14,14 @@ limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
ef6ea790
...
...
@@ -41,7 +41,8 @@ math_library(depthwise_conv)
math_library
(
gru_compute DEPS activation_functions math_function
)
math_library
(
im2col
)
math_library
(
lstm_compute DEPS activation_functions
)
math_library
(
math_function DEPS cblas
)
cc_library
(
blas SRCS blas.cc DEPS cblas framework_proto
)
math_library
(
math_function DEPS blas
)
math_library
(
maxouting
)
math_library
(
pooling
)
math_library
(
selected_rows_functor DEPS selected_rows math_function
)
...
...
paddle/fluid/operators/math/blas.cc
0 → 100644
浏览文件 @
ef6ea790
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/math/blas.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
// Do nothing. Blas is a header only library.
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/blas.h
0 → 100644
浏览文件 @
ef6ea790
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#ifdef PADDLE_WITH_MKLML
#include <mkl_cblas.h>
#include <mkl_lapacke.h>
#include <mkl_vml_functions.h>
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#include <lapacke.h>
#endif
#ifndef LAPACK_FOUND
extern
"C"
{
#include <cblas.h> // NOLINT
int
LAPACKE_sgetrf
(
int
matrix_layout
,
int
m
,
int
n
,
float
*
a
,
int
lda
,
int
*
ipiv
);
int
LAPACKE_dgetrf
(
int
matrix_layout
,
int
m
,
int
n
,
double
*
a
,
int
lda
,
int
*
ipiv
);
int
LAPACKE_sgetri
(
int
matrix_layout
,
int
n
,
float
*
a
,
int
lda
,
const
int
*
ipiv
);
int
LAPACKE_dgetri
(
int
matrix_layout
,
int
n
,
double
*
a
,
int
lda
,
const
int
*
ipiv
);
}
#endif
namespace
paddle
{
namespace
operators
{
namespace
math
{
template
<
typename
DeviceContext
>
class
Blas
{
public:
explicit
Blas
(
const
DeviceContext
&
context
)
:
context_
(
context
)
{}
template
<
typename
T
>
void
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
;
template
<
typename
T
>
void
GEMM
(
bool
transA
,
bool
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
int
lda
,
const
T
*
B
,
int
ldb
,
T
beta
,
T
*
C
,
int
ldc
)
const
;
template
<
typename
T
>
void
MatMul
(
const
framework
::
Tensor
&
mat_a
,
bool
trans_a
,
const
framework
::
Tensor
&
mat_b
,
bool
trans_b
,
T
alpha
,
framework
::
Tensor
*
mat_out
,
T
beta
)
const
;
template
<
typename
T
>
void
MatMul
(
const
framework
::
Tensor
&
mat_a
,
bool
trans_a
,
const
framework
::
Tensor
&
mat_b
,
bool
trans_b
,
framework
::
Tensor
*
mat_out
)
const
{
MatMul
(
mat_a
,
trans_a
,
mat_b
,
trans_b
,
static_cast
<
T
>
(
1.0
),
mat_out
,
static_cast
<
T
>
(
0.0
));
}
template
<
typename
T
>
void
MatMul
(
const
framework
::
Tensor
&
mat_a
,
const
framework
::
Tensor
&
mat_b
,
framework
::
Tensor
*
mat_out
)
const
{
this
->
template
MatMul
<
T
>(
mat_a
,
false
,
mat_b
,
false
,
mat_out
);
}
template
<
typename
T
>
void
AXPY
(
int
n
,
T
alpha
,
const
T
*
x
,
T
*
y
)
const
;
template
<
typename
T
>
void
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
;
template
<
typename
T
>
void
BatchedGEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
,
int
batchCount
,
int64_t
strideA
,
int64_t
strideB
)
const
;
private:
const
DeviceContext
&
context_
;
};
template
<
typename
DeviceContext
,
typename
T
>
class
BlasT
:
private
Blas
<
DeviceContext
>
{
public:
using
Blas
<
DeviceContext
>::
Blas
;
template
<
typename
...
ARGS
>
void
GEMM
(
ARGS
...
args
)
const
{
Base
()
->
template
GEMM
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
MatMul
(
ARGS
...
args
)
const
{
Base
()
->
template
MatMul
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
AXPY
(
ARGS
...
args
)
const
{
Base
()
->
template
AXPY
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
GEMV
(
ARGS
...
args
)
const
{
Base
()
->
template
GEMV
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
BatchedGEMM
(
ARGS
...
args
)
const
{
Base
()
->
template
BatchedGEMM
<
T
>(
args
...);
}
private:
const
Blas
<
DeviceContext
>*
Base
()
const
{
return
static_cast
<
const
Blas
<
DeviceContext
>*>
(
this
);
}
};
template
<
typename
DeviceContext
,
typename
T
>
inline
BlasT
<
DeviceContext
,
T
>
GetBlas
(
const
framework
::
ExecutionContext
&
exe_ctx
)
{
return
BlasT
<
DeviceContext
,
T
>
(
exe_ctx
.
template
device_context
<
DeviceContext
>());
}
template
<
typename
DeviceContext
,
typename
T
>
inline
BlasT
<
DeviceContext
,
T
>
GetBlas
(
const
DeviceContext
&
dev_ctx
)
{
return
BlasT
<
DeviceContext
,
T
>
(
dev_ctx
);
}
}
// namespace math
}
// namespace operators
}
// namespace paddle
#include "paddle/fluid/operators/math/blas_impl.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/math/blas_impl.cu.h"
#endif
paddle/fluid/operators/math/blas_impl.cu.h
浏览文件 @
ef6ea790
...
...
@@ -30,6 +30,25 @@ struct CUBlas<float> {
static
void
GEMM
(
ARGS
...
args
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemm
(
args
...));
}
template
<
typename
...
ARGS
>
static
void
AXPY
(
ARGS
...
args
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSaxpy
(
args
...));
}
template
<
typename
...
ARGS
>
static
void
GEMV
(
ARGS
...
args
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemv
(
args
...));
}
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemmStridedBatched
(
args
...));
#else
PADDLE_THROW
(
"SgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
};
template
<
>
...
...
@@ -38,6 +57,25 @@ struct CUBlas<double> {
static
void
GEMM
(
ARGS
...
args
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDgemm
(
args
...));
}
template
<
typename
...
ARGS
>
static
void
AXPY
(
ARGS
...
args
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDaxpy
(
args
...));
}
template
<
typename
...
ARGS
>
static
void
GEMV
(
ARGS
...
args
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDgemv
(
args
...));
}
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDgemmStridedBatched
(
args
...));
#else
PADDLE_THROW
(
"DgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
};
template
<
>
...
...
@@ -57,6 +95,15 @@ struct CUBlas<platform::float16> {
reinterpret_cast
<
const
__half
*>
(
beta
),
reinterpret_cast
<
__half
*>
(
C
),
ldc
));
}
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasHgemmStridedBatched
(
args
...));
#else
PADDLE_THROW
(
"HgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
};
template
<
>
...
...
@@ -144,6 +191,46 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CUDADeviceContext
>::
AXPY
(
int
n
,
T
alpha
,
const
T
*
x
,
T
*
y
)
const
{
CUBlas
<
T
>::
AXPY
(
context_
.
cublas_handle
(),
n
,
&
alpha
,
x
,
1
,
y
,
1
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CUDADeviceContext
>::
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
{
cublasOperation_t
cuTransA
=
!
trans_a
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
CUBlas
<
T
>::
GEMV
(
context_
.
cublas_handle
(),
cuTransA
,
N
,
M
,
&
alpha
,
A
,
N
,
B
,
1
,
&
beta
,
C
,
1
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CUDADeviceContext
>::
BatchedGEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
,
int
batchCount
,
int64_t
strideA
,
int64_t
strideB
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
cublasOperation_t
cuTransA
=
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
const
int64_t
strideC
=
M
*
N
;
CUBlas
<
T
>::
GEMM_BATCH
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
strideB
,
A
,
lda
,
strideA
,
&
beta
,
C
,
ldc
,
strideC
,
batchCount
);
}
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/blas_impl.h
浏览文件 @
ef6ea790
...
...
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
...
...
@@ -28,6 +28,23 @@ struct CBlas<float> {
static
void
GEMM
(
ARGS
...
args
)
{
cblas_sgemm
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
AXPY
(
ARGS
...
args
)
{
cblas_saxpy
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMV
(
ARGS
...
args
)
{
cblas_sgemv
(
args
...);
}
#ifdef PADDLE_WITH_MKLML
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
cblas_sgemm_batch
(
args
...);
}
#endif
};
template
<
>
...
...
@@ -36,11 +53,33 @@ struct CBlas<double> {
static
void
GEMM
(
ARGS
...
args
)
{
cblas_dgemm
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
AXPY
(
ARGS
...
args
)
{
cblas_daxpy
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMV
(
ARGS
...
args
)
{
cblas_dgemv
(
args
...);
}
#ifdef PADDLE_WITH_MKLML
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
cblas_dgemm_batch
(
args
...);
}
#endif
};
template
<
>
struct
CBlas
<
platform
::
float16
>
{
static
void
GEMM
(...)
{
PADDLE_THROW
(
"float16 GEMM not supported on CPU"
);
}
#ifdef PADDLE_WITH_MKLML
static
void
GEMM_BATCH
(...)
{
PADDLE_THROW
(
"float16 GEMM_BATCH not supported on CPU"
);
}
#endif
};
template
<
>
...
...
@@ -93,6 +132,54 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a, bool trans_a,
beta
,
mat_out
->
data
<
T
>
());
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
AXPY
(
int
n
,
T
alpha
,
const
T
*
x
,
T
*
y
)
const
{
CBlas
<
T
>::
AXPY
(
n
,
alpha
,
x
,
1
,
y
,
1
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
{
CBLAS_TRANSPOSE
transA
=
!
trans_a
?
CblasNoTrans
:
CblasTrans
;
CBlas
<
T
>::
GEMV
(
CblasRowMajor
,
transA
,
M
,
N
,
alpha
,
A
,
N
,
B
,
1
,
beta
,
C
,
1
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
BatchedGEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
,
int
batchCount
,
int64_t
strideA
,
int64_t
strideB
)
const
{
#ifdef PADDLE_WITH_MKLML
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
auto
a_array
=
std
::
vector
<
const
T
*>
(
batchCount
);
auto
b_array
=
std
::
vector
<
const
T
*>
(
batchCount
);
auto
c_array
=
std
::
vector
<
T
*>
(
batchCount
);
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
a_array
[
k
]
=
&
A
[
k
*
strideA
];
b_array
[
k
]
=
&
B
[
k
*
strideB
];
c_array
[
k
]
=
&
C
[
k
*
M
*
N
];
}
CBlas
<
T
>::
GEMM_BATCH
(
CblasRowMajor
,
&
transA
,
&
transB
,
&
M
,
&
N
,
&
K
,
&
alpha
,
a_array
.
data
(),
&
lda
,
b_array
.
data
(),
&
ldb
,
&
beta
,
c_array
.
data
(),
&
ldc
,
1
/* group_count */
,
&
batchCount
);
#else
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
const
float
*
Ak
=
&
A
[
k
*
strideA
];
const
float
*
Bk
=
&
B
[
k
*
strideB
];
float
*
Ck
=
&
C
[
k
*
M
*
N
];
this
->
template
GEMM
<
T
>(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
Ak
,
Bk
,
beta
,
Ck
);
}
#endif
}
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/context_project.h
浏览文件 @
ef6ea790
...
...
@@ -17,8 +17,8 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -211,6 +211,7 @@ class ContextProjectGradFunctor {
int
input_row_begin
,
input_row_end
;
int
sequence_height
,
sequence_width
;
sequence_width
=
in
.
dims
()[
1
];
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
context
);
if
(
input_grad
)
{
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
lod_level_0
.
size
())
-
1
;
++
i
)
{
...
...
@@ -262,8 +263,8 @@ class ContextProjectGradFunctor {
Tensor
out_t_sub
=
out_t
.
Slice
(
k
*
context_length
,
k
*
context_length
+
padding_size
);
Tensor
w_sub
=
padding_data
->
Slice
(
k
,
k
+
padding_size
);
axpy
<
DeviceContext
,
T
>
(
context
,
w_sub
.
numel
(),
static_cast
<
T
>
(
1
),
out_t_sub
.
data
<
T
>
(),
w_sub
.
data
<
T
>
());
blas
.
AXPY
(
w_sub
.
numel
(),
static_cast
<
T
>
(
1
),
out_t_sub
.
data
<
T
>
(
),
w_sub
.
data
<
T
>
());
}
}
if
(
down_pad
>
0
)
{
...
...
@@ -294,8 +295,8 @@ class ContextProjectGradFunctor {
(
down_pad_begin_row
+
t
)
*
context_length
);
Tensor
w_sub
=
padding_data
->
Slice
(
up_pad
+
padding_idx
,
up_pad
+
padding_idx
+
padding_size
);
axpy
<
DeviceContext
,
T
>
(
context
,
w_sub
.
numel
(),
static_cast
<
T
>
(
1
),
out_t_sub
.
data
<
T
>
(),
w_sub
.
data
<
T
>
());
blas
.
AXPY
(
w_sub
.
numel
(),
static_cast
<
T
>
(
1
),
out_t_sub
.
data
<
T
>
(
),
w_sub
.
data
<
T
>
());
}
}
out_t
.
Resize
({
sequence_height
,
context_length
*
sequence_width
});
...
...
paddle/fluid/operators/math/gru_compute.cc
浏览文件 @
ef6ea790
...
...
@@ -10,9 +10,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/math/gru_compute.cu
浏览文件 @
ef6ea790
...
...
@@ -10,10 +10,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/platform/device_context.h>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/math/math_function.cc
浏览文件 @
ef6ea790
...
...
@@ -24,133 +24,6 @@ namespace math {
using
float16
=
paddle
::
platform
::
float16
;
template
<
>
void
batched_gemm
<
platform
::
CPUDeviceContext
,
float16
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
float16
alpha
,
const
float16
*
A
,
const
float16
*
B
,
const
float16
beta
,
float16
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
PADDLE_THROW
(
"float16 batched_gemm not supported on CPU"
);
}
#ifdef PADDLE_WITH_MKLML
// Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize.
template
<
>
void
batched_gemm
<
platform
::
CPUDeviceContext
,
float
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
float
alpha
,
const
float
*
A
,
const
float
*
B
,
const
float
beta
,
float
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
auto
a_array
=
std
::
vector
<
const
float
*>
(
batchCount
);
auto
b_array
=
std
::
vector
<
const
float
*>
(
batchCount
);
auto
c_array
=
std
::
vector
<
float
*>
(
batchCount
);
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
a_array
[
k
]
=
&
A
[
k
*
strideA
];
b_array
[
k
]
=
&
B
[
k
*
strideB
];
c_array
[
k
]
=
&
C
[
k
*
M
*
N
];
}
cblas_sgemm_batch
(
CblasRowMajor
,
&
transA
,
&
transB
,
&
M
,
&
N
,
&
K
,
&
alpha
,
a_array
.
data
(),
&
lda
,
b_array
.
data
(),
&
ldb
,
&
beta
,
c_array
.
data
(),
&
ldc
,
1
/* group_count */
,
&
batchCount
);
}
template
<
>
void
batched_gemm
<
platform
::
CPUDeviceContext
,
double
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
double
alpha
,
const
double
*
A
,
const
double
*
B
,
const
double
beta
,
double
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
auto
a_array
=
std
::
vector
<
const
double
*>
(
batchCount
);
auto
b_array
=
std
::
vector
<
const
double
*>
(
batchCount
);
auto
c_array
=
std
::
vector
<
double
*>
(
batchCount
);
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
a_array
[
k
]
=
&
A
[
k
*
strideA
];
b_array
[
k
]
=
&
B
[
k
*
strideB
];
c_array
[
k
]
=
&
C
[
k
*
M
*
N
];
}
cblas_dgemm_batch
(
CblasRowMajor
,
&
transA
,
&
transB
,
&
M
,
&
N
,
&
K
,
&
alpha
,
a_array
.
data
(),
&
lda
,
b_array
.
data
(),
&
ldb
,
&
beta
,
c_array
.
data
(),
&
ldc
,
1
/* group_count */
,
&
batchCount
);
}
#else
// The below is a naive but correct serial implementation that just loops
// over the batch dimension. This is a fallback for when the batched gemm
// functions of Intel MKL are not available. In the future, this computation
// should be parallelized.
template
<
>
void
batched_gemm
<
platform
::
CPUDeviceContext
,
float
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
float
alpha
,
const
float
*
A
,
const
float
*
B
,
const
float
beta
,
float
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
const
float
*
Ak
=
&
A
[
k
*
strideA
];
const
float
*
Bk
=
&
B
[
k
*
strideB
];
float
*
Ck
=
&
C
[
k
*
M
*
N
];
Blas
<
platform
::
CPUDeviceContext
>
(
context
).
GEMM
(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
Ak
,
Bk
,
beta
,
Ck
);
}
}
template
<
>
void
batched_gemm
<
platform
::
CPUDeviceContext
,
double
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
double
alpha
,
const
double
*
A
,
const
double
*
B
,
const
double
beta
,
double
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
const
double
*
Ak
=
&
A
[
k
*
strideA
];
const
double
*
Bk
=
&
B
[
k
*
strideB
];
double
*
Ck
=
&
C
[
k
*
M
*
N
];
Blas
<
platform
::
CPUDeviceContext
>
(
context
).
GEMM
(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
Ak
,
Bk
,
beta
,
Ck
);
}
}
#endif
template
<
>
void
gemv
<
platform
::
CPUDeviceContext
,
float
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
bool
trans_a
,
const
int
M
,
const
int
N
,
const
float
alpha
,
const
float
*
A
,
const
float
*
B
,
const
float
beta
,
float
*
C
)
{
CBLAS_TRANSPOSE
transA
=
(
trans_a
==
false
)
?
CblasNoTrans
:
CblasTrans
;
cblas_sgemv
(
CblasRowMajor
,
transA
,
M
,
N
,
alpha
,
A
,
N
,
B
,
1
,
beta
,
C
,
1
);
}
template
<
>
void
gemv
<
platform
::
CPUDeviceContext
,
double
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
bool
trans_a
,
const
int
M
,
const
int
N
,
const
double
alpha
,
const
double
*
A
,
const
double
*
B
,
const
double
beta
,
double
*
C
)
{
CBLAS_TRANSPOSE
transA
=
(
trans_a
==
false
)
?
CblasNoTrans
:
CblasTrans
;
cblas_dgemv
(
CblasRowMajor
,
transA
,
M
,
N
,
alpha
,
A
,
N
,
B
,
1
,
beta
,
C
,
1
);
}
template
<
>
void
axpy
<
platform
::
CPUDeviceContext
,
float
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
int
n
,
const
float
alpha
,
const
float
*
x
,
float
*
y
)
{
cblas_saxpy
(
n
,
alpha
,
x
,
1
,
y
,
1
);
}
template
<
>
void
axpy
<
platform
::
CPUDeviceContext
,
double
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
int
n
,
const
double
alpha
,
const
double
*
x
,
double
*
y
)
{
cblas_daxpy
(
n
,
alpha
,
x
,
1
,
y
,
1
);
}
template
struct
SetConstant
<
platform
::
CPUDeviceContext
,
platform
::
float16
>;
template
struct
SetConstant
<
platform
::
CPUDeviceContext
,
float
>;
template
struct
SetConstant
<
platform
::
CPUDeviceContext
,
double
>;
...
...
paddle/fluid/operators/math/math_function.cu
浏览文件 @
ef6ea790
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h"
...
...
@@ -25,136 +26,6 @@ namespace math {
using
float16
=
paddle
::
platform
::
float16
;
template
<
>
void
batched_gemm
<
platform
::
CUDADeviceContext
,
float16
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
float16
alpha
,
const
float16
*
A
,
const
float16
*
B
,
const
float16
beta
,
float16
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
cublasOperation_t
cuTransA
=
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
const
int64_t
strideC
=
M
*
N
;
const
half
h_alpha
=
static_cast
<
const
half
>
(
alpha
);
const
half
h_beta
=
static_cast
<
const
half
>
(
beta
);
const
half
*
h_A
=
reinterpret_cast
<
const
half
*>
(
A
);
const
half
*
h_B
=
reinterpret_cast
<
const
half
*>
(
B
);
half
*
h_C
=
reinterpret_cast
<
half
*>
(
C
);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE
(
context
.
GetComputeCapability
(),
53
,
"cublas Hgemm requires GPU compute capability >= 53"
);
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasHgemmStridedBatched
(
context
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
h_B
,
ldb
,
strideB
,
h_A
,
lda
,
strideA
,
&
h_beta
,
h_C
,
ldc
,
strideC
,
batchCount
));
#else
PADDLE_ENFORCE
(
false
,
"HgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
template
<
>
void
batched_gemm
<
platform
::
CUDADeviceContext
,
float
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
float
alpha
,
const
float
*
A
,
const
float
*
B
,
const
float
beta
,
float
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
cublasOperation_t
cuTransA
=
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
const
int64_t
strideC
=
M
*
N
;
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemmStridedBatched
(
context
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
strideB
,
A
,
lda
,
strideA
,
&
beta
,
C
,
ldc
,
strideC
,
batchCount
));
#else
PADDLE_ENFORCE
(
false
,
"SgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
template
<
>
void
batched_gemm
<
platform
::
CUDADeviceContext
,
double
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
double
alpha
,
const
double
*
A
,
const
double
*
B
,
const
double
beta
,
double
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
cublasOperation_t
cuTransA
=
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
const
int64_t
strideC
=
M
*
N
;
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDgemmStridedBatched
(
context
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
strideB
,
A
,
lda
,
strideA
,
&
beta
,
C
,
ldc
,
strideC
,
batchCount
));
#else
PADDLE_ENFORCE
(
false
,
"DgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
template
<
>
void
gemv
<
platform
::
CUDADeviceContext
,
float
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
bool
trans_a
,
const
int
M
,
const
int
N
,
const
float
alpha
,
const
float
*
A
,
const
float
*
B
,
const
float
beta
,
float
*
C
)
{
cublasOperation_t
cuTransA
=
(
trans_a
==
false
)
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemv
(
context
.
cublas_handle
(),
cuTransA
,
N
,
M
,
&
alpha
,
A
,
N
,
B
,
1
,
&
beta
,
C
,
1
));
}
template
<
>
void
gemv
<
platform
::
CUDADeviceContext
,
double
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
bool
trans_a
,
const
int
M
,
const
int
N
,
const
double
alpha
,
const
double
*
A
,
const
double
*
B
,
const
double
beta
,
double
*
C
)
{
cublasOperation_t
cuTransA
=
(
trans_a
==
false
)
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDgemv
(
context
.
cublas_handle
(),
cuTransA
,
N
,
M
,
&
alpha
,
A
,
N
,
B
,
1
,
&
beta
,
C
,
1
));
}
template
<
>
void
axpy
<
platform
::
CUDADeviceContext
,
float
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
int
n
,
const
float
alpha
,
const
float
*
x
,
float
*
y
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSaxpy
(
context
.
cublas_handle
(),
n
,
&
alpha
,
x
,
1
,
y
,
1
));
}
template
<
>
void
axpy
<
platform
::
CUDADeviceContext
,
double
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
int
n
,
const
double
alpha
,
const
double
*
x
,
double
*
y
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDaxpy
(
context
.
cublas_handle
(),
n
,
&
alpha
,
x
,
1
,
y
,
1
));
}
template
struct
SetConstant
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
struct
SetConstant
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SetConstant
<
platform
::
CUDADeviceContext
,
double
>;
...
...
@@ -246,10 +117,9 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
one
.
mutable_data
<
double
>
({
in_dims
[
0
]},
context
.
GetPlace
());
SetConstant
<
platform
::
CUDADeviceContext
,
double
>
set
;
set
(
context
,
&
one
,
static_cast
<
double
>
(
1.0
));
gemv
<
platform
::
CUDADeviceContext
,
double
>
(
context
,
true
,
static_cast
<
int
>
(
in_dims
[
0
]),
static_cast
<
int
>
(
in_dims
[
1
]),
1.0
,
input
.
data
<
double
>
(),
one
.
data
<
double
>
(),
0.0
,
vector
->
data
<
double
>
());
GetBlas
<
platform
::
CUDADeviceContext
,
double
>
(
context
).
GEMV
(
true
,
static_cast
<
int
>
(
in_dims
[
0
]),
static_cast
<
int
>
(
in_dims
[
1
]),
1.0
,
input
.
data
<
double
>
(),
one
.
data
<
double
>
(),
0.0
,
vector
->
data
<
double
>
());
}
template
struct
RowwiseSum
<
platform
::
CUDADeviceContext
,
float
>;
...
...
@@ -268,10 +138,9 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
one
.
mutable_data
<
double
>
({
size
},
context
.
GetPlace
());
SetConstant
<
platform
::
CUDADeviceContext
,
double
>
set
;
set
(
context
,
&
one
,
static_cast
<
double
>
(
1.0
));
gemv
<
platform
::
CUDADeviceContext
,
double
>
(
context
,
true
,
static_cast
<
int
>
(
in_dims
[
1
]),
static_cast
<
int
>
(
in_dims
[
0
]),
1.0
,
one
.
data
<
double
>
(),
input
.
data
<
double
>
(),
0.0
,
vector
->
data
<
double
>
());
GetBlas
<
platform
::
CUDADeviceContext
,
double
>
(
context
).
GEMV
(
true
,
static_cast
<
int
>
(
in_dims
[
1
]),
static_cast
<
int
>
(
in_dims
[
0
]),
1.0
,
one
.
data
<
double
>
(),
input
.
data
<
double
>
(),
0.0
,
vector
->
data
<
double
>
());
}
template
struct
RowwiseMean
<
platform
::
CUDADeviceContext
,
float
>;
...
...
paddle/fluid/operators/math/math_function.h
浏览文件 @
ef6ea790
...
...
@@ -51,94 +51,6 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
namespace
paddle
{
namespace
operators
{
namespace
math
{
// Support continuous memory now
// If transA = N, and transB = N
// Then matrixA: M * K, matrixB: K * N, matrixC : M * N
// For more detailed info, please refer to
// http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
template
<
typename
DeviceContext
>
class
Blas
{
public:
explicit
Blas
(
const
DeviceContext
&
context
)
:
context_
(
context
)
{}
template
<
typename
T
>
void
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
;
template
<
typename
T
>
void
GEMM
(
bool
transA
,
bool
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
int
lda
,
const
T
*
B
,
int
ldb
,
T
beta
,
T
*
C
,
int
ldc
)
const
;
template
<
typename
T
>
void
MatMul
(
const
framework
::
Tensor
&
mat_a
,
bool
trans_a
,
const
framework
::
Tensor
&
mat_b
,
bool
trans_b
,
T
alpha
,
framework
::
Tensor
*
mat_out
,
T
beta
)
const
;
template
<
typename
T
>
void
MatMul
(
const
framework
::
Tensor
&
mat_a
,
bool
trans_a
,
const
framework
::
Tensor
&
mat_b
,
bool
trans_b
,
framework
::
Tensor
*
mat_out
)
const
{
MatMul
(
mat_a
,
trans_a
,
mat_b
,
trans_b
,
static_cast
<
T
>
(
1.0
),
mat_out
,
static_cast
<
T
>
(
0.0
));
}
template
<
typename
T
>
void
MatMul
(
const
framework
::
Tensor
&
mat_a
,
const
framework
::
Tensor
&
mat_b
,
framework
::
Tensor
*
mat_out
)
const
{
this
->
template
MatMul
<
T
>(
mat_a
,
false
,
mat_b
,
false
,
mat_out
);
}
private:
const
DeviceContext
&
context_
;
};
template
<
typename
DeviceContext
,
typename
T
>
class
BlasT
:
private
Blas
<
DeviceContext
>
{
public:
using
Blas
<
DeviceContext
>::
Blas
;
template
<
typename
...
ARGS
>
void
GEMM
(
ARGS
...
args
)
const
{
static_cast
<
const
Blas
<
DeviceContext
>*>
(
this
)
->
template
GEMM
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
MatMul
(
ARGS
...
args
)
const
{
static_cast
<
const
Blas
<
DeviceContext
>*>
(
this
)
->
template
MatMul
<
T
>(
args
...);
}
};
template
<
typename
DeviceContext
,
typename
T
>
inline
BlasT
<
DeviceContext
,
T
>
GetBlas
(
const
framework
::
ExecutionContext
&
exe_ctx
)
{
return
BlasT
<
DeviceContext
,
T
>
(
exe_ctx
.
template
device_context
<
DeviceContext
>());
}
template
<
typename
DeviceContext
,
typename
T
>
inline
BlasT
<
DeviceContext
,
T
>
GetBlas
(
const
DeviceContext
&
dev_ctx
)
{
return
BlasT
<
DeviceContext
,
T
>
(
dev_ctx
);
}
// Batched gemm
template
<
typename
DeviceContext
,
typename
T
>
void
batched_gemm
(
const
DeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
T
alpha
,
const
T
*
A
,
const
T
*
B
,
const
T
beta
,
T
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
);
template
<
typename
DeviceContext
,
typename
T
>
void
gemv
(
const
DeviceContext
&
context
,
const
bool
trans_a
,
const
int
M
,
const
int
N
,
const
T
alpha
,
const
T
*
A
,
const
T
*
B
,
const
T
beta
,
T
*
C
);
template
<
typename
DeviceContext
,
typename
T
>
void
axpy
(
const
DeviceContext
&
context
,
const
int
n
,
const
T
alpha
,
const
T
*
x
,
T
*
y
);
template
<
typename
DeviceContext
,
typename
T
,
int
Rank
>
struct
Transpose
{
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
&
in
,
...
...
@@ -185,8 +97,3 @@ struct RowwiseMean {
}
// namespace math
}
// namespace operators
}
// namespace paddle
#include "paddle/fluid/operators/math/blas_impl.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/math/blas_impl.cu.h"
#endif
paddle/fluid/operators/math/math_function_test.cc
浏览文件 @
ef6ea790
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/operators/math/math_function.h"
#include "gtest/gtest.h"
#include "paddle/fluid/operators/math/blas.h"
template
<
typename
T
>
inline
paddle
::
operators
::
math
::
BlasT
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
...
...
@@ -129,9 +130,8 @@ void GemvTest(int m, int n, bool trans) {
}
paddle
::
platform
::
CPUDeviceContext
context
(
*
cpu_place
);
paddle
::
operators
::
math
::
gemv
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
(
context
,
trans
,
static_cast
<
int
>
(
m
),
static_cast
<
int
>
(
n
),
1.
,
data_a
,
data_b
,
0.
,
data_c
);
GetBlas
<
T
>
(
context
).
GEMV
(
trans
,
static_cast
<
int
>
(
m
),
static_cast
<
int
>
(
n
),
1.
,
data_a
,
data_b
,
0.
,
data_c
);
if
(
!
trans
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
...
...
paddle/fluid/operators/math/math_function_test.cu
浏览文件 @
ef6ea790
...
...
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
...
...
@@ -434,9 +435,8 @@ void GemvTest(int m, int n, bool trans) {
paddle
::
framework
::
TensorCopySync
(
mat_a
,
gpu_place
,
&
g_mat_a
);
paddle
::
framework
::
TensorCopySync
(
vec_b
,
gpu_place
,
&
g_vec_b
);
paddle
::
operators
::
math
::
gemv
<
paddle
::
platform
::
CUDADeviceContext
,
T
>
(
context
,
trans
,
static_cast
<
int
>
(
m
),
static_cast
<
int
>
(
n
),
1.
,
g_data_a
,
g_data_b
,
0.
,
g_data_c
);
GetBlas
<
T
>
(
context
).
GEMV
(
trans
,
static_cast
<
int
>
(
m
),
static_cast
<
int
>
(
n
),
1.
,
g_data_a
,
g_data_b
,
0.
,
g_data_c
);
paddle
::
framework
::
TensorCopySync
(
g_vec_c
,
cpu_place
,
&
vec_c
);
...
...
paddle/fluid/operators/math/matmul.h
浏览文件 @
ef6ea790
...
...
@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/operators/math/
math_function
.h"
#include "paddle/fluid/operators/math/
blas
.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -129,16 +129,17 @@ class MatMulFunctor {
CBLAS_TRANSPOSE
transA
=
(
trans_a
==
false
)
?
CblasNoTrans
:
CblasTrans
;
CBLAS_TRANSPOSE
transB
=
(
trans_b
==
false
)
?
CblasNoTrans
:
CblasTrans
;
auto
blas
=
GetBlas
<
DeviceContext
,
T
>
(
context
);
if
(
!
batchCount
)
{
// regular matrix multiplication
Blas
<
DeviceContext
>
(
context
).
GEMM
(
transA
,
transB
,
M
,
N
,
kA
,
alpha
,
a
.
data
<
T
>
(),
b
.
data
<
T
>
(),
beta
,
blas
.
GEMM
(
transA
,
transB
,
M
,
N
,
kA
,
alpha
,
a
.
data
<
T
>
(),
b
.
data
<
T
>
(),
beta
,
out
->
data
<
T
>
());
}
else
{
// batched matrix multiplication
b
atched_gemm
<
DeviceContext
,
T
>
(
context
,
transA
,
transB
,
M
,
N
,
kA
,
alpha
,
a
.
data
<
T
>
(),
b
.
data
<
T
>
()
,
beta
,
out
->
data
<
T
>
(),
batchCount
,
strideA
,
strideB
);
b
las
.
BatchedGEMM
(
transA
,
transB
,
M
,
N
,
kA
,
alpha
,
a
.
data
<
T
>
(),
b
.
data
<
T
>
(),
beta
,
out
->
data
<
T
>
(),
batchCount
,
strideA
,
strideB
);
}
}
};
...
...
paddle/fluid/operators/mul_op.h
浏览文件 @
ef6ea790
...
...
@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录