Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
0285a2b9
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 1 年 前同步成功
通知
695
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
0285a2b9
编写于
5月 04, 2018
作者:
Y
Yu Yang
提交者:
GitHub
5月 04, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #10371 from reyoung/refine_code
Polish MatMul, clean copy & paste code
上级
67f42cc1
ef6ea790
变更
23
隐藏空白更改
内联
并排
Showing
23 changed file
with
526 addition
and
671 deletion
+526
-671
paddle/fluid/operators/bilinear_tensor_product_op.h
paddle/fluid/operators/bilinear_tensor_product_op.h
+1
-1
paddle/fluid/operators/conv_op.h
paddle/fluid/operators/conv_op.h
+7
-9
paddle/fluid/operators/conv_transpose_op.h
paddle/fluid/operators/conv_transpose_op.h
+6
-10
paddle/fluid/operators/gru_unit_op.h
paddle/fluid/operators/gru_unit_op.h
+2
-3
paddle/fluid/operators/layer_norm_op.h
paddle/fluid/operators/layer_norm_op.h
+7
-7
paddle/fluid/operators/lstm_op.h
paddle/fluid/operators/lstm_op.h
+15
-19
paddle/fluid/operators/lstmp_op.h
paddle/fluid/operators/lstmp_op.h
+29
-41
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+2
-1
paddle/fluid/operators/math/blas.cc
paddle/fluid/operators/math/blas.cc
+22
-0
paddle/fluid/operators/math/blas.h
paddle/fluid/operators/math/blas.h
+152
-0
paddle/fluid/operators/math/blas_impl.cu.h
paddle/fluid/operators/math/blas_impl.cu.h
+101
-16
paddle/fluid/operators/math/blas_impl.h
paddle/fluid/operators/math/blas_impl.h
+122
-11
paddle/fluid/operators/math/context_project.h
paddle/fluid/operators/math/context_project.h
+6
-5
paddle/fluid/operators/math/gru_compute.cc
paddle/fluid/operators/math/gru_compute.cc
+1
-1
paddle/fluid/operators/math/gru_compute.cu
paddle/fluid/operators/math/gru_compute.cu
+1
-1
paddle/fluid/operators/math/math_function.cc
paddle/fluid/operators/math/math_function.cc
+0
-194
paddle/fluid/operators/math/math_function.cu
paddle/fluid/operators/math/math_function.cu
+7
-225
paddle/fluid/operators/math/math_function.h
paddle/fluid/operators/math/math_function.h
+0
-77
paddle/fluid/operators/math/math_function_test.cc
paddle/fluid/operators/math/math_function_test.cc
+3
-3
paddle/fluid/operators/math/math_function_test.cu
paddle/fluid/operators/math/math_function_test.cu
+20
-23
paddle/fluid/operators/math/matmul.h
paddle/fluid/operators/math/matmul.h
+8
-7
paddle/fluid/operators/mul_op.h
paddle/fluid/operators/mul_op.h
+9
-9
paddle/fluid/operators/sequence_conv_op.h
paddle/fluid/operators/sequence_conv_op.h
+5
-8
未找到文件。
paddle/fluid/operators/bilinear_tensor_product_op.h
浏览文件 @
0285a2b9
...
...
@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/
math_function
.h"
#include "paddle/fluid/operators/math/
blas
.h"
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/conv_op.h
浏览文件 @
0285a2b9
...
...
@@ -17,9 +17,9 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/vol2col.h"
namespace
paddle
{
...
...
@@ -161,6 +161,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
DeviceContext
,
T
>
im2col
;
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
...
...
@@ -186,8 +187,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
DeviceContext
,
T
>
(
dev_ctx
,
filter_slice
,
false
,
col_matrix
,
false
,
T
(
1.0
),
&
out_slice
,
T
(
0.0
));
blas
.
MatMul
(
filter_slice
,
col_matrix
,
&
out_slice
);
}
}
}
...
...
@@ -274,6 +274,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
math
::
SetConstant
<
DeviceContext
,
T
>
set_zero
;
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
if
(
input_grad
)
{
input_grad
->
mutable_data
<
T
>
(
context
.
GetPlace
());
...
...
@@ -303,9 +304,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
col_matrix
.
ShareDataWith
(
in_grad_slice
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
math
::
matmul
<
DeviceContext
,
T
>
(
dev_ctx
,
filter_slice
,
true
,
out_grad_slice
,
false
,
T
(
1.0
),
&
col_matrix
,
T
(
0.0
));
blas
.
MatMul
(
filter_slice
,
true
,
out_grad_slice
,
false
,
&
col_matrix
);
if
(
is_expand
&&
data_dim
==
2U
)
{
col2im
(
dev_ctx
,
col
,
dilations
,
strides
,
...
...
@@ -352,9 +351,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
// gemm
Tensor
filter_grad_slice
=
filter_grad_
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
DeviceContext
,
T
>
(
dev_ctx
,
out_grad_slice
,
false
,
col_matrix
,
true
,
T
(
1.0
),
&
filter_grad_slice
,
T
(
1.0
));
blas
.
MatMul
(
out_grad_slice
,
false
,
col_matrix
,
true
,
&
filter_grad_slice
);
}
}
}
...
...
paddle/fluid/operators/conv_transpose_op.h
浏览文件 @
0285a2b9
...
...
@@ -16,8 +16,8 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/vol2col.h"
namespace
paddle
{
...
...
@@ -118,6 +118,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
output
->
mutable_data
<
T
>
(
context
.
GetPlace
());
math
::
SetConstant
<
DeviceContext
,
T
>
set_zero
;
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
set_zero
(
dev_ctx
,
output
,
static_cast
<
T
>
(
0
));
math
::
Col2ImFunctor
<
math
::
ColFormat
::
kCFO
,
DeviceContext
,
T
>
col2im
;
...
...
@@ -134,9 +135,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
// col_matrix = filter * input_batch
// of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
math
::
matmul
<
DeviceContext
,
T
>
(
dev_ctx
,
filter
,
true
,
input_batch
,
false
,
static_cast
<
T
>
(
1.0
),
&
col_matrix
,
static_cast
<
T
>
(
0.0
));
blas
.
MatMul
(
filter
,
true
,
input_batch
,
false
,
&
col_matrix
);
if
(
data_dim
==
2U
)
{
// col2im: col_matrix -> dy
...
...
@@ -213,6 +212,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
// im2col + gemm (similar to conv-forward)
// input need to compute gradient
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
if
(
input_grad
||
filter_grad
)
{
Tensor
col
;
col
.
mutable_data
<
T
>
(
col_shape
,
context
.
GetPlace
());
...
...
@@ -267,9 +267,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
// or
// (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
// d, h, w)
math
::
matmul
<
DeviceContext
,
T
>
(
dev_ctx
,
filter
,
false
,
col_matrix
,
false
,
static_cast
<
T
>
(
1.0
),
&
input_grad_batch
,
static_cast
<
T
>
(
0.0
));
blas
.
MatMul
(
filter
,
false
,
col_matrix
,
false
,
&
input_grad_batch
);
}
if
(
filter_grad
)
{
// input batch
...
...
@@ -279,9 +277,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
// or
// (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
// k_h * k_w)
math
::
matmul
<
DeviceContext
,
T
>
(
dev_ctx
,
in_batch
,
false
,
col_matrix
,
true
,
static_cast
<
T
>
(
1.0
),
&
filter_grad_
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
in_batch
,
false
,
col_matrix
,
true
,
&
filter_grad_
);
}
}
}
...
...
paddle/fluid/operators/gru_unit_op.h
浏览文件 @
0285a2b9
...
...
@@ -14,11 +14,10 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/blas.h"
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/layer_norm_op.h
浏览文件 @
0285a2b9
...
...
@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
...
...
@@ -46,9 +46,9 @@ class RowwiseMean2D<platform::CUDADeviceContext, T> {
}
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
framework
::
Tensor
*
out
)
{
math
::
gemv
<
platform
::
CUDADeviceContext
,
T
>
(
context
,
false
,
left_
,
right_
,
1.
,
input
.
data
<
T
>
(),
divisor_
.
data
<
T
>
()
,
0.
,
out
->
data
<
T
>
());
math
::
GetBlas
<
platform
::
CUDADeviceContext
,
T
>
(
context
).
GEMV
(
false
,
left_
,
right_
,
1.
,
input
.
data
<
T
>
(),
divisor_
.
data
<
T
>
(),
0.
,
out
->
data
<
T
>
());
}
private:
...
...
@@ -93,9 +93,9 @@ class ColwiseSum2D<platform::CUDADeviceContext, T> {
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
framework
::
Tensor
*
out
)
{
math
::
gemv
<
platform
::
CUDADeviceContext
,
T
>
(
context
,
true
,
left_
,
right_
,
1.
,
input
.
data
<
T
>
(),
divisor_
.
data
<
T
>
()
,
0.
,
out
->
data
<
T
>
());
math
::
GetBlas
<
platform
::
CUDADeviceContext
,
T
>
(
context
).
GEMV
(
true
,
left_
,
right_
,
1.
,
input
.
data
<
T
>
(),
divisor_
.
data
<
T
>
(),
0.
,
out
->
data
<
T
>
());
}
private:
...
...
paddle/fluid/operators/lstm_op.h
浏览文件 @
0285a2b9
...
...
@@ -15,9 +15,9 @@ limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
namespace
paddle
{
...
...
@@ -114,6 +114,7 @@ class LSTMKernel : public framework::OpKernel<T> {
auto
cand_act
=
math
::
detail
::
GetActivationType
(
ctx
.
Attr
<
std
::
string
>
(
"candidate_activation"
));
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
device_ctx
);
for
(
size_t
n
=
0
;
n
<
num_batch
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
...
...
@@ -129,9 +130,8 @@ class LSTMKernel : public framework::OpKernel<T> {
int
pre_h_start
=
static_cast
<
int
>
(
batch_starts
[
n
-
1
]);
int
pre_h_end
=
pre_h_start
+
cur_batch_size
;
auto
pre_hidden_t
=
batch_hidden
.
Slice
(
pre_h_start
,
pre_h_end
);
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
pre_hidden_t
,
false
,
*
weight
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
pre_hidden_t
,
false
,
*
weight
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
1.0
));
}
else
if
(
hidden_t0
)
{
// If n == 0 and there is no initialized hidden state, that is to say
// the H0 is zeros, the calculation W_h * H0 will be skiped.
...
...
@@ -143,9 +143,8 @@ class LSTMKernel : public framework::OpKernel<T> {
Tensor
ordered_h0
;
ReorderInitState
<
DeviceContext
,
T
>
(
device_ctx
,
*
hidden_t0
,
order
,
&
ordered_h0
,
true
);
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
ordered_h0
,
false
,
*
weight
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
ordered_h0
,
false
,
*
weight
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
1.0
));
}
lstm_value
.
gate_value
=
gate_t
.
data
<
T
>
();
...
...
@@ -282,6 +281,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
auto
batch_starts
=
batch_gate
->
lod
()[
0
];
size_t
num_batch
=
batch_starts
.
size
()
-
1
;
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
device_ctx
);
for
(
int
n
=
static_cast
<
int
>
(
num_batch
)
-
1
;
n
>=
0
;
n
--
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
...
...
@@ -320,29 +320,25 @@ class LSTMGradKernel : public framework::OpKernel<T> {
int
pre_h_start
=
static_cast
<
int
>
(
batch_starts
[
n
-
1
]);
int
pre_h_end
=
pre_h_start
+
cur_batch_size
;
auto
pre_hidden_g
=
batch_hidden_g
.
Slice
(
pre_h_start
,
pre_h_end
);
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
gate_g
,
false
,
*
weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
pre_hidden_g
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
gate_g
,
false
,
*
weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
pre_hidden_g
,
static_cast
<
T
>
(
1.0
));
if
(
weight_g
)
{
/* backward weight */
auto
pre_hidden
=
batch_hidden
.
Slice
(
pre_h_start
,
pre_h_end
);
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
pre_hidden
,
true
,
gate_g
,
false
,
static_cast
<
T
>
(
1.0
),
weight_g
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
pre_hidden
,
true
,
gate_g
,
false
,
static_cast
<
T
>
(
1.0
),
weight_g
,
static_cast
<
T
>
(
1.0
));
}
}
else
{
if
(
h0
&&
weight_g
)
{
ReorderInitState
<
DeviceContext
,
T
>
(
device_ctx
,
*
h0
,
order
,
&
ordered_h0
,
true
);
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
ordered_h0
,
true
,
gate_g
,
false
,
static_cast
<
T
>
(
1.0
),
weight_g
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
ordered_h0
,
true
,
gate_g
,
false
,
static_cast
<
T
>
(
1.0
),
weight_g
,
static_cast
<
T
>
(
1.0
));
}
if
(
h0
&&
h0_g
)
{
ordered_h0_g
.
mutable_data
<
T
>
(
h0_g
->
dims
(),
ctx
.
GetPlace
());
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
gate_g
,
false
,
*
weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
ordered_h0_g
,
static_cast
<
T
>
(
0.0
));
blas
.
MatMul
(
gate_g
,
false
,
*
weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
ordered_h0_g
,
static_cast
<
T
>
(
0.0
));
}
}
}
...
...
paddle/fluid/operators/lstmp_op.h
浏览文件 @
0285a2b9
...
...
@@ -14,15 +14,14 @@ limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -143,7 +142,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
auto
proj_act
=
math
::
detail
::
GetActivationType
(
ctx
.
Attr
<
std
::
string
>
(
"proj_activation"
));
auto
&
place
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
device_ctx
);
for
(
size_t
n
=
0
;
n
<
num_batch
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
...
...
@@ -160,9 +159,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
int
pre_h_start
=
static_cast
<
int
>
(
batch_starts
[
n
-
1
]);
int
pre_h_end
=
pre_h_start
+
cur_batch_size
;
auto
pre_proj_t
=
batch_proj
.
Slice
(
pre_h_start
,
pre_h_end
);
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
pre_proj_t
,
false
,
*
weight
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
pre_proj_t
,
false
,
*
weight
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
1.0
));
}
else
if
(
hidden_t0
)
{
// If n == 0 and there is no initialized hidden state, that is to say
// the H0 is zeros, the calculation W_h * H0 will be skiped.
...
...
@@ -176,16 +174,14 @@ class LSTMPKernel : public framework::OpKernel<T> {
ordered_proj0
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
ReorderInitState
<
DeviceContext
,
T
>
(
device_ctx
,
*
hidden_t0
,
order
,
&
ordered_h0
,
true
);
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
ordered_h0
,
false
,
*
proj_weight
,
false
,
static_cast
<
T
>
(
1.0
),
ordered_proj0
,
static_cast
<
T
>
(
0.0
));
blas
.
MatMul
(
ordered_h0
,
false
,
*
proj_weight
,
false
,
static_cast
<
T
>
(
1.0
),
ordered_proj0
,
static_cast
<
T
>
(
0.0
));
if
(
proj_act
!=
math
::
detail
::
ActivationType
::
kIdentity
)
{
auto
proj0_dev
=
EigenMatrix
<
T
>::
From
(
*
ordered_proj0
);
ActCompute
(
cell_act
,
place
,
proj0_dev
,
proj0_dev
);
}
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
*
ordered_proj0
,
false
,
*
weight
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
*
ordered_proj0
,
false
,
*
weight
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
1.0
));
}
lstmp_value
.
gate_value
=
gate_t
.
data
<
T
>
();
...
...
@@ -196,9 +192,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
device_ctx
,
lstmp_value
,
frame_size
,
cur_batch_size
,
gate_act
,
cell_act
,
cand_act
);
lstmp_value
.
prev_state_value
=
lstmp_value
.
state_value
;
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
hidden_t
,
false
,
*
proj_weight
,
false
,
static_cast
<
T
>
(
1.0
),
&
proj_t
,
static_cast
<
T
>
(
0.0
));
blas
.
MatMul
(
hidden_t
,
false
,
*
proj_weight
,
false
,
static_cast
<
T
>
(
1.0
),
&
proj_t
,
static_cast
<
T
>
(
0.0
));
if
(
proj_act
!=
math
::
detail
::
ActivationType
::
kIdentity
)
{
auto
proj_t_dev
=
EigenMatrix
<
T
>::
From
(
proj_t
);
ActCompute
(
cell_act
,
place
,
proj_t_dev
,
proj_t_dev
);
...
...
@@ -361,6 +356,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
auto
batch_starts
=
batch_gate
->
lod
()[
0
];
size_t
num_batch
=
batch_starts
.
size
()
-
1
;
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
device_ctx
);
for
(
int
n
=
static_cast
<
int
>
(
num_batch
)
-
1
;
n
>=
0
;
n
--
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
...
...
@@ -375,15 +371,13 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
}
/* hidden state backwarad */
Tensor
out_g
=
batch_hidden_g
.
Slice
(
bstart
,
bend
);
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
proj_g
,
false
,
*
proj_weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
out_g
,
static_cast
<
T
>
(
0.0
));
blas
.
MatMul
(
proj_g
,
false
,
*
proj_weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
out_g
,
static_cast
<
T
>
(
0.0
));
/* projection weight backward*/
if
(
proj_weight_g
)
{
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
hidden_t
,
true
,
proj_g
,
false
,
static_cast
<
T
>
(
1.0
),
proj_weight_g
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
hidden_t
,
true
,
proj_g
,
false
,
static_cast
<
T
>
(
1.0
),
proj_weight_g
,
static_cast
<
T
>
(
1.0
));
}
Tensor
gate
=
batch_gate
->
Slice
(
bstart
,
bend
);
...
...
@@ -419,24 +413,21 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
int
pre_h_start
=
static_cast
<
int
>
(
batch_starts
[
n
-
1
]);
int
pre_h_end
=
pre_h_start
+
cur_batch_size
;
auto
pre_proj_g
=
batch_proj_g
.
Slice
(
pre_h_start
,
pre_h_end
);
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
gate_g
,
false
,
*
weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
pre_proj_g
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
gate_g
,
false
,
*
weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
pre_proj_g
,
static_cast
<
T
>
(
1.0
));
if
(
weight_g
)
{
/* weight backward*/
auto
pre_proj
=
batch_proj
.
Slice
(
pre_h_start
,
pre_h_end
);
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
pre_proj
,
true
,
gate_g
,
false
,
static_cast
<
T
>
(
1.0
),
weight_g
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
pre_proj
,
true
,
gate_g
,
false
,
static_cast
<
T
>
(
1.0
),
weight_g
,
static_cast
<
T
>
(
1.0
));
}
}
else
{
if
(
h0
&&
weight_g
)
{
ReorderInitState
<
DeviceContext
,
T
>
(
device_ctx
,
*
h0
,
order
,
&
ordered_h0
,
true
);
if
(
weight_g
)
{
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
*
ordered_proj0
,
true
,
gate_g
,
false
,
static_cast
<
T
>
(
1.0
),
weight_g
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
*
ordered_proj0
,
true
,
gate_g
,
false
,
static_cast
<
T
>
(
1.0
),
weight_g
,
static_cast
<
T
>
(
1.0
));
}
}
if
(
h0
&&
(
h0_g
||
proj_weight_g
))
{
...
...
@@ -444,9 +435,8 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
Tensor
proj0_g
;
proj0_g
.
Resize
({
in_dims
[
0
],
proj_weight
->
dims
()[
1
]});
proj0_g
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
gate_g
,
false
,
*
weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
proj0_g
,
static_cast
<
T
>
(
0.0
));
blas
.
MatMul
(
gate_g
,
false
,
*
weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
proj0_g
,
static_cast
<
T
>
(
0.0
));
if
(
proj_act
!=
math
::
detail
::
ActivationType
::
kIdentity
)
{
auto
proj0_dev
=
EigenMatrix
<
T
>::
From
(
*
ordered_proj0
);
auto
proj0_g_dev
=
EigenMatrix
<
T
>::
From
(
proj0_g
);
...
...
@@ -454,14 +444,12 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
proj0_g_dev
);
}
if
(
h0_g
)
{
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
proj0_g
,
false
,
*
proj_weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
ordered_h0_g
,
static_cast
<
T
>
(
0.0
));
blas
.
MatMul
(
proj0_g
,
false
,
*
proj_weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
ordered_h0_g
,
static_cast
<
T
>
(
0.0
));
}
if
(
proj_weight_g
)
{
math
::
matmul
<
DeviceContext
,
T
>
(
device_ctx
,
ordered_h0
,
true
,
proj0_g
,
false
,
static_cast
<
T
>
(
1.0
),
proj_weight_g
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
ordered_h0
,
true
,
proj0_g
,
false
,
static_cast
<
T
>
(
1.0
),
proj_weight_g
,
static_cast
<
T
>
(
1.0
));
}
}
}
...
...
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
0285a2b9
...
...
@@ -41,7 +41,8 @@ math_library(depthwise_conv)
math_library
(
gru_compute DEPS activation_functions math_function
)
math_library
(
im2col
)
math_library
(
lstm_compute DEPS activation_functions
)
math_library
(
math_function DEPS cblas
)
cc_library
(
blas SRCS blas.cc DEPS cblas framework_proto
)
math_library
(
math_function DEPS blas
)
math_library
(
maxouting
)
math_library
(
pooling
)
math_library
(
selected_rows_functor DEPS selected_rows math_function
)
...
...
paddle/fluid/operators/math/blas.cc
0 → 100644
浏览文件 @
0285a2b9
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/math/blas.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
// Do nothing. Blas is a header only library.
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/blas.h
0 → 100644
浏览文件 @
0285a2b9
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#ifdef PADDLE_WITH_MKLML
#include <mkl_cblas.h>
#include <mkl_lapacke.h>
#include <mkl_vml_functions.h>
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#include <lapacke.h>
#endif
#ifndef LAPACK_FOUND
extern
"C"
{
#include <cblas.h> // NOLINT
int
LAPACKE_sgetrf
(
int
matrix_layout
,
int
m
,
int
n
,
float
*
a
,
int
lda
,
int
*
ipiv
);
int
LAPACKE_dgetrf
(
int
matrix_layout
,
int
m
,
int
n
,
double
*
a
,
int
lda
,
int
*
ipiv
);
int
LAPACKE_sgetri
(
int
matrix_layout
,
int
n
,
float
*
a
,
int
lda
,
const
int
*
ipiv
);
int
LAPACKE_dgetri
(
int
matrix_layout
,
int
n
,
double
*
a
,
int
lda
,
const
int
*
ipiv
);
}
#endif
namespace
paddle
{
namespace
operators
{
namespace
math
{
template
<
typename
DeviceContext
>
class
Blas
{
public:
explicit
Blas
(
const
DeviceContext
&
context
)
:
context_
(
context
)
{}
template
<
typename
T
>
void
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
;
template
<
typename
T
>
void
GEMM
(
bool
transA
,
bool
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
int
lda
,
const
T
*
B
,
int
ldb
,
T
beta
,
T
*
C
,
int
ldc
)
const
;
template
<
typename
T
>
void
MatMul
(
const
framework
::
Tensor
&
mat_a
,
bool
trans_a
,
const
framework
::
Tensor
&
mat_b
,
bool
trans_b
,
T
alpha
,
framework
::
Tensor
*
mat_out
,
T
beta
)
const
;
template
<
typename
T
>
void
MatMul
(
const
framework
::
Tensor
&
mat_a
,
bool
trans_a
,
const
framework
::
Tensor
&
mat_b
,
bool
trans_b
,
framework
::
Tensor
*
mat_out
)
const
{
MatMul
(
mat_a
,
trans_a
,
mat_b
,
trans_b
,
static_cast
<
T
>
(
1.0
),
mat_out
,
static_cast
<
T
>
(
0.0
));
}
template
<
typename
T
>
void
MatMul
(
const
framework
::
Tensor
&
mat_a
,
const
framework
::
Tensor
&
mat_b
,
framework
::
Tensor
*
mat_out
)
const
{
this
->
template
MatMul
<
T
>(
mat_a
,
false
,
mat_b
,
false
,
mat_out
);
}
template
<
typename
T
>
void
AXPY
(
int
n
,
T
alpha
,
const
T
*
x
,
T
*
y
)
const
;
template
<
typename
T
>
void
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
;
template
<
typename
T
>
void
BatchedGEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
,
int
batchCount
,
int64_t
strideA
,
int64_t
strideB
)
const
;
private:
const
DeviceContext
&
context_
;
};
template
<
typename
DeviceContext
,
typename
T
>
class
BlasT
:
private
Blas
<
DeviceContext
>
{
public:
using
Blas
<
DeviceContext
>::
Blas
;
template
<
typename
...
ARGS
>
void
GEMM
(
ARGS
...
args
)
const
{
Base
()
->
template
GEMM
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
MatMul
(
ARGS
...
args
)
const
{
Base
()
->
template
MatMul
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
AXPY
(
ARGS
...
args
)
const
{
Base
()
->
template
AXPY
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
GEMV
(
ARGS
...
args
)
const
{
Base
()
->
template
GEMV
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
BatchedGEMM
(
ARGS
...
args
)
const
{
Base
()
->
template
BatchedGEMM
<
T
>(
args
...);
}
private:
const
Blas
<
DeviceContext
>*
Base
()
const
{
return
static_cast
<
const
Blas
<
DeviceContext
>*>
(
this
);
}
};
template
<
typename
DeviceContext
,
typename
T
>
inline
BlasT
<
DeviceContext
,
T
>
GetBlas
(
const
framework
::
ExecutionContext
&
exe_ctx
)
{
return
BlasT
<
DeviceContext
,
T
>
(
exe_ctx
.
template
device_context
<
DeviceContext
>());
}
template
<
typename
DeviceContext
,
typename
T
>
inline
BlasT
<
DeviceContext
,
T
>
GetBlas
(
const
DeviceContext
&
dev_ctx
)
{
return
BlasT
<
DeviceContext
,
T
>
(
dev_ctx
);
}
}
// namespace math
}
// namespace operators
}
// namespace paddle
#include "paddle/fluid/operators/math/blas_impl.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/math/blas_impl.cu.h"
#endif
paddle/fluid/operators/math/blas_impl.cu.h
浏览文件 @
0285a2b9
...
...
@@ -30,6 +30,25 @@ struct CUBlas<float> {
static
void
GEMM
(
ARGS
...
args
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemm
(
args
...));
}
template
<
typename
...
ARGS
>
static
void
AXPY
(
ARGS
...
args
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSaxpy
(
args
...));
}
template
<
typename
...
ARGS
>
static
void
GEMV
(
ARGS
...
args
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemv
(
args
...));
}
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemmStridedBatched
(
args
...));
#else
PADDLE_THROW
(
"SgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
};
template
<
>
...
...
@@ -38,6 +57,25 @@ struct CUBlas<double> {
static
void
GEMM
(
ARGS
...
args
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDgemm
(
args
...));
}
template
<
typename
...
ARGS
>
static
void
AXPY
(
ARGS
...
args
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDaxpy
(
args
...));
}
template
<
typename
...
ARGS
>
static
void
GEMV
(
ARGS
...
args
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDgemv
(
args
...));
}
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDgemmStridedBatched
(
args
...));
#else
PADDLE_THROW
(
"DgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
};
template
<
>
...
...
@@ -57,16 +95,23 @@ struct CUBlas<platform::float16> {
reinterpret_cast
<
const
__half
*>
(
beta
),
reinterpret_cast
<
__half
*>
(
C
),
ldc
));
}
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasHgemmStridedBatched
(
args
...));
#else
PADDLE_THROW
(
"HgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
};
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CUDADeviceContext
>::
GEMM
(
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
T
alpha
,
const
T
*
A
,
const
T
*
B
,
const
T
beta
,
T
*
C
)
const
{
void
Blas
<
platform
::
CUDADeviceContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
...
...
@@ -83,10 +128,10 @@ void Blas<platform::CUDADeviceContext>::GEMM(const CBLAS_TRANSPOSE transA,
template
<
>
template
<
>
inline
void
Blas
<
platform
::
CUDADeviceContext
>::
GEMM
(
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
platform
::
float16
alpha
,
const
platform
::
float16
*
A
,
const
platform
::
float16
*
B
,
const
platform
::
float16
beta
,
platform
::
float16
*
C
)
const
{
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
platform
::
float16
alpha
,
const
platform
::
float16
*
A
,
const
platform
::
float16
*
B
,
platform
::
float16
beta
,
platform
::
float16
*
C
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
...
...
@@ -134,18 +179,58 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CUDADeviceContext
>::
GEMM
(
const
bool
transA
,
const
bool
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
T
alpha
,
const
T
*
A
,
const
int
lda
,
const
T
*
B
,
const
int
ldb
,
const
T
beta
,
T
*
C
,
const
int
ldc
)
const
{
void
Blas
<
platform
::
CUDADeviceContext
>::
GEMM
(
bool
transA
,
bool
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
int
lda
,
const
T
*
B
,
int
ldb
,
T
beta
,
T
*
C
,
int
ldc
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t
cuTransA
=
transA
==
false
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
transB
==
false
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransA
=
transA
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
cublasOperation_t
cuTransB
=
transB
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
CUBlas
<
T
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CUDADeviceContext
>::
AXPY
(
int
n
,
T
alpha
,
const
T
*
x
,
T
*
y
)
const
{
CUBlas
<
T
>::
AXPY
(
context_
.
cublas_handle
(),
n
,
&
alpha
,
x
,
1
,
y
,
1
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CUDADeviceContext
>::
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
{
cublasOperation_t
cuTransA
=
!
trans_a
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
CUBlas
<
T
>::
GEMV
(
context_
.
cublas_handle
(),
cuTransA
,
N
,
M
,
&
alpha
,
A
,
N
,
B
,
1
,
&
beta
,
C
,
1
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CUDADeviceContext
>::
BatchedGEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
,
int
batchCount
,
int64_t
strideA
,
int64_t
strideB
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
cublasOperation_t
cuTransA
=
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
const
int64_t
strideC
=
M
*
N
;
CUBlas
<
T
>::
GEMM_BATCH
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
strideB
,
A
,
lda
,
strideA
,
&
beta
,
C
,
ldc
,
strideC
,
batchCount
);
}
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/blas_impl.h
浏览文件 @
0285a2b9
...
...
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
...
...
@@ -28,6 +28,23 @@ struct CBlas<float> {
static
void
GEMM
(
ARGS
...
args
)
{
cblas_sgemm
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
AXPY
(
ARGS
...
args
)
{
cblas_saxpy
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMV
(
ARGS
...
args
)
{
cblas_sgemv
(
args
...);
}
#ifdef PADDLE_WITH_MKLML
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
cblas_sgemm_batch
(
args
...);
}
#endif
};
template
<
>
...
...
@@ -36,21 +53,41 @@ struct CBlas<double> {
static
void
GEMM
(
ARGS
...
args
)
{
cblas_dgemm
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
AXPY
(
ARGS
...
args
)
{
cblas_daxpy
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMV
(
ARGS
...
args
)
{
cblas_dgemv
(
args
...);
}
#ifdef PADDLE_WITH_MKLML
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
cblas_dgemm_batch
(
args
...);
}
#endif
};
template
<
>
struct
CBlas
<
platform
::
float16
>
{
static
void
GEMM
(...)
{
PADDLE_THROW
(
"float16 GEMM not supported on CPU"
);
}
#ifdef PADDLE_WITH_MKLML
static
void
GEMM_BATCH
(...)
{
PADDLE_THROW
(
"float16 GEMM_BATCH not supported on CPU"
);
}
#endif
};
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM
(
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
T
alpha
,
const
T
*
A
,
const
T
*
B
,
const
T
beta
,
T
*
C
)
const
{
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
{
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
...
...
@@ -60,15 +97,89 @@ void Blas<platform::CPUDeviceContext>::GEMM(const CBLAS_TRANSPOSE transA,
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM
(
const
bool
transA
,
const
bool
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
T
alpha
,
const
T
*
A
,
const
int
lda
,
const
T
*
B
,
const
int
ldb
,
const
T
beta
,
T
*
C
,
const
int
ldc
)
const
{
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM
(
bool
transA
,
bool
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
int
lda
,
const
T
*
B
,
int
ldb
,
T
beta
,
T
*
C
,
int
ldc
)
const
{
CBlas
<
T
>::
GEMM
(
CblasRowMajor
,
transA
==
false
?
CblasNoTrans
:
CblasTrans
,
transB
==
false
?
CblasNoTrans
:
CblasTrans
,
M
,
N
,
K
,
alpha
,
A
,
lda
,
B
,
ldb
,
beta
,
C
,
ldc
);
}
template
<
typename
DeviceContext
>
template
<
typename
T
>
void
Blas
<
DeviceContext
>::
MatMul
(
const
framework
::
Tensor
&
mat_a
,
bool
trans_a
,
const
framework
::
Tensor
&
mat_b
,
bool
trans_b
,
T
alpha
,
framework
::
Tensor
*
mat_out
,
T
beta
)
const
{
auto
dim_a
=
mat_a
.
dims
();
auto
dim_b
=
mat_b
.
dims
();
auto
dim_out
=
mat_out
->
dims
();
PADDLE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of matmul be matrix"
);
PADDLE_ENFORCE
(
mat_a
.
place
()
==
mat_b
.
place
()
&&
mat_a
.
place
()
==
mat_out
->
place
(),
"The places of matrices must be same"
);
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
int
K
=
!
trans_a
?
dim_a
[
1
]
:
dim_a
[
0
];
CBLAS_TRANSPOSE
transA
=
!
trans_a
?
CblasNoTrans
:
CblasTrans
;
CBLAS_TRANSPOSE
transB
=
!
trans_b
?
CblasNoTrans
:
CblasTrans
;
this
->
GEMM
(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
mat_a
.
data
<
T
>
(),
mat_b
.
data
<
T
>
(),
beta
,
mat_out
->
data
<
T
>
());
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
AXPY
(
int
n
,
T
alpha
,
const
T
*
x
,
T
*
y
)
const
{
CBlas
<
T
>::
AXPY
(
n
,
alpha
,
x
,
1
,
y
,
1
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
{
CBLAS_TRANSPOSE
transA
=
!
trans_a
?
CblasNoTrans
:
CblasTrans
;
CBlas
<
T
>::
GEMV
(
CblasRowMajor
,
transA
,
M
,
N
,
alpha
,
A
,
N
,
B
,
1
,
beta
,
C
,
1
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
BatchedGEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
,
int
batchCount
,
int64_t
strideA
,
int64_t
strideB
)
const
{
#ifdef PADDLE_WITH_MKLML
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
auto
a_array
=
std
::
vector
<
const
T
*>
(
batchCount
);
auto
b_array
=
std
::
vector
<
const
T
*>
(
batchCount
);
auto
c_array
=
std
::
vector
<
T
*>
(
batchCount
);
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
a_array
[
k
]
=
&
A
[
k
*
strideA
];
b_array
[
k
]
=
&
B
[
k
*
strideB
];
c_array
[
k
]
=
&
C
[
k
*
M
*
N
];
}
CBlas
<
T
>::
GEMM_BATCH
(
CblasRowMajor
,
&
transA
,
&
transB
,
&
M
,
&
N
,
&
K
,
&
alpha
,
a_array
.
data
(),
&
lda
,
b_array
.
data
(),
&
ldb
,
&
beta
,
c_array
.
data
(),
&
ldc
,
1
/* group_count */
,
&
batchCount
);
#else
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
const
float
*
Ak
=
&
A
[
k
*
strideA
];
const
float
*
Bk
=
&
B
[
k
*
strideB
];
float
*
Ck
=
&
C
[
k
*
M
*
N
];
this
->
template
GEMM
<
T
>(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
Ak
,
Bk
,
beta
,
Ck
);
}
#endif
}
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/context_project.h
浏览文件 @
0285a2b9
...
...
@@ -17,8 +17,8 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -211,6 +211,7 @@ class ContextProjectGradFunctor {
int
input_row_begin
,
input_row_end
;
int
sequence_height
,
sequence_width
;
sequence_width
=
in
.
dims
()[
1
];
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
context
);
if
(
input_grad
)
{
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
lod_level_0
.
size
())
-
1
;
++
i
)
{
...
...
@@ -262,8 +263,8 @@ class ContextProjectGradFunctor {
Tensor
out_t_sub
=
out_t
.
Slice
(
k
*
context_length
,
k
*
context_length
+
padding_size
);
Tensor
w_sub
=
padding_data
->
Slice
(
k
,
k
+
padding_size
);
axpy
<
DeviceContext
,
T
>
(
context
,
w_sub
.
numel
(),
static_cast
<
T
>
(
1
),
out_t_sub
.
data
<
T
>
(),
w_sub
.
data
<
T
>
());
blas
.
AXPY
(
w_sub
.
numel
(),
static_cast
<
T
>
(
1
),
out_t_sub
.
data
<
T
>
(
),
w_sub
.
data
<
T
>
());
}
}
if
(
down_pad
>
0
)
{
...
...
@@ -294,8 +295,8 @@ class ContextProjectGradFunctor {
(
down_pad_begin_row
+
t
)
*
context_length
);
Tensor
w_sub
=
padding_data
->
Slice
(
up_pad
+
padding_idx
,
up_pad
+
padding_idx
+
padding_size
);
axpy
<
DeviceContext
,
T
>
(
context
,
w_sub
.
numel
(),
static_cast
<
T
>
(
1
),
out_t_sub
.
data
<
T
>
(),
w_sub
.
data
<
T
>
());
blas
.
AXPY
(
w_sub
.
numel
(),
static_cast
<
T
>
(
1
),
out_t_sub
.
data
<
T
>
(
),
w_sub
.
data
<
T
>
());
}
}
out_t
.
Resize
({
sequence_height
,
context_length
*
sequence_width
});
...
...
paddle/fluid/operators/math/gru_compute.cc
浏览文件 @
0285a2b9
...
...
@@ -10,9 +10,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/math/gru_compute.cu
浏览文件 @
0285a2b9
...
...
@@ -10,10 +10,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/platform/device_context.h>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/math/math_function.cc
浏览文件 @
0285a2b9
...
...
@@ -24,200 +24,6 @@ namespace math {
using
float16
=
paddle
::
platform
::
float16
;
template
<
>
void
matmul
<
platform
::
CPUDeviceContext
,
float16
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float16
alpha
,
framework
::
Tensor
*
matrix_out
,
float16
beta
)
{
PADDLE_THROW
(
"float16 matmul not supported on CPU"
);
}
template
<
>
void
matmul
<
platform
::
CPUDeviceContext
,
float
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
)
{
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
PADDLE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of matmul be matrix"
);
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
matrix_a
.
place
())
&&
platform
::
is_cpu_place
(
matrix_b
.
place
())
&&
platform
::
is_cpu_place
(
matrix_out
->
place
()),
"Matrix must all be in CPUPlace"
);
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
int
K
=
(
trans_a
==
false
)
?
dim_a
[
1
]
:
dim_a
[
0
];
CBLAS_TRANSPOSE
transA
=
(
trans_a
==
false
)
?
CblasNoTrans
:
CblasTrans
;
CBLAS_TRANSPOSE
transB
=
(
trans_b
==
false
)
?
CblasNoTrans
:
CblasTrans
;
Blas
<
platform
::
CPUDeviceContext
>
(
context
).
GEMM
(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
matrix_b
.
data
<
float
>
(),
beta
,
matrix_out
->
data
<
float
>
());
}
template
<
>
void
matmul
<
platform
::
CPUDeviceContext
,
double
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
double
alpha
,
framework
::
Tensor
*
matrix_out
,
double
beta
)
{
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
PADDLE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of matmul be matrix"
);
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
matrix_a
.
place
())
&&
platform
::
is_cpu_place
(
matrix_b
.
place
())
&&
platform
::
is_cpu_place
(
matrix_out
->
place
()),
"Matrix must all be in CPUPlace"
);
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
int
K
=
(
trans_a
==
false
)
?
dim_a
[
1
]
:
dim_a
[
0
];
CBLAS_TRANSPOSE
transA
=
(
trans_a
==
false
)
?
CblasNoTrans
:
CblasTrans
;
CBLAS_TRANSPOSE
transB
=
(
trans_b
==
false
)
?
CblasNoTrans
:
CblasTrans
;
Blas
<
platform
::
CPUDeviceContext
>
(
context
).
GEMM
(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
double
>
(),
matrix_b
.
data
<
double
>
(),
beta
,
matrix_out
->
data
<
double
>
());
}
template
<
>
void
batched_gemm
<
platform
::
CPUDeviceContext
,
float16
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
float16
alpha
,
const
float16
*
A
,
const
float16
*
B
,
const
float16
beta
,
float16
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
PADDLE_THROW
(
"float16 batched_gemm not supported on CPU"
);
}
#ifdef PADDLE_WITH_MKLML
// Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize.
template
<
>
void
batched_gemm
<
platform
::
CPUDeviceContext
,
float
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
float
alpha
,
const
float
*
A
,
const
float
*
B
,
const
float
beta
,
float
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
auto
a_array
=
std
::
vector
<
const
float
*>
(
batchCount
);
auto
b_array
=
std
::
vector
<
const
float
*>
(
batchCount
);
auto
c_array
=
std
::
vector
<
float
*>
(
batchCount
);
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
a_array
[
k
]
=
&
A
[
k
*
strideA
];
b_array
[
k
]
=
&
B
[
k
*
strideB
];
c_array
[
k
]
=
&
C
[
k
*
M
*
N
];
}
cblas_sgemm_batch
(
CblasRowMajor
,
&
transA
,
&
transB
,
&
M
,
&
N
,
&
K
,
&
alpha
,
a_array
.
data
(),
&
lda
,
b_array
.
data
(),
&
ldb
,
&
beta
,
c_array
.
data
(),
&
ldc
,
1
/* group_count */
,
&
batchCount
);
}
template
<
>
void
batched_gemm
<
platform
::
CPUDeviceContext
,
double
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
double
alpha
,
const
double
*
A
,
const
double
*
B
,
const
double
beta
,
double
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
auto
a_array
=
std
::
vector
<
const
double
*>
(
batchCount
);
auto
b_array
=
std
::
vector
<
const
double
*>
(
batchCount
);
auto
c_array
=
std
::
vector
<
double
*>
(
batchCount
);
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
a_array
[
k
]
=
&
A
[
k
*
strideA
];
b_array
[
k
]
=
&
B
[
k
*
strideB
];
c_array
[
k
]
=
&
C
[
k
*
M
*
N
];
}
cblas_dgemm_batch
(
CblasRowMajor
,
&
transA
,
&
transB
,
&
M
,
&
N
,
&
K
,
&
alpha
,
a_array
.
data
(),
&
lda
,
b_array
.
data
(),
&
ldb
,
&
beta
,
c_array
.
data
(),
&
ldc
,
1
/* group_count */
,
&
batchCount
);
}
#else
// The below is a naive but correct serial implementation that just loops
// over the batch dimension. This is a fallback for when the batched gemm
// functions of Intel MKL are not available. In the future, this computation
// should be parallelized.
template
<
>
void
batched_gemm
<
platform
::
CPUDeviceContext
,
float
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
float
alpha
,
const
float
*
A
,
const
float
*
B
,
const
float
beta
,
float
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
const
float
*
Ak
=
&
A
[
k
*
strideA
];
const
float
*
Bk
=
&
B
[
k
*
strideB
];
float
*
Ck
=
&
C
[
k
*
M
*
N
];
Blas
<
platform
::
CPUDeviceContext
>
(
context
).
GEMM
(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
Ak
,
Bk
,
beta
,
Ck
);
}
}
template
<
>
void
batched_gemm
<
platform
::
CPUDeviceContext
,
double
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
double
alpha
,
const
double
*
A
,
const
double
*
B
,
const
double
beta
,
double
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
const
double
*
Ak
=
&
A
[
k
*
strideA
];
const
double
*
Bk
=
&
B
[
k
*
strideB
];
double
*
Ck
=
&
C
[
k
*
M
*
N
];
Blas
<
platform
::
CPUDeviceContext
>
(
context
).
GEMM
(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
Ak
,
Bk
,
beta
,
Ck
);
}
}
#endif
template
<
>
void
gemv
<
platform
::
CPUDeviceContext
,
float
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
bool
trans_a
,
const
int
M
,
const
int
N
,
const
float
alpha
,
const
float
*
A
,
const
float
*
B
,
const
float
beta
,
float
*
C
)
{
CBLAS_TRANSPOSE
transA
=
(
trans_a
==
false
)
?
CblasNoTrans
:
CblasTrans
;
cblas_sgemv
(
CblasRowMajor
,
transA
,
M
,
N
,
alpha
,
A
,
N
,
B
,
1
,
beta
,
C
,
1
);
}
template
<
>
void
gemv
<
platform
::
CPUDeviceContext
,
double
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
bool
trans_a
,
const
int
M
,
const
int
N
,
const
double
alpha
,
const
double
*
A
,
const
double
*
B
,
const
double
beta
,
double
*
C
)
{
CBLAS_TRANSPOSE
transA
=
(
trans_a
==
false
)
?
CblasNoTrans
:
CblasTrans
;
cblas_dgemv
(
CblasRowMajor
,
transA
,
M
,
N
,
alpha
,
A
,
N
,
B
,
1
,
beta
,
C
,
1
);
}
template
<
>
void
axpy
<
platform
::
CPUDeviceContext
,
float
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
int
n
,
const
float
alpha
,
const
float
*
x
,
float
*
y
)
{
cblas_saxpy
(
n
,
alpha
,
x
,
1
,
y
,
1
);
}
template
<
>
void
axpy
<
platform
::
CPUDeviceContext
,
double
>
(
const
platform
::
CPUDeviceContext
&
context
,
const
int
n
,
const
double
alpha
,
const
double
*
x
,
double
*
y
)
{
cblas_daxpy
(
n
,
alpha
,
x
,
1
,
y
,
1
);
}
template
struct
SetConstant
<
platform
::
CPUDeviceContext
,
platform
::
float16
>;
template
struct
SetConstant
<
platform
::
CPUDeviceContext
,
float
>;
template
struct
SetConstant
<
platform
::
CPUDeviceContext
,
double
>;
...
...
paddle/fluid/operators/math/math_function.cu
浏览文件 @
0285a2b9
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h"
...
...
@@ -25,223 +26,6 @@ namespace math {
using
float16
=
paddle
::
platform
::
float16
;
template
<
>
void
matmul
<
platform
::
CUDADeviceContext
,
float16
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float16
alpha
,
framework
::
Tensor
*
matrix_out
,
float16
beta
)
{
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
PADDLE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of matmul be matrix"
);
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
matrix_a
.
place
())
&&
platform
::
is_gpu_place
(
matrix_b
.
place
())
&&
platform
::
is_gpu_place
(
matrix_out
->
place
()),
"Matrix must all be in CUDAPlace"
);
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
int
K
=
(
trans_a
==
false
)
?
dim_a
[
1
]
:
dim_a
[
0
];
CBLAS_TRANSPOSE
transA
=
(
trans_a
==
false
)
?
CblasNoTrans
:
CblasTrans
;
CBLAS_TRANSPOSE
transB
=
(
trans_b
==
false
)
?
CblasNoTrans
:
CblasTrans
;
Blas
<
platform
::
CUDADeviceContext
>
(
context
).
GEMM
(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float16
>
(),
matrix_b
.
data
<
float16
>
(),
beta
,
matrix_out
->
data
<
float16
>
());
}
template
<
>
void
matmul
<
platform
::
CUDADeviceContext
,
float
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
)
{
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
PADDLE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of matmul be matrix"
);
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
matrix_a
.
place
())
&&
platform
::
is_gpu_place
(
matrix_b
.
place
())
&&
platform
::
is_gpu_place
(
matrix_out
->
place
()),
"Matrix must all be in CUDAPlace"
);
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
int
K
=
(
trans_a
==
false
)
?
dim_a
[
1
]
:
dim_a
[
0
];
CBLAS_TRANSPOSE
transA
=
(
trans_a
==
false
)
?
CblasNoTrans
:
CblasTrans
;
CBLAS_TRANSPOSE
transB
=
(
trans_b
==
false
)
?
CblasNoTrans
:
CblasTrans
;
Blas
<
platform
::
CUDADeviceContext
>
(
context
).
GEMM
(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
matrix_b
.
data
<
float
>
(),
beta
,
matrix_out
->
data
<
float
>
());
}
template
<
>
void
matmul
<
platform
::
CUDADeviceContext
,
double
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
double
alpha
,
framework
::
Tensor
*
matrix_out
,
double
beta
)
{
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
PADDLE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of matmul be matrix"
);
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
matrix_a
.
place
())
&&
platform
::
is_gpu_place
(
matrix_b
.
place
())
&&
platform
::
is_gpu_place
(
matrix_out
->
place
()),
"Matrix must all be in CUDAPlace"
);
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
int
K
=
(
trans_a
==
false
)
?
dim_a
[
1
]
:
dim_a
[
0
];
CBLAS_TRANSPOSE
transA
=
(
trans_a
==
false
)
?
CblasNoTrans
:
CblasTrans
;
CBLAS_TRANSPOSE
transB
=
(
trans_b
==
false
)
?
CblasNoTrans
:
CblasTrans
;
Blas
<
platform
::
CUDADeviceContext
>
(
context
).
GEMM
(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
double
>
(),
matrix_b
.
data
<
double
>
(),
beta
,
matrix_out
->
data
<
double
>
());
}
template
<
>
void
batched_gemm
<
platform
::
CUDADeviceContext
,
float16
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
float16
alpha
,
const
float16
*
A
,
const
float16
*
B
,
const
float16
beta
,
float16
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
cublasOperation_t
cuTransA
=
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
const
int64_t
strideC
=
M
*
N
;
const
half
h_alpha
=
static_cast
<
const
half
>
(
alpha
);
const
half
h_beta
=
static_cast
<
const
half
>
(
beta
);
const
half
*
h_A
=
reinterpret_cast
<
const
half
*>
(
A
);
const
half
*
h_B
=
reinterpret_cast
<
const
half
*>
(
B
);
half
*
h_C
=
reinterpret_cast
<
half
*>
(
C
);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE
(
context
.
GetComputeCapability
(),
53
,
"cublas Hgemm requires GPU compute capability >= 53"
);
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasHgemmStridedBatched
(
context
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
h_B
,
ldb
,
strideB
,
h_A
,
lda
,
strideA
,
&
h_beta
,
h_C
,
ldc
,
strideC
,
batchCount
));
#else
PADDLE_ENFORCE
(
false
,
"HgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
template
<
>
void
batched_gemm
<
platform
::
CUDADeviceContext
,
float
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
float
alpha
,
const
float
*
A
,
const
float
*
B
,
const
float
beta
,
float
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
cublasOperation_t
cuTransA
=
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
const
int64_t
strideC
=
M
*
N
;
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemmStridedBatched
(
context
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
strideB
,
A
,
lda
,
strideA
,
&
beta
,
C
,
ldc
,
strideC
,
batchCount
));
#else
PADDLE_ENFORCE
(
false
,
"SgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
template
<
>
void
batched_gemm
<
platform
::
CUDADeviceContext
,
double
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
double
alpha
,
const
double
*
A
,
const
double
*
B
,
const
double
beta
,
double
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
)
{
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
cublasOperation_t
cuTransA
=
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
const
int64_t
strideC
=
M
*
N
;
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDgemmStridedBatched
(
context
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
strideB
,
A
,
lda
,
strideA
,
&
beta
,
C
,
ldc
,
strideC
,
batchCount
));
#else
PADDLE_ENFORCE
(
false
,
"DgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
template
<
>
void
gemv
<
platform
::
CUDADeviceContext
,
float
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
bool
trans_a
,
const
int
M
,
const
int
N
,
const
float
alpha
,
const
float
*
A
,
const
float
*
B
,
const
float
beta
,
float
*
C
)
{
cublasOperation_t
cuTransA
=
(
trans_a
==
false
)
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemv
(
context
.
cublas_handle
(),
cuTransA
,
N
,
M
,
&
alpha
,
A
,
N
,
B
,
1
,
&
beta
,
C
,
1
));
}
template
<
>
void
gemv
<
platform
::
CUDADeviceContext
,
double
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
bool
trans_a
,
const
int
M
,
const
int
N
,
const
double
alpha
,
const
double
*
A
,
const
double
*
B
,
const
double
beta
,
double
*
C
)
{
cublasOperation_t
cuTransA
=
(
trans_a
==
false
)
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDgemv
(
context
.
cublas_handle
(),
cuTransA
,
N
,
M
,
&
alpha
,
A
,
N
,
B
,
1
,
&
beta
,
C
,
1
));
}
template
<
>
void
axpy
<
platform
::
CUDADeviceContext
,
float
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
int
n
,
const
float
alpha
,
const
float
*
x
,
float
*
y
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSaxpy
(
context
.
cublas_handle
(),
n
,
&
alpha
,
x
,
1
,
y
,
1
));
}
template
<
>
void
axpy
<
platform
::
CUDADeviceContext
,
double
>
(
const
platform
::
CUDADeviceContext
&
context
,
const
int
n
,
const
double
alpha
,
const
double
*
x
,
double
*
y
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDaxpy
(
context
.
cublas_handle
(),
n
,
&
alpha
,
x
,
1
,
y
,
1
));
}
template
struct
SetConstant
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
struct
SetConstant
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SetConstant
<
platform
::
CUDADeviceContext
,
double
>;
...
...
@@ -333,10 +117,9 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
one
.
mutable_data
<
double
>
({
in_dims
[
0
]},
context
.
GetPlace
());
SetConstant
<
platform
::
CUDADeviceContext
,
double
>
set
;
set
(
context
,
&
one
,
static_cast
<
double
>
(
1.0
));
gemv
<
platform
::
CUDADeviceContext
,
double
>
(
context
,
true
,
static_cast
<
int
>
(
in_dims
[
0
]),
static_cast
<
int
>
(
in_dims
[
1
]),
1.0
,
input
.
data
<
double
>
(),
one
.
data
<
double
>
(),
0.0
,
vector
->
data
<
double
>
());
GetBlas
<
platform
::
CUDADeviceContext
,
double
>
(
context
).
GEMV
(
true
,
static_cast
<
int
>
(
in_dims
[
0
]),
static_cast
<
int
>
(
in_dims
[
1
]),
1.0
,
input
.
data
<
double
>
(),
one
.
data
<
double
>
(),
0.0
,
vector
->
data
<
double
>
());
}
template
struct
RowwiseSum
<
platform
::
CUDADeviceContext
,
float
>;
...
...
@@ -355,10 +138,9 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
one
.
mutable_data
<
double
>
({
size
},
context
.
GetPlace
());
SetConstant
<
platform
::
CUDADeviceContext
,
double
>
set
;
set
(
context
,
&
one
,
static_cast
<
double
>
(
1.0
));
gemv
<
platform
::
CUDADeviceContext
,
double
>
(
context
,
true
,
static_cast
<
int
>
(
in_dims
[
1
]),
static_cast
<
int
>
(
in_dims
[
0
]),
1.0
,
one
.
data
<
double
>
(),
input
.
data
<
double
>
(),
0.0
,
vector
->
data
<
double
>
());
GetBlas
<
platform
::
CUDADeviceContext
,
double
>
(
context
).
GEMV
(
true
,
static_cast
<
int
>
(
in_dims
[
1
]),
static_cast
<
int
>
(
in_dims
[
0
]),
1.0
,
one
.
data
<
double
>
(),
input
.
data
<
double
>
(),
0.0
,
vector
->
data
<
double
>
());
}
template
struct
RowwiseMean
<
platform
::
CUDADeviceContext
,
float
>;
...
...
paddle/fluid/operators/math/math_function.h
浏览文件 @
0285a2b9
...
...
@@ -51,78 +51,6 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
namespace
paddle
{
namespace
operators
{
namespace
math
{
// Support continuous memory now
// If transA = N, and transB = N
// Then matrixA: M * K, matrixB: K * N, matrixC : M * N
// For more detailed info, please refer to
// http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
template
<
typename
DeviceContext
>
class
Blas
{
public:
explicit
Blas
(
const
DeviceContext
&
context
)
:
context_
(
context
)
{}
template
<
typename
T
>
void
GEMM
(
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
T
alpha
,
const
T
*
A
,
const
T
*
B
,
const
T
beta
,
T
*
C
)
const
;
template
<
typename
T
>
void
GEMM
(
const
bool
transA
,
const
bool
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
T
alpha
,
const
T
*
A
,
const
int
lda
,
const
T
*
B
,
const
int
ldb
,
const
T
beta
,
T
*
C
,
const
int
ldc
)
const
;
private:
const
DeviceContext
&
context_
;
};
template
<
typename
DeviceContext
,
typename
T
>
class
BlasT
:
private
Blas
<
DeviceContext
>
{
public:
using
Blas
<
DeviceContext
>::
Blas
;
template
<
typename
...
ARGS
>
void
GEMM
(
ARGS
...
args
)
const
{
static_cast
<
const
Blas
<
DeviceContext
>*>
(
this
)
->
template
GEMM
<
T
>(
args
...);
}
};
template
<
typename
DeviceContext
,
typename
T
>
inline
BlasT
<
DeviceContext
,
T
>
GetBlas
(
const
framework
::
ExecutionContext
&
exe_ctx
)
{
return
BlasT
<
DeviceContext
,
T
>
(
exe_ctx
.
template
device_context
<
DeviceContext
>());
}
template
<
typename
DeviceContext
,
typename
T
>
inline
BlasT
<
DeviceContext
,
T
>
GetBlas
(
const
DeviceContext
&
dev_ctx
)
{
return
BlasT
<
DeviceContext
,
T
>
(
dev_ctx
);
}
// matrix multiply with continuous memory
template
<
typename
DeviceContext
,
typename
T
>
void
matmul
(
const
DeviceContext
&
context
,
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
T
alpha
,
framework
::
Tensor
*
matrix_out
,
T
beta
);
// Batched gemm
template
<
typename
DeviceContext
,
typename
T
>
void
batched_gemm
(
const
DeviceContext
&
context
,
const
CBLAS_TRANSPOSE
transA
,
const
CBLAS_TRANSPOSE
transB
,
const
int
M
,
const
int
N
,
const
int
K
,
const
T
alpha
,
const
T
*
A
,
const
T
*
B
,
const
T
beta
,
T
*
C
,
const
int
batchCount
,
const
int64_t
strideA
,
const
int64_t
strideB
);
template
<
typename
DeviceContext
,
typename
T
>
void
gemv
(
const
DeviceContext
&
context
,
const
bool
trans_a
,
const
int
M
,
const
int
N
,
const
T
alpha
,
const
T
*
A
,
const
T
*
B
,
const
T
beta
,
T
*
C
);
template
<
typename
DeviceContext
,
typename
T
>
void
axpy
(
const
DeviceContext
&
context
,
const
int
n
,
const
T
alpha
,
const
T
*
x
,
T
*
y
);
template
<
typename
DeviceContext
,
typename
T
,
int
Rank
>
struct
Transpose
{
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
&
in
,
...
...
@@ -169,8 +97,3 @@ struct RowwiseMean {
}
// namespace math
}
// namespace operators
}
// namespace paddle
#include "paddle/fluid/operators/math/blas_impl.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/math/blas_impl.cu.h"
#endif
paddle/fluid/operators/math/math_function_test.cc
浏览文件 @
0285a2b9
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/operators/math/math_function.h"
#include "gtest/gtest.h"
#include "paddle/fluid/operators/math/blas.h"
template
<
typename
T
>
inline
paddle
::
operators
::
math
::
BlasT
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
...
...
@@ -129,9 +130,8 @@ void GemvTest(int m, int n, bool trans) {
}
paddle
::
platform
::
CPUDeviceContext
context
(
*
cpu_place
);
paddle
::
operators
::
math
::
gemv
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
(
context
,
trans
,
static_cast
<
int
>
(
m
),
static_cast
<
int
>
(
n
),
1.
,
data_a
,
data_b
,
0.
,
data_c
);
GetBlas
<
T
>
(
context
).
GEMV
(
trans
,
static_cast
<
int
>
(
m
),
static_cast
<
int
>
(
n
),
1.
,
data_a
,
data_b
,
0.
,
data_c
);
if
(
!
trans
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
...
...
paddle/fluid/operators/math/math_function_test.cu
浏览文件 @
0285a2b9
...
...
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
...
...
@@ -23,6 +24,13 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
}
}
template
<
typename
T
>
inline
paddle
::
operators
::
math
::
BlasT
<
paddle
::
platform
::
CUDADeviceContext
,
T
>
GetBlas
(
const
paddle
::
platform
::
CUDADeviceContext
&
context
)
{
return
paddle
::
operators
::
math
::
GetBlas
<
paddle
::
platform
::
CUDADeviceContext
,
T
>
(
context
);
}
TEST
(
math_function
,
notrans_mul_trans_fp32
)
{
paddle
::
framework
::
Tensor
input1
;
paddle
::
framework
::
Tensor
input1_gpu
;
...
...
@@ -42,9 +50,8 @@ TEST(math_function, notrans_mul_trans_fp32) {
paddle
::
framework
::
TensorCopySync
(
input1
,
gpu_place
,
&
input2_gpu
);
out_gpu
.
mutable_data
<
float
>
({
2
,
2
},
gpu_place
);
paddle
::
operators
::
math
::
matmul
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
(
context
,
input1_gpu
,
false
,
input2_gpu
,
true
,
1
,
&
out_gpu
,
0
);
GetBlas
<
float
>
(
context
).
MatMul
(
input1_gpu
,
false
,
input2_gpu
,
true
,
1
,
&
out_gpu
,
0
);
paddle
::
framework
::
TensorCopySync
(
out_gpu
,
cpu_place
,
&
out
);
...
...
@@ -81,10 +88,9 @@ TEST(math_function, notrans_mul_trans_fp16) {
out_gpu
.
mutable_data
<
paddle
::
platform
::
float16
>
({
2
,
2
},
gpu_place
);
paddle
::
operators
::
math
::
matmul
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
float16
>
(
context
,
input1_gpu
,
false
,
input2_gpu
,
true
,
paddle
::
platform
::
float16
(
1
),
&
out_gpu
,
paddle
::
platform
::
float16
(
0
));
GetBlas
<
paddle
::
platform
::
float16
>
(
context
).
MatMul
(
input1_gpu
,
false
,
input2_gpu
,
true
,
paddle
::
platform
::
float16
(
1
),
&
out_gpu
,
paddle
::
platform
::
float16
(
0
));
paddle
::
framework
::
TensorCopySync
(
out_gpu
,
cpu_place
,
&
out
);
...
...
@@ -116,8 +122,8 @@ TEST(math_function, trans_mul_notrans_fp32) {
out_gpu
.
mutable_data
<
float
>
({
3
,
3
},
gpu_place
);
paddle
::
operators
::
math
::
matmul
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
(
context
,
input1_gpu
,
true
,
input2_gpu
,
false
,
1
,
&
out_gpu
,
0
);
GetBlas
<
float
>
(
context
).
MatMul
(
input1_gpu
,
true
,
input2_gpu
,
false
,
1
,
&
out_gpu
,
0
);
paddle
::
framework
::
TensorCopySync
(
out_gpu
,
cpu_place
,
&
out
);
...
...
@@ -159,10 +165,9 @@ TEST(math_function, trans_mul_notrans_fp16) {
out_gpu
.
mutable_data
<
paddle
::
platform
::
float16
>
({
3
,
3
},
gpu_place
);
paddle
::
operators
::
math
::
matmul
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
float16
>
(
context
,
input1_gpu
,
true
,
input2_gpu
,
false
,
paddle
::
platform
::
float16
(
1
),
&
out_gpu
,
paddle
::
platform
::
float16
(
0
));
GetBlas
<
paddle
::
platform
::
float16
>
(
context
).
MatMul
(
input1_gpu
,
true
,
input2_gpu
,
false
,
paddle
::
platform
::
float16
(
1
),
&
out_gpu
,
paddle
::
platform
::
float16
(
0
));
paddle
::
framework
::
TensorCopySync
(
out_gpu
,
cpu_place
,
&
out
);
...
...
@@ -179,13 +184,6 @@ TEST(math_function, trans_mul_notrans_fp16) {
EXPECT_EQ
(
static_cast
<
float
>
(
out_ptr
[
8
]),
29
);
}
template
<
typename
T
>
inline
paddle
::
operators
::
math
::
BlasT
<
paddle
::
platform
::
CUDADeviceContext
,
T
>
GetBlas
(
const
paddle
::
platform
::
CUDADeviceContext
&
context
)
{
return
paddle
::
operators
::
math
::
GetBlas
<
paddle
::
platform
::
CUDADeviceContext
,
T
>
(
context
);
}
TEST
(
math_function
,
gemm_notrans_cublas_fp32
)
{
paddle
::
framework
::
Tensor
input1
;
paddle
::
framework
::
Tensor
input2
;
...
...
@@ -437,9 +435,8 @@ void GemvTest(int m, int n, bool trans) {
paddle
::
framework
::
TensorCopySync
(
mat_a
,
gpu_place
,
&
g_mat_a
);
paddle
::
framework
::
TensorCopySync
(
vec_b
,
gpu_place
,
&
g_vec_b
);
paddle
::
operators
::
math
::
gemv
<
paddle
::
platform
::
CUDADeviceContext
,
T
>
(
context
,
trans
,
static_cast
<
int
>
(
m
),
static_cast
<
int
>
(
n
),
1.
,
g_data_a
,
g_data_b
,
0.
,
g_data_c
);
GetBlas
<
T
>
(
context
).
GEMV
(
trans
,
static_cast
<
int
>
(
m
),
static_cast
<
int
>
(
n
),
1.
,
g_data_a
,
g_data_b
,
0.
,
g_data_c
);
paddle
::
framework
::
TensorCopySync
(
g_vec_c
,
cpu_place
,
&
vec_c
);
...
...
paddle/fluid/operators/math/matmul.h
浏览文件 @
0285a2b9
...
...
@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/operators/math/
math_function
.h"
#include "paddle/fluid/operators/math/
blas
.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -129,16 +129,17 @@ class MatMulFunctor {
CBLAS_TRANSPOSE
transA
=
(
trans_a
==
false
)
?
CblasNoTrans
:
CblasTrans
;
CBLAS_TRANSPOSE
transB
=
(
trans_b
==
false
)
?
CblasNoTrans
:
CblasTrans
;
auto
blas
=
GetBlas
<
DeviceContext
,
T
>
(
context
);
if
(
!
batchCount
)
{
// regular matrix multiplication
Blas
<
DeviceContext
>
(
context
).
GEMM
(
transA
,
transB
,
M
,
N
,
kA
,
alpha
,
a
.
data
<
T
>
(),
b
.
data
<
T
>
(),
beta
,
out
->
data
<
T
>
());
blas
.
GEMM
(
transA
,
transB
,
M
,
N
,
kA
,
alpha
,
a
.
data
<
T
>
(),
b
.
data
<
T
>
(),
beta
,
out
->
data
<
T
>
());
}
else
{
// batched matrix multiplication
b
atched_gemm
<
DeviceContext
,
T
>
(
context
,
transA
,
transB
,
M
,
N
,
kA
,
alpha
,
a
.
data
<
T
>
(),
b
.
data
<
T
>
()
,
beta
,
out
->
data
<
T
>
(),
batchCount
,
strideA
,
strideB
);
b
las
.
BatchedGEMM
(
transA
,
transB
,
M
,
N
,
kA
,
alpha
,
a
.
data
<
T
>
(),
b
.
data
<
T
>
(),
beta
,
out
->
data
<
T
>
(),
batchCount
,
strideA
,
strideB
);
}
}
};
...
...
paddle/fluid/operators/mul_op.h
浏览文件 @
0285a2b9
...
...
@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -46,9 +46,10 @@ class MulKernel : public framework::OpKernel<T> {
if
(
z_dim
.
size
()
!=
2
)
{
z
->
Resize
({
x_matrix
.
dims
()[
0
],
y_matrix
.
dims
()[
1
]});
}
math
::
matmul
<
DeviceContext
,
T
>
(
context
.
template
device_context
<
DeviceContext
>(),
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
T
>
(
1
),
z
,
static_cast
<
T
>
(
0
));
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
context
);
blas
.
MatMul
(
x_matrix
,
y_matrix
,
z
);
if
(
z_dim
.
size
()
!=
2
)
{
z
->
Resize
(
z_dim
);
}
...
...
@@ -79,6 +80,7 @@ class MulGradKernel : public framework::OpKernel<T> {
Tensor
*
dx
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
Tensor
*
dy
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
if
(
dx
)
{
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
Tensor
dx_matrix
=
dx
->
dims
().
size
()
>
2
...
...
@@ -86,8 +88,7 @@ class MulGradKernel : public framework::OpKernel<T> {
:
*
dx
;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
math
::
matmul
<
DeviceContext
,
T
>
(
dev_ctx
,
dout_mat
,
false
,
y_matrix
,
true
,
1
,
&
dx_matrix
,
0
);
blas
.
MatMul
(
dout_mat
,
false
,
y_matrix
,
true
,
&
dx_matrix
);
}
if
(
dy
)
{
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
...
...
@@ -95,8 +96,7 @@ class MulGradKernel : public framework::OpKernel<T> {
?
framework
::
ReshapeToMatrix
(
*
dy
,
y_num_col_dims
)
:
*
dy
;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
math
::
matmul
<
DeviceContext
,
T
>
(
dev_ctx
,
x_matrix
,
true
,
dout_mat
,
false
,
1
,
&
dy_matrix
,
0
);
blas
.
MatMul
(
x_matrix
,
true
,
dout_mat
,
false
,
&
dy_matrix
);
}
}
};
...
...
paddle/fluid/operators/sequence_conv_op.h
浏览文件 @
0285a2b9
...
...
@@ -58,17 +58,15 @@ class SequenceConvKernel : public framework::OpKernel<T> {
// Because if padding_trainable is false, padding data should be zeros.
math
::
SetConstant
<
DeviceContext
,
T
>
set_zero
;
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
set_zero
(
dev_ctx
,
&
col
,
static_cast
<
T
>
(
0
));
math
::
ContextProjectFunctor
<
DeviceContext
,
T
>
seq_project_functor
;
seq_project_functor
(
dev_ctx
,
*
in
,
*
padding_data
,
padding_trainable
,
context_start
,
context_length
,
context_stride
,
up_pad
,
down_pad
,
&
col
);
math
::
matmul
<
DeviceContext
,
T
>
(
dev_ctx
,
col
,
false
,
filter
,
false
,
static_cast
<
T
>
(
1.0
),
out
,
static_cast
<
T
>
(
0.0
));
blas
.
MatMul
(
col
,
filter
,
out
);
}
};
...
...
@@ -99,6 +97,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
math
::
SetConstant
<
DeviceContext
,
T
>
set_zero
;
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
// use col_shape in the im2col calculation
framework
::
DDim
col_shape
=
{
in
->
dims
()[
0
],
sequence_width
*
context_length
};
...
...
@@ -108,8 +107,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
col
.
mutable_data
<
T
>
(
col_shape
,
context
.
GetPlace
());
// Because if padding_trainable is false, padding data should be zeros.
set_zero
(
dev_ctx
,
&
col
,
static_cast
<
T
>
(
0
));
math
::
matmul
<
DeviceContext
,
T
>
(
dev_ctx
,
*
out_g
,
false
,
*
filter
,
true
,
T
(
1.0
),
&
col
,
T
(
1.0
));
blas
.
MatMul
(
*
out_g
,
false
,
*
filter
,
true
,
&
col
);
}
math
::
ContextProjectFunctor
<
DeviceContext
,
T
>
seq_project_functor
;
math
::
ContextProjectGradFunctor
<
DeviceContext
,
T
>
seq_project_grad_functor
;
...
...
@@ -150,8 +148,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
context_start
,
context_length
,
context_stride
,
up_pad
,
down_pad
,
&
col
);
math
::
matmul
<
DeviceContext
,
T
>
(
dev_ctx
,
col
,
true
,
out_grad
,
false
,
T
(
1.0
),
&
filter_grad
,
T
(
1.0
));
blas
.
MatMul
(
col
,
true
,
out_grad
,
false
,
&
filter_grad
);
}
}
};
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录