Commit f3cd2612, authored by: tensor-tang

refine fc and use the fc compute in fusion_lstm

Parent 40138c4c
paddle/fluid/operators/fc_op.cc
@@ -15,8 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/fc_op.h"
 #include <vector>
 #include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/fc_compute.h"
 
-DECLARE_int32(paddle_num_threads);
-
 namespace paddle {
 namespace operators {
@@ -110,13 +109,8 @@ void FCOpMaker::Make() {
   AddComment(R"DOC(
 Fully Connected Operator.
 
-The fully connected operation calculates the output based on the input, weights and bias attribute.
+The fully connected operation calculates the output based on the input, weights and bias.
 The size of each dimension of the parameters checked in the infer-shape.
-The matrix of bias is generated by the mkldnn framework, when the bias_attr is True.
-Additional parametrs are use_mkldnn and bias_attr.
-The input(X) size and output(Out) size may be diffrent.
-The fully connected layer only supports MKLDNN version
 )DOC");
 }
@@ -133,26 +127,13 @@ class FCOpKernel : public framework::OpKernel<T> {
     auto in_dims = input->dims();
     auto w_dims = w->dims();
 
-    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(dev_ctx);
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
 
-    blas.GEMM(CblasNoTrans, CblasNoTrans, in_dims[0], w_dims[1], w_dims[0],
-              static_cast<T>(1), input_data, w_data, static_cast<T>(0),
-              output_data);
-    if (bias) {
-      const T* bias_data = bias->data<T>();
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
-#endif
-      for (int bs = 0; bs < in_dims[0]; bs++) {
-        blas.AXPY(w_dims[1], static_cast<T>(1), bias_data,
-                  output_data + bs * w_dims[1]);
-      }
-    }
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+    math::FCCompute<platform::CPUDeviceContext, T>(
+        blas, in_dims[0], w_dims[1], w_dims[0], input_data, w_data, output_data,
+        bias ? bias->data<T>() : NULL);
   }
 };
paddle/fluid/operators/fusion_lstm_op.cc
@@ -16,9 +16,9 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/operators/math/fc_compute.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
-DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
 namespace operators {
@@ -205,23 +205,6 @@ inline void ReorderInitState(const DeviceContext& ctx,
   row_shuffle(ctx, src, index_lod, dst, indexed_src);
 }
 
-// TODO(TJ): can move to math::details
-template <typename DeviceContext, typename T>
-inline void SimpleFC(const math::BlasT<DeviceContext, T>& blas, const int M,
-                     const int N, const int K, const T* A, const T* B, T* C,
-                     const T* bias_data = NULL) {
-  blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1), A, B,
-            static_cast<T>(0), C);
-  if (bias_data) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
-#endif
-    for (int i = 0; i < M; i++) {
-      blas.AXPY(N, static_cast<T>(1), bias_data, C + i * N);
-    }
-  }
-}
-
 template <typename DeviceContext, typename T>
 class FuisonLSTMKernel : public framework::OpKernel<T> {
  public:
@@ -253,12 +236,13 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     if (x_dims[1] > wx_dims[1]) {
-      SimpleFC<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1], x_data,
-                                 wx_data, xx_data, bias->data<T>());
+      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
+                                        x_data, wx_data, xx_data,
+                                        bias->data<T>());
       to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
     } else {
       to_batch(dev_ctx, *x, xx, true, is_reverse);
-      SimpleFC<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
-                                 xx_data, wx_data, batched_gate_data,
-                                 bias->data<T>());
+      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
+                                        xx_data, wx_data, batched_gate_data,
+                                        bias->data<T>());
     }
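Note on the branch above: to_batch (sequence2batch) only reorders rows, and a row-wise FC commutes with any row permutation, so the kernel is free to run FCCompute before or after batching; the branch appears to pick whichever order reorders the narrower matrix. A minimal standalone sketch of that property (illustrative code only, not part of the Paddle tree):

// Row-wise FC commutes with row permutation: permuting input rows and then
// applying FC yields the same rows as applying FC first and permuting after.
#include <cassert>
#include <vector>

// y[j] = sum_k x[k] * W[k][j] for a single row x (bias omitted for brevity).
std::vector<float> FCRow(const std::vector<float>& x,
                         const std::vector<std::vector<float>>& W) {
  std::vector<float> y(W[0].size(), 0.f);
  for (size_t k = 0; k < x.size(); ++k)
    for (size_t j = 0; j < y.size(); ++j) y[j] += x[k] * W[k][j];
  return y;
}

int main() {
  std::vector<std::vector<float>> X = {{1, 2}, {3, 4}};        // two rows, K = 2
  std::vector<std::vector<float>> W = {{1, 0, 2}, {0, 1, 2}};  // K = 2, N = 3
  // FC first, then swap the two rows ...
  std::vector<float> a0 = FCRow(X[0], W), a1 = FCRow(X[1], W);
  // ... equals swapping the rows first, then FC.
  std::vector<float> b0 = FCRow(X[1], W), b1 = FCRow(X[0], W);
  assert(a0 == b1 && a1 == b0);
  return 0;
}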
paddle/fluid/operators/math/fc_compute.h (new file)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/fluid/operators/math/blas.h"

DECLARE_int32(paddle_num_threads);

namespace paddle {
namespace operators {
namespace math {

template <typename DeviceContext, typename T>
inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
                      const int N, const int K, const T* X, const T* W, T* Y,
                      const T* B = NULL) {
  blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1), X, W,
            static_cast<T>(0), Y);
  if (B) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#endif
    for (int i = 0; i < M; i++) {
      blas.AXPY(N, static_cast<T>(1), B, Y + i * N);
    }
  }
}

}  // namespace math
}  // namespace operators
}  // namespace paddle
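For readers outside the Paddle tree, a minimal self-contained sketch of what FCCompute computes: Y = X * W via GEMM, then the bias row B added to each of the M output rows via AXPY. Plain loops stand in for the BLAS calls; all names below are illustrative, not Paddle API:

#include <cstdio>
#include <vector>

// Naive stand-in for FCCompute: Y = X * W (+ row-broadcast bias B).
// X is M x K, W is K x N, Y is M x N, B (optional) holds N entries.
template <typename T>
void NaiveFC(int M, int N, int K, const T* X, const T* W, T* Y,
             const T* B = nullptr) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      T acc = 0;  // blas.GEMM accumulates this dot product for every (i, j)
      for (int k = 0; k < K; ++k) acc += X[i * K + k] * W[k * N + j];
      // The AXPY loop in fc_compute.h adds B to each output row.
      Y[i * N + j] = acc + (B ? B[j] : static_cast<T>(0));
    }
  }
}

int main() {
  // 2x3 input, 3x2 weights, bias of width 2.
  std::vector<float> X = {1, 2, 3, 4, 5, 6};
  std::vector<float> W = {1, 0, 0, 1, 1, 1};
  std::vector<float> B = {0.5f, -0.5f};
  std::vector<float> Y(4);
  NaiveFC(2, 2, 3, X.data(), W.data(), Y.data(), B.data());
  for (float y : Y) std::printf("%g ", y);  // prints: 4.5 4.5 10.5 10.5
  std::printf("\n");
  return 0;
}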
python/paddle/fluid/tests/unittests/test_fc_op.py
@@ -64,27 +64,47 @@ class TestFCOp(OpTest):
         self.check_output()
 
-class TestFCOpBiasBoth(TestFCOp):
+class TestFCOpNoBias(TestFCOp):
     def init_shapes(self, mb, ic, oc, h, w):
-        for with_bias in {True, False}:
-            self.with_bias = with_bias
+        self.with_bias = False
         self.matrix = MatrixGenerate(mb, ic, oc, h, w)
 
-class TestFCOp1(TestFCOpBiasBoth):
+class TestFCOpWithBias(TestFCOp):
+    def init_shapes(self, mb, ic, oc, h, w):
+        self.with_bias = True
+        self.matrix = MatrixGenerate(mb, ic, oc, h, w)
+
+class TestFCOp1(TestFCOpNoBias):
     def init_op_type(self):
         self.init_shapes(2, 8, 10, 1, 1)
 
-class TestFCOp2(TestFCOpBiasBoth):
+class TestFCOp2(TestFCOpNoBias):
     def init_op_type(self):
         self.init_shapes(4, 5, 6, 2, 2)
 
-class TestFCOp4(TestFCOpBiasBoth):
+class TestFCOp4(TestFCOpNoBias):
     def init_op_type(self):
         self.init_shapes(1, 32, 64, 3, 3)
 
+class TestFCOpWithBias1(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(3, 8, 10, 2, 1)
+
+class TestFCOpWithBias2(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(4, 5, 6, 2, 2)
+
+class TestFCOpWithBias3(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(1, 64, 32, 3, 3)
+
 if __name__ == "__main__":
     unittest.main()