提交 8f913295 编写于 作者: T tensor-tang

fuse fc in lstm

上级 ddb05dff
...@@ -14,29 +14,37 @@ limitations under the License. */ ...@@ -14,29 +14,37 @@ limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h" #include "paddle/fluid/operators/fusion_lstm_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
DECLARE_int32(paddle_num_threads);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("Input"), PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null.");
"Input(Input) of LSTM should not be null."); PADDLE_ENFORCE(ctx->HasInput("WeightX"),
PADDLE_ENFORCE(ctx->HasInput("Weight"), "Input(WeightX) of LSTM should not be null.");
"Input(Weight) of LSTM should not be null."); PADDLE_ENFORCE(ctx->HasInput("WeightH"),
"Input(WeightH) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Bias"), PADDLE_ENFORCE(ctx->HasInput("Bias"),
"Input(Bias) of LSTM should not be null."); "Input(Bias) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("XX"),
"Output(XX) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"), PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of LSTM should not be null."); "Output(Hidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Cell"), PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Output(Cell) of LSTM should not be null."); "Output(Cell) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchGate"), PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"),
"Output(BatchGate) of LSTM should not be null."); "Output(BatchedGate) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
"Output(BatchGate) of LSTM should not be null."); "Output(BatchedGate) of LSTM should not be null.");
auto in_dims = ctx->GetInputDim("Input"); auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(in_dims.size(), 2, "Input(X)'s rank must be 2."); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
if (ctx->HasInput("H0")) { if (ctx->HasInput("H0")) {
PADDLE_ENFORCE(ctx->HasInput("C0"), PADDLE_ENFORCE(ctx->HasInput("C0"),
...@@ -49,15 +57,24 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -49,15 +57,24 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
"should be the same."); "should be the same.");
} }
int frame_size = in_dims[1] / 4; auto wx_dims = ctx->GetInputDim("WeightX");
auto w_dims = ctx->GetInputDim("Weight"); PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
PADDLE_ENFORCE_EQ(w_dims.size(), 2, "The rank of Input(Weight) should be 2."); "The rank of Input(WeightX) should be 2.");
PADDLE_ENFORCE_EQ(w_dims[0], frame_size, PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1],
"The first dimension of Input(Weight) " "The first dimension of Input(WeightX) "
"should be %d.",
x_dims[1]);
int frame_size = wx_dims[1] / 4;
auto wh_dims = ctx->GetInputDim("WeightH");
PADDLE_ENFORCE_EQ(wh_dims.size(), 2,
"The rank of Input(WeightH) should be 2.");
PADDLE_ENFORCE_EQ(wh_dims[0], frame_size,
"The first dimension of Input(WeightH) "
"should be %d.", "should be %d.",
frame_size); frame_size);
PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size, PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size,
"The second dimension of Input(Weight) " "The second dimension of Input(WeightH) "
"should be 4 * %d.", "should be 4 * %d.",
frame_size); frame_size);
...@@ -66,36 +83,35 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -66,36 +83,35 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(b_dims[0], 1, PADDLE_ENFORCE_EQ(b_dims[0], 1,
"The first dimension of Input(Bias) should be 1."); "The first dimension of Input(Bias) should be 1.");
if (ctx->Attrs().Get<bool>("use_peepholes")) { PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_peepholes"),
PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, "Do not support peephole yet.");
"The second dimension of Input(Bias) should be "
"7 * %d if enable peepholes connection",
frame_size);
} else {
PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
"The second dimension of Input(Bias) should be " "The second dimension of Input(Bias) should be "
"4 * %d if disable peepholes connection", "4 * %d if disable peepholes connection",
frame_size); frame_size);
}
framework::DDim out_dims({in_dims[0], frame_size}); framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Hidden", out_dims);
ctx->SetOutputDim("Cell", out_dims); ctx->SetOutputDim("Cell", out_dims);
ctx->SetOutputDim("BatchGate", in_dims); ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchCellPreAct", out_dims); ctx->SetOutputDim("BatchCellPreAct", out_dims);
ctx->ShareLoD("Input", "Hidden"); ctx->ShareLoD("X", "Hidden");
ctx->ShareLoD("Input", "Cell"); ctx->ShareLoD("X", "Cell");
int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->ShareLoD("X", "XX");
} }
framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( framework::OpKernelType FusionLSTMOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()), framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context()); ctx.device_context());
} }
void FusionLSTMOpMaker::Make() { void FusionLSTMOpMaker::Make() {
AddInput("Input", AddInput("X",
"(LoDTensor) the first input is a LodTensor, which support " "(LoDTensor) the first input is a LodTensor, which support "
"variable-time length input sequence. The underlying tensor in " "variable-time length input sequence. The underlying tensor in "
"this LoDTensor is a matrix with shape (T X 4D), where T is the " "this LoDTensor is a matrix with shape (T X 4D), where T is the "
...@@ -130,7 +146,12 @@ void FusionLSTMOpMaker::Make() { ...@@ -130,7 +146,12 @@ void FusionLSTMOpMaker::Make() {
AddOutput("Cell", AddOutput("Cell",
"(LoDTensor) the cell state of LSTM operator. " "(LoDTensor) the cell state of LSTM operator. "
"The shape is (T x D), and lod is the same with the `Input`."); "The shape is (T x D), and lod is the same with the `Input`.");
AddOutput("BatchGate", AddOutput("XX",
"(LoDTensor) the first input is a LodTensor, which support "
"variable-time length input sequence. The underlying tensor in "
"this LoDTensor is a matrix with shape (T X 4D), where T is the "
"total time steps in this mini-batch, D is the hidden size.");
AddOutput("BatchedGate",
"(LoDTensor) This LoDTensor contains input gate, forget gate " "(LoDTensor) This LoDTensor contains input gate, forget gate "
"and output gate after the nonlinear computation. This " "and output gate after the nonlinear computation. This "
"LoDTensor has the same shape as the reorganized input, which " "LoDTensor has the same shape as the reorganized input, which "
...@@ -219,80 +240,102 @@ inline void ReorderInitState(const DeviceContext& ctx, ...@@ -219,80 +240,102 @@ inline void ReorderInitState(const DeviceContext& ctx,
framework::Tensor* dst, bool indexed_src) { framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle; math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace()); dst->mutable_data<T>(src.dims(), ctx.GetPlace());
// TODO(TJ): check mem copy perf
row_shuffle(ctx, src, index_lod, dst, indexed_src); row_shuffle(ctx, src, index_lod, dst, indexed_src);
} }
// TODO(TJ): can move to math::details
template <typename DeviceContext, typename T>
inline void SimpleFC(const math::BlasT<DeviceContext, T>& blas, const int M,
const int N, const int K, const T* A, const T* B, T* C,
const T* bias_data = NULL) {
blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1), A, B,
static_cast<T>(0), C);
if (bias_data) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#endif
for (int i = 0; i < M; i++) {
blas.AXPY(N, static_cast<T>(1), bias_data, C + i * N);
}
}
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class LSTMKernel : public framework::OpKernel<T> { class FuisonLSTMKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<LoDTensor>("Input"); auto* x = ctx.Input<LoDTensor>("X");
auto* weight = ctx.Input<Tensor>("Weight"); auto* wx = ctx.Input<Tensor>("WeightX"); // x*4D
auto* wh = ctx.Input<Tensor>("WeightH"); // D*4D
auto* bias = ctx.Input<Tensor>("Bias"); auto* bias = ctx.Input<Tensor>("Bias");
auto* hidden_t0 = ctx.Input<Tensor>("H0"); auto* hidden_t0 = ctx.Input<Tensor>("H0");
auto* cell_t0 = ctx.Input<Tensor>("C0"); auto* cell_t0 = ctx.Input<Tensor>("C0");
auto* batch_gate = ctx.Output<LoDTensor>("BatchGate"); // the result after x*Wx (size: sum_words*4D) or batched_x (size:
batch_gate->mutable_data<T>(ctx.GetPlace()); // sum_words*x)
auto* xx = ctx.Output<LoDTensor>("XX");
auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
hidden_out->mutable_data<T>(ctx.GetPlace());
auto* cell_out = ctx.Output<LoDTensor>("Cell"); auto* cell_out = ctx.Output<LoDTensor>("Cell");
bool is_reverse = ctx.Attr<bool>("is_reverse");
T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace());
hidden_out->mutable_data<T>(ctx.GetPlace());
cell_out->mutable_data<T>(ctx.GetPlace()); cell_out->mutable_data<T>(ctx.GetPlace());
bool is_reverse = ctx.Attr<bool>("is_reverse"); const T* x_data = x->data<T>();
const T* wx_data = wx->data<T>();
auto x_dims = x->dims();
auto wx_dims = wx->dims();
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch; math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto& device_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
to_batch(device_ctx, *input, batch_gate, true, is_reverse); auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
// TODO(TJ): op test these two cases
auto in_dims = input->dims(); if (x_dims[1] > wx_dims[1]) {
int frame_size = static_cast<int>(in_dims[1] / 4); SimpleFC<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1], x_data,
framework::DDim dims({in_dims[0], frame_size}); wx_data, xx_data, bias->data<T>());
to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
if (bias) { } else {
Tensor b = *bias; to_batch(dev_ctx, *x, xx, true, is_reverse);
b.Resize({bias->numel(), 1}); SimpleFC<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
Tensor gate_bias = b.Slice(0, 4 * frame_size); xx_data, wx_data, batched_gate_data,
math::RowwiseAdd<DeviceContext, T> add_bias; bias->data<T>());
add_bias(device_ctx, *batch_gate, gate_bias, batch_gate);
} }
int frame_size = static_cast<int>(wx_dims[1] / 4);
framework::DDim out_dims({x_dims[0], frame_size});
math::LstmMetaValue<T> lstm_value; math::LstmMetaValue<T> lstm_value;
if (bias && ctx.Attr<bool>("use_peepholes")) { // no peephole
T* bias_data = const_cast<T*>(bias->data<T>());
// the code style in LstmMetaValue will be updated later.
lstm_value.check_ig = bias_data + 4 * frame_size;
lstm_value.check_fg = lstm_value.check_ig + frame_size;
lstm_value.check_og = lstm_value.check_fg + frame_size;
} else {
lstm_value.check_ig = nullptr; lstm_value.check_ig = nullptr;
lstm_value.check_fg = nullptr; lstm_value.check_fg = nullptr;
lstm_value.check_og = nullptr; lstm_value.check_og = nullptr;
}
lstm_value.prev_state_value = nullptr; lstm_value.prev_state_value = nullptr;
Tensor ordered_c0; Tensor ordered_c0;
framework::Vector<size_t> order(batch_gate->lod()[2]); framework::Vector<size_t> order(batched_gate->lod()[2]);
if (cell_t0) { if (cell_t0) {
// Since the batch computing for LSTM reorders the input sequence // Since the batch computing for LSTM reorders the input sequence
// according to their length. The initialized cell state also needs // according to their length. The initialized cell state also needs
// to reorder. // to reorder.
ReorderInitState<DeviceContext, T>(device_ctx, *cell_t0, order, ReorderInitState<DeviceContext, T>(dev_ctx, *cell_t0, order, &ordered_c0,
&ordered_c0, true); true);
lstm_value.prev_state_value = ordered_c0.data<T>(); lstm_value.prev_state_value = ordered_c0.data<T>();
} }
// Use the local variable as here. // Use the local variable as here.
LoDTensor batch_hidden, batch_cell; LoDTensor batch_hidden, batch_cell;
auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct"); auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
batch_hidden.mutable_data<T>(dims, ctx.GetPlace()); batch_hidden.mutable_data<T>(out_dims, ctx.GetPlace());
batch_cell.mutable_data<T>(dims, ctx.GetPlace()); batch_cell.mutable_data<T>(out_dims, ctx.GetPlace());
batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace()); batch_cell_pre_act->mutable_data<T>(out_dims, ctx.GetPlace());
auto batch_starts = batch_gate->lod()[0]; auto batch_starts = batched_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1; size_t max_seq_len = batch_starts.size() - 1;
auto gate_act = math::detail::GetActivationType( auto gate_act = math::detail::GetActivationType(
ctx.Attr<std::string>("gate_activation")); ctx.Attr<std::string>("gate_activation"));
auto cell_act = math::detail::GetActivationType( auto cell_act = math::detail::GetActivationType(
...@@ -300,12 +343,11 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -300,12 +343,11 @@ class LSTMKernel : public framework::OpKernel<T> {
auto cand_act = math::detail::GetActivationType( auto cand_act = math::detail::GetActivationType(
ctx.Attr<std::string>("candidate_activation")); ctx.Attr<std::string>("candidate_activation"));
auto blas = math::GetBlas<DeviceContext, T>(device_ctx); for (size_t n = 0; n < max_seq_len; n++) {
for (size_t n = 0; n < num_batch; n++) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
Tensor gate_t = batch_gate->Slice(bstart, bend); Tensor gate_t = batched_gate->Slice(bstart, bend);
Tensor out_t = batch_hidden.Slice(bstart, bend); Tensor out_t = batch_hidden.Slice(bstart, bend);
Tensor cell_t = batch_cell.Slice(bstart, bend); Tensor cell_t = batch_cell.Slice(bstart, bend);
Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
...@@ -316,9 +358,11 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -316,9 +358,11 @@ class LSTMKernel : public framework::OpKernel<T> {
int pre_h_start = static_cast<int>(batch_starts[n - 1]); int pre_h_start = static_cast<int>(batch_starts[n - 1]);
int pre_h_end = pre_h_start + cur_batch_size; int pre_h_end = pre_h_start + cur_batch_size;
auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
blas.MatMul(pre_hidden_t, false, *weight, false, static_cast<T>(1.0), // TODO(TJ): use gemm directly
blas.MatMul(pre_hidden_t, false, *wh, false, static_cast<T>(1.0),
&gate_t, static_cast<T>(1.0)); &gate_t, static_cast<T>(1.0));
} else if (hidden_t0) { } else if (hidden_t0) {
// TODO(TJ): move h0 outside for
// If n == 0 and there is no initialized hidden state, that is to say // If n == 0 and there is no initialized hidden state, that is to say
// the H0 is zeros, the calculation W_h * H0 will be skiped. // the H0 is zeros, the calculation W_h * H0 will be skiped.
// If n == 0 and there is initialized hidden state, calculate W_h * H0. // If n == 0 and there is initialized hidden state, calculate W_h * H0.
...@@ -327,10 +371,11 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -327,10 +371,11 @@ class LSTMKernel : public framework::OpKernel<T> {
// according to their length. The initialized hidden state also needs // according to their length. The initialized hidden state also needs
// to reorder. // to reorder.
Tensor ordered_h0; Tensor ordered_h0;
ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order, ReorderInitState<DeviceContext, T>(dev_ctx, *hidden_t0, order,
&ordered_h0, true); &ordered_h0, true);
blas.MatMul(ordered_h0, false, *weight, false, static_cast<T>(1.0), // TODO(TJ): use gemm directly
&gate_t, static_cast<T>(1.0)); blas.MatMul(ordered_h0, false, *wh, false, static_cast<T>(1.0), &gate_t,
static_cast<T>(1.0));
} }
lstm_value.gate_value = gate_t.data<T>(); lstm_value.gate_value = gate_t.data<T>();
...@@ -338,19 +383,19 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -338,19 +383,19 @@ class LSTMKernel : public framework::OpKernel<T> {
lstm_value.state_value = cell_t.data<T>(); lstm_value.state_value = cell_t.data<T>();
lstm_value.state_active_value = cell_pre_act_t.data<T>(); lstm_value.state_active_value = cell_pre_act_t.data<T>();
math::LstmUnitFunctor<DeviceContext, T>::compute( math::LstmUnitFunctor<DeviceContext, T>::compute(
device_ctx, lstm_value, frame_size, cur_batch_size, gate_act, dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act,
cell_act, cand_act); cand_act);
lstm_value.prev_state_value = lstm_value.state_value; lstm_value.prev_state_value = lstm_value.state_value;
} }
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq; math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
batch_hidden.set_lod(batch_gate->lod()); batch_hidden.set_lod(batched_gate->lod());
// restore the output hidden in LoDTensor from the batch hidden // restore the output hidden in LoDTensor from the batch hidden
to_seq(device_ctx, batch_hidden, hidden_out); to_seq(dev_ctx, batch_hidden, hidden_out);
batch_cell.set_lod(batch_gate->lod()); batch_cell.set_lod(batched_gate->lod());
// restore the output cell state in LoDTensor from the batch cell // restore the output cell state in LoDTensor from the batch cell
to_seq(device_ctx, batch_cell, cell_out); to_seq(dev_ctx, batch_cell, cell_out);
} }
}; };
...@@ -358,9 +403,10 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -358,9 +403,10 @@ class LSTMKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker, REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
fusion_lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>, fusion_lstm,
ops::LSTMKernel<paddle::platform::CPUDeviceContext, double>); ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, float>,
ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -15,10 +15,6 @@ limitations under the License. */ ...@@ -15,10 +15,6 @@ limitations under the License. */
#pragma once #pragma once
// #include <string> // #include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
// #include "paddle/fluid/operators/math/blas.h"
// #include "paddle/fluid/operators/math/detail/activation_functions.h"
// #include "paddle/fluid/operators/math/lstm_compute.h"
// #include "paddle/fluid/operators/math/sequence2batch.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册