Unverified · Commit 37d49b38 · authored by Tao Luo · committed by GitHub

Merge pull request #14409 from luotao1/dam_fc

Enhance fc_op for 3-D shape tensor
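For context: the heart of this PR is that fc's InferShape now flattens an input of rank > 2 into a matrix according to a new `in_num_col_dims` attribute (mirroring mul's `x_num_col_dims`), rather than assuming a 2-D input or a 4-D NCHW one. Below is a minimal standalone sketch of that shape rule; the function and variable names are illustrative only, not Paddle's implementation.

// Sketch of the fc shape rule added in this PR: with in_num_col_dims = k,
// an input of rank R > k is viewed as the matrix
//   [d0*...*d(k-1), dk*...*d(R-1)],
// multiplied by W with shape [I, O], and the output keeps the first k input
// dims plus O.
#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

std::vector<int64_t> InferFcOutDims(const std::vector<int64_t>& in_dims,
                                    const std::vector<int64_t>& w_dims,
                                    int in_num_col_dims) {
  assert(static_cast<int>(in_dims.size()) > in_num_col_dims);
  assert(w_dims.size() == 2);  // W is [I, O]
  // Columns of the flattened input must equal W's input size I.
  const int64_t flat_cols =
      std::accumulate(in_dims.begin() + in_num_col_dims, in_dims.end(),
                      int64_t{1}, std::multiplies<int64_t>());
  assert(flat_cols == w_dims[0]);
  // Output shape: the first in_num_col_dims input dims, then O.
  std::vector<int64_t> out(in_dims.begin(), in_dims.begin() + in_num_col_dims);
  out.push_back(w_dims[1]);
  return out;
}

int main() {
  // A 3-D input such as DAM's [batch=2, turns=3, hidden=4] with W = [4, 5]
  // and in_num_col_dims = 2 yields a 3-D output [2, 3, 5].
  assert((InferFcOutDims({2, 3, 4}, {4, 5}, 2) ==
          std::vector<int64_t>{2, 3, 5}));
  return 0;
}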
@@ -57,6 +57,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
   desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
   desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
   desc.SetOutput("Out", std::vector<std::string>({fc_out_out}));
+  desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims"));
   desc.SetType("fc");
   auto fc_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
   GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});
......
@@ -29,6 +29,7 @@ void SetOp(ProgramDesc* prog, const std::string& type,
   if (type == "mul") {
     op->SetInput("X", {inputs[0]});
     op->SetInput("Y", {inputs[1]});
+    op->SetAttr("x_num_col_dims", {1});
   } else if (type == "elementwise_add") {
     op->SetInput("X", inputs);
   }
......
@@ -412,7 +412,7 @@ void DetachDeletedNodes(framework::ir::Graph *graph) {
 void SubGraphFuser::ReplaceNodesWithSubGraphs() {
   auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)();
   for (auto &subgraph : subgraphs) {
-    if (subgraph.size() <= min_subgraph_size_) continue;
+    if (subgraph.size() <= (size_t)min_subgraph_size_) continue;
     LOG(INFO) << "detect a subgraph size " << subgraph.size();
     std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
     // replace this sub-graph with the first node. Two steps: 1. Create a Block
......
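Most of the hunks outside fc_op.cc are signedness cleanups like the `(size_t)min_subgraph_size_` cast above, fixing `-Wsign-compare` warnings between `int` values and `size_t` container sizes. A minimal illustration (not Paddle code):

#include <cstddef>
#include <vector>

// Comparing int against size_t implicitly converts the int to unsigned,
// which triggers -Wsign-compare (a negative value would wrap to a huge
// unsigned one). The explicit cast keeps the same semantics but records
// that the value is known to be non-negative, silencing the warning.
bool BigEnough(const std::vector<int>& subgraph, int min_size) {
  return subgraph.size() > static_cast<size_t>(min_size);
}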
@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   // it is either an OP's input or an OP's output.
   auto &subgraph_nodes = *Agent(node).subgraph();
-  for (int index = 0; index < block_desc.OpSize(); index++) {
+  for (size_t index = 0; index < block_desc.OpSize(); index++) {
     framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
     auto correspond_node = subgraph_nodes[index];
     PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
......
@@ -45,11 +45,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
 # DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
 download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_dam SRCS analyzer_dam_tester.cc
-  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS
-  --infer_model=${DAM_INSTALL_DIR}/model
-  --infer_data=${DAM_INSTALL_DIR}/data.txt
-  --use_analysis=0)
+inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc)

 # chinese_ner
 set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
......
@@ -69,7 +69,7 @@ struct DataRecord {
       num_lines++;
       std::vector<std::string> data;
       split(line, ',', &data);
-      CHECK_EQ(data.size(), 2 * MAX_TURN_NUM + 3);
+      CHECK_EQ(data.size(), (size_t)(2 * MAX_TURN_NUM + 3));
       // load turn data
       std::vector<int64_t> turns_tmp[MAX_TURN_NUM];
       for (int i = 0; i < MAX_TURN_NUM; ++i) {
@@ -197,7 +197,6 @@ TEST(Analyzer_dam, fuse_statis) {
   contrib::AnalysisConfig cfg;
   SetConfig(&cfg);

-  if (FLAGS_use_analysis) {
   int num_ops;
   auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
   auto fuse_statis = GetFuseStatis(
@@ -205,7 +204,6 @@ TEST(Analyzer_dam, fuse_statis) {
   ASSERT_TRUE(fuse_statis.count("fc_fuse"));
   EXPECT_EQ(fuse_statis.at("fc_fuse"), 317);
   EXPECT_EQ(num_ops, 2020);
-  }
 }

 // Compare result of NativeConfig and AnalysisConfig
@@ -216,11 +214,8 @@ TEST(Analyzer_dam, compare) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);

-  if (FLAGS_use_analysis) {
-    CompareNativeAndAnalysis(
-        reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-        input_slots_all);
-  }
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 }  // namespace inference
......
@@ -59,9 +59,6 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->specify_input_name = true;
   // TODO(TJ): fix fusion gru
   cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
-#ifdef PADDLE_WITH_MKLDNN
-  cfg->EnableMKLDNN();
-#endif
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
@@ -27,11 +27,9 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
                  "Out(Output) of Fully Connected should not be null.");
   PADDLE_ENFORCE(ctx->HasInput("W"),
                  "W(Input) of Fully Connected should not be null.");
-  // NCHW
   auto in_dims = ctx->GetInputDim("Input");
-  // IO, I=C*H*W
   auto w_dims = ctx->GetInputDim("W");
-  std::vector<int64_t> output_shape({in_dims[0], w_dims[1]});

   if (ctx->HasInput("Bias")) {
     auto bias_dims = ctx->GetInputDim("Bias");
@@ -44,14 +42,32 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
                         "The shape of Bias must be [1, dim].");
     }
   }

-  PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
-                 "Fully Connected input should be 2-D or 4-D tensor.");
+  if (ctx->Attrs().Get<bool>("use_mkldnn")) {
+    PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
+                   "Fully Connected input should be 2-D or 4-D tensor.");
+  }
+
   PADDLE_ENFORCE_EQ(w_dims.size(), 2UL,
                     "Fully Connected input should be 2-D tensor.");
-  PADDLE_ENFORCE_EQ(framework::product(in_dims) / in_dims[0], w_dims[0],
-                    "Fully Connected input and weigth size do not match.");
+
+  int in_num_col_dims = ctx->Attrs().Get<int>("in_num_col_dims");
+  PADDLE_ENFORCE_GT(
+      in_dims.size(), in_num_col_dims,
+      "The input tensor Input's rank of FCOp should be larger than "
+      "in_num_col_dims.");
+
+  auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
+  PADDLE_ENFORCE_EQ(
+      in_mat_dims[1], w_dims[0],
+      "Fully Connected input and weigth size do not match. %s, %s");
+
+  std::vector<int64_t> output_dims;
+  output_dims.reserve(static_cast<size_t>(in_num_col_dims + 1));
+  for (int i = 0; i < in_num_col_dims; ++i) {
+    output_dims.push_back(in_dims[i]);
+  }
+  output_dims.push_back(w_dims[1]);

-  ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
   ctx->ShareLoD("Input", "Out");
 }
@@ -101,12 +117,15 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType(
 }

 void FCOpMaker::Make() {
-  AddInput("Input",
-           "(Tensor), The input tensor of fully connected operator with format "
-           "(NCHW). ");
+  AddInput("Input", "(Tensor), The input tensor of fully connected operator.");
   AddInput("W", "(Tensor), The weight fc op with shape (I, O).");
   AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O")
       .AsDispensable();
+  AddAttr<int>("in_num_col_dims",
+               "(int, default 1), The fc op can take tensors with more than "
+               "two dimensions as its inputs.")
+      .SetDefault(1)
+      .EqualGreaterThan(1);
   AddOutput("Out", "(Tensor) The output tensor of fully connected operator. ");
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
@@ -131,13 +150,15 @@ class FCOpKernel : public framework::OpKernel<T> {
     auto output = ctx.Output<Tensor>("Out");
     auto in_dims = input->dims();
     auto w_dims = w->dims();
+    auto out_dims = output->dims();
+    int M = framework::product(out_dims) / out_dims[out_dims.size() - 1];

     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
     math::FCCompute<platform::CPUDeviceContext, T>(
-        blas, in_dims[0], w_dims[1], w_dims[0], input_data, w_data, output_data,
+        blas, M, w_dims[1], w_dims[0], input_data, w_data, output_data,
         bias ? bias->data<T>() : NULL);

     // TODO(TJ): fuse act
......
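The kernel change above follows from the new InferShape: out_dims is {in_dims[0..k-1], O}, so dividing product(out_dims) by its last dimension recovers M, the row count of the flattened input matrix handed to FCCompute. A small self-contained check with assumed example shapes:

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  // Assumed shapes: in_dims = {2, 3, 4}, W = {4, 5}, in_num_col_dims = 2,
  // so InferShape produces out_dims = {2, 3, 5}.
  const std::vector<int64_t> out_dims = {2, 3, 5};
  const int64_t M = std::accumulate(out_dims.begin(), out_dims.end(),
                                    int64_t{1}, std::multiplies<int64_t>()) /
                    out_dims.back();
  // M equals the product of the leading in_num_col_dims input dims (2*3),
  // i.e. the rows of the flattened [M, I] = [6, 4] input matrix.
  assert(M == 6);
  return 0;
}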
@@ -38,7 +38,7 @@ class HashOp : public framework::OperatorWithKernel {
     std::vector<int64_t> out_dims;
     out_dims.reserve(dims.size() + 1);
     // copy all dims except the last one
-    for (size_t i = 0u; i != dims.size() - 1; ++i) {
+    for (int i = 0u; i != dims.size() - 1; ++i) {
       out_dims.emplace_back(dims[i]);
     }
     int num_hash = ctx->Attrs().Get<int>("num_hash");
......
@@ -244,7 +244,7 @@ typename std::enable_if<
     std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
 elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
                    size_t data_len, const T* in, T* out) {
-  for (int64_t i = 0; i < data_len; i++) {
+  for (size_t i = 0; i < data_len; i++) {
     out[i] += in[i];
   }
 }
......
@@ -70,11 +70,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
   EXPECT_EQ(in_grad.lod(), lod);

   if (paddle::platform::is_cpu_place(*place)) {
-    for (int64_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) {
+    for (size_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) {
       int64_t begin = in_grad.lod()[0][i];
       int64_t end = in_grad.lod()[0][i + 1];
       paddle::framework::Tensor tmp = in_grad.Slice(begin, end);
-      for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) {
+      for (size_t j = 0; j != tmp.numel() / second_dim; ++j) {
         for (int64_t m = 0; m != second_dim; ++m) {
           EXPECT_EQ(tmp.data<T>()[m + j * second_dim],
                     out_grad.data<T>()[m + i * second_dim]);
@@ -82,11 +82,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
       }
     }
   } else {
-    for (int64_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) {
+    for (size_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) {
       int64_t begin = cpu_in_grad.lod()[0][i];
       int64_t end = cpu_in_grad.lod()[0][i + 1];
       paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end);
-      for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) {
+      for (size_t j = 0; j != tmp.numel() / second_dim; ++j) {
         for (int64_t m = 0; m != second_dim; ++m) {
           EXPECT_EQ(tmp.data<T>()[m + j * second_dim],
                     cpu_out_grad.data<T>()[m + i * second_dim]);
......
@@ -43,11 +43,11 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(ids.size(), outs.size(),
                       "the number of Ids and Out should be the same");

-    int row_ids_size = 0;
+    size_t row_ids_size = 0;
     int row_size = 0;
     int embedding_size = 0;

-    for (int i = 0; i < x_tensors.size(); ++i) {
+    for (size_t i = 0; i < x_tensors.size(); ++i) {
       const auto *x_tensor = x_tensors[i];
       const auto *row_id = row_ids[i];
@@ -66,7 +66,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
     std::unordered_map<int64_t, std::tuple<int64_t, int64_t>>
         selected_rows_idx_map;

-    for (int i = 0; i < x_tensors.size(); ++i) {
+    for (size_t i = 0; i < x_tensors.size(); ++i) {
       const auto *row_id = row_ids[i];

       for (int j = 0; j < row_id->numel(); ++j) {
@@ -78,7 +78,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(row_ids_size, selected_rows_idx_map.size(),
                       "the rows and tensor map size should be the same");

-    for (int i = 0; i < outs.size(); ++i) {
+    for (size_t i = 0; i < outs.size(); ++i) {
       auto *out_ids = ids[i];
       auto *out = outs[i];
......
@@ -38,7 +38,7 @@ class RefByTrainerIdKernel : public framework::OpKernel<T> {
     } else {
       trainer_id = *trainer_id_data;
     }
-    PADDLE_ENFORCE_LT(trainer_id, in_list.size());
+    PADDLE_ENFORCE_LT((size_t)trainer_id, in_list.size());
     out->mutable_data<T>(context.GetPlace());
     out->ShareDataWith(*(in_list[trainer_id]));
   }
......
@@ -64,7 +64,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
       out_ids.resize(outs.size());

       // split id by their shard_num.
-      for (int i = 0; i < all_ids.size(); ++i) {
+      for (size_t i = 0; i < all_ids.size(); ++i) {
         T id = all_ids[i];
         size_t shard_id = static_cast<size_t>(id) % shard_num;
         out_ids[shard_id].push_back(id);
......