Commit a58dd3e5 authored by minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into port_python3_syntax

@@ -8,6 +8,7 @@ set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header f
set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
-Wno-error=unused-variable -Wno-unused-variable
-Wno-error=format-extra-args -Wno-format-extra-args
-Wno-error=comment -Wno-comment
@@ -19,7 +20,7 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-reorder
-Wno-error=cpp)
- set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz")
# A helper function used in Anakin, currently, to use it, one need to recursively include
# nearly all the header files.
@@ -41,9 +42,9 @@ if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
- execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
- execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz")
endif()
if (WITH_ANAKIN)
......
@@ -263,9 +263,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='ar
paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
- paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
- paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
@@ -306,7 +304,9 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', '
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
......
@@ -7,6 +7,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
if(WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
else()
......
@@ -17,6 +17,8 @@
#include <string>
#include <unordered_map>
using float16 = paddle::platform::float16;
namespace paddle {
namespace framework {
@@ -53,7 +55,7 @@ static DataTypeMap* InitDataTypeMap() {
RegisterType<cc_type>(retv, proto_type, #cc_type)
// NOTE: Add your customize type here.
- RegType(platform::float16, proto::VarType::FP16);
RegType(float16, proto::VarType::FP16);
RegType(float, proto::VarType::FP32);
RegType(double, proto::VarType::FP64);
RegType(int, proto::VarType::INT32);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/data_type.h"
#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
TEST(DataType, float16) {
using paddle::framework::Tensor;
using paddle::platform::CPUPlace;
using paddle::platform::float16;
namespace f = paddle::framework;
f::proto::VarType::Type dtype = f::proto::VarType::FP16;
Tensor tensor;
CPUPlace cpu;
tensor.mutable_data(cpu, f::ToTypeIndex(dtype));
// test fp16 tensor
EXPECT_EQ(tensor.type(), std::type_index(typeid(float16)));
// test fp16 size
EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u);
// test debug info
std::string type = "float16";
EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace details {
class ExceptionHolder {
public:
void Catch(const platform::EnforceNotMet& exp) {
std::lock_guard<std::mutex> lock(mu_);
exception_.reset(new platform::EnforceNotMet(exp));
type_ = kEnforceNotMet;
}
void Catch(const platform::EOFException& exp) {
std::lock_guard<std::mutex> lock(mu_);
// EOFException will not cover up existing EnforceNotMet.
if (exception_.get() == nullptr) {
exception_.reset(new platform::EOFException(exp));
type_ = kEOF;
}
}
bool ExceptionCatched() const {
std::lock_guard<std::mutex> lock(mu_);
return exception_.get() != nullptr;
}
void Throw() {
std::lock_guard<std::mutex> lock(mu_);
switch (type_) {
case kNone:
break;
case kEnforceNotMet: {
auto e = *static_cast<platform::EnforceNotMet*>(exception_.get());
throw e;
break;
}
case kEOF: {
auto e = *static_cast<platform::EOFException*>(exception_.get());
throw e;
break;
}
default:
LOG(FATAL) << "Unknown exception.";
}
exception_.reset();
type_ = kNone;
}
void Clear() {
std::lock_guard<std::mutex> lock(mu_);
exception_.reset();
type_ = kNone;
}
private:
enum ExceptionType { kNone, kEnforceNotMet, kEOF };
ExceptionType type_{kNone};
std::unique_ptr<std::exception> exception_;
mutable std::mutex mu_;
};
} // namespace details
} // namespace framework
} // namespace paddle
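A usage sketch (not part of this commit): the pattern ExceptionHolder is designed for is that worker threads record the first error they hit and the dispatching thread rethrows it after the workers finish. The example below only assumes the Paddle headers shown above and uses a failing PADDLE_ENFORCE as the error source; the name `worker` is illustrative.

#include <thread>
#include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/platform/enforce.h"

int main() {
  paddle::framework::details::ExceptionHolder holder;

  // A worker thread fails; the holder captures the exception under its mutex.
  std::thread worker([&holder] {
    try {
      PADDLE_ENFORCE(false, "worker failed");
    } catch (paddle::platform::EnforceNotMet ex) {
      holder.Catch(ex);
    }
  });
  worker.join();

  // The calling thread observes the stored exception and rethrows it here.
  try {
    if (holder.ExceptionCatched()) holder.Throw();
  } catch (const paddle::platform::EnforceNotMet& e) {
    // the worker's error surfaces on this thread
  }
  return 0;
}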
@@ -41,7 +41,9 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
- const ir::Graph& Graph() const { return underlying_executor_->Graph(); }
const ir::Graph& Graph() const override {
return underlying_executor_->Graph();
}
FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
......
@@ -83,7 +83,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Clean run context
run_op_futures_.clear();
- exception_.reset();
exception_holder_.Clear();
// Step 3. Execution
while (!pending_vars.empty()) {
@@ -103,23 +103,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
if (timeout) {
- std::unique_lock<std::mutex> l(exception_mu_);
- if (exception_) {
- l.unlock();
if (exception_holder_.ExceptionCatched()) {
for (auto &run_op_future : run_op_futures_) {
run_op_future.wait();
}
- l.lock();
- std::exception *exp = exception_.get();
- if (dynamic_cast<platform::EOFException *>(exp)) {
- auto e = *static_cast<platform::EOFException *>(exp);
- throw e;
- } else if (dynamic_cast<platform::EnforceNotMet *>(exp)) {
- auto e = *static_cast<platform::EnforceNotMet *>(exp);
- throw e;
- } else {
- LOG(FATAL) << "Unknown exception.";
- }
exception_holder_.Throw();
} else {
continue;
}
@@ -229,14 +217,9 @@ void ThreadedSSAGraphExecutor::RunOp(
ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << "Signal posted";
} catch (platform::EOFException ex) {
- std::lock_guard<std::mutex> l(exception_mu_);
- // EOFException will not cover up existing EnforceNotMet.
- if (exception_.get() == nullptr) {
- exception_.reset(new platform::EOFException(ex));
- }
exception_holder_.Catch(ex);
} catch (platform::EnforceNotMet ex) {
- std::lock_guard<std::mutex> l(exception_mu_);
- exception_.reset(new platform::EnforceNotMet(ex));
exception_holder_.Catch(ex);
} catch (...) {
LOG(FATAL) << "Unknown exception catched";
}
......
@@ -24,6 +24,7 @@
#include <functional>
#include "ThreadPool.h" // ThreadPool in thrird party
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
@@ -42,7 +43,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph);
- const ir::Graph &Graph() const { return *graph_; }
const ir::Graph &Graph() const override { return *graph_; }
// Run a SSAGraph by a thread pool
// Use topological sort algorithm
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
@@ -59,8 +60,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
platform::DeviceContextPool fetch_ctxs_;
- std::mutex exception_mu_;
- std::unique_ptr<std::exception> exception_;
ExceptionHolder exception_holder_;
std::atomic<int> running_ops_;
void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
......
@@ -116,8 +116,8 @@ TEST(GraphHelperTest, Basic) {
for (size_t i = 0; i < sorted.size(); ++i) {
node_map[sorted[i]->Name()] = i;
}
- ASSERT_EQ(node_map.at("op1"), 0);
- ASSERT_EQ(node_map.at("op2"), 1);
ASSERT_EQ(node_map.at("op1"), 0UL);
ASSERT_EQ(node_map.at("op2"), 1UL);
ASSERT_TRUE(node_map.at("op3") < node_map.at("op5"));
}
} // namespace ir
......
@@ -97,15 +97,15 @@ TEST(GraphTest, Basic) {
std::vector<ir::Node *> nodes(g->Nodes().begin(), g->Nodes().end());
for (ir::Node *n : nodes) {
if (n->Name() == "sum") {
- ASSERT_EQ(n->inputs.size(), 3);
- ASSERT_EQ(n->outputs.size(), 1);
ASSERT_EQ(n->inputs.size(), 3UL);
ASSERT_EQ(n->outputs.size(), 1UL);
} else if (n->Name() == "test_a" || n->Name() == "test_b" ||
n->Name() == "test_c") {
- ASSERT_EQ(n->inputs.size(), 0);
- ASSERT_EQ(n->outputs.size(), 1);
ASSERT_EQ(n->inputs.size(), 0UL);
ASSERT_EQ(n->outputs.size(), 1UL);
} else if (n->Name() == "test_out") {
- ASSERT_EQ(n->inputs.size(), 1);
- ASSERT_EQ(n->outputs.size(), 0);
ASSERT_EQ(n->inputs.size(), 1UL);
ASSERT_EQ(n->outputs.size(), 0UL);
}
}
ASSERT_EQ(nodes.size(), 5);
......
@@ -29,6 +29,13 @@ TEST(OpKernelType, ToString) {
ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
"data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type["
"CUDNN]");
using CUDAPlace = paddle::platform::CUDAPlace;
OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW,
LibraryType::kCUDNN);
ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2),
"data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_"
"type[CUDNN]");
}
TEST(OpKernelType, Hash) {
......
@@ -40,6 +40,40 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
return OpProtoAndCheckerMaker::VariableBuilder{output};
}
void OpProtoAndCheckerMaker::Reuse(const std::string& name,
const std::string& reused_name) {
bool found = false;
proto::OpProto::Var* var;
for (auto& var : proto_->inputs()) {
if (var.name() == reused_name) {
found = true;
break;
}
}
PADDLE_ENFORCE(found == true,
"Input/Output name: %s reused_name: %s, one of them is not "
"exists or not matched.",
name, reused_name);
found = false;
for (int i = 0; i < proto_->outputs().size(); ++i) {
var = proto_->mutable_outputs()->Mutable(i);
if (var->name() == name) {
PADDLE_ENFORCE(!var->has_reuse(),
"Output(%s) has been set reused var of %s", name,
var->reuse());
found = true;
var->set_reuse(reused_name);
break;
}
}
PADDLE_ENFORCE(found == true,
"Input/Output name: %s reused_name: %s, one of them is not "
"exists or not matched.",
name, reused_name);
}
void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
std::unordered_set<std::string> names;
auto checker = [&](const std::string& name) {
......
@@ -78,6 +78,8 @@ class OpProtoAndCheckerMaker {
VariableBuilder AddOutput(const std::string &name,
const std::string &comment);
void Reuse(const std::string &name, const std::string &reused_name);
template <typename T>
TypedAttrChecker<T> &AddAttr(const std::string &name,
const std::string &comment,
......
@@ -49,6 +49,15 @@ TEST(ProtoMaker, DuplicatedInOut) {
}
class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "input of test op");
AddOutput("XOut", "output of test op").Reuse("X");
}
};
class TestInplaceProtoMaker2
: public paddle::framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "input of test op");
@@ -58,12 +67,100 @@ class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
};
TEST(ProtoMaker, InplaceOutput) {
- paddle::framework::proto::OpProto op_proto;
paddle::framework::proto::OpProto op_proto, op_proto2;
paddle::framework::OpAttrChecker op_checker;
TestInplaceProtoMaker proto_maker;
- ASSERT_THROW(proto_maker(&op_proto, &op_checker),
TestInplaceProtoMaker2 proto_maker2;
proto_maker(&op_proto, &op_checker);
ASSERT_THROW(proto_maker2(&op_proto2, &op_checker),
paddle::platform::EnforceNotMet);
// proto_maker(&op_proto, &op_checker);
// proto_maker.Make();
// ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
}
// normal reuse
class TestReuseProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "input of test op");
AddInput("Y", "input of test op");
AddOutput("Out", "output of test op");
AddOutput("XOut", "output of test op");
// avoid destructor exception.
// Validate();
TestReuse();
}
virtual void TestReuse() {}
};
// test duplicate reuse error
class TestReuseProtoMaker2 : public TestReuseProtoMaker {
public:
void TestReuse() {
Reuse("Out", "X");
Reuse("Out", "Y");
}
};
// NotExists Input
class TestReuseProtoMaker3 : public TestReuseProtoMaker {
public:
void TestReuse() {
Reuse("Out", "NotExists");
Reuse("XOut", "X");
}
};
// NotExists Output
class TestReuseProtoMaker4 : public TestReuseProtoMaker {
public:
void TestReuse() { Reuse("NotExists", "X"); }
};
TEST(ProtoMaker, Reuse) {
paddle::framework::proto::OpProto op_proto;
paddle::framework::OpAttrChecker op_checker;
TestReuseProtoMaker proto_maker;
proto_maker(&op_proto, &op_checker);
}
// NOTE(dzhwinter):
// There is a fatal CHECK in the base class destructor, which calls abort()
// instead of throwing an exception. If we throw an exception in Make(), we
// will trigger that CHECK and terminate the tests.
//
// I tried to replace the default CHECK with an exception; however, it is
// still not supported by glog. For details, see:
// https://github.com/google/glog/issues/249
// https://github.com/facebookresearch/TensorComprehensions/issues/351
/*
TEST(ProtoMaker, ReuseWithException) {
paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4;
paddle::framework::OpAttrChecker op_checker;
TestReuseProtoMaker2 proto_maker2;
TestReuseProtoMaker3 proto_maker3;
TestReuseProtoMaker4 proto_maker4;
EXPECT_THROW(proto_maker2(&op_proto2, &op_checker),
paddle::platform::EnforceNotMet);
EXPECT_THROW(proto_maker3(&op_proto3, &op_checker),
paddle::platform::EnforceNotMet);
EXPECT_THROW(proto_maker4(&op_proto4, &op_checker),
paddle::platform::EnforceNotMet);
}
void FailureFunction() {
throw std::runtime_error("Check failed in destructor.");
// return 0;
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
google::InstallFailureFunction(&FailureFunction);
return RUN_ALL_TESTS();
}
*/
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
...@@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name, ...@@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name,
} }
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>().dims(); const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) {
return DDim({-1});
}
return tensor.dims();
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
if (get_actual_dim) { if (get_actual_dim) {
return var->Get<SelectedRows>().value().dims(); return var->Get<SelectedRows>().value().dims();
...@@ -69,6 +74,26 @@ static DDim GetDims(const Scope& scope, const std::string& name, ...@@ -69,6 +74,26 @@ static DDim GetDims(const Scope& scope, const std::string& name,
} }
} }
static std::string GetDtype(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
return "";
}
if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) {
return "";
}
return DataTypeToString(ToDataType(tensor.type()));
} else if (var->IsType<SelectedRows>()) {
return DataTypeToString(
ToDataType(var->Get<SelectedRows>().value().type()));
} else {
return "";
}
}
static int GetRowSize(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
@@ -91,7 +116,11 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
if (var->IsType<LoDTensor>()) {
- return var->Get<LoDTensor>().lod();
const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) {
return default_lod;
}
return tensor.lod();
} else {
return default_lod;
}
@@ -172,6 +201,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
if (row_size >= 0) {
ss << "[row_size=" << row_size << "]";
}
std::string dtype = GetDtype(*scope, input.second[i]);
ss << ":" << dtype;
ss << "[" << GetDims(*scope, input.second[i], true) << "]"; ss << "[" << GetDims(*scope, input.second[i], true) << "]";
ss << "(" << GetLoD(*scope, input.second[i]) << ")"; ss << "(" << GetLoD(*scope, input.second[i]) << ")";
} }
......
@@ -82,7 +82,7 @@ class Tensor {
template <typename T>
const T* data() const;
- bool IsInitialized() const;
inline bool IsInitialized() const;
/**
* @brief Return a pointer to mutable memory block.
......
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/tensor.h"
#include <gtest/gtest.h>
#include <string>
#include "paddle/fluid/platform/float16.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
@@ -213,3 +214,17 @@ TEST(Tensor, Layout) {
src.set_layout(framework::DataLayout::kAnyLayout);
ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
}
TEST(Tensor, FP16) {
using platform::float16;
framework::Tensor src;
float16* src_ptr = src.mutable_data<float16>({2, 3}, platform::CPUPlace());
for (int i = 0; i < 2 * 3; ++i) {
src_ptr[i] = static_cast<float16>(i);
}
EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16));
// EXPECT a human readable error message
// src.data<uint8_t>();
// Tensor holds the wrong type, it holds N6paddle8platform7float16E at
// [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43]
}
@@ -6,9 +6,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
tensorrt_subgraph_node_mark_pass.cc
analyzer.cc
helper.cc
model_store_pass.cc
DEPS framework_proto proto_desc)
cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
@@ -40,3 +42,4 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
@@ -17,6 +17,7 @@
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/model_store_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
...@@ -29,6 +30,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, ...@@ -29,6 +30,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
DEFINE_string(inference_analysis_graphviz_log_root, "./", DEFINE_string(inference_analysis_graphviz_log_root, "./",
"Graphviz debuger for data flow graphs."); "Graphviz debuger for data flow graphs.");
DEFINE_string(inference_analysis_output_storage_path, "",
"optimized model output path");
namespace inference { namespace inference {
namespace analysis { namespace analysis {
...@@ -47,6 +51,9 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -47,6 +51,9 @@ class DfgPassManagerImpl final : public DfgPassManager {
AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
} }
AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
if (!FLAGS_inference_analysis_output_storage_path.empty()) {
AddPass("model-store-pass", new ModelStorePass);
}
}
std::string repr() const override { return "dfg-pass-manager"; }
......
@@ -16,28 +16,23 @@ limitations under the License. */
/*
* This file contains Analyzer, a class exposed as a library that analyzes and
* optimizes a Fluid ProgramDesc for inference. Similar to LLVM, it has
* multiple flags to control whether a process is applied to the program.
*
* The processes are called Passes in analysis; the Passes are placed in a
* pipeline. The first Pass is FluidToDataFlowGraphPass, which transforms a
* Fluid ProgramDesc to a data flow graph; the last Pass is
* DataFlowGraphToFluidPass, which transforms a data flow graph back to a
* Fluid ProgramDesc. The Passes in the middle of the pipeline can be any
* Passes that take a node or a data flow graph as input.
*
* The Analyzer can be used in two ways: the first is an executable file that
* can be used to pre-process the inference model and can be controlled by
* passing different command flags; the other is to compose it inside the
* inference API as a runtime pre-processing phase in the inference service.
*/
#include <gflags/gflags.h>
@@ -50,6 +45,7 @@ namespace paddle {
// flag if not available.
DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
DECLARE_string(inference_analysis_graphviz_log_root);
DECLARE_string(inference_analysis_output_storage_path);
namespace inference {
namespace analysis {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file implements the analyzer -- an executable that helps analyze and
* optimize a trained model.
*/
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <gflags/gflags.h>
#include <glog/logging.h>
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
using paddle::inference::analysis::Analyzer;
using paddle::inference::analysis::Argument;
Argument argument;
Analyzer analyzer;
analyzer.Run(&argument);
return 0;
}
@@ -20,14 +20,18 @@ namespace paddle {
namespace inference {
namespace analysis {
- TEST_F(DFG_Tester, analysis_without_tensorrt) {
TEST(Analyzer, analysis_without_tensorrt) {
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
Argument argument;
argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
Analyzer analyser;
analyser.Run(&argument);
}
- TEST_F(DFG_Tester, analysis_with_tensorrt) {
TEST(Analyzer, analysis_with_tensorrt) {
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
Argument argument;
argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
Analyzer analyser;
analyser.Run(&argument);
}
......
@@ -23,6 +23,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
...@@ -36,6 +37,16 @@ namespace analysis { ...@@ -36,6 +37,16 @@ namespace analysis {
* All the fields should be registered here for clearness. * All the fields should be registered here for clearness.
*/ */
struct Argument { struct Argument {
Argument() = default;
explicit Argument(const std::string& fluid_model_dir)
: fluid_model_dir(new std::string(fluid_model_dir)) {}
// The directory of the trained model.
std::unique_ptr<std::string> fluid_model_dir;
// The path of `__model__` and `param`, this is used when the file name of
// model and param is changed.
std::unique_ptr<std::string> fluid_model_program_path;
std::unique_ptr<std::string> fluid_model_param_path;
// The graph that process by the Passes or PassManagers.
std::unique_ptr<DataFlowGraph> main_dfg;
@@ -44,6 +55,9 @@ struct Argument {
// The processed program desc.
std::unique_ptr<framework::proto::ProgramDesc> transformed_program_desc;
// The output storage path of ModelStorePass.
std::unique_ptr<std::string> model_output_store_path;
};
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
......
@@ -36,6 +36,8 @@ namespace analysis {
/*
* DataFlowGraph - A container of Value and Function Nodes.
*
* This is the base graph for any other type of graphs, such as SSA or CFG.
*/
struct DataFlowGraph {
NodeMap nodes;
@@ -174,7 +176,7 @@ struct GraphTraits<DataFlowGraph> {
// sub-graph is the inputs nodes and output nodes that doesn't inside the
// sub-graph.
std::pair<std::vector<Node *>, std::vector<Node *>>
- ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph); // NOLINT
} // namespace analysis
} // namespace inference
......
@@ -20,7 +20,7 @@ namespace inference {
namespace analysis {
TEST(DataFlowGraph, BFS) {
- auto desc = LoadProgramDesc();
auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
auto dfg = ProgramDescToDFG(desc);
dfg.Build();
@@ -44,7 +44,7 @@ TEST(DataFlowGraph, BFS) {
}
TEST(DataFlowGraph, DFS) {
- auto desc = LoadProgramDesc();
auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
auto dfg = ProgramDescToDFG(desc);
dfg.Build();
GraphTraits<DataFlowGraph> trait(&dfg);
......
@@ -26,21 +26,21 @@ namespace paddle {
namespace inference {
namespace analysis {
- TEST_F(DFG_Tester, Test) {
TEST(DataFlowGraph, Test) {
- DataFlowGraph graph;
Argument argument(FLAGS_inference_model_dir);
FluidToDataFlowGraphPass pass0;
DataFlowGraphToFluidPass pass1;
ASSERT_TRUE(pass0.Initialize(&argument));
ASSERT_TRUE(pass1.Initialize(&argument));
- pass0.Run(&graph);
pass0.Run(argument.main_dfg.get());
- pass1.Run(&graph);
pass1.Run(argument.main_dfg.get());
pass0.Finalize();
pass1.Finalize();
- LOG(INFO) << graph.nodes.size();
LOG(INFO) << argument.main_dfg->nodes.size();
}
}; // namespace analysis
......
@@ -23,12 +23,18 @@ namespace paddle {
namespace inference {
namespace analysis {
- TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) {
- auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
Argument argument(FLAGS_inference_model_dir);
FluidToDataFlowGraphPass pass0;
ASSERT_TRUE(pass0.Initialize(&argument));
pass0.Run(argument.main_dfg.get());
// auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
DFG_GraphvizDrawPass::Config config("./", "test");
DFG_GraphvizDrawPass pass(config);
pass.Initialize(&argument);
- pass.Run(&dfg);
pass.Run(argument.main_dfg.get());
// test content
std::ifstream file("./0-graph_test.dot");
......
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <string>
#include <vector>
@@ -25,8 +26,20 @@ namespace analysis {
bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
- ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
- PADDLE_ENFORCE(argument);
if (argument->origin_program_desc) {
LOG(WARNING) << "argument's origin_program_desc is already set, might "
"duplicate called";
}
if (!argument->fluid_model_program_path) {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir);
argument->fluid_model_program_path.reset(
new std::string(*argument->fluid_model_dir + "/__model__"));
}
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
auto program = LoadProgramDesc(*argument->fluid_model_program_path);
argument->origin_program_desc.reset(
new framework::proto::ProgramDesc(program));
if (!argument->main_dfg) {
argument->main_dfg.reset(new DataFlowGraph);
}
@@ -40,6 +53,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
PADDLE_ENFORCE(graph);
PADDLE_ENFORCE(desc_);
// insert vars
// The `var2id` keeps a map from a variable's name to its Node-id, the Node-id
// will keep updating to its latest alias during the graph-building.
std::unordered_map<std::string, size_t> var2id;
auto &main_block = desc_->blocks(framework::kRootBlockIndex);
for (int i = 0; i < main_block.vars_size(); i++) {
@@ -51,6 +66,15 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
var2id[var.name()] = v->id();
}
// In SSA form a variable can only be written once, so if a variable is
// written multiple times (quite common in our ProgramDesc design), multiple
// alias Nodes of this variable are created, and each one is written just once.
// The set below keeps every variable (the original, not an alias) that has
// already been written as an output. Once an Op's output variable hits the
// set, a new alias is created and the global alias for this variable is
// updated. That is what makes the Data Flow Graph an SSA form.
std::unordered_set<Node *> unique_written_vars;
for (int i = 0; i < main_block.ops_size(); i++) {
const auto &op = main_block.ops(i);
auto *o = graph->nodes.Create(Node::Type::kFunction);
@@ -62,33 +86,33 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
o->SetPbMsg(op.SerializeAsString());
// set inputs and outputs
- std::unordered_set<Node *> inlinks;
for (int j = 0; j < op.inputs_size(); j++) {
auto &in_var = op.inputs(j);
for (int k = 0; k < in_var.arguments_size(); k++) {
auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
in->outlinks.push_back(o);
o->inlinks.push_back(in);
- inlinks.insert(in);
}
}
for (int j = 0; j < op.outputs_size(); j++) {
auto &out_var = op.outputs(j);
for (int k = 0; k < out_var.arguments_size(); k++) {
auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
- if (inlinks.count(out)) {
if (unique_written_vars.count(out)) {
// Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
auto *out_alias = graph->nodes.Create(Node::Type::kValue);
out_alias->SetName(out->name());
out_alias->SetPbDesc(out->pb_desc());
out_alias->SetPbMsg(out->pb_msg());
- var2id[out_alias->name()] = out_alias->id(); // update a -> a0
var2id[out_alias->name()] =
out_alias->id(); // update variable's alias Node
LOG(INFO) << "loop found in graph, create SSA alias node ["
<< out_alias->repr() << "] for [" << out->repr() << "]";
out = out_alias;
}
out->inlinks.push_back(o);
o->outlinks.push_back(out);
unique_written_vars.insert(out);
}
}
}
......
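To make the SSA rewrite above easier to follow, here is a small self-contained sketch (not part of the commit) of the same bookkeeping: a variable that is written a second time gets a fresh alias, and the name-to-latest-alias map is updated so later reads pick up the alias. The "#k" suffix is only for printing; the real pass keeps the original name on the alias Node and tracks it by node id.

#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

int main() {
  // Each op is (output name, input names), mirroring a = op(b, c).
  std::vector<std::pair<std::string, std::vector<std::string>>> ops = {
      {"a", {"x"}},  // a = op0(x)
      {"a", {"a"}},  // a = op1(a)  -> rewritten to a#1 = op1(a)
      {"b", {"a"}},  // b = op2(a)  -> reads the latest alias a#1
  };

  std::unordered_map<std::string, std::string> latest;  // name -> latest alias
  std::unordered_set<std::string> written;              // names written so far
  int alias_id = 0;

  for (const auto& op : ops) {
    std::vector<std::string> ins;
    for (const auto& in : op.second)
      ins.push_back(written.count(in) ? latest.at(in) : in);

    std::string out = op.first;
    if (written.count(out))  // written before: create a new alias
      out = op.first + "#" + std::to_string(++alias_id);
    latest[op.first] = out;  // update the global alias for this name
    written.insert(op.first);

    std::cout << out << " = op(";
    for (size_t i = 0; i < ins.size(); ++i)
      std::cout << (i ? ", " : "") << ins[i];
    std::cout << ")\n";
  }
  return 0;
}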
@@ -30,7 +30,7 @@ namespace inference {
namespace analysis {
/*
- * Transform a FluidDesc to a data flow graph.
* Transform a FluidDesc to a SSA.
*/
class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
public:
......
@@ -21,8 +21,9 @@ namespace paddle {
namespace inference {
namespace analysis {
- TEST_F(DFG_Tester, Init) {
TEST(FluidToDataFlowGraphPass, Test) {
FluidToDataFlowGraphPass pass;
Argument argument(FLAGS_inference_model_dir);
pass.Initialize(&argument);
pass.Run(argument.main_dfg.get());
// Analysis is sensitive to ProgramDesc, careful to change the original model.
......
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <cstdio>
#include <fstream>
#include <string>
#include <typeindex>
#include <unordered_map>
@@ -136,6 +137,20 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) {
}
}
static framework::proto::ProgramDesc LoadProgramDesc(
const std::string &model_path) {
std::ifstream fin(model_path, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path);
fin.seekg(0, std::ios::end);
std::string buffer(fin.tellg(), ' ');
fin.seekg(0, std::ios::beg);
fin.read(&buffer[0], buffer.size());
fin.close();
framework::proto::ProgramDesc program_desc;
program_desc.ParseFromString(buffer);
return program_desc;
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/argument.h"
#include "paddle/fluid/inference/analysis/model_store_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
void ModelStorePass::Run(DataFlowGraph *x) {
if (!argument_->fluid_model_param_path) {
PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir);
argument_->fluid_model_param_path.reset(
new std::string(*argument_->fluid_model_dir + "param"));
}
PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path);
// Directly copy param file to destination.
std::stringstream ss;
// NOTE these commands only work on Linux.
ss << "mkdir -p " << *argument_->model_output_store_path;
LOG(INFO) << "run command: " << ss.str();
PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
ss.str("");
ss << "cp " << *argument_->fluid_model_dir << "/*"
<< " " << *argument_->model_output_store_path;
LOG(INFO) << "run command: " << ss.str();
PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
// Store program
PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc,
"program desc is not transformed, should call "
"DataFlowGraphToFluidPass first.");
const std::string program_output_path =
*argument_->model_output_store_path + "/__model__";
std::ofstream file(program_output_path, std::ios::binary);
PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.",
program_output_path);
const std::string serialized_message =
argument_->transformed_program_desc->SerializeAsString();
file.write(serialized_message.c_str(), serialized_message.size());
}
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file defines ModelStorePass, which stores the runtime DFG as a Paddle
* model on disk; that model can be reloaded for prediction.
*/
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle {
namespace inference {
namespace analysis {
class ModelStorePass : public DataFlowGraphPass {
public:
bool Initialize(Argument* argument) override {
if (!argument) {
LOG(ERROR) << "invalid argument";
return false;
}
argument_ = argument;
return true;
}
void Run(DataFlowGraph* x) override;
std::string repr() const override { return "DFG-store-pass"; }
std::string description() const override {
return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle
model in the disk, and that model can be reloaded for prediction again.)DD";
}
private:
Argument* argument_{nullptr};
};
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/model_store_pass.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/analyzer.h"
namespace paddle {
namespace inference {
namespace analysis {
DEFINE_string(inference_model_dir, "", "Model path");
TEST(DFG_StorePass, test) {
Analyzer analyzer;
Argument argument(FLAGS_inference_model_dir);
argument.model_output_store_path.reset(
new std::string("./_dfg_store_pass_tmp"));
// disable storage in the analyzer
FLAGS_inference_analysis_output_storage_path = "";
analyzer.Run(&argument);
ModelStorePass pass;
pass.Initialize(&argument);
pass.Run(argument.main_dfg.get());
}
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -50,6 +50,7 @@ class Pass { ...@@ -50,6 +50,7 @@ class Pass {
// Create a debugger Pass that draw the DFG by graphviz toolkit. // Create a debugger Pass that draw the DFG by graphviz toolkit.
virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; } virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
virtual void Run() { LOG(FATAL) << "not valid"; }
// Run on a single Node. // Run on a single Node.
virtual void Run(Node *x) { LOG(FATAL) << "not valid"; } virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
// Run on a single Function. // Run on a single Function.
......
...@@ -56,7 +56,7 @@ class TestNodePass final : public NodePass { ...@@ -56,7 +56,7 @@ class TestNodePass final : public NodePass {
std::string description() const override { return "some doc"; } std::string description() const override { return "some doc"; }
}; };
TEST_F(DFG_Tester, DFG_pass_manager) { TEST(PassManager, DFG_pass_manager) {
TestDfgPassManager manager; TestDfgPassManager manager;
DFG_GraphvizDrawPass::Config config("./", "dfg.dot"); DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
...@@ -64,12 +64,15 @@ TEST_F(DFG_Tester, DFG_pass_manager) { ...@@ -64,12 +64,15 @@ TEST_F(DFG_Tester, DFG_pass_manager) {
manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);
Argument argument(FLAGS_inference_model_dir);
ASSERT_TRUE(&argument); ASSERT_TRUE(&argument);
ASSERT_TRUE(manager.Initialize(&argument)); ASSERT_TRUE(manager.Initialize(&argument));
manager.RunAll(); manager.RunAll();
} }
TEST_F(DFG_Tester, Node_pass_manager) { TEST(PassManager, Node_pass_manager) {
Argument argument(FLAGS_inference_model_dir);
// Pre-process: initialize the DFG with the ProgramDesc first. // Pre-process: initialize the DFG with the ProgramDesc first.
FluidToDataFlowGraphPass pass0; FluidToDataFlowGraphPass pass0;
pass0.Initialize(&argument); pass0.Initialize(&argument);
......
...@@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { ...@@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
return false; return false;
}; };
TEST_F(DFG_Tester, Split) { TEST(SubGraphSplitter, Split) {
auto desc = LoadProgramDesc(); auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
auto dfg = ProgramDescToDFG(desc); auto dfg = ProgramDescToDFG(desc);
LOG(INFO) << "spliter\n" << dfg.DotString(); LOG(INFO) << "spliter\n" << dfg.DotString();
...@@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) { ...@@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) {
ASSERT_EQ(subgraphs.back().size(), 6UL); ASSERT_EQ(subgraphs.back().size(), 6UL);
} }
TEST_F(DFG_Tester, Fuse) { TEST(SubGraphSplitter, Fuse) {
auto desc = LoadProgramDesc(); auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
auto dfg = ProgramDescToDFG(desc); auto dfg = ProgramDescToDFG(desc);
size_t count0 = dfg.nodes.size(); size_t count0 = dfg.nodes.size();
......
...@@ -22,11 +22,11 @@ namespace paddle { ...@@ -22,11 +22,11 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { TEST(TensorRTSubgraphNodeMarkPass, test) {
// init // init
FluidToDataFlowGraphPass pass; FluidToDataFlowGraphPass pass;
Argument argument(FLAGS_inference_model_dir);
ASSERT_TRUE(pass.Initialize(&argument)); ASSERT_TRUE(pass.Initialize(&argument));
argument.main_dfg.reset(new DataFlowGraph);
pass.Run(argument.main_dfg.get()); pass.Run(argument.main_dfg.get());
TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) { TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) {
...@@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { ...@@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
for (auto& node : argument.main_dfg->nodes.nodes()) { for (auto& node : argument.main_dfg->nodes.nodes()) {
counter += node->attr(ATTR_supported_by_tensorrt).Bool(); counter += node->attr(ATTR_supported_by_tensorrt).Bool();
} }
ASSERT_EQ(counter, 2);
LOG(INFO) << counter << " nodes marked"; LOG(INFO) << counter << " nodes marked";
} }
......
...@@ -25,7 +25,7 @@ namespace analysis { ...@@ -25,7 +25,7 @@ namespace analysis {
DEFINE_string(dot_dir, "./", ""); DEFINE_string(dot_dir, "./", "");
TEST_F(DFG_Tester, tensorrt_single_pass) { TEST(TensorRTSubGraphPass, main) {
std::unordered_set<std::string> teller_set( std::unordered_set<std::string> teller_set(
{"elementwise_add", "mul", "sigmoid"}); {"elementwise_add", "mul", "sigmoid"});
SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) { SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) {
...@@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { ...@@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
return false; return false;
}; };
LOG(INFO) << "init"; Argument argument(FLAGS_inference_model_dir);
DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
...@@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { ...@@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
FluidToDataFlowGraphPass pass0; FluidToDataFlowGraphPass pass0;
TensorRTSubGraphPass trt_pass(std::move(teller)); TensorRTSubGraphPass trt_pass(std::move(teller));
LOG(INFO) << "Initialize";
dfg_pass.Initialize(&argument); dfg_pass.Initialize(&argument);
dfg_pass1.Initialize(&argument); dfg_pass1.Initialize(&argument);
pass0.Initialize(&argument); pass0.Initialize(&argument);
trt_pass.Initialize(&argument); trt_pass.Initialize(&argument);
LOG(INFO) << "Run";
argument.main_dfg.reset(new DataFlowGraph); argument.main_dfg.reset(new DataFlowGraph);
pass0.Run(argument.main_dfg.get()); pass0.Run(argument.main_dfg.get());
dfg_pass.Run(argument.main_dfg.get()); dfg_pass.Run(argument.main_dfg.get());
......
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/helper.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -32,27 +32,12 @@ namespace analysis { ...@@ -32,27 +32,12 @@ namespace analysis {
DEFINE_string(inference_model_dir, "", "inference test model dir"); DEFINE_string(inference_model_dir, "", "inference test model dir");
static framework::proto::ProgramDesc LoadProgramDesc(
const std::string& model_dir = FLAGS_inference_model_dir) {
std::string msg;
std::string net_file = FLAGS_inference_model_dir + "/__model__";
std::ifstream fin(net_file, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", net_file);
fin.seekg(0, std::ios::end);
msg.resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&(msg.at(0)), msg.size());
fin.close();
framework::proto::ProgramDesc program_desc;
program_desc.ParseFromString(msg);
return program_desc;
}
static DataFlowGraph ProgramDescToDFG( static DataFlowGraph ProgramDescToDFG(
const framework::proto::ProgramDesc& desc) { const framework::proto::ProgramDesc& desc) {
DataFlowGraph graph; DataFlowGraph graph;
FluidToDataFlowGraphPass pass; FluidToDataFlowGraphPass pass;
Argument argument; Argument argument;
argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
pass.Initialize(&argument); pass.Initialize(&argument);
pass.Run(&graph); pass.Run(&graph);
...@@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG( ...@@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG(
class DFG_Tester : public ::testing::Test { class DFG_Tester : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
auto desc = LoadProgramDesc(FLAGS_inference_model_dir); auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
} }
......
...@@ -19,6 +19,7 @@ endif(APPLE) ...@@ -19,6 +19,7 @@ endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api) set(inference_deps paddle_inference_api paddle_fluid_api)
if(WITH_GPU AND TENSORRT_FOUND) if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
endif() endif()
...@@ -63,6 +64,8 @@ endif() ...@@ -63,6 +64,8 @@ endif()
if (WITH_ANAKIN) # only needed in CI if (WITH_ANAKIN) # only needed in CI
# Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
# so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
# compile the libinference_anakin_api.a and compile with anakin.so.
fetch_include_recursively(${ANAKIN_INCLUDE})
# compile the libinference_anakin_api.a and anakin.so. # compile the libinference_anakin_api.a and anakin.so.
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc) nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc) nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
...@@ -73,7 +76,7 @@ if (WITH_ANAKIN) # only needed in CI ...@@ -73,7 +76,7 @@ if (WITH_ANAKIN) # only needed in CI
if (WITH_TESTING) if (WITH_TESTING)
cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api) DEPS inference_anakin_api_shared)
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING) endif(WITH_TESTING)
endif() endif()
...@@ -18,26 +18,36 @@ ...@@ -18,26 +18,36 @@
namespace paddle { namespace paddle {
PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( template <typename Target>
PaddleInferenceAnakinPredictor<Target>::PaddleInferenceAnakinPredictor(
const AnakinConfig &config) { const AnakinConfig &config) {
CHECK(Init(config)); CHECK(Init(config));
} }
bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { template <typename Target>
bool PaddleInferenceAnakinPredictor<Target>::Init(const AnakinConfig &config) {
if (!(graph_.load(config.model_file))) { if (!(graph_.load(config.model_file))) {
LOG(FATAL) << "fail to load graph from " << config.model_file;
return false; return false;
} }
graph_.ResetBatchSize("input_0", config.max_batch_size); auto inputs = graph_.get_ins();
for (auto &input_str : inputs) {
graph_.ResetBatchSize(input_str, config.max_batch_size);
}
// optimization for graph // optimization for graph
if (!(graph_.Optimize())) { if (!(graph_.Optimize())) {
return false; return false;
} }
// construct executer // construct executer
executor_.init(graph_); if (executor_p_ == nullptr) {
executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
anakin::Precision::FP32>(graph_, true);
}
return true; return true;
} }
bool PaddleInferenceAnakinPredictor::Run( template <typename Target>
bool PaddleInferenceAnakinPredictor<Target>::Run(
const std::vector<PaddleTensor> &inputs, const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data, int batch_size) { std::vector<PaddleTensor> *output_data, int batch_size) {
for (const auto &input : inputs) { for (const auto &input : inputs) {
...@@ -46,7 +56,29 @@ bool PaddleInferenceAnakinPredictor::Run( ...@@ -46,7 +56,29 @@ bool PaddleInferenceAnakinPredictor::Run(
<< "'s type is not float"; << "'s type is not float";
return false; return false;
} }
auto d_tensor_in_p = executor_.get_in(input.name); auto d_tensor_in_p = executor_p_->get_in(input.name);
auto net_shape = d_tensor_in_p->valid_shape();
if (net_shape.size() != input.shape.size()) {
LOG(ERROR) << " input " << input.name
<< "'s shape size should be equal to that of net";
return false;
}
int sum = 1;
for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; });
if (sum > net_shape.count()) {
graph_.Reshape(input.name, input.shape);
delete executor_p_;
executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
anakin::Precision::FP32>(graph_, true);
d_tensor_in_p = executor_p_->get_in(input.name);
}
anakin::saber::Shape tmp_shape;
for (auto s : input.shape) {
tmp_shape.push_back(s);
}
d_tensor_in_p->reshape(tmp_shape);
float *d_data_p = d_tensor_in_p->mutable_data(); float *d_data_p = d_tensor_in_p->mutable_data();
if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()), if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()),
d_tensor_in_p->valid_size() * sizeof(float), d_tensor_in_p->valid_size() * sizeof(float),
...@@ -56,16 +88,17 @@ bool PaddleInferenceAnakinPredictor::Run( ...@@ -56,16 +88,17 @@ bool PaddleInferenceAnakinPredictor::Run(
} }
cudaStreamSynchronize(NULL); cudaStreamSynchronize(NULL);
} }
cudaDeviceSynchronize();
executor_.prediction(); executor_p_->prediction();
cudaDeviceSynchronize();
if (output_data->empty()) { if (output_data->empty()) {
LOG(ERROR) << "At least one output should be set with tensors' names."; LOG(ERROR) << "At least one output should be set with tensors' names.";
return false; return false;
} }
for (auto &output : *output_data) { for (auto &output : *output_data) {
auto *tensor = executor_.get_out(output.name); auto *tensor = executor_p_->get_out(output.name);
output.shape = tensor->shape(); output.shape = tensor->valid_shape();
if (output.data.length() < tensor->valid_size() * sizeof(float)) { if (output.data.length() < tensor->valid_size() * sizeof(float)) {
output.data.Resize(tensor->valid_size() * sizeof(float)); output.data.Resize(tensor->valid_size() * sizeof(float));
} }
...@@ -81,19 +114,23 @@ bool PaddleInferenceAnakinPredictor::Run( ...@@ -81,19 +114,23 @@ bool PaddleInferenceAnakinPredictor::Run(
return true; return true;
} }
anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32> template <typename Target>
&PaddleInferenceAnakinPredictor::get_executer() { anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
return executor_; &PaddleInferenceAnakinPredictor<Target>::get_executer() {
return *executor_p_;
} }
// the cloned new Predictor of anakin share the same net weights from original // the cloned new Predictor of anakin share the same net weights from original
// Predictor // Predictor
std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() { template <typename Target>
std::unique_ptr<PaddlePredictor>
PaddleInferenceAnakinPredictor<Target>::Clone() {
VLOG(3) << "Anakin Predictor::clone"; VLOG(3) << "Anakin Predictor::clone";
std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor()); std::unique_ptr<PaddlePredictor> cls(
new PaddleInferenceAnakinPredictor<Target>());
// construct executer from other graph // construct executer from other graph
auto anakin_predictor_p = auto anakin_predictor_p =
dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get()); dynamic_cast<PaddleInferenceAnakinPredictor<Target> *>(cls.get());
if (!anakin_predictor_p) { if (!anakin_predictor_p) {
LOG(ERROR) << "fail to call Init"; LOG(ERROR) << "fail to call Init";
return nullptr; return nullptr;
...@@ -103,14 +140,28 @@ std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() { ...@@ -103,14 +140,28 @@ std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
return std::move(cls); return std::move(cls);
} }
template class PaddleInferenceAnakinPredictor<anakin::NV>;
template class PaddleInferenceAnakinPredictor<anakin::X86>;
// A factory to help create different predictors. // A factory to help create different predictors.
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) { AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
VLOG(3) << "Anakin Predictor create."; VLOG(3) << "Anakin Predictor create.";
std::unique_ptr<PaddlePredictor> x( if (config.target_type == AnakinConfig::NVGPU) {
new PaddleInferenceAnakinPredictor(config)); VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
return x; std::unique_ptr<PaddlePredictor> x(
} new PaddleInferenceAnakinPredictor<anakin::NV>(config));
return x;
} else if (config.target_type == AnakinConfig::X86) {
VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor<anakin::X86>(config));
return x;
} else {
VLOG(3) << "Anakin Predictor create on unknown platform.";
return nullptr;
}
};
} // namespace paddle } // namespace paddle
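As a hedged companion to the factory above: selecting the X86 target only requires flipping AnakinConfig::target_type. A CPU-side configuration might look roughly like this (the model path is a placeholder), mirroring the GPU tester that follows.
// Sketch only: configure an Anakin predictor for the X86 (CPU) target.
paddle::AnakinConfig GetX86Config() {
  paddle::AnakinConfig config;
  config.target_type = paddle::AnakinConfig::X86;   // NVGPU is the GPU path
  config.model_file = "/path/to/model.anakin.bin";  // placeholder
  config.device = 0;
  config.max_batch_size = 1;
  return config;
}
// auto predictor = paddle::CreatePaddlePredictor<paddle::AnakinConfig,
//     paddle::PaddleEngineKind::kAnakin>(GetX86Config());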
...@@ -20,14 +20,16 @@ limitations under the License. */ ...@@ -20,14 +20,16 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
// from anakin
#include "framework/core/net/net.h" #include "framework/core/net/net.h"
#include "framework/graph/graph.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "saber/core/shape.h"
#include "saber/saber_types.h" #include "saber/saber_types.h"
namespace paddle { namespace paddle {
template <typename Target>
class PaddleInferenceAnakinPredictor : public PaddlePredictor { class PaddleInferenceAnakinPredictor : public PaddlePredictor {
public: public:
PaddleInferenceAnakinPredictor() {} PaddleInferenceAnakinPredictor() {}
...@@ -42,19 +44,21 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { ...@@ -42,19 +44,21 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
std::unique_ptr<PaddlePredictor> Clone() override; std::unique_ptr<PaddlePredictor> Clone() override;
anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>& anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
get_executer(); get_executer();
~PaddleInferenceAnakinPredictor() override{}; ~PaddleInferenceAnakinPredictor() override {
delete executor_p_;
executor_p_ = nullptr;
};
private: private:
bool Init(const AnakinConfig& config); bool Init(const AnakinConfig& config);
anakin::graph::Graph<anakin::NV, anakin::saber::AK_FLOAT, anakin::graph::Graph<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
anakin::Precision::FP32>
graph_; graph_;
anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32> anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>*
executor_; executor_p_{nullptr};
AnakinConfig config_; AnakinConfig config_;
}; };
......
...@@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string(model, "", "Directory of the inference model."); DEFINE_string(model, "", "Directory of the inference model(mobile_v2).");
namespace paddle { namespace paddle {
AnakinConfig GetConfig() { AnakinConfig GetConfig() {
AnakinConfig config; AnakinConfig config;
// use AnakinConfig::X86 if you need to run inference on the CPU
config.target_type = AnakinConfig::NVGPU;
config.model_file = FLAGS_model; config.model_file = FLAGS_model;
config.device = 0; config.device = 0;
config.max_batch_size = 1; config.max_batch_size = 1;
...@@ -36,28 +38,27 @@ TEST(inference, anakin) { ...@@ -36,28 +38,27 @@ TEST(inference, anakin) {
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config); CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
float data[1 * 3 * 224 * 224] = {1.0f}; float data[1 * 3 * 224 * 224] = {1.0f};
PaddleTensor tensor;
PaddleTensor tensor{.name = "input_0", tensor.name = "input_0";
.shape = std::vector<int>({1, 3, 224, 224}), tensor.shape = std::vector<int>({1, 3, 224, 224});
.data = PaddleBuf(data, sizeof(data)), tensor.data = PaddleBuf(data, sizeof(data));
.dtype = PaddleDType::FLOAT32}; tensor.dtype = PaddleDType::FLOAT32;
// For simplicity, we set all the slots with the same data. // For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> paddle_tensor_feeds; std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
paddle_tensor_feeds.emplace_back(std::move(tensor));
PaddleTensor tensor_out{.name = "prob_out", PaddleTensor tensor_out;
.shape = std::vector<int>({1000, 1}), tensor_out.name = "prob_out";
.data = PaddleBuf(), tensor_out.shape = std::vector<int>({});
.dtype = PaddleDType::FLOAT32}; tensor_out.data = PaddleBuf();
tensor_out.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs(1, tensor_out);
outputs.emplace_back(std::move(tensor_out));
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
float* data_o = static_cast<float*>(outputs[0].data.data()); float* data_o = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < 1000; ++j) { for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {
LOG(INFO) << "output[" << j << "]: " << data_o[j]; LOG(INFO) << "output[" << j << "]: " << data_o[j];
} }
} }
......
...@@ -183,6 +183,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -183,6 +183,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(), std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length()); inputs[i].data.length());
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework::LoD lod;
for (auto &level : inputs[i].lod) {
lod.emplace_back(level);
}
input.set_lod(lod);
feeds->push_back(input); feeds->push_back(input);
} }
return true; return true;
...@@ -248,6 +255,10 @@ bool NativePaddlePredictor::GetFetch( ...@@ -248,6 +255,10 @@ bool NativePaddlePredictor::GetFetch(
buffer.Resize(sizeof(float) * data.size()); buffer.Resize(sizeof(float) * data.size());
} }
std::memcpy(buffer.data(), data.data(), buffer.length()); std::memcpy(buffer.data(), data.data(), buffer.length());
// copy LoD
for (const auto &level : fetchs[i].lod()) {
outputs->at(i).lod.emplace_back(level);
}
outputs->at(i).dtype = PaddleDType::FLOAT32; outputs->at(i).dtype = PaddleDType::FLOAT32;
// TODO(panyx0718): support other types? fill tensor name? avoid a copy. // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
} }
......
...@@ -90,6 +90,18 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { ...@@ -90,6 +90,18 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
void OptimizeInferenceProgram() { void OptimizeInferenceProgram() {
// Analyze inference_program // Analyze inference_program
Argument argument; Argument argument;
if (!config_.model_dir.empty()) {
argument.fluid_model_dir.reset(new std::string(config_.model_dir));
} else {
PADDLE_ENFORCE(
!config_.param_file.empty(),
"Either model_dir or (param_file, prog_file) should be set.");
PADDLE_ENFORCE(!config_.prog_file.empty());
argument.fluid_model_program_path.reset(
new std::string(config_.prog_file));
argument.fluid_model_param_path.reset(
new std::string(config_.param_file));
}
argument.origin_program_desc.reset( argument.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto())); new ProgramDesc(*inference_program_->Proto()));
Singleton<Analyzer>::Global().Run(&argument); Singleton<Analyzer>::Global().Run(&argument);
......
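A short sketch of the two configuration paths the check above accepts — a whole model directory, or an explicit program/parameter file pair; paths are placeholders and field names follow NativeConfig/TensorRTConfig from paddle_inference_api.h.
// Option 1: a saved model directory (parameters and __model__ together).
paddle::TensorRTConfig config_a;
config_a.model_dir = "/path/to/model_dir";         // placeholder
// Option 2: explicit program and parameter files.
paddle::TensorRTConfig config_b;
config_b.prog_file = "/path/to/__model__";         // placeholder
config_b.param_file = "/path/to/__params__";       // placeholder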
...@@ -49,11 +49,10 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { ...@@ -49,11 +49,10 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
std::vector<int64_t> data(20); std::vector<int64_t> data(20);
for (int i = 0; i < 20; i++) data[i] = i; for (int i = 0; i < 20; i++) data[i] = i;
PaddleTensor tensor{ PaddleTensor tensor;
.name = "", tensor.shape = std::vector<int>({10, 1});
.shape = std::vector<int>({10, 1}), tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t));
.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)), tensor.dtype = PaddleDType::INT64;
.dtype = PaddleDType::INT64};
// For simplicity, we set all the slots with the same data. // For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> slots(4, tensor); std::vector<PaddleTensor> slots(4, tensor);
......
...@@ -47,10 +47,10 @@ void Main(bool use_gpu) { ...@@ -47,10 +47,10 @@ void Main(bool use_gpu) {
//# 2. Prepare input. //# 2. Prepare input.
int64_t data[4] = {1, 2, 3, 4}; int64_t data[4] = {1, 2, 3, 4};
PaddleTensor tensor{.name = "", PaddleTensor tensor;
.shape = std::vector<int>({4, 1}), tensor.shape = std::vector<int>({4, 1});
.data = PaddleBuf(data, sizeof(data)), tensor.data = PaddleBuf(data, sizeof(data));
.dtype = PaddleDType::INT64}; tensor.dtype = PaddleDType::INT64;
// For simplicity, we set all the slots with the same data. // For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> slots(4, tensor); std::vector<PaddleTensor> slots(4, tensor);
...@@ -94,10 +94,11 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -94,10 +94,11 @@ void MainThreads(int num_threads, bool use_gpu) {
for (int batch_id = 0; batch_id < num_batches; ++batch_id) { for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
// 2. Dummy Input Data // 2. Dummy Input Data
int64_t data[4] = {1, 2, 3, 4}; int64_t data[4] = {1, 2, 3, 4};
PaddleTensor tensor{.name = "", PaddleTensor tensor;
.shape = std::vector<int>({4, 1}), tensor.shape = std::vector<int>({4, 1});
.data = PaddleBuf(data, sizeof(data)), tensor.data = PaddleBuf(data, sizeof(data));
.dtype = PaddleDType::INT64}; tensor.dtype = PaddleDType::INT64;
std::vector<PaddleTensor> inputs(4, tensor); std::vector<PaddleTensor> inputs(4, tensor);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
// 3. Run // 3. Run
......
...@@ -20,8 +20,8 @@ limitations under the License. */ ...@@ -20,8 +20,8 @@ limitations under the License. */
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. #include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include "paddle/fluid/inference/demo_ci/utils.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "utils.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
...@@ -123,11 +123,11 @@ void Main(bool use_gpu) { ...@@ -123,11 +123,11 @@ void Main(bool use_gpu) {
file.close(); file.close();
// Inference. // Inference.
PaddleTensor input{ PaddleTensor input;
.name = "xx", input.shape = record.shape;
.shape = record.shape, input.data =
.data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)), PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
.dtype = PaddleDType::FLOAT32}; input.dtype = PaddleDType::FLOAT32;
VLOG(3) << "run executor"; VLOG(3) << "run executor";
std::vector<PaddleTensor> output; std::vector<PaddleTensor> output;
......
...@@ -44,7 +44,7 @@ class PaddleBuf { ...@@ -44,7 +44,7 @@ class PaddleBuf {
PaddleBuf(void* data, size_t length) PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {} : data_(data), length_(length), memory_owned_{false} {}
// Own memory. // Own memory.
PaddleBuf(size_t length) explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {} : data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes. // Resize to `length` bytes.
void Resize(size_t length); void Resize(size_t length);
...@@ -67,9 +67,9 @@ struct PaddleTensor { ...@@ -67,9 +67,9 @@ struct PaddleTensor {
PaddleTensor() = default; PaddleTensor() = default;
std::string name; // variable name. std::string name; // variable name.
std::vector<int> shape; std::vector<int> shape;
// TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
PaddleBuf data; // blob of data. PaddleBuf data; // blob of data.
PaddleDType dtype; PaddleDType dtype;
std::vector<std::vector<uint64_t>> lod; // lod data
}; };
enum class PaddleEngineKind { enum class PaddleEngineKind {
...@@ -126,9 +126,11 @@ struct NativeConfig : public PaddlePredictor::Config { ...@@ -126,9 +126,11 @@ struct NativeConfig : public PaddlePredictor::Config {
// Configurations for Anakin engine. // Configurations for Anakin engine.
struct AnakinConfig : public PaddlePredictor::Config { struct AnakinConfig : public PaddlePredictor::Config {
enum TargetType { NVGPU = 0, X86 };
int device; int device;
std::string model_file; std::string model_file;
int max_batch_size{-1}; int max_batch_size{-1};
TargetType target_type;
}; };
struct TensorRTConfig : public NativeConfig { struct TensorRTConfig : public NativeConfig {
......
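Since PaddleTensor now carries a lod field (copied in SetFeed and out in GetFetch above), a batched sequence input could be described roughly as follows; the offsets and the variable name are illustrative only.
// Two sequences of lengths 3 and 1 packed into one 4 x 1 INT64 tensor.
int64_t ids[4] = {1, 2, 3, 4};
paddle::PaddleTensor tensor;
tensor.name = "word";                               // illustrative name
tensor.shape = std::vector<int>({4, 1});
tensor.data = paddle::PaddleBuf(ids, sizeof(ids));
tensor.dtype = paddle::PaddleDType::INT64;
tensor.lod = {{0, 3, 4}};                           // level-0 LoD offsets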
# Add TRT tests # Add TRT tests
nv_library(tensorrt_converter nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry) DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS nv_test(test_op_converter SRCS test_op_converter.cc DEPS
...@@ -13,6 +13,10 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc ...@@ -13,6 +13,10 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
...@@ -20,11 +20,60 @@ namespace tensorrt { ...@@ -20,11 +20,60 @@ namespace tensorrt {
class Conv2dOpConverter : public OpConverter { class Conv2dOpConverter : public OpConverter {
public: public:
Conv2dOpConverter() {}
void operator()(const framework::proto::OpDesc& op, void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override { const framework::Scope& scope, bool test_mode) override {
LOG(INFO) LOG(INFO)
<< "convert a fluid conv2d op to tensorrt conv layer without bias"; << "convert a fluid conv2d op to tensorrt conv layer without bias";
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1);  // Filter is a weight
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
auto* X = engine_->GetITensor(op_desc.Input("Input").front());
// Declare weights
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL);
const int n_output = Y_t->dims()[0];
const int filter_h = Y_t->dims()[2];
const int filter_w = Y_t->dims()[3];
const int groups = boost::get<int>(op_desc.GetAttr("groups"));
const std::vector<int> dilations =
boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)};
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
auto* layer = TRT_ENGINE_ADD_LAYER(
engine_, Convolution, *const_cast<nvinfer1::ITensor*>(X), n_output,
nv_ksize, weight.get(), bias.get());
PADDLE_ENFORCE(layer != nullptr);
layer->setStride(nv_strides);
layer->setPadding(nv_paddings);
layer->setDilation(nv_dilations);
layer->setNbGroups(groups);
auto output_name = op_desc.Output("Output").front();
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
engine_->DeclareOutput(output_name);
}
} }
}; };
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class ElementwiseWeightOpConverter : public OpConverter {
public:
ElementwiseWeightOpConverter() {}
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
// The nullptr here looks strange; that is because of how the
// framework::OpDesc constructor is declared.
framework::OpDesc op_desc(op, nullptr);
LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer";
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto* X = engine_->GetITensor(op_desc.Input("X").front());
nvinfer1::Dims dims_x = X->getDimensions();
PADDLE_ENFORCE(dims_x.nbDims >= 3);
auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
std::vector<int> dims_y = framework::vectorize2int(Y_t->dims());
if (static_cast<int>(dims_y.size()) == dims_x.nbDims + 1) {
if (dims_y[0] == 1) dims_y.erase(dims_y.begin());
}
if (static_cast<int>(dims_y.size()) == 1 && dims_y[0] == dims_x.d[0]) {
scale_mode = nvinfer1::ScaleMode::kCHANNEL;
} else if (static_cast<int>(dims_y.size()) == dims_x.nbDims &&
dims_y[0] == dims_x.d[0]) {
scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
for (int i = 1; i < dims_x.nbDims; i++) {
if (dims_y[i] != dims_x.d[i]) {
scale_mode = nvinfer1::ScaleMode::kCHANNEL;
break;
}
}
if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
for (int i = 1; i < dims_x.nbDims; i++) {
if (dims_y[i] != 1)
PADDLE_THROW(
"TensorRT unsupported weight shape for Elementwise op!");
}
}
} else {
PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!");
}
TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)};
TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
0};
TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
0};
nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode,
shift_weights.get(), scale_weights.get(), power_weights.get());
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
};
class ElementwiseTensorOpConverter : public OpConverter {
public:
ElementwiseTensorOpConverter() {}
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
// The nullptr here looks strange; that is because of how the
// framework::OpDesc constructor is declared.
framework::OpDesc op_desc(op, nullptr);
LOG(INFO) << "convert a fluid elementwise op to tensorrt IElementWiseLayer";
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is an input tensor
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto* X = engine_->GetITensor(op_desc.Input("X").front());
auto* Y = engine_->GetITensor(op_desc.Input("Y").front());
nvinfer1::Dims dims_x = X->getDimensions();
nvinfer1::Dims dims_y = Y->getDimensions();
// The two input tensors should have the same dims
PADDLE_ENFORCE(dims_x.nbDims >= 3);
if (dims_x.nbDims == dims_y.nbDims) {
for (int i = 0; i < dims_x.nbDims; i++) {
if (dims_x.d[i] != dims_y.d[i])
PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
}
} else {
PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
}
auto op_pair = ops.find(op_type_);
if (op_pair == ops.end()) {
PADDLE_THROW("Wrong elementwise op type!");
}
nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
*const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
protected:
static const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
ops;
std::string op_type_;
};
const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
ElementwiseTensorOpConverter::ops = {
{"add", nvinfer1::ElementWiseOperation::kSUM},
{"mul", nvinfer1::ElementWiseOperation::kPROD},
{"sub", nvinfer1::ElementWiseOperation::kSUB},
{"div", nvinfer1::ElementWiseOperation::kDIV},
{"min", nvinfer1::ElementWiseOperation::kMIN},
{"pow", nvinfer1::ElementWiseOperation::kPOW},
{"max", nvinfer1::ElementWiseOperation::kMAX},
};
class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
};
class ElementwiseTensorMulOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorMulOpConverter() { op_type_ = "mul"; }
};
class ElementwiseTensorSubOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorSubOpConverter() { op_type_ = "sub"; }
};
class ElementwiseTensorDivOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorDivOpConverter() { op_type_ = "div"; }
};
class ElementwiseTensorMinOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorMinOpConverter() { op_type_ = "min"; }
};
class ElementwiseTensorMaxOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorMaxOpConverter() { op_type_ = "max"; }
};
class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorPowOpConverter() { op_type_ = "pow"; }
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor,
ElementwiseTensorAddOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_sub_tensor,
ElementwiseTensorSubOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_div_tensor,
ElementwiseTensorDivOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_mul_tensor,
ElementwiseTensorMulOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_max_tensor,
ElementwiseTensorMaxOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_min_tensor,
ElementwiseTensorMinOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_pow_tensor,
ElementwiseTensorPowOpConverter);
...@@ -38,7 +38,7 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides, ...@@ -38,7 +38,7 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
} }
// indata c * k // indata c * k
// Reorder the data layout from CK to KC. // Reorder the data layout from CK to KC.
void ReorderCKtoKC(TensorRTEngine::Weight& iweights, void ReorderCKtoKC(TensorRTEngine::Weight& iweights, // NOLINT
TensorRTEngine::Weight* oweights) { TensorRTEngine::Weight* oweights) {
int c = iweights.dims[0]; int c = iweights.dims[0];
int k = iweights.dims[1]; int k = iweights.dims[1];
......
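As a standalone sketch of what the CK-to-KC reorder above does (independent of TensorRTEngine::Weight), transposing a row-major c x k buffer into k x c order looks like this:
// For c = 2, k = 3: {a0,a1,a2, b0,b1,b2} -> {a0,b0, a1,b1, a2,b2}.
void ReorderCKtoKCSketch(const float* ck, float* kc, int c, int k) {
  for (int i = 0; i < c; ++i) {
    for (int j = 0; j < k; ++j) {
      kc[j * c + i] = ck[i * k + j];
    }
  }
}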
...@@ -55,6 +55,31 @@ class OpConverter { ...@@ -55,6 +55,31 @@ class OpConverter {
it = Registry<OpConverter>::Lookup("fc"); it = Registry<OpConverter>::Lookup("fc");
} }
} }
if (op_desc.Type().find("elementwise") != std::string::npos) {
static std::unordered_set<std::string> add_tensor_op_set{
"add", "mul", "sub", "div", "max", "min", "pow"};
// TODO(xingzhaolong): all mul, sub, div
// static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
// "sub", "div"};
static std::unordered_set<std::string> add_weight_op_set{"add"};
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
int op_type_len = op_desc.Type().size();
std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len);
std::string Y = op_desc.Input("Y")[0];
if (parameters.count(Y)) {
PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0,
"Unsupported elementwise type " + op_type);
it =
Registry<OpConverter>::Lookup("elementwise_" + op_type + "_weight");
} else {
PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
"Unsupported elementwise type " + op_type);
it =
Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor");
}
}
if (!it) { if (!it) {
it = Registry<OpConverter>::Lookup(op_desc.Type()); it = Registry<OpConverter>::Lookup(op_desc.Type());
} }
......
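The lookup above derives the converter key from the last three characters of the op type plus whether Y is a persistable parameter (only "add" is registered on the weight path so far); a minimal sketch of that naming rule:
// Sketch of the elementwise converter-key naming used above, e.g.
// ("elementwise_add", Y is a parameter) -> "elementwise_add_weight"
// ("elementwise_mul", Y is a tensor)    -> "elementwise_mul_tensor"
#include <string>
std::string ElementwiseConverterKey(const std::string& op_type,
                                    bool y_is_parameter) {
  const std::string kind = op_type.substr(op_type.size() - 3);  // add, mul, ...
  return "elementwise_" + kind + (y_is_parameter ? "_weight" : "_tensor");
}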
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(conv2d_op, test) {
std::unordered_set<std::string> parameters({"conv2d-Y"});
framework::Scope scope;
TRTConvertValidation validator(5, parameters, scope, 1 << 15);
validator.DeclInputVar("conv2d-X", nvinfer1::Dims3(2, 5, 5));
validator.DeclParamVar("conv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
validator.DeclOutputVar("conv2d-Out", nvinfer1::Dims3(3, 5, 5));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("conv2d");
desc.SetInput("Input", {"conv2d-X"});
desc.SetInput("Filter", {"conv2d-Y"});
desc.SetOutput("Output", {"conv2d-Out"});
const std::vector<int> strides({1, 1});
const std::vector<int> paddings({1, 1});
const std::vector<int> dilations({1, 1});
const int groups = 1;
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("dilations", dilations);
desc.SetAttr("groups", groups);
validator.SetOp(*desc.Proto());
validator.Execute(3);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(conv2d);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(elementwise_op, add_weight_test) {
std::unordered_set<std::string> parameters({"elementwise_add-Y"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1 << 15);
validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1));
validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("elementwise_add");
desc.SetInput("X", {"elementwise_add-X"});
desc.SetInput("Y", {"elementwise_add-Y"});
desc.SetOutput("Out", {"elementwise_add-Out"});
int axis = 1;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(8);
}
TEST(elementwise_op, add_tensor_test) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
TRTConvertValidation validator(8, parameters, scope, 1 << 15);
validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3));
validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("elementwise_add");
desc.SetInput("X", {"elementwise_add-X"});
desc.SetInput("Y", {"elementwise_add-Y"});
desc.SetOutput("Out", {"elementwise_add-Out"});
// the default axis of the elementwise op is -1
validator.SetOp(*desc.Proto());
validator.Execute(8);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(elementwise_add);
...@@ -25,12 +25,42 @@ TEST(OpConverter, ConvertBlock) { ...@@ -25,12 +25,42 @@ TEST(OpConverter, ConvertBlock) {
framework::ProgramDesc prog; framework::ProgramDesc prog;
auto* block = prog.MutableBlock(0); auto* block = prog.MutableBlock(0);
auto* conv2d_op = block->AppendOp(); auto* conv2d_op = block->AppendOp();
// init trt engine
cudaStream_t stream_;
std::unique_ptr<TensorRTEngine> engine_;
engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_));
engine_->InitNetwork();
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3(2, 5, 5));
conv2d_op->SetType("conv2d"); conv2d_op->SetType("conv2d");
conv2d_op->SetInput("Input", {"conv2d-X"});
conv2d_op->SetInput("Filter", {"conv2d-Y"});
conv2d_op->SetOutput("Output", {"conv2d-Out"});
OpConverter converter; const std::vector<int> strides({1, 1});
const std::vector<int> paddings({1, 1});
const std::vector<int> dilations({1, 1});
const int groups = 1;
conv2d_op->SetAttr("strides", strides);
conv2d_op->SetAttr("paddings", paddings);
conv2d_op->SetAttr("dilations", dilations);
conv2d_op->SetAttr("groups", groups);
// init scope
framework::Scope scope; framework::Scope scope;
converter.ConvertBlock(*block->Proto(), {}, scope, std::vector<int> dim_vec = {3, 2, 3, 3};
nullptr /*TensorRTEngine*/); auto* x = scope.Var("conv2d-Y");
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
OpConverter converter;
converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope,
engine_.get() /*TensorRTEngine*/);
} }
} // namespace tensorrt } // namespace tensorrt
......
...@@ -149,7 +149,7 @@ class TRTConvertValidation { ...@@ -149,7 +149,7 @@ class TRTConvertValidation {
cudaStreamSynchronize(*engine_->stream()); cudaStreamSynchronize(*engine_->stream());
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
const size_t output_space_size = 2000; const size_t output_space_size = 3000;
for (const auto& output : op_desc_->OutputArgumentNames()) { for (const auto& output : op_desc_->OutputArgumentNames()) {
std::vector<float> fluid_out; std::vector<float> fluid_out;
std::vector<float> trt_out(output_space_size); std::vector<float> trt_out(output_space_size);
......
...@@ -20,10 +20,10 @@ limitations under the License. */ ...@@ -20,10 +20,10 @@ limitations under the License. */
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
DEFINE_bool(cudnn_deterministic, true, DEFINE_bool(cudnn_deterministic, false,
"Whether allow using an autotuning algorithm for convolution " "Whether allow using an autotuning algorithm for convolution "
"operator. The autotuning algorithm may be non-deterministic. If " "operator. The autotuning algorithm may be non-deterministic. If "
"false, the algorithm is deterministic."); "true, the algorithm is deterministic.");
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -272,7 +272,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -272,7 +272,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
if (input_grad) { if (input_grad) {
if (FLAGS_cudnn_deterministic) { if (!FLAGS_cudnn_deterministic) {
CUDNN_ENFORCE( CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc, handle, cudnn_filter_desc,
...@@ -297,7 +297,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -297,7 +297,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} }
if (filter_grad) { if (filter_grad) {
if (FLAGS_cudnn_deterministic) { if (!FLAGS_cudnn_deterministic) {
CUDNN_ENFORCE( CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
handle, cudnn_input_desc, cudnn_output_grad_desc, handle, cudnn_input_desc, cudnn_output_grad_desc,
......
...@@ -55,7 +55,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { ...@@ -55,7 +55,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive( std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p, const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto src_pd = conv_bwd_weights_pd_->src_primitive_desc(); auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(src_pd, user_pd, user_memory_p, return this->AcquireMemory(src_pd, user_pd, user_memory_p,
...@@ -64,7 +64,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { ...@@ -64,7 +64,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive( std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p, const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc(); auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
...@@ -80,7 +80,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { ...@@ -80,7 +80,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive( std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p, const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc(); auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
...@@ -89,7 +89,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { ...@@ -89,7 +89,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive( std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
const std::shared_ptr<mkldnn::memory> user_weights_memory_p, const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
std::vector<mkldnn::primitive>& pipeline) { std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc(); auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
auto user_pd = user_weights_memory_p->get_primitive_desc(); auto user_pd = user_weights_memory_p->get_primitive_desc();
return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
...@@ -109,7 +109,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { ...@@ -109,7 +109,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive( std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p, const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto src_pd = conv_pd_->src_primitive_desc(); auto src_pd = conv_pd_->src_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
...@@ -118,7 +118,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { ...@@ -118,7 +118,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive( std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_weights_memory_p, const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
std::vector<mkldnn::primitive>& pipeline) { std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
auto weights_pd = conv_pd_->weights_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc();
return this->AcquireMemory(weights_pd, user_weights_pd, return this->AcquireMemory(weights_pd, user_weights_pd,
...@@ -197,12 +197,12 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { ...@@ -197,12 +197,12 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
// Generate keys for storing/retrieving primitives for this operator // Generate keys for storing/retrieving primitives for this operator
// TODO(jczaja): Make hashing function more optimal // TODO(jczaja): Make hashing function more optimal
static std::string GetHash(memory::dims& input_dims, static std::string GetHash(memory::dims& input_dims, // NOLINT
memory::dims& weights_dims, memory::dims& weights_dims, // NOLINT
std::vector<int>& strides, std::vector<int>& strides, // NOLINT
std::vector<int>& paddings, std::vector<int>& paddings, // NOLINT
std::vector<int>& dilations, int groups, std::vector<int>& dilations, // NOLINT
const std::string& suffix) { int groups, const std::string& suffix) {
return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) + return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
dims2str(paddings) + dims2str(dilations) + std::to_string(groups) + dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
suffix; suffix;
......
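Aside: the GetHash change above only adds // NOLINT markers; the key itself is still a plain concatenation of the shapes and attributes that identify a cached MKLDNN primitive. A standalone approximation of that composition, where DimsToStr is a hypothetical stand-in for the handler's dims2str helper (illustrative only, not the Paddle implementation):

#include <string>
#include <vector>

// Hypothetical helper mirroring dims2str: concatenate dims separated by ','.
static std::string DimsToStr(const std::vector<int>& dims) {
  std::string s;
  for (int d : dims) s += std::to_string(d) + ",";
  return s;
}

// Illustrative cache key: input dims + weight dims + strides + paddings +
// dilations + groups + a caller-provided suffix.
static std::string MakeConvKey(const std::vector<int>& in,
                               const std::vector<int>& w,
                               const std::vector<int>& strides,
                               const std::vector<int>& paddings,
                               const std::vector<int>& dilations, int groups,
                               const std::string& suffix) {
  return DimsToStr(in) + DimsToStr(w) + DimsToStr(strides) +
         DimsToStr(paddings) + DimsToStr(dilations) + std::to_string(groups) +
         suffix;
}
// e.g. MakeConvKey({1,3,224,224}, {64,3,3,3}, {1,1}, {1,1}, {1,1}, 1, "@conv")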
...@@ -47,12 +47,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> { ...@@ -47,12 +47,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
auto x_dims = x->dims(); auto x_dims = x->dims();
auto y_dims = y->dims(); auto y_dims_untrimed = y->dims();
auto z_dims = z->dims(); auto z_dims = z->dims();
// Execute default elementwise_add operator when // Execute default elementwise_add operator when
// broadcast operations need to be performed. // broadcast operations need to be performed.
if (x_dims != y_dims) { if (x_dims != y_dims_untrimed) {
auto sum_func = [](T a, T b) -> T { return a + b; }; auto sum_func = [](T a, T b) -> T { return a + b; };
TransformFunctor<decltype(sum_func), T, TransformFunctor<decltype(sum_func), T,
...@@ -62,11 +62,11 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> { ...@@ -62,11 +62,11 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::CPUDeviceContext>(), ctx.template device_context<paddle::platform::CPUDeviceContext>(),
sum_func); sum_func);
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)"); "Axis should be in range [0, x_dims)");
trim_trailing_singular_dims(&y_dims); auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
axis = (y_dims.size() == 0) ? x_dims.size() : axis; axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post; int pre, n, post;
...@@ -88,7 +88,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> { ...@@ -88,7 +88,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
"Wrong layout/format set for Y tensor"); "Wrong layout/format set for Y tensor");
std::vector<int> src_x_tz = framework::vectorize2int(x_dims); std::vector<int> src_x_tz = framework::vectorize2int(x_dims);
std::vector<int> src_y_tz = framework::vectorize2int(y_dims); std::vector<int> src_y_tz = framework::vectorize2int(y_dims_untrimed);
std::vector<int> dst_tz = framework::vectorize2int(z_dims); std::vector<int> dst_tz = framework::vectorize2int(z_dims);
std::vector<memory::primitive_desc> srcs_pd; std::vector<memory::primitive_desc> srcs_pd;
...@@ -142,36 +142,39 @@ class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> { ...@@ -142,36 +142,39 @@ class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out")); auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
// skip out, x, y;
// dout's length is greater than or equal to that of dx and dy.
auto* out = dout;
auto *x = dout, *y = dout;
auto set_mkldnn_format = [](Tensor* in, const Tensor* out) { auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
in->set_layout(DataLayout::kMKLDNN); in->set_layout(DataLayout::kMKLDNN);
in->set_format(out->format()); in->set_format(out->format());
}; };
if (x->dims() == y->dims()) { if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) {
auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx); if (dx->dims() == dy->dims()) {
if (dx) { auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
blas.VCOPY(dout->numel(), dout->data<T>(), if (dx) {
dx->mutable_data<T>(ctx.GetPlace())); blas.VCOPY(dout->numel(), dout->data<T>(),
set_mkldnn_format(dx, dout); dx->mutable_data<T>(ctx.GetPlace()));
} set_mkldnn_format(dx, dout);
}
if (dy) {
blas.VCOPY(dout->numel(), dout->data<T>(), if (dy) {
dy->mutable_data<T>(ctx.GetPlace())); blas.VCOPY(dout->numel(), dout->data<T>(),
set_mkldnn_format(dy, dout); dy->mutable_data<T>(ctx.GetPlace()));
set_mkldnn_format(dy, dout);
}
} }
} else { } else {
// Execute default kernel when broadcast is needed // Execute default kernel when broadcast is needed
ElemwiseGradCompute<paddle::platform::CPUDeviceContext, T, ElemwiseExplicitGradCompute<paddle::platform::CPUDeviceContext, T,
IdentityGrad<T>, IdentityGrad<T>>( IdentityGrad<T>, IdentityGrad<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(), ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
IdentityGrad<T>()); IdentityGrad<T>());
} }
......
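Note: the fast path above relies on the fact that for Out = X + Y with matching shapes, dX = dY = dOut elementwise, so the gradient kernel can simply copy dout into dx and dy (blas.VCOPY) instead of running the generic broadcast machinery. A minimal self-contained sketch of that identity (plain C++, not the Paddle kernel itself):

#include <vector>

// For z = x + y with identical shapes, dL/dx = dL/dy = dL/dz element by element.
void AddGradSameShape(const std::vector<float>& dout, std::vector<float>* dx,
                      std::vector<float>* dy) {
  if (dx) *dx = dout;  // straight copy, like blas.VCOPY above
  if (dy) *dy = dout;
}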
...@@ -15,7 +15,9 @@ limitations under the License. */ ...@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise_add_op.h"
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_ELEMWISE_OP(elementwise_add, "Add", "Out = X + Y"); REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add);
REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out",
"X");
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_add, elementwise_add,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>, ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -95,9 +95,10 @@ void default_elementwise_add_grad(const framework::ExecutionContext& ctx, ...@@ -95,9 +95,10 @@ void default_elementwise_add_grad(const framework::ExecutionContext& ctx,
framework::Tensor* dy) { framework::Tensor* dy) {
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElemwiseGradCompute<DeviceContext, T, IdentityGrad<T>, IdentityGrad<T>>( ElemwiseExplicitGradCompute<DeviceContext, T, IdentityGrad<T>,
ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(), IdentityGrad<T>>(ctx, *x, *y, *out, *dout, axis,
IdentityGrad<T>()); dx, dy, IdentityGrad<T>(),
IdentityGrad<T>());
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -140,14 +141,15 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> { ...@@ -140,14 +141,15 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out")); auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
// skip out, x, y
auto* out = dout;
auto *x = dout, *y = dout;
if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) { if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr &&
dy != nullptr && (dx->dims() == dy->dims())) {
elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy); elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
} else { } else {
default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
......
...@@ -15,7 +15,9 @@ limitations under the License. */ ...@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise_div_op.h"
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y"); REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y");
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_div, elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>, ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -78,7 +78,9 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -78,7 +78,9 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() final { void Make() final {
AddInput("X", "(Tensor), The first input tensor of elementwise op."); AddInput("X", "(Tensor), The first input tensor of elementwise op.");
AddInput("Y", "(Tensor), The second input tensor of elementwise op."); AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
AddOutput("Out", "The output of elementwise op.").Reuse("X"); // AddOutput("SavedShape", "(Tensor), save X, Y shape for grad to save
// memory.").AsIntermediate();
AddOutput("Out", "The output of elementwise op.");
AddAttr<int>("axis", AddAttr<int>("axis",
"(int, default -1). The start dimension index " "(int, default -1). The start dimension index "
"for broadcasting Y onto X.") "for broadcasting Y onto X.")
...@@ -125,11 +127,13 @@ But the output only shares the LoD information with the input $X$. ...@@ -125,11 +127,13 @@ But the output only shares the LoD information with the input $X$.
)DOC", )DOC",
GetName(), GetEquation())); GetName(), GetEquation()));
SetReuse();
} }
protected: protected:
virtual std::string GetName() const = 0; virtual std::string GetName() const = 0;
virtual std::string GetEquation() const = 0; virtual std::string GetEquation() const = 0;
virtual void SetReuse() {}
}; };
class ElementwiseOpGrad : public framework::OperatorWithKernel { class ElementwiseOpGrad : public framework::OperatorWithKernel {
...@@ -162,8 +166,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ...@@ -162,8 +166,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
auto input_data_type = auto input_data_type = framework::ToDataType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()); ctx.Input<Tensor>(framework::GradVarName("Out"))->type());
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (platform::CanMKLDNNBeUsed(ctx)) { if (platform::CanMKLDNNBeUsed(ctx)) {
...@@ -175,9 +179,58 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ...@@ -175,9 +179,58 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
return framework::OpKernelType(input_data_type, ctx.GetPlace()); return framework::OpKernelType(input_data_type, ctx.GetPlace());
} }
}; };
// For the Add and Sub ops, X and Out are not needed.
class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
public:
using operators::ElementwiseOpGrad::ElementwiseOpGrad;
using operators::ElementwiseOpGrad::GetExpectedKernelType;
using Tensor = framework::Tensor;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
auto x_grad_name = framework::GradVarName("X");
if (ctx->HasOutput(x_grad_name)) {
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
ctx->SetOutputDim(x_grad_name, out_dims);
}
auto y_grad_name = framework::GradVarName("Y");
if (ctx->HasOutput(y_grad_name)) {
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
auto y_dims = ctx->GetInputDim("Y");
ctx->SetOutputDim(y_grad_name, y_dims);
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
/*
*/
#define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name) \
class kernel_type##GradMaker \
: public paddle::framework::SingleGradOpDescMaker { \
public: \
using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \
\
protected: \
std::unique_ptr<paddle::framework::OpDesc> Apply() const override { \
auto* op = new paddle::framework::OpDesc(); \
op->SetType(#kernel_type "_grad"); \
op->SetInput("Y", Input("Y")); \
op->SetInput(::paddle::framework::GradVarName("Out"), \
OutputGrad("Out")); \
op->SetAttrMap(Attrs()); \
op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X")); \
op->SetOutput(::paddle::framework::GradVarName("Y"), InputGrad("Y")); \
return std::unique_ptr<::paddle::framework::OpDesc>(op); \
} \
}
#define REGISTER_ELEMWISE_OP(op_type, op_name, equation) \ #define REGISTER_ELEMWISE_OP(op_type, op_name, equation) \
class __ElemwiseOp##op_type##Maker__ \ class __ElemwiseOp##op_type##Maker__ \
: public ::paddle::operators::ElementwiseOpMaker { \ : public ::paddle::operators::ElementwiseOpMaker { \
...@@ -190,3 +243,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ...@@ -190,3 +243,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
::paddle::operators::ElementwiseOpInferVarType, \ ::paddle::operators::ElementwiseOpInferVarType, \
::paddle::framework::DefaultGradOpDescMaker<true>); \ ::paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad) REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad)
#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...) \
class __ElemwiseOp##op_type##Maker__ \
: public ::paddle::operators::ElementwiseOpMaker { \
protected: \
virtual std::string GetName() const { return op_name; } \
virtual std::string GetEquation() const { return equation; } \
virtual void SetReuse() { Reuse(__VA_ARGS__); } \
}; \
REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \
__ElemwiseOp##op_type##Maker__, \
::paddle::operators::ElementwiseOpInferVarType, \
op_type##GradMaker); \
REGISTER_OPERATOR(op_type##_grad, \
::paddle::operators::ElementwiseOpExplicitGrad)
...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <glog/logging.h>
#include <algorithm> #include <algorithm>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
...@@ -65,17 +67,21 @@ inline void get_mid_dims(const framework::DDim& x_dims, ...@@ -65,17 +67,21 @@ inline void get_mid_dims(const framework::DDim& x_dims,
} }
} }
inline void trim_trailing_singular_dims(framework::DDim* dims) { inline framework::DDim trim_trailing_singular_dims(
const framework::DDim& dims) {
// Remove trailing dimensions of size 1 for y // Remove trailing dimensions of size 1 for y
auto actual_dims_size = dims->size(); auto actual_dims_size = dims.size();
for (; actual_dims_size != 0; --actual_dims_size) { for (; actual_dims_size != 0; --actual_dims_size) {
if ((*dims)[actual_dims_size - 1] != 1) break; if (dims[actual_dims_size - 1] != 1) break;
} }
if (actual_dims_size != dims->size()) {
auto actual_dims = framework::vectorize(*dims); std::vector<int> trim_dims;
actual_dims.resize(actual_dims_size); trim_dims.resize(actual_dims_size);
*dims = framework::make_ddim(actual_dims); for (int i = 0; i < actual_dims_size; ++i) {
trim_dims[i] = dims[i];
} }
framework::DDim actual_dims = framework::make_ddim(trim_dims);
return actual_dims;
}                                                                                 }
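Worked example for the rewritten helper above: dims (3, 4, 1, 1) trim to (3, 4), while (1, 1) trim to an empty shape, which is why the callers reset axis when y_dims.size() == 0. A standalone sketch of the same idea on plain vectors (illustrative only, not the DDim version):

#include <vector>

// Drop trailing dimensions of size 1, e.g. {3, 4, 1, 1} -> {3, 4}.
std::vector<int> TrimTrailingOnes(const std::vector<int>& dims) {
  int n = static_cast<int>(dims.size());
  while (n > 0 && dims[n - 1] == 1) --n;
  return std::vector<int>(dims.begin(), dims.begin() + n);
}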
template <typename T, typename DeviceContext> template <typename T, typename DeviceContext>
...@@ -456,6 +462,71 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x, ...@@ -456,6 +462,71 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x,
#endif #endif
template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
void ElemwiseGradComputeNoBroadcast(
const framework::ExecutionContext& ctx, const framework::DDim& x_dim,
const framework::DDim& y_dim, const framework::Tensor& x,
const framework::Tensor& y, const framework::Tensor& out,
const framework::Tensor& dout, int axis, framework::Tensor* dx,
framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) {
size_t N = static_cast<size_t>(framework::product(x_dim));
platform::ForRange<DeviceContext> for_range(
ctx.template device_context<DeviceContext>(), N);
for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())});
}
template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
void ElemwiseGradComputeWithBroadcast(
const framework::ExecutionContext& ctx, const framework::DDim& x_dim,
const framework::DDim& y_dim_untrimed, const framework::Tensor& x,
const framework::Tensor& y, const framework::Tensor& out,
const framework::Tensor& dout, int axis, framework::Tensor* dx,
framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) {
axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis);
auto y_dim = trim_trailing_singular_dims(y_dim_untrimed);
axis = (y_dim.size() == 0) ? x_dim.size() : axis;
int pre, n, post;
get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
if (post == 1) {
int h = pre;
int w = n;
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef __NVCC__
ElemwiseGradBroadcast1CUDA(
ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op, dy_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
#endif
} else {
ElemwiseGradBroadcast1CPU(
x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op,
dy_op, dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
}
} else {
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef __NVCC__
ElemwiseGradBroadcast2CUDA(
ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post, dx_op,
dy_op, dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
#endif
} else {
ElemwiseGradBroadcast2CPU(
x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post,
dx_op, dy_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
}
}
}
template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP> template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
void ElemwiseGradCompute(const framework::ExecutionContext& ctx, void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
const framework::Tensor& x, const framework::Tensor& y, const framework::Tensor& x, const framework::Tensor& y,
...@@ -463,63 +534,50 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, ...@@ -463,63 +534,50 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
const framework::Tensor& dout, int axis, const framework::Tensor& dout, int axis,
framework::Tensor* dx, framework::Tensor* dy, framework::Tensor* dx, framework::Tensor* dy,
DX_OP dx_op, DY_OP dy_op) { DX_OP dx_op, DY_OP dy_op) {
const framework::DDim x_dim = x.dims();
const framework::DDim y_dim = y.dims();
if (x.dims() == y.dims()) { if (x.dims() == y.dims()) {
size_t N = static_cast<size_t>(framework::product(x.dims())); ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
platform::ForRange<DeviceContext> for_range( ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
ctx.template device_context<DeviceContext>(), N);
for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())});
} else { // Y is a scalar } else { // Y is a scalar
auto x_dim = x.dims(); ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
auto y_dim = y.dims(); ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
}
axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); }
trim_trailing_singular_dims(&y_dim);
axis = (y_dim.size() == 0) ? x_dim.size() : axis; // NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub.
// explicit gradient can cut off X, Y, Out from gradient op
int pre, n, post; // In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse
get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); // elementwise code.
if (post == 1) { template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
int h = pre; void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx,
int w = n; const framework::Tensor& x,
if (platform::is_gpu_place(ctx.GetPlace())) { const framework::Tensor& y,
#ifdef __NVCC__ const framework::Tensor& out,
ElemwiseGradBroadcast1CUDA( const framework::Tensor& dout, int axis,
ctx.template device_context<DeviceContext>().stream(), x.data<T>(), framework::Tensor* dx, framework::Tensor* dy,
y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op, dy_op, DX_OP dx_op, DY_OP dy_op) {
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()), if (dy == nullptr) {
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())); const framework::DDim dx_dims = dout.dims();
#endif auto dy_dims = dx_dims;
} else { ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
ElemwiseGradBroadcast1CPU( ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), h, w, } else {
dx_op, dy_op, if (dout.dims() == dy->dims()) {
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()), const framework::DDim dx_dims = dout.dims();
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())); const framework::DDim dy_dims = dy->dims();
} ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
} else { ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
if (platform::is_gpu_place(ctx.GetPlace())) { } else { // Y is a scalar
#ifdef __NVCC__ auto dx_dims = dout.dims();
ElemwiseGradBroadcast2CUDA( const framework::DDim dy_dims = dy->dims();
ctx.template device_context<DeviceContext>().stream(), x.data<T>(), ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post, dx_op, ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
dy_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
#endif
} else {
ElemwiseGradBroadcast2CPU(
x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), pre, n,
post, dx_op, dy_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
}
} }
} }
} }
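As the NOTE above says, the add and sub gradient functors never read X, Y or Out, which is what makes it safe for ElemwiseExplicitGradCompute's callers to pass dout in as a stand-in for all three. A minimal sketch of why that works (hypothetical functor names, not Paddle's):

// For Out = X + Y: dX = dOut, dY = dOut; neither depends on X, Y or Out.
struct IdGrad {
  float operator()(float /*x*/, float /*y*/, float /*out*/, float dout) const {
    return dout;
  }
};

// For Out = X - Y: dX = dOut, dY = -dOut; again independent of X, Y and Out.
struct NegGrad {
  float operator()(float /*x*/, float /*y*/, float /*out*/, float dout) const {
    return -dout;
  }
};
// Because x, y and out are never read, any tensor of the right length
// (here: dout itself) can be passed in their place.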
// Deprecated
template <typename DeviceContext, typename T, typename functor, template <typename DeviceContext, typename T, typename functor,
typename broadcastfunctor, typename broadcast2functor> typename broadcastfunctor, typename broadcast2functor>
void ElementwiseGradCompute(const framework::ExecutionContext& ctx, void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
...@@ -547,7 +605,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx, ...@@ -547,7 +605,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
} }
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
trim_trailing_singular_dims(&y_dims); trim_trailing_singular_dims(y_dims);
axis = (y_dims.size() == 0) ? x_dims.size() : axis; axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post; int pre, n, post;
...@@ -574,19 +632,19 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx, ...@@ -574,19 +632,19 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
x, y, z, ctx.template device_context<DeviceContext>(), func); x, y, z, ctx.template device_context<DeviceContext>(), func);
auto x_dims = x->dims(); auto x_dims = x->dims();
auto y_dims = y->dims(); auto y_dims_untrimed = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
"Rank of first input must be >= rank of second input."); "Rank of first input must be >= rank of second input.");
if (x_dims == y_dims) { if (x_dims == y_dims_untrimed) {
functor.Run(); functor.Run();
return; return;
} }
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)"); "Axis should be in range [0, x_dims)");
trim_trailing_singular_dims(&y_dims); auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
axis = (y_dims.size() == 0) ? x_dims.size() : axis; axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post; int pre, n, post;
......
...@@ -15,7 +15,10 @@ limitations under the License. */ ...@@ -15,7 +15,10 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise_sub_op.h"
#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_ELEMWISE_OP(elementwise_sub, "Sub", "Out = X - Y"); REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub);
REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out",
"X");
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_sub, elementwise_sub,
ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>, ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); ...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
...@@ -55,14 +55,15 @@ class ElementwiseSubGradKernel : public framework::OpKernel<T> { ...@@ -55,14 +55,15 @@ class ElementwiseSubGradKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out")); auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElemwiseGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>( // skip out, x, y
auto* out = dout;
auto *x = dout, *y = dout;
ElemwiseExplicitGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>()); ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>());
} }
}; };
......
...@@ -19,12 +19,17 @@ limitations under the License. */ ...@@ -19,12 +19,17 @@ limitations under the License. */
#include <thread> // NOLINT #include <thread> // NOLINT
#include <vector> #include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_int32(listen_and_serv_profile_period, 0,
"the period of listen_and_serv to do profile");
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -122,7 +127,18 @@ void ListenAndServOp::RunSyncLoop( ...@@ -122,7 +127,18 @@ void ListenAndServOp::RunSyncLoop(
std::shared_ptr<framework::ExecutorPrepareContext>(nullptr)); std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
rpc_service_->ResetBarrierCounter(); rpc_service_->ResetBarrierCounter();
int32_t profile_step = 0;
while (true) { while (true) {
PADDLE_ENFORCE_LE(profile_step, FLAGS_listen_and_serv_profile_period,
"profile_step should not be larger then "
"FLAGS_listen_and_serv_profile_period");
if (FLAGS_listen_and_serv_profile_period > 0) {
if (profile_step == 0) {
auto pf_state = paddle::platform::ProfilerState::kCPU;
paddle::platform::EnableProfiler(pf_state);
}
}
// Get from multiple trainers, we don't care about the order in which // Get from multiple trainers, we don't care about the order in which
// the gradients arrive, just add suffix 0~n and merge the gradient. // the gradients arrive, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(distributed::kRequestSend); rpc_service_->SetCond(distributed::kRequestSend);
...@@ -164,6 +180,15 @@ void ListenAndServOp::RunSyncLoop( ...@@ -164,6 +180,15 @@ void ListenAndServOp::RunSyncLoop(
// reset received sparse vars to avoid reuse it in the next mini-batch // reset received sparse vars to avoid reuse it in the next mini-batch
dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get()) dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
->ResetSparseVarRecorder(); ->ResetSparseVarRecorder();
if (FLAGS_listen_and_serv_profile_period > 0) {
if (profile_step == FLAGS_listen_and_serv_profile_period) {
paddle::platform::DisableProfiler(
paddle::platform::EventSortingKey::kTotal, "/dev/null");
profile_step = 0;
} else {
profile_step++;
}
}
} // while(true) } // while(true)
} }
......
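Note on the new flag-guarded blocks above: profiling is enabled when profile_step is 0, and once the counter reaches FLAGS_listen_and_serv_profile_period the profile is dumped, the profiler disabled, and the counter reset, so the server produces one profile window per N sync iterations. A tiny standalone sketch of that windowing logic (profiler calls stubbed out; illustrative only):

#include <cstdio>

// Illustrative only: profile the loop in windows of `period` iterations.
void RunLoop(int iterations, int period) {
  int profile_step = 0;
  for (int it = 0; it < iterations; ++it) {
    if (period > 0 && profile_step == 0) {
      std::printf("enable profiler\n");  // stands in for EnableProfiler
    }
    // ... one synchronous training step would run here ...
    if (period > 0) {
      if (profile_step == period) {
        std::printf("dump + disable profiler\n");  // stands in for DisableProfiler
        profile_step = 0;
      } else {
        ++profile_step;
      }
    }
  }
}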
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -35,61 +36,18 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, ...@@ -35,61 +36,18 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
PADDLE_ENFORCE(im.dims().size() == 3); PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col->dims().size() == 5); PADDLE_ENFORCE(col->dims().size() == 5);
int im_channels = im.dims()[0];
int im_height = im.dims()[1];
int im_width = im.dims()[2];
int filter_height = col->dims()[1];
int filter_width = col->dims()[2];
int output_height = col->dims()[3];
int output_width = col->dims()[4];
int channels_col = im_channels * filter_height * filter_width;
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
// TODO(TJ): change me to template
// further optimize:
// 1. padding != 1
// 2. could also support stride_h != 1
if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
dilation[1] == 1 && padding[0] == 0 && padding[1] == 0) { dilation[1] == 1) {
int col_matrix_width = output_width * output_height; if (padding[0] == 0 && padding[1] == 0) {
size_t copy_size = sizeof(T) * output_width; im2col_sh1sw1dh1dw1ph0pw0<T>(im, col);
for (int oh = 0; oh < output_height; ++oh) { return;
const T* im_data_start = im_data + oh * im_width; } else if (padding[0] == 1 && padding[1] == 1) {
T* dst_data = col_data + oh * output_width; im2col_sh1sw1dh1dw1ph1pw1<T>(im, col);
for (int ic = 0; ic < im_channels; ++ic) { return;
const T* src_data = im_data_start + ic * im_height * im_width;
for (int kh = 0; kh < filter_height; ++kh) {
for (int kw = 0; kw < filter_width; ++kw) {
std::memcpy(dst_data, src_data + kw, copy_size);
dst_data = dst_data + col_matrix_width;
}
src_data = src_data + im_width;
}
}
}
return;
}
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int c_im = c / (filter_width * filter_height);
for (int h = 0; h < output_height; ++h) {
int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
for (int w = 0; w < output_width; ++w) {
int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
int col_idx = (c * output_height + h) * output_width + w;
int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
im_col_idx < 0 || im_col_idx >= im_width)
? static_cast<T>(0)
: im_data[im_idx];
}
} }
// TODO(TJ): complete padding >=2
} }
im2col_common<T>(im, dilation, stride, padding, col);
} }
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace math {
/**
* The most common im2col algorithm.
 * Supports dilation, stride and padding.
*/
template <typename T>
inline void im2col_common(const framework::Tensor& im,
const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding,
framework::Tensor* col) {
int im_channels = im.dims()[0];
int im_height = im.dims()[1];
int im_width = im.dims()[2];
int filter_height = col->dims()[1];
int filter_width = col->dims()[2];
int output_height = col->dims()[3];
int output_width = col->dims()[4];
int channels_col = im_channels * filter_height * filter_width;
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int c_im = c / (filter_width * filter_height);
for (int h = 0; h < output_height; ++h) {
int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
for (int w = 0; w < output_width; ++w) {
int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
int col_idx = (c * output_height + h) * output_width + w;
int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
im_col_idx < 0 || im_col_idx >= im_width)
? static_cast<T>(0)
: im_data[im_idx];
}
}
}
}
/**
* im2col algorithm with strides == 1, dilations == 1, paddings == 0
*/
template <typename T>
inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im,
framework::Tensor* col) {
int im_channels = im.dims()[0];
int im_height = im.dims()[1];
int im_width = im.dims()[2];
int filter_height = col->dims()[1];
int filter_width = col->dims()[2];
int output_height = col->dims()[3];
int output_width = col->dims()[4];
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
int col_matrix_width = output_width * output_height;
int im_size = im_height * im_width;
size_t copy_size = sizeof(T) * output_width;
const T* im_data_oh = im_data;
T* dst_data_oh = col_data;
for (int oh = 0; oh < output_height; ++oh) {
const T* src_data_ic = im_data_oh;
T* dst_data = dst_data_oh;
for (int ic = 0; ic < im_channels; ++ic) {
const T* src_data = src_data_ic;
for (int kh = 0; kh < filter_height; ++kh) {
for (int kw = 0; kw < filter_width; ++kw) {
std::memcpy(dst_data, src_data + kw, copy_size);
dst_data = dst_data + col_matrix_width;
}
src_data = src_data + im_width;
}
src_data_ic = src_data_ic + im_size;
}
im_data_oh = im_data_oh + im_width;
dst_data_oh = dst_data_oh + output_width;
}
}
/**
 * im2col algorithm with strides == 1, dilations == 1 and paddings == 1;
 * the filter_width == 1 case has a special implementation
*/
template <typename T>
inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im,
framework::Tensor* col) {
int im_channels = im.dims()[0];
int im_height = im.dims()[1];
int im_width = im.dims()[2];
int filter_height = col->dims()[1];
int filter_width = col->dims()[2];
int output_height = col->dims()[3];
int output_width = col->dims()[4];
constexpr int plh = 1;
constexpr int prh = 1;
constexpr int plw = 1;
constexpr int prw = 1;
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
int im_size = im_height * im_width;
int col_matrix_width = output_width * output_height;
int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow
int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow
// fill height padding
{
size_t copy_size = sizeof(T) * output_width;
T* col_start_l = col_data;
T* col_start_r = col_data + (filter_height - 1) * col_block_fh +
col_matrix_width - output_width;
for (int ic = 0; ic < im_channels; ++ic) {
T* dst_data_l = col_start_l;
T* dst_data_r = col_start_r;
for (int kw = 0; kw < filter_width; ++kw) {
std::memset(dst_data_l, 0, copy_size);
std::memset(dst_data_r, 0, copy_size);
dst_data_l = dst_data_l + col_matrix_width;
dst_data_r = dst_data_r + col_matrix_width;
}
col_start_l = col_start_l + col_block_ic;
col_start_r = col_start_r + col_block_ic;
}
}
auto pad = static_cast<T>(0);
if (filter_width == 1) {
// fill width padding
T* dst_data_ic = col_data;
for (int ic = 0; ic < im_channels; ++ic) {
T* dst_data_kh = dst_data_ic;
for (int kh = 0; kh < filter_height; ++kh) {
T* dst_data = dst_data_kh;
for (int oh = 0; oh < output_height; ++oh) {
*dst_data = pad;
dst_data = dst_data + output_width - 1;
*dst_data = pad;
++dst_data;
}
dst_data_kh = dst_data_kh + col_block_fh;
}
dst_data_ic = dst_data_ic + col_block_ic;
}
// fill core
size_t copy_size = sizeof(T) * (output_width - plw - prw);
for (int oh = 0; oh < output_height; ++oh) {
const T* im_data_start =
im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
T* dst_data = col_data + oh * output_width;
for (int ic = 0; ic < im_channels; ++ic) {
const T* src_data = im_data_start + ic * im_size;
for (int kh = 0; kh < filter_height; ++kh) {
if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
kh > (filter_height - prh - 1))) {
dst_data = dst_data + col_matrix_width;
continue;
}
std::memcpy(dst_data + plw, src_data, copy_size);
dst_data = dst_data + col_matrix_width;
src_data = src_data + im_width;
}
}
}
return;
}
// filter_width != 1
// fill width padding
T* dst_data_ic = col_data;
for (int ic = 0; ic < im_channels; ++ic) {
T* dst_data_kh = dst_data_ic;
for (int kh = 0; kh < filter_height; ++kh) {
for (T* dst_data :
{dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width +
output_width - 1}) {
// TODO(TJ): start from plh to save repeated assignments
for (int oh = 0; oh < output_height; ++oh) {
*dst_data = pad;
dst_data = dst_data + output_width;
}
}
dst_data_kh = dst_data_kh + col_block_fh;
}
dst_data_ic = dst_data_ic + col_block_ic;
}
// TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
// (output_width-1)}
// the length of copy_size equals kw.
for (int oh = 0; oh < output_height; ++oh) {
const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
T* dst_data = col_data + oh * output_width;
for (int ic = 0; ic < im_channels; ++ic) {
const T* src_data = im_data_start + ic * im_size;
for (int kh = 0; kh < filter_height; ++kh) {
if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
kh > (filter_height - prh - 1))) {
dst_data = dst_data + filter_width * col_matrix_width;
continue;
}
// TODO(TJ): reuse plw - kw outside this for loop
// try to unify
for (int kw = 0; kw < plw; ++kw) {
std::memcpy(dst_data + (plw - kw), src_data,
sizeof(T) * (output_width - (plw - kw)));
dst_data = dst_data + col_matrix_width;
}
for (int kw = plw; kw < filter_width - prw; ++kw) {
std::memcpy(dst_data, src_data + (kw - plw),
sizeof(T) * output_width);
dst_data = dst_data + col_matrix_width;
}
int i = 1;
for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
std::memcpy(dst_data, src_data + (kw - plw),
sizeof(T) * (output_width - i));
dst_data = dst_data + col_matrix_width;
}
src_data = src_data + im_width;
}
}
}
}
} // namespace math
} // namespace operators
} // namespace paddle
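All of the helpers in this new header assume the caller has already sized col consistently with the usual im2col shape rule: along each spatial axis, output = (input + 2 * padding - (dilation * (filter - 1) + 1)) / stride + 1; the sh1sw1dh1dw1 variants above are just this formula with stride = dilation = 1. A small sketch that computes it (illustrative, not part of the patch):

// Output extent of im2col / convolution along one axis.
inline int Im2ColOutSize(int input, int filter, int padding, int stride,
                         int dilation) {
  return (input + 2 * padding - (dilation * (filter - 1) + 1)) / stride + 1;
}
// e.g. Im2ColOutSize(5, 3, 1, 1, 1) == 5 and Im2ColOutSize(5, 3, 0, 1, 1) == 3,
// matching the output_height/output_width computed in the tests below.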
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <sys/time.h>
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
template <typename DeviceContext, typename Place> template <typename DeviceContext, typename Place>
void testIm2col() { void testIm2col() {
...@@ -160,82 +162,111 @@ void testIm2col() { ...@@ -160,82 +162,111 @@ void testIm2col() {
delete context; delete context;
} }
void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
paddle::framework::Tensor input;
paddle::framework::Tensor output;
paddle::framework::Tensor ref_output;
std::vector<int> padding({ph, pw});
std::vector<int> stride({1, 1}); // stride_y, stride_x
std::vector<int> dilation({1, 1}); // dilation_y, dilation_x
int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1;
int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1;
float* input_ptr =
input.mutable_data<float>({ic, ih, iw}, paddle::platform::CPUPlace());
for (int i = 0; i < input.numel(); ++i) {
input_ptr[i] = static_cast<float>(i + 1);
}
paddle::platform::CPUPlace place;
paddle::platform::CPUDeviceContext context(place);
output.mutable_data<float>({ic, fh, fw, output_height, output_width}, place);
ref_output.mutable_data<float>({ic, fh, fw, output_height, output_width},
place);
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO,
paddle::platform::CPUDeviceContext, float>
im2col;
im2col(context, input, dilation, stride, padding, &output);
auto ref_im2col = [&](
const paddle::framework::Tensor& im, const std::vector<int>& dilation,
const std::vector<int>& stride, const std::vector<int>& padding,
paddle::framework::Tensor* col) {
int im_channels = im.dims()[0];
int im_height = im.dims()[1];
int im_width = im.dims()[2];
int filter_height = col->dims()[1];
int filter_width = col->dims()[2];
int output_height = col->dims()[3];
int output_width = col->dims()[4];
int channels_col = im_channels * filter_height * filter_width;
const float* im_data = im.data<float>();
float* col_data = col->data<float>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int c_im = c / (filter_width * filter_height);
for (int h = 0; h < output_height; ++h) {
int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
for (int w = 0; w < output_width; ++w) {
int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
int col_idx = (c * output_height + h) * output_width + w;
int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
im_col_idx < 0 || im_col_idx >= im_width)
? 0.f
: im_data[im_idx];
}
}
}
};
ref_im2col(input, dilation, stride, padding, &ref_output);
float* out_cfo_ptr = output.data<float>();
float* out_ref_ptr = ref_output.data<float>();
for (int i = 0; i < output.numel(); ++i) {
EXPECT_EQ(out_cfo_ptr[i], out_ref_ptr[i]);
}
}
TEST(math, im2col) { TEST(math, im2col) {
testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>(); testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
testIm2colCPU(/*ic*/ 3, /*ih*/ 5, /*iw*/ 5, /*fh*/ 3, /*fw*/ 2, /*ph*/ 0,
/*pw*/ 0);
testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1,
/*pw*/ 1);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
testIm2col<paddle::platform::CUDADeviceContext, testIm2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>(); paddle::platform::CUDAPlace>();
#endif #endif
} }
#define PREPARE_IM2COL_CPU \
paddle::platform::CPUPlace place; \
paddle::platform::CPUDeviceContext context(place); \
paddle::framework::Tensor input; \
paddle::framework::Tensor out; \
paddle::framework::Tensor ref; \
std::vector<int> padding({ph, pw}); \
std::vector<int> stride({1, 1}); \
std::vector<int> dilation({1, 1}); \
float* input_ptr = input.mutable_data<float>({ic, ih, iw}, place); \
for (int i = 0; i < input.numel(); ++i) { \
input_ptr[i] = static_cast<float>(i + 1); \
} \
int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; \
int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; \
out.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
ref.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
paddle::operators::math::Im2ColFunctor< \
paddle::operators::math::ColFormat::kCFO, \
paddle::platform::CPUDeviceContext, float> \
im2col
void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
PREPARE_IM2COL_CPU;
im2col(context, input, dilation, stride, padding, &out);
paddle::operators::math::im2col_common<float>(input, dilation, stride,
padding, &ref);
float* ref_data = ref.data<float>();
float* out_data = out.data<float>();
for (int i = 0; i < out.numel(); ++i) {
EXPECT_EQ(out_data[i], ref_data[i]);
}
}
void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
PREPARE_IM2COL_CPU;
constexpr int repeat = 100;
auto GetCurrentMs = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
};
auto t1 = GetCurrentMs();
for (int i = 0; i < repeat; ++i) {
im2col(context, input, dilation, stride, padding, &out);
}
auto t2 = GetCurrentMs();
for (int i = 0; i < repeat; ++i) {
paddle::operators::math::im2col_common<float>(input, dilation, stride,
padding, &ref);
}
auto t3 = GetCurrentMs();
LOG(INFO) << "before: " << (t3 - t2) / repeat
<< ",after: " << (t2 - t1) / repeat
<< ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%";
}
TEST(math, im2col_cputest) {
// padding_h == padding_w
for (int p = 0; p < 4; ++p) {
// width == height
testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 5, /*fh*/ 4, /*fw*/ 4, /*ph*/ p,
/*pw*/ p);
testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ p,
/*pw*/ p);
testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 2, /*ph*/ p,
/*pw*/ p);
// height != width
testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p,
/*pw*/ p);
testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 1, /*fw*/ 3, /*ph*/ p,
/*pw*/ p);
testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 5, /*fh*/ 3, /*fw*/ 1, /*ph*/ p,
/*pw*/ p);
// filter == 1
testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
/*pw*/ p);
testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
/*pw*/ p);
}
// padding_h != padding_w
testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1,
/*pw*/ 2);
// benchmark
for (int p : {0, 1}) {
for (int k : {1, 3, 5}) {
LOG(INFO) << "padding == " << p << ", filter == " << k;
benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k,
/*ph*/ p, /*pw*/ p);
}
}
}
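Aside on the benchmark above: the GetCurrentMs lambda uses gettimeofday from <sys/time.h>, which is POSIX-only. Shown here only as an editorial aside (not part of the patch), a portable equivalent based on std::chrono would be:

#include <chrono>

// Wall-clock milliseconds; portable replacement for the gettimeofday lambda.
inline double GetCurrentMs() {
  auto now = std::chrono::steady_clock::now().time_since_epoch();
  return std::chrono::duration<double, std::milli>(now).count();
}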
...@@ -127,12 +127,6 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -127,12 +127,6 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor). The output tensor of reshape operator."); AddOutput("Out", "(Tensor). The output tensor of reshape operator.");
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"shape", "(std::vector<int>) Target shape of reshape operator."); "shape", "(std::vector<int>) Target shape of reshape operator.");
AddAttr<bool>("inplace",
"(default: false) Change the source tensor's shape without "
"memory copy. When Attr(inplace) is set true, the output "
"tensor shares memory with Input(X), otherwise, a new output "
"tensor is created, and its data are copied from Input(x).")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Reshape Operator. Reshape Operator.
...@@ -233,16 +227,9 @@ class ReshapeKernel { ...@@ -233,16 +227,9 @@ class ReshapeKernel {
"sequence_reshape op."); "sequence_reshape op.");
} }
bool inplace = ctx.Attr<bool>("inplace"); out->mutable_data(ctx.GetPlace(), in->type());
framework::TensorCopySync(*in, ctx.GetPlace(), out);
out->Resize(out_dims); out->Resize(out_dims);
if (!inplace) {
out->mutable_data(ctx.GetPlace(), in->type());
framework::TensorCopySync(*in, ctx.GetPlace(), out);
out->Resize(out_dims);
} else {
out->ShareDataWith(*in);
out->Resize(out_dims);
}
} }
}; };
...@@ -251,19 +238,11 @@ class ReshapeGradKernel { ...@@ -251,19 +238,11 @@ class ReshapeGradKernel {
void operator()(const framework::ExecutionContext &ctx) const { void operator()(const framework::ExecutionContext &ctx) const {
auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out")); auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X")); auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto in_dims = d_x->dims();
d_x->mutable_data(ctx.GetPlace(), d_out->type()); d_x->mutable_data(ctx.GetPlace(), d_out->type());
bool inplace = ctx.Attr<bool>("inplace"); framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
d_x->Resize(in_dims);
auto in_dims = d_x->dims();
if (!inplace) {
framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
ctx.device_context().Wait();
d_x->Resize(in_dims);
} else {
d_x->ShareDataWith(*d_out);
d_x->Resize(in_dims);
}
} }
}; };
......
...@@ -137,7 +137,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { ...@@ -137,7 +137,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
ctx->GetInputDim(framework::GradVarName("Out")), ctx->GetInputDim(framework::GradVarName("Out")),
"Input(Out) and its gradients should have a same shape."); "Input(Out) and its gradients should have a same shape.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->SetOutputDim(framework::GradVarName("X"),
ctx->GetInputDim(framework::GradVarName("Out")));
} }
protected: protected:
...@@ -160,8 +161,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { ...@@ -160,8 +161,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
layout_ = framework::DataLayout::kMKLDNN; layout_ = framework::DataLayout::kMKLDNN;
} }
#endif #endif
auto input_data_type = auto input_data_type = framework::ToDataType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()); ctx.Input<Tensor>(framework::GradVarName("Out"))->type());
if (input_data_type == framework::proto::VarType::FP16) { if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"float16 can only be used on GPU place"); "float16 can only be used on GPU place");
...@@ -172,13 +173,31 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { ...@@ -172,13 +173,31 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
} }
}; };
class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("softmax_grad");
op->SetInput("Out", Output("Out"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::SoftmaxOpGradMaker);
REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>, softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
......
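The switch above (the grad maker passing only Out and Out@GRAD, with the kernel type and dX shape inferred from Out@GRAD instead of X) works because the softmax backward pass never needs X: with y = softmax(x), dx_i = y_i * (dy_i - sum_j y_j * dy_j). A self-contained sketch of that formula for a single row (illustrative, not the Paddle kernel):

#include <cstddef>
#include <vector>

// dx = (dy - dot(dy, y)) * y elementwise; depends only on y (Out) and dy.
std::vector<float> SoftmaxGrad(const std::vector<float>& y,
                               const std::vector<float>& dy) {
  float dot = 0.f;
  for (std::size_t i = 0; i < y.size(); ++i) dot += y[i] * dy[i];
  std::vector<float> dx(y.size());
  for (std::size_t i = 0; i < y.size(); ++i) dx[i] = (dy[i] - dot) * y[i];
  return dx;
}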
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
...@@ -67,10 +68,15 @@ class SplitIdsOpKernel : public framework::OpKernel<T> { ...@@ -67,10 +68,15 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
const auto &ids_rows = ids_selected_rows->rows(); const auto &ids_rows = ids_selected_rows->rows();
auto outs = ctx.MultiOutput<framework::SelectedRows>("Out"); auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
const size_t shard_num = outs.size(); const size_t shard_num = outs.size();
for (auto &out : outs) {
out->mutable_rows()->clear();
}
// get rows for outputs // get rows for outputs
for (auto &id : ids_rows) { std::unordered_map<int64_t, size_t> id_to_index;
size_t shard_id = static_cast<size_t>(id) % shard_num; for (size_t i = 0; i < ids_rows.size(); ++i) {
outs[shard_id]->mutable_rows()->push_back(id); id_to_index[ids_rows[i]] = i;
size_t shard_id = static_cast<size_t>(ids_rows[i]) % shard_num;
outs[shard_id]->mutable_rows()->push_back(ids_rows[i]);
} }
int64_t row_width = ids_dims[1]; int64_t row_width = ids_dims[1];
...@@ -80,7 +86,8 @@ class SplitIdsOpKernel : public framework::OpKernel<T> { ...@@ -80,7 +86,8 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
{static_cast<int64_t>(out->rows().size()), row_width}); {static_cast<int64_t>(out->rows().size()), row_width});
T *output = out->mutable_value()->mutable_data<T>(ddim, place); T *output = out->mutable_value()->mutable_data<T>(ddim, place);
for (int64_t i = 0; i < ddim[0]; ++i) { for (int64_t i = 0; i < ddim[0]; ++i) {
memcpy(output + i * row_width, ids + out->rows()[i] * row_width, memcpy(output + i * row_width,
ids + id_to_index[out->rows()[i]] * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} }
} }
......
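The kernel change above first clears the rows of every output, then builds an id_to_index map so that, after sharding each id by id % shard_num, the value row copied into a shard comes from that id's position in the input. A hedged Python sketch of the same sharding rule (all names here are illustrative):

import numpy as np

ids_rows = [0, 5, 7, 4, 9]
row_width, shard_num = 2, 3
values = np.arange(len(ids_rows) * row_width,
                   dtype=np.float32).reshape(-1, row_width)

id_to_index = {rid: i for i, rid in enumerate(ids_rows)}
out_rows = [[] for _ in range(shard_num)]
for rid in ids_rows:
    out_rows[rid % shard_num].append(rid)        # shard by id % shard_num
print(out_rows)                                  # [[0, 9], [7, 4], [5]]

# gather each shard's values from the original position of its ids
out_values = [values[[id_to_index[rid] for rid in rows]] for rows in out_rows]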
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
// limitations under the License. // limitations under the License.
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <bitset>
#include <iostream> #include <iostream>
#include <random> #include <random>
...@@ -25,13 +24,13 @@ ...@@ -25,13 +24,13 @@
using paddle::platform::PADDLE_CUDA_NUM_THREADS; using paddle::platform::PADDLE_CUDA_NUM_THREADS;
using paddle::platform::float16; using paddle::platform::float16;
#define CUDA_ATOMIC_KERNEL(op, T) \ template <typename T>
__global__ void op##Kernel(const T* data_a, T* data_b, size_t num) { \ __global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) { \ i += blockDim.x * gridDim.x) {
paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]); \ paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
} \
} }
}
template <typename T> template <typename T>
struct AddFunctor { struct AddFunctor {
...@@ -39,80 +38,116 @@ struct AddFunctor { ...@@ -39,80 +38,116 @@ struct AddFunctor {
}; };
template <typename T> template <typename T>
struct SubFunctor { void TestCase(size_t num) {
T operator()(const T& a, const T& b) { return a - b; } T *in1, *in2, *out;
}; T *d_in1, *d_in2;
size_t size = sizeof(T) * num;
// NOTE(dzhwinter): the float16 add has small underflow/overflow cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
// so we use EXPECT_NEAR to check the result. cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
#define ARITHMETIC_KERNEL_LAUNCH(op, T) \ in1 = reinterpret_cast<T*>(malloc(size));
void Test##T##op(size_t num) { \ in2 = reinterpret_cast<T*>(malloc(size));
T *in1, *in2, *out; \ out = reinterpret_cast<T*>(malloc(size));
T *d_in1, *d_in2; \ std::minstd_rand engine;
size_t size = sizeof(T) * num; \ std::uniform_real_distribution<double> dist(0.0, 1.0);
cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \ for (size_t i = 0; i < num; ++i) {
cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \ in1[i] = static_cast<T>(dist(engine));
in1 = reinterpret_cast<T*>(malloc(size)); \ in2[i] = static_cast<T>(dist(engine));
in2 = reinterpret_cast<T*>(malloc(size)); \
out = reinterpret_cast<T*>(malloc(size)); \
std::minstd_rand engine; \
std::uniform_real_distribution<double> dist(0.0, 1.0); \
for (size_t i = 0; i < num; ++i) { \
in1[i] = static_cast<T>(dist(engine)); \
in2[i] = static_cast<T>(dist(engine)); \
} \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); \
cudaDeviceSynchronize(); \
cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost); \
cudaDeviceSynchronize(); \
for (size_t i = 0; i < num; ++i) { \
EXPECT_NEAR(static_cast<float>(out[i]), \
static_cast<float>(op##Functor<T>()(in1[i], in2[i])), \
0.001); \
} \
free(in1); \
free(in2); \
free(out); \
cudaFree(d_in1); \
cudaFree(d_in2); \
} }
CUDA_ATOMIC_KERNEL(Add, float); cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
CUDA_ATOMIC_KERNEL(Add, double); cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
CUDA_ATOMIC_KERNEL(Add, float16); AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
cudaDeviceSynchronize();
ARITHMETIC_KERNEL_LAUNCH(Add, float); cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
ARITHMETIC_KERNEL_LAUNCH(Add, double); cudaDeviceSynchronize();
ARITHMETIC_KERNEL_LAUNCH(Add, float16); for (size_t i = 0; i < num; ++i) {
// NOTE(dzhwinter): the float16 add has small underflow/overflow
namespace paddle { // so we use EXPECT_NEAR to check the result.
namespace platform { EXPECT_NEAR(static_cast<float>(out[i]),
USE_CUDA_ATOMIC(Sub, int); static_cast<float>(AddFunctor<T>()(in1[i], in2[i])), 0.001);
}; }
}; free(in1);
CUDA_ATOMIC_KERNEL(Sub, int); free(in2);
ARITHMETIC_KERNEL_LAUNCH(Sub, int); free(out);
cudaFree(d_in1);
cudaFree(d_in2);
}
// cuda primitives // cuda primitives
TEST(CudaAtomic, Add) { TEST(CudaAtomic, Add) {
TestfloatAdd(static_cast<size_t>(10)); TestCase<float>(static_cast<size_t>(10));
TestfloatAdd(static_cast<size_t>(1024 * 1024)); TestCase<float>(static_cast<size_t>(1024 * 1024));
TestdoubleAdd(static_cast<size_t>(10));
TestdoubleAdd(static_cast<size_t>(1024 * 1024));
}
TEST(CudaAtomic, Sub) { TestCase<double>(static_cast<size_t>(10));
TestintSub(static_cast<size_t>(10)); TestCase<double>(static_cast<size_t>(1024 * 1024));
TestintSub(static_cast<size_t>(1024 * 1024));
} }
TEST(CudaAtomic, float16) { TEST(CudaAtomic, float16) {
using paddle::platform::float16; TestCase<float16>(static_cast<size_t>(1));
Testfloat16Add(static_cast<size_t>(1)); TestCase<float16>(static_cast<size_t>(2));
Testfloat16Add(static_cast<size_t>(2)); TestCase<float16>(static_cast<size_t>(3));
Testfloat16Add(static_cast<size_t>(3));
TestCase<float16>(static_cast<size_t>(10));
TestCase<float16>(static_cast<size_t>(1024 * 1024));
}
// test the atomic add on addresses made unaligned by a uint8 (byte) offset
void TestUnalign(size_t num, const int shift_bit) {
PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2");
float16 *in1, *in2, *out;
float16 *d_in1, *d_in2;
size_t size = sizeof(uint8_t) * (num + shift_bit);
size_t array_size = sizeof(float16) * (num / 2);
cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
in1 = reinterpret_cast<float16*>(malloc(size));
in2 = reinterpret_cast<float16*>(malloc(size));
out = reinterpret_cast<float16*>(malloc(size));
// offset the pointer by shift_bit bytes to mimic an unaligned address
float16* r_in1 =
reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in1) + shift_bit);
float16* r_in2 =
reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in2) + shift_bit);
std::minstd_rand engine;
std::uniform_real_distribution<double> dist(0.0, 1.0);
for (size_t i = 0; i < num / 2; ++i) {
r_in1[i] = static_cast<float16>(dist(engine));
r_in2[i] = static_cast<float16>(dist(engine));
}
cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
cudaDeviceSynchronize();
cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
for (size_t i = 0; i < num / 2; ++i) {
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
EXPECT_NEAR(static_cast<float>(out[i]),
static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
0.001);
}
free(in1);
free(in2);
free(out);
cudaFree(d_in1);
cudaFree(d_in2);
}
TEST(CudaAtomic, float16Unalign) {
// same with float16 testcase
TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 2);
TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 2);
TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 2);
// shift the address.
TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 1);
TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 1);
TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 1);
Testfloat16Add(static_cast<size_t>(10)); TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 3);
Testfloat16Add(static_cast<size_t>(1024 * 1024)); TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
} }
...@@ -79,41 +79,41 @@ CUDA_ATOMIC_WRAPPER(Add, double) { ...@@ -79,41 +79,41 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
// convert the value into float and do the add arithmetic. // convert the value into float and do the add arithmetic.
// then store the result into a uint32. // then store the result into a uint32.
inline __device__ uint32_t add_to_low_half(uint32_t val, float x) { inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) {
float16 low_half; float16 low_half;
// the float16 in lower 16bits // the float16 in lower 16bits
low_half.x = static_cast<uint16_t>(val & 0xffffu); low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<float16>(static_cast<float>(low_half) + x); low_half = static_cast<float16>(static_cast<float>(low_half) + x);
return (val & 0xffff0000u) | low_half.x; return (val & 0xFFFF0000u) | low_half.x;
} }
inline __device__ uint32_t add_to_high_half(uint32_t val, float x) { inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
float16 high_half; float16 high_half;
// the float16 in higher 16bits // the float16 in higher 16bits
high_half.x = static_cast<uint16_t>(val >> 16); high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<float16>(static_cast<float>(high_half) + x); high_half = static_cast<float16>(static_cast<float>(high_half) + x);
return (val & 0xffffu) | (static_cast<uint32_t>(high_half.x) << 16); return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
} }
CUDA_ATOMIC_WRAPPER(Add, float16) { CUDA_ATOMIC_WRAPPER(Add, float16) {
// concrete packed float16 value may exist in lower or higher 16 bits // concrete packed float16 value may exist in lower or higher 16 bits
// of the 32-bit address. // of the 32-bit address.
uint32_t *address_as_ui = uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<uint32_t *>(reinterpret_cast<char *>(address) - reinterpret_cast<char *>(address) -
(reinterpret_cast<size_t>(address) & 2)); (reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val); float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui; uint32_t old = *address_as_ui;
uint32_t sum; uint32_t sum;
uint32_t newval; uint32_t newval;
uint32_t assumed; uint32_t assumed;
if (((size_t)address & 2) == 0) { if (((uintptr_t)address & 0x02) == 0) {
// the float16 value stay at lower 16 bits of the address. // the float16 value stay at lower 16 bits of the address.
do { do {
assumed = old; assumed = old;
old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f)); old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f));
} while (old != assumed); } while (old != assumed);
float16 ret; float16 ret;
ret.x = old & 0xffffu; ret.x = old & 0xFFFFu;
return ret; return ret;
} else { } else {
// the float16 value stay at higher 16 bits of the address. // the float16 value stay at higher 16 bits of the address.
......
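The add_to_low_half/add_to_high_half helpers above emulate a 16-bit atomic add by operating on the aligned 32-bit word that contains the target half and retrying with atomicCAS; the unaligned-address test earlier stresses exactly these paths. Below is a minimal host-side numpy sketch of the masking arithmetic with the CAS retry loop omitted (the names mirror the device helpers, but this is only an illustration):

import numpy as np

def add_to_low_half(word, x):
    # reinterpret the low 16 bits as float16, add x, splice the bits back
    low = np.uint16(word & 0xFFFF).view(np.float16)
    new_low = int(np.float16(float(low) + x).view(np.uint16))
    return (word & 0xFFFF0000) | new_low

def add_to_high_half(word, x):
    # same idea for the float16 stored in the high 16 bits
    high = np.uint16((word >> 16) & 0xFFFF).view(np.float16)
    new_high = int(np.float16(float(high) + x).view(np.uint16))
    return (word & 0x0000FFFF) | (new_high << 16)

# pack 2.0 into the low half and 3.0 into the high half of one 32-bit word
word = (int(np.float16(3.0).view(np.uint16)) << 16) | int(np.float16(2.0).view(np.uint16))
word = add_to_low_half(word, 1.0)
word = add_to_high_half(word, 1.0)
print(np.uint16(word & 0xFFFF).view(np.float16))           # 3.0
print(np.uint16((word >> 16) & 0xFFFF).view(np.float16))   # 4.0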
...@@ -534,7 +534,7 @@ EOF ...@@ -534,7 +534,7 @@ EOF
make -j `nproc` inference_lib_dist make -j `nproc` inference_lib_dist
cd ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build
cp -r fluid_install_dir fluid cp -r fluid_install_dir fluid
tar -cf fluid.tgz fluid tar -czf fluid.tgz fluid
fi fi
} }
......
...@@ -40,4 +40,10 @@ def batch(reader, batch_size, drop_last=False): ...@@ -40,4 +40,10 @@ def batch(reader, batch_size, drop_last=False):
if drop_last == False and len(b) != 0: if drop_last == False and len(b) != 0:
yield b yield b
# Batch size check
batch_size = int(batch_size)
if batch_size <= 0:
raise ValueError("batch_size should be a positive integer value, "
"but got batch_size={}".format(batch_size))
return batch_reader return batch_reader
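A quick usage sketch of the new check (assuming the decorator is exposed as paddle.batch, which is how the tests in this commit call it):

import paddle

reader = paddle.batch(lambda: iter(range(10)), batch_size=4)
print(next(reader()))       # [0, 1, 2, 3]

try:
    paddle.batch(lambda: iter(range(10)), batch_size=0)
except ValueError as e:
    print(e)                # batch_size should be a positive integer value, ...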
...@@ -127,6 +127,7 @@ def __bootstrap__(): ...@@ -127,6 +127,7 @@ def __bootstrap__():
] ]
if core.is_compiled_with_dist(): if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_deadline')
read_env_flags.append('listen_and_serv_profile_period')
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += [ read_env_flags += [
......
...@@ -38,6 +38,7 @@ __all__ = [ ...@@ -38,6 +38,7 @@ __all__ = [
__auto__ = [ __auto__ = [
'iou_similarity', 'iou_similarity',
'box_coder', 'box_coder',
'polygon_box_transform',
] ]
__all__ += __auto__ __all__ += __auto__
......
...@@ -4474,15 +4474,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4474,15 +4474,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
"except one unknown dimension.") "except one unknown dimension.")
helper = LayerHelper("reshape", **locals()) helper = LayerHelper("reshape", **locals())
reshaped = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op( helper.append_op(
type="reshape", type="reshape",
inputs=inputs, inputs=inputs,
attrs={"shape": shape, attrs={"shape": shape},
"inplace": inplace}, outputs={"Out": out})
outputs={"Out": reshaped})
return helper.append_activation(reshaped) return helper.append_activation(out)
def lod_reset(x, y=None, target_lod=None): def lod_reset(x, y=None, target_lod=None):
......
...@@ -66,9 +66,7 @@ __all__ = [ ...@@ -66,9 +66,7 @@ __all__ = [
'scatter', 'scatter',
'sum', 'sum',
'slice', 'slice',
'polygon_box_transform',
'shape', 'shape',
'iou_similarity',
'maxout', 'maxout',
] + __activations__ ] + __activations__
......
...@@ -122,7 +122,7 @@ class ParallelExecutor(object): ...@@ -122,7 +122,7 @@ class ParallelExecutor(object):
else: else:
cpu_num = int( cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count())) os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exec_strategy.num_threads = cpu_num exec_strategy.num_threads = cpu_num * 2
if build_strategy is None: if build_strategy is None:
build_strategy = BuildStrategy() build_strategy = BuildStrategy()
......
...@@ -62,5 +62,6 @@ if(WITH_DISTRIBUTE) ...@@ -62,5 +62,6 @@ if(WITH_DISTRIBUTE)
endif() endif()
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
import os
import sys
import transformer_model
import paddle.dataset.wmt16 as wmt16
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
class ModelHyperParams(object):
# Dictionary size for the source and target languages. This model directly uses
# paddle.dataset.wmt16, in which the <bos>, <eos> and <unk> tokens have
# already been added, but the <pad> token has not. The Transformer requires
# the sequences in a mini-batch to be padded to the same length, so a <pad>
# token is added to the original dictionary of paddle.dataset.wmt16.
# size of source word dictionary.
src_vocab_size = 10000
# index for <pad> token in source language.
src_pad_idx = src_vocab_size
# size of target word dictionary.
trg_vocab_size = 10000
# index for <pad> token in target language.
trg_pad_idx = trg_vocab_size
# position value corresponding to the <pad> token.
pos_pad_idx = 0
# max length of sequences; 1 is added when it is passed to the model to
# include the position padding token used for position encoding.
max_length = 50
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model = 512
# size of the hidden layer in position-wise feed-forward networks.
d_inner_hid = 1024
# the dimension that keys are projected to for dot-product attention.
d_key = 64
# the dimension that values are projected to for dot-product attention.
d_value = 64
# number of head used in multi-head attention.
n_head = 8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer = 6
# dropout rate used by all dropout layers.
dropout = 0.1
def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
"""
Pad the instances to the max sequence length in the batch, and generate the
corresponding position data and attention bias. The padded numpy arrays are
returned as a list; the caller converts them to LoDTensors.
"""
def __pad_batch_data(insts,
pad_idx,
is_target=False,
return_pos=True,
return_attn_bias=True,
return_max_len=True):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
inst_data = np.array(
[inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, 1])]
if return_pos:
inst_pos = np.array([[
pos_i + 1 if w_i != pad_idx else 0
for pos_i, w_i in enumerate(inst)
] for inst in inst_data])
return_list += [inst_pos.astype("int64").reshape([-1, 1])]
if return_attn_bias:
if is_target:
# This is used to avoid attention on paddings and subsequent
# words.
slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
max_len))
slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
[-1, 1, max_len, max_len])
slf_attn_bias_data = np.tile(slf_attn_bias_data,
[1, n_head, 1, 1]) * [-1e9]
else:
# This is used to avoid attention on paddings.
slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
(max_len - len(inst))
for inst in insts])
slf_attn_bias_data = np.tile(
slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
[1, n_head, max_len, 1])
return_list += [slf_attn_bias_data.astype("float32")]
if return_max_len:
return_list += [max_len]
return return_list if len(return_list) > 1 else return_list[0]
src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
[inst[0] for inst in insts], src_pad_idx, is_target=False)
trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
[inst[1] for inst in insts], trg_pad_idx, is_target=True)
trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
[1, 1, trg_max_len, 1]).astype("float32")
lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
False, False, False)
lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
return [
src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
]
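# A toy, hedged check of what __pad_batch_data above produces for a
# non-target batch; _toy_pad_batch_demo is illustrative only, is not used
# anywhere else in this file, and relies on the numpy import at the top.
def _toy_pad_batch_demo():
    insts = [[5, 7, 9], [4]]              # two source sequences
    pad_idx, n_head = 0, 2
    max_len = max(len(inst) for inst in insts)
    data = np.array(
        [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
    # -> [[5, 7, 9], [4, 0, 0]]; positions are 1-based and stay 0 on padding
    pos = np.array([[i + 1 if w != pad_idx else 0 for i, w in enumerate(inst)]
                    for inst in data])
    # -> [[1, 2, 3], [1, 0, 0]]
    bias = np.array([[0.] * len(inst) + [-1e9] * (max_len - len(inst))
                     for inst in insts])
    bias = np.tile(
        bias.reshape([-1, 1, 1, max_len]), [1, n_head, max_len, 1])
    # -> shape (2, 2, 3, 3); -1e9 marks padded positions so softmax ignores them
    return data, pos, bias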
def transformer(use_feed):
assert not use_feed, "transformer doesn't support feed yet"
return transformer_model.transformer(
ModelHyperParams.src_vocab_size + 1,
ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer, ModelHyperParams.n_head,
ModelHyperParams.d_key, ModelHyperParams.d_value,
ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
def get_model():
avg_cost = transformer(use_feed=False)
optimizer = fluid.optimizer.Adam()
optimizer.minimize(avg_cost)
return avg_cost
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
t = fluid.DistributeTranspiler()
t.transpile(
trainer_id=trainer_id,
program=main_program,
pservers=pserver_endpoints,
trainers=trainers)
return t
class DistTransformer2x2(object):
def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
trainer_id):
get_model()
t = get_transpiler(trainer_id,
fluid.default_main_program(), pserver_endpoints,
trainers)
pserver_prog = t.get_pserver_program(current_endpoint)
startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
exe.run(pserver_prog)
def _wait_ps_ready(self, pid):
retry_times = 20
while True:
assert retry_times >= 0, "wait ps ready failed"
time.sleep(3)
print("waiting ps ready: ", pid)
try:
# the listen_and_serv_op will touch a file containing the listen port
# under /tmp once it is ready to process all the RPC calls.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error:
retry_times -= 1
def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
avg_cost = get_model()
if is_dist:
t = get_transpiler(trainer_id,
fluid.default_main_program(), endpoints,
trainers)
trainer_prog = t.get_trainer_program()
else:
trainer_prog = fluid.default_main_program()
startup_exe = fluid.Executor(place)
startup_exe.run(fluid.default_startup_program())
strategy = fluid.ExecutionStrategy()
strategy.num_threads = 1
strategy.allow_op_delay = False
exe = fluid.ParallelExecutor(
True, loss_name=avg_cost.name, exec_strategy=strategy)
first_loss, = exe.run(fetch_list=[avg_cost.name])
print(first_loss)
for i in xrange(5):
_ = exe.run(fetch_list=[avg_cost.name])
last_loss, = exe.run(fetch_list=[avg_cost.name])
print(last_loss)
def main(role="pserver",
endpoints="127.0.0.1:9123",
trainer_id=0,
current_endpoint="127.0.0.1:9123",
trainers=1,
is_dist=True):
reader = paddle.batch(
wmt16.train(ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size),
batch_size=transformer_model.batch_size)
with fluid.recordio_writer.create_recordio_writer(
WMT16_RECORDIO_FILE) as writer:
for batch in reader():
for tensor in prepare_batch_input(
batch, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
t = fluid.LoDTensor()
t.set(tensor, fluid.CPUPlace())
writer.append_tensor(t)
writer.complete_append_tensor()
model = DistTransformer2x2()
if role == "pserver":
model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
else:
p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
if __name__ == "__main__":
if len(sys.argv) != 7:
print(
"Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
)
role = sys.argv[1]
endpoints = sys.argv[2]
trainer_id = int(sys.argv[3])
current_endpoint = sys.argv[4]
trainers = int(sys.argv[5])
is_dist = True if sys.argv[6] == "TRUE" else False
main(
role=role,
endpoints=endpoints,
trainer_id=trainer_id,
current_endpoint=current_endpoint,
trainers=trainers,
is_dist=is_dist)
...@@ -68,6 +68,10 @@ def get_numeric_gradient(place, ...@@ -68,6 +68,10 @@ def get_numeric_gradient(place,
tensor_to_check_dtype = np.float32 tensor_to_check_dtype = np.float32
elif tensor_to_check_dtype == core.VarDesc.VarType.FP64: elif tensor_to_check_dtype == core.VarDesc.VarType.FP64:
tensor_to_check_dtype = np.float64 tensor_to_check_dtype = np.float64
elif tensor_to_check_dtype == core.VarDesc.VarType.FP16:
tensor_to_check_dtype = np.float16
# cast delta to np.float16; it is converted to float32/float64 automatically where needed
delta = np.array(delta).astype(np.float16)
else: else:
raise ValueError("Not supported data type " + str( raise ValueError("Not supported data type " + str(
tensor_to_check_dtype)) tensor_to_check_dtype))
...@@ -75,13 +79,24 @@ def get_numeric_gradient(place, ...@@ -75,13 +79,24 @@ def get_numeric_gradient(place,
gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype) gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)
def __get_elem__(tensor, i): def __get_elem__(tensor, i):
if tensor_to_check_dtype == np.float32: if tensor_to_check_dtype == np.float16:
numpy_tensor = np.array(tensor).astype(np.float16)
numpy_tensor = numpy_tensor.flatten()
return numpy_tensor[i]
elif tensor_to_check_dtype == np.float32:
return tensor._get_float_element(i) return tensor._get_float_element(i)
else: else:
return tensor._get_double_element(i) return tensor._get_double_element(i)
def __set_elem__(tensor, i, e): def __set_elem__(tensor, i, e):
if tensor_to_check_dtype == np.float32: if tensor_to_check_dtype == np.float16:
numpy_tensor = np.array(tensor).astype(np.float16)
shape = numpy_tensor.shape
numpy_tensor = numpy_tensor.flatten()
numpy_tensor[i] = e
numpy_tensor = numpy_tensor.reshape(shape).view(np.uint16)
tensor.set(numpy_tensor, place)
elif tensor_to_check_dtype == np.float32:
tensor._set_float_element(i, e) tensor._set_float_element(i, e)
else: else:
tensor._set_double_element(i, e) tensor._set_double_element(i, e)
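For float16 tensors the float/double element accessors do not apply, so __get_elem__ and __set_elem__ above round-trip through numpy: read the tensor as float16, perturb one flattened element, then hand the raw uint16 view back to tensor.set. A hedged standalone sketch of that round trip, with a plain numpy array standing in for the LoDTensor:

import numpy as np

tensor = np.random.rand(2, 3).astype(np.float16)   # stand-in for the LoDTensor
i, delta = 4, np.float16(0.005)

flat = tensor.flatten()                  # float16 copy, like __get_elem__
flat[i] = flat[i] + delta                # the perturbation __set_elem__ applies
restored = flat.reshape(tensor.shape).view(np.uint16)  # raw bits for tensor.set(...)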
...@@ -135,6 +150,11 @@ class OpTest(unittest.TestCase): ...@@ -135,6 +150,11 @@ class OpTest(unittest.TestCase):
if not self.call_once: if not self.call_once:
self.call_once = True self.call_once = True
self.dtype = data_type self.dtype = data_type
# See the comment of np_dtype_to_fluid_dtype.
# If the input type is uint16, we assume float16 is used
# for the LoDTensor dtype.
if self.dtype == np.uint16:
self.dtype = np.float16
def infer_dtype_from_inputs_outputs(self, inputs, outputs): def infer_dtype_from_inputs_outputs(self, inputs, outputs):
def infer_dtype(numpy_dict): def infer_dtype(numpy_dict):
...@@ -163,19 +183,25 @@ class OpTest(unittest.TestCase): ...@@ -163,19 +183,25 @@ class OpTest(unittest.TestCase):
for name, np_value in self.inputs[var_name]: for name, np_value in self.inputs[var_name]:
tensor = core.LoDTensor() tensor = core.LoDTensor()
if isinstance(np_value, tuple): if isinstance(np_value, tuple):
tensor.set(np_value[0], place) tensor.set(
OpTest.np_value_to_fluid_value(np_value[0]), place)
tensor.set_recursive_sequence_lengths(np_value[1]) tensor.set_recursive_sequence_lengths(np_value[1])
else: else:
tensor.set(np_value, place) tensor.set(
OpTest.np_value_to_fluid_value(np_value), place)
feed_map[name] = tensor feed_map[name] = tensor
else: else:
tensor = core.LoDTensor() tensor = core.LoDTensor()
if isinstance(self.inputs[var_name], tuple): if isinstance(self.inputs[var_name], tuple):
tensor.set(self.inputs[var_name][0], place) tensor.set(
OpTest.np_value_to_fluid_value(self.inputs[var_name][
0]), place)
tensor.set_recursive_sequence_lengths(self.inputs[var_name][ tensor.set_recursive_sequence_lengths(self.inputs[var_name][
1]) 1])
else: else:
tensor.set(self.inputs[var_name], place) tensor.set(
OpTest.np_value_to_fluid_value(self.inputs[var_name]),
place)
feed_map[var_name] = tensor feed_map[var_name] = tensor
return feed_map return feed_map
...@@ -309,13 +335,22 @@ class OpTest(unittest.TestCase): ...@@ -309,13 +335,22 @@ class OpTest(unittest.TestCase):
np.allclose( np.allclose(
actual_t, expect_t, atol=atol), actual_t, expect_t, atol=atol),
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
str(actual_t) + "\n" + str(expect_t)) "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t))
if isinstance(expect, tuple): if isinstance(expect, tuple):
self.assertListEqual(actual.recursive_sequence_lengths(), self.assertListEqual(actual.recursive_sequence_lengths(),
expect[1], "Output (" + out_name + expect[1], "Output (" + out_name +
") has different lod at " + str(place)) ") has different lod at " + str(place))
def _get_places(self): def _get_places(self):
if self.dtype == np.float16:
if core.is_compiled_with_cuda() and core.op_support_gpu(
self.op_type):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
return [place]
else:
return []
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
...@@ -346,9 +381,9 @@ class OpTest(unittest.TestCase): ...@@ -346,9 +381,9 @@ class OpTest(unittest.TestCase):
def err_msg(): def err_msg():
offset = np.argmax(diff_mat > max_relative_error) offset = np.argmax(diff_mat > max_relative_error)
return ("%s Variable %s max gradient diff %f over limit %f, " return ("%s Variable %s max gradient diff %f over limit %f, "
"the first error element is %d, %f, %f") % ( "the first error element is %d, expected %f, but got %f"
msg_prefix, name, max_diff, max_relative_error, ) % (msg_prefix, name, max_diff, max_relative_error,
offset, a.flatten()[offset], b.flatten()[offset]) offset, a.flatten()[offset], b.flatten()[offset])
self.assertLessEqual(max_diff, max_relative_error, err_msg()) self.assertLessEqual(max_diff, max_relative_error, err_msg())
...@@ -437,6 +472,21 @@ class OpTest(unittest.TestCase): ...@@ -437,6 +472,21 @@ class OpTest(unittest.TestCase):
input.dtype = np.uint16 input.dtype = np.uint16
return input return input
@staticmethod
def fluid_dtype_to_np_dtype(self, dtype):
"""
See above, convert the dtype to normal type.
"""
if dtype == np.uint16:
dtype = np.float16
return dtype
@staticmethod
def np_value_to_fluid_value(input):
if input.dtype == np.float16:
input = input.view(np.uint16)
return input
def _get_gradient(self, def _get_gradient(self,
input_to_check, input_to_check,
place, place,
...@@ -459,7 +509,7 @@ class OpTest(unittest.TestCase): ...@@ -459,7 +509,7 @@ class OpTest(unittest.TestCase):
if isinstance(place, fluid.CUDAPlace(0)): if isinstance(place, fluid.CUDAPlace(0)):
use_cuda = True use_cuda = True
executor = fluid.ParallelExecutor( executor = fluid.ParallelExecutor(
use_cuda=use_cuda, loss_name=loss.name, main_program=program) use_cuda=use_cuda, loss_name=loss.name, main_program=prog)
else: else:
executor = Executor(place) executor = Executor(place)
return list( return list(
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import unittest
import os
import sys
import signal
import subprocess
class TestDistBase(unittest.TestCase):
def setUp(self):
self._trainers = 2
self._pservers = 2
self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
self._python_interp = "python"
def start_pserver(self, model_file):
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
(self._python_interp, model_file, self._ps_endpoints, ps0_ep,
self._trainers)
ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
(self._python_interp, model_file, self._ps_endpoints, ps1_ep,
self._trainers)
ps0_proc = subprocess.Popen(
ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
ps1_proc = subprocess.Popen(
ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
return ps0_proc, ps1_proc
def _wait_ps_ready(self, pid):
retry_times = 50
while True:
assert retry_times >= 0, "wait ps ready failed"
time.sleep(3)
try:
# the listen_and_serv_op will touch a file containing the listen port
# under /tmp once it is ready to process all the RPC calls.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error as e:
sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
(e, retry_times))
retry_times -= 1
def check_with_place(self, model_file, delta=1e-3):
# *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
required_envs = {
"PATH": os.getenv("PATH"),
"PYTHONPATH": os.getenv("PYTHONPATH"),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15"
}
# Run local to get a base line
env_local = {"CUDA_VISIBLE_DEVICES": "0"}
env_local.update(required_envs)
local_cmd = "%s %s trainer %s 0 %s %d FALSE" % \
(self._python_interp, model_file,
"127.0.0.1:1234", "127.0.0.1:1234", 1)
local_proc = subprocess.Popen(
local_cmd.split(" "),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env_local)
local_proc.wait()
out, err = local_proc.communicate()
local_ret = out
sys.stderr.write('local_loss: %s\n' % local_ret)
sys.stderr.write('local_stderr: %s\n' % err)
# Run dist train to compare with local results
ps0, ps1 = self.start_pserver(model_file)
self._wait_ps_ready(ps0.pid)
self._wait_ps_ready(ps1.pid)
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \
(self._python_interp, model_file, self._ps_endpoints, ps0_ep,
self._trainers)
tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \
(self._python_interp, model_file, self._ps_endpoints, ps1_ep,
self._trainers)
env0 = {"CUDA_VISIBLE_DEVICES": "0"}
env1 = {"CUDA_VISIBLE_DEVICES": "1"}
env0.update(required_envs)
env1.update(required_envs)
FNULL = open(os.devnull, 'w')
tr0_proc = subprocess.Popen(
tr0_cmd.split(" "),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env0)
tr1_proc = subprocess.Popen(
tr1_cmd.split(" "),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env1)
tr0_proc.wait()
tr1_proc.wait()
out, err = tr0_proc.communicate()
sys.stderr.write('dist_stderr: %s\n' % err)
loss_data0 = out
sys.stderr.write('dist_loss: %s\n' % loss_data0)
lines = loss_data0.split("\n")
dist_first_loss = eval(lines[0].replace(" ", ","))[0]
dist_last_loss = eval(lines[1].replace(" ", ","))[0]
local_lines = local_ret.split("\n")
local_first_loss = eval(local_lines[0])[0]
local_last_loss = eval(local_lines[1])[0]
self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)
# check tr0_out
# FIXME: ensure the server process is killed
# replace with ps0.terminate()
os.kill(ps0.pid, signal.SIGKILL)
os.kill(ps1.pid, signal.SIGKILL)
FNULL.close()
...@@ -11,127 +11,14 @@ ...@@ -11,127 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import numpy as np
import argparse
import time
import math
import unittest import unittest
import os from test_dist_base import TestDistBase
import sys
import signal
import subprocess
class TestDistSeResneXt2x2(unittest.TestCase):
def setUp(self):
self._trainers = 2
self._pservers = 2
self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
self._python_interp = "python"
def start_pserver(self):
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
(self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
(self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
ps0_proc = subprocess.Popen(
ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
ps1_proc = subprocess.Popen(
ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
return ps0_proc, ps1_proc
def _wait_ps_ready(self, pid):
retry_times = 20
while True:
assert retry_times >= 0, "wait ps ready failed"
time.sleep(3)
try:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error:
retry_times -= 1
def test_with_place(self):
# *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
required_envs = {
"PATH": os.getenv("PATH"),
"PYTHONPATH": os.getenv("PYTHONPATH"),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15"
}
# Run local to get a base line
env_local = {"CUDA_VISIBLE_DEVICES": "0"}
env_local.update(required_envs)
local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \
(self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1)
local_proc = subprocess.Popen(
local_cmd.split(" "),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env_local)
local_proc.wait()
out, err = local_proc.communicate()
local_ret = out
sys.stderr.write('local_loss: %s\n' % local_ret)
sys.stderr.write('local_stderr: %s\n' % err)
# Run dist train to compare with local results
ps0, ps1 = self.start_pserver()
self._wait_ps_ready(ps0.pid)
self._wait_ps_ready(ps1.pid)
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \
(self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \
(self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
env0 = {"CUDA_VISIBLE_DEVICES": "0"}
env1 = {"CUDA_VISIBLE_DEVICES": "1"}
env0.update(required_envs)
env1.update(required_envs)
FNULL = open(os.devnull, 'w')
tr0_proc = subprocess.Popen(
tr0_cmd.split(" "),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env0)
tr1_proc = subprocess.Popen(
tr1_cmd.split(" "),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env1)
tr0_proc.wait()
tr1_proc.wait()
out, err = tr0_proc.communicate()
sys.stderr.write('dist_stderr: %s\n' % err)
loss_data0 = out
sys.stderr.write('dist_loss: %s\n' % loss_data0)
lines = loss_data0.split("\n")
dist_first_loss = eval(lines[0].replace(" ", ","))[0]
dist_last_loss = eval(lines[1].replace(" ", ","))[0]
local_lines = local_ret.split("\n")
local_first_loss = eval(local_lines[0])[0]
local_last_loss = eval(local_lines[1])[0]
self.assertAlmostEqual(local_first_loss, dist_first_loss)
self.assertAlmostEqual(local_last_loss, dist_last_loss)
# check tr0_out class TestDistSeResneXt2x2(TestDistBase):
# FIXME: ensure the server process is killed def test_se_resnext(self):
# replace with ps0.terminate() # TODO(paddle-dev): Is the delta too large?
os.kill(ps0.pid, signal.SIGKILL) self.check_with_place("dist_se_resnext.py", delta=0.2)
os.kill(ps1.pid, signal.SIGKILL)
FNULL.close()
if __name__ == "__main__": if __name__ == "__main__":
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from test_dist_base import TestDistBase
class TestDistTransformer2x2(TestDistBase):
def test_transformer(self):
# TODO(paddle-dev): check if the delta is OK.
# Usually start around ~8000 and converge to ~5000
self.check_with_place("dist_transformer.py", delta=400)
if __name__ == "__main__":
unittest.main()
...@@ -20,8 +20,8 @@ class TestElementwiseOp(OpTest): ...@@ -20,8 +20,8 @@ class TestElementwiseOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "elementwise_sub" self.op_type = "elementwise_sub"
self.inputs = { self.inputs = {
'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), 'X': np.random.uniform(0.1, 1, [2, 3]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") 'Y': np.random.uniform(0.1, 1, [2, 3]).astype("float32")
} }
self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
......
...@@ -17,6 +17,8 @@ import numpy as np ...@@ -17,6 +17,8 @@ import numpy as np
import math import math
from op_test import OpTest from op_test import OpTest
np.random.seed(100)
def find_latest_set(num): def find_latest_set(num):
return 1 + int(math.floor(math.log(num, 2))) return 1 + int(math.floor(math.log(num, 2)))
......
...@@ -465,6 +465,15 @@ class TestBook(unittest.TestCase): ...@@ -465,6 +465,15 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(out) self.assertIsNotNone(out)
print(str(program)) print(str(program))
def test_shape(self):
program = Program()
with program_guard(program):
input = layers.data(
name="input", shape=[3, 100, 100], dtype="float32")
out = layers.shape(input, name="shape")
self.assertIsNotNone(out)
print(str(program))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -43,5 +43,29 @@ class TestControlFlowGraph(unittest.TestCase): ...@@ -43,5 +43,29 @@ class TestControlFlowGraph(unittest.TestCase):
print(str(result_program)) print(str(result_program))
class TestMemoryTranspiler2(unittest.TestCase):
def setUp(self):
program = Program()
with program_guard(program, startup_program=Program()):
x = layers.data(name='x', shape=[13], dtype='float32')
fc = layers.fc(input=x, size=10, act=None)
reshape = layers.reshape(x=fc, shape=[-1, 2, 5])
fc = layers.reshape(x=reshape, shape=[-1, 5, 2])
y_predict = layers.fc(input=fc, size=1, act=None)
y = layers.data(name='y', shape=[1], dtype='float32')
cost = layers.square_error_cost(input=y_predict, label=y)
avg_cost = layers.mean(cost)
opt = optimizer.SGD(learning_rate=0.001)
opt.minimize(avg_cost)
self.program = program
def test_inplace_ops(self):
print("before optimization")
print(str(self.program))
result_program = memory_optimize(self.program)
print("after optimization")
print(str(result_program))
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -211,7 +211,8 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -211,7 +211,8 @@ class TestMNIST(TestParallelExecutorBase):
self.check_batchnorm_fc_convergence(False) self.check_batchnorm_fc_convergence(False)
def test_batchnorm_fc_with_new_strategy(self): def test_batchnorm_fc_with_new_strategy(self):
self._compare_reduce_and_allreduce(fc_with_batchnorm, True) # FIXME(zcd): close this test temporally.
# self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
self._compare_reduce_and_allreduce(fc_with_batchnorm, False) self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
......
...@@ -21,7 +21,7 @@ import paddle ...@@ -21,7 +21,7 @@ import paddle
import paddle.dataset.wmt16 as wmt16 import paddle.dataset.wmt16 as wmt16
import os import os
WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio" WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
class ModelHyperParams(object): class ModelHyperParams(object):
...@@ -167,10 +167,9 @@ class TestTransformer(TestParallelExecutorBase): ...@@ -167,10 +167,9 @@ class TestTransformer(TestParallelExecutorBase):
writer.append_tensor(t) writer.append_tensor(t)
writer.complete_append_tensor() writer.complete_append_tensor()
@unittest.skip("transformer is buggy in multi gpu")
def test_main(self): def test_main(self):
self.check_network_convergence(transformer, use_cuda=True) self.check_network_convergence(transformer, use_cuda=True)
self.check_network_convergence(transformer, use_cuda=False) self.check_network_convergence(transformer, use_cuda=False, iter=5)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -25,7 +25,7 @@ class TestReshapeOp(OpTest): ...@@ -25,7 +25,7 @@ class TestReshapeOp(OpTest):
self.op_type = "reshape" self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")} self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape, "inplace": False} self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
def test_check_output(self): def test_check_output(self):
...@@ -42,7 +42,7 @@ class TestReshapeOpDimInfer1(OpTest): ...@@ -42,7 +42,7 @@ class TestReshapeOpDimInfer1(OpTest):
self.op_type = "reshape" self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")} self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape, "inplace": False} self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])} self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
def test_check_output(self): def test_check_output(self):
...@@ -60,7 +60,7 @@ class TestReshapeOpDimInfer2(OpTest): ...@@ -60,7 +60,7 @@ class TestReshapeOpDimInfer2(OpTest):
self.op_type = "reshape" self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")} self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape, "inplace": False} self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
def test_check_output(self): def test_check_output(self):
......
...@@ -15,6 +15,8 @@ ...@@ -15,6 +15,8 @@
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid.op import Operator
class TestSplitIdsOp(OpTest): class TestSplitIdsOp(OpTest):
...@@ -31,5 +33,55 @@ class TestSplitIdsOp(OpTest): ...@@ -31,5 +33,55 @@ class TestSplitIdsOp(OpTest):
self.check_output() self.check_output()
class TestSpliteIds(unittest.TestCase):
def get_places(self):
places = [core.CPUPlace()]
return places
def test_check_output(self):
for place in self.get_places():
self.check_with_place(place)
def check_with_place(self, place):
scope = core.Scope()
rows = [0, 5, 7, 4, 9]
height = 20
row_numel = 2
# initialize input variable X
x = scope.var('X').get_selected_rows()
x.set_rows(rows)
x.set_height(height)
np_array = np.ones((len(rows), row_numel)).astype("float32")
for i in range(len(rows)):
for j in range(row_numel):
np_array[i, j] = rows[i] + j
x_tensor = x.get_tensor()
x_tensor.set(np_array, place)
outs_name = ["out%d" % i for i in xrange(3)]
outs = [
scope.var(var_name).get_selected_rows() for var_name in outs_name
]
# expected output selected rows
expected_out_rows = [[0, 9], [7, 4], [5]]
op = Operator("split_ids", Ids="X", Out=outs_name)
for _ in range(3):
op.run(scope, place)
for i in range(len(outs)):
expected_rows = expected_out_rows[i]
self.assertEqual(outs[i].rows(), expected_rows)
for j in range(len(expected_rows)):
row = expected_rows[j]
self.assertAlmostEqual(
float(row), np.array(outs[i].get_tensor())[j, 0])
self.assertAlmostEqual(
float(row + 1), np.array(outs[i].get_tensor())[j, 1])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -18,14 +18,6 @@ import paddle.fluid.core as core ...@@ -18,14 +18,6 @@ import paddle.fluid.core as core
from paddle.fluid.op import Operator from paddle.fluid.op import Operator
def as_lodtensor(np_array, lod, place):
tensor = core.LoDTensor()
tensor.set(np_value, place)
if lod is not None:
tensor.set_recursive_sequence_lengths(lod)
return tensor
def create_op(scope, op_type, inputs, outputs, attrs): def create_op(scope, op_type, inputs, outputs, attrs):
kwargs = dict() kwargs = dict()
...@@ -69,6 +61,11 @@ def create_op(scope, op_type, inputs, outputs, attrs): ...@@ -69,6 +61,11 @@ def create_op(scope, op_type, inputs, outputs, attrs):
def set_input(scope, op, inputs, place): def set_input(scope, op, inputs, place):
def np_value_to_fluid_value(input):
if input.dtype == np.float16:
input = input.view(np.uint16)
return input
def __set_input__(var_name, var): def __set_input__(var_name, var):
if isinstance(var, tuple) or isinstance(var, np.ndarray): if isinstance(var, tuple) or isinstance(var, np.ndarray):
tensor = scope.find_var(var_name).get_tensor() tensor = scope.find_var(var_name).get_tensor()
...@@ -76,7 +73,7 @@ def set_input(scope, op, inputs, place): ...@@ -76,7 +73,7 @@ def set_input(scope, op, inputs, place):
tensor.set_recursive_sequence_lengths(var[1]) tensor.set_recursive_sequence_lengths(var[1])
var = var[0] var = var[0]
tensor._set_dims(var.shape) tensor._set_dims(var.shape)
tensor.set(var, place) tensor.set(np_value_to_fluid_value(var), place)
elif isinstance(var, float): elif isinstance(var, float):
scope.find_var(var_name).set_float(var) scope.find_var(var_name).set_float(var)
elif isinstance(var, int): elif isinstance(var, int):
...@@ -104,6 +101,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): ...@@ -104,6 +101,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
if name not in np_list: if name not in np_list:
assert var_proto.intermediate, "{} not found".format(name) assert var_proto.intermediate, "{} not found".format(name)
else: else:
# infer the dtype from the numpy value.
np_value = np_list[name] np_value = np_list[name]
if isinstance(np_value, tuple): if isinstance(np_value, tuple):
dtype = np_value[0].dtype dtype = np_value[0].dtype
...@@ -116,6 +114,16 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): ...@@ -116,6 +114,16 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
if is_input: if is_input:
shape = list(np_value.shape) shape = list(np_value.shape)
lod_level = 0 lod_level = 0
# NOTE(dzhwinter): type hacking
# numpy float16 is bound to paddle::platform::float16 in tensor_py.h
# with the help of the uint16 data type, because the internal memory
# representation of float16 in paddle is actually uint16_t. So we use
# np.uint16 in numpy for the raw memory, and it can pass through pybind.
# In the test cases we therefore feed data as data.view(np.uint16), while
# the dtype is in fact float16. data.view(np.uint16) does not cast the
# data; it reinterprets the same memory as uint16.
if dtype == np.uint16:
dtype = np.float16
return block.create_var( return block.create_var(
dtype=dtype, shape=shape, lod_level=lod_level, name=name) dtype=dtype, shape=shape, lod_level=lod_level, name=name)
......
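The NOTE above is the key to all of the float16 test plumbing: a numpy float16 array is handed to the C++ side as raw uint16 memory, and data.view(np.uint16) only reinterprets bytes. A small hedged demonstration of that property:

import numpy as np

data = np.random.uniform(0.1, 1.0, [2, 3]).astype(np.float16)

# reinterpret the same bytes as uint16: no cast, no copy of the values
raw = data.view(np.uint16)
assert raw.dtype == np.uint16 and raw.shape == data.shape

# viewing back recovers the original float16 values bit for bit
assert np.array_equal(raw.view(np.float16), data)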
...@@ -404,7 +404,7 @@ def transformer( ...@@ -404,7 +404,7 @@ def transformer(
trg_pad_idx, trg_pad_idx,
pos_pad_idx, ): pos_pad_idx, ):
file_obj = fluid.layers.open_recordio_file( file_obj = fluid.layers.open_recordio_file(
filename='./wmt16.recordio', filename='/tmp/wmt16.recordio',
shapes=[ shapes=[
[batch_size * max_length, 1], [batch_size * max_length, 1],
[batch_size * max_length, 1], [batch_size * max_length, 1],
......
...@@ -494,6 +494,7 @@ class DistributeTranspiler(object): ...@@ -494,6 +494,7 @@ class DistributeTranspiler(object):
pserver_index = self.pserver_endpoints.index(endpoint) pserver_index = self.pserver_endpoints.index(endpoint)
table_opt_block = self._create_table_optimize_block( table_opt_block = self._create_table_optimize_block(
pserver_index, pserver_program, pre_block_idx, grad_to_block_id) pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
optimize_blocks.append(table_opt_block)
prefetch_var_name_to_block_id = self._create_prefetch_block( prefetch_var_name_to_block_id = self._create_prefetch_block(
pserver_index, pserver_program, table_opt_block) pserver_index, pserver_program, table_opt_block)
checkpoint_block_id = self._create_checkpoint_save_block( checkpoint_block_id = self._create_checkpoint_save_block(
......
...@@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH ...@@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH} ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
COPY build_scripts /build_scripts COPY build_scripts /build_scripts
RUN bash build_scripts/build.sh && \ RUN bash build_scripts/build.sh && \
bash build_scripts/install_nccl2.sh && rm -r build_scripts bash build_scripts/install_nccl2.sh && rm -r build_scripts
......