提交 44c7beb0 编写于 作者: V velconia

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into mac_py3

...@@ -62,8 +62,26 @@ if(NOT CMAKE_CROSSCOMPILING) ...@@ -62,8 +62,26 @@ if(NOT CMAKE_CROSSCOMPILING)
endif() endif()
if(WIN32) if(WIN32)
# windows stupid compile option for all targets. # windows header option for all targets.
add_definitions(-D_XKEYCHECK_H) add_definitions(-D_XKEYCHECK_H)
# Use symbols instead of absolute path, reduce the cmake link command length.
SET(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
SET(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1)
SET(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1)
SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1)
SET(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@")
SET(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@")
# Specify the program to use when building static libraries
SET(CMAKE_C_CREATE_STATIC_LIBRARY "<CMAKE_AR> lib <TARGET> <LINK_FLAGS> <OBJECTS>")
SET(CMAKE_CXX_CREATE_STATIC_LIBRARY "<CMAKE_AR> lib <TARGET> <LINK_FLAGS> <OBJECTS>")
# set defination for the dll export
if (NOT MSVC)
message(FATAL "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA.")
endif(NOT MSVC)
endif(WIN32) endif(WIN32)
if(NOT WITH_GOLANG) if(NOT WITH_GOLANG)
......
...@@ -110,6 +110,20 @@ function(find_fluid_modules TARGET_NAME) ...@@ -110,6 +110,20 @@ function(find_fluid_modules TARGET_NAME)
endif() endif()
endfunction(find_fluid_modules) endfunction(find_fluid_modules)
# find all third_party modules is used for paddle static library
# for reduce the dependency when building the inference libs.
set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
function(find_fluid_thirdparties TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
string(FIND "${__target_path}" "third_party" pos)
if(pos GREATER 1)
get_property(fluid_ GLOBAL PROPERTY FLUID_THIRD_PARTY)
set(fluid_third_partys ${fluid_third_partys} ${TARGET_NAME})
set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY "${fluid_third_partys}")
endif()
endfunction(find_fluid_thirdparties)
function(merge_static_libs TARGET_NAME) function(merge_static_libs TARGET_NAME)
set(libs ${ARGN}) set(libs ${ARGN})
list(REMOVE_DUPLICATES libs) list(REMOVE_DUPLICATES libs)
...@@ -204,18 +218,13 @@ function(merge_static_libs TARGET_NAME) ...@@ -204,18 +218,13 @@ function(merge_static_libs TARGET_NAME)
foreach(lib ${libs}) foreach(lib ${libs})
# Get the file names of the libraries to be merged # Get the file names of the libraries to be merged
#if(NOT $<TARGET_FILE:${lib}> MATCHES "lib.*\\.lib")
# message("library" ${lib})
# set(libfiles ${libfiles} lib$<TARGET_FILE:${lib}>)
#else()
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>) set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
#endif()
endforeach() endforeach()
# msvc will put libarary in directory of "/Release/xxxlib" by default
# windows cmd return error in clean env. # COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
# COMMAND del "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.lib ${libfiles} COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}"
COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib${TARGET_NAME}.lib ${libfiles}
) )
endif(WIN32) endif(WIN32)
endfunction(merge_static_libs) endfunction(merge_static_libs)
......
...@@ -2,6 +2,6 @@ ...@@ -2,6 +2,6 @@
Thanks for reading PaddlePaddle documentation. Thanks for reading PaddlePaddle documentation.
Since **September 17th, 2018**, the **0.15.0 and develop** documentation source has been moved to [Fluiddoc Repo](https://github.com/PaddlePaddle/Paddle) and updated in Fluiddoc Repo. Since **September 17th, 2018**, the **0.15.0 and develop** documentation source has been moved to [FluidDoc Repo](https://github.com/PaddlePaddle/FluidDoc) and updated there.
Please turn to Fluiddoc Repo for the latest documentation. Please turn to FluidDoc Repo for the latest documentation.
此差异已折叠。
...@@ -13,3 +13,5 @@ if(WITH_INFERENCE) ...@@ -13,3 +13,5 @@ if(WITH_INFERENCE)
# NOTE: please add subdirectory inference at last. # NOTE: please add subdirectory inference at last.
add_subdirectory(inference) add_subdirectory(inference)
endif() endif()
add_subdirectory(train)
...@@ -20,41 +20,79 @@ namespace paddle { ...@@ -20,41 +20,79 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
template <class T> // Change it to thread safe flags if needed.
class COWPtr { class ThreadUnsafeOwnershipFlags {
public: public:
typedef std::shared_ptr<T> RefPtr; explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
private: ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
RefPtr m_sp; ThreadUnsafeOwnershipFlags& operator=(
const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
void detach() { void SetOwnership(bool flag) { flag_ = flag; }
T* tmp = m_sp.get();
if (!(tmp == nullptr || m_sp.unique())) { // Invoke the callback if it is not owned.
m_sp = RefPtr(new T(*tmp)); template <typename Callback>
void AcquireOwnershipOnce(Callback acquire) {
if (!flag_) {
acquire();
flag_ = true;
} }
} }
public: private:
COWPtr() : m_sp(nullptr) {} bool flag_;
explicit COWPtr(T* t) : m_sp(t) {} };
explicit COWPtr(const RefPtr& refptr) : m_sp(refptr) {}
const T& Data() const { return operator*(); } // Copy-On-Write pointer.
// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
//
// The template parameter OwnershipFlags should have:
// * a constructor takes a bool. True if own.
// * SetOwnership(bool flag).
// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
// owned.
//
// https://en.wikipedia.org/wiki/Copy-on-write
template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
class COWPtr {
public:
// Ctor from raw pointer.
explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
T* MutableData() { return operator->(); } // Move methods. Steal ownership from origin
COWPtr(COWPtr&& other)
: payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
COWPtr& operator=(COWPtr&& origin) = default;
const T& operator*() const { return *m_sp; } // Copy methods. Not own payload
T& operator*() { COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
detach(); COWPtr& operator=(const COWPtr& other) {
return *m_sp; payload_ = other.payload_;
ownership_.SetOwnership(false);
return *this;
} }
const T* operator->() const { return m_sp.operator->(); }
T* operator->() { // Access read only data.
detach(); const T& Data() const { return *payload_; }
return m_sp.operator->();
// Access mutable data. If the data is not owned, the data will be copied
// before.
T* MutableData() {
ownership_.AcquireOwnershipOnce(
[this] { payload_.reset(new T(*payload_)); });
return payload_.get();
} }
private:
// Actual data pointer.
std::shared_ptr<T> payload_;
// Ownership flag.
OwnershipFlags ownership_;
}; };
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -30,14 +30,6 @@ TEST(COWPtr, all) { ...@@ -30,14 +30,6 @@ TEST(COWPtr, all) {
ASSERT_EQ(ptr2.Data(), 10); ASSERT_EQ(ptr2.Data(), 10);
} }
TEST(COWPtr, change_old) {
COWPtr<int> ptr(new int{0});
COWPtr<int> ptr2 = ptr;
*ptr.MutableData() = 10;
ASSERT_EQ(ptr2.Data(), 0);
ASSERT_EQ(ptr.Data(), 10);
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -210,43 +210,6 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars( ...@@ -210,43 +210,6 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
return recv_vars; return recv_vars;
} }
bool MultiDevSSAGraphBuilder::IsDistTrainOp(
ir::Node *node, const std::vector<std::string> &send_vars,
const std::vector<std::string> &recv_vars) const {
if (send_vars.size() == 0 || recv_vars.size() == 0) {
return false;
}
/**
* Check any of opvars contains `.block` and in sendvars
*/
auto checker = [](const std::vector<std::string> &opvars,
const std::vector<std::string> &rpc_vars) -> bool {
for (auto &var : opvars) {
// a variable name with the suffix `.block` means it's a splited
// variable by (DistributeTranspiler)
// [python/paddle/fluid/transpiler/distribute_transpiler.py]
if (var.find(".block") != std::string::npos &&
std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
return true;
}
}
return false;
};
std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names;
for (ir::Node *input : node->inputs) {
input_var_names.push_back(input->Name());
}
for (ir::Node *output : node->outputs) {
output_var_names.push_back(output->Name());
}
return checker(output_var_names, send_vars) ||
checker(input_var_names, recv_vars);
}
size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const { const std::vector<std::string> &var_names) const {
int64_t numel_sum = 0; int64_t numel_sum = 0;
...@@ -370,7 +333,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -370,7 +333,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
} }
} }
is_dist_train = true; is_dist_train = true;
} else if (IsDistTrainOp(node, send_vars, recv_vars)) { } else if (boost::get<int>(node->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) ==
static_cast<int>(OpRole::kDist)) {
int op_dev_id = CreateDistTrainOp(&result, node); int op_dev_id = CreateDistTrainOp(&result, node);
if (node->Op()->Type() == "concat") { if (node->Op()->Type() == "concat") {
auto origin_param_name = node->Op()->OutputArgumentNames()[0]; auto origin_param_name = node->Op()->OutputArgumentNames()[0];
...@@ -736,6 +701,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, ...@@ -736,6 +701,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
.emplace(varname, op_dev_id); .emplace(varname, op_dev_id);
} }
} else { } else {
LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type();
PADDLE_THROW( PADDLE_THROW(
"the distribute training related op should be in [split_byref, " "the distribute training related op should be in [split_byref, "
"concat]."); "concat].");
......
...@@ -51,12 +51,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -51,12 +51,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
int CreateRPCOp(ir::Graph *result, ir::Node *node) const; int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
/**
* Is this operator as the end-point operator before/after send operator.
*/
bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
const std::vector<std::string> &recv_vars) const;
std::vector<std::string> FindDistTrainSendVars( std::vector<std::string> FindDistTrainSendVars(
const std::vector<ir::Node *> &nodes) const; const std::vector<ir::Node *> &nodes) const;
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
namespace paddle { namespace paddle {
...@@ -46,17 +47,15 @@ class ReferenceCountOpHandle : public OpHandleBase { ...@@ -46,17 +47,15 @@ class ReferenceCountOpHandle : public OpHandleBase {
const std::vector<std::string> &var_names, const std::vector<std::string> &var_names,
GarbageCollector<Tensor> *gc, GarbageCollector<Tensor> *gc,
AtomicReferenceCountMap *ref_cnts) AtomicReferenceCountMap *ref_cnts)
: OpHandleBase(node), : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) {
scope_(scope),
var_names_(var_names),
gc_(gc),
ref_cnts_(ref_cnts) {
dev_ctx_ = static_cast<platform::CUDADeviceContext *>( dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place)); platform::DeviceContextPool::Instance().Get(place));
if (IsStreamGarabageCollector()) { if (IsStreamGarabageCollector()) {
PADDLE_ENFORCE(cudaSetDevice(place.device)); PADDLE_ENFORCE(cudaSetDevice(place.device));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
} }
for (auto &name : var_names) AddVar(name);
} }
~ReferenceCountOpHandle() { ~ReferenceCountOpHandle() {
...@@ -69,19 +68,35 @@ class ReferenceCountOpHandle : public OpHandleBase { ...@@ -69,19 +68,35 @@ class ReferenceCountOpHandle : public OpHandleBase {
std::string Name() const override { return "reference_count"; } std::string Name() const override { return "reference_count"; }
void AddVar(const std::string &name) {
auto it = var_names_.find(name);
if (it != var_names_.end())
++(it->second);
else
var_names_[name] = 1;
}
protected: protected:
void RunImpl() override { void RunImpl() override {
auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(); auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
std::vector<LoDTensor *> tensors; std::vector<Tensor *> tensors;
for (auto &name : var_names_) { for (auto &pair : var_names_) {
auto &name = pair.first;
auto it = ref_cnts_->find(name); auto it = ref_cnts_->find(name);
if (it == ref_cnts_->end()) continue; if (it == ref_cnts_->end()) continue;
auto *var = exec_scope->FindVar(name); auto *var = exec_scope->FindVar(name);
if (var == nullptr || !var->IsType<LoDTensor>()) continue; if (var == nullptr) continue;
if (it->second.fetch_sub(1) <= 1) { if (var->IsType<LoDTensor>()) {
tensors.emplace_back(var->GetMutable<LoDTensor>()); if (it->second.fetch_sub(pair.second) <= pair.second) {
tensors.emplace_back(var->GetMutable<LoDTensor>());
}
} else if (var->IsType<SelectedRows>()) {
if (it->second.fetch_sub(pair.second) <= pair.second) {
tensors.emplace_back(
var->GetMutable<SelectedRows>()->mutable_value());
}
} }
} }
...@@ -91,7 +106,7 @@ class ReferenceCountOpHandle : public OpHandleBase { ...@@ -91,7 +106,7 @@ class ReferenceCountOpHandle : public OpHandleBase {
} }
private: private:
void ClearTensors(const std::vector<LoDTensor *> &tensors) { void ClearTensors(const std::vector<Tensor *> &tensors) {
auto *gc = dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_); auto *gc = dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_);
if (gc != nullptr) { if (gc != nullptr) {
auto compute_stream = dev_ctx_->stream(); auto compute_stream = dev_ctx_->stream();
...@@ -112,7 +127,7 @@ class ReferenceCountOpHandle : public OpHandleBase { ...@@ -112,7 +127,7 @@ class ReferenceCountOpHandle : public OpHandleBase {
const Scope *scope_; const Scope *scope_;
platform::CUDADeviceContext *dev_ctx_; platform::CUDADeviceContext *dev_ctx_;
std::vector<std::string> var_names_; std::unordered_map<std::string, int> var_names_;
GarbageCollector<Tensor> *gc_; // not own GarbageCollector<Tensor> *gc_; // not own
AtomicReferenceCountMap *ref_cnts_; // not own AtomicReferenceCountMap *ref_cnts_; // not own
cudaEvent_t event_; cudaEvent_t event_;
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <queue>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -23,6 +24,25 @@ namespace paddle { ...@@ -23,6 +24,25 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
static ComputationOpHandle *FindNextComputationOpHandle(VarHandle *var_in) {
std::queue<VarHandleBase *> queue;
queue.push(var_in);
do {
auto *var = queue.front();
queue.pop();
for (auto *op : var->PendingOps()) {
auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) {
return compute_op;
}
for (auto *out_var : op->Outputs()) {
queue.push(out_var);
}
}
} while (!queue.empty());
return nullptr;
}
std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
auto &ref_cnts = Get<DeviceReferenceCountMap>(kGlobalReferenceCount); auto &ref_cnts = Get<DeviceReferenceCountMap>(kGlobalReferenceCount);
...@@ -34,6 +54,9 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( ...@@ -34,6 +54,9 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
// Step 2: Find all variables in non-computation ops which refers to variables // Step 2: Find all variables in non-computation ops which refers to variables
// in computation ops // in computation ops
std::unordered_set<std::string> names; std::unordered_set<std::string> names;
std::unordered_map<OpHandleBase *, std::unique_ptr<ReferenceCountOpHandle>>
compute_ref_cnt_map;
auto get_ref_cnts_from_compute_op = [&]( auto get_ref_cnts_from_compute_op = [&](
const std::unique_ptr<OpHandleBase> &op, const std::unique_ptr<OpHandleBase> &op,
const std::vector<VarHandleBase *> &vars) { const std::vector<VarHandleBase *> &vars) {
...@@ -54,15 +77,18 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( ...@@ -54,15 +77,18 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
VarDesc *var_desc = var_handle->Node()->Var(); VarDesc *var_desc = var_handle->Node()->Var();
auto var_name = var_handle->Node()->Name(); auto var_name = var_handle->Node()->Name();
// This is wierd but there is really some variables without var_desc // This is weird but there is really some variables without var_desc
// in computation_op // in computation_op
if (var_desc == nullptr) { if (var_desc == nullptr) {
if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr) if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr)
continue; continue;
} else { } else {
if (var_desc->Persistable() || if (var_desc->Persistable()) continue;
var_desc->Proto()->type().type() != proto::VarType::LOD_TENSOR) auto var_type = var_desc->Proto()->type().type();
if (var_type != proto::VarType::LOD_TENSOR &&
var_type != proto::VarType::SELECTED_ROWS) {
continue; continue;
}
} }
// compute op only runs in one device // compute op only runs in one device
...@@ -93,12 +119,33 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( ...@@ -93,12 +119,33 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
if (ref_cnts.count(place.device) && if (ref_cnts.count(place.device) &&
ref_cnts[place.device]->count(var_name)) { ref_cnts[place.device]->count(var_name)) {
++(*ref_cnts[place.device])[var_name]; ++(*ref_cnts[place.device])[var_name];
auto *next_compute_op = FindNextComputationOpHandle(var_handle);
if (next_compute_op != nullptr) {
if (compute_ref_cnt_map.count(next_compute_op)) {
compute_ref_cnt_map[next_compute_op]->AddVar(var_name);
VLOG(5) << "Add reference count of " << var_name << " to Operator "
<< next_compute_op->Name();
} else {
// Create new reference_count_op_handle
ir::Node *ref_cnt_node = graph->CreateEmptyNode(
"reference_count", ir::Node::Type::kOperation);
auto *ref_cnt_handle = new ReferenceCountOpHandle(
ref_cnt_node, next_compute_op->GetScope(), place, {var_name},
gcs[place.device].get(), cur_ref_cnts[place.device].get());
if (next_compute_op->Outputs().empty()) {
auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
next_compute_op->AddOutput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
}
ref_cnt_handle->AddInput(next_compute_op->Outputs().front());
compute_ref_cnt_map[next_compute_op].reset(ref_cnt_handle);
}
}
} }
} }
}; };
std::unordered_map<OpHandleBase *, ReferenceCountOpHandle *>
compute_ref_cnt_map;
auto &all_ops = graph->Get<GraphOps>(kGraphOps); auto &all_ops = graph->Get<GraphOps>(kGraphOps);
for (auto &op : all_ops) { for (auto &op : all_ops) {
auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs()); auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs());
...@@ -113,11 +160,13 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( ...@@ -113,11 +160,13 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
auto *ref_cnt_handle = new ReferenceCountOpHandle( auto *ref_cnt_handle = new ReferenceCountOpHandle(
ref_cnt_node, compute_op->GetScope(), place, in_var_names, ref_cnt_node, compute_op->GetScope(), place, in_var_names,
gcs[place.device].get(), cur_ref_cnts[place.device].get()); gcs[place.device].get(), cur_ref_cnts[place.device].get());
auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); if (compute_op->Outputs().empty()) {
compute_op->AddOutput(dep_var); auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
ref_cnt_handle->AddInput(dep_var); compute_op->AddOutput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var); graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
compute_ref_cnt_map[compute_op] = ref_cnt_handle; }
ref_cnt_handle->AddInput(compute_op->Outputs().front());
compute_ref_cnt_map[compute_op].reset(ref_cnt_handle);
} }
for (auto &op : all_ops) { for (auto &op : all_ops) {
...@@ -131,7 +180,11 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( ...@@ -131,7 +180,11 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
new_all_ops.emplace_back(std::move(op)); new_all_ops.emplace_back(std::move(op));
auto it = compute_ref_cnt_map.find(new_all_ops.back().get()); auto it = compute_ref_cnt_map.find(new_all_ops.back().get());
if (it != compute_ref_cnt_map.end()) { if (it != compute_ref_cnt_map.end()) {
new_all_ops.emplace_back(it->second); // Add LeafNode to ReferenceCountOpHandle
auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
it->second->AddOutput(dummy_leaf);
new_all_ops.emplace_back(std::move(it->second));
} }
} }
......
...@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
// logging.h and windows.h conflict
#define GLOG_NO_ABBREVIATED_SEVERITIES
// solve static linking error in windows
// https://github.com/google/glog/issues/301
#define GOOGLE_GLOG_DLL_DECL
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
...@@ -46,11 +51,13 @@ struct EigenTensor { ...@@ -46,11 +51,13 @@ struct EigenTensor {
using ConstType = using ConstType =
Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>; Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
static Type From(Tensor& tensor, DDim dims) { static Type From(Tensor& tensor, DDim dims) { // NOLINT
return Type(tensor.data<T>(), EigenDim<D>::From(dims)); return Type(tensor.data<T>(), EigenDim<D>::From(dims));
} }
static Type From(Tensor& tensor) { return From(tensor, tensor.dims_); } static Type From(Tensor& tensor) { // NOLINT
return From(tensor, tensor.dims_);
} // NOLINT
static ConstType From(const Tensor& tensor, DDim dims) { static ConstType From(const Tensor& tensor, DDim dims) {
return ConstType(tensor.data<T>(), EigenDim<D>::From(dims)); return ConstType(tensor.data<T>(), EigenDim<D>::From(dims));
...@@ -64,7 +71,8 @@ struct EigenTensor { ...@@ -64,7 +71,8 @@ struct EigenTensor {
template <typename T, int MajorType = Eigen::RowMajor, template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> { struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
static typename EigenMatrix::Type Reshape(Tensor& tensor, int num_col_dims) { static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT
int num_col_dims) {
int rank = tensor.dims_.size(); int rank = tensor.dims_.size();
PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
"`num_col_dims` must be between (0, rank_of_tensor)."); "`num_col_dims` must be between (0, rank_of_tensor).");
...@@ -86,11 +94,12 @@ template <typename T, int MajorType = Eigen::RowMajor, ...@@ -86,11 +94,12 @@ template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> { struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
// Flatten reshapes a Tensor into an EigenVector. // Flatten reshapes a Tensor into an EigenVector.
static typename EigenVector::Type Flatten(Tensor& tensor) { static typename EigenVector::Type Flatten(Tensor& tensor) { // NOLINT
return EigenVector::From(tensor, {product(tensor.dims_)}); return EigenVector::From(tensor, {product(tensor.dims_)});
} }
static typename EigenVector::ConstType Flatten(const Tensor& tensor) { static typename EigenVector::ConstType Flatten(
const Tensor& tensor) { // NOLINT
return EigenVector::From(tensor, {product(tensor.dims_)}); return EigenVector::From(tensor, {product(tensor.dims_)});
} }
}; };
...@@ -104,7 +113,7 @@ struct EigenScalar { ...@@ -104,7 +113,7 @@ struct EigenScalar {
using ConstType = Eigen::TensorMap< using ConstType = Eigen::TensorMap<
Eigen::TensorFixedSize<const T, Eigen::Sizes<>, MajorType, IndexType>>; Eigen::TensorFixedSize<const T, Eigen::Sizes<>, MajorType, IndexType>>;
static Type From(Tensor& tensor) { return Type(tensor.data<T>()); } static Type From(Tensor& tensor) { return Type(tensor.data<T>()); } // NOLINT
static ConstType From(const Tensor& tensor) { static ConstType From(const Tensor& tensor) {
return ConstType(tensor.data<T>()); return ConstType(tensor.data<T>());
......
...@@ -26,8 +26,6 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl( ...@@ -26,8 +26,6 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
PADDLE_ENFORCE(graph.get()); PADDLE_ENFORCE(graph.get());
FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get()); FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get());
std::unordered_set<Node*> nodes2delete;
GraphPatternDetector gpd; GraphPatternDetector gpd;
auto* conv_input = gpd.mutable_pattern() auto* conv_input = gpd.mutable_pattern()
->NewNode("conv_relu_mkldnn_fuse/conv_input") ->NewNode("conv_relu_mkldnn_fuse/conv_input")
...@@ -42,36 +40,20 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl( ...@@ -42,36 +40,20 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
Graph* g) { Graph* g) {
VLOG(4) << "handle ConvReLU fuse"; VLOG(4) << "handle ConvReLU fuse";
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
conv_relu_pattern); // Filter conv_relu_pattern); // Filter
GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern); // Bias GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp
GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op
GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out
GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op
// Create an ConvReLU Node. // Transform Conv node into ConvReLU node.
OpDesc desc; OpDesc* desc = conv->Op();
std::string conv_relu_i_in = subgraph.at(conv_input)->Name(); desc->SetOutput("Output", std::vector<std::string>({relu_out->Name()}));
std::string conv_relu_w_in = conv_weight->Name(); desc->SetAttr("fuse_relu", true);
std::string conv_relu_b_in = conv_bias->Name(); GraphSafeRemoveNodes(graph.get(), {relu, conv_out});
std::string conv_relu_out = relu_out->Name();
desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
desc.SetType("conv2d");
for (auto& attr : conv->Op()->GetAttrMap()) {
desc.SetAttr(attr.first, attr.second);
}
desc.SetAttr("fuse_relu", true);
auto conv_relu_node = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out});
PADDLE_ENFORCE(subgraph.count(conv_input)); PADDLE_ENFORCE(subgraph.count(conv_input));
IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node); IR_NODE_LINK_TO(conv, relu_out);
IR_NODE_LINK_TO(conv_weight, conv_relu_node);
IR_NODE_LINK_TO(conv_bias, conv_relu_node);
IR_NODE_LINK_TO(conv_relu_node, relu_out);
found_conv_relu_count++; found_conv_relu_count++;
}; };
......
...@@ -85,16 +85,13 @@ TEST(ConvReLUFusePass, basic) { ...@@ -85,16 +85,13 @@ TEST(ConvReLUFusePass, basic) {
for (auto* node : graph->Nodes()) { for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()->Type() == "conv2d") { if (node->IsOp() && node->Op()->Type() == "conv2d") {
if (node->Op()->HasAttr("use_mkldnn")) { auto* op = node->Op();
bool use_mkldnn = boost::get<bool>(node->Op()->GetAttr("use_mkldnn")); ASSERT_TRUE(op->HasAttr("use_mkldnn"));
if (use_mkldnn) { EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
if (node->Op()->HasAttr("fuse_relu")) { ASSERT_TRUE(op->HasAttr("fuse_relu"));
bool fuse_relu = boost::get<bool>(node->Op()->GetAttr("fuse_relu")); bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu"));
if (fuse_relu) { if (fuse_relu) {
++conv_relu_count; ++conv_relu_count;
}
}
}
} }
} }
} }
......
...@@ -638,11 +638,6 @@ PDNode *patterns::ConvReLU::operator()( ...@@ -638,11 +638,6 @@ PDNode *patterns::ConvReLU::operator()(
->AsInput() ->AsInput()
->assert_is_persistable_var() ->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Filter"); ->assert_is_op_input("conv2d", "Filter");
// Bias
auto *conv_bias_var = pattern->NewNode(conv_bias_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Bias");
// intermediate variable, will be removed in the IR after fuse. // intermediate variable, will be removed in the IR after fuse.
auto *conv_out_var = pattern->NewNode(conv_out_repr()) auto *conv_out_var = pattern->NewNode(conv_out_repr())
->AsIntermediate() ->AsIntermediate()
...@@ -653,8 +648,7 @@ PDNode *patterns::ConvReLU::operator()( ...@@ -653,8 +648,7 @@ PDNode *patterns::ConvReLU::operator()(
->AsOutput() ->AsOutput()
->assert_is_op_output("relu"); ->assert_is_op_output("relu");
conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var}) conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
.LinksTo({conv_out_var});
relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var}); relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var});
return relu_out_var; return relu_out_var;
} }
......
...@@ -379,7 +379,7 @@ struct PatternBase { ...@@ -379,7 +379,7 @@ struct PatternBase {
// op: conv + relu // op: conv + relu
// named nodes: // named nodes:
// conv_input, conv_weight, // conv_input, conv_weight,
// conv_bias, conv_out, conv, // conv_out, conv,
// relu_out, relu // relu_out, relu
struct ConvReLU : public PatternBase { struct ConvReLU : public PatternBase {
ConvReLU(PDPattern* pattern, const std::string& name_scope) ConvReLU(PDPattern* pattern, const std::string& name_scope)
...@@ -392,7 +392,6 @@ struct ConvReLU : public PatternBase { ...@@ -392,7 +392,6 @@ struct ConvReLU : public PatternBase {
PATTERN_DECL_NODE(relu); PATTERN_DECL_NODE(relu);
// declare variable node's name // declare variable node's name
PATTERN_DECL_NODE(conv_weight); PATTERN_DECL_NODE(conv_weight);
PATTERN_DECL_NODE(conv_bias);
PATTERN_DECL_NODE(conv_out); PATTERN_DECL_NODE(conv_out);
PATTERN_DECL_NODE(relu_out); PATTERN_DECL_NODE(relu_out);
}; };
......
...@@ -17,12 +17,10 @@ ...@@ -17,12 +17,10 @@
#include <algorithm> #include <algorithm>
#include <initializer_list> #include <initializer_list>
#include <memory> #include <memory>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/memcpy.h"
#include "glog/logging.h" #include "glog/logging.h"
...@@ -30,401 +28,206 @@ namespace paddle { ...@@ -30,401 +28,206 @@ namespace paddle {
namespace framework { namespace framework {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
namespace details {
struct CUDABuffer {
void *data_{nullptr};
size_t size_{0};
platform::CUDAPlace place_;
CUDABuffer() {}
CUDABuffer(platform::Place place, size_t size)
: size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
data_ = memory::Alloc(place_, size);
}
~CUDABuffer() { ClearMemory(); }
CUDABuffer(const CUDABuffer &o) = delete;
CUDABuffer &operator=(const CUDABuffer &o) = delete;
void Resize(platform::Place place, size_t size) {
ClearMemory();
place_ = boost::get<platform::CUDAPlace>(place);
data_ = memory::Alloc(place_, size);
size_ = size;
}
void Swap(CUDABuffer &o) {
std::swap(data_, o.data_);
std::swap(place_, o.place_);
std::swap(size_, o.size_);
}
private:
void ClearMemory() const {
if (data_) {
memory::Free(place_, data_);
}
}
};
} // namespace details
// Vector<T> implements the std::vector interface, and can get Data or // Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside. // MutableData from any place. The data will be synced implicitly inside.
template <typename T> template <typename T>
class Vector { class Vector {
public: public:
using value_type = T; using value_type = T;
using iterator = typename std::vector<T>::iterator;
using const_iterator = typename std::vector<T>::const_iterator;
private:
// The actual class to implement vector logic
class VectorData {
public:
VectorData() : flag_(kDataInCPU) {}
VectorData(size_t count, const T &value)
: cpu_(count, value), flag_(kDataInCPU) {}
VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
template <typename U>
explicit VectorData(const std::vector<U> &dat)
: cpu_(dat), flag_(kDataInCPU) {}
VectorData(const VectorData &o) {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
}
VectorData &operator=(const VectorData &o) {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
details::CUDABuffer null;
gpu_.Swap(null);
return *this;
}
T &operator[](size_t i) {
MutableCPU();
return cpu_[i];
}
const T &operator[](size_t i) const {
ImmutableCPU();
return cpu_[i];
}
size_t size() const { return cpu_.size(); }
iterator begin() {
MutableCPU();
return cpu_.begin();
}
iterator end() {
MutableCPU();
return cpu_.end();
}
T &front() {
MutableCPU();
return cpu_.front();
}
T &back() {
MutableCPU();
return cpu_.back();
}
const_iterator begin() const {
ImmutableCPU();
return cpu_.begin();
}
const_iterator end() const {
ImmutableCPU();
return cpu_.end();
}
const T &back() const {
ImmutableCPU();
return cpu_.back();
}
T *data() { return &(*this)[0]; }
const T *data() const { return &(*this)[0]; }
const T &front() const {
ImmutableCPU();
return cpu_.front();
}
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
MutableCPU();
cpu_.assign(begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) {
MutableCPU();
cpu_.push_back(elem);
}
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
MutableCPU();
auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
std::copy(begin, end, out_it);
}
// resize the vector
void resize(size_t size) {
MutableCPU();
cpu_.resize(size);
}
// get cuda ptr. immutable
const T *CUDAData(platform::Place place) const {
PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place");
ImmutableCUDA(place);
return reinterpret_cast<T *>(gpu_.data_);
}
// get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) {
const T *ptr = CUDAData(place);
flag_ = kDirty | kDataInCUDA;
return const_cast<T *>(ptr);
}
// clear
void clear() {
cpu_.clear();
flag_ = kDirty | kDataInCPU;
}
size_t capacity() const { return cpu_.capacity(); }
// reserve data
void reserve(size_t size) { cpu_.reserve(size); }
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const {
ImmutableCPU();
return cpu_;
}
bool operator==(const VectorData &other) const {
ImmutableCPU();
other.ImmutableCPU();
return cpu_ == other.cpu_;
}
private:
enum DataFlag {
kDataInCPU = 0x01,
kDataInCUDA = 0x02,
// kDirty means the data has been changed in one device.
kDirty = 0x10
};
void CopyToCPU() const {
// COPY GPU Data To CPU
void *src = gpu_.data_;
void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
nullptr);
}
void MutableCPU() {
if (IsInCUDA() && IsDirty()) {
CopyToCPU();
}
flag_ = kDirty | kDataInCPU;
}
void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) {
if (IsInCPU()) {
CopyCPUDataToCUDA(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() &&
!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
CopyCUDADataToAnotherPlace(place);
// Still dirty
} else {
// Dirty && DataInCUDA && Device is same
// Do nothing
}
} else {
if (!IsInCUDA()) {
// Even data is not dirty. However, data is not in CUDA. Copy data.
CopyCPUDataToCUDA(place);
SetFlag(kDataInCUDA);
} else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
CopyCUDADataToAnotherPlace(place);
} else {
// Not Dirty && DataInCUDA && Device is same
// Do nothing.
}
}
}
void CopyCUDADataToAnotherPlace(const platform::Place &place) const {
details::CUDABuffer tmp(place, gpu_.size_);
const void *src = gpu_.data_;
void *dst = tmp.data_;
memory::Copy(tmp.place_, dst, gpu_.place_, src, gpu_.size_, nullptr);
gpu_.Swap(tmp);
}
void CopyCPUDataToCUDA(const platform::Place &place) const {
void *src = cpu_.data();
gpu_.Resize(place, cpu_.size() * sizeof(T));
void *dst = gpu_.data_;
auto stream = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
stream);
}
void ImmutableCPU() const {
if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or
// CPU has no data.
CopyToCPU();
UnsetFlag(kDirty);
}
SetFlag(kDataInCPU);
}
void UnsetFlag(int flag) const { flag_ &= ~flag; }
void SetFlag(int flag) const { flag_ |= flag; }
bool IsDirty() const { return flag_ & kDirty; }
bool IsInCUDA() const { return flag_ & kDataInCUDA; }
bool IsInCPU() const { return flag_ & kDataInCPU; }
mutable std::vector<T> cpu_;
mutable details::CUDABuffer gpu_;
mutable int flag_;
};
public:
// Default ctor. Create empty Vector // Default ctor. Create empty Vector
Vector() : m_(new VectorData()) {} Vector() { InitEmpty(); }
// Fill vector with value. The vector size is `count`. // Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T &value = T()) explicit Vector(size_t count, const T &value = T()) {
: m_(new VectorData(count, value)) {} InitEmpty();
if (count != 0) {
resize(count);
T *ptr = begin();
for (size_t i = 0; i < count; ++i) {
ptr[i] = value;
}
}
}
// Ctor with init_list // Ctor with init_list
Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {} Vector(std::initializer_list<T> init) {
if (init.size() == 0) {
InitEmpty();
} else {
InitByIter(init.size(), init.begin(), init.end());
}
}
// implicit cast from std::vector. // implicit cast from std::vector.
template <typename U> template <typename U>
Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) { // NOLINT Vector(const std::vector<U> &dat) { // NOLINT
if (dat.size() == 0) {
InitEmpty();
} else {
InitByIter(dat.size(), dat.begin(), dat.end());
}
} }
// Copy ctor // Copy ctor
Vector(const Vector<T> &other) { m_ = other.m_; } Vector(const Vector<T> &other) { this->operator=(other); }
// Copy operator // Copy operator
Vector<T> &operator=(const Vector<T> &other) { Vector<T> &operator=(const Vector<T> &other) {
m_ = other.m_; if (other.size() != 0) {
this->InitByIter(other.size(), other.begin(), other.end());
} else {
InitEmpty();
}
return *this; return *this;
} }
// Move ctor // Move ctor
Vector(Vector<T> &&other) { m_ = std::move(other.m_); } Vector(Vector<T> &&other) {
this->size_ = other.size_;
this->flag_ = other.flag_;
if (other.cuda_vec_.memory_size()) {
this->cuda_vec_.ShareDataWith(other.cuda_vec_);
}
if (other.cpu_vec_.memory_size()) {
this->cpu_vec_.ShareDataWith(other.cpu_vec_);
}
}
// CPU data access method. Mutable. // CPU data access method. Mutable.
T &operator[](size_t i) { return (*m_)[i]; } T &operator[](size_t i) {
MutableCPU();
return const_cast<T *>(cpu_vec_.data<T>())[i];
}
// CPU data access method. Immutable. // CPU data access method. Immutable.
const T &operator[](size_t i) const { return (*m_)[i]; } const T &operator[](size_t i) const {
ImmutableCPU();
return cpu_vec_.data<T>()[i];
}
// std::vector iterator methods. Based on CPU data access method // std::vector iterator methods. Based on CPU data access method
size_t size() const { return m_->size(); } size_t size() const { return size_; }
iterator begin() { return m_->begin(); } T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
iterator end() { return m_->end(); } T *end() {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
T &front() { return m_->front(); } T &front() { return *begin(); }
T &back() { return m_->back(); } T &back() {
auto it = end();
--it;
return *it;
}
const_iterator begin() const { return m_->begin(); } const T *begin() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
}
const_iterator end() const { return m_->end(); } const T *end() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
const_iterator cbegin() const { return begin(); } const T *cbegin() const { return begin(); }
const_iterator cend() const { return end(); } const T *cend() const { return end(); }
const T &back() const { return m_->back(); } const T &back() const {
auto it = end();
--it;
return *it;
}
T *data() { return m_->data(); } T *data() { return begin(); }
const T *data() const { return m_->data(); } const T *data() const { return begin(); }
const T &front() const { return m_->front(); } const T &front() const { return *begin(); }
// end of std::vector iterator methods // end of std::vector iterator methods
// assign this from iterator. // assign this from iterator.
// NOTE: the iterator must support `end-begin` // NOTE: the iterator must support `end-begin`
template <typename Iter> template <typename Iter>
void assign(Iter begin, Iter end) { void assign(Iter begin, Iter end) {
m_->assign(begin, end); InitByIter(end - begin, begin, end);
} }
// push_back. If the previous capacity is not enough, the memory will // push_back. If the previous capacity is not enough, the memory will
// double. // double.
void push_back(T elem) { m_->push_back(elem); } void push_back(T elem) {
if (size_ + 1 > capacity()) {
reserve((size_ + 1) << 1);
}
*end() = elem;
++size_;
}
// extend a vector by iterator. // extend a vector by iterator.
// NOTE: the iterator must support end-begin // NOTE: the iterator must support end-begin
template <typename It> template <typename It>
void Extend(It begin, It end) { void Extend(It begin, It end) {
m_->Extend(begin, end); size_t pre_size = size_;
resize(pre_size + (end - begin));
T *ptr = this->begin() + pre_size;
for (; begin < end; ++begin, ++ptr) {
*ptr = *begin;
}
} }
// resize the vector // resize the vector
void resize(size_t size) { void resize(size_t size) {
if (m_.Data().size() != size) { if (size + 1 <= capacity()) {
m_->resize(size); size_ = size;
} else {
MutableCPU();
Tensor cpu_tensor;
platform::Place cpu = platform::CPUPlace();
T *ptr = cpu_tensor.mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
const T *old_ptr =
cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
if (old_ptr != nullptr) {
std::copy(old_ptr, old_ptr + size_, ptr);
}
size_ = size;
cpu_vec_.ShareDataWith(cpu_tensor);
} }
} }
// get cuda ptr. immutable // get cuda ptr. immutable
const T *CUDAData(platform::Place place) const { const T *CUDAData(platform::Place place) const {
return m_.Data().CUDAData(place); PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place");
ImmutableCUDA(place);
return cuda_vec_.data<T>();
} }
// get cuda ptr. mutable // get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) { T *CUDAMutableData(platform::Place place) {
return m_->CUDAMutableData(place); const T *ptr = CUDAData(place);
flag_ = kDirty | kDataInCUDA;
return const_cast<T *>(ptr);
} }
// clear // clear
void clear() { m_->clear(); } void clear() {
size_ = 0;
flag_ = kDirty | kDataInCPU;
}
size_t capacity() const { return m_->capacity(); } size_t capacity() const {
return cpu_vec_.memory_size() / SizeOfType(typeid(T));
}
// reserve data // reserve data
void reserve(size_t size) { m_->reserve(size); } void reserve(size_t size) {
size_t pre_size = size_;
resize(size);
resize(pre_size);
}
// the unify method to access CPU or CUDA data. immutable. // the unify method to access CPU or CUDA data. immutable.
const T *Data(platform::Place place) const { const T *Data(platform::Place place) const {
...@@ -445,7 +248,12 @@ class Vector { ...@@ -445,7 +248,12 @@ class Vector {
} }
// implicit cast operator. Vector can be cast to std::vector implicitly. // implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const { return *m_; } operator std::vector<T>() const {
std::vector<T> result;
result.resize(size());
std::copy(begin(), end(), result.begin());
return result;
}
bool operator==(const Vector<T> &other) const { bool operator==(const Vector<T> &other) const {
if (size() != other.size()) return false; if (size() != other.size()) return false;
...@@ -459,11 +267,118 @@ class Vector { ...@@ -459,11 +267,118 @@ class Vector {
return true; return true;
} }
const void *Handle() const { return &m_.Data(); }
private: private:
// Vector is an COW object. void InitEmpty() {
details::COWPtr<VectorData> m_; size_ = 0;
flag_ = kDataInCPU;
}
template <typename Iter>
void InitByIter(size_t size, Iter begin, Iter end) {
platform::Place cpu = platform::CPUPlace();
T *ptr = this->cpu_vec_.template mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
for (size_t i = 0; i < size; ++i) {
*ptr++ = *begin++;
}
flag_ = kDataInCPU | kDirty;
size_ = size;
}
enum DataFlag {
kDataInCPU = 0x01,
kDataInCUDA = 0x02,
// kDirty means the data has been changed in one device.
kDirty = 0x10
};
void CopyToCPU() const {
// COPY GPU Data To CPU
TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
WaitPlace(cuda_vec_.place());
}
void MutableCPU() {
if (IsInCUDA() && IsDirty()) {
CopyToCPU();
}
flag_ = kDirty | kDataInCPU;
}
void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) {
if (IsInCPU()) {
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() && !(place == cuda_vec_.place())) {
framework::Tensor tmp;
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
cuda_vec_.ShareDataWith(tmp);
// Still dirty
} else {
// Dirty && DataInCUDA && Device is same
// Do nothing
}
} else {
if (!IsInCUDA()) {
// Even data is not dirty. However, data is not in CUDA. Copy data.
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
SetFlag(kDataInCUDA);
} else if (!(place == cuda_vec_.place())) {
framework::Tensor tmp;
WaitPlace(cuda_vec_.place());
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
WaitPlace(place);
cuda_vec_.ShareDataWith(tmp);
} else {
// Not Dirty && DataInCUDA && Device is same
// Do nothing.
}
}
}
void ImmutableCPU() const {
if (IsDirty() &&
!IsInCPU()) { // If data has been changed in CUDA, or CPU has no data.
CopyToCPU();
UnsetFlag(kDirty);
}
SetFlag(kDataInCPU);
}
void UnsetFlag(int flag) const { flag_ &= ~flag; }
void SetFlag(int flag) const { flag_ |= flag; }
bool IsDirty() const { return flag_ & kDirty; }
bool IsInCUDA() const { return flag_ & kDataInCUDA; }
bool IsInCPU() const { return flag_ & kDataInCPU; }
static void WaitPlace(const platform::Place place) {
if (platform::is_gpu_place(place)) {
platform::DeviceContextPool::Instance()
.Get(boost::get<platform::CUDAPlace>(place))
->Wait();
}
}
static T &EmptyDummy() {
static T dummy = T();
return dummy;
}
mutable int flag_;
mutable Tensor cpu_vec_;
mutable Tensor cuda_vec_;
size_t size_;
}; };
#else // PADDLE_WITH_CUDA #else // PADDLE_WITH_CUDA
......
...@@ -54,6 +54,10 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -54,6 +54,10 @@ class CompileTimeInferShapeContext : public InferShapeContext {
size_t j = 0) const override { size_t j = 0) const override {
PADDLE_ENFORCE_LT(i, Inputs(in).size()); PADDLE_ENFORCE_LT(i, Inputs(in).size());
PADDLE_ENFORCE_LT(j, Outputs(out).size()); PADDLE_ENFORCE_LT(j, Outputs(out).size());
PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName,
"The %s[%d] is @EMPTY@", in, i);
PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName,
"The %s[%d] is @EMPTY@", out, j);
auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
if (in_var->GetType() != proto::VarType::LOD_TENSOR) { if (in_var->GetType() != proto::VarType::LOD_TENSOR) {
...@@ -63,6 +67,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -63,6 +67,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarType::LOD_TENSOR, PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarType::LOD_TENSOR,
"The %d-th output of Output(%s) must be LoDTensor.", j, "The %d-th output of Output(%s) must be LoDTensor.", j,
out); out);
out_var->SetLoDLevel(in_var->GetLoDLevel()); out_var->SetLoDLevel(in_var->GetLoDLevel());
} }
......
...@@ -38,27 +38,31 @@ struct OpInfo { ...@@ -38,27 +38,31 @@ struct OpInfo {
OpAttrChecker* checker_{nullptr}; OpAttrChecker* checker_{nullptr};
InferVarTypeFN infer_var_type_; InferVarTypeFN infer_var_type_;
InferShapeFN infer_shape_; InferShapeFN infer_shape_;
std::string op_type_;
bool HasOpProtoAndChecker() const { bool HasOpProtoAndChecker() const {
return proto_ != nullptr && checker_ != nullptr; return proto_ != nullptr && checker_ != nullptr;
} }
const proto::OpProto& Proto() const { const proto::OpProto& Proto() const {
PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered"); PADDLE_ENFORCE_NOT_NULL(proto_, "Operator %s Proto has not been registered",
op_type_);
PADDLE_ENFORCE(proto_->IsInitialized(), PADDLE_ENFORCE(proto_->IsInitialized(),
"Operator Proto must be initialized in op info"); "Operator %s Proto must be initialized in op info",
op_type_);
return *proto_; return *proto_;
} }
const OpCreator& Creator() const { const OpCreator& Creator() const {
PADDLE_ENFORCE_NOT_NULL(creator_, PADDLE_ENFORCE_NOT_NULL(
"Operator Creator has not been registered"); creator_, "Operator %s Creator has not been registered", op_type_);
return creator_; return creator_;
} }
const GradOpMakerFN& GradOpMaker() const { const GradOpMakerFN& GradOpMaker() const {
PADDLE_ENFORCE_NOT_NULL(grad_op_maker_, PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
"Operator GradOpMaker has not been registered."); "Operator %s GradOpMaker has not been registered.",
op_type_);
return grad_op_maker_; return grad_op_maker_;
} }
...@@ -73,8 +77,9 @@ class OpInfoMap { ...@@ -73,8 +77,9 @@ class OpInfoMap {
return map_.find(op_type) != map_.end(); return map_.find(op_type) != map_.end();
} }
void Insert(const std::string& type, const OpInfo& info) { void Insert(const std::string& type, OpInfo info) {
PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type); PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
info.op_type_ = type;
map_.insert({type, info}); map_.insert({type, info});
} }
......
...@@ -120,6 +120,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, ...@@ -120,6 +120,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
{static_cast<int>(OpRole::kForward), {static_cast<int>(OpRole::kForward),
static_cast<int>(OpRole::kBackward), static_cast<int>(OpRole::kBackward),
static_cast<int>(OpRole::kOptimize), static_cast<int>(OpRole::kRPC), static_cast<int>(OpRole::kOptimize), static_cast<int>(OpRole::kRPC),
static_cast<int>(OpRole::kDist), static_cast<int>(OpRole::kLRSched),
static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward), static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kLoss) |
static_cast<int>(OpRole::kBackward), static_cast<int>(OpRole::kBackward),
...@@ -131,7 +132,9 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, ...@@ -131,7 +132,9 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namesope.") AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namesope.")
.SetDefault(""); .SetDefault("");
AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
"Callstack for Op Creatation.")
.SetDefault({});
Validate(); Validate();
} }
......
...@@ -26,7 +26,13 @@ enum class OpRole { ...@@ -26,7 +26,13 @@ enum class OpRole {
kForward = 0x0000, kForward = 0x0000,
kBackward = 0x0001, kBackward = 0x0001,
kOptimize = 0x0002, kOptimize = 0x0002,
// RPC role is for send/recv releated op
kRPC = 0x0003, kRPC = 0x0003,
// Dist role is for split_byref/split_selected_rows/concat
// used for distributed training.
kDist = 0x0004,
// Tag all learning rate scheduler operators.
kLRSched = 0x0005,
kLoss = 0x0100, kLoss = 0x0100,
// The default value of op's role. This should be only used for unittests and // The default value of op's role. This should be only used for unittests and
...@@ -40,6 +46,7 @@ class OpProtoAndCheckerMaker { ...@@ -40,6 +46,7 @@ class OpProtoAndCheckerMaker {
static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleAttrName() { return "op_role"; }
static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpRoleVarAttrName() { return "op_role_var"; }
static const char *OpNamescopeAttrName() { return "op_namescope"; } static const char *OpNamescopeAttrName() { return "op_namescope"; }
static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
......
...@@ -23,6 +23,11 @@ limitations under the License. */ ...@@ -23,6 +23,11 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#if defined(_WIN32)
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#define GOOGLE_GLOG_DLL_DECL
#endif
#include "glog/logging.h" // For VLOG() #include "glog/logging.h" // For VLOG()
#include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/details/op_registry.h"
...@@ -241,22 +246,20 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, ...@@ -241,22 +246,20 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
* we will use and tell the compiler to * we will use and tell the compiler to
* link them into target. * link them into target.
*/ */
#define USE_OP_ITSELF(op_type) \ #define USE_OP_ITSELF(op_type) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_itself_##op_type, \ __use_op_itself_##op_type, \
"USE_OP_ITSELF must be called in global namespace"); \ "USE_OP_ITSELF must be called in global namespace"); \
extern int TouchOpRegistrar_##op_type(); \ extern int TouchOpRegistrar_##op_type(); \
static int use_op_itself_##op_type##_ __attribute__((unused)) = \ UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()
TouchOpRegistrar_##op_type()
#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \ #define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ __use_op_kernel_##op_type##_##LIBRARY_TYPE##__, \
"USE_OP_DEVICE_KERNEL must be in global namespace"); \ "USE_OP_DEVICE_KERNEL must be in global namespace"); \
extern int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE(); \ extern int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE(); \
static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_ \ UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_ = \
__attribute__((unused)) = \ TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE()
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE()
// TODO(fengjiayi): The following macros // TODO(fengjiayi): The following macros
// seems ugly, do we have better method? // seems ugly, do we have better method?
......
...@@ -11,15 +11,20 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,15 +11,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include "paddle/fluid/framework/operator.h"
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <algorithm> #include <algorithm>
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -137,19 +142,48 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { ...@@ -137,19 +142,48 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
} }
void OperatorBase::Run(const Scope& scope, const platform::Place& place) { void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(4) << place << " " << DebugStringEx(&scope); try {
if (platform::is_gpu_place(place)) { if (VLOG_IS_ON(4)) {
VLOG(4) << place << " " << DebugStringEx(&scope);
}
if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place); PADDLE_THROW("Cannot run operator on place %s", place);
#else #else
auto dev_id = boost::get<platform::CUDAPlace>(place).device; auto dev_id = boost::get<platform::CUDAPlace>(place).device;
platform::SetDeviceId(dev_id); platform::SetDeviceId(dev_id);
#endif #endif
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
if (VLOG_IS_ON(3)) {
VLOG(3) << place << " " << DebugStringEx(&scope);
}
} catch (platform::EnforceNotMet exception) {
if (Attrs().count("sub_block") != 0) {
throw exception;
}
auto& callstack = Attr<std::vector<std::string>>(
OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (callstack.empty()) {
throw exception;
}
std::ostringstream sout;
sout << "Invoke operator " << Type() << " error.\n";
sout << "Python Callstacks: \n";
for (auto& line : callstack) {
sout << line;
}
sout << "C++ Callstacks: \n";
sout << exception.err_str_;
exception.err_str_ = sout.str();
throw exception;
} catch (...) {
std::rethrow_exception(std::current_exception());
} }
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
VLOG(3) << place << " " << DebugStringEx(&scope);
} }
bool OperatorBase::HasInputs(const std::string& name) const { bool OperatorBase::HasInputs(const std::string& name) const {
...@@ -177,7 +211,7 @@ const std::vector<std::string>& OperatorBase::Inputs( ...@@ -177,7 +211,7 @@ const std::vector<std::string>& OperatorBase::Inputs(
} }
bool OperatorBase::HasOutputs(const std::string& name) const { bool OperatorBase::HasOutputs(const std::string& name) const {
if (outputs_.find(name) != outputs_.end()) { if (outputs_.end() != outputs_.find(name)) {
return true; return true;
} else { } else {
return false; return false;
......
...@@ -20,6 +20,8 @@ limitations under the License. */ ...@@ -20,6 +20,8 @@ limitations under the License. */
#include <tuple> #include <tuple>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include "glog/logging.h" // For VLOG #include "glog/logging.h" // For VLOG
#include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/attribute.h"
......
...@@ -46,6 +46,16 @@ std::vector<DDim> InferShapeContext::GetReaderDims( ...@@ -46,6 +46,16 @@ std::vector<DDim> InferShapeContext::GetReaderDims(
return this->GetRepeatedDims(arg_names[0]); return this->GetRepeatedDims(arg_names[0]);
} }
void InferShapeContext::ShareLoDs(const std::string &in,
const std::string &out) const {
PADDLE_ENFORCE_EQ(Inputs(in).size(), Outputs(out).size(),
"The number of arguments in %s and %s is not equal.", in,
out);
for (size_t i = 0; i < in.size(); ++i) {
ShareLoD(in, out, i, i);
}
}
DDim InferShapeContext::GetInputsElementDim(const std::string &name, DDim InferShapeContext::GetInputsElementDim(const std::string &name,
int idx) const { int idx) const {
const std::vector<std::string> &names = Inputs(name); const std::vector<std::string> &names = Inputs(name);
......
...@@ -56,6 +56,8 @@ class InferShapeContext { ...@@ -56,6 +56,8 @@ class InferShapeContext {
virtual const std::vector<std::string> &Outputs( virtual const std::vector<std::string> &Outputs(
const std::string &name) const = 0; const std::string &name) const = 0;
void ShareLoDs(const std::string &in, const std::string &out) const;
virtual void ShareLoD(const std::string &in, const std::string &out, virtual void ShareLoD(const std::string &in, const std::string &out,
size_t i = 0, size_t j = 0) const = 0; size_t i = 0, size_t j = 0) const = 0;
......
...@@ -71,15 +71,15 @@ bool AnalysisPredictor::Init( ...@@ -71,15 +71,15 @@ bool AnalysisPredictor::Init(
inference_program_ = paddle::inference::Load( inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file); executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else { } else {
LOG(ERROR) << "fail to load inference model."; LOG(ERROR) << "fail to load inference model from " << config_.model_dir;
return false; return false;
} }
OptimizeInferenceProgram(); OptimizeInferenceProgram();
ctx_ = executor_->Prepare(*inference_program_, 0);
if (config_._use_mkldnn) { if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_); executor_->EnableMKLDNN(*inference_program_);
} }
ctx_ = executor_->Prepare(*inference_program_, 0);
VLOG(5) << "to create variables"; VLOG(5) << "to create variables";
PADDLE_ENFORCE(scope_.get()); PADDLE_ENFORCE(scope_.get());
...@@ -109,8 +109,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -109,8 +109,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
} }
argument_.origin_program_desc.reset( argument_.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto())); new ProgramDesc(*inference_program_->Proto()));
PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude, PADDLE_ENFORCE(
"Only kExclude is supported yet."); config_.ir_mode == contrib::AnalysisConfig::IrPassMode::kExclude,
"Only kExclude is supported yet.");
Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_); Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_);
CHECK(argument_.transformed_program_desc); CHECK(argument_.transformed_program_desc);
...@@ -126,8 +127,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -126,8 +127,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
} }
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< std::unique_ptr<PaddlePredictor>
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) { CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
const contrib::AnalysisConfig& config) {
VLOG(3) << "create AnalysisConfig"; VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) { if (config.use_gpu) {
// 1. GPU memeroy // 1. GPU memeroy
...@@ -154,4 +156,11 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -154,4 +156,11 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
return predictor; return predictor;
} }
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
const contrib::AnalysisConfig& config) {
return CreatePaddlePredictor<contrib::AnalysisConfig,
PaddleEngineKind::kAnalysis>(config);
}
} // namespace paddle } // namespace paddle
...@@ -30,7 +30,7 @@ using framework::proto::ProgramDesc; ...@@ -30,7 +30,7 @@ using framework::proto::ProgramDesc;
*/ */
class AnalysisPredictor : public NativePaddlePredictor { class AnalysisPredictor : public NativePaddlePredictor {
public: public:
explicit AnalysisPredictor(const AnalysisConfig& config) explicit AnalysisPredictor(const contrib::AnalysisConfig& config)
: NativePaddlePredictor(config), config_(config) {} : NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope); bool Init(const std::shared_ptr<framework::Scope>& parent_scope);
...@@ -46,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor { ...@@ -46,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor {
Argument& analysis_argument() { return argument_; } Argument& analysis_argument() { return argument_; }
private: private:
AnalysisConfig config_; contrib::AnalysisConfig config_;
Argument argument_; Argument argument_;
}; };
......
...@@ -31,21 +31,24 @@ ...@@ -31,21 +31,24 @@
namespace paddle { namespace paddle {
using paddle::contrib::AnakinConfig;
template <typename Target> template <typename Target>
PaddleInferenceAnakinPredictor<Target>::PaddleInferenceAnakinPredictor( PaddleInferenceAnakinPredictor<Target>::PaddleInferenceAnakinPredictor(
const AnakinConfig &config) { const contrib::AnakinConfig &config) {
CHECK(Init(config)); CHECK(Init(config));
} }
template <> template <>
PaddleInferenceAnakinPredictor<anakin::X86>::PaddleInferenceAnakinPredictor( PaddleInferenceAnakinPredictor<anakin::X86>::PaddleInferenceAnakinPredictor(
const AnakinConfig &config) { const contrib::AnakinConfig &config) {
omp_set_dynamic(0); omp_set_dynamic(0);
omp_set_num_threads(1); omp_set_num_threads(1);
mkl_set_num_threads(1); mkl_set_num_threads(1);
CHECK(Init(config)); CHECK(Init(config));
} }
template <typename Target> template <typename Target>
bool PaddleInferenceAnakinPredictor<Target>::Init(const AnakinConfig &config) { bool PaddleInferenceAnakinPredictor<Target>::Init(
const contrib::AnakinConfig &config) {
if (!(graph_.load(config.model_file))) { if (!(graph_.load(config.model_file))) {
VLOG(3) << "fail to load graph from " << config.model_file; VLOG(3) << "fail to load graph from " << config.model_file;
return false; return false;
...@@ -200,10 +203,11 @@ template class PaddleInferenceAnakinPredictor<anakin::X86>; ...@@ -200,10 +203,11 @@ template class PaddleInferenceAnakinPredictor<anakin::X86>;
// A factory to help create difference predictor. // A factory to help create difference predictor.
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< std::unique_ptr<PaddlePredictor>
AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) { CreatePaddlePredictor<contrib::AnakinConfig, PaddleEngineKind::kAnakin>(
const contrib::AnakinConfig &config) {
VLOG(3) << "Anakin Predictor create."; VLOG(3) << "Anakin Predictor create.";
if (config.target_type == AnakinConfig::NVGPU) { if (config.target_type == contrib::AnakinConfig::NVGPU) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ]."; VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
std::unique_ptr<PaddlePredictor> x( std::unique_ptr<PaddlePredictor> x(
...@@ -213,7 +217,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -213,7 +217,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
LOG(ERROR) << "AnakinConfig::NVGPU could not used in ONLY-CPU environment"; LOG(ERROR) << "AnakinConfig::NVGPU could not used in ONLY-CPU environment";
return nullptr; return nullptr;
#endif #endif
} else if (config.target_type == AnakinConfig::X86) { } else if (config.target_type == contrib::AnakinConfig::X86) {
VLOG(3) << "Anakin Predictor create on [ Intel X86 ]."; VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
std::unique_ptr<PaddlePredictor> x( std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor<anakin::X86>(config)); new PaddleInferenceAnakinPredictor<anakin::X86>(config));
......
...@@ -29,6 +29,8 @@ limitations under the License. */ ...@@ -29,6 +29,8 @@ limitations under the License. */
namespace paddle { namespace paddle {
using contrib::AnakinConfig;
template <typename Target> template <typename Target>
class PaddleInferenceAnakinPredictor : public PaddlePredictor { class PaddleInferenceAnakinPredictor : public PaddlePredictor {
public: public:
......
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/timer.h" #include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -101,14 +102,11 @@ bool NativePaddlePredictor::Init( ...@@ -101,14 +102,11 @@ bool NativePaddlePredictor::Init(
inference_program_ = paddle::inference::Load( inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file); executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else { } else {
LOG(ERROR) << "fail to load inference model."; LOG(ERROR) << "fail to load inference model from " << config_.model_dir;
return false; return false;
} }
ctx_ = executor_->Prepare(*inference_program_, 0); ctx_ = executor_->Prepare(*inference_program_, 0);
if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_);
}
executor_->CreateVariables(*inference_program_, executor_->CreateVariables(*inference_program_,
sub_scope_ ? sub_scope_ : scope_.get(), 0); sub_scope_ ? sub_scope_ : scope_.get(), 0);
...@@ -218,57 +216,20 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -218,57 +216,20 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
template <typename T> template <typename T>
void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch, void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
PaddleTensor *output) { PaddleTensor *output) {
std::vector<int> shape; // set shape.
auto dims_i = fetch.dims(); auto shape = framework::vectorize(fetch.dims());
auto lod = fetch.lod(); output->shape.assign(shape.begin(), shape.end());
const T *output_ptr = fetch.data<T>(); // set data.
auto num = fetch.numel(); const T *data = fetch.data<T>();
std::vector<T> data; int num_elems = inference::VecReduceToInt(shape);
if (0 == lod.size()) { output->data.Resize(num_elems * sizeof(T));
std::copy(output_ptr, output_ptr + num, std::back_inserter(data)); // The fetched tensor output by fetch op, should always in CPU memory, so just
for (int j = 0; j < dims_i.size(); ++j) { // copy.
shape.push_back(dims_i[j]); memcpy(output->data.data(), data, num_elems * sizeof(T));
} // set lod
} else { output->lod.clear();
// for batch detection for (auto &level : fetch.lod()) {
// image[0] -> output[0] shape {145, 6} output->lod.emplace_back(level.begin(), level.end());
// image[1] -> output[1] shape {176, 6}
// then,
// the batch output shape {321, 6}
// the lod {{0, 145, 321}}
// so we should append output[0] to {176, 6}
size_t max_dim = 0;
for (size_t j = 1; j < lod[0].size(); j++) {
max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
}
size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
if (max_dim > 0) {
data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
}
for (size_t j = 1; j < lod[0].size(); j++) {
size_t start = lod[0][j - 1] * common_dim;
size_t end = lod[0][j] * common_dim;
if (end > start) {
std::copy(output_ptr + start, output_ptr + end,
data.begin() + (j - 1) * max_dim * common_dim);
}
}
shape.push_back(lod[0].size() - 1);
shape.push_back(max_dim);
for (int j = 1; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
}
output->shape = shape;
auto &buffer = output->data;
if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) {
buffer.Resize(sizeof(T) * data.size());
}
std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size());
// copy LoD
for (const auto &level : fetch.lod()) {
output->lod.emplace_back(level);
} }
} }
...@@ -330,4 +291,10 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -330,4 +291,10 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
#endif #endif
} }
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<NativeConfig>(
const NativeConfig &config) {
return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
}
} // namespace paddle } // namespace paddle
...@@ -14,6 +14,12 @@ ...@@ -14,6 +14,12 @@
#pragma once #pragma once
// logging.h and windows.h conflict
#define GLOG_NO_ABBREVIATED_SEVERITIES
// solve static linking error in windows
// https://github.com/google/glog/issues/301
#define GOOGLE_GLOG_DLL_DECL
#include <glog/logging.h> #include <glog/logging.h>
#include <map> #include <map>
#include <memory> #include <memory>
......
...@@ -25,10 +25,11 @@ using inference::analysis::Argument; ...@@ -25,10 +25,11 @@ using inference::analysis::Argument;
using inference::Singleton; using inference::Singleton;
using inference::analysis::Analyzer; using inference::analysis::Analyzer;
using framework::proto::ProgramDesc; using framework::proto::ProgramDesc;
using paddle::contrib::MixedRTConfig;
class TensorRTSubgraphPredictor : public NativePaddlePredictor { class TensorRTSubgraphPredictor : public NativePaddlePredictor {
public: public:
explicit TensorRTSubgraphPredictor(const TensorRTConfig& config) explicit TensorRTSubgraphPredictor(const MixedRTConfig& config)
: NativePaddlePredictor(config), config_(config) {} : NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) { bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
...@@ -115,13 +116,13 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { ...@@ -115,13 +116,13 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
} }
private: private:
TensorRTConfig config_; MixedRTConfig config_;
}; };
template <> template <>
std::unique_ptr<PaddlePredictor> std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>( CreatePaddlePredictor<MixedRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
const TensorRTConfig& config) { const MixedRTConfig& config) {
VLOG(3) << "create TensorRTSubgraphPredictor"; VLOG(3) << "create TensorRTSubgraphPredictor";
if (config.use_gpu) { if (config.use_gpu) {
// 1. GPU memeroy // 1. GPU memeroy
...@@ -150,6 +151,13 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>( ...@@ -150,6 +151,13 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
return std::move(predictor); return std::move(predictor);
} }
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<MixedRTConfig>(
const MixedRTConfig& config) {
return CreatePaddlePredictor<MixedRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config);
}
} // namespace paddle } // namespace paddle
USE_TRT_CONVERTER(elementwise_add_weight); USE_TRT_CONVERTER(elementwise_add_weight);
......
...@@ -20,6 +20,8 @@ ...@@ -20,6 +20,8 @@
namespace paddle { namespace paddle {
using contrib::MixedRTConfig;
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
void CompareTensorRTWithFluid(bool enable_tensorrt) { void CompareTensorRTWithFluid(bool enable_tensorrt) {
...@@ -32,7 +34,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { ...@@ -32,7 +34,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
config0.fraction_of_gpu_memory = 0.3; config0.fraction_of_gpu_memory = 0.3;
config0.device = 0; config0.device = 0;
TensorRTConfig config1; MixedRTConfig config1;
config1.model_dir = FLAGS_dirname + "word2vec.inference.model"; config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
config1.use_gpu = true; config1.use_gpu = true;
config1.fraction_of_gpu_memory = 0.3; config1.fraction_of_gpu_memory = 0.3;
...@@ -42,7 +44,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { ...@@ -42,7 +44,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
auto predictor0 = auto predictor0 =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
auto predictor1 = auto predictor1 =
CreatePaddlePredictor<TensorRTConfig, CreatePaddlePredictor<MixedRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config1); PaddleEngineKind::kAutoMixedTensorRT>(config1);
for (int batch_id = 0; batch_id < 1; batch_id++) { for (int batch_id = 0; batch_id < 1; batch_id++) {
......
cmake_minimum_required(VERSION 3.0) cmake_minimum_required(VERSION 3.0)
project(cpp_inference_demo CXX C) project(cpp_inference_demo CXX C)
option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON)
option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF)
option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON)
macro(safe_set_static_flag)
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/MD")
endforeach(flag_var)
endmacro()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if (WIN32) if (WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX "lib") if (WITH_STATIC_LIB)
safe_set_static_flag()
add_definitions(-DSTATIC_LIB)
set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "/w")
set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} "/w")
endif()
set(CMAKE_STATIC_LIBRARY_PREFIX "lib")
else() else()
set(CMAKE_STATIC_LIBRARY_PREFIX "") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
set(CMAKE_STATIC_LIBRARY_PREFIX "")
endif() endif()
message("flags" ${CMAKE_CXX_FLAGS})
if(NOT DEFINED PADDLE_LIB) if(NOT DEFINED PADDLE_LIB)
message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
...@@ -16,14 +35,18 @@ if(NOT DEFINED DEMO_NAME) ...@@ -16,14 +35,18 @@ if(NOT DEFINED DEMO_NAME)
message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
endif() endif()
option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON)
option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF)
option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON)
if(WITH_GPU) if(WITH_GPU)
set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") if(NOT WIN32)
set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
else()
if(CUDA_LIB STREQUAL "")
set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
endif()
endif(NOT WIN32)
endif() endif()
include_directories("D:/Paddle/")
include_directories("${PADDLE_LIB}") include_directories("${PADDLE_LIB}")
include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
include_directories("${PADDLE_LIB}/third_party/install/glog/include") include_directories("${PADDLE_LIB}/third_party/install/glog/include")
...@@ -83,10 +106,18 @@ set(DEPS ${DEPS} ...@@ -83,10 +106,18 @@ set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB} ${MATH_LIB} ${MKLDNN_LIB}
${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
${EXTERNAL_LIB}) ${EXTERNAL_LIB})
# NOTE(dzhwinter) shlwapi is deprecated.
set(DEPS ${DEPS} libcmt shlwapi)
endif(NOT WIN32) endif(NOT WIN32)
if(WITH_GPU) if(WITH_GPU)
set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) if(NOT WIN32)
set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
else()
set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )
set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
endif()
endif() endif()
target_link_libraries(${DEMO_NAME} ${DEPS}) target_link_libraries(${DEMO_NAME} ${DEPS})
...@@ -18,6 +18,8 @@ limitations under the License. */ ...@@ -18,6 +18,8 @@ limitations under the License. */
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <algorithm>
#include <memory> #include <memory>
#include <thread> //NOLINT #include <thread> //NOLINT
#include "paddle/fluid/inference/paddle_inference_api.h" #include "paddle/fluid/inference/paddle_inference_api.h"
...@@ -67,7 +69,8 @@ void Main(bool use_gpu) { ...@@ -67,7 +69,8 @@ void Main(bool use_gpu) {
0.000932706}; 0.000932706};
const size_t num_elements = outputs.front().data.length() / sizeof(float); const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) { for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
i++) {
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i], PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
result[i]); result[i]);
} }
...@@ -113,7 +116,8 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -113,7 +116,8 @@ void MainThreads(int num_threads, bool use_gpu) {
const size_t num_elements = const size_t num_elements =
outputs.front().data.length() / sizeof(float); outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) { for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
i++) {
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i], PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
result[i]); result[i]);
} }
......
# windows inference
本文介绍windows inference,目前只提供了静态编译,编译出paddle_fluid.lib,包含了除openblas.dll之外的所有第三方依赖库。
1. 下载最新的paddle_fluid.lib和openblas.dll,并把它们放在同一个目录下。
2. 准备预训练好的模型文件,例如models中的模型,可以将模型用safe_inference_model接口保存下来。将模型文件放到该目录下
3. 进入Paddle/paddle/fluid/inference/api/demo_ci目录,新建build目录,然后使用cmake生成vs2015的solution文件。
其中PADDLE_LIB是前面的paddle_fluid.lib对应文件夹, CUDA_LIB指定为x64格式下的cuda系统库目录文件夹。
```shell
cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_GPU=ON -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=inference_icnet -DPADDLE_LIB=D:\to_the_paddle_fluid.lib -DCUDA_LIB=D:\tools\v8.0\lib\x64
```
然后用vs2015打开对应的项目文件,注意使用静态链接 "/MT",生成对应的exe。将openblas.dll放到exe所在目录。
4. 该exe即为项目生成文件,可绑定运行。
## FAQ
1. cmake需要您手动下载,并添加到系统路径里
2. 路径中的不要包含空格,例如发现CUDA_LIB路径是Program Files(x86)可能会出错。可以将CUDA拷贝到一个新位置。
...@@ -74,13 +74,17 @@ template <> ...@@ -74,13 +74,17 @@ template <>
std::string to_string<std::vector<std::vector<float>>>( std::string to_string<std::vector<std::vector<float>>>(
const std::vector<std::vector<std::vector<float>>> &vec); const std::vector<std::vector<std::vector<float>>> &vec);
template <typename T>
int VecReduceToInt(const std::vector<T> &v) {
return std::accumulate(v.begin(), v.end(), 1, [](T a, T b) { return a * b; });
}
template <typename T> template <typename T>
static void TensorAssignData(PaddleTensor *tensor, static void TensorAssignData(PaddleTensor *tensor,
const std::vector<std::vector<T>> &data) { const std::vector<std::vector<T>> &data) {
// Assign buffer // Assign buffer
int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, int num_elems = VecReduceToInt(tensor->shape);
[](int a, int b) { return a * b; }); tensor->data.Resize(sizeof(T) * num_elems);
tensor->data.Resize(sizeof(T) * dim);
int c = 0; int c = 0;
for (const auto &f : data) { for (const auto &f : data) {
for (T v : f) { for (T v : f) {
...@@ -89,7 +93,7 @@ static void TensorAssignData(PaddleTensor *tensor, ...@@ -89,7 +93,7 @@ static void TensorAssignData(PaddleTensor *tensor,
} }
} }
std::string DescribeTensor(const PaddleTensor &tensor) { static std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os; std::stringstream os;
os << "Tensor [" << tensor.name << "]\n"; os << "Tensor [" << tensor.name << "]\n";
os << " - type: "; os << " - type: ";
...@@ -113,8 +117,7 @@ std::string DescribeTensor(const PaddleTensor &tensor) { ...@@ -113,8 +117,7 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
os << "\n"; os << "\n";
os << " - data: "; os << " - data: ";
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1, int dim = VecReduceToInt(tensor.shape);
[](int a, int b) { return a * b; });
for (int i = 0; i < dim; i++) { for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " "; os << static_cast<float *>(tensor.data.data())[i] << " ";
} }
...@@ -122,8 +125,8 @@ std::string DescribeTensor(const PaddleTensor &tensor) { ...@@ -122,8 +125,8 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
return os.str(); return os.str();
} }
void PrintTime(int batch_size, int repeat, int num_threads, int tid, static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double latency, int epoch = 1) { double latency, int epoch = 1) {
LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
<< ", threads: " << num_threads << ", thread id: " << tid << ", threads: " << num_threads << ", thread id: " << tid
<< ", latency: " << latency << "ms ======"; << ", latency: " << latency << "ms ======";
......
...@@ -28,34 +28,61 @@ limitations under the License. */ ...@@ -28,34 +28,61 @@ limitations under the License. */
namespace paddle { namespace paddle {
// Data type.
enum PaddleDType { enum PaddleDType {
FLOAT32, FLOAT32,
INT64, INT64,
// TODO(Superjomn) support more data types if needed.
}; };
/*
* Memory menage for PaddleTensor.
* The PaddleBuf holds a buffer for data input or output. The memory can be
* allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
* should be reused for better performance.
*
* For user allocated memory, the following API can be used:
* - PaddleBuf(void* data, size_t length) to set an external memory by
* specifying
* the memory address and length.
* - Reset(void* data, size_t length) to reset the PaddleBuf with an external
* memory.
* ATTENTION, for user allocated memory, deallocation should be done by users
* externally after the program finished. The PaddleBuf won't do any allocation
* or deallocation.
*
* To have the PaddleBuf allocate and manage the memory:
* - PaddleBuf(size_t length) will allocate a memory of size `length`.
* - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
* if the allocated memory is larger than `length`, nothing will done.
*/
class PaddleBuf { class PaddleBuf {
public: public:
PaddleBuf() = default; // PaddleBuf allocate memory internally, and manage it.
PaddleBuf(PaddleBuf&& other);
// Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&);
PaddleBuf& operator=(const PaddleBuf&);
PaddleBuf& operator=(PaddleBuf&&);
// Do not own the memory.
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Own memory.
explicit PaddleBuf(size_t length) explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {} : data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes. // Set external memory, the PaddleBuf won't manage it.
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&);
// Resize the memory.
void Resize(size_t length); void Resize(size_t length);
// Reset to external memory. // Reset to external memory, with address and length set.
void Reset(void* data, size_t length); void Reset(void* data, size_t length);
// Tell whether the buffer is empty.
bool empty() const { return length_ == 0; } bool empty() const { return length_ == 0; }
// Get the memory address.
void* data() const { return data_; } void* data() const { return data_; }
// Get the memory length.
size_t length() const { return length_; } size_t length() const { return length_; }
~PaddleBuf() { Free(); } ~PaddleBuf() { Free(); }
PaddleBuf& operator=(const PaddleBuf&);
PaddleBuf& operator=(PaddleBuf&&);
PaddleBuf() = default;
PaddleBuf(PaddleBuf&& other);
private: private:
void Free(); void Free();
...@@ -64,6 +91,7 @@ class PaddleBuf { ...@@ -64,6 +91,7 @@ class PaddleBuf {
bool memory_owned_{true}; bool memory_owned_{true};
}; };
// Basic input and output data structure for PaddlePredictor.
struct PaddleTensor { struct PaddleTensor {
PaddleTensor() = default; PaddleTensor() = default;
std::string name; // variable name. std::string name; // variable name.
...@@ -73,19 +101,8 @@ struct PaddleTensor { ...@@ -73,19 +101,8 @@ struct PaddleTensor {
std::vector<std::vector<size_t>> lod; // Tensor+LoD equals LoDTensor std::vector<std::vector<size_t>> lod; // Tensor+LoD equals LoDTensor
}; };
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
kAnalysis
// TODO(Superjomn) support following engines latter.
// kTensorRT, // Use TensorRT for inference.
// kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
};
/* /*
* A simple Inference API for Paddle. Currently this API can be used by * A simple Inference API for Paddle.
* non-sequence scenerios.
*/ */
class PaddlePredictor { class PaddlePredictor {
public: public:
...@@ -120,26 +137,53 @@ struct NativeConfig : public PaddlePredictor::Config { ...@@ -120,26 +137,53 @@ struct NativeConfig : public PaddlePredictor::Config {
// GPU related fields. // GPU related fields.
bool use_gpu{false}; bool use_gpu{false};
int device{0}; int device{0};
float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization. float fraction_of_gpu_memory{-1.f}; // Change to a float in (0,1] if needed.
// NOTE: NOT use it, just for the internal test, will discard later
bool _use_mkldnn{false};
// Specify the variable's name of each input.
bool specify_input_name{false};
// Specify the exact path of program and parameter files.
std::string prog_file; std::string prog_file;
std::string param_file; std::string param_file;
// Specify the variable's name of each input if input tensors don't follow the
// `feeds` and `fetches` of the phase `save_inference_model`.
bool specify_input_name{false};
}; };
// Configurations for Anakin engine. // A factory to help create different predictors.
struct AnakinConfig : public PaddlePredictor::Config { //
enum TargetType { NVGPU = 0, X86 }; // Usage:
int device; //
std::string model_file; // NativeConfig config;
int max_batch_size{-1}; // ... // change the configs.
TargetType target_type; // auto native_predictor = CreatePaddlePredictor(config);
//
// FOR EXTENSION DEVELOPER:
// Different predictors are designated by config type. Similar configs can be
// merged, but there shouldn't be a huge config containing different fields for
// more than one kind of predictors.
template <typename ConfigT>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
// NOTE The following APIs are too trivial, we will discard it in the following
// versions.
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
kAnalysis, // More optimization.
kAnakin // Use Anakin for inference, not mature yet.
}; };
struct TensorRTConfig : public NativeConfig { template <typename ConfigT, PaddleEngineKind engine>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
// ==
//
// -----------------------------------------------------------------------------------
// NOTE: The following APIs are not mature yet, we are still working on them.
namespace contrib {
// Accelerate GPU computation with TensorRT engine.
struct MixedRTConfig : public NativeConfig {
// Determine whether a subgraph will be executed by TRT. // Determine whether a subgraph will be executed by TRT.
int min_subgraph_size{1}; int min_subgraph_size{1};
// While TensorRT allows an engine optimized for a given max batch size // While TensorRT allows an engine optimized for a given max batch size
...@@ -154,7 +198,6 @@ struct TensorRTConfig : public NativeConfig { ...@@ -154,7 +198,6 @@ struct TensorRTConfig : public NativeConfig {
// NOTE WIP, not stable yet. // NOTE WIP, not stable yet.
struct AnalysisConfig : public NativeConfig { struct AnalysisConfig : public NativeConfig {
//
enum class IrPassMode { enum class IrPassMode {
kSystem, // Use system default passes, not customize. kSystem, // Use system default passes, not customize.
kInclude, // Specify the passes in `ir_passes`. kInclude, // Specify the passes in `ir_passes`.
...@@ -165,18 +208,21 @@ struct AnalysisConfig : public NativeConfig { ...@@ -165,18 +208,21 @@ struct AnalysisConfig : public NativeConfig {
IrPassMode ir_mode{IrPassMode::kExclude}; IrPassMode ir_mode{IrPassMode::kExclude};
// attention lstm fuse works only on some specific models, disable as default. // attention lstm fuse works only on some specific models, disable as default.
std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"}; std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
// NOTE this is just for internal development, please not use it.
bool _use_mkldnn{false};
}; };
// A factory to help create different predictors. // Configurations for Anakin engine.
// struct AnakinConfig : public PaddlePredictor::Config {
// FOR EXTENSION DEVELOPER: enum TargetType { NVGPU = 0, X86 };
// Different predictors are designated by config type and engine kind. Similar int device;
// configs can be merged, but there shouldn't be a huge config containing std::string model_file;
// different fields for more than one kind of predictors. int max_batch_size{-1};
// TargetType target_type;
// Similarly, each engine kind should map to a unique predictor implementation. };
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config); } // namespace contrib
int PaddleDtypeSize(PaddleDType dtype); int PaddleDtypeSize(PaddleDType dtype);
......
...@@ -58,6 +58,11 @@ set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classifi ...@@ -58,6 +58,11 @@ set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classifi
download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc) inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc)
# seq_conv1
set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1")
download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc)
# ocr # ocr
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR}) if (NOT EXISTS ${OCR_INSTALL_DIR})
......
...@@ -22,10 +22,10 @@ DEFINE_string(model, "", "Directory of the inference model(mobile_v2)."); ...@@ -22,10 +22,10 @@ DEFINE_string(model, "", "Directory of the inference model(mobile_v2).");
namespace paddle { namespace paddle {
AnakinConfig GetConfig() { contrib::AnakinConfig GetConfig() {
AnakinConfig config; contrib::AnakinConfig config;
// using AnakinConfig::X86 if you need to use cpu to do inference // using AnakinConfig::X86 if you need to use cpu to do inference
config.target_type = AnakinConfig::NVGPU; config.target_type = contrib::AnakinConfig::NVGPU;
config.model_file = FLAGS_model; config.model_file = FLAGS_model;
config.device = 0; config.device = 0;
config.max_batch_size = 1; config.max_batch_size = 1;
...@@ -33,9 +33,10 @@ AnakinConfig GetConfig() { ...@@ -33,9 +33,10 @@ AnakinConfig GetConfig() {
} }
TEST(inference, anakin) { TEST(inference, anakin) {
AnakinConfig config = GetConfig(); auto config = GetConfig();
auto predictor = auto predictor =
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config); CreatePaddlePredictor<contrib::AnakinConfig, PaddleEngineKind::kAnakin>(
config);
float data[1 * 3 * 224 * 224] = {1.0f}; float data[1 * 3 * 224 * 224] = {1.0f};
PaddleTensor tensor; PaddleTensor tensor;
......
...@@ -97,10 +97,10 @@ void Data::get_batch_data( ...@@ -97,10 +97,10 @@ void Data::get_batch_data(
namespace paddle { namespace paddle {
AnakinConfig GetConfig() { contrib::AnakinConfig GetConfig() {
AnakinConfig config; contrib::AnakinConfig config;
// using AnakinConfig::X86 if you need to use cpu to do inference // using AnakinConfig::X86 if you need to use cpu to do inference
config.target_type = AnakinConfig::X86; config.target_type = contrib::AnakinConfig::X86;
config.model_file = FLAGS_model; config.model_file = FLAGS_model;
config.device = 0; config.device = 0;
config.max_batch_size = 1000; // the max number of token config.max_batch_size = 1000; // the max number of token
...@@ -121,9 +121,10 @@ void set_tensor(std::string name, std::vector<int> shape, ...@@ -121,9 +121,10 @@ void set_tensor(std::string name, std::vector<int> shape,
} }
void single_test() { void single_test() {
AnakinConfig config = GetConfig(); auto config = GetConfig();
auto predictor = auto predictor =
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config); CreatePaddlePredictor<contrib::AnakinConfig, PaddleEngineKind::kAnakin>(
config);
int max_batch_size = 1000; int max_batch_size = 1000;
std::string feature_file = FLAGS_datapath; std::string feature_file = FLAGS_datapath;
......
...@@ -95,7 +95,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -95,7 +95,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param"; cfg->param_file = FLAGS_infer_model + "/param";
cfg->use_gpu = false; cfg->use_gpu = false;
...@@ -117,7 +117,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -117,7 +117,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently. // Easy for profiling independently.
TEST(Analyzer_Chinese_ner, profile) { TEST(Analyzer_Chinese_ner, profile) {
AnalysisConfig cfg; contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
...@@ -141,7 +141,7 @@ TEST(Analyzer_Chinese_ner, profile) { ...@@ -141,7 +141,7 @@ TEST(Analyzer_Chinese_ner, profile) {
// Check the fuse status // Check the fuse status
TEST(Analyzer_Chinese_ner, fuse_statis) { TEST(Analyzer_Chinese_ner, fuse_statis) {
AnalysisConfig cfg; contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
...@@ -155,7 +155,7 @@ TEST(Analyzer_Chinese_ner, fuse_statis) { ...@@ -155,7 +155,7 @@ TEST(Analyzer_Chinese_ner, fuse_statis) {
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_Chinese_ner, compare) { TEST(Analyzer_Chinese_ner, compare) {
AnalysisConfig cfg; contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
......
...@@ -149,7 +149,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -149,7 +149,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param"; cfg->param_file = FLAGS_infer_model + "/param";
cfg->use_gpu = false; cfg->use_gpu = false;
...@@ -172,7 +172,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -172,7 +172,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently. // Easy for profiling independently.
TEST(Analyzer_rnn1, profile) { TEST(Analyzer_rnn1, profile) {
AnalysisConfig cfg; contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
...@@ -183,7 +183,7 @@ TEST(Analyzer_rnn1, profile) { ...@@ -183,7 +183,7 @@ TEST(Analyzer_rnn1, profile) {
// Check the fuse status // Check the fuse status
TEST(Analyzer_rnn1, fuse_statis) { TEST(Analyzer_rnn1, fuse_statis) {
AnalysisConfig cfg; contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
...@@ -198,7 +198,7 @@ TEST(Analyzer_rnn1, fuse_statis) { ...@@ -198,7 +198,7 @@ TEST(Analyzer_rnn1, fuse_statis) {
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_rnn1, compare) { TEST(Analyzer_rnn1, compare) {
AnalysisConfig cfg; contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
...@@ -208,7 +208,7 @@ TEST(Analyzer_rnn1, compare) { ...@@ -208,7 +208,7 @@ TEST(Analyzer_rnn1, compare) {
// Test Multi-Thread. // Test Multi-Thread.
TEST(Analyzer_rnn1, multi_thread) { TEST(Analyzer_rnn1, multi_thread) {
AnalysisConfig cfg; contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
struct DataRecord {
std::vector<std::vector<int64_t>> title1_all, title2_all, title3_all, l1_all;
std::vector<std::vector<int64_t>> title1, title2, title3, l1;
std::vector<size_t> title1_lod, title2_lod, title3_lod, l1_lod;
size_t batch_iter{0};
size_t batch_size{1};
size_t num_samples; // total number of samples
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= title1_all.size()) {
data.title1_all.assign(title1_all.begin() + batch_iter,
title1_all.begin() + batch_end);
data.title2_all.assign(title2_all.begin() + batch_iter,
title2_all.begin() + batch_end);
data.title3_all.assign(title3_all.begin() + batch_iter,
title3_all.begin() + batch_end);
data.l1_all.assign(l1_all.begin() + batch_iter,
l1_all.begin() + batch_end);
// Prepare LoDs
data.title1_lod.push_back(0);
data.title2_lod.push_back(0);
data.title3_lod.push_back(0);
data.l1_lod.push_back(0);
CHECK(!data.title1_all.empty());
CHECK(!data.title2_all.empty());
CHECK(!data.title3_all.empty());
CHECK(!data.l1_all.empty());
CHECK_EQ(data.title1_all.size(), data.title2_all.size());
CHECK_EQ(data.title1_all.size(), data.title3_all.size());
CHECK_EQ(data.title1_all.size(), data.l1_all.size());
for (size_t j = 0; j < data.title1_all.size(); j++) {
data.title1.push_back(data.title1_all[j]);
data.title2.push_back(data.title2_all[j]);
data.title3.push_back(data.title3_all[j]);
data.l1.push_back(data.l1_all[j]);
// calculate lod
data.title1_lod.push_back(data.title1_lod.back() +
data.title1_all[j].size());
data.title2_lod.push_back(data.title2_lod.back() +
data.title2_all[j].size());
data.title3_lod.push_back(data.title3_lod.back() +
data.title3_all[j].size());
data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size());
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, '\t', &data);
// load title1 data
std::vector<int64_t> title1_data;
split_to_int64(data[0], ' ', &title1_data);
// load title2 data
std::vector<int64_t> title2_data;
split_to_int64(data[1], ' ', &title2_data);
// load title3 data
std::vector<int64_t> title3_data;
split_to_int64(data[2], ' ', &title3_data);
// load l1 data
std::vector<int64_t> l1_data;
split_to_int64(data[3], ' ', &l1_data);
title1_all.push_back(std::move(title1_data));
title2_all.push_back(std::move(title2_data));
title3_all.push_back(std::move(title3_data));
l1_all.push_back(std::move(l1_data));
}
num_samples = num_lines;
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor title1_tensor, title2_tensor, title3_tensor, l1_tensor;
title1_tensor.name = "title1";
title2_tensor.name = "title2";
title3_tensor.name = "title3";
l1_tensor.name = "l1";
auto one_batch = data->NextBatch();
int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1];
title1_tensor.shape.assign({title1_size, 1});
title1_tensor.lod.assign({one_batch.title1_lod});
int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1];
title2_tensor.shape.assign({title2_size, 1});
title2_tensor.lod.assign({one_batch.title2_lod});
int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1];
title3_tensor.shape.assign({title3_size, 1});
title3_tensor.lod.assign({one_batch.title3_lod});
int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1];
l1_tensor.shape.assign({l1_size, 1});
l1_tensor.lod.assign({one_batch.l1_lod});
// assign data
TensorAssignData<int64_t>(&title1_tensor, one_batch.title1);
TensorAssignData<int64_t>(&title2_tensor, one_batch.title2);
TensorAssignData<int64_t>(&title3_tensor, one_batch.title3);
TensorAssignData<int64_t>(&l1_tensor, one_batch.l1);
// Set inputs.
input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor});
for (auto &tensor : *input_slots) {
tensor.dtype = PaddleDType::INT64;
}
}
void SetConfig(AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model;
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<PaddleTensor> input_slots;
int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
for (int bid = 0; bid < epoch; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
(*inputs).emplace_back(input_slots);
}
}
// Easy for profiling independently.
TEST(Analyzer_seq_conv1, profile) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result
PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
size_t size = GetSize(outputs[0]);
PADDLE_ENFORCE_GT(size, 0);
float *result = static_cast<float *>(outputs[0].data.data());
// output is probability, which is in (0, 1).
for (size_t i = 0; i < size; i++) {
EXPECT_GT(result[i], 0);
EXPECT_LT(result[i], 1);
}
}
}
// Check the fuse status
TEST(Analyzer_seq_conv1, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_seq_conv1, compare) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all);
}
} // namespace inference
} // namespace paddle
...@@ -38,6 +38,8 @@ DEFINE_bool(use_analysis, true, ...@@ -38,6 +38,8 @@ DEFINE_bool(use_analysis, true,
namespace paddle { namespace paddle {
namespace inference { namespace inference {
using contrib::AnalysisConfig;
void CompareResult(const std::vector<PaddleTensor> &outputs, void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &ref_outputs) { const std::vector<PaddleTensor> &ref_outputs) {
EXPECT_GT(outputs.size(), 0UL); EXPECT_GT(outputs.size(), 0UL);
...@@ -45,11 +47,8 @@ void CompareResult(const std::vector<PaddleTensor> &outputs, ...@@ -45,11 +47,8 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
for (size_t i = 0; i < outputs.size(); i++) { for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i]; auto &out = outputs[i];
auto &ref_out = ref_outputs[i]; auto &ref_out = ref_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, size_t size = VecReduceToInt(out.shape);
[](int a, int b) { return a * b; }); size_t ref_size = VecReduceToInt(ref_out.shape);
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_GT(size, 0); EXPECT_GT(size, 0);
EXPECT_EQ(size, ref_size); EXPECT_EQ(size, ref_size);
EXPECT_EQ(out.dtype, ref_out.dtype); EXPECT_EQ(out.dtype, ref_out.dtype);
...@@ -74,25 +73,22 @@ void CompareResult(const std::vector<PaddleTensor> &outputs, ...@@ -74,25 +73,22 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
} }
} }
std::unique_ptr<PaddlePredictor> GetPrediction(AnalysisConfig config, std::unique_ptr<PaddlePredictor> CreateTestPredictor(
bool use_analysis = true) { const AnalysisConfig &config, bool use_analysis = true) {
if (use_analysis) { if (use_analysis) {
return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>( return CreatePaddlePredictor<contrib::AnalysisConfig,
config); PaddleEngineKind::kAnalysis>(config);
} else { } else {
return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>( return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
config); config);
} }
} }
size_t GetSize(const PaddleTensor &out) { size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
return std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
}
std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config, std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config,
int *num_ops) { int *num_ops) {
auto predictor = GetPrediction(config); auto predictor = CreateTestPredictor(config);
AnalysisPredictor *analysis_predictor = AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get()); dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument() auto &fuse_statis = analysis_predictor->analysis_argument()
...@@ -113,11 +109,12 @@ std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config, ...@@ -113,11 +109,12 @@ std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config,
} }
void TestOneThreadPrediction( void TestOneThreadPrediction(
AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs, const AnalysisConfig &config,
const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, bool use_analysis = true) { std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
int batch_size = FLAGS_batch_size; int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat; int num_times = FLAGS_repeat;
auto predictor = GetPrediction(config, use_analysis); auto predictor = CreateTestPredictor(config, use_analysis);
Timer timer; Timer timer;
timer.tic(); timer.tic();
for (int i = 0; i < num_times; i++) { for (int i = 0; i < num_times; i++) {
...@@ -130,7 +127,8 @@ void TestOneThreadPrediction( ...@@ -130,7 +127,8 @@ void TestOneThreadPrediction(
} }
void TestMultiThreadPrediction( void TestMultiThreadPrediction(
AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs, const AnalysisConfig &config,
const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads, std::vector<PaddleTensor> *outputs, int num_threads,
bool use_analysis = true) { bool use_analysis = true) {
int batch_size = FLAGS_batch_size; int batch_size = FLAGS_batch_size;
...@@ -140,7 +138,7 @@ void TestMultiThreadPrediction( ...@@ -140,7 +138,7 @@ void TestMultiThreadPrediction(
// TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
// because AttentionLSTM's hard code nodeid will be damanged. // because AttentionLSTM's hard code nodeid will be damanged.
for (int tid = 0; tid < num_threads; ++tid) { for (int tid = 0; tid < num_threads; ++tid) {
predictors.emplace_back(GetPrediction(config, use_analysis)); predictors.emplace_back(CreateTestPredictor(config, use_analysis));
} }
for (int tid = 0; tid < num_threads; ++tid) { for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() { threads.emplace_back([&, tid]() {
...@@ -164,8 +162,8 @@ void TestMultiThreadPrediction( ...@@ -164,8 +162,8 @@ void TestMultiThreadPrediction(
} }
} }
void TestPrediction(AnalysisConfig config, void TestPrediction(const AnalysisConfig &config,
const std::vector<std::vector<PaddleTensor>> inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads, std::vector<PaddleTensor> *outputs, int num_threads,
bool use_analysis = FLAGS_use_analysis) { bool use_analysis = FLAGS_use_analysis) {
LOG(INFO) << "use_analysis: " << use_analysis; LOG(INFO) << "use_analysis: " << use_analysis;
...@@ -178,8 +176,8 @@ void TestPrediction(AnalysisConfig config, ...@@ -178,8 +176,8 @@ void TestPrediction(AnalysisConfig config,
} }
void CompareNativeAndAnalysis( void CompareNativeAndAnalysis(
AnalysisConfig config, const AnalysisConfig &config,
const std::vector<std::vector<PaddleTensor>> inputs) { const std::vector<std::vector<PaddleTensor>> &inputs) {
std::vector<PaddleTensor> native_outputs, analysis_outputs; std::vector<PaddleTensor> native_outputs, analysis_outputs;
TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &native_outputs, false);
TestOneThreadPrediction(config, inputs, &analysis_outputs, true); TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/mkldnn_activation_op.h" #include "paddle/fluid/operators/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -105,105 +106,105 @@ class ActivationOpGrad : public framework::OperatorWithKernel { ...@@ -105,105 +106,105 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
} }
}; };
__attribute__((unused)) constexpr char SigmoidDoc[] = R"DOC( UNUSED constexpr char SigmoidDoc[] = R"DOC(
Sigmoid Activation Operator Sigmoid Activation Operator
$$out = \frac{1}{1 + e^{-x}}$$ $$out = \frac{1}{1 + e^{-x}}$$
)DOC"; )DOC";
__attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC( UNUSED constexpr char LogSigmoidDoc[] = R"DOC(
Logsigmoid Activation Operator Logsigmoid Activation Operator
$$out = \\log \\frac{1}{1 + e^{-x}}$$ $$out = \\log \\frac{1}{1 + e^{-x}}$$
)DOC"; )DOC";
__attribute__((unused)) constexpr char ExpDoc[] = R"DOC( UNUSED constexpr char ExpDoc[] = R"DOC(
Exp Activation Operator. Exp Activation Operator.
$out = e^x$ $out = e^x$
)DOC"; )DOC";
__attribute__((unused)) constexpr char ReluDoc[] = R"DOC( UNUSED constexpr char ReluDoc[] = R"DOC(
Relu Activation Operator. Relu Activation Operator.
$out = \max(x, 0)$ $out = \max(x, 0)$
)DOC"; )DOC";
__attribute__((unused)) constexpr char TanhDoc[] = R"DOC( UNUSED constexpr char TanhDoc[] = R"DOC(
Tanh Activation Operator. Tanh Activation Operator.
$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
)DOC"; )DOC";
__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC( UNUSED constexpr char TanhShrinkDoc[] = R"DOC(
TanhShrink Activation Operator. TanhShrink Activation Operator.
$$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
)DOC"; )DOC";
__attribute__((unused)) constexpr char SqrtDoc[] = R"DOC( UNUSED constexpr char SqrtDoc[] = R"DOC(
Sqrt Activation Operator. Sqrt Activation Operator.
$out = \sqrt{x}$ $out = \sqrt{x}$
)DOC"; )DOC";
__attribute__((unused)) constexpr char AbsDoc[] = R"DOC( UNUSED constexpr char AbsDoc[] = R"DOC(
Abs Activation Operator. Abs Activation Operator.
$out = |x|$ $out = |x|$
)DOC"; )DOC";
__attribute__((unused)) constexpr char CeilDoc[] = R"DOC( UNUSED constexpr char CeilDoc[] = R"DOC(
Ceil Activation Operator. Ceil Activation Operator.
$out = ceil(x)$ $out = ceil(x)$
)DOC"; )DOC";
__attribute__((unused)) constexpr char FloorDoc[] = R"DOC( UNUSED constexpr char FloorDoc[] = R"DOC(
Floor Activation Operator. Floor Activation Operator.
$out = floor(x)$ $out = floor(x)$
)DOC"; )DOC";
__attribute__((unused)) constexpr char CosDoc[] = R"DOC( UNUSED constexpr char CosDoc[] = R"DOC(
Cosine Activation Operator. Cosine Activation Operator.
$out = cos(x)$ $out = cos(x)$
)DOC"; )DOC";
__attribute__((unused)) constexpr char SinDoc[] = R"DOC( UNUSED constexpr char SinDoc[] = R"DOC(
Sine Activation Operator. Sine Activation Operator.
$out = sin(x)$ $out = sin(x)$
)DOC"; )DOC";
__attribute__((unused)) constexpr char RoundDoc[] = R"DOC( UNUSED constexpr char RoundDoc[] = R"DOC(
Round Activation Operator. Round Activation Operator.
$out = [x]$ $out = [x]$
)DOC"; )DOC";
__attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC( UNUSED constexpr char ReciprocalDoc[] = R"DOC(
Reciprocal Activation Operator. Reciprocal Activation Operator.
$$out = \\frac{1}{x}$$ $$out = \\frac{1}{x}$$
)DOC"; )DOC";
__attribute__((unused)) constexpr char LogDoc[] = R"DOC( UNUSED constexpr char LogDoc[] = R"DOC(
Log Activation Operator. Log Activation Operator.
$out = \ln(x)$ $out = \ln(x)$
...@@ -212,21 +213,21 @@ Natural logarithm of x. ...@@ -212,21 +213,21 @@ Natural logarithm of x.
)DOC"; )DOC";
__attribute__((unused)) constexpr char SquareDoc[] = R"DOC( UNUSED constexpr char SquareDoc[] = R"DOC(
Square Activation Operator. Square Activation Operator.
$out = x^2$ $out = x^2$
)DOC"; )DOC";
__attribute__((unused)) constexpr char SoftplusDoc[] = R"DOC( UNUSED constexpr char SoftplusDoc[] = R"DOC(
Softplus Activation Operator. Softplus Activation Operator.
$out = \ln(1 + e^{x})$ $out = \ln(1 + e^{x})$
)DOC"; )DOC";
__attribute__((unused)) constexpr char SoftsignDoc[] = R"DOC( UNUSED constexpr char SoftsignDoc[] = R"DOC(
Softsign Activation Operator. Softsign Activation Operator.
$$out = \frac{x}{1 + |x|}$$ $$out = \frac{x}{1 + |x|}$$
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <math.h> // for sqrt in CPU and CUDA #include <math.h> // for sqrt in CPU and CUDA
#include <Eigen/Dense> #include <Eigen/Dense>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
...@@ -306,26 +307,43 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -306,26 +307,43 @@ class AdamOpKernel : public framework::OpKernel<T> {
VLOG(3) << "grad row size is 0!!"; VLOG(3) << "grad row size is 0!!";
return; return;
} }
// merge duplicated rows if any.
// The rows of grad_merge have been sorted inside MergeAdd functor std::vector<int64_t> cpu_rows(grad.rows().begin(), grad.rows().end());
scatter::MergeAdd<DeviceContext, T> merge_func; bool is_strict_sorted = true;
auto& grad_merge = *(ctx.scope() for (size_t i = 1; i < cpu_rows.size(); ++i) {
.NewScope() if (cpu_rows[i - 1] >= cpu_rows[i]) {
.Var("sparse_adam_grad_merge") is_strict_sorted = false;
->GetMutable<framework::SelectedRows>()); break;
merge_func(ctx.template device_context<DeviceContext>(), grad, }
&grad_merge); }
const framework::SelectedRows* grad_merge_ptr;
if (is_strict_sorted) {
grad_merge_ptr = &grad;
} else {
// merge duplicated rows if any.
// The rows of grad_merge have been sorted inside MergeAdd functor
scatter::MergeAdd<DeviceContext, T> merge_func;
auto* grad_merge_var = const_cast<framework::Scope&>(ctx.scope())
.Var()
->GetMutable<framework::SelectedRows>();
merge_func(ctx.template device_context<DeviceContext>(), grad,
grad_merge_var);
grad_merge_ptr = grad_merge_var;
}
auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value(); auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>(); const T* grad_data = grad_tensor.template data<T>();
int64_t* rows = nullptr; const int64_t* rows = nullptr;
// When compiled without CUDA, the CUDAMutableData() interface should not be // When compiled without CUDA, the CUDAData() interface should not be
// provided. // provided.
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace()); rows = grad_merge.rows().CUDAData(ctx.GetPlace());
} else { } else {
#endif #endif
rows = grad_merge.mutable_rows()->data(); rows = grad_merge.rows().data();
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
} }
......
...@@ -94,8 +94,20 @@ class ConcatOpGrad : public framework::OperatorWithKernel { ...@@ -94,8 +94,20 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
: OperatorWithKernel(type, inputs, outputs, attrs) {} : OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); auto in_x = "X";
ctx->ShareLoD("X", framework::GradVarName("X")); auto out_x_g_n = framework::GradVarName(in_x);
ctx->SetOutputsDim(out_x_g_n, ctx->GetInputsDim(in_x));
auto &in_names = ctx->Inputs(in_x);
auto &out_names = ctx->Outputs(out_x_g_n);
PADDLE_ENFORCE_EQ(
in_names.size(), out_names.size(),
"The number of arguments in %s[%d] and %s[%d] is not equal.", in_x,
in_names.size(), out_x_g_n, out_names.size());
for (size_t i = 0; i < in_names.size(); ++i) {
if (out_names[i] != framework::kEmptyVarName) {
ctx->ShareLoD(in_x, out_x_g_n, i, i);
}
}
} }
}; };
......
...@@ -23,8 +23,6 @@ namespace operators { ...@@ -23,8 +23,6 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
static constexpr int kROISize = 4;
template <typename T> template <typename T>
bool GT_E(T a, T b) { bool GT_E(T a, T b) {
return (a > b) || fabs(a - b) < 1e-4; return (a > b) || fabs(a - b) < 1e-4;
......
...@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type")); auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
int class_num = ctx.Attr<int>("class_num"); int class_num = ctx.Attr<int>("class_num");
auto& label_lod = in_label->lod(); auto label_lod = in_label->lod();
auto& detect_lod = in_detect->lod(); auto detect_lod = in_detect->lod();
PADDLE_ENFORCE_EQ(label_lod.size(), 1UL, PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
"Only support one level sequence now."); "Only support one level sequence now.");
PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(), PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
...@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto labels = framework::EigenTensor<T, 2>::From(input_label); auto labels = framework::EigenTensor<T, 2>::From(input_label);
auto detect = framework::EigenTensor<T, 2>::From(input_detect); auto detect = framework::EigenTensor<T, 2>::From(input_detect);
auto& label_lod = input_label.lod(); auto label_lod = input_label.lod();
auto& detect_lod = input_detect.lod(); auto detect_lod = input_detect.lod();
int batch_size = label_lod[0].size() - 1; int batch_size = label_lod[0].size() - 1;
auto& label_index = label_lod[0]; auto label_index = label_lod[0];
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
std::map<int, std::vector<Box>> boxes; std::map<int, std::vector<Box>> boxes;
...@@ -274,6 +274,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -274,6 +274,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
output_true_pos->set_lod(true_pos_lod); output_true_pos->set_lod(true_pos_lod);
output_false_pos->set_lod(false_pos_lod); output_false_pos->set_lod(false_pos_lod);
return;
} }
void GetInputPos(const framework::Tensor& input_pos_count, void GetInputPos(const framework::Tensor& input_pos_count,
...@@ -291,7 +292,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -291,7 +292,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto SetData = [](const framework::LoDTensor& pos_tensor, auto SetData = [](const framework::LoDTensor& pos_tensor,
std::map<int, std::vector<std::pair<T, int>>>& pos) { std::map<int, std::vector<std::pair<T, int>>>& pos) {
const T* pos_data = pos_tensor.data<T>(); const T* pos_data = pos_tensor.data<T>();
auto& pos_data_lod = pos_tensor.lod()[0]; auto pos_data_lod = pos_tensor.lod()[0];
for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
T score = pos_data[j * 2]; T score = pos_data[j * 2];
...@@ -316,23 +317,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -316,23 +317,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
std::map<int, std::vector<std::pair<T, int>>>* false_pos) const { std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
int batch_size = gt_boxes.size(); int batch_size = gt_boxes.size();
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
auto& image_gt_boxes = gt_boxes[n]; auto image_gt_boxes = gt_boxes[n];
for (auto& image_gt_box : image_gt_boxes) { for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) {
size_t count = 0; size_t count = 0;
auto& labeled_bboxes = image_gt_box.second; auto labeled_bboxes = it->second;
if (evaluate_difficult) { if (evaluate_difficult) {
count = labeled_bboxes.size(); count = labeled_bboxes.size();
} else { } else {
for (auto& box : labeled_bboxes) { for (size_t i = 0; i < labeled_bboxes.size(); ++i)
if (!box.is_difficult) { if (!(labeled_bboxes[i].is_difficult)) ++count;
++count;
}
}
} }
if (count == 0) { if (count == 0) {
continue; continue;
} }
int label = image_gt_box.first; int label = it->first;
if (label_pos_count->find(label) == label_pos_count->end()) { if (label_pos_count->find(label) == label_pos_count->end()) {
(*label_pos_count)[label] = count; (*label_pos_count)[label] = count;
} else { } else {
......
...@@ -92,9 +92,14 @@ bool VariableResponse::CopyLodTensorData( ...@@ -92,9 +92,14 @@ bool VariableResponse::CopyLodTensorData(
::google::protobuf::io::CodedInputStream* input, ::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx, const framework::DDim& dims, const platform::DeviceContext& ctx, const framework::DDim& dims,
int length) { int length) {
auto server_var = GetVar();
if (!server_var) {
LOG(ERROR) << "recved var should not on current server: "
<< meta_.varname();
return false;
}
auto* tensor = GetVar()->GetMutable<framework::LoDTensor>(); auto* tensor = GetVar()->GetMutable<framework::LoDTensor>();
tensor->Resize(dims); tensor->Resize(dims);
framework::LoD lod; framework::LoD lod;
for (int i = 0; i < meta_.lod_level(); ++i) { for (int i = 0; i < meta_.lod_level(); ++i) {
framework::Vector<size_t> v; framework::Vector<size_t> v;
...@@ -107,7 +112,6 @@ bool VariableResponse::CopyLodTensorData( ...@@ -107,7 +112,6 @@ bool VariableResponse::CopyLodTensorData(
void* tensor_data = void* tensor_data =
tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type())); tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type()));
if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
return false; return false;
} }
......
...@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase { ...@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>(); auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>(); auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto &in_rows = in.rows(); auto in_rows = in.rows();
auto out_dim = framework::make_ddim( auto out_dim = framework::make_ddim(
std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1}); std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place()); auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());
......
...@@ -60,9 +60,11 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> { ...@@ -60,9 +60,11 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto out_place = context.GetPlace(); auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(out_place)); PADDLE_ENFORCE(platform::is_gpu_place(out_place));
memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data, memory::Copy(
boost::get<platform::CUDAPlace>(in1_place), in1_data, boost::get<platform::CUDAPlace>(out_place), out_data,
in1_value.numel() * sizeof(T), context.stream()); boost::get<platform::CUDAPlace>(in1_place), in1_data,
in1_value.numel() * sizeof(T),
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
auto* in2_data = in2_value.data<T>(); auto* in2_data = in2_value.data<T>();
memory::Copy(boost::get<platform::CUDAPlace>(out_place), memory::Copy(boost::get<platform::CUDAPlace>(out_place),
...@@ -107,7 +109,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> { ...@@ -107,7 +109,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
auto& in1_value = input1.value(); auto& in1_value = input1.value();
framework::Vector<int64_t> in1_rows(input1.rows()); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
...@@ -146,7 +148,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { ...@@ -146,7 +148,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(in1_height, input2->height());
auto& in1_rows = input1.rows(); framework::Vector<int64_t> in1_rows(input1.rows());
auto& in2_rows = *(input2->mutable_rows()); auto& in2_rows = *(input2->mutable_rows());
auto& in1_value = input1.value(); auto& in1_value = input1.value();
...@@ -206,7 +208,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> { ...@@ -206,7 +208,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
auto& in1_value = input1.value(); auto& in1_value = input1.value();
framework::Vector<int64_t> in1_rows(input1.rows()); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
......
...@@ -20,7 +20,9 @@ limitations under the License. */ ...@@ -20,7 +20,9 @@ limitations under the License. */
TEST(selected_rows_functor, gpu_add) { TEST(selected_rows_functor, gpu_add) {
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDADeviceContext ctx(gpu_place); paddle::platform::CUDADeviceContext& ctx =
*reinterpret_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext, paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
float> float>
functor; functor;
...@@ -132,7 +134,9 @@ TEST(selected_rows_functor, gpu_add) { ...@@ -132,7 +134,9 @@ TEST(selected_rows_functor, gpu_add) {
TEST(selected_rows_functor, gpu_add_to) { TEST(selected_rows_functor, gpu_add_to) {
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDADeviceContext ctx(gpu_place); paddle::platform::CUDADeviceContext& ctx =
*reinterpret_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext, paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
float> float>
functor; functor;
......
...@@ -46,6 +46,25 @@ static std::string gethash(const memory::dims& input_dims, ...@@ -46,6 +46,25 @@ static std::string gethash(const memory::dims& input_dims,
dims2str(paddings) + pooling_type + suffix; dims2str(paddings) + pooling_type + suffix;
} }
static inline int ComputeCeiledOutput(int input_size, int kernel_size,
int padding, int stride) {
return (input_size - kernel_size + 2 * padding) / stride + 1;
}
static inline void CorrectOutputSize(
const std::vector<int>& src_tz, const std::vector<int>& dst_tz,
const std::vector<int>& kernel_size, const std::vector<int>& paddings,
const std::vector<int>& strides,
std::vector<int>& right_bot_padding) { // NOLINT
for (size_t i = 0; i < right_bot_padding.size(); i++) {
int desired_size = ComputeCeiledOutput(src_tz[i + 2], kernel_size[i],
paddings[i], strides[i]);
if (desired_size != dst_tz[i + 2]) {
right_bot_padding[i] += strides[i];
}
}
}
template <typename T> template <typename T>
class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public: public:
...@@ -103,6 +122,13 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -103,6 +122,13 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto pool_p = auto pool_p =
std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p)); std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p));
if (pool_p == nullptr) { if (pool_p == nullptr) {
const std::vector<int>& padding_left_top(paddings);
std::vector<int> padding_right_bottom(paddings);
bool ceil_mode = ctx.Attr<bool>("ceil_mode");
if (ceil_mode) {
CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
padding_right_bottom);
}
auto src_md = platform::MKLDNNMemDesc( auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), input_format); src_tz, platform::MKLDNNGetDataType<T>(), input_format);
...@@ -114,8 +140,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -114,8 +140,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mkldnn::memory::format::any); mkldnn::memory::format::any);
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd = std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top,
pooling_type, mkldnn_engine); padding_right_bottom, ksize, pooling_type,
mkldnn_engine, ceil_mode);
// save pool_pd into global device context to be referred in backward path // save pool_pd into global device context to be referred in backward path
dev_ctx.SetBlob(key_pool_pd, pool_pd); dev_ctx.SetBlob(key_pool_pd, pool_pd);
...@@ -171,14 +198,16 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -171,14 +198,16 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
private: private:
std::unique_ptr<mkldnn::pooling_forward::primitive_desc> CreatePrimitiveDesc( std::unique_ptr<mkldnn::pooling_forward::primitive_desc> CreatePrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst, const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst,
const std::vector<int>& stride, const std::vector<int>& padding, const std::vector<int>& stride, const std::vector<int>& padding_left_top,
const std::vector<int>& kernel, const std::string& pooling_type, const std::vector<int>& padding_right_bot, const std::vector<int>& kernel,
const mkldnn::engine& engine) const { const std::string& pooling_type, const mkldnn::engine& engine,
bool ceil_mode) const {
auto pool_desc = mkldnn::pooling_forward::desc( auto pool_desc = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward, mkldnn::prop_kind::forward,
pooling_type == "max" ? mkldnn::algorithm::pooling_max pooling_type == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg, : mkldnn::algorithm::pooling_avg,
src, dst, stride, kernel, padding, padding, mkldnn::padding_kind::zero); src, dst, stride, kernel, padding_left_top, padding_right_bot,
mkldnn::padding_kind::zero);
auto p_pool_pd = auto p_pool_pd =
new mkldnn::pooling_forward::primitive_desc(pool_desc, engine); new mkldnn::pooling_forward::primitive_desc(pool_desc, engine);
......
...@@ -45,10 +45,12 @@ class ReadInferVarType : public framework::VarTypeInference { ...@@ -45,10 +45,12 @@ class ReadInferVarType : public framework::VarTypeInference {
framework::VarDesc* reader = block->FindVarRecursive(reader_name); framework::VarDesc* reader = block->FindVarRecursive(reader_name);
auto dtypes = reader->GetDataTypes(); auto dtypes = reader->GetDataTypes();
PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
auto lod_levels = reader->GetLoDLevels();
for (size_t i = 0; i < dtypes.size(); ++i) { for (size_t i = 0; i < dtypes.size(); ++i) {
framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
out.SetType(framework::proto::VarType::LOD_TENSOR); out.SetType(framework::proto::VarType::LOD_TENSOR);
out.SetDataType(dtypes[i]); out.SetDataType(dtypes[i]);
out.SetLoDLevel(lod_levels[i]);
} }
} }
}; };
......
...@@ -46,9 +46,15 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -46,9 +46,15 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
**Scale operator** **Scale operator**
Multiply the input tensor with a float scalar to scale the input tensor. Apply scaling and bias addition to the input tensor.
$$Out = scale*X$$ if bias_after_scale=True:
$$Out = scale*X + bias$$
else:
$$Out = scale*(X + bias)$$
)DOC"); )DOC");
AddAttr<float>("scale", "The scaling factor of the scale operator.") AddAttr<float>("scale", "The scaling factor of the scale operator.")
.SetDefault(1.0); .SetDefault(1.0);
......
...@@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> { ...@@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
} }
for (size_t i = 0; i < n; ++i) { for (size_t i = 0; i < n; ++i) {
PADDLE_ENFORCE_LT(0, offset_data[i], PADDLE_ENFORCE_LE(0, offset_data[i],
"The offset[%d] must greater than zero.", i); "The offset[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(0, length_data[i], PADDLE_ENFORCE_LT(0, length_data[i],
"The length[%d] must greater than zero.", i); "The length[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i],
lod[0][i + 1], "The target tensor's length overflow."); lod[0][i + 1], "The target tensor's length overflow.");
} }
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #include <algorithm>
#include "paddle/fluid/operators/sgd_op.h" #include "paddle/fluid/operators/sgd_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
...@@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate, ...@@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
} }
} }
template <typename T, int block_size> template <typename T>
__global__ void SparseSGDFunctorKernel(const T* selected_rows, __global__ void SparseSGDFunctorKernel(const T* selected_rows,
const int64_t* rows, const int64_t* rows,
const T* learning_rate, T* tensor_out, const T* learning_rate, T* tensor_out,
int64_t row_numel) { int64_t row_numel, int64_t limit) {
const int ty = blockIdx.y; for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) {
int tid = threadIdx.x; const T* selected_rows_ptr = selected_rows + i * row_numel;
T* tensor_out_ptr = tensor_out + rows[i] * row_numel;
selected_rows += ty * row_numel; for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
tensor_out += rows[ty] * row_numel; // Since index in rows of SelectedRows can be duplicate, we have to use
// Atomic Operation to avoid concurrent write error.
for (int index = tid; index < row_numel; index += block_size) { paddle::platform::CudaAtomicAdd(
// Since index in rows of SelectedRows can be duplicate, we have to use tensor_out_ptr + index,
// Atomic Operation to avoid concurrent write error. -1.0 * learning_rate[0] * selected_rows_ptr[index]);
paddle::platform::CudaAtomicAdd( }
tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]);
} }
} }
} // namespace } // namespace
...@@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> { ...@@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
auto* in_data = in_value.data<T>(); auto* in_data = in_value.data<T>();
auto* out_data = param_out->data<T>(); auto* out_data = param_out->data<T>();
const int block_size = 256; const int kThreadsPerBlock = 256;
dim3 threads(block_size, 1); int thread_x = kThreadsPerBlock;
dim3 grid(1, in_rows.size()); int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount();
SparseSGDFunctorKernel< int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
SparseSGDFunctorKernel<<<max_blocks, thread_x, 0,
ctx.cuda_device_context().stream()>>>(
in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(), in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
out_data, in_row_numel); out_data, in_row_numel, in_rows.size());
} else { } else {
PADDLE_THROW("Unsupported Variable Type of Grad"); PADDLE_THROW("Unsupported Variable Type of Grad");
......
...@@ -52,16 +52,26 @@ class ShrinkRNNMemoryOp : public ArrayOp { ...@@ -52,16 +52,26 @@ class ShrinkRNNMemoryOp : public ArrayOp {
size_t height = dst_num_rows; size_t height = dst_num_rows;
// do shrink for the top level LoD // do shrink for the top level LoD
if (x_tensor.lod().size() > 0 && if (x_tensor.lod().size() > 0 &&
x_tensor.lod()[0].size() > static_cast<size_t>(dst_num_rows)) { x_tensor.lod()[0].size() > static_cast<size_t>(dst_num_rows)) {
auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0, if (x_tensor.lod().size() > 1) { // MultiLevel LoD
dst_num_rows, 0); auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(
height = lod_offset.second.second; x_tensor.lod(), 0, dst_num_rows, 0);
auto out_lod = out_tensor.mutable_lod(); height = lod_offset.second.second;
framework::AppendLoD(out_lod, lod_offset.first); auto out_lod = out_tensor.mutable_lod();
framework::AppendLoD(out_lod, lod_offset.first);
} else {
// Shrink LoD
auto lod_item = x_tensor.lod()[0];
lod_item.resize(dst_num_rows + 1);
out_tensor.set_lod({lod_item});
const auto &const_lod_item = lod_item;
height = const_lod_item.back();
}
} }
if (dst_num_rows != 0) { if (height != 0) {
out_tensor.mutable_data(place, x_tensor.type()); out_tensor.mutable_data(place, x_tensor.type());
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx, framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx,
...@@ -134,8 +144,11 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { ...@@ -134,8 +144,11 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
} else { } else {
auto &dout_tensor = dout_var->Get<framework::LoDTensor>(); auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
auto height = dout_tensor.dims()[0]; auto height = dout_tensor.dims()[0];
auto slice = dx_tensor.Slice(0, static_cast<int>(height)); if (height != 0) {
framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice); auto slice = dx_tensor.Slice(0, static_cast<int>(height));
framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx,
&slice);
}
if (dx_tensor.dims()[0] > height) { if (dx_tensor.dims()[0] > height) {
auto rest_tensor = dx_tensor.Slice( auto rest_tensor = dx_tensor.Slice(
static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0])); static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));
......
...@@ -123,6 +123,7 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -123,6 +123,7 @@ class SumKernel : public framework::OpKernel<T> {
out_value->Resize(framework::make_ddim(in_dim)); out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(context.GetPlace()); out_value->mutable_data<T>(context.GetPlace());
// if all the input sparse vars are empty, no need to // if all the input sparse vars are empty, no need to
// merge these vars. // merge these vars.
if (first_dim == 0UL) { if (first_dim == 0UL) {
......
...@@ -36,7 +36,7 @@ namespace operators { ...@@ -36,7 +36,7 @@ namespace operators {
using FluidDT = framework::proto::VarType_Type; using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType; using TRT_DT = nvinfer1::DataType;
namespace { namespace { // NOLINT
TRT_DT FluidDataType2TRT(FluidDT type) { TRT_DT FluidDataType2TRT(FluidDT type) {
switch (type) { switch (type) {
......
...@@ -30,6 +30,8 @@ class TopkOp : public framework::OperatorWithKernel { ...@@ -30,6 +30,8 @@ class TopkOp : public framework::OperatorWithKernel {
"Output(Indices) of TopkOp should not be null."); "Output(Indices) of TopkOp should not be null.");
auto input_dims = ctx->GetInputDim("X"); auto input_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(input_dims.size(), 2,
"Rank of TopK op's input must be 2.");
const int k = static_cast<int>(ctx->Attrs().Get<int>("k")); const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); PADDLE_ENFORCE_GE(k, 1, "k must >= 1");
......
...@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
......
...@@ -201,6 +201,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) ...@@ -201,6 +201,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
compute_capability = GetCUDAComputeCapability(place_.device); compute_capability = GetCUDAComputeCapability(place_.device);
multi_process = GetCUDAMultiProcessors(place_.device); multi_process = GetCUDAMultiProcessors(place_.device);
max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
grid_max_dims_ = GpuMaxGridDim(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_)); PADDLE_ENFORCE(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place); eigen_stream_->Reinitialize(&stream_, place);
...@@ -239,6 +240,10 @@ int CUDADeviceContext::GetMaxPhysicalThreadCount() const { ...@@ -239,6 +240,10 @@ int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process * max_threads_per_mp; return multi_process * max_threads_per_mp;
} }
std::tuple<int, int, int> CUDADeviceContext::GetMaxGridDims() const {
return grid_max_dims_;
}
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return eigen_device_.get(); return eigen_device_.get();
} }
......
...@@ -13,6 +13,7 @@ limitations under the License. */ ...@@ -13,6 +13,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <string> #include <string>
#include <tuple>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
...@@ -91,6 +92,8 @@ class CUDADeviceContext : public DeviceContext { ...@@ -91,6 +92,8 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return the max physical thread count in the device context */ /*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const; int GetMaxPhysicalThreadCount() const;
std::tuple<int, int, int> GetMaxGridDims() const;
/*! \brief Return eigen device in the device context. */ /*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const; Eigen::GpuDevice* eigen_device() const;
...@@ -135,6 +138,8 @@ class CUDADeviceContext : public DeviceContext { ...@@ -135,6 +138,8 @@ class CUDADeviceContext : public DeviceContext {
cudaStream_t stream_; cudaStream_t stream_;
cublasHandle_t cublas_handle_; cublasHandle_t cublas_handle_;
std::tuple<int, int, int> grid_max_dims_;
int compute_capability; int compute_capability;
int multi_process; int multi_process;
int max_threads_per_mp; int max_threads_per_mp;
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#if defined(_WIN32) #if defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max #define NOMINMAX // msvc max/min macro conflict with std::min/max
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#define GOOGLE_GLOG_DLL_DECL
#endif #endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -47,7 +48,7 @@ limitations under the License. */ ...@@ -47,7 +48,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/curand.h" #include "paddle/fluid/platform/dynload/curand.h"
#if !defined(__APPLE__) and !defined(_WIN32) #if !defined(__APPLE__) && !defined(_WIN32)
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#endif // __APPLE__ #endif // __APPLE__
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
...@@ -216,7 +217,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( ...@@ -216,7 +217,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
#endif #endif
} }
#if !defined(__APPLE__) and !defined(_WIN32) #if !defined(__APPLE__) && !defined(_WIN32)
template <typename... Args> template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
ncclResult_t stat, const Args&... args) { ncclResult_t stat, const Args&... args) {
...@@ -260,14 +261,8 @@ inline void throw_on_error(T e) { ...@@ -260,14 +261,8 @@ inline void throw_on_error(T e) {
} \ } \
} while (false) } while (false)
#define PADDLE_THROW_EOF() \
do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
__LINE__); \
} while (false)
#else #else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__) #define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
#endif // REPLACE_ENFORCE_GLOG #endif // REPLACE_ENFORCE_GLOG
#else // !_WIN32 #else // !_WIN32
...@@ -281,6 +276,12 @@ inline void throw_on_error(T e) { ...@@ -281,6 +276,12 @@ inline void throw_on_error(T e) {
#define PADDLE_ENFORCE(x, ...) x #define PADDLE_ENFORCE(x, ...) x
#endif // !_WIN32 #endif // !_WIN32
#define PADDLE_THROW_EOF() \
do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
__LINE__); \
} while (false)
/* /*
* Some enforce helpers here, usage: * Some enforce helpers here, usage:
* int a = 1; * int a = 1;
...@@ -294,7 +295,7 @@ inline void throw_on_error(T e) { ...@@ -294,7 +295,7 @@ inline void throw_on_error(T e) {
* extra messages is also supported, for example: * extra messages is also supported, for example:
* PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
*/ */
#if !defined(_WIN32)
#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ #define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ #define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
...@@ -307,6 +308,7 @@ inline void throw_on_error(T e) { ...@@ -307,6 +308,7 @@ inline void throw_on_error(T e) {
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \
do { \ do { \
if (UNLIKELY(nullptr == (__VAL))) { \ if (UNLIKELY(nullptr == (__VAL))) { \
...@@ -326,6 +328,27 @@ inline void throw_on_error(T e) { ...@@ -326,6 +328,27 @@ inline void throw_on_error(T e) {
paddle::string::Sprintf("" __VA_ARGS__)); \ paddle::string::Sprintf("" __VA_ARGS__)); \
} \ } \
} while (0) } while (0)
#else
#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1))
#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1))
#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1))
#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1))
#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1))
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1))
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
do { \
if (!((__VAL0)__CMP(__VAL1))) { \
PADDLE_THROW("Windows disable the enforce. Enforce failed."); \
} \
} while (0)
#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...) \
do { \
if (nullptr == (__VAL1)) { \
PADDLE_THROW("Windows disable the enforce. Enforce failed"); \
} \
} while (0)
#endif // !_WIN32
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -48,35 +48,54 @@ __global__ static void ForRangeElemwiseOpGridIsOne(Function func) { ...@@ -48,35 +48,54 @@ __global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
} }
template <typename Function> template <typename Function>
__global__ static void ForRangeElemwiseOp(Function func, int limit) { __global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x); size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
if (idx < limit) { if (idx < limit) {
func(idx); func(idx);
} }
} }
template <typename Function>
__global__ static void ForRangeElemwiseOpGridLarge(Function func, size_t limit,
int grid_dim) {
size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
while (idx < limit) {
func(idx);
idx += grid_dim;
}
}
template <> template <>
struct ForRange<CUDADeviceContext> { struct ForRange<CUDADeviceContext> {
ForRange(const CUDADeviceContext& dev_ctx, size_t limit) ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(static_cast<int>(limit)) {} : dev_ctx_(dev_ctx), limit_(limit) {}
template <typename Function> template <typename Function>
inline void operator()(Function func) const { inline void operator()(Function func) const {
constexpr int num_threads = 1024; constexpr int num_threads = 1024;
int block_size = limit_ <= num_threads ? limit_ : num_threads; int block_size = limit_ <= num_threads ? limit_ : num_threads;
int grid_size = (limit_ + num_threads - 1) / num_threads; size_t grid_size = (limit_ + num_threads - 1) / num_threads;
if (grid_size == 1) { int max_grid_dim = std::get<0>(dev_ctx_.GetMaxGridDims());
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
func); if (grid_size < max_grid_dim) {
int grid_size_int = static_cast<int>(grid_size);
if (grid_size == 1) {
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
func);
} else {
ForRangeElemwiseOp<<<grid_size_int, block_size, 0, dev_ctx_.stream()>>>(
func, limit_);
}
} else { } else {
ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>( ForRangeElemwiseOpGridLarge<<<max_grid_dim, block_size, 0,
func, limit_); dev_ctx_.stream()>>>(func, limit_,
max_grid_dim);
} }
} }
const CUDADeviceContext& dev_ctx_; const CUDADeviceContext& dev_ctx_;
int limit_; size_t limit_;
}; };
#endif #endif
......
...@@ -152,5 +152,22 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { ...@@ -152,5 +152,22 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream), PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream),
"cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync"); "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync");
} }
std::tuple<int, int, int> GpuMaxGridDim(int id) {
std::tuple<int, int, int> result;
PADDLE_ENFORCE(
cudaDeviceGetAttribute(&std::get<0>(result), cudaDevAttrMaxBlockDimX, id),
"cudaDeviceGetAttribute failed in "
"cudaDevAttrMaxBlockDim");
PADDLE_ENFORCE(
cudaDeviceGetAttribute(&std::get<1>(result), cudaDevAttrMaxBlockDimY, id),
"cudaDeviceGetAttribute failed in "
"cudaDevAttrMaxBlockDim");
PADDLE_ENFORCE(
cudaDeviceGetAttribute(&std::get<2>(result), cudaDevAttrMaxBlockDimZ, id),
"cudaDeviceGetAttribute failed in "
"cudaDevAttrMaxBlockDim");
return result;
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <stddef.h> #include <stddef.h>
#include <string> #include <string>
#include <tuple>
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -72,6 +73,8 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, ...@@ -72,6 +73,8 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
//! Set memory dst with value count size asynchronously //! Set memory dst with value count size asynchronously
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream); void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
std::tuple<int, int, int> GpuMaxGridDim(int id);
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
......
...@@ -16,6 +16,9 @@ limitations under the License. */ ...@@ -16,6 +16,9 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "glog/logging.h" #include "glog/logging.h"
......
...@@ -36,7 +36,9 @@ void BindConstValue(pybind11::module* m) { ...@@ -36,7 +36,9 @@ void BindConstValue(pybind11::module* m) {
.value("Backward", framework::OpRole::kBackward) .value("Backward", framework::OpRole::kBackward)
.value("Optimize", framework::OpRole::kOptimize) .value("Optimize", framework::OpRole::kOptimize)
.value("Loss", framework::OpRole::kLoss) .value("Loss", framework::OpRole::kLoss)
.value("RPC", framework::OpRole::kRPC); .value("RPC", framework::OpRole::kRPC)
.value("Dist", framework::OpRole::kDist)
.value("LRSched", framework::OpRole::kLRSched);
op_proto_and_checker_maker.def( op_proto_and_checker_maker.def(
"kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName); "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName);
...@@ -46,6 +48,9 @@ void BindConstValue(pybind11::module* m) { ...@@ -46,6 +48,9 @@ void BindConstValue(pybind11::module* m) {
op_proto_and_checker_maker.def( op_proto_and_checker_maker.def(
"kOpNameScopeAttrName", "kOpNameScopeAttrName",
framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); framework::OpProtoAndCheckerMaker::OpNamescopeAttrName);
op_proto_and_checker_maker.def(
"kOpCreationCallstackAttrName",
framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
} }
} // namespace pybind } // namespace pybind
......
function(train_test TARGET_NAME)
set(options "")
set(oneValueArgs "")
set(multiValueArgs ARGS)
cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "")
if(train_test_ARGS)
foreach(arg ${train_test_ARGS})
list(APPEND arg_list "_${arg}")
endforeach()
else()
list(APPEND arg_list "_")
endif()
foreach(arg ${arg_list})
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(test_train_${TARGET_NAME}${arg}
SRCS test_train_${TARGET_NAME}.cc
DEPS paddle_fluid_origin
ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/)
set_tests_properties(test_train_${TARGET_NAME}${arg}
PROPERTIES DEPENDS test_${TARGET_NAME})
endforeach()
endfunction(train_test)
if(WITH_TESTING)
train_test(recognize_digits ARGS mlp conv)
endif()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <time.h>
#include <fstream>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
DEFINE_string(dirname, "", "Directory of the train model.");
namespace paddle {
void Train() {
CHECK(!FLAGS_dirname.empty());
framework::InitDevices(false);
const auto cpu_place = platform::CPUPlace();
framework::Executor executor(cpu_place);
framework::Scope scope;
auto train_program = inference::Load(
&executor, &scope, FLAGS_dirname + "__model_combined__.main_program",
FLAGS_dirname + "__params_combined__");
std::string loss_name = "";
for (auto op_desc : train_program->Block(0).AllOps()) {
if (op_desc->Type() == "mean") {
loss_name = op_desc->Output("Out")[0];
break;
}
}
PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
// prepare data
auto x_var = scope.Var("img");
auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
x_tensor->Resize({64, 1, 28, 28});
auto x_data = x_tensor->mutable_data<float>(cpu_place);
for (int i = 0; i < 64 * 28 * 28; ++i) {
x_data[i] = 1.0;
}
auto y_var = scope.Var("label");
auto y_tensor = y_var->GetMutable<framework::LoDTensor>();
y_tensor->Resize({64, 1});
auto y_data = y_tensor->mutable_data<int64_t>(cpu_place);
for (int i = 0; i < 64 * 1; ++i) {
y_data[i] = static_cast<int64_t>(1);
}
auto loss_var = scope.Var(loss_name);
float first_loss = 0.0;
float last_loss = 0.0;
for (int i = 0; i < 100; ++i) {
executor.Run(*train_program.get(), &scope, 0, false, true);
if (i == 0) {
first_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
} else if (i == 99) {
last_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
}
}
EXPECT_LT(last_loss, first_loss);
}
TEST(train, recognize_digits) { Train(); }
} // namespace paddle
...@@ -157,6 +157,7 @@ function cmake_gen() { ...@@ -157,6 +157,7 @@ function cmake_gen() {
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
-DPY_VERSION=${PY_VERSION:-2.7} -DPY_VERSION=${PY_VERSION:-2.7}
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
======================================== ========================================
EOF EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because # Disable UNITTEST_USE_VIRTUALENV in docker because
...@@ -188,7 +189,8 @@ EOF ...@@ -188,7 +189,8 @@ EOF
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
-DPY_VERSION=${PY_VERSION:-2.7} -DPY_VERSION=${PY_VERSION:-2.7} \
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
} }
...@@ -371,7 +373,7 @@ EOF ...@@ -371,7 +373,7 @@ EOF
ctest --output-on-failure ctest --output-on-failure
# make install should also be test when unittest # make install should also be test when unittest
make install -j `nproc` make install -j `nproc`
pip install /usr/local/opt/paddle/share/wheels/*.whl pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
paddle version paddle version
fi fi
......
...@@ -89,7 +89,8 @@ def reader_creator(tar_file, file_name, dict_size): ...@@ -89,7 +89,8 @@ def reader_creator(tar_file, file_name, dict_size):
] ]
for name in names: for name in names:
for line in f.extractfile(name): for line in f.extractfile(name):
line_split = line.strip().split(six.b('\t')) line = cpt.to_text(line)
line_split = line.strip().split('\t')
if len(line_split) != 2: if len(line_split) != 2:
continue continue
src_seq = line_split[0] # one source sequence src_seq = line_split[0] # one source sequence
......
...@@ -64,7 +64,8 @@ def __build_dict(tar_file, dict_size, save_path, lang): ...@@ -64,7 +64,8 @@ def __build_dict(tar_file, dict_size, save_path, lang):
word_dict = defaultdict(int) word_dict = defaultdict(int)
with tarfile.open(tar_file, mode="r") as f: with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile("wmt16/train"): for line in f.extractfile("wmt16/train"):
line_split = line.strip().split(six.b("\t")) line = cpt.to_text(line)
line_split = line.strip().split("\t")
if len(line_split) != 2: continue if len(line_split) != 2: continue
sen = line_split[0] if lang == "en" else line_split[1] sen = line_split[0] if lang == "en" else line_split[1]
for w in sen.split(): for w in sen.split():
...@@ -123,7 +124,8 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang): ...@@ -123,7 +124,8 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
with tarfile.open(tar_file, mode="r") as f: with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile(file_name): for line in f.extractfile(file_name):
line_split = line.strip().split(six.b("\t")) line = cpt.to_text(line)
line_split = line.strip().split("\t")
if len(line_split) != 2: if len(line_split) != 2:
continue continue
src_words = line_split[src_col].split() src_words = line_split[src_col].split()
......
...@@ -19,17 +19,8 @@ from .framework import * ...@@ -19,17 +19,8 @@ from .framework import *
# import all class inside executor into fluid module # import all class inside executor into fluid module
from . import executor from . import executor
from .executor import * from .executor import *
from . import trainer from . import trainer
from .trainer import Trainer
from .trainer import BeginEpochEvent
from .trainer import EndEpochEvent
from .trainer import BeginStepEvent
from .trainer import EndStepEvent
from .trainer import CheckpointConfig
from . import inferencer from . import inferencer
from .inferencer import Inferencer
from . import io from . import io
from . import evaluator from . import evaluator
...@@ -46,7 +37,7 @@ from . import transpiler ...@@ -46,7 +37,7 @@ from . import transpiler
from .param_attr import ParamAttr, WeightNormParamAttr from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
from .transpiler import DistributeTranspiler, InferenceTranspiler, \ from .transpiler import DistributeTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig memory_optimize, release_memory, DistributeTranspilerConfig
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip from . import clip
......
...@@ -280,7 +280,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -280,7 +280,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
group_scale_name = self.group_name + "_scale" group_scale_name = self.group_name + "_scale"
if group_scale_name not in self.context: if group_scale_name not in self.context:
group_norm_var = layers.sums(input=self.context[self.group_name]) group_norm_var = layers.sums(input=self.context[self.group_name])
layers.sqrt(x=group_norm_var, out=group_norm_var) group_norm_var = layers.sqrt(x=group_norm_var)
clip_var = self.context[self.group_name + "_clip"] clip_var = self.context[self.group_name + "_clip"]
group_scale_var = layers.elementwise_div( group_scale_var = layers.elementwise_div(
x=clip_var, x=clip_var,
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import contextlib
from .. import core
from .. import executor
from .. import framework
from .. import io
from .. import parallel_executor
from .. import unique_name
from .trainer import check_and_get_place
__all__ = ['Inferencer', ]
class Inferencer(object):
"""
Inferencer High Level API.
Args:
infer_func (Python func): Infer function that will return predict Variable
param_path (str): The path where the inference model is saved by fluid.io.save_params
place (Place): place to do the inference
parallel (bool): use parallel_executor to run the inference, it will use multi CPU/GPU.
Examples:
.. code-block:: python
def inference_program():
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
return y_predict
place = fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path="/tmp/model", place=place)
"""
def __init__(self, infer_func, param_path, place=None, parallel=False):
self.param_path = param_path
self.scope = core.Scope()
self.parallel = parallel
self.place = check_and_get_place(place)
self.inference_program = framework.Program()
with framework.program_guard(self.inference_program):
with unique_name.guard():
self.predict_var = infer_func()
with self._prog_and_scope_guard():
# load params from param_path into scope
io.load_params(executor.Executor(self.place), param_path)
if parallel:
with self._prog_and_scope_guard():
self.exe = parallel_executor.ParallelExecutor(
use_cuda=isinstance(self.place, core.CUDAPlace),
loss_name=self.predict_var.name)
else:
self.exe = executor.Executor(self.place)
self.inference_program = self.inference_program.clone(for_test=True)
def infer(self, inputs, return_numpy=True):
"""
Do Inference for Inputs
Args:
inputs (map): a map of {"input_name": input_var} that will be feed into the inference program
return_numpy (bool): transform return value into numpy or not
Returns:
Tensor or Numpy: the predict value of the inference model for the inputs
Examples:
.. code-block:: python
tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
results = inferencer.infer({'x': tensor_x})
"""
if not isinstance(inputs, dict):
raise ValueError(
"inputs should be a map of {'input_name': input_var}")
with self._prog_and_scope_guard():
results = self.exe.run(feed=inputs,
fetch_list=[self.predict_var.name],
return_numpy=return_numpy)
return results
@contextlib.contextmanager
def _prog_and_scope_guard(self):
with framework.program_guard(main_program=self.inference_program):
with executor.scope_guard(self.scope):
yield
此差异已折叠。
...@@ -18,6 +18,7 @@ import collections ...@@ -18,6 +18,7 @@ import collections
import contextlib import contextlib
import re import re
import six import six
import traceback
import numpy as np import numpy as np
...@@ -34,11 +35,12 @@ except ImportError as e: ...@@ -34,11 +35,12 @@ except ImportError as e:
except Exception as e: except Exception as e:
raise e raise e
from . import unique_name from . import unique_name
import os
PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None
__all__ = [ __all__ = [
'Program', 'Program',
'Operator', 'Operator',
'Parameter',
'default_startup_program', 'default_startup_program',
'default_main_program', 'default_main_program',
'program_guard', 'program_guard',
...@@ -489,7 +491,9 @@ class OpProtoHolder(object): ...@@ -489,7 +491,9 @@ class OpProtoHolder(object):
def generated_op_attr_names(): def generated_op_attr_names():
return { return {
core.op_proto_and_checker_maker.kOpRoleAttrName(), core.op_proto_and_checker_maker.kOpRoleAttrName(),
core.op_proto_and_checker_maker.kOpRoleVarAttrName() core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
core.op_proto_and_checker_maker.kOpNameScopeAttrName(),
core.op_proto_and_checker_maker.kOpCreationCallstackAttrName()
} }
...@@ -572,6 +576,11 @@ class Operator(object): ...@@ -572,6 +576,11 @@ class Operator(object):
if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
del op_attrs[role_var_name] del op_attrs[role_var_name]
if not PADDLE_ON_MODEL_CE:
callstack_var_name = op_maker.kOpCreationCallstackAttrName()
op_attrs[callstack_var_name] = list(
reversed(traceback.format_stack()))[1:]
if len(self.desc.type()) != 0: if len(self.desc.type()) != 0:
return return
if type is None: if type is None:
...@@ -1509,6 +1518,30 @@ class Program(object): ...@@ -1509,6 +1518,30 @@ class Program(object):
self._op_role_var = [] self._op_role_var = []
self._current_role = OpRole.Forward self._current_role = OpRole.Forward
@contextlib.contextmanager
def _lr_schedule_guard(self):
"""
A with guard to set :code:`LRSched` :code:`OpRole` and
:code:`OpRoleVar` automatically. The :code:`OpRoleVar` is
set to the target learning rate.
Notes: This is a very low level API. Users should not use it directly.
Examples:
>>> p, g = backward(...)
>>> with program.lr_schedule_guard():
>>> lr = lr * decay
"""
OpRole = core.op_proto_and_checker_maker.OpRole
self._current_role = OpRole.LRSched
# TODO(typhoonzero): how to set target learning rate var
self._op_role_var = []
yield
self._op_role_var = []
self._current_role = OpRole.Forward
def __str__(self): def __str__(self):
""" """
Get the protobuf debug string of this Program. Get the protobuf debug string of this Program.
......
...@@ -12,101 +12,5 @@ ...@@ -12,101 +12,5 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function # NOTE: inferencer is moved into fluid.contrib.inferencer.
__all__ = []
import contextlib
from . import core
from . import executor
from . import framework
from . import io
from . import parallel_executor
from . import unique_name
from .trainer import check_and_get_place
__all__ = ['Inferencer', ]
class Inferencer(object):
"""
Inferencer High Level API.
Args:
infer_func (Python func): Infer function that will return predict Variable
param_path (str): The path where the inference model is saved by fluid.io.save_params
place (Place): place to do the inference
parallel (bool): use parallel_executor to run the inference, it will use multi CPU/GPU.
Examples:
.. code-block:: python
def inference_program():
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
return y_predict
place = fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path="/tmp/model", place=place)
"""
def __init__(self, infer_func, param_path, place=None, parallel=False):
self.param_path = param_path
self.scope = core.Scope()
self.parallel = parallel
self.place = check_and_get_place(place)
self.inference_program = framework.Program()
with framework.program_guard(self.inference_program):
with unique_name.guard():
self.predict_var = infer_func()
with self._prog_and_scope_guard():
# load params from param_path into scope
io.load_params(executor.Executor(self.place), param_path)
if parallel:
with self._prog_and_scope_guard():
self.exe = parallel_executor.ParallelExecutor(
use_cuda=isinstance(self.place, core.CUDAPlace),
loss_name=self.predict_var.name)
else:
self.exe = executor.Executor(self.place)
self.inference_program = self.inference_program.clone(for_test=True)
def infer(self, inputs, return_numpy=True):
"""
Do Inference for Inputs
Args:
inputs (map): a map of {"input_name": input_var} that will be feed into the inference program
return_numpy (bool): transform return value into numpy or not
Returns:
Tensor or Numpy: the predict value of the inference model for the inputs
Examples:
.. code-block:: python
tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
results = inferencer.infer({'x': tensor_x})
"""
if not isinstance(inputs, dict):
raise ValueError(
"inputs should be a map of {'input_name': input_var}")
with self._prog_and_scope_guard():
results = self.exe.run(feed=inputs,
fetch_list=[self.predict_var.name],
return_numpy=return_numpy)
return results
@contextlib.contextmanager
def _prog_and_scope_guard(self):
with framework.program_guard(main_program=self.inference_program):
with executor.scope_guard(self.scope):
yield
...@@ -74,7 +74,7 @@ class Initializer(object): ...@@ -74,7 +74,7 @@ class Initializer(object):
directly, but need to use one of its implementations. directly, but need to use one of its implementations.
""" """
def __init_(self): def __init__(self):
pass pass
def __call__(self, param, block): def __call__(self, param, block):
...@@ -293,7 +293,7 @@ class TruncatedNormalInitializer(Initializer): ...@@ -293,7 +293,7 @@ class TruncatedNormalInitializer(Initializer):
assert loc is not None assert loc is not None
assert scale is not None assert scale is not None
assert seed is not None assert seed is not None
super(NormalInitializer, self).__init__() super(TruncatedNormalInitializer, self).__init__()
self._mean = loc self._mean = loc
self._std_dev = scale self._std_dev = scale
self._seed = seed self._seed = seed
......
...@@ -27,8 +27,7 @@ from . import core ...@@ -27,8 +27,7 @@ from . import core
__all__ = [ __all__ = [
'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
'load_persistables', 'save_inference_model', 'load_inference_model', 'load_persistables', 'save_inference_model', 'load_inference_model'
'get_inference_program'
] ]
...@@ -504,23 +503,6 @@ def load_persistables(executor, dirname, main_program=None, filename=None): ...@@ -504,23 +503,6 @@ def load_persistables(executor, dirname, main_program=None, filename=None):
filename=filename) filename=filename)
def get_inference_program(target_vars, main_program=None):
if main_program is None:
main_program = default_main_program()
if not isinstance(target_vars, list):
target_vars = [target_vars]
vars = []
for var in target_vars:
if isinstance(var, Evaluator):
vars.extend(var.states)
vars.extend(var.metrics)
else:
vars.append(var)
pruned_program = main_program._prune(targets=vars)
inference_program = pruned_program._inference_optimize()
return inference_program
def prepend_feed_ops(inference_program, def prepend_feed_ops(inference_program,
feed_target_names, feed_target_names,
feed_holder_name='feed'): feed_holder_name='feed'):
...@@ -618,7 +600,7 @@ def save_inference_model(dirname, ...@@ -618,7 +600,7 @@ def save_inference_model(dirname,
""" """
if isinstance(feeded_var_names, six.string_types): if isinstance(feeded_var_names, six.string_types):
feeded_var_names = [feeded_var_names] feeded_var_names = [feeded_var_names]
else: elif export_for_deployment:
if len(feeded_var_names) > 0: if len(feeded_var_names) > 0:
# TODO(paddle-dev): polish these code blocks # TODO(paddle-dev): polish these code blocks
if not (bool(feeded_var_names) and all( if not (bool(feeded_var_names) and all(
...@@ -628,61 +610,60 @@ def save_inference_model(dirname, ...@@ -628,61 +610,60 @@ def save_inference_model(dirname,
if isinstance(target_vars, Variable): if isinstance(target_vars, Variable):
target_vars = [target_vars] target_vars = [target_vars]
else: elif export_for_deployment:
if not (bool(target_vars) and all( if not (bool(target_vars) and all(
isinstance(var, Variable) for var in target_vars)): isinstance(var, Variable) for var in target_vars)):
raise ValueError("'target_vars' should be a list of Variable.") raise ValueError("'target_vars' should be a list of Variable.")
if main_program is None: if main_program is None:
main_program = default_main_program() main_program = default_main_program()
copy_program = main_program.clone()
# if there is lookup table, the trainer 0 will notify all pserver to save.
if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
lookup_table_filename = os.path.join(dirname, "__lookup_table__")
_save_lookup_tables_by_notify(executor, lookup_table_filename,
main_program._distributed_lookup_table,
main_program._endpoints)
if not os.path.isdir(dirname): if not os.path.isdir(dirname):
os.makedirs(dirname) os.makedirs(dirname)
if model_filename is not None:
model_basename = os.path.basename(model_filename)
else:
model_basename = "__model__"
model_basename = os.path.join(dirname, model_basename)
# When export_for_deployment is true, we modify the program online so that # When export_for_deployment is true, we modify the program online so that
# it can only be loaded for inference directly. If it's false, the whole # it can only be loaded for inference directly. If it's false, the whole
# original program and related meta are saved so that future usage can be # original program and related meta are saved so that future usage can be
# more flexible. # more flexible.
if export_for_deployment: if export_for_deployment:
global_block = copy_program.global_block() main_program = main_program.clone()
global_block = main_program.global_block()
for i, op in enumerate(global_block.ops): for i, op in enumerate(global_block.ops):
op.desc.set_is_target(False) op.desc.set_is_target(False)
if op.type == "feed" or op.type == "fetch": if op.type == "feed" or op.type == "fetch":
global_block._remove_op(i) global_block._remove_op(i)
copy_program.desc.flush() main_program.desc.flush()
pruned_program = copy_program._prune(targets=target_vars) main_program = main_program._prune(targets=target_vars)
saved_program = pruned_program._inference_optimize(prune_read_op=True) main_program = main_program._inference_optimize(prune_read_op=True)
fetch_var_names = [v.name for v in target_vars] fetch_var_names = [v.name for v in target_vars]
prepend_feed_ops(saved_program, feeded_var_names) prepend_feed_ops(main_program, feeded_var_names)
append_fetch_ops(saved_program, fetch_var_names) append_fetch_ops(main_program, fetch_var_names)
with open(model_basename, "wb") as f:
f.write(main_program.desc.serialize_to_string())
else: else:
# TODO(panyx0718): Save more information so that it can also be used # TODO(panyx0718): Save more information so that it can also be used
# for training and more flexible post-processing. # for training and more flexible post-processing.
saved_program = copy_program with open(model_basename + ".main_program", "wb") as f:
f.write(main_program.desc.serialize_to_string())
if model_filename is not None:
model_filename = os.path.basename(model_filename)
else:
model_filename = "__model__"
model_filename = os.path.join(dirname, model_filename)
if params_filename is not None: if params_filename is not None:
params_filename = os.path.basename(params_filename) params_filename = os.path.basename(params_filename)
save_persistables(executor, dirname, main_program, params_filename)
with open(model_filename, "wb") as f:
f.write(saved_program.desc.serialize_to_string())
save_persistables(executor, dirname, saved_program, params_filename)
# if there is lookup table, the trainer 0 will notify all pserver to save.
if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
lookup_table_filename = os.path.join(dirname, "__lookup_table__")
_save_lookup_tables_by_notify(executor, lookup_table_filename,
main_program._distributed_lookup_table,
main_program._endpoints)
def load_inference_model(dirname, def load_inference_model(dirname,
......
...@@ -311,6 +311,7 @@ def _copy_reader_var_(block, var): ...@@ -311,6 +311,7 @@ def _copy_reader_var_(block, var):
new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER) new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER)
new_var.desc.set_shapes(var.desc.shapes()) new_var.desc.set_shapes(var.desc.shapes())
new_var.desc.set_dtypes(var.desc.dtypes()) new_var.desc.set_dtypes(var.desc.dtypes())
new_var.desc.set_lod_levels(var.desc.lod_levels())
new_var.persistable = True new_var.persistable = True
return new_var return new_var
...@@ -632,6 +633,7 @@ def py_reader(capacity, ...@@ -632,6 +633,7 @@ def py_reader(capacity,
}) })
startup_var.desc.set_dtypes(dtypes) startup_var.desc.set_dtypes(dtypes)
startup_var.desc.set_lod_levels(lod_levels)
startup_var.persistable = True startup_var.persistable = True
main_prog_var = _copy_reader_var_(default_main_program().current_block(), main_prog_var = _copy_reader_var_(default_main_program().current_block(),
......
...@@ -23,7 +23,10 @@ from ..proto import framework_pb2 ...@@ -23,7 +23,10 @@ from ..proto import framework_pb2
from ..framework import OpProtoHolder, Variable from ..framework import OpProtoHolder, Variable
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
__all__ = ['deprecated', 'generate_layer_fn', 'autodoc', 'templatedoc'] __all__ = [
'deprecated', 'generate_layer_fn', 'generate_layer_fn_noattr', 'autodoc',
'templatedoc'
]
def _convert_(name): def _convert_(name):
...@@ -58,7 +61,7 @@ def escape_math(text): ...@@ -58,7 +61,7 @@ def escape_math(text):
_two_dollar_pattern_.sub(r"!!\1!!", text))) _two_dollar_pattern_.sub(r"!!\1!!", text)))
def _generate_doc_string_(op_proto): def _generate_doc_string_(op_proto, additional_args_lines=None):
""" """
Generate docstring by OpProto Generate docstring by OpProto
...@@ -98,6 +101,13 @@ def _generate_doc_string_(op_proto): ...@@ -98,6 +101,13 @@ def _generate_doc_string_(op_proto):
buf.write(escape_math(each_attr.comment)) buf.write(escape_math(each_attr.comment))
buf.write('\n') buf.write('\n')
if additional_args_lines is not None:
for line in additional_args_lines:
line = line.strip()
buf.write(' ')
buf.write(line)
buf.write('\n')
if len(op_proto.outputs) != 0: if len(op_proto.outputs) != 0:
buf.write('\nReturns:\n') buf.write('\nReturns:\n')
buf.write(' ') buf.write(' ')
...@@ -205,6 +215,29 @@ def generate_layer_fn(op_type): ...@@ -205,6 +215,29 @@ def generate_layer_fn(op_type):
return func return func
def generate_layer_fn_noattr(op_type):
"""Register the Python layer for an Operator without Attribute.
Args:
op_type: The name of the operator to be created.
This function takes in the operator type (sigmoid, exp , tanh etc) and
creates the operator functionality.
"""
op_proto = OpProtoHolder.instance().get_op_proto(op_type)
def func(x, name=None):
helper = LayerHelper(op_type, **locals())
output = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(type=op_type, inputs={"X": x}, outputs={"Out": output})
return output
func.__name__ = op_type
func.__doc__ = _generate_doc_string_(op_proto)
return func
def deprecated(func_or_class): def deprecated(func_or_class):
""" """
Deprecated warning decorator. It will result a warning message. Deprecated warning decorator. It will result a warning message.
......
...@@ -27,7 +27,7 @@ from . import nn ...@@ -27,7 +27,7 @@ from . import nn
from . import ops from . import ops
from . import tensor from . import tensor
from ..initializer import init_on_cpu from ..initializer import init_on_cpu
from ..framework import default_main_program, Parameter from ..framework import default_main_program, Parameter, unique_name
__all__ = [ __all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
...@@ -63,11 +63,12 @@ def noam_decay(d_model, warmup_steps): ...@@ -63,11 +63,12 @@ def noam_decay(d_model, warmup_steps):
Returns: Returns:
The decayed learning rate. The decayed learning rate.
""" """
global_step = _decay_step_counter(1) with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter(1)
a = global_step**-0.5 a = global_step**-0.5
b = (warmup_steps**-1.5) * global_step b = (warmup_steps**-1.5) * global_step
lr_value = (d_model**-0.5) * ops.elementwise_min(a, b) lr_value = (d_model**-0.5) * nn.elementwise_min(a, b)
return lr_value return lr_value
...@@ -108,14 +109,15 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -108,14 +109,15 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
""" """
global_step = _decay_step_counter() with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate * (decay_rate**div_res) decayed_lr = learning_rate * (decay_rate**div_res)
return decayed_lr return decayed_lr
def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
...@@ -136,14 +138,15 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -136,14 +138,15 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
Returns: Returns:
The decayed learning rate The decayed learning rate
""" """
global_step = _decay_step_counter() with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
return decayed_lr return decayed_lr
def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
...@@ -181,15 +184,16 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -181,15 +184,16 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
staircase=True)) staircase=True))
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
""" """
global_step = _decay_step_counter() with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate / (1 + decay_rate * div_res) decayed_lr = learning_rate / (1 + decay_rate * div_res)
return decayed_lr return decayed_lr
def polynomial_decay(learning_rate, def polynomial_decay(learning_rate,
...@@ -220,25 +224,28 @@ def polynomial_decay(learning_rate, ...@@ -220,25 +224,28 @@ def polynomial_decay(learning_rate,
Returns: Returns:
Variable: The decayed learning rate Variable: The decayed learning rate
""" """
global_step = _decay_step_counter() with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter()
if cycle:
div_res = ops.ceil(global_step / decay_steps) if cycle:
zero_var = tensor.fill_constant(shape=[1], dtype='float32', value=0.0) div_res = ops.ceil(global_step / decay_steps)
one_var = tensor.fill_constant(shape=[1], dtype='float32', value=1.0) zero_var = tensor.fill_constant(
shape=[1], dtype='float32', value=0.0)
with control_flow.Switch() as switch: one_var = tensor.fill_constant(
with switch.case(global_step == zero_var): shape=[1], dtype='float32', value=1.0)
tensor.assign(input=one_var, output=div_res)
decay_steps = decay_steps * div_res with control_flow.Switch() as switch:
else: with switch.case(global_step == zero_var):
decay_steps_var = tensor.fill_constant( tensor.assign(input=one_var, output=div_res)
shape=[1], dtype='float32', value=float(decay_steps)) decay_steps = decay_steps * div_res
global_step = ops.elementwise_min(x=global_step, y=decay_steps_var) else:
decay_steps_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(decay_steps))
global_step = nn.elementwise_min(x=global_step, y=decay_steps_var)
decayed_lr = (learning_rate - end_learning_rate) * \ decayed_lr = (learning_rate - end_learning_rate) * \
((1 - global_step / decay_steps) ** power) + end_learning_rate ((1 - global_step / decay_steps) ** power) + end_learning_rate
return decayed_lr return decayed_lr
def piecewise_decay(boundaries, values): def piecewise_decay(boundaries, values):
...@@ -266,34 +273,36 @@ def piecewise_decay(boundaries, values): ...@@ -266,34 +273,36 @@ def piecewise_decay(boundaries, values):
""" """
with default_main_program()._lr_schedule_guard():
if len(values) - len(boundaries) != 1:
raise ValueError("len(values) - len(boundaries) should be 1")
if len(values) - len(boundaries) != 1: global_step = _decay_step_counter()
raise ValueError("len(values) - len(boundaries) should be 1")
global_step = _decay_step_counter()
lr = tensor.create_global_var( lr = tensor.create_global_var(
shape=[1], shape=[1],
value=0.0, value=0.0,
dtype='float32', dtype='float32',
persistable=True, persistable=True,
name="learning_rate") name="learning_rate")
with control_flow.Switch() as switch: with control_flow.Switch() as switch:
for i in range(len(boundaries)): for i in range(len(boundaries)):
boundary_val = tensor.fill_constant( boundary_val = tensor.fill_constant(
shape=[1],
dtype='float32',
value=float(boundaries[i]),
force_cpu=True)
value_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val):
tensor.assign(value_var, lr)
last_value_var = tensor.fill_constant(
shape=[1], shape=[1],
dtype='float32', dtype='float32',
value=float(boundaries[i]), value=float(values[len(values) - 1]))
force_cpu=True) with switch.default():
value_var = tensor.fill_constant( tensor.assign(last_value_var, lr)
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val):
tensor.assign(value_var, lr)
last_value_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(values[len(values) - 1]))
with switch.default():
tensor.assign(last_value_var, lr)
return lr return lr
......
此差异已折叠。
...@@ -13,9 +13,9 @@ ...@@ -13,9 +13,9 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
from .layer_function_generator import generate_layer_fn from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr
__activations__ = [ __activations_noattr__ = [
'sigmoid', 'sigmoid',
'logsigmoid', 'logsigmoid',
'exp', 'exp',
...@@ -33,29 +33,12 @@ __activations__ = [ ...@@ -33,29 +33,12 @@ __activations__ = [
'square', 'square',
'softplus', 'softplus',
'softsign', 'softsign',
'brelu',
'leaky_relu',
'soft_relu',
'elu',
'relu6',
'pow',
'stanh',
'hard_sigmoid',
'swish',
] ]
__all__ = [ __all__ = [
'mean', 'mean',
'mul', 'mul',
'scale',
'sigmoid_cross_entropy_with_logits', 'sigmoid_cross_entropy_with_logits',
'elementwise_add',
'elementwise_div',
'elementwise_sub',
'elementwise_mul',
'elementwise_max',
'elementwise_min',
'elementwise_pow',
'clip', 'clip',
'clip_by_norm', 'clip_by_norm',
'logical_and', 'logical_and',
...@@ -70,11 +53,21 @@ __all__ = [ ...@@ -70,11 +53,21 @@ __all__ = [
'slice', 'slice',
'shape', 'shape',
'maxout', 'maxout',
] + __activations__ ]
for _OP in set(__all__): for _OP in set(__all__):
globals()[_OP] = generate_layer_fn(_OP) globals()[_OP] = generate_layer_fn(_OP)
# It is a hot fix in some unittest using:
# fluid.layers.scale(x=x, scale=10.0, out=out_var)
# e.g.: test_program_code.py, test_dist_train.py
globals()['_scale'] = generate_layer_fn('scale')
__all__ += __activations_noattr__
for _OP in set(__activations_noattr__):
globals()[_OP] = generate_layer_fn_noattr(_OP)
__all__ += ["uniform_random"] __all__ += ["uniform_random"]
_uniform_random_ = generate_layer_fn('uniform_random') _uniform_random_ = generate_layer_fn('uniform_random')
......
...@@ -21,6 +21,7 @@ __all__ = [ ...@@ -21,6 +21,7 @@ __all__ = [
"sequence_conv_pool", "sequence_conv_pool",
"glu", "glu",
"scaled_dot_product_attention", "scaled_dot_product_attention",
"img_conv_group",
] ]
......
...@@ -74,28 +74,7 @@ class ParallelExecutor(object): ...@@ -74,28 +74,7 @@ class ParallelExecutor(object):
build_strategy=None, build_strategy=None,
num_trainers=1, num_trainers=1,
trainer_id=0, trainer_id=0,
scope=None, scope=None):
**kwargs):
if len(kwargs) != 0:
err_msg = ""
for key in kwargs:
if key in dir(ExecutionStrategy):
err_msg += \
"Setting {0} by constructor is deprecated. Use " \
"strategy=ExecutionStrategy(); strategy.{0}=xxx; " \
"pe=ParallelExecutor(exec_strategy=strategy) " \
"instead.\n ".format(key)
elif key in dir(BuildStrategy):
err_msg += \
"Setting {0} by constructor is deprecated. Use " \
"strategy=BuildStrategy(); See help(" \
"paddle.fluid.ParallelExecutor.BuildStrategy) \n".format(
key)
else:
err_msg += "Setting {0} by constructor is deprecated. Use strategy.\n".format(
key)
raise ValueError(err_msg)
self._places = [] self._places = []
self._act_places = [] self._act_places = []
if use_cuda: if use_cuda:
......
...@@ -185,7 +185,17 @@ class WeightNormParamAttr(ParamAttr): ...@@ -185,7 +185,17 @@ class WeightNormParamAttr(ParamAttr):
Args: Args:
dim(list): The parameter's name. Default None. dim(list): The parameter's name. Default None.
kwargs: Any field in ParamAttr. Default None. name(str): The parameter's name. Default None.
initializer(Initializer): The method to initial this parameter. Default None.
learning_rate(float): The parameter's learning rate. The learning rate when
optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`.
Default 1.0.
regularizer(WeightDecayRegularizer): Regularization factor. Default None.
trainable(bool): Whether this parameter is trainable. Default True.
gradient_clip(BaseGradientClipAttr): The method to clip this parameter's
gradient. Default None.
do_model_average(bool): Whether this parameter should do model average.
Default False.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -204,6 +214,21 @@ class WeightNormParamAttr(ParamAttr): ...@@ -204,6 +214,21 @@ class WeightNormParamAttr(ParamAttr):
# these paramters for inference. # these paramters for inference.
params_with_weight_norm = [] params_with_weight_norm = []
def __init__(self, dim=None, **kwargs): def __init__(self,
super(WeightNormParamAttr, self).__init__(**kwargs) dim=None,
name=None,
initializer=None,
learning_rate=1.0,
regularizer=None,
trainable=True,
gradient_clip=None,
do_model_average=False):
super(WeightNormParamAttr, self).__init__(
name=name,
initializer=initializer,
learning_rate=learning_rate,
regularizer=regularizer,
trainable=trainable,
gradient_clip=gradient_clip,
do_model_average=do_model_average)
self.dim = dim self.dim = dim
...@@ -16,6 +16,16 @@ from __future__ import print_function ...@@ -16,6 +16,16 @@ from __future__ import print_function
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import sys
try:
from paddle.fluid.contrib.trainer import *
from paddle.fluid.contrib.inferencer import *
except ImportError:
print(
"In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
file=sys.stderr)
from paddle.fluid.trainer import *
from paddle.fluid.inferencer import *
import contextlib import contextlib
import numpy import numpy
import unittest import unittest
...@@ -57,11 +67,11 @@ def optimizer_func(): ...@@ -57,11 +67,11 @@ def optimizer_func():
def train(use_cuda, train_program, params_dirname, inference_model_dirname): def train(use_cuda, train_program, params_dirname, inference_model_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer( trainer = Trainer(
train_func=train_program, place=place, optimizer_func=optimizer_func) train_func=train_program, place=place, optimizer_func=optimizer_func)
def event_handler(event): def event_handler(event):
if isinstance(event, fluid.EndStepEvent): if isinstance(event, EndStepEvent):
if event.step == 10: if event.step == 10:
test_metrics = trainer.test( test_metrics = trainer.test(
reader=test_reader, feed_order=['x', 'y']) reader=test_reader, feed_order=['x', 'y'])
...@@ -91,7 +101,7 @@ def infer(use_cuda, inference_program, params_dirname=None): ...@@ -91,7 +101,7 @@ def infer(use_cuda, inference_program, params_dirname=None):
return return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer( inferencer = Inferencer(
infer_func=inference_program, param_path=params_dirname, place=place) infer_func=inference_program, param_path=params_dirname, place=place)
batch_size = 10 batch_size = 10
......
...@@ -14,11 +14,22 @@ ...@@ -14,11 +14,22 @@
from __future__ import print_function from __future__ import print_function
import sys
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
try:
from paddle.fluid.contrib.trainer import *
from paddle.fluid.contrib.inferencer import *
except ImportError:
print(
"In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
file=sys.stderr)
from paddle.fluid.trainer import *
from paddle.fluid.inferencer import *
import paddle.fluid.core as core import paddle.fluid.core as core
import numpy import numpy
import six
import os import os
import cifar10_small_test_set import cifar10_small_test_set
...@@ -106,7 +117,7 @@ def train(use_cuda, train_program, parallel, params_dirname): ...@@ -106,7 +117,7 @@ def train(use_cuda, train_program, parallel, params_dirname):
paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False) paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
def event_handler(event): def event_handler(event):
if isinstance(event, fluid.EndStepEvent): if isinstance(event, EndStepEvent):
avg_cost, accuracy = trainer.test( avg_cost, accuracy = trainer.test(
reader=test_reader, feed_order=['pixel', 'label']) reader=test_reader, feed_order=['pixel', 'label'])
...@@ -118,7 +129,7 @@ def train(use_cuda, train_program, parallel, params_dirname): ...@@ -118,7 +129,7 @@ def train(use_cuda, train_program, parallel, params_dirname):
return return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer( trainer = Trainer(
train_func=train_program, train_func=train_program,
optimizer_func=optimizer_func, optimizer_func=optimizer_func,
place=place, place=place,
...@@ -133,7 +144,7 @@ def train(use_cuda, train_program, parallel, params_dirname): ...@@ -133,7 +144,7 @@ def train(use_cuda, train_program, parallel, params_dirname):
def infer(use_cuda, inference_program, parallel, params_dirname=None): def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer( inferencer = Inferencer(
infer_func=inference_program, infer_func=inference_program,
param_path=params_dirname, param_path=params_dirname,
place=place, place=place,
......
...@@ -14,11 +14,22 @@ ...@@ -14,11 +14,22 @@
from __future__ import print_function from __future__ import print_function
import sys
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
try:
from paddle.fluid.contrib.trainer import *
from paddle.fluid.contrib.inferencer import *
except ImportError:
print(
"In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
file=sys.stderr)
from paddle.fluid.trainer import *
from paddle.fluid.inferencer import *
import paddle.fluid.core as core import paddle.fluid.core as core
import numpy import numpy
import six
import os import os
import cifar10_small_test_set import cifar10_small_test_set
...@@ -83,7 +94,7 @@ def train(use_cuda, train_program, parallel, params_dirname): ...@@ -83,7 +94,7 @@ def train(use_cuda, train_program, parallel, params_dirname):
paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False) paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
def event_handler(event): def event_handler(event):
if isinstance(event, fluid.EndStepEvent): if isinstance(event, EndStepEvent):
avg_cost, accuracy = trainer.test( avg_cost, accuracy = trainer.test(
reader=test_reader, feed_order=['pixel', 'label']) reader=test_reader, feed_order=['pixel', 'label'])
...@@ -95,7 +106,7 @@ def train(use_cuda, train_program, parallel, params_dirname): ...@@ -95,7 +106,7 @@ def train(use_cuda, train_program, parallel, params_dirname):
return return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer( trainer = Trainer(
train_func=train_program, train_func=train_program,
place=place, place=place,
optimizer_func=optimizer_func, optimizer_func=optimizer_func,
...@@ -110,7 +121,7 @@ def train(use_cuda, train_program, parallel, params_dirname): ...@@ -110,7 +121,7 @@ def train(use_cuda, train_program, parallel, params_dirname):
def infer(use_cuda, inference_program, parallel, params_dirname=None): def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer( inferencer = Inferencer(
infer_func=inference_program, infer_func=inference_program,
param_path=params_dirname, param_path=params_dirname,
place=place, place=place,
......
...@@ -16,6 +16,16 @@ from __future__ import print_function ...@@ -16,6 +16,16 @@ from __future__ import print_function
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import sys
try:
from paddle.fluid.contrib.trainer import *
from paddle.fluid.contrib.inferencer import *
except ImportError:
print(
"In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
file=sys.stderr)
from paddle.fluid.trainer import *
from paddle.fluid.inferencer import *
import numpy as np import numpy as np
WORD_DICT, VERB_DICT, LABEL_DICT = paddle.dataset.conll05.get_dict() WORD_DICT, VERB_DICT, LABEL_DICT = paddle.dataset.conll05.get_dict()
...@@ -149,7 +159,7 @@ def optimize_func(): ...@@ -149,7 +159,7 @@ def optimize_func():
def train(use_cuda, train_program, params_dirname): def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer( trainer = Trainer(
train_func=train_program, place=place, optimizer_func=optimize_func) train_func=train_program, place=place, optimizer_func=optimize_func)
feed_order = [ feed_order = [
...@@ -164,7 +174,7 @@ def train(use_cuda, train_program, params_dirname): ...@@ -164,7 +174,7 @@ def train(use_cuda, train_program, params_dirname):
# place) # place)
def event_handler(event): def event_handler(event):
if isinstance(event, fluid.EndEpochEvent): if isinstance(event, EndEpochEvent):
test_reader = paddle.batch( test_reader = paddle.batch(
paddle.dataset.conll05.test(), batch_size=BATCH_SIZE) paddle.dataset.conll05.test(), batch_size=BATCH_SIZE)
avg_cost_set = trainer.test( avg_cost_set = trainer.test(
...@@ -184,7 +194,7 @@ def train(use_cuda, train_program, params_dirname): ...@@ -184,7 +194,7 @@ def train(use_cuda, train_program, params_dirname):
if math.isnan(float(avg_cost)): if math.isnan(float(avg_cost)):
sys.exit("got NaN loss, training failed.") sys.exit("got NaN loss, training failed.")
elif isinstance(event, fluid.EndStepEvent): elif isinstance(event, EndStepEvent):
print("Step {0}, Epoch {1} Metrics {2}".format( print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, list(map(np.array, event.metrics)))) event.step, event.epoch, list(map(np.array, event.metrics))))
if event.step == 1: # Run 2 iterations to speed CI if event.step == 1: # Run 2 iterations to speed CI
...@@ -204,7 +214,7 @@ def train(use_cuda, train_program, params_dirname): ...@@ -204,7 +214,7 @@ def train(use_cuda, train_program, params_dirname):
def infer(use_cuda, inference_program, params_dirname): def infer(use_cuda, inference_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer( inferencer = Inferencer(
inference_program, param_path=params_dirname, place=place) inference_program, param_path=params_dirname, place=place)
# Setup input by creating LoDTensor to represent sequence of words. # Setup input by creating LoDTensor to represent sequence of words.
......
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册