提交 31e67b90 编写于 作者: S sneaxiy

test=develop

add_custom_target(paddle_apis ALL
DEPENDS paddle_v2_apis paddle_fluid_apis)
DEPENDS paddle_v2_apis)
add_custom_target(paddle_docs ALL
DEPENDS paddle_v2_docs paddle_v2_docs_cn
paddle_fluid_docs paddle_fluid_docs_cn
paddle_mobile_docs paddle_mobile_docs_cn)
add_subdirectory(v2)
add_subdirectory(fluid)
add_subdirectory(mobile)
......@@ -153,6 +153,13 @@ paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn'
paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None))
paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None))
paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None))
paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0))
paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False))
paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32'))
paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
......@@ -224,13 +231,6 @@ paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwarg
paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sampling_id ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
......@@ -269,7 +269,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
......@@ -298,6 +298,7 @@ paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs
paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
......
......@@ -150,11 +150,10 @@ else()
endif()
if (NOT WIN32)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass)
graph build_strategy
fast_threaded_ssa_graph_executor)
endif() # NOT WIN32
cc_library(prune SRCS prune.cc DEPS framework_proto)
......
......@@ -54,3 +54,8 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
# device_context reduce_op_handle )
cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
namespace framework {
namespace details {
class ParallelExecutorPassBuilder : public ir::PassBuilder {
public:
explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
: ir::PassBuilder(), strategy_(strategy) {
// Add a graph viz pass to record a graph.
if (!strategy_.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
}
// Add op fusion.
if (strategy.fuse_elewise_add_act_ops_) {
auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
// Add a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path",
new std::string(graph_path));
}
}
// Convert graph to run on multi-devices.
auto multi_devices_pass = AppendPass("multi_devices_pass");
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
// Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) {
auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
multi_devices_print_pass->SetNotOwned<const std::string>(
"debug_graphviz_path", &strategy_.debug_graphviz_path_);
multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
"graph_printer", new details::GraphvizSSAGraphPrinter);
}
// Verify that the graph is correct for multi-device executor.
AppendPass("multi_devices_check_pass");
}
private:
BuildStrategy strategy_;
};
std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy()
const {
pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
return pass_builder_;
}
std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes,
#ifdef PADDLE_WITH_CUDA
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
#else
const bool use_cuda) const {
#endif
// Create a default one if not initialized by user.
if (!pass_builder_) {
CreatePassesFromStrategy();
}
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
if (pass->Type() == "multi_devices_pass") {
pass->Erase("places");
pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
pass->Erase("loss_var_name");
pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
pass->Erase("params");
pass->SetNotOwned<const std::unordered_set<std::string>>("params",
&param_names);
pass->Erase("local_scopes");
pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
}
graph = pass->Apply(std::move(graph));
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
......@@ -15,6 +15,17 @@
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace framework {
......@@ -57,6 +68,30 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false};
bool enable_data_balance_{false};
// User normally doesn't need to call this API.
// The PassBuilder allows for more customized insert, remove of passes
// from python side.
// A new PassBuilder is created based on configs defined above and
// passes are owned by the PassBuilder.
std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const;
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(
const ProgramDesc &main_program,
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes,
#ifdef PADDLE_WITH_CUDA
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const;
#else
const bool use_cuda) const;
#endif
private:
mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
};
} // namespace details
......
......@@ -20,79 +20,37 @@ namespace paddle {
namespace framework {
namespace details {
// Change it to thread safe flags if needed.
class ThreadUnsafeOwnershipFlags {
template <class T>
class COWPtr {
public:
explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags& operator=(
const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
void SetOwnership(bool flag) { flag_ = flag; }
// Invoke the callback if it is not owned.
template <typename Callback>
void AcquireOwnershipOnce(Callback acquire) {
if (!flag_) {
acquire();
flag_ = true;
}
}
typedef std::shared_ptr<T> RefPtr;
private:
bool flag_;
};
RefPtr m_sp;
// Copy-On-Write pointer.
// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
//
// The template parameter OwnershipFlags should have:
// * a constructor takes a bool. True if own.
// * SetOwnership(bool flag).
// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
// owned.
//
// https://en.wikipedia.org/wiki/Copy-on-write
template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
class COWPtr {
public:
// Ctor from raw pointer.
explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
COWPtr() : m_sp(nullptr) {}
explicit COWPtr(T* t) : m_sp(t) {}
// Move methods. Steal ownership from origin
COWPtr(COWPtr&& other)
: payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
COWPtr& operator=(COWPtr&& origin) = default;
const T& Data() const { return *m_sp; }
// Copy methods. Not own payload
COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
COWPtr& operator=(const COWPtr& other) {
payload_ = other.payload_;
ownership_.SetOwnership(false);
return *this;
}
// Access read only data.
const T& Data() const { return *payload_; }
// Access mutable data. If the data is not owned, the data will be copied
// before.
T* MutableData() {
ownership_.AcquireOwnershipOnce(
[this] { payload_.reset(new T(*payload_)); });
return payload_.get();
DetachIfNotUnique();
return m_sp.get();
}
private:
// Actual data pointer.
std::shared_ptr<T> payload_;
void DetachIfNotUnique() {
T* tmp = m_sp.get();
if (!(tmp == nullptr || m_sp.unique())) {
Detach();
}
}
// Ownership flag.
OwnershipFlags ownership_;
void Detach() {
T* tmp = m_sp.get();
m_sp = RefPtr(new T(*tmp));
}
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -30,6 +30,14 @@ TEST(COWPtr, all) {
ASSERT_EQ(ptr2.Data(), 10);
}
TEST(COWPtr, change_old) {
COWPtr<int> ptr(new int{0});
COWPtr<int> ptr2 = ptr;
*ptr.MutableData() = 10;
ASSERT_EQ(ptr2.Data(), 0);
ASSERT_EQ(ptr.Data(), 10);
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -41,6 +41,8 @@ cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass
set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
cc_library(pass_builder SRCS pass_builder.cc DEPS pass)
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
......
......@@ -257,6 +257,22 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PDPattern external_pattern, subblock_pattern;
// Use the following variables to tell whether this model is RNN1.
// This fuse can only works on the RNN1 model.
std::unordered_set<std::string> specified_vars({"data_lod_attention",
"cell_init", "hidden_init",
"data", "week", "minute"});
int count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsVar() && specified_vars.count(node->Name())) {
++count;
}
}
if (count < specified_vars.size()) {
return graph;
}
// Continue to fuse.
FindWhileOp(graph.get());
return graph;
}
......
......@@ -77,10 +77,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
const std::string BatchedCellPreAct =
patterns::UniqueKey("BatchedCellPreAct");
const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
const std::string CheckedCell = patterns::UniqueKey("CheckedCell");
scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
scope->Var(CheckedCell)->GetMutable<framework::LoDTensor>();
op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {});
......@@ -90,6 +92,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
op_desc.SetOutput("BatchedGate", {BatchedGate});
op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
op_desc.SetOutput("BatchedInput", {BatchedInput});
op_desc.SetOutput("CheckedCell", {CheckedCell});
op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
// TODO(TJ): get from attr
......
......@@ -19,7 +19,6 @@ namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
PADDLE_ENFORCE(!applied_, "Pass can only Apply() once.");
PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty.");
for (const std::string& attr : required_pass_attrs_) {
PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),
......
......@@ -42,6 +42,8 @@ class Pass {
attr_dels_.clear();
}
std::string Type() const { return type_; }
std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const;
// Get a reference to the attributed previously set.
......@@ -52,6 +54,21 @@ class Pass {
return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
}
bool Has(const std::string &attr_name) const {
return attrs_.find(attr_name) != attrs_.end();
}
void Erase(const std::string &attr_name) {
if (!Has(attr_name)) {
return;
}
if (attr_dels_.find(attr_name) != attr_dels_.end()) {
attr_dels_[attr_name]();
attr_dels_.erase(attr_name);
}
attrs_.erase(attr_name);
}
// Set a pointer to the attribute. Pass takes ownership of the attribute.
template <typename AttrType>
void Set(const std::string &attr_name, AttrType *attr) {
......@@ -68,13 +85,15 @@ class Pass {
// should delete the attribute.
template <typename AttrType>
void SetNotOwned(const std::string &attr_name, AttrType *attr) {
PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
attr_name);
attrs_[attr_name] = attr;
}
protected:
virtual std::unique_ptr<Graph> ApplyImpl(
std::unique_ptr<Graph> graph) const = 0;
virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
LOG(FATAL) << "Calling virtual Pass not implemented.";
}
private:
template <typename PassType>
......@@ -89,7 +108,10 @@ class Pass {
required_graph_attrs_.insert(attrs.begin(), attrs.end());
}
void RegisterType(const std::string &type) { type_ = type; }
mutable bool applied_{false};
std::string type_;
std::unordered_set<std::string> required_pass_attrs_;
std::unordered_set<std::string> required_graph_attrs_;
std::map<std::string, boost::any> attrs_;
......@@ -143,10 +165,11 @@ struct PassRegistrar : public Registrar {
PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
"'%s' is registered more than once.", pass_type);
PassRegistry::Instance().Insert(
pass_type, [this]() -> std::unique_ptr<Pass> {
pass_type, [this, pass_type]() -> std::unique_ptr<Pass> {
std::unique_ptr<Pass> pass(new PassType());
pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
pass->RegisterType(pass_type);
return pass;
});
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/pass_builder.h"
namespace paddle {
namespace framework {
namespace ir {
std::shared_ptr<Pass> PassBuilder::AppendPass(const std::string& pass_type) {
auto pass = ir::PassRegistry::Instance().Get(pass_type);
passes_.emplace_back(pass.release());
return passes_.back();
}
void PassBuilder::RemovePass(size_t idx) {
PADDLE_ENFORCE(passes_.size() > idx);
passes_.erase(passes_.begin() + idx);
}
std::shared_ptr<Pass> PassBuilder::InsertPass(size_t idx,
const std::string& pass_type) {
PADDLE_ENFORCE(passes_.size() >= idx);
std::shared_ptr<Pass> pass(
ir::PassRegistry::Instance().Get(pass_type).release());
passes_.insert(passes_.begin() + idx, std::move(pass));
return passes_[idx];
}
} // namespace ir
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class PassBuilder {
public:
PassBuilder() {}
virtual ~PassBuilder() {}
// Append a new pass to the end.
std::shared_ptr<Pass> AppendPass(const std::string& pass_type);
// Insert a new pass after `idx`.
std::shared_ptr<Pass> InsertPass(size_t idx, const std::string& pass_type);
// Remove a new pass at `idx`.
void RemovePass(size_t idx);
// Returns a list of all passes.
std::vector<std::shared_ptr<Pass>> AllPasses() const { return passes_; }
protected:
std::vector<std::shared_ptr<Pass>> passes_;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -82,12 +82,10 @@ TEST(PassTest, TestPassAttrCheck) {
ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
try {
// Allow apply more than once.
graph.reset(new Graph(prog));
graph->Set<int>("test_graph_attr", new int);
graph = pass->Apply(std::move(graph));
} catch (paddle::platform::EnforceNotMet e) {
exception = std::string(e.what());
}
ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
pass = PassRegistry::Instance().Get("test_pass");
pass->SetNotOwned<int>("test_pass_attr", &val);
......
......@@ -17,10 +17,13 @@
#include <algorithm>
#include <initializer_list>
#include <memory>
#include <mutex> // NOLINT
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/memcpy.h"
#include "glog/logging.h"
......@@ -28,173 +31,167 @@ namespace paddle {
namespace framework {
#if defined(PADDLE_WITH_CUDA)
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template <typename T>
class Vector {
public:
using value_type = T;
namespace details {
struct CUDABuffer {
void *data_{nullptr};
size_t size_{0};
platform::CUDAPlace place_;
// Default ctor. Create empty Vector
Vector() { InitEmpty(); }
// Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T &value = T()) {
InitEmpty();
if (count != 0) {
resize(count);
T *ptr = begin();
for (size_t i = 0; i < count; ++i) {
ptr[i] = value;
}
}
CUDABuffer() {}
CUDABuffer(platform::Place place, size_t size)
: size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
data_ = memory::Alloc(place_, size);
}
// Ctor with init_list
Vector(std::initializer_list<T> init) {
if (init.size() == 0) {
InitEmpty();
} else {
InitByIter(init.size(), init.begin(), init.end());
~CUDABuffer() { ClearMemory(); }
CUDABuffer(const CUDABuffer &o) = delete;
CUDABuffer &operator=(const CUDABuffer &o) = delete;
void Resize(platform::Place place, size_t size) {
ClearMemory();
place_ = boost::get<platform::CUDAPlace>(place);
data_ = memory::Alloc(place_, size);
PADDLE_ENFORCE_NOT_NULL(data_);
size_ = size;
}
void Swap(CUDABuffer &o) {
std::swap(data_, o.data_);
std::swap(place_, o.place_);
std::swap(size_, o.size_);
}
// implicit cast from std::vector.
template <typename U>
Vector(const std::vector<U> &dat) { // NOLINT
if (dat.size() == 0) {
InitEmpty();
} else {
InitByIter(dat.size(), dat.begin(), dat.end());
private:
void ClearMemory() const {
if (data_ != nullptr) {
memory::Free(place_, data_);
}
}
};
} // namespace details
// Copy ctor
Vector(const Vector<T> &other) { this->operator=(other); }
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template <typename T>
class Vector {
public:
using value_type = T;
using iterator = typename std::vector<T>::iterator;
using const_iterator = typename std::vector<T>::const_iterator;
// Copy operator
Vector<T> &operator=(const Vector<T> &other) {
if (other.size() != 0) {
this->InitByIter(other.size(), other.begin(), other.end());
} else {
InitEmpty();
}
return *this;
}
private:
// The actual class to implement vector logic
class VectorData {
public:
VectorData() : flag_(kDataInCPU) {}
VectorData(size_t count, const T &value)
: cpu_(count, value), flag_(kDataInCPU) {}
VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
template <typename U>
explicit VectorData(const std::vector<U> &dat)
: cpu_(dat), flag_(kDataInCPU) {}
~VectorData() {}
// Move ctor
Vector(Vector<T> &&other) {
this->size_ = other.size_;
this->flag_ = other.flag_;
if (other.cuda_vec_.memory_size()) {
this->cuda_vec_.ShareDataWith(other.cuda_vec_);
}
if (other.cpu_vec_.memory_size()) {
this->cpu_vec_.ShareDataWith(other.cpu_vec_);
VectorData(const VectorData &o) {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
}
VectorData &operator=(const VectorData &o) {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
details::CUDABuffer null;
gpu_.Swap(null);
return *this;
}
// CPU data access method. Mutable.
T &operator[](size_t i) {
MutableCPU();
return const_cast<T *>(cpu_vec_.data<T>())[i];
return cpu_[i];
}
// CPU data access method. Immutable.
const T &operator[](size_t i) const {
ImmutableCPU();
return cpu_vec_.data<T>()[i];
return cpu_[i];
}
// std::vector iterator methods. Based on CPU data access method
size_t size() const { return size_; }
size_t size() const { return cpu_.size(); }
T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
iterator begin() {
MutableCPU();
return cpu_.begin();
}
T *end() {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
iterator end() {
MutableCPU();
return cpu_.end();
}
T &front() { return *begin(); }
T &front() {
MutableCPU();
return cpu_.front();
}
T &back() {
auto it = end();
--it;
return *it;
MutableCPU();
return cpu_.back();
}
const T *begin() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
const_iterator begin() const {
ImmutableCPU();
return cpu_.begin();
}
const T *end() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
const_iterator end() const {
ImmutableCPU();
return cpu_.end();
}
const T *cbegin() const { return begin(); }
const T *cend() const { return end(); }
const T &back() const {
auto it = end();
--it;
return *it;
ImmutableCPU();
return cpu_.back();
}
T *data() { return begin(); }
T *data() { return &(*this)[0]; }
const T *data() const { return begin(); }
const T *data() const { return &(*this)[0]; }
const T &front() const { return *begin(); }
// end of std::vector iterator methods
const T &front() const {
ImmutableCPU();
return cpu_.front();
}
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
InitByIter(end - begin, begin, end);
MutableCPU();
cpu_.assign(begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) {
if (size_ + 1 > capacity()) {
reserve((size_ + 1) << 1);
}
*end() = elem;
++size_;
MutableCPU();
cpu_.push_back(elem);
}
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
size_t pre_size = size_;
resize(pre_size + (end - begin));
T *ptr = this->begin() + pre_size;
for (; begin < end; ++begin, ++ptr) {
*ptr = *begin;
}
MutableCPU();
auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
std::copy(begin, end, out_it);
}
// resize the vector
void resize(size_t size) {
if (size + 1 <= capacity()) {
size_ = size;
} else {
MutableCPU();
Tensor cpu_tensor;
platform::Place cpu = platform::CPUPlace();
T *ptr = cpu_tensor.mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
const T *old_ptr =
cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
if (old_ptr != nullptr) {
std::copy(old_ptr, old_ptr + size_, ptr);
}
size_ = size;
cpu_vec_.ShareDataWith(cpu_tensor);
}
cpu_.resize(size);
}
// get cuda ptr. immutable
......@@ -202,7 +199,7 @@ class Vector {
PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place");
ImmutableCUDA(place);
return cuda_vec_.data<T>();
return reinterpret_cast<T *>(gpu_.data_);
}
// get cuda ptr. mutable
......@@ -214,77 +211,39 @@ class Vector {
// clear
void clear() {
size_ = 0;
cpu_.clear();
flag_ = kDirty | kDataInCPU;
}
size_t capacity() const {
return cpu_vec_.memory_size() / SizeOfType(typeid(T));
}
size_t capacity() const { return cpu_.capacity(); }
// reserve data
void reserve(size_t size) {
size_t pre_size = size_;
resize(size);
resize(pre_size);
}
// the unify method to access CPU or CUDA data. immutable.
const T *Data(platform::Place place) const {
if (platform::is_gpu_place(place)) {
return CUDAData(place);
} else {
return data();
}
}
// the unify method to access CPU or CUDA data. mutable.
T *MutableData(platform::Place place) {
if (platform::is_gpu_place(place)) {
return CUDAMutableData(place);
} else {
return data();
}
}
void reserve(size_t size) const { cpu_.reserve(size); }
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const {
std::vector<T> result;
result.resize(size());
std::copy(begin(), end(), result.begin());
return result;
ImmutableCPU();
return cpu_;
}
bool operator==(const Vector<T> &other) const {
if (size() != other.size()) return false;
auto it1 = cbegin();
auto it2 = other.cbegin();
for (; it1 < cend(); ++it1, ++it2) {
if (*it1 != *it2) {
return false;
}
}
return true;
bool operator==(const VectorData &other) const {
ImmutableCPU();
other.ImmutableCPU();
return cpu_ == other.cpu_;
}
private:
void InitEmpty() {
size_ = 0;
flag_ = kDataInCPU;
}
std::mutex &Mutex() const { return mtx_; }
template <typename Iter>
void InitByIter(size_t size, Iter begin, Iter end) {
platform::Place cpu = platform::CPUPlace();
T *ptr = this->cpu_vec_.template mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
for (size_t i = 0; i < size; ++i) {
*ptr++ = *begin++;
}
flag_ = kDataInCPU | kDirty;
size_ = size;
std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
if (gpu_.data_ == nullptr) {
return nullptr;
} else {
return std::unique_ptr<platform::CUDAPlace>(
new platform::CUDAPlace(gpu_.place_));
}
}
private:
enum DataFlag {
kDataInCPU = 0x01,
kDataInCUDA = 0x02,
......@@ -294,8 +253,15 @@ class Vector {
void CopyToCPU() const {
// COPY GPU Data To CPU
TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
WaitPlace(cuda_vec_.place());
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(
platform::Place(gpu_.place_)));
auto stream = dev_ctx->stream();
void *src = gpu_.data_;
void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
stream);
dev_ctx->Wait();
}
void MutableCPU() {
......@@ -308,16 +274,12 @@ class Vector {
void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) {
if (IsInCPU()) {
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
CopyCPUDataToCUDA(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() && !(place == cuda_vec_.place())) {
framework::Tensor tmp;
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
cuda_vec_.ShareDataWith(tmp);
} else if (IsInCUDA() &&
!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
PADDLE_THROW("This situation should not happen");
// Still dirty
} else {
// Dirty && DataInCUDA && Device is same
......@@ -326,17 +288,10 @@ class Vector {
} else {
if (!IsInCUDA()) {
// Even data is not dirty. However, data is not in CUDA. Copy data.
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
CopyCPUDataToCUDA(place);
SetFlag(kDataInCUDA);
} else if (!(place == cuda_vec_.place())) {
framework::Tensor tmp;
WaitPlace(cuda_vec_.place());
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
WaitPlace(place);
cuda_vec_.ShareDataWith(tmp);
} else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
PADDLE_THROW("This situation should not happen.");
} else {
// Not Dirty && DataInCUDA && Device is same
// Do nothing.
......@@ -344,9 +299,20 @@ class Vector {
}
}
void CopyCPUDataToCUDA(const platform::Place &place) const {
void *src = cpu_.data();
gpu_.Resize(place, cpu_.size() * sizeof(T));
void *dst = gpu_.data_;
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
stream);
}
void ImmutableCPU() const {
if (IsDirty() &&
!IsInCPU()) { // If data has been changed in CUDA, or CPU has no data.
if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or
// CPU has no data.
CopyToCPU();
UnsetFlag(kDirty);
}
......@@ -362,23 +328,178 @@ class Vector {
bool IsInCPU() const { return flag_ & kDataInCPU; }
static void WaitPlace(const platform::Place place) {
mutable std::vector<T> cpu_;
mutable details::CUDABuffer gpu_;
mutable int flag_;
mutable std::mutex mtx_;
};
public:
// Default ctor. Create empty Vector
Vector() : m_(new VectorData()) {}
// Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T &value = T())
: m_(new VectorData(count, value)) {}
// Ctor with init_list
Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}
// implicit cast from std::vector.
template <typename U>
Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) { // NOLINT
}
// Copy ctor
Vector(const Vector<T> &other) { m_ = other.m_; }
// Copy operator
Vector<T> &operator=(const Vector<T> &other) {
m_ = other.m_;
return *this;
}
// Move ctor
Vector(Vector<T> &&other) { m_ = std::move(other.m_); }
// CPU data access method. Mutable.
T &operator[](size_t i) { return (*m_.MutableData())[i]; }
// CPU data access method. Immutable.
const T &operator[](size_t i) const { return m_.Data()[i]; }
// std::vector iterator methods. Based on CPU data access method
size_t size() const { return m_.Data().size(); }
iterator begin() { return m_.MutableData()->begin(); }
iterator end() { return m_.MutableData()->end(); }
T &front() { return m_.MutableData()->front(); }
T &back() { return m_.MutableData()->back(); }
const_iterator begin() const { return m_.Data().begin(); }
const_iterator end() const { return m_.Data().end(); }
const_iterator cbegin() const { return begin(); }
const_iterator cend() const { return end(); }
const T &back() const { return m_.Data().back(); }
T *data() { return m_.MutableData()->data(); }
const T *data() const { return m_.Data().data(); }
const T &front() const { return m_.Data().front(); }
// end of std::vector iterator methods
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
m_.MutableData()->assign(begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) { m_.MutableData()->push_back(elem); }
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
m_.MutableData()->Extend(begin, end);
}
// resize the vector
void resize(size_t size) {
if (m_.Data().size() != size) {
m_.MutableData()->resize(size);
}
}
// get cuda ptr. immutable
const T *CUDAData(platform::Place place) const {
{
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.Data().CUDAData(place);
}
}
// If m_ contains CUDAData in a different place. Detach manually.
m_.Detach();
return CUDAData(place);
}
// get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) {
{
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.MutableData()->CUDAMutableData(place);
}
}
// If m_ contains CUDAData in a different place. Detach manually.
m_.Detach();
return CUDAMutableData(place);
}
// clear
void clear() { m_.MutableData()->clear(); }
size_t capacity() const { return m_.Data().capacity(); }
// reserve data
void reserve(size_t size) { m_.Data().reserve(size); }
// the unify method to access CPU or CUDA data. immutable.
const T *Data(platform::Place place) const {
if (platform::is_gpu_place(place)) {
return CUDAData(place);
} else {
return data();
}
}
// the unify method to access CPU or CUDA data. mutable.
T *MutableData(platform::Place place) {
if (platform::is_gpu_place(place)) {
platform::DeviceContextPool::Instance()
.Get(boost::get<platform::CUDAPlace>(place))
->Wait();
return CUDAMutableData(place);
} else {
return data();
}
}
static T &EmptyDummy() {
static T dummy = T();
return dummy;
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const { return m_.Data(); }
bool operator==(const Vector<T> &other) const {
if (size() != other.size()) return false;
auto it1 = cbegin();
auto it2 = other.cbegin();
for (; it1 < cend(); ++it1, ++it2) {
if (*it1 != *it2) {
return false;
}
}
return true;
}
mutable int flag_;
mutable Tensor cpu_vec_;
mutable Tensor cuda_vec_;
size_t size_;
const void *Handle() const { return &m_.Data(); }
private:
// Vector is an COW object.
mutable details::COWPtr<VectorData> m_;
};
#else // PADDLE_WITH_CUDA
......
......@@ -38,31 +38,27 @@ struct OpInfo {
OpAttrChecker* checker_{nullptr};
InferVarTypeFN infer_var_type_;
InferShapeFN infer_shape_;
std::string op_type_;
bool HasOpProtoAndChecker() const {
return proto_ != nullptr && checker_ != nullptr;
}
const proto::OpProto& Proto() const {
PADDLE_ENFORCE_NOT_NULL(proto_, "Operator %s Proto has not been registered",
op_type_);
PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
PADDLE_ENFORCE(proto_->IsInitialized(),
"Operator %s Proto must be initialized in op info",
op_type_);
"Operator Proto must be initialized in op info");
return *proto_;
}
const OpCreator& Creator() const {
PADDLE_ENFORCE_NOT_NULL(
creator_, "Operator %s Creator has not been registered", op_type_);
PADDLE_ENFORCE_NOT_NULL(creator_,
"Operator Creator has not been registered");
return creator_;
}
const GradOpMakerFN& GradOpMaker() const {
PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
"Operator %s GradOpMaker has not been registered.",
op_type_);
"Operator GradOpMaker has not been registered.");
return grad_op_maker_;
}
......@@ -77,9 +73,8 @@ class OpInfoMap {
return map_.find(op_type) != map_.end();
}
void Insert(const std::string& type, OpInfo info) {
void Insert(const std::string& type, const OpInfo& info) {
PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
info.op_type_ = type;
map_.insert({type, info});
}
......
......@@ -19,15 +19,13 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -35,80 +33,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes, const bool use_cuda,
#ifdef PADDLE_WITH_CUDA
const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
#else
const BuildStrategy &strategy) {
#endif
// Convert the program to graph.
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
// Apply a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
graph = viz_pass->Apply(std::move(graph));
}
// Apply op fusion.
if (strategy.fuse_elewise_add_act_ops_) {
auto fuse_elewise_add_act_pass =
ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass");
graph = fuse_elewise_add_act_pass->Apply(std::move(graph));
// Apply a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
graph = viz_pass->Apply(std::move(graph));
}
}
// Convert graph to run on multi-devices.
auto multi_devices_pass =
ir::PassRegistry::Instance().Get("multi_devices_pass");
multi_devices_pass->SetNotOwned<const std::vector<platform::Place>>("places",
&places);
multi_devices_pass->SetNotOwned<const std::string>("loss_var_name",
&loss_var_name);
multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
"params", &param_names);
multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
multi_devices_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
graph = multi_devices_pass->Apply(std::move(graph));
// Apply a graph print pass to record a graph with device info.
if (!strategy.debug_graphviz_path_.empty()) {
auto multi_devices_print_pass =
ir::PassRegistry::Instance().Get("multi_devices_print_pass");
multi_devices_print_pass->SetNotOwned<const std::string>(
"debug_graphviz_path", &strategy.debug_graphviz_path_);
multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
"graph_printer", new details::GraphvizSSAGraphPrinter);
graph = multi_devices_print_pass->Apply(std::move(graph));
}
// Verify that the graph is correct for multi-device executor.
auto multi_devices_check_pass =
ir::PassRegistry::Instance().Get("multi_devices_check_pass");
graph = multi_devices_check_pass->Apply(std::move(graph));
return graph;
}
class ParallelExecutorPrivate {
public:
explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
......@@ -199,10 +123,9 @@ ParallelExecutor::ParallelExecutor(
// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
#ifdef PADDLE_WITH_CUDA
std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, params,
member_->local_scopes_, member_->use_cuda_, build_strategy,
member_->nccl_ctxs_.get());
member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
auto max_memory_size = GetEagerDeletionThreshold();
if (max_memory_size >= 0) {
......@@ -228,9 +151,9 @@ ParallelExecutor::ParallelExecutor(
}
}
#else
std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
main_program, member_->places_, loss_var_name, params,
member_->local_scopes_, member_->use_cuda_, build_strategy);
std::unique_ptr<ir::Graph> graph =
build_strategy.Apply(main_program, member_->places_, loss_var_name,
params, member_->local_scopes_, member_->use_cuda_);
#endif
if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
......@@ -373,12 +296,6 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace framework
} // namespace paddle
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
#ifdef PADDLE_WITH_CUDA
USE_PASS(reference_count_pass);
#endif
......@@ -14,14 +14,14 @@ limitations under the License. */
#pragma once
#include <paddle/fluid/framework/details/build_strategy.h>
#include <atomic>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
......
......@@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test {
selected_rows_.reset(new SelectedRows(rows, height));
Tensor* value = selected_rows_->mutable_value();
value->mutable_data<float>(
auto* data = value->mutable_data<float>(
make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
for (int64_t i = 0; i < value->numel(); ++i) {
data[i] = static_cast<float>(i);
}
}
protected:
......@@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
auto* dst_data = dst_tensor.value().data<float>();
for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) {
ASSERT_EQ(dst_data[i], static_cast<float>(i));
}
}
TEST(SelectedRows, SparseTable) {
......
......@@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) {
TEST(Analyzer, analysis_with_tensorrt) {
FLAGS_IA_enable_tensorrt_subgraph_engine = true;
Argument argument;
argument.Set<int>("minimum_subgraph_size", new int(0));
argument.Set<int>("max_batch_size", new int(3));
argument.Set<int>("workspace_size", new int(1 << 20));
argument.Set<std::string>("precision_mode", new std::string("FP32"));
argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
Analyzer analyser;
analyser.Run(&argument);
}
void TestWord2vecPrediction(const std::string &model_path) {
void TestWord2vecPrediction(const std::string& model_path) {
NativeConfig config;
config.model_dir = model_path;
config.use_gpu = false;
......@@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string &model_path) {
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << "data: "
<< static_cast<float *>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float *>(outputs.front().data.data())[i],
<< static_cast<float*>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
result[i]);
}
}
......
......@@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
}
}
void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
void CreateTrtEngineOp(Node *node, Argument *argument,
framework::proto::BlockDesc *block) {
PADDLE_ENFORCE(argument->main_dfg.get());
const DataFlowGraph &graph = *(argument->main_dfg);
static int counter{0};
PADDLE_ENFORCE(node->IsFunctionBlock());
framework::OpDesc desc;
......@@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
// Set attrs
SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
SetAttr(desc.Proto(), "max_batch_size", argument->Get<int>("max_batch_size"));
SetAttr(desc.Proto(), "workspace_size", argument->Get<int>("workspace_size"));
SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
......@@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
*block_desc.Proto()->mutable_vars() =
argument_->origin_program_desc->blocks(0).vars();
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
CreateTrtEngineOp(node, argument_, block_desc.Proto());
auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto *op = main_block->add_ops();
PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
......
......@@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
void SubGraphFuse::ReplaceNodesWithSubGraphs() {
auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
for (auto &subgraph : subgraphs) {
if (subgraph.size() <= argument_->Get<int>("minimum_subgraph_size"))
continue;
std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
// replace this sub-graph with the first node. Two steps: 1. Create a Block
// Node that contains this subgraph 2. Mark the nodes inside the sub-graph
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/inference/analysis/argument.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/node.h"
......@@ -63,8 +64,11 @@ class SubGraphFuse {
public:
using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller;
SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
: graph_(graph), node_inside_subgraph_teller_(teller) {}
SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller,
Argument *argument)
: graph_(graph),
node_inside_subgraph_teller_(teller),
argument_(argument) {}
// The main method which run all the logic.
void operator()();
......@@ -76,6 +80,7 @@ class SubGraphFuse {
private:
DataFlowGraph *graph_;
NodeInsideSubgraphTeller node_inside_subgraph_teller_;
Argument *argument_;
};
} // namespace analysis
......
......@@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) {
TEST(SubGraphSplitter, Fuse) {
auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
auto dfg = ProgramDescToDFG(desc);
Argument argument;
argument.Set<int>("minimum_subgraph_size", new int(3));
size_t count0 = dfg.nodes.size();
SubGraphFuse fuse(&dfg, teller);
SubGraphFuse fuse(&dfg, teller, &argument);
fuse();
int count1 = 0;
......
......@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
: node_inside_subgraph_teller_(teller) {}
void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_)();
SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)();
VLOG(4) << "debug info "
<< graph->HumanReadableInfo(false /*show_values*/,
true /*show_functions*/);
......
......@@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);
bool Initialize(Argument* argument) override { return true; }
bool Initialize(Argument* argument) override {
argument_ = argument;
return true;
}
// This class get a sub-graph as input and determine whether to transform this
// sub-graph into TensorRT.
......@@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
private:
NodeInsideSubgraphTeller node_inside_subgraph_teller_;
Argument* argument_;
};
} // namespace analysis
......
......@@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) {
};
Argument argument(FLAGS_inference_model_dir);
argument.Set<int>("minimum_subgraph_size", new int(0));
argument.Set<int>("max_batch_size", new int(3));
argument.Set<int>("workspace_size", new int(1 << 20));
argument.Set<std::string>("precision_mode", new std::string("FP32"));
DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
......
......@@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
FLAGS_IA_enable_tensorrt_subgraph_engine = true;
VLOG(3) << "Predictor::init()";
FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
FLAGS_tensorrt_workspace_size = config_.workspace_size;
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
......@@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
void OptimizeInferenceProgram() {
// Analyze inference_program
Argument argument;
argument.Set<int>("minimum_subgraph_size",
new int(config_.minimum_subgraph_size));
argument.Set<int>("max_batch_size", new int(config_.max_batch_size));
argument.Set<int>("workspace_size", new int(config_.workspace_size));
argument.Set<std::string>("precision_mode",
new std::string(config_.precision_mode));
if (!config_.model_dir.empty()) {
argument.fluid_model_dir.reset(new std::string(config_.model_dir));
} else {
......
......@@ -194,6 +194,14 @@ struct MixedRTConfig : public NativeConfig {
// For workspace_size, refer it from here:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
int workspace_size{1 << 30};
// We transform the Ops that can be converted into TRT layer in the model,
// and aggregate these Ops into subgraphs for TRT execution.
// We set this variable to control the minimum number of nodes in the
// subgraph, 3 as default value.
int minimum_subgraph_size = 3;
// Reserved configuration
// We just support "FP32" now, "FP16" and "INT8" will be supported.
std::string precision_mode = "FP32";
};
// NOTE WIP, not stable yet.
......@@ -204,10 +212,11 @@ struct AnalysisConfig : public NativeConfig {
kExclude // Specify the disabled passes in `ir_passes`.
};
// Determine whether to perform graph optimization.
bool enable_ir_optim = true;
// Manually determine the IR passes to run.
IrPassMode ir_mode{IrPassMode::kExclude};
// attention lstm fuse works only on some specific models, disable as default.
std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
std::vector<std::string> ir_passes;
// NOTE this is just for internal development, please not use it.
bool _use_mkldnn{false};
......
......@@ -90,3 +90,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
DEPS inference_anakin_api_shared dynload_cuda SERIAL)
endif()
endif()
if(WITH_GPU AND TENSORRT_FOUND)
set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
endif()
cc_test(test_trt_models SRCS trt_models_tester.cc
ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models
DEPS paddle_inference_tensorrt_subgraph_engine)
endif()
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
using paddle::contrib::MixedRTConfig;
DEFINE_string(dirname, "", "Directory of the inference model.");
NativeConfig GetConfigNative() {
NativeConfig config;
config.model_dir = FLAGS_dirname;
// LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.45;
config.use_gpu = true;
config.device = 0;
return config;
}
MixedRTConfig GetConfigTRT() {
MixedRTConfig config;
config.model_dir = FLAGS_dirname;
config.use_gpu = true;
config.fraction_of_gpu_memory = 0.2;
config.device = 0;
config.max_batch_size = 3;
return config;
}
void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
NativeConfig config0 = GetConfigNative();
config0.model_dir = model_dirname;
MixedRTConfig config1 = GetConfigTRT();
config1.model_dir = model_dirname;
config1.max_batch_size = batch_size;
auto predictor0 =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
auto predictor1 =
CreatePaddlePredictor<MixedRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config1);
// Prepare inputs
int height = 224;
int width = 224;
float *data = new float[batch_size * 3 * height * width];
memset(data, 0, sizeof(float) * (batch_size * 3 * height * width));
data[0] = 1.0f;
// Prepare inputs
PaddleTensor tensor;
tensor.name = "input_0";
tensor.shape = std::vector<int>({batch_size, 3, height, width});
tensor.data = PaddleBuf(static_cast<void *>(data),
sizeof(float) * (batch_size * 3 * height * width));
tensor.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
// Prepare outputs
std::vector<PaddleTensor> outputs0;
std::vector<PaddleTensor> outputs1;
CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
// Get output.
ASSERT_EQ(outputs0.size(), 1UL);
ASSERT_EQ(outputs1.size(), 1UL);
const size_t num_elements = outputs0.front().data.length() / sizeof(float);
const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
EXPECT_EQ(num_elements, num_elements1);
auto *data0 = static_cast<float *>(outputs0.front().data.data());
auto *data1 = static_cast<float *>(outputs1.front().data.data());
ASSERT_GT(num_elements, 0UL);
for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
EXPECT_NEAR(data0[i], data1[i], 1e-3);
}
}
TEST(trt_models_test, main) {
std::vector<std::string> infer_models = {"mobilenet", "resnet50",
"resnext50"};
for (auto &model_dir : infer_models) {
CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir);
}
}
} // namespace paddle
......@@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel {
"Out and Label should have same height.");
int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
int slide_steps = ctx->Attrs().Get<int>("slide_steps");
PADDLE_ENFORCE_GE(num_pred_buckets, 1, "num_thresholds must larger than 1");
PADDLE_ENFORCE_GE(slide_steps, 0, "slide_steps must be natural number");
ctx->SetOutputDim("AUC", {1});
ctx->SetOutputDim("BatchAUC", {1});
ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
slide_steps = slide_steps == 0 ? 1 : slide_steps;
ctx->SetOutputDim("StatPosOut", {slide_steps, num_pred_buckets});
ctx->SetOutputDim("StatNegOut", {slide_steps, num_pred_buckets});
}
protected:
......@@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label",
"A 2D int tensor indicating the label of the training data. "
"shape: [batch_size, 1]");
// TODO(typhoonzero): support weight input
AddInput("StatPos", "Statistic value when label = 1");
AddInput("StatNeg", "Statistic value when label = 0");
......@@ -69,18 +75,19 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("AUC",
"A scalar representing the "
"current area-under-the-curve.");
AddOutput("BatchAUC", "The AUC for current batch");
AddOutput("StatPosOut", "Statistic value when label = 1");
AddOutput("StatNegOut", "Statistic value when label = 0");
AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
.SetDefault("ROC");
AddAttr<int>("num_thresholds",
"The number of thresholds to use when discretizing the"
" roc curve.")
AddAttr<int>(
"num_thresholds",
"The number of thresholds to use when discretizing the roc curve.")
.SetDefault((2 << 12) - 1);
AddAttr<int>("slide_steps", "Use slide steps to calc batch auc.")
.SetDefault(1);
AddComment(R"DOC(
Area Under The Curve (AUC) Operator.
......
......@@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel<T> {
std::string curve = ctx.Attr<std::string>("curve");
int num_thresholds = ctx.Attr<int>("num_thresholds");
// buckets contain numbers from 0 to num_thresholds
int num_pred_buckets = num_thresholds + 1;
int slide_steps = ctx.Attr<int>("slide_steps");
// Only use output var for now, make sure it's persistable and
// not cleaned up for each batch.
......@@ -40,16 +42,19 @@ class AucKernel : public framework::OpKernel<T> {
auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
auto *stat_neg = ctx.Output<Tensor>("StatNegOut");
auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
auc);
auto *origin_stat_pos = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
auto *origin_stat_neg = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
num_thresholds, batch_auc);
std::vector<int64_t> stat_pos_data(num_pred_buckets, 0);
std::vector<int64_t> stat_neg_data(num_pred_buckets, 0);
auto stat_pos_calc = stat_pos_data.data();
auto stat_neg_calc = stat_neg_data.data();
statAuc(label, predict, num_pred_buckets, num_thresholds, slide_steps,
origin_stat_pos, origin_stat_neg, &stat_pos_calc, &stat_neg_calc);
calcAuc(ctx, stat_pos_calc, stat_neg_calc, num_thresholds, auc);
}
private:
......@@ -58,29 +63,76 @@ class AucKernel : public framework::OpKernel<T> {
return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
}
inline static void calcAuc(const framework::ExecutionContext &ctx,
const framework::Tensor *label,
inline static void statAuc(const framework::Tensor *label,
const framework::Tensor *predict,
int64_t *stat_pos, int64_t *stat_neg,
int num_thresholds,
framework::Tensor *auc_tensor) {
const int num_pred_buckets,
const int num_thresholds, const int slide_steps,
int64_t *origin_stat_pos, int64_t *origin_stat_neg,
int64_t **stat_pos, int64_t **stat_neg) {
size_t batch_size = predict->dims()[0];
size_t inference_width = predict->dims()[1];
const T *inference_data = predict->data<T>();
const auto *label_data = label->data<int64_t>();
auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
for (size_t i = 0; i < batch_size; i++) {
uint32_t binIdx = static_cast<uint32_t>(
inference_data[i * inference_width + 1] * num_thresholds);
if (label_data[i]) {
stat_pos[binIdx] += 1.0;
(*stat_pos)[binIdx] += 1.0;
} else {
(*stat_neg)[binIdx] += 1.0;
}
}
int bucket_length = num_pred_buckets * sizeof(int64_t);
// will stat auc unlimited.
if (slide_steps == 0) {
for (int slide = 0; slide < num_pred_buckets; ++slide) {
origin_stat_pos[slide] += (*stat_pos)[slide];
origin_stat_neg[slide] += (*stat_neg)[slide];
}
*stat_pos = origin_stat_pos;
*stat_neg = origin_stat_neg;
} else {
stat_neg[binIdx] += 1.0;
for (int slide = 1; slide < slide_steps; ++slide) {
int dst_idx = (slide - 1) * num_pred_buckets;
int src_inx = slide * num_pred_buckets;
std::memcpy(origin_stat_pos + dst_idx, origin_stat_pos + src_inx,
bucket_length);
std::memcpy(origin_stat_neg + dst_idx, origin_stat_neg + src_inx,
bucket_length);
}
std::memcpy(origin_stat_pos + (slide_steps - 1) * num_pred_buckets,
*stat_pos, bucket_length);
std::memcpy(origin_stat_neg + (slide_steps - 1) * num_pred_buckets,
*stat_neg, bucket_length);
std::memset(*stat_pos, 0, bucket_length);
std::memset(*stat_neg, 0, bucket_length);
for (int slide = 0; slide < num_pred_buckets; ++slide) {
int stat_pos_steps = 0;
int stat_neg_steps = 0;
for (int step = 0; step < slide_steps; ++step) {
stat_pos_steps += origin_stat_pos[slide + step * num_pred_buckets];
stat_neg_steps += origin_stat_neg[slide + step * num_pred_buckets];
}
(*stat_pos)[slide] += stat_pos_steps;
(*stat_neg)[slide] += stat_neg_steps;
}
}
}
inline static void calcAuc(const framework::ExecutionContext &ctx,
int64_t *stat_pos, int64_t *stat_neg,
int num_thresholds,
framework::Tensor *auc_tensor) {
auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
*auc = 0.0f;
double totPos = 0.0;
......@@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel<T> {
totPos += stat_pos[idx];
totNeg += stat_neg[idx];
*auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
--idx;
}
......
......@@ -30,7 +30,13 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
polygon_box_transform_op.cu)
detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
if(WITH_GPU)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
else()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
endif()
detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
#Export local libraries to parent
set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
......@@ -69,7 +70,7 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Anchors")->type()),
platform::CPUPlace());
ctx.device_context());
}
};
......@@ -162,7 +163,7 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
const T *im_info_data = im_info.data<T>();
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
T im_scale = im_info_data[2];
keep->Resize({boxes->dims()[0], 1});
keep->Resize({boxes->dims()[0]});
min_size = std::max(min_size, 1.0f);
int *keep_data = keep->mutable_data<int>(ctx.GetPlace());
......@@ -463,7 +464,7 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("post_nms_topN", "post_nms_topN");
AddAttr<float>("nms_thresh", "nms_thres");
AddAttr<float>("min_size", "min size");
AddAttr<float>("eta", "eta");
AddAttr<float>("eta", "The parameter for adaptive NMS.");
AddComment(R"DOC(
Generate Proposals OP
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdio.h>
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
namespace {
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
int const kThreadsPerBlock = sizeof(uint64_t) * 8;
template <typename T>
__global__ void RangeInitKernel(const T start, const T delta, const int size,
T *out) {
CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; }
}
template <typename T>
void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
Tensor *value_out, Tensor *index_out) {
int num = value.numel();
Tensor index_in_t;
int *idx_in = index_in_t.mutable_data<int>({num}, ctx.GetPlace());
int block = 512;
auto stream = ctx.stream();
RangeInitKernel<<<DIVUP(num, block), block, 0, stream>>>(0, 1, num, idx_in);
int *idx_out = index_out->mutable_data<int>({num}, ctx.GetPlace());
const T *keys_in = value.data<T>();
T *keys_out = value_out->mutable_data<T>({num}, ctx.GetPlace());
// Determine temporary device storage requirements
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
num);
// Allocate temporary storage
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
d_temp_storage = memory::Alloc(place, temp_storage_bytes);
// Run sorting operation
cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
num);
memory::Free(place, d_temp_storage);
}
template <typename T>
__device__ __forceinline__ T Min(T x, T y) {
return x < y ? x : y;
}
template <typename T>
__device__ __forceinline__ T Max(T x, T y) {
return x > y ? x : y;
}
template <typename T>
__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
const T *var, const int *index,
const T *im_info, const int num,
T *proposals) {
T kBBoxClipDefault = log(1000.0 / 16.0);
CUDA_1D_KERNEL_LOOP(i, num) {
int k = index[i] * 4;
T axmin = anchor[k];
T aymin = anchor[k + 1];
T axmax = anchor[k + 2];
T aymax = anchor[k + 3];
T w = axmax - axmin + 1.0;
T h = aymax - aymin + 1.0;
T cx = axmin + 0.5 * w;
T cy = aymin + 0.5 * h;
T dxmin = deltas[k];
T dymin = deltas[k + 1];
T dxmax = deltas[k + 2];
T dymax = deltas[k + 3];
T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.;
if (var) {
d_cx = cx + dxmin * w * var[k];
d_cy = cy + dymin * h * var[k + 1];
d_w = exp(Min<T>(dxmax * var[k + 2], kBBoxClipDefault)) * w;
d_h = exp(Min<T>(dymax * var[k + 3], kBBoxClipDefault)) * h;
} else {
d_cx = cx + dxmin * w;
d_cy = cy + dymin * h;
d_w = exp(Min<T>(dxmax, kBBoxClipDefault)) * w;
d_h = exp(Min<T>(dymax, kBBoxClipDefault)) * h;
}
T oxmin = d_cx - d_w * 0.5;
T oymin = d_cy - d_h * 0.5;
T oxmax = d_cx + d_w * 0.5 - 1.;
T oymax = d_cy + d_h * 0.5 - 1.;
proposals[i * 4] = Max<T>(Min<T>(oxmin, im_info[1] - 1.), 0.);
proposals[i * 4 + 1] = Max<T>(Min<T>(oymin, im_info[0] - 1.), 0.);
proposals[i * 4 + 2] = Max<T>(Min<T>(oxmax, im_info[1] - 1.), 0.);
proposals[i * 4 + 3] = Max<T>(Min<T>(oymax, im_info[0] - 1.), 0.);
}
}
template <typename T, int BlockSize>
__global__ void FilterBBoxes(const T *bboxes, const T *im_info,
const T min_size, const int num, int *keep_num,
int *keep) {
T im_h = im_info[0];
T im_w = im_info[1];
T im_scale = im_info[2];
int cnt = 0;
__shared__ int keep_index[BlockSize];
CUDA_1D_KERNEL_LOOP(i, num) {
keep_index[threadIdx.x] = -1;
__syncthreads();
int k = i * 4;
T xmin = bboxes[k];
T ymin = bboxes[k + 1];
T xmax = bboxes[k + 2];
T ymax = bboxes[k + 3];
T w = xmax - xmin + 1.0;
T h = ymax - ymin + 1.0;
T cx = xmin + w / 2.;
T cy = ymin + h / 2.;
T w_s = (xmax - xmin) / im_scale + 1.;
T h_s = (ymax - ymin) / im_scale + 1.;
if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) {
keep_index[threadIdx.x] = i;
}
__syncthreads();
if (threadIdx.x == 0) {
int size = (num - i) < BlockSize ? num - i : BlockSize;
for (int j = 0; j < size; ++j) {
if (keep_index[j] > -1) {
keep[cnt++] = keep_index[j];
}
}
}
__syncthreads();
}
if (threadIdx.x == 0) {
keep_num[0] = cnt;
}
}
__device__ inline float IoU(const float *a, const float *b) {
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
float inter_s = width * height;
float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return inter_s / (s_a + s_b - inter_s);
}
__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
const float *dev_boxes, uint64_t *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
const int row_size =
min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock);
const int col_size =
min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock);
__shared__ float block_boxes[kThreadsPerBlock * 4];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 4 + 0] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0];
block_boxes[threadIdx.x * 4 + 1] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1];
block_boxes[threadIdx.x * 4 + 2] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2];
block_boxes[threadIdx.x * 4 + 3] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x;
const float *cur_box = dev_boxes + cur_box_idx * 4;
int i = 0;
uint64_t t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
template <typename T>
void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
const Tensor &sorted_indices, const T nms_threshold,
Tensor *keep_out) {
int boxes_num = proposals.dims()[0];
PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]);
const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock);
dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock),
DIVUP(boxes_num, kThreadsPerBlock));
dim3 threads(kThreadsPerBlock);
const T *boxes = proposals.data<T>();
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
int size_bytes = boxes_num * col_blocks * sizeof(uint64_t);
uint64_t *d_mask =
reinterpret_cast<uint64_t *>(memory::Alloc(place, size_bytes));
NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, d_mask);
uint64_t *h_mask = reinterpret_cast<uint64_t *>(
memory::Alloc(platform::CPUPlace(), size_bytes));
memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0);
std::vector<uint64_t> remv(col_blocks);
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
std::vector<int> keep_vec;
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / kThreadsPerBlock;
int inblock = i % kThreadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
++num_to_keep;
keep_vec.push_back(i);
uint64_t *p = &h_mask[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
}
}
}
int *keep = keep_out->mutable_data<int>({num_to_keep}, ctx.GetPlace());
memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(),
sizeof(int) * num_to_keep, 0);
memory::Free(place, d_mask);
memory::Free(platform::CPUPlace(), h_mask);
}
template <typename T>
std::pair<Tensor, Tensor> ProposalForOneImage(
const platform::CUDADeviceContext &ctx, const Tensor &im_info,
const Tensor &anchors, const Tensor &variances,
const Tensor &bbox_deltas, // [M, 4]
const Tensor &scores, // [N, 1]
int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
float eta) {
// 1. pre nms
Tensor scores_sort, index_sort;
SortDescending<T>(ctx, scores, &scores_sort, &index_sort);
int num = scores.numel();
int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel()
: pre_nms_top_n;
scores_sort.Resize({pre_nms_num, 1});
index_sort.Resize({pre_nms_num, 1});
// 2. box decode and clipping
Tensor proposals;
proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace());
int block = 512;
auto stream = ctx.stream();
BoxDecodeAndClipKernel<T><<<DIVUP(pre_nms_num, block), block, 0, stream>>>(
anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
index_sort.data<int>(), im_info.data<T>(), pre_nms_num,
proposals.data<T>());
// 3. filter
Tensor keep_index, keep_num_t;
keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace());
keep_num_t.mutable_data<int>({1}, ctx.GetPlace());
min_size = std::max(min_size, 1.0f);
FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
proposals.data<T>(), im_info.data<T>(), min_size, pre_nms_num,
keep_num_t.data<int>(), keep_index.data<int>());
int keep_num;
const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
memory::Copy(platform::CPUPlace(), &keep_num, gpu_place,
keep_num_t.data<int>(), sizeof(int), 0);
keep_index.Resize({keep_num});
Tensor scores_filter, proposals_filter;
proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
if (nms_thresh <= 0) {
return std::make_pair(proposals_filter, scores_filter);
}
// 4. nms
Tensor keep_nms;
NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms);
if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
keep_nms.Resize({post_nms_top_n});
}
Tensor scores_nms, proposals_nms;
proposals_nms.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
scores_nms.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
return std::make_pair(proposals_nms, scores_nms);
}
} // namespace
template <typename DeviceContext, typename T>
class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *scores = context.Input<Tensor>("Scores");
auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
auto *im_info = context.Input<Tensor>("ImInfo");
auto *anchors = context.Input<Tensor>("Anchors");
auto *variances = context.Input<Tensor>("Variances");
auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
int pre_nms_top_n = context.Attr<int>("pre_nms_topN");
int post_nms_top_n = context.Attr<int>("post_nms_topN");
float nms_thresh = context.Attr<float>("nms_thresh");
float min_size = context.Attr<float>("min_size");
float eta = context.Attr<float>("eta");
PADDLE_ENFORCE_GE(eta, 1., "Not support adaptive NMS.");
auto &dev_ctx = context.template device_context<DeviceContext>();
auto scores_dim = scores->dims();
int64_t num = scores_dim[0];
int64_t c_score = scores_dim[1];
int64_t h_score = scores_dim[2];
int64_t w_score = scores_dim[3];
auto bbox_dim = bbox_deltas->dims();
int64_t c_bbox = bbox_dim[1];
int64_t h_bbox = bbox_dim[2];
int64_t w_bbox = bbox_dim[3];
Tensor bbox_deltas_swap, scores_swap;
bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
dev_ctx.GetPlace());
scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
dev_ctx.GetPlace());
math::Transpose<DeviceContext, T, 4> trans;
std::vector<int> axis = {0, 2, 3, 1};
trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
trans(dev_ctx, *scores, &scores_swap, axis);
Tensor *anchor = const_cast<framework::Tensor *>(anchors);
anchor->Resize({anchors->numel() / 4, 4});
Tensor *var = const_cast<framework::Tensor *>(variances);
var->Resize({var->numel() / 4, 4});
rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
context.GetPlace());
rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
T *rpn_rois_data = rpn_rois->data<T>();
T *rpn_roi_probs_data = rpn_roi_probs->data<T>();
auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
int64_t num_proposals = 0;
std::vector<size_t> offset(1, 0);
for (int64_t i = 0; i < num; ++i) {
Tensor im_info_slice = im_info->Slice(i, i + 1);
Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
Tensor scores_slice = scores_swap.Slice(i, i + 1);
bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
scores_slice.Resize({h_score * w_score * c_score, 1});
std::pair<Tensor, Tensor> box_score_pair =
ProposalForOneImage<T>(dev_ctx, im_info_slice, *anchor, *var,
bbox_deltas_slice, scores_slice, pre_nms_top_n,
post_nms_top_n, nms_thresh, min_size, eta);
Tensor proposals = box_score_pair.first;
Tensor scores = box_score_pair.second;
memory::Copy(place, rpn_rois_data + num_proposals * 4, place,
proposals.data<T>(), sizeof(T) * proposals.numel(), 0);
memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
scores.data<T>(), sizeof(T) * scores.numel(), 0);
num_proposals += proposals.dims()[0];
offset.emplace_back(num_proposals);
}
framework::LoD lod;
lod.emplace_back(offset);
rpn_rois->set_lod(lod);
rpn_roi_probs->set_lod(lod);
rpn_rois->Resize({num_proposals, 4});
rpn_roi_probs->Resize({num_proposals, 1});
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(generate_proposals,
ops::CUDAGenerateProposalsKernel<
paddle::platform::CUDADeviceContext, float>);
......@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
int class_num = ctx.Attr<int>("class_num");
auto label_lod = in_label->lod();
auto detect_lod = in_detect->lod();
auto& label_lod = in_label->lod();
auto& detect_lod = in_detect->lod();
PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
"Only support one level sequence now.");
PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
......@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto labels = framework::EigenTensor<T, 2>::From(input_label);
auto detect = framework::EigenTensor<T, 2>::From(input_detect);
auto label_lod = input_label.lod();
auto detect_lod = input_detect.lod();
auto& label_lod = input_label.lod();
auto& detect_lod = input_detect.lod();
int batch_size = label_lod[0].size() - 1;
auto label_index = label_lod[0];
auto& label_index = label_lod[0];
for (int n = 0; n < batch_size; ++n) {
std::map<int, std::vector<Box>> boxes;
......@@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
output_true_pos->set_lod(true_pos_lod);
output_false_pos->set_lod(false_pos_lod);
return;
}
void GetInputPos(const framework::Tensor& input_pos_count,
......@@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto SetData = [](const framework::LoDTensor& pos_tensor,
std::map<int, std::vector<std::pair<T, int>>>& pos) {
const T* pos_data = pos_tensor.data<T>();
auto pos_data_lod = pos_tensor.lod()[0];
auto& pos_data_lod = pos_tensor.lod()[0];
for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
T score = pos_data[j * 2];
......@@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
int batch_size = gt_boxes.size();
for (int n = 0; n < batch_size; ++n) {
auto image_gt_boxes = gt_boxes[n];
for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) {
auto& image_gt_boxes = gt_boxes[n];
for (auto& image_gt_box : image_gt_boxes) {
size_t count = 0;
auto labeled_bboxes = it->second;
auto& labeled_bboxes = image_gt_box.second;
if (evaluate_difficult) {
count = labeled_bboxes.size();
} else {
for (size_t i = 0; i < labeled_bboxes.size(); ++i)
if (!(labeled_bboxes[i].is_difficult)) ++count;
for (auto& box : labeled_bboxes) {
if (!box.is_difficult) {
++count;
}
}
}
if (count == 0) {
continue;
}
int label = it->first;
int label = image_gt_box.first;
if (label_pos_count->find(label) == label_pos_count->end()) {
(*label_pos_count)[label] = count;
} else {
......
......@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto in_rows = in.rows();
auto &in_rows = in.rows();
auto out_dim = framework::make_ddim(
std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());
......
......@@ -76,12 +76,18 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1,
"The first dimension of Input(Bias) should be 1.");
PADDLE_ENFORCE_EQ(
b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size,
if (ctx->Attrs().Get<bool>("use_peepholes")) {
PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
"The second dimension of Input(Bias) should be "
"7 * %d if enable peepholes connection",
frame_size);
ctx->SetOutputDim("CheckedCell", {2, frame_size});
} else {
PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
"The second dimension of Input(Bias) should be "
"7 * %d if enable peepholes connection or"
"4 * %d if disable peepholes",
frame_size, frame_size);
frame_size);
}
framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims);
......@@ -173,6 +179,8 @@ void FusionLSTMOpMaker::Make() {
AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
AddOutput("CheckedCell", "(Tensor) (2 x D) only for peephole.")
.AsIntermediate();
AddAttr<bool>("use_peepholes",
"(bool, defalut: True) "
"whether to enable diagonal/peephole connections.")
......@@ -257,12 +265,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
/* diagonal weight*/ \
const T* wc_data = bias->data<T>() + D4; \
/* for peephole only*/ \
Tensor checked_cell; \
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \
checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
}
/// Compute LSTM
......
......@@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
// TODO(yuyang18): Strange code here.
memory::Copy(platform::CPUPlace(),
new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
ids_data, ids_num * sizeof(int64_t), stream);
memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
......
......@@ -60,11 +60,9 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(out_place));
memory::Copy(
boost::get<platform::CUDAPlace>(out_place), out_data,
memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
boost::get<platform::CUDAPlace>(in1_place), in1_data,
in1_value.numel() * sizeof(T),
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
in1_value.numel() * sizeof(T), context.stream());
auto* in2_data = in2_value.data<T>();
memory::Copy(boost::get<platform::CUDAPlace>(out_place),
......@@ -148,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
framework::Vector<int64_t> in1_rows(input1.rows());
auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows());
auto& in1_value = input1.value();
......
......@@ -45,12 +45,10 @@ class ReadInferVarType : public framework::VarTypeInference {
framework::VarDesc* reader = block->FindVarRecursive(reader_name);
auto dtypes = reader->GetDataTypes();
PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
auto lod_levels = reader->GetLoDLevels();
for (size_t i = 0; i < dtypes.size(); ++i) {
framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
out.SetType(framework::proto::VarType::LOD_TENSOR);
out.SetDataType(dtypes[i]);
out.SetLoDLevel(lod_levels[i]);
}
}
};
......
......@@ -53,15 +53,16 @@ class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker {
SamplingId Operator.
A layer for sampling id from multinomial distribution from the
input. Sampling one id for one sample.)DOC");
AddAttr<float>("min", "Minimum value of random. [default 0.0].")
AddAttr<float>("min", "Minimum value of random. (float, default 0.0).")
.SetDefault(0.0f);
AddAttr<float>("max", "Maximun value of random. [default 1.0].")
AddAttr<float>("max", "Maximun value of random. (float, default 1.0).")
.SetDefault(1.0f);
AddAttr<int>("seed",
AddAttr<int>(
"seed",
"Random seed used for the random number engine. "
"0 means use a seed generated by the system."
"Note that if seed is not 0, this operator will always "
"generate the same random numbers every time. [default 0].")
"generate the same random numbers every time. (int, default 0).")
.SetDefault(0);
}
};
......
......@@ -77,9 +77,11 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference {
auto out_var_name = op_desc.Output("Out").front();
auto *out_var = block->FindVarRecursive(out_var_name);
if (in_var_name != out_var_name) {
out_var->SetType(in_var.GetType());
out_var->SetDataType(in_var.GetDataType());
}
}
};
class ScaleGradMaker : public framework::SingleGradOpDescMaker {
......
......@@ -88,7 +88,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
auto& in_value = grad->value();
framework::Vector<int64_t> in_rows(grad->rows());
auto& in_rows = grad->rows();
int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
......
......@@ -52,26 +52,16 @@ class ShrinkRNNMemoryOp : public ArrayOp {
size_t height = dst_num_rows;
// do shrink for the top level LoD
if (x_tensor.lod().size() > 0 &&
x_tensor.lod()[0].size() > static_cast<size_t>(dst_num_rows)) {
if (x_tensor.lod().size() > 1) { // MultiLevel LoD
auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(
x_tensor.lod(), 0, dst_num_rows, 0);
auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0,
dst_num_rows, 0);
height = lod_offset.second.second;
auto out_lod = out_tensor.mutable_lod();
framework::AppendLoD(out_lod, lod_offset.first);
} else {
// Shrink LoD
auto lod_item = x_tensor.lod()[0];
lod_item.resize(dst_num_rows + 1);
out_tensor.set_lod({lod_item});
const auto &const_lod_item = lod_item;
height = const_lod_item.back();
}
}
if (height != 0) {
if (dst_num_rows != 0) {
out_tensor.mutable_data(place, x_tensor.type());
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx,
......@@ -144,11 +134,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
} else {
auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
auto height = dout_tensor.dims()[0];
if (height != 0) {
auto slice = dx_tensor.Slice(0, static_cast<int>(height));
framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx,
&slice);
}
framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
if (dx_tensor.dims()[0] > height) {
auto rest_tensor = dx_tensor.Slice(
static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));
......
......@@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto in_vars = context.MultiInputVar("X");
int N = in_vars.size();
size_t in_num = in_vars.size();
auto out_var = context.OutputVar("Out");
bool in_place = out_var == in_vars[0];
......@@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel<T> {
auto &place =
*context.template device_context<DeviceContext>().eigen_device();
// If in_place, just skip the first tensor
for (int i = in_place ? 1 : 0; i < N; i++) {
for (size_t i = in_place ? 1 : 0; i < in_num; i++) {
if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (in_t.numel() == 0) {
......@@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel<T> {
// Runtime InferShape
size_t first_dim = 0;
for (int i = 0; i < N; i++) {
for (size_t i = 0; i < in_num; i++) {
auto &sel_row = get_selected_row(i);
first_dim += sel_row.rows().size();
}
std::vector<int64_t> in_dim;
for (int i = 0; i < N; i++) {
for (size_t i = 0; i < in_num; i++) {
auto &sel_row = get_selected_row(i);
if (sel_row.rows().size() > 0) {
in_dim = framework::vectorize(sel_row.value().dims());
......@@ -116,14 +116,14 @@ class SumKernel : public framework::OpKernel<T> {
}
if (in_dim.empty()) {
VLOG(3) << "WARNING: all the inputs are empty";
in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
in_dim =
framework::vectorize(get_selected_row(in_num - 1).value().dims());
} else {
in_dim[0] = static_cast<int64_t>(first_dim);
}
out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(context.GetPlace());
// if all the input sparse vars are empty, no need to
// merge these vars.
if (first_dim == 0UL) {
......@@ -133,7 +133,7 @@ class SumKernel : public framework::OpKernel<T> {
math::SelectedRowsAddTo<DeviceContext, T> functor;
int64_t offset = 0;
for (int i = 0; i < N; i++) {
for (size_t i = 0; i < in_num; i++) {
auto &sel_row = get_selected_row(i);
if (sel_row.rows().size() == 0) {
continue;
......
......@@ -22,8 +22,6 @@
namespace paddle {
DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size");
DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size");
namespace operators {
......@@ -34,6 +32,8 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph.");
AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
AddAttr<int>("max_batch_size", "the maximum batch size.");
AddAttr<int>("workspace_size", "the workspace size.");
AddComment("TensorRT engine operator.");
}
};
......
......@@ -28,8 +28,6 @@
namespace paddle {
DECLARE_int32(tensorrt_engine_batch_size);
DECLARE_int32(tensorrt_max_batch_size);
DECLARE_int32(tensorrt_workspace_size);
namespace operators {
......@@ -92,14 +90,14 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto engine_name = context.Attr<std::string>("engine_uniq_key");
int max_batch_size = context.Attr<int>("max_batch_size");
if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
Prepare(context);
}
auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
auto input_names = context.op().Inputs("Xs");
PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
FLAGS_tensorrt_max_batch_size);
PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);
std::vector<std::string> output_maps =
context.Attr<std::vector<std::string>>("output_name_mapping");
......@@ -173,8 +171,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// Get the ProgramDesc and pass to convert.
framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
int max_batch = FLAGS_tensorrt_max_batch_size;
auto max_workspace = FLAGS_tensorrt_workspace_size;
int max_batch_size = context.Attr<int>("max_batch_size");
int workspace_size = context.Attr<int>("workspace_size");
auto params = context.Attr<std::vector<std::string>>("parameters");
std::unordered_set<std::string> parameters;
for (const auto& param : params) {
......@@ -186,7 +185,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// TODO(Superjomn) replace this with a different stream
auto* engine = Singleton<TRT_EngineManager>::Global().Create(
max_batch, max_workspace, nullptr /*engine hold its own stream*/,
max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
context.Attr<std::string>("engine_uniq_key"),
boost::get<platform::CUDAPlace>(context.GetPlace()).device);
......
......@@ -58,8 +58,6 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
using inference::analysis::SetAttr;
TEST(TensorRTEngineOp, manual) {
FLAGS_tensorrt_engine_batch_size = 2;
FLAGS_tensorrt_max_batch_size = 2;
framework::ProgramDesc program;
auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0);
......@@ -101,6 +99,8 @@ TEST(TensorRTEngineOp, manual) {
engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
block_->SerializeAsString());
SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", 2);
SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
std::vector<std::string>({}));
......@@ -129,8 +129,6 @@ TEST(TensorRTEngineOp, manual) {
}
void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
FLAGS_tensorrt_engine_batch_size = batch_size;
FLAGS_tensorrt_max_batch_size = batch_size;
framework::ProgramDesc program;
framework::Scope scope;
platform::CUDAPlace place;
......@@ -195,8 +193,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
block_->SerializeAsString());
SetAttr<int>(engine_op_desc.Proto(), "max_batch", batch_size);
SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 2 << 10);
SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", batch_size);
SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
SetAttr<std::vector<std::string>>(
engine_op_desc.Proto(), "parameters",
std::vector<std::string>({"y0", "y1", "y2", "y3"}));
......
......@@ -201,7 +201,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
compute_capability = GetCUDAComputeCapability(place_.device);
multi_process = GetCUDAMultiProcessors(place_.device);
max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
grid_max_dims_ = GpuMaxGridDim(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place);
......@@ -240,10 +239,6 @@ int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process * max_threads_per_mp;
}
std::tuple<int, int, int> CUDADeviceContext::GetMaxGridDims() const {
return grid_max_dims_;
}
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return eigen_device_.get();
}
......
......@@ -13,7 +13,6 @@ limitations under the License. */
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>
......@@ -92,8 +91,6 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
std::tuple<int, int, int> GetMaxGridDims() const;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
......@@ -138,8 +135,6 @@ class CUDADeviceContext : public DeviceContext {
cudaStream_t stream_;
cublasHandle_t cublas_handle_;
std::tuple<int, int, int> grid_max_dims_;
int compute_capability;
int multi_process;
int max_threads_per_mp;
......
......@@ -48,54 +48,35 @@ __global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
}
template <typename Function>
__global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
__global__ static void ForRangeElemwiseOp(Function func, int limit) {
size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
if (idx < limit) {
func(idx);
}
}
template <typename Function>
__global__ static void ForRangeElemwiseOpGridLarge(Function func, size_t limit,
int grid_dim) {
size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
while (idx < limit) {
func(idx);
idx += grid_dim;
}
}
template <>
struct ForRange<CUDADeviceContext> {
ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(limit) {}
: dev_ctx_(dev_ctx), limit_(static_cast<int>(limit)) {}
template <typename Function>
inline void operator()(Function func) const {
constexpr int num_threads = 1024;
int block_size = limit_ <= num_threads ? limit_ : num_threads;
size_t grid_size = (limit_ + num_threads - 1) / num_threads;
int max_grid_dim = std::get<0>(dev_ctx_.GetMaxGridDims());
int grid_size = (limit_ + num_threads - 1) / num_threads;
if (grid_size < max_grid_dim) {
int grid_size_int = static_cast<int>(grid_size);
if (grid_size == 1) {
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
func);
} else {
ForRangeElemwiseOp<<<grid_size_int, block_size, 0, dev_ctx_.stream()>>>(
ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
func, limit_);
}
} else {
ForRangeElemwiseOpGridLarge<<<max_grid_dim, block_size, 0,
dev_ctx_.stream()>>>(func, limit_,
max_grid_dim);
}
}
const CUDADeviceContext& dev_ctx_;
size_t limit_;
int limit_;
};
#endif
......
......@@ -152,22 +152,5 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream),
"cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync");
}
std::tuple<int, int, int> GpuMaxGridDim(int id) {
std::tuple<int, int, int> result;
PADDLE_ENFORCE(
cudaDeviceGetAttribute(&std::get<0>(result), cudaDevAttrMaxBlockDimX, id),
"cudaDeviceGetAttribute failed in "
"cudaDevAttrMaxBlockDim");
PADDLE_ENFORCE(
cudaDeviceGetAttribute(&std::get<1>(result), cudaDevAttrMaxBlockDimY, id),
"cudaDeviceGetAttribute failed in "
"cudaDevAttrMaxBlockDim");
PADDLE_ENFORCE(
cudaDeviceGetAttribute(&std::get<2>(result), cudaDevAttrMaxBlockDimZ, id),
"cudaDeviceGetAttribute failed in "
"cudaDevAttrMaxBlockDim");
return result;
}
} // namespace platform
} // namespace paddle
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <cuda_runtime.h>
#include <stddef.h>
#include <string>
#include <tuple>
namespace paddle {
namespace platform {
......@@ -73,8 +72,6 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
//! Set memory dst with value count size asynchronously
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
std::tuple<int, int, int> GpuMaxGridDim(int id);
} // namespace platform
} // namespace paddle
......
set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method)
set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc)
if(NOT WIN32)
list(APPEND PYBIND_DEPS parallel_executor profiler)
......
......@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
......@@ -595,6 +596,29 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_profiler_enabled", platform::IsProfileEnabled);
m.def("reset_profiler", platform::ResetProfiler);
py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
pass.def(py::init())
.def("set_str", [](ir::Pass &self, const std::string &name,
const std::string &attr) {
self.Set<std::string>(name, new std::string(attr));
});
py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
m, "PassBuilder");
pb.def(py::init())
.def("append_pass",
[](ir::PassBuilder &self,
const std::string &pass_type) -> std::shared_ptr<ir::Pass> {
return self.AppendPass(pass_type);
})
.def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); })
.def("insert_pass",
[](ir::PassBuilder &self, size_t idx, const std::string &pass_type) {
return self.InsertPass(idx, pass_type);
})
.def("remove_pass",
[](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
// -- python binds for parallel executor.
py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy");
......@@ -677,6 +701,10 @@ All parameter, weight, gradient are variables in Paddle.
},
[](BuildStrategy &self, bool b) {
self.fuse_elewise_add_act_ops_ = b;
})
.def("_create_passes_from_strategy",
[](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
return self.CreatePassesFromStrategy();
});
pe.def(py::init<const std::vector<platform::Place> &,
......
......@@ -70,8 +70,8 @@ function cmake_gen() {
PYTHON_FLAGS=""
SYSTEM=`uname -s`
if [ "$SYSTEM" == "Darwin" ]; then
echo "Using python abi: $1"
if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then
echo "using python abi: $1"
if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
......@@ -82,7 +82,18 @@ function cmake_gen() {
else
exit 1
fi
# TODO: qiyang add python3 part here
elif [ "$1" == "cp35-cp35m" ]; then
if [ -d "/Library/Frameworks/Python.framework/Versions/3.5" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
export PATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
else
exit 1
fi
fi
else
if [ "$1" != "" ]; then
......@@ -629,10 +640,10 @@ EOF
function gen_capi_package() {
if [[ ${WITH_C_API} == "ON" ]]; then
install_prefix="${PADDLE_ROOT}/build/capi_output"
rm -rf $install_prefix
make DESTDIR="$install_prefix" install
cd $install_prefix/usr/local
capi_install_prefix=${INSTALL_PREFIX:-/paddle/build}/capi_output
rm -rf $capi_install_prefix
make DESTDIR="$capi_install_prefix" install
cd $capi_install_prefix/
ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz
fi
}
......
......@@ -77,13 +77,14 @@ def download(url, module_name, md5sum, save_name=None):
retry_limit = 3
while not (os.path.exists(filename) and md5file(filename) == md5sum):
if os.path.exists(filename):
print("file md5", md5file(filename), md5sum)
sys.stderr.write("file %s md5 %s" % (md5file(filename), md5sum))
if retry < retry_limit:
retry += 1
else:
raise RuntimeError("Cannot download {0} within retry limit {1}".
format(url, retry_limit))
print("Cache file %s not found, downloading %s" % (filename, url))
sys.stderr.write("Cache file %s not found, downloading %s" %
(filename, url))
r = requests.get(url, stream=True)
total_length = r.headers.get('content-length')
......@@ -100,10 +101,11 @@ def download(url, module_name, md5sum, save_name=None):
dl += len(data)
f.write(data)
done = int(50 * dl / total_length)
sys.stdout.write("\r[%s%s]" % ('=' * done,
sys.stderr.write("\r[%s%s]" % ('=' * done,
' ' * (50 - done)))
sys.stdout.flush()
sys.stderr.write("\n")
sys.stdout.flush()
return filename
......
......@@ -18,5 +18,10 @@ from . import decoder
from .decoder import *
from . import memory_usage_calc
from .memory_usage_calc import *
from . import op_frequence
from .op_frequence import *
__all__ = decoder.__all__ + memory_usage_calc.__all__
__all__ = []
__all__ += decoder.__all__
__all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from collections import OrderedDict
from ..framework import Program
__all__ = ['op_freq_statistic']
def op_freq_statistic(program):
"""
Statistics of Op frequency.
Args:
program(Program): The current Program.
Returns:
uni_op_freq(dict): the single op frequency.
adj_2_op_freq(dict): the two adjacent ops frequency.
Examples:
>>> import paddle.fluid as fluid
>>> uni_op_freq, adj_2_op_freq = fluid.contrib.op_freq_statistic(
>>> fluid.default_main_program())
>>> for op_type, op_num in uni_op_freq:
>>> print("%s \t %d" % (op_type, op_num))
>>> for op_type, op_num in adj_2_op_freq:
>>> print("%s \t %d" % (op_type, op_num))
"""
if not isinstance(program, Program):
raise TypeError("The input type should be Porgram."
"But you passed in %s" % (type(program)))
uni_op_freq = OrderedDict()
adj_2_op_freq = OrderedDict()
op_in_ops = OrderedDict()
parameters = [p.name for p in program.blocks[0].all_parameters()]
# get uni_op_freq
for op in program.global_block().ops:
had_recorded = False
for var_name in op.output_arg_names:
if var_name in parameters:
continue
if not had_recorded and uni_op_freq.has_key(op.type):
uni_op_freq[op.type] += 1
had_recorded = True
elif not had_recorded:
uni_op_freq[op.type] = 1
had_recorded = True
# get adj_2_op_freq
var_gen_op = {}
for op in program.global_block().ops:
for var_name in op.input_arg_names:
if var_name in parameters:
continue
if var_gen_op.has_key(var_name):
assert len(var_gen_op[var_name]) > 0
if op_in_ops.has_key(op.type):
op_in_ops[op.type].append(var_gen_op[var_name][-1])
else:
op_in_ops[op.type] = [var_gen_op[var_name][-1]]
else:
print("Var's generate op is not found,%s, %s" %
(var_name, op.type))
for var_name in op.output_arg_names:
if var_gen_op.has_key(var_name):
var_gen_op[var_name].append(op.type)
else:
var_gen_op[var_name] = [op.type]
for op, in_ops in op_in_ops.iteritems():
for in_op in in_ops:
op_op = in_op + "->" + op
if adj_2_op_freq.has_key(op_op):
adj_2_op_freq[op_op] += 1
else:
adj_2_op_freq[op_op] = 1
uni_op_freq = sorted(
uni_op_freq.items(), key=lambda item: item[1], reverse=True)
adj_2_op_freq = sorted(
adj_2_op_freq.items(), key=lambda item: item[1], reverse=True)
return uni_op_freq, adj_2_op_freq
......@@ -284,7 +284,7 @@ def detection_output(loc,
target_box=loc,
code_type='decode_center_size')
compile_shape = scores.shape
run_shape = ops.shape(scores)
run_shape = nn.shape(scores)
scores = nn.flatten(x=scores, axis=2)
scores = nn.softmax(input=scores)
scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
......@@ -697,7 +697,7 @@ def ssd_loss(location,
raise ValueError("Only support mining_type == max_negative now.")
num, num_prior, num_class = confidence.shape
conf_shape = ops.shape(confidence)
conf_shape = nn.shape(confidence)
def __reshape_to_2d(var):
return nn.flatten(x=var, axis=2)
......@@ -724,7 +724,7 @@ def ssd_loss(location,
target_label.stop_gradient = True
conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
# 3. Mining hard examples
actual_shape = ops.slice(conf_shape, axes=[0], starts=[0], ends=[2])
actual_shape = nn.slice(conf_shape, axes=[0], starts=[0], ends=[2])
actual_shape.stop_gradient = True
conf_loss = nn.reshape(
x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape)
......
......@@ -311,7 +311,6 @@ def _copy_reader_var_(block, var):
new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER)
new_var.desc.set_shapes(var.desc.shapes())
new_var.desc.set_dtypes(var.desc.dtypes())
new_var.desc.set_lod_levels(var.desc.lod_levels())
new_var.persistable = True
return new_var
......@@ -633,7 +632,6 @@ def py_reader(capacity,
})
startup_var.desc.set_dtypes(dtypes)
startup_var.desc.set_lod_levels(lod_levels)
startup_var.persistable = True
main_prog_var = _copy_reader_var_(default_main_program().current_block(),
......
......@@ -78,7 +78,12 @@ def accuracy(input, label, k=1, correct=None, total=None):
return acc_out
def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
def auc(input,
label,
curve='ROC',
num_thresholds=2**12 - 1,
topk=1,
slide_steps=1):
"""
**Area Under the Curve (AUC) Layer**
......@@ -105,6 +110,8 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
num_thresholds(int): The number of thresholds to use when discretizing
the roc curve. Default 200.
topk(int): only topk number of prediction output will be used for auc.
slide_steps: when calc batch auc, we can not only use step currently but the previous steps can be used. slide_steps=1 means use the current step, slide_steps=3 means use current step and the previous second steps, slide_steps=0 use all of the steps.
Returns:
Variable: A scalar representing the current AUC.
......@@ -120,16 +127,48 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
auc_out = helper.create_tmp_variable(dtype="float64")
batch_auc_out = helper.create_tmp_variable(dtype="float64")
# make tp, tn, fp, fn persistable, so that can accumulate all batches.
# for batch auc
batch_stat_pos = helper.create_global_variable(
persistable=True,
dtype='int64',
shape=[slide_steps, num_thresholds + 1])
batch_stat_neg = helper.create_global_variable(
persistable=True,
dtype='int64',
shape=[slide_steps, num_thresholds + 1])
# for global auc
stat_pos = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds + 1])
persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
stat_neg = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds + 1])
persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
for var in [stat_pos, stat_neg]:
for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]:
helper.set_variable_initializer(
var, Constant(
value=0.0, force_cpu=True))
# Batch AUC
helper.append_op(
type="auc",
inputs={
"Predict": [input],
"Label": [label],
"StatPos": [batch_stat_pos],
"StatNeg": [batch_stat_neg]
},
attrs={
"curve": curve,
"num_thresholds": num_thresholds,
"slide_steps": slide_steps
},
outputs={
"AUC": [batch_auc_out],
"StatPosOut": [batch_stat_pos],
"StatNegOut": [batch_stat_neg]
})
# Global AUC
helper.append_op(
type="auc",
inputs={
......@@ -138,12 +177,16 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
"StatPos": [stat_pos],
"StatNeg": [stat_neg]
},
attrs={"curve": curve,
"num_thresholds": num_thresholds},
attrs={
"curve": curve,
"num_thresholds": num_thresholds,
"slide_steps": 0
},
outputs={
"AUC": [auc_out],
"BatchAUC": [batch_auc_out],
"StatPosOut": [stat_pos],
"StatNegOut": [stat_neg]
})
return auc_out, batch_auc_out, [stat_pos, stat_neg]
return auc_out, batch_auc_out, [
batch_stat_pos, batch_stat_neg, stat_pos, stat_neg
]
......@@ -29,110 +29,29 @@ from .. import unique_name
from functools import reduce
__all__ = [
'fc',
'embedding',
'dynamic_lstm',
'dynamic_lstmp',
'dynamic_gru',
'gru_unit',
'linear_chain_crf',
'crf_decoding',
'cos_sim',
'cross_entropy',
'square_error_cost',
'chunk_eval',
'sequence_conv',
'conv2d',
'conv3d',
'sequence_pool',
'sequence_softmax',
'softmax',
'pool2d',
'pool3d',
'batch_norm',
'beam_search_decode',
'conv2d_transpose',
'conv3d_transpose',
'sequence_expand',
'sequence_expand_as',
'sequence_pad',
'lstm_unit',
'reduce_sum',
'reduce_mean',
'reduce_max',
'reduce_min',
'reduce_prod',
'sequence_first_step',
'sequence_last_step',
'dropout',
'split',
'ctc_greedy_decoder',
'edit_distance',
'l2_normalize',
'matmul',
'topk',
'warpctc',
'sequence_reshape',
'transpose',
'im2sequence',
'nce',
'hsigmoid',
'beam_search',
'row_conv',
'multiplex',
'layer_norm',
'softmax_with_cross_entropy',
'smooth_l1',
'one_hot',
'autoincreased_step_counter',
'reshape',
'squeeze',
'unsqueeze',
'lod_reset',
'lrn',
'pad',
'pad_constant_like',
'label_smooth',
'roi_pool',
'dice_loss',
'image_resize',
'image_resize_short',
'resize_bilinear',
'gather',
'scatter',
'sequence_scatter',
'random_crop',
'mean_iou',
'relu',
'log',
'crop',
'rank_loss',
'elu',
'relu6',
'pow',
'stanh',
'hard_sigmoid',
'swish',
'prelu',
'brelu',
'leaky_relu',
'soft_relu',
'flatten',
'sequence_mask',
'stack',
'pad2d',
'unstack',
'sequence_enumerate',
'expand',
'sequence_concat',
'scale',
'elementwise_add',
'elementwise_div',
'elementwise_sub',
'elementwise_mul',
'elementwise_max',
'elementwise_min',
'elementwise_pow',
'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru',
'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy',
'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d', 'conv3d',
'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'pool3d',
'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'conv3d_transpose',
'sequence_expand', 'sequence_expand_as', 'sequence_pad', 'lstm_unit',
'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod',
'sequence_first_step', 'sequence_last_step', 'dropout', 'split',
'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk',
'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce',
'hsigmoid', 'beam_search', 'row_conv', 'multiplex', 'layer_norm',
'softmax_with_cross_entropy', 'smooth_l1', 'one_hot',
'autoincreased_step_counter', 'reshape', 'squeeze', 'unsqueeze',
'lod_reset', 'lrn', 'pad', 'pad_constant_like', 'label_smooth', 'roi_pool',
'dice_loss', 'image_resize', 'image_resize_short', 'resize_bilinear',
'gather', 'scatter', 'sequence_scatter', 'random_crop', 'mean_iou', 'relu',
'log', 'crop', 'rank_loss', 'elu', 'relu6', 'pow', 'stanh', 'hard_sigmoid',
'swish', 'prelu', 'brelu', 'leaky_relu', 'soft_relu', 'flatten',
'sequence_mask', 'stack', 'pad2d', 'unstack', 'sequence_enumerate',
'expand', 'sequence_concat', 'scale', 'elementwise_add', 'elementwise_div',
'elementwise_sub', 'elementwise_mul', 'elementwise_max', 'elementwise_min',
'elementwise_pow', 'uniform_random_batch_size_like', 'gaussian_random',
'sampling_id', 'gaussian_random_batch_size_like', 'sum', 'slice', 'shape'
]
......@@ -6463,6 +6382,246 @@ def expand(x, expand_times, name=None):
return out
from paddle.fluid.framework import convert_np_dtype_to_dtype_
@templatedoc()
def uniform_random_batch_size_like(input,
shape,
dtype='float32',
input_dim_idx=0,
output_dim_idx=0,
min=-1.0,
max=1.0,
seed=0):
"""
${comment}
Args:
input (Variable): ${input_comment}
shape (tuple|list): ${shape_comment}
input_dim_idx (Int): ${input_dim_idx_comment}
output_dim_idx (Int): ${output_dim_idx_comment}
min (Float): ${min_comment}
max (Float): ${max_comment}
seed (Int): ${seed_comment}
dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('uniform_random_batch_size_like', **locals())
out = helper.create_tmp_variable(dtype)
c_dtype = convert_np_dtype_to_dtype_(dtype)
helper.append_op(
type='uniform_random_batch_size_like',
inputs={'Input': input},
outputs={'Out': out},
attrs={
'shape': shape,
'input_dim_idx': input_dim_idx,
'output_dim_idx': output_dim_idx,
'min': min,
'max': max,
'seed': seed,
'dtype': c_dtype
})
return out
@templatedoc()
def gaussian_random(shape,
mean=0.0,
std=1.0,
seed=0,
dtype='float32',
use_mkldnn=False):
"""
${comment}
Args:
shape (tuple|list): ${shape_comment}
mean (Float): ${mean_comment}
std (Float): ${std_comment}
seed (Int): ${seed_comment}
dtype(np.dtype|core.VarDesc.VarType|str): Output data type.
use_mkldnn (Bool): Only used in mkldnn kernel.
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('gaussian_random', **locals())
out = helper.create_tmp_variable(dtype)
c_dtype = convert_np_dtype_to_dtype_(dtype)
helper.append_op(
type='gaussian_random',
outputs={'Out': out},
attrs={
'shape': shape,
'mean': mean,
'std': std,
'seed': seed,
'dtype': c_dtype,
'use_mkldnn': use_mkldnn
})
return out
@templatedoc()
def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
"""
${comment}
Args:
x (Variable): ${x_comment}
min (Float): ${min_comment}
max (Float): ${max_comment}
seed (Float): ${seed_comment}
dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('sampling_id', **locals())
out = helper.create_tmp_variable(dtype)
helper.append_op(
type='sampling_id',
inputs={'X': x},
outputs={'Out': out},
attrs={'min': min,
'max': max,
'seed': seed})
return out
@templatedoc()
def gaussian_random_batch_size_like(input,
shape,
input_dim_idx=0,
output_dim_idx=0,
mean=0.0,
std=1.0,
seed=0,
dtype='float32'):
"""
${comment}
Args:
input (Variable): ${input_comment}
shape (tuple|list): ${shape_comment}
input_dim_idx (Int): ${input_dim_idx_comment}
output_dim_idx (Int): ${output_dim_idx_comment}
mean (Float): ${mean_comment}
std (Float): ${std_comment}
seed (Int): ${seed_comment}
dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('gaussian_random_batch_size_like', **locals())
out = helper.create_tmp_variable(dtype)
c_dtype = convert_np_dtype_to_dtype_(dtype)
helper.append_op(
type='gaussian_random_batch_size_like',
inputs={'Input': input},
outputs={'Out': out},
attrs={
'shape': shape,
'input_dim_idx': input_dim_idx,
'output_dim_idx': output_dim_idx,
'mean': mean,
'std': std,
'seed': seed,
'dtype': c_dtype
})
return out
@templatedoc()
def sum(x, use_mkldnn=False):
"""
${comment}
Args:
x (Variable): ${x_comment}
use_mkldnn (Bool): ${use_mkldnn_comment}
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('sum', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype('x'))
helper.append_op(
type='sum',
inputs={'X': x},
outputs={'Out': out},
attrs={'use_mkldnn': use_mkldnn})
return out
@templatedoc()
def slice(input, axes, starts, ends):
"""
${comment}
Args:
input (Variable): ${input_comment}.
axes (List): ${axes_comment}
starts (List): ${starts_comment}
ends (List): ${ends_comment}
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('slice', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype('input'))
helper.append_op(
type='slice',
inputs={'Input': input},
outputs={'Out': out},
attrs={'axes': axes,
'starts': starts,
'ends': ends})
return out
@templatedoc()
def shape(input):
"""
${comment}
Args:
input (Variable): ${input_comment}
Returns:
out (Variable): ${out_comment}
"""
helper = LayerHelper('shape', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype('input'))
helper.append_op(
type='shape', inputs={'Input': input}, outputs={'Out': out})
return out
def _elementwise_op(helper):
op_type = helper.layer_type
x = helper.kwargs.get('x', None)
......
......@@ -45,13 +45,6 @@ __all__ = [
'logical_or',
'logical_xor',
'logical_not',
'uniform_random_batch_size_like',
'gaussian_random',
'sampling_id',
'gaussian_random_batch_size_like',
'sum',
'slice',
'shape',
'maxout',
]
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import dist_ctr_reader
from test_dist_base import TestDistRunnerBase, runtime_main
IS_SPARSE = True
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
class TestDistCTR2x2(TestDistRunnerBase):
def get_model(self, batch_size=2):
dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta()
""" network definition """
dnn_data = fluid.layers.data(
name="dnn_data",
shape=[-1, 1],
dtype="int64",
lod_level=1,
append_batch_size=False)
lr_data = fluid.layers.data(
name="lr_data",
shape=[-1, 1],
dtype="int64",
lod_level=1,
append_batch_size=False)
label = fluid.layers.data(
name="click",
shape=[-1, 1],
dtype="int64",
lod_level=0,
append_batch_size=False)
# build dnn model
dnn_layer_dims = [128, 64, 32, 1]
dnn_embedding = fluid.layers.embedding(
is_distributed=False,
input=dnn_data,
size=[dnn_input_dim, dnn_layer_dims[0]],
param_attr=fluid.ParamAttr(
name="deep_embedding",
initializer=fluid.initializer.Constant(value=0.01)),
is_sparse=IS_SPARSE)
dnn_pool = fluid.layers.sequence_pool(
input=dnn_embedding, pool_type="sum")
dnn_out = dnn_pool
for i, dim in enumerate(dnn_layer_dims[1:]):
fc = fluid.layers.fc(
input=dnn_out,
size=dim,
act="relu",
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)),
name='dnn-fc-%d' % i)
dnn_out = fc
# build lr model
lr_embbding = fluid.layers.embedding(
is_distributed=False,
input=lr_data,
size=[lr_input_dim, 1],
param_attr=fluid.ParamAttr(
name="wide_embedding",
initializer=fluid.initializer.Constant(value=0.01)),
is_sparse=IS_SPARSE)
lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum")
merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
acc = fluid.layers.accuracy(input=predict, label=label)
auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict,
label=label)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
inference_program = paddle.fluid.default_main_program().clone()
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
sgd_optimizer.minimize(avg_cost)
dataset = dist_ctr_reader.Dataset()
train_reader = paddle.batch(dataset.train(), batch_size=batch_size)
test_reader = paddle.batch(dataset.test(), batch_size=batch_size)
return inference_program, avg_cost, train_reader, test_reader, None, predict
if __name__ == "__main__":
runtime_main(TestDistCTR2x2)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import paddle
import tarfile
logging.basicConfig()
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
DATA_URL = "http://paddle-ctr-data.cdn.bcebos.com/avazu_ctr_data.tgz"
DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e"
"""
avazu_ctr_data/train.txt
avazu_ctr_data/infer.txt
avazu_ctr_data/test.txt
avazu_ctr_data/data.meta.txt
"""
def read_data(file_name):
path = paddle.dataset.common.download(DATA_URL, "avazu_ctr_data", DATA_MD5)
tar = tarfile.open(path, "r:gz")
tar_info = None
for member in tar.getmembers():
if member.name.endswith(file_name):
tar_info = member
f = tar.extractfile(tar_info)
ret_lines = [_.decode('utf-8') for _ in f.readlines()]
return ret_lines
class TaskMode:
TRAIN_MODE = 0
TEST_MODE = 1
INFER_MODE = 2
def __init__(self, mode):
self.mode = mode
def is_train(self):
return self.mode == self.TRAIN_MODE
def is_test(self):
return self.mode == self.TEST_MODE
def is_infer(self):
return self.mode == self.INFER_MODE
@staticmethod
def create_train():
return TaskMode(TaskMode.TRAIN_MODE)
@staticmethod
def create_test():
return TaskMode(TaskMode.TEST_MODE)
@staticmethod
def create_infer():
return TaskMode(TaskMode.INFER_MODE)
class ModelType:
CLASSIFICATION = 0
REGRESSION = 1
def __init__(self, mode):
self.mode = mode
def is_classification(self):
return self.mode == self.CLASSIFICATION
def is_regression(self):
return self.mode == self.REGRESSION
@staticmethod
def create_classification():
return ModelType(ModelType.CLASSIFICATION)
@staticmethod
def create_regression():
return ModelType(ModelType.REGRESSION)
def load_dnn_input_record(sent):
return list(map(int, sent.split()))
def load_lr_input_record(sent):
res = []
for _ in [x.split(':') for x in sent.split()]:
res.append(int(_[0]))
return res
feeding_index = {'dnn_input': 0, 'lr_input': 1, 'click': 2}
class Dataset(object):
def train(self):
'''
Load trainset.
'''
file_name = "train.txt"
logger.info("load trainset from %s" % file_name)
mode = TaskMode.create_train()
return self._parse_creator(file_name, mode)
def test(self):
'''
Load testset.
'''
file_name = "test.txt"
logger.info("load testset from %s" % file_name)
mode = TaskMode.create_test()
return self._parse_creator(file_name, mode)
def infer(self):
'''
Load infer set.
'''
file_name = "infer.txt"
logger.info("load inferset from %s" % file_name)
mode = TaskMode.create_infer()
return self._parse_creator(file_name, mode)
def _parse_creator(self, file_name, mode):
'''
Parse dataset.
'''
def _parse():
data = read_data(file_name)
for line_id, line in enumerate(data):
fs = line.strip().split('\t')
dnn_input = load_dnn_input_record(fs[0])
lr_input = load_lr_input_record(fs[1])
if not mode.is_infer():
click = int(fs[2])
yield [dnn_input, lr_input, click]
else:
yield [dnn_input, lr_input]
return _parse
def load_data_meta():
'''
load data meta info from path, return (dnn_input_dim, lr_input_dim)
'''
lines = read_data('data.meta.txt')
err_info = "wrong meta format"
assert len(lines) == 2, err_info
assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[
1], err_info
res = map(int, [_.split(':')[1] for _ in lines])
res = list(res)
logger.info('dnn input dim: %d' % res[0])
logger.info('lr input dim: %d' % res[1])
return res
......@@ -47,7 +47,7 @@ def cnn_model(data):
pool_stride=2,
act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.3)))
value=0.01)))
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
......@@ -56,7 +56,7 @@ def cnn_model(data):
pool_stride=2,
act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.2)))
value=0.01)))
SIZE = 10
input_shape = conv_pool_2.shape
......@@ -68,7 +68,7 @@ def cnn_model(data):
size=SIZE,
act="softmax",
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
initializer=fluid.initializer.Constant(value=0.01)))
return predict
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import argparse
import time
import math
import random
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
from functools import reduce
from test_dist_base import TestDistRunnerBase, runtime_main
DTYPE = "int64"
DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000'
DATA_MD5 = '24e49366eb0611c552667989de2f57d5'
# For Net
base_lr = 0.2
emb_lr = base_lr * 3
dict_dim = 1500
emb_dim = 128
hid_dim = 128
margin = 0.1
sample_rate = 1
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
def get_acc(cos_q_nt, cos_q_pt, batch_size):
cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
cond = fluid.layers.cast(cond, dtype='float64')
cond_3 = fluid.layers.reduce_sum(cond)
acc = fluid.layers.elementwise_div(
cond_3,
fluid.layers.fill_constant(
shape=[1], value=batch_size * 1.0, dtype='float64'),
name="simnet_acc")
return acc
def get_loss(cos_q_pt, cos_q_nt):
loss_op1 = fluid.layers.elementwise_sub(
fluid.layers.fill_constant_batch_size_like(
input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'),
cos_q_pt)
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max(
fluid.layers.fill_constant_batch_size_like(
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
loss_op2)
avg_cost = fluid.layers.mean(loss_op3)
return avg_cost
def get_optimizer():
# SGD optimizer
optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
return optimizer
def train_network(batch_size, is_distributed=False, is_sparse=False):
# query
q = fluid.layers.data(
name="query_ids", shape=[1], dtype="int64", lod_level=1)
## embedding
q_emb = fluid.layers.embedding(
input=q,
is_distributed=is_distributed,
size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__",
learning_rate=emb_lr),
is_sparse=is_sparse)
## vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
## fc layer after conv
q_fc = fluid.layers.fc(
input=q_ss,
size=hid_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__q_fc__",
learning_rate=base_lr))
# label data
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
# pt
pt = fluid.layers.data(
name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
## embedding
pt_emb = fluid.layers.embedding(
input=pt,
is_distributed=is_distributed,
size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__",
learning_rate=emb_lr),
is_sparse=is_sparse)
## vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
## fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
size=hid_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__fc__",
learning_rate=base_lr),
bias_attr=fluid.ParamAttr(name="__fc_b__"))
# nt
nt = fluid.layers.data(
name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
## embedding
nt_emb = fluid.layers.embedding(
input=nt,
is_distributed=is_distributed,
size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__",
learning_rate=emb_lr),
is_sparse=is_sparse)
## vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
## fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
size=hid_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__fc__",
learning_rate=base_lr),
bias_attr=fluid.ParamAttr(name="__fc_b__"))
cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
# loss
avg_cost = get_loss(cos_q_pt, cos_q_nt)
# acc
acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
return [avg_cost, acc, cos_q_pt]
def combination(x, y):
res = [[[xi, yi] for yi in y] for xi in x]
return res[0]
def get_one_data(file_list):
for file in file_list:
contents = []
with open(file, "r") as fin:
for i in fin:
contents.append(i.strip())
for index, q in enumerate(contents):
try:
one_data = [[int(j) for j in i.split(" ")]
for i in q.split(";")[:-1]]
if one_data[1][0] + one_data[1][1] != len(one_data) - 3:
q = fin.readline()
continue
tmp = combination(one_data[3:3 + one_data[1][0]],
one_data[3 + one_data[1][0]:])
except Exception as e:
continue
for each in tmp:
yield [one_data[2], 0, each[0], each[1]]
def get_batch_reader(file_list, batch_size):
def batch_reader():
res = []
for i in get_one_data(file_list):
if random.random() <= sample_rate:
res.append(i)
if len(res) >= batch_size:
yield res
res = []
return batch_reader
def get_train_reader(batch_size):
# The training data set.
train_file = os.path.join(paddle.dataset.common.DATA_HOME, "simnet",
"train")
train_reader = get_batch_reader([train_file], batch_size)
train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"]
return train_reader, train_feed
class TestDistSimnetBow2x2(TestDistRunnerBase):
def get_model(self, batch_size=2):
# Train program
avg_cost, acc, predict = \
train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"])))
inference_program = fluid.default_main_program().clone()
# Optimization
opt = get_optimizer()
opt.minimize(avg_cost)
# Reader
train_reader, _ = get_train_reader(batch_size)
return inference_program, avg_cost, train_reader, train_reader, acc, predict
if __name__ == "__main__":
paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
runtime_main(TestDistSimnetBow2x2)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
import six
import tarfile
import string
import re
from functools import reduce
from test_dist_base import TestDistRunnerBase, runtime_main
DTYPE = "float32"
VOCAB_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/imdb.vocab'
VOCAB_MD5 = '23c86a0533c0151b6f12fa52b106dcc2'
DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/text_classification.tar.gz'
DATA_MD5 = '29ebfc94f11aea9362bbb7f5e9d86b8a'
# Load dictionary.
def load_vocab(filename):
vocab = {}
if six.PY2:
with open(filename, 'r') as f:
for idx, line in enumerate(f):
vocab[line.strip()] = idx
else:
with open(filename, 'r', encoding="utf-8") as f:
for idx, line in enumerate(f):
vocab[line.strip()] = idx
return vocab
def get_worddict(dict_path):
word_dict = load_vocab(dict_path)
word_dict["<unk>"] = len(word_dict)
dict_dim = len(word_dict)
return word_dict, dict_dim
def conv_net(input,
dict_dim,
emb_dim=128,
window_size=3,
num_filters=128,
fc0_dim=96,
class_dim=2):
emb = fluid.layers.embedding(
input=input,
size=[dict_dim, emb_dim],
is_sparse=False,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.01)))
conv_3 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=num_filters,
filter_size=window_size,
act="tanh",
pool_type="max",
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)))
fc_0 = fluid.layers.fc(
input=[conv_3],
size=fc0_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)))
prediction = fluid.layers.fc(
input=[fc_0],
size=class_dim,
act="softmax",
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)))
return prediction
def inference_network(dict_dim):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
out = conv_net(data, dict_dim)
return out
def get_reader(word_dict, batch_size):
# The training data set.
train_reader = paddle.batch(train(word_dict), batch_size=batch_size)
# The testing data set.
test_reader = paddle.batch(test(word_dict), batch_size=batch_size)
return train_reader, test_reader
def get_optimizer(learning_rate):
optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
return optimizer
class TestDistTextClassification2x2(TestDistRunnerBase):
def get_model(self, batch_size=2):
vocab = os.path.join(paddle.dataset.common.DATA_HOME,
"text_classification", "imdb.vocab")
word_dict, dict_dim = get_worddict(vocab)
# Input data
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
predict = conv_net(data, dict_dim)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=predict, label=label)
inference_program = fluid.default_main_program().clone()
# Optimization
opt = get_optimizer(learning_rate=0.001)
opt.minimize(avg_cost)
# Reader
train_reader, test_reader = get_reader(word_dict, batch_size)
return inference_program, avg_cost, train_reader, test_reader, acc, predict
def tokenize(pattern):
"""
Read files that match the given pattern. Tokenize and yield each file.
"""
with tarfile.open(
paddle.dataset.common.download(DATA_URL, 'text_classification',
DATA_MD5)) as tarf:
# Note that we should use tarfile.next(), which does
# sequential access of member files, other than
# tarfile.extractfile, which does random access and might
# destroy hard disks.
tf = tarf.next()
while tf != None:
if bool(pattern.match(tf.name)):
# newline and punctuations removal and ad-hoc tokenization.
yield tarf.extractfile(tf).read().rstrip(six.b(
"\n\r")).translate(
None, six.b(string.punctuation)).lower().split()
tf = tarf.next()
def reader_creator(pos_pattern, neg_pattern, word_idx):
UNK = word_idx['<unk>']
INS = []
def load(pattern, out, label):
for doc in tokenize(pattern):
out.append(([word_idx.get(w, UNK) for w in doc], label))
load(pos_pattern, INS, 0)
load(neg_pattern, INS, 1)
def reader():
for doc, label in INS:
yield doc, label
return reader
def train(word_idx):
"""
IMDB training set creator.
It returns a reader creator, each sample in the reader is an zero-based ID
sequence and label in [0, 1].
:param word_idx: word dictionary
:type word_idx: dict
:return: Training reader creator
:rtype: callable
"""
return reader_creator(
re.compile("train/pos/.*\.txt$"),
re.compile("train/neg/.*\.txt$"), word_idx)
def test(word_idx):
"""
IMDB test set creator.
It returns a reader creator, each sample in the reader is an zero-based ID
sequence and label in [0, 1].
:param word_idx: word dictionary
:type word_idx: dict
:return: Test reader creator
:rtype: callable
"""
return reader_creator(
re.compile("test/pos/.*\.txt$"),
re.compile("test/neg/.*\.txt$"), word_idx)
if __name__ == "__main__":
paddle.dataset.common.download(VOCAB_URL, 'text_classification', VOCAB_MD5)
paddle.dataset.common.download(DATA_URL, 'text_classification', DATA_MD5)
runtime_main(TestDistTextClassification2x2)
......@@ -1699,10 +1699,9 @@ class DistTransformer2x2(TestDistRunnerBase):
exe.run(startup_prog)
exe.run(pserver_prog)
def run_trainer(self, use_cuda, args):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
TrainTaskConfig.use_gpu = use_cuda
sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model(
def run_trainer(self, args):
TrainTaskConfig.use_gpu = args.use_cuda
sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
args.is_dist, not args.sync_mode)
if args.is_dist:
......@@ -1718,6 +1717,11 @@ class DistTransformer2x2(TestDistRunnerBase):
TrainTaskConfig.batch_size = 20
trainer_prog = fluid.default_main_program()
if args.use_cuda:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
startup_exe = fluid.Executor(place)
TrainTaskConfig.local = not args.is_dist
......
......@@ -122,4 +122,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
if __name__ == "__main__":
import os
os.environ['CPU_NUM'] = '1'
os.environ['USE_CUDA'] = "FALSE"
runtime_main(TestDistWord2vec2x2)
......@@ -345,7 +345,7 @@ class OpTest(unittest.TestCase):
actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t))
str(actual_t) + " in class " + self.__class__.__name__)
if isinstance(expect, tuple):
self.assertListEqual(actual.recursive_sequence_lengths(),
expect[1], "Output (" + out_name +
......
......@@ -36,7 +36,11 @@ class TestAucOp(OpTest):
"StatPos": stat_pos,
"StatNeg": stat_neg
}
self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
self.attrs = {
'curve': 'ROC',
'num_thresholds': num_thresholds,
"slide_steps": 1
}
python_auc = metrics.Auc(name="auc",
curve='ROC',
......@@ -45,7 +49,6 @@ class TestAucOp(OpTest):
self.outputs = {
'AUC': np.array(python_auc.eval()),
'BatchAUC': np.array(python_auc.eval()),
'StatPosOut': np.array(python_auc._stat_pos),
'StatNegOut': np.array(python_auc._stat_neg)
}
......
......@@ -20,6 +20,7 @@ import six
import sys
import collections
import math
import paddle.fluid as fluid
from op_test import OpTest
......@@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest):
self.detect = np.array(self.detect).astype('float32')
self.mAP = np.array(self.mAP).astype('float32')
if (len(self.class_pos_count) > 0):
if len(self.class_pos_count) > 0:
self.class_pos_count = np.array(self.class_pos_count).astype(
'int32')
self.true_pos = np.array(self.true_pos).astype('float32')
......@@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp):
class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
def init_test_case(self):
super(TestDetectionMAPOpMultiBatch, self).init_test_case()
self.class_pos_count = [0, 2, 1]
self.class_pos_count = [0, 2, 1, 0]
self.true_pos_lod = [[0, 3, 2]]
self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
self.false_pos_lod = [[0, 3, 2]]
......
......@@ -18,23 +18,27 @@ import time
import unittest
import os
import sys
import six
import signal
import subprocess
import six
import argparse
import paddle.fluid as fluid
RUN_STEP = 10
class TestDistRunnerBase(object):
def get_model(self, batch_size=2):
raise NotImplementedError(
"get_model should be implemented by child classes.")
def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
trainers, sync_mode):
@staticmethod
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers,
sync_mode):
# NOTE: import fluid until runtime, or else forking processes will cause error.
import paddle
import paddle.fluid as fluid
t = fluid.DistributeTranspiler()
config = fluid.DistributeTranspilerConfig()
t = fluid.DistributeTranspiler(config=config)
t.transpile(
trainer_id=trainer_id,
program=main_program,
......@@ -44,9 +48,9 @@ class TestDistRunnerBase(object):
return t
def run_pserver(self, args):
import paddle
import paddle.fluid as fluid
self.get_model(batch_size=2)
if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program())
t = self.get_transpiler(args.trainer_id,
......@@ -61,12 +65,10 @@ class TestDistRunnerBase(object):
exe.run(startup_prog)
exe.run(pserver_prog)
def run_trainer(self, use_cuda, args):
import paddle
import paddle.fluid as fluid
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
def run_trainer(self, args):
test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
self.get_model(batch_size=2)
if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program())
if args.is_dist:
......@@ -74,16 +76,23 @@ class TestDistRunnerBase(object):
fluid.default_main_program(),
args.endpoints, args.trainers,
args.sync_mode)
trainer_prog = t.get_trainer_program()
else:
trainer_prog = fluid.default_main_program()
if args.use_cuda:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
startup_exe = fluid.Executor(place)
startup_exe.run(fluid.default_startup_program())
strategy = fluid.ExecutionStrategy()
strategy.num_threads = 1
strategy.allow_op_delay = False
build_stra = fluid.BuildStrategy()
if args.use_reduce:
......@@ -92,7 +101,7 @@ class TestDistRunnerBase(object):
build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
exe = fluid.ParallelExecutor(
use_cuda,
args.use_cuda,
loss_name=avg_cost.name,
exec_strategy=strategy,
build_strategy=build_stra)
......@@ -103,27 +112,26 @@ class TestDistRunnerBase(object):
]
feeder = fluid.DataFeeder(feed_var_list, place)
reader_generator = test_reader()
data = next(reader_generator)
first_loss, = exe.run(fetch_list=[avg_cost.name],
feed=feeder.feed(data))
print(first_loss)
for i in six.moves.xrange(5):
data = next(reader_generator)
loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
reader_generator = train_reader()
def get_data():
origin_batch = next(reader_generator)
if args.is_dist and args.use_reader_alloc:
new_batch = []
for offset, item in enumerate(origin_batch):
if offset % 2 == args.trainer_id:
new_batch.append(item)
return new_batch
else:
return origin_batch
data = next(reader_generator)
last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
print(last_loss)
for _ in six.moves.xrange(RUN_STEP):
loss, = exe.run(fetch_list=[avg_cost.name],
feed=feeder.feed(get_data()))
print(loss)
def runtime_main(test_class):
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
parser = argparse.ArgumentParser(description='Run dist test.')
parser.add_argument(
'--role', type=str, required=True, choices=['pserver', 'trainer'])
......@@ -135,7 +143,10 @@ def runtime_main(test_class):
'--current_endpoint', type=str, required=False, default="")
parser.add_argument('--sync_mode', action='store_true')
parser.add_argument('--mem_opt', action='store_true')
parser.add_argument('--use_cuda', action='store_true')
parser.add_argument('--use_reduce', action='store_true')
parser.add_argument(
'--use_reader_alloc', action='store_true', required=False, default=True)
args = parser.parse_args()
......@@ -143,8 +154,7 @@ def runtime_main(test_class):
if args.role == "pserver" and args.is_dist:
model.run_pserver(args)
else:
use_cuda = True if core.is_compiled_with_cuda() else False
model.run_trainer(use_cuda, args)
model.run_trainer(args)
import paddle.compat as cpt
......@@ -163,8 +173,10 @@ class TestDistBase(unittest.TestCase):
self._find_free_port(), self._find_free_port())
self._python_interp = "python"
self._sync_mode = True
self._use_cuda = True
self._mem_opt = False
self._use_reduce = False
self._use_reader_alloc = True
self._setup_config()
def _find_free_port(self):
......@@ -172,7 +184,7 @@ class TestDistBase(unittest.TestCase):
s.bind(('', 0))
return s.getsockname()[1]
def start_pserver(self, model_file, check_error_log):
def start_pserver(self, model_file, check_error_log, required_envs):
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
ps0_cmd = ps_cmd % \
......@@ -198,9 +210,15 @@ class TestDistBase(unittest.TestCase):
ps1_pipe = open("/tmp/ps1_err.log", "wb")
ps0_proc = subprocess.Popen(
ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe)
ps0_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=ps0_pipe,
env=required_envs)
ps1_proc = subprocess.Popen(
ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe)
ps1_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=ps1_pipe,
env=required_envs)
if not check_error_log:
return ps0_proc, ps1_proc, None, None
......@@ -222,58 +240,59 @@ class TestDistBase(unittest.TestCase):
(e, retry_times))
retry_times -= 1
def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
# TODO(typhoonzero): should auto adapt GPU count on the machine.
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1",
"CPU_NUM": "1"
}
def _run_local(self, model, envs, check_error_log):
if check_error_log:
required_envs["GLOG_v"] = "7"
required_envs["GLOG_logtostderr"] = "1"
cmd = "%s %s --role trainer" % (self._python_interp, model)
# Run local to get a base line
if self._use_cuda:
cmd += " --use_cuda"
env_local = {"CUDA_VISIBLE_DEVICES": "0"}
env_local.update(required_envs)
local_cmd = "%s %s --role trainer" % (self._python_interp, model_file)
else:
env_local = {'CPU_NUM': '1'}
envs.update(env_local)
if not check_error_log:
err_log = open("/tmp/trainer.err.log", "wb")
local_proc = subprocess.Popen(
local_cmd.split(" "),
cmd.split(" "),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env_local)
stderr=err_log,
env=envs)
else:
err_log = open("/tmp/trainer.err.log", "wb")
local_proc = subprocess.Popen(
local_cmd.split(" "),
cmd.split(" "),
stdout=subprocess.PIPE,
stderr=err_log,
env=env_local)
stderr=subprocess.PIPE,
env=envs)
local_proc.wait()
out, err = local_proc.communicate()
local_ret = cpt.to_text(out)
sys.stderr.write('local_loss: %s\n' % local_ret)
sys.stderr.write('local_stderr: %s\n' % err)
local_out, local_err = local_proc.communicate()
local_ret = cpt.to_text(local_out)
if check_error_log:
err_log.close()
sys.stderr.write('local_stdout: %s\n' % local_ret)
sys.stderr.write('local_stderr: %s\n' % local_err)
local_losses = local_ret.split("\n")
return local_losses
def _run_cluster(self, model, envs, check_error_log):
# Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file,
check_error_log)
ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
check_error_log, envs)
self._wait_ps_ready(ps0.pid)
self._wait_ps_ready(ps1.pid)
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
tr0_cmd = tr_cmd % \
(self._python_interp, model_file, self._ps_endpoints,
(self._python_interp, model, self._ps_endpoints,
0, ps0_ep, self._trainers)
tr1_cmd = tr_cmd % \
(self._python_interp, model_file, self._ps_endpoints,
(self._python_interp, model, self._ps_endpoints,
1, ps1_ep, self._trainers)
if self._sync_mode:
......@@ -285,18 +304,28 @@ class TestDistBase(unittest.TestCase):
if self._use_reduce:
tr0_cmd += " --use_reduce"
tr1_cmd += " --use_reduce"
if self._use_reader_alloc:
tr0_cmd += " --use_reader_alloc"
tr1_cmd += " --use_reader_alloc"
if self._use_cuda:
tr0_cmd += " --use_cuda"
tr1_cmd += " --use_cuda"
env0 = {"CUDA_VISIBLE_DEVICES": "0"}
env1 = {"CUDA_VISIBLE_DEVICES": "1"}
env0.update(required_envs)
env1.update(required_envs)
else:
env0 = {'CPU_NUM': '1'}
env1 = {'CPU_NUM': '1'}
env0.update(envs)
env1.update(envs)
FNULL = open(os.devnull, 'w')
tr0_pipe = subprocess.PIPE
tr1_pipe = subprocess.PIPE
if check_error_log:
print("tr0_cmd:", tr0_cmd)
print("tr1_cmd:", tr1_cmd)
print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb")
......@@ -313,17 +342,11 @@ class TestDistBase(unittest.TestCase):
tr0_proc.wait()
tr1_proc.wait()
out, err = tr0_proc.communicate()
sys.stderr.write('dist_stderr: %s\n' % err)
loss_data0 = cpt.to_text(out)
sys.stderr.write('dist_loss: %s\n' % loss_data0)
lines = loss_data0.split("\n")
dist_first_loss = eval(lines[0].replace(" ", ","))[0]
dist_last_loss = eval(lines[1].replace(" ", ","))[0]
local_lines = local_ret.split("\n")
local_first_loss = eval(local_lines[0])[0]
local_last_loss = eval(local_lines[1])[0]
tr0_out, tr0_err = tr0_proc.communicate()
tr0_loss_text = cpt.to_text(tr0_out)
tr1_out, tr1_err = tr1_proc.communicate()
tr1_loss_text = cpt.to_text(tr1_out)
# close trainer file
if check_error_log:
......@@ -341,5 +364,47 @@ class TestDistBase(unittest.TestCase):
ps1.wait()
FNULL.close()
self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)
# print log
sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text)
sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err)
sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text)
sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
tr0_losses = tr0_loss_text.split("\n")
tr1_losses = tr1_loss_text.split("\n")
return tr0_losses, tr1_losses
def check_with_place(self,
model_file,
delta=1e-3,
check_error_log=False,
need_envs={}):
# TODO(typhoonzero): should auto adapt GPU count on the machine.
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1",
}
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "7"
required_envs["GLOG_logtostderr"] = "1"
local_losses\
= self._run_local(model_file, required_envs,
check_error_log)
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs,
check_error_log)
for step_id in range(RUN_STEP):
local_loss = eval(local_losses[step_id])[0]
tr0_loss = eval(tr0_losses[step_id])[0]
tr1_loss = eval(tr1_losses[step_id])[0]
dist_loss = (tr0_loss + tr1_loss) / 2
print(str(local_loss) + ":" + str(dist_loss))
self.assertAlmostEqual(local_loss, dist_loss, delta=delta)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
from test_dist_base import TestDistBase
class TestDistCTR2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_cuda = False
def test_dist_ctr(self):
self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
if __name__ == "__main__":
unittest.main()
......@@ -23,7 +23,7 @@ class TestDistMnist2x2(TestDistBase):
self._use_reduce = False
def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=1e-7)
self.check_with_place("dist_mnist.py", delta=1e-5)
class TestDistMnist2x2WithMemopt(TestDistBase):
......@@ -32,7 +32,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase):
self._mem_opt = True
def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=1e-7)
self.check_with_place("dist_mnist.py", delta=1e-5)
class TestDistMnistAsync(TestDistBase):
......
......@@ -20,9 +20,10 @@ from test_dist_base import TestDistBase
class TestDistSeResneXt2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_reader_alloc = False
def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=1e-7)
self.check_with_place("dist_se_resnext.py", delta=100)
# TODO(typhoonzero): fix this test
......@@ -38,6 +39,7 @@ class TestDistSeResneXt2x2(TestDistBase):
class TestDistSeResneXt2x2Async(TestDistBase):
def _setup_config(self):
self._sync_mode = False
self._use_reader_alloc = False
def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
from test_dist_base import TestDistBase
class TestDistSimnetBowDense2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_cuda = False
def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
self.check_with_place(
"dist_simnet_bow.py",
delta=1e-5,
check_error_log=False,
need_envs=need_envs)
class TestDistSimnetBow2x2DenseAsync(TestDistBase):
def _setup_config(self):
self._sync_mode = False
self._use_cuda = False
def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
self.check_with_place(
"dist_simnet_bow.py",
delta=100,
check_error_log=False,
need_envs=need_envs)
class TestDistSimnetBowSparse2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_cuda = False
def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
self.check_with_place(
"dist_simnet_bow.py",
delta=1e-5,
check_error_log=False,
need_envs=need_envs)
class TestDistSimnetBow2x2SparseAsync(TestDistBase):
def _setup_config(self):
self._sync_mode = False
self._use_cuda = False
def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
self.check_with_place(
"dist_simnet_bow.py",
delta=100,
check_error_log=False,
need_envs=need_envs)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
from test_dist_base import TestDistBase
class TestDistTextClassification2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_cuda = False
def test_text_classification(self):
self.check_with_place("dist_text_classification.py", delta=1e-6)
class TestDistTextClassification2x2Async(TestDistBase):
def _setup_config(self):
self._sync_mode = False
self._use_cuda = False
def test_se_resnext(self):
self.check_with_place("dist_text_classification.py", delta=100)
if __name__ == "__main__":
unittest.main()
......@@ -264,6 +264,25 @@ class TestLRDecay(TranspilerTest):
])
class TestDecayedAdagrad(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'),
bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
opt = fluid.optimizer.DecayedAdagrad(learning_rate=0.1)
opt.minimize(avg_cost)
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
trainer, _ = self.get_trainer()
class TestLRDecayConditional(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
......
......@@ -39,7 +39,7 @@ class TestDistW2V2x2Async(TestDistBase):
self._sync_mode = False
def test_dist_train(self):
self.check_with_place("dist_word2vec.py", delta=1)
self.check_with_place("dist_word2vec.py", delta=100)
if __name__ == "__main__":
......
......@@ -277,7 +277,6 @@ class TestGenerateProposalsOp(OpTest):
'eta': self.eta
}
print("lod = ", self.lod)
self.outputs = {
'RpnRois': (self.rpn_rois[0], [self.lod]),
'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod])
......@@ -295,7 +294,7 @@ class TestGenerateProposalsOp(OpTest):
self.post_nms_topN = 5000 # train 6000, test 1000
self.nms_thresh = 0.7
self.min_size = 3.0
self.eta = 0.8
self.eta = 1.
def init_test_input(self):
batch_size = 1
......
......@@ -541,7 +541,7 @@ class TestBook(unittest.TestCase):
with program_guard(program):
input = layers.data(
name="input", shape=[3, 100, 100], dtype="float32")
out = layers.shape(input, name="shape")
out = layers.shape(input)
self.assertIsNotNone(out)
print(str(program))
......@@ -758,6 +758,65 @@ class TestBook(unittest.TestCase):
out = layers.expand(x, [1, 2])
print(str(program))
def test_uniform_random_batch_size_like(self):
program = Program()
with program_guard(program):
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.uniform_random_batch_size_like(input, [-1, 11])
self.assertIsNotNone(out)
print(str(program))
def test_gaussian_random(self):
program = Program()
with program_guard(program):
out = layers.gaussian_random(shape=[20, 30])
self.assertIsNotNone(out)
print(str(program))
def test_sampling_id(self):
program = Program()
with program_guard(program):
x = layers.data(
name="X",
shape=[13, 11],
dtype='float32',
append_batch_size=False)
out = layers.sampling_id(x)
self.assertIsNotNone(out)
print(str(program))
def test_gaussian_random_batch_size_like(self):
program = Program()
with program_guard(program):
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.gaussian_random_batch_size_like(
input, shape=[-1, 11], mean=1.0, std=2.0)
self.assertIsNotNone(out)
print(str(program))
def test_sum(self):
program = Program()
with program_guard(program):
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.sum(input)
self.assertIsNotNone(out)
print(str(program))
def test_slice(self):
starts = [1, 0, 2]
ends = [3, 3, 4]
axes = [0, 1, 2]
program = Program()
with program_guard(program):
input = layers.data(
name="input", shape=[3, 4, 5, 6], dtype='float32')
out = layers.slice(input, axes=axes, starts=starts, ends=ends)
def test_softshrink(self):
program = Program()
with program_guard(program):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import unittest
import os
import sys
import math
def simple_fc_net():
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class TestPassBuilder(unittest.TestCase):
def check_network_convergence(self, use_cuda, build_strategy=None):
os.environ['CPU_NUM'] = str(4)
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = simple_fc_net()
test_program = main.clone(for_test=True)
opt = fluid.optimizer.SGD(learning_rate=0.001)
opt.minimize(loss)
batch_size = 32
image = np.random.normal(size=(batch_size, 784)).astype('float32')
label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
feed_dict = {'image': image, 'label': label}
train_exe = fluid.ParallelExecutor(
use_cuda=use_cuda,
loss_name=loss.name,
main_program=main,
build_strategy=build_strategy)
test_exe = fluid.ParallelExecutor(
use_cuda=use_cuda,
main_program=test_program,
share_vars_from=train_exe,
build_strategy=build_strategy)
for i in range(5):
test_loss, = test_exe.run([loss.name], feed=feed_dict)
train_loss, = train_exe.run([loss.name], feed=feed_dict)
avg_test_loss_val = np.array(test_loss).mean()
if math.isnan(float(avg_test_loss_val)):
sys.exit("got NaN loss, testing failed.")
avg_train_loss_val = np.array(train_loss).mean()
if math.isnan(float(avg_train_loss_val)):
sys.exit("got NaN loss, training failed.")
self.assertTrue(
np.allclose(
train_loss, test_loss, atol=1e-8),
"Train loss: " + str(train_loss) + "\n Test loss:" +
str(test_loss))
def test_parallel_testing_with_new_strategy(self):
build_strategy = fluid.BuildStrategy()
pass_builder = build_strategy._create_passes_from_strategy()
origin_len = len(pass_builder.all_passes())
viz_pass = pass_builder.append_pass("graph_viz_pass")
self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))
pass_builder.insert_pass(
len(pass_builder.all_passes()), "graph_viz_pass")
self.assertEqual(origin_len + 2, len(pass_builder.all_passes()))
pass_builder.remove_pass(len(pass_builder.all_passes()) - 1)
self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))
viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass")
self.check_network_convergence(
use_cuda=core.is_compiled_with_cuda(),
build_strategy=build_strategy)
try:
os.stat("/tmp/test_viz_pass")
except os.error:
self.assertFalse(True)
if __name__ == '__main__':
unittest.main()
......@@ -470,7 +470,10 @@ class DistributeTranspiler(object):
"""
# remove optimize ops and add a send op to main_program
# FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
lr_ops = self._get_lr_ops()
delete_ops(self.origin_program.global_block(), self.optimize_ops)
delete_ops(self.origin_program.global_block(), lr_ops)
self.origin_program.__str__()
if wait_port:
......@@ -534,7 +537,7 @@ class DistributeTranspiler(object):
})
for varname, splited_var in six.iteritems(self.param_var_mapping):
#add concat ops to merge splited parameters received from parameter servers.
# add concat ops to merge splited parameters received from parameter servers.
if len(splited_var) <= 1:
continue
# NOTE: if enable memory optimization, origin vars maybe removed.
......@@ -734,19 +737,14 @@ in a single call.")
table_opt_block = self._create_table_optimize_block(
pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
optimize_blocks.append(table_opt_block)
prefetch_var_name_to_block_id = self._create_prefetch_block(
lookup_table_var_name_to_block_id = self._create_prefetch_block(
pserver_index, pserver_program, table_opt_block)
checkpoint_block_id = self._create_checkpoint_save_block(
pserver_program, table_opt_block.idx)
pserver_program._distributed_lookup_table = self.table_name
# NOTE: if has_distributed_lookup_table is False, then prefetch_block will
# not be executed, so it's safe to use optimize_block to hold the place
if self.has_distributed_lookup_table:
assert len(prefetch_var_name_to_block_id) > 0
else:
assert len(prefetch_var_name_to_block_id) == 0
prefetch_var_name_to_block_id.extend(
lookup_table_var_name_to_block_id)
attrs = {
"optimize_blocks": optimize_blocks,
......@@ -755,11 +753,14 @@ in a single call.")
"sync_mode": self.sync_mode,
"grad_to_block_id": grad_to_block_id,
}
if len(prefetch_var_name_to_block_id) > 0:
attrs['prefetch_var_name_to_block_id'] \
= prefetch_var_name_to_block_id
if self.has_distributed_lookup_table:
attrs['checkpint_block_id'] = checkpoint_block_id
if len(prefetch_var_name_to_block_id) > 0:
attrs[
'prefetch_var_name_to_block_id'] = prefetch_var_name_to_block_id
# step5 append the listen_and_serv op
pserver_program.global_block().append_op(
type="listen_and_serv",
......@@ -1430,6 +1431,9 @@ to transpile() call.")
elif op_type == "rmsprop":
if varkey in ["Moment", "MeanSquare"]:
return param_shape
elif op_type == "decayed_adagrad":
if varkey == "Moment":
return param_shape
elif op_type == "sgd":
pass
return orig_shape
......@@ -1486,7 +1490,6 @@ to transpile() call.")
per_trainer_name = "%s.trainer_%d" % \
(merged_var_name, i)
vars2merge.append(pserver_block.vars[per_trainer_name])
optimize_block.append_op(
type="sum",
inputs={"X": vars2merge},
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册