Commit 48324c32 authored by sneaxiy

merge develop

test=develop
@@ -139,10 +139,12 @@ endfunction()

message(STATUS "CUDA detected: " ${CUDA_VERSION})
if (${CUDA_VERSION} LESS 7.0)
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
  add_definitions("-DPADDLE_CUDA_BINVER=\"60\"")
elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
  add_definitions("-DPADDLE_CUDA_BINVER=\"70\"")
elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
@@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
  # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
  # warning for now.
  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
  add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
endif()

include_directories(${CUDA_INCLUDE_DIRS})
...
@@ -89,6 +89,7 @@ if(CUDNN_FOUND)
  if(NOT CUDNN_MAJOR_VERSION)
    set(CUDNN_VERSION "???")
  else()
    add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
    math(EXPR CUDNN_VERSION
         "${CUDNN_MAJOR_VERSION} * 1000 +
          ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
...
@@ -32,4 +32,4 @@ endif()
add_dependencies(cub extern_cub)
LIST(APPEND external_project_dependencies cub)
@@ -28,4 +28,4 @@ endif()
add_dependencies(dlpack extern_dlpack)
LIST(APPEND external_project_dependencies dlpack)
@@ -110,7 +110,7 @@ function(op_library TARGET)
    # Define operators that don't need pybind here.
    foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
        "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
        "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
      if ("${TARGET}" STREQUAL "${manual_pybind_op}")
        set(pybind_flag 1)
      endif()
...
@@ -72,13 +72,13 @@ cc_test(reader_test SRCS reader_test.cc DEPS reader)
cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto)
if (WITH_GPU)
  target_link_libraries(var_type_traits dynload_cuda)
endif()
cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits)
cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
@@ -189,9 +189,9 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
    fast_threaded_ssa_graph_executor variable_helper)

if(WITH_PSLIB)
  cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib timer)
else()
  cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper timer)
endif(WITH_PSLIB)
...
@@ -304,8 +304,13 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
  // start executing ops in multiple threads
  for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
    if (debug) {
      threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer,
                                    workers[thidx].get()));
    } else {
      threads.push_back(
          std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
    }
  }
  for (auto& th : threads) {
...
@@ -50,7 +50,7 @@ void AllReduceOpHandle::RunImpl() {
  // FIXME(typhoonzero): If scope0 (the global scope) has NCCL_ID_VAR,
  // this is a distributed or inter-process call; find a better way.
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  if (NoDummyInputSize() == 1 &&
      local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
#else
...
@@ -25,7 +25,7 @@ struct ExecutionStrategy {
  size_t num_threads_{0};
  bool use_cuda_{true};
  bool allow_op_delay_{false};
  size_t num_iteration_per_drop_scope_{1};
  ExecutorType type_{kDefault};
  bool dry_run_{false};
};
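Since the default drop interval here changed from 100 to 1, local execution scopes are now cleared after every iteration. A hedged sketch of restoring the old cadence (the field comes from the struct above; the surrounding setup is illustrative only):

paddle::framework::details::ExecutionStrategy strategy;
strategy.num_iteration_per_drop_scope_ = 100;  // opt back into batched scope dropping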
...
@@ -64,20 +64,26 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
  }
  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
  ++drop_scope_counter_;

  bool stream_end = false;
  if (!fetch_tensors.empty()) {
    WaitComputationalStreams();
    stream_end = true;
  }

  if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
    if (!stream_end) {
      WaitComputationalStreams();
    }

    for (auto &scope : local_scopes_) {
      auto &local_scope =
          *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
      scope->DeleteScope(local_scope);
    }

    drop_scope_counter_ = 0;
  }
  if (eptr) {
    std::rethrow_exception(eptr);
...
@@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
  FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;

 private:
  inline void WaitComputationalStreams() {
    // Wait all computational streams
    for (auto p : places_) {
      platform::DeviceContextPool::Instance().Get(p)->Wait();
    }
  }

 private:
  size_t drop_scope_counter_{0};
...
@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/pybind/pybind.h"

namespace paddle {
namespace framework {
@@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() {
  return;
#else
  static unsigned concurrency_cap = std::thread::hardware_concurrency();
  LOG(WARNING) << "concurrency capacity " << concurrency_cap;
  int thread_id = this->thread_id_;

  if (static_cast<unsigned>(thread_id) < concurrency_cap) {
@@ -238,6 +240,55 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) {
  VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type();
}

void ExecutorThreadWorker::TrainFilesWithTimer() {
  platform::SetNumThreads(1);
  SetDevice();
  thread_reader_->Start();

  std::vector<double> op_total_time;
  std::vector<std::string> op_name;
  for (auto& op : ops_) {
    op_name.push_back(op->Type());
  }
  op_total_time.resize(ops_.size());
  for (size_t i = 0; i < op_total_time.size(); ++i) {
    op_total_time[i] = 0.0;
  }

  platform::Timer timeline;
  double total_time = 0.0;
  double read_time = 0.0;
  int cur_batch;
  int batch_cnt = 0;
  timeline.Start();
  while ((cur_batch = thread_reader_->Next()) > 0) {
    timeline.Pause();
    read_time += timeline.ElapsedSec();
    total_time += timeline.ElapsedSec();
    for (size_t i = 0; i < ops_.size(); ++i) {
      timeline.Start();
      ops_[i]->Run(*thread_scope_, place_);
      timeline.Pause();
      op_total_time[i] += timeline.ElapsedSec();
      total_time += timeline.ElapsedSec();
    }
    ++batch_cnt;
    thread_scope_->DropKids();

    if (thread_id_ == 0) {
      if (batch_cnt > 0 && batch_cnt % 1000 == 0) {
        for (size_t i = 0; i < ops_.size(); ++i) {
          fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
                  op_name[i].c_str(), op_total_time[i] / batch_cnt);
        }
        fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt);
        int fetch_var_num = fetch_var_names_.size();
        for (int i = 0; i < fetch_var_num; ++i) {
          print_fetch_var(thread_scope_, fetch_var_names_[i]);
        }
      }
    }
    timeline.Start();
  }
}

void ExecutorThreadWorker::TrainFiles() {
  platform::SetNumThreads(1);
@@ -320,10 +371,12 @@ void AsyncExecutorThreadWorker::SetPSlibPtr(
    std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {
  _pslib_ptr = pslib_ptr;
}

void AsyncExecutorThreadWorker::SetPullDenseThread(
    std::shared_ptr<DensePullThread> dpt) {
  _pull_dense_thread = dpt;
}

void AsyncExecutorThreadWorker::TrainOneNetwork() {
  PrepareParams();
...
@@ -155,6 +155,8 @@ class ExecutorThreadWorker {
  void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
  // A multi-thread training function
  virtual void TrainFiles();
  // with timer log
  virtual void TrainFilesWithTimer();
  // set fetch variable names from python interface assigned by users
  void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
#ifdef PADDLE_WITH_PSLIB
...
@@ -16,7 +16,9 @@ limitations under the License. */

#if !defined(_WIN32)
#include <pthread.h>
#else
#include <mutex>  // NOLINT
#endif  // !_WIN32

#include "paddle/fluid/platform/enforce.h"
@@ -29,17 +31,17 @@ struct RWLock {
  ~RWLock() { pthread_rwlock_destroy(&lock_); }

  inline void RDLock() {
    PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
                      "acquire read lock failed");
  }

  inline void WRLock() {
    PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
                      "acquire write lock failed");
  }

  inline void UNLock() {
    PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
  }
@@ -51,81 +53,46 @@ struct RWLock {
// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
// In windows, rw_lock seems like a hack. Use empty object and do nothing.
struct RWLock {
  // FIXME(minqiyang): use mutex here to do fake lock
  inline void RDLock() { mutex_.lock(); }

  inline void WRLock() { mutex_.lock(); }

  inline void UNLock() { mutex_.unlock(); }

 private:
  std::mutex mutex_;
};
#endif
class AutoWRLock {
 public:
  explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }

  ~AutoWRLock() { UnLock(); }

 private:
  inline void Lock() { lock_->WRLock(); }

  inline void UnLock() { lock_->UNLock(); }

 private:
  RWLock* lock_;
};

class AutoRDLock {
 public:
  explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }

  ~AutoRDLock() { UnLock(); }

 private:
  inline void Lock() { lock_->RDLock(); }

  inline void UnLock() { lock_->UNLock(); }

 private:
  RWLock* lock_;
};
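For reference, a minimal usage sketch of these RAII guards (hypothetical helper, not part of the commit): the lock is taken in the constructor and released by the destructor, even on early return or exception.

inline int ReadGuarded(RWLock* lock, const int& value) {
  AutoRDLock guard(lock);  // RDLock() is taken here
  return value;            // read while the lock is held
}                          // ~AutoRDLock() calls UNLock()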
} // namespace framework
...
@@ -47,9 +47,15 @@ DEFINE_bool(fast_eager_deletion_mode, false,
// the mutex will cause serious performance issue.
// So the mutex is disabled when `ON_INFER`.
#ifdef PADDLE_ON_INFERENCE
#define SCOPE_KIDS_READER_LOCK
#define SCOPE_KIDS_WRITER_LOCK
#define SCOPE_VARS_READER_LOCK
#define SCOPE_VARS_WRITER_LOCK
#else
#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_);
#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_);
#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_);
#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_);
#endif

namespace paddle {
@@ -67,64 +73,69 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }

Scope::~Scope() { DropKids(); }

Scope& Scope::NewScope() const {
  Scope* child = new Scope(this);
  {
    SCOPE_KIDS_WRITER_LOCK
    kids_.push_back(child);
  }
  return *child;
}

Variable* Scope::Var(const std::string& name) {
  SCOPE_VARS_WRITER_LOCK
  return VarInternal(name);
}

Variable* Scope::Var(std::string* name) {
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  if (name != nullptr) {
    *name = new_name;
  }
  SCOPE_VARS_WRITER_LOCK
  return VarInternal(new_name);
}

Variable* Scope::FindVar(const std::string& name) const {
  SCOPE_VARS_READER_LOCK
  return FindVarInternal(name);
}

Variable* Scope::FindLocalVar(const std::string& name) const {
  SCOPE_VARS_READER_LOCK
  return FindVarLocally(name);
}

const Scope* Scope::FindScope(const Variable* var) const {
  SCOPE_VARS_READER_LOCK
  return FindScopeInternal(var);
}

void Scope::DropKids() {
  SCOPE_KIDS_WRITER_LOCK
  for (Scope* s : kids_) delete s;
  kids_.clear();
}

bool Scope::HasKid(const Scope* scope) const {
  SCOPE_KIDS_READER_LOCK
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  return it != this->kids_.end();
}

std::vector<std::string> Scope::LocalVarNames() const {
  std::vector<std::string> known_vars;
  {
    SCOPE_VARS_READER_LOCK
    known_vars.reserve(this->vars_.size());
    for (auto& p : vars_) {
      known_vars.emplace_back(p.first);
    }
  }
  return known_vars;
}

void Scope::DeleteScope(Scope* scope) const {
  SCOPE_KIDS_WRITER_LOCK
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
                 this, scope);
@@ -138,8 +149,8 @@ void Scope::DeleteScope(Scope* scope) const {
}

void Scope::EraseVars(const std::vector<std::string>& var_names) {
  std::set<std::string> var_set(var_names.begin(), var_names.end());
  SCOPE_VARS_WRITER_LOCK
  for (auto it = vars_.begin(); it != vars_.end();) {
    if (var_set.find(it->first) != var_set.end()) {
      it = vars_.erase(it);
@@ -151,12 +162,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {

void Scope::Rename(const std::string& origin_name,
                   const std::string& new_name) const {
  SCOPE_VARS_WRITER_LOCK
  RenameInternal(origin_name, new_name);
}

std::string Scope::Rename(const std::string& origin_name) const {
  SCOPE_VARS_WRITER_LOCK
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  RenameInternal(origin_name, new_name);
  return new_name;
...
@@ -14,12 +14,18 @@ limitations under the License. */

#pragma once

extern "C" {
#include <xxhash.h>
}
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/macros.h"
@@ -95,7 +101,14 @@ class Scope {
  std::string Rename(const std::string& origin_name) const;

 protected:
  struct KeyHasher {
    std::size_t operator()(const std::string& key) const {
      return XXH32(key.c_str(), key.size(), 1);
    }
  };

  mutable std::unordered_map<std::string, std::unique_ptr<Variable>, KeyHasher>
      vars_;

 private:
  // Call Scope::NewScope for a sub-scope.
@@ -124,7 +137,8 @@ class Scope {
  DISABLE_COPY_AND_ASSIGN(Scope);

 private:
  mutable RWLock kids_lock_;
  mutable RWLock vars_lock_;
};

// Generate some debug string about the inheritance structure of scope, quite
...
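A hedged standalone sketch of the hashing change above: the default std::hash<std::string> is swapped for xxHash (XXH32 with seed 1, exactly as in Scope::KeyHasher), which is typically faster on the short variable names used as keys. The map name below is hypothetical.

extern "C" {
#include <xxhash.h>
}
#include <string>
#include <unordered_map>

struct KeyHasher {
  std::size_t operator()(const std::string& key) const {
    return XXH32(key.c_str(), key.size(), 1);  // seed 1, matching Scope
  }
};

// Drop-in replacement for the default hasher of an unordered_map:
std::unordered_map<std::string, int, KeyHasher> name_counts;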
@@ -19,6 +19,10 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/platform/cudnn_helper.h"

DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search);
DECLARE_int64(cudnn_exhaustive_search_times);

namespace paddle {
namespace operators {
@@ -45,6 +49,7 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
template <typename TAlgorithm>
class AlgorithmsCache {
 public:
  AlgorithmsCache() : search_times_(0) { hash_.clear(); }
  // Caches the best algorithm for a given
  // combination of tensor dimensions & compute data type.
  TAlgorithm GetAlgorithm(
@@ -54,9 +59,14 @@ class AlgorithmsCache {
      int algorithmFlags,  // can set for different data type
      std::function<TAlgorithm()> gen_func);

  TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
                          std::function<TAlgorithm()> gen_func);

 private:
  std::unordered_map<int64_t, TAlgorithm> hash_;
  std::mutex mutex_;

  int search_times_;
};

template <typename TAlgorithm>
@@ -107,5 +117,29 @@ TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
  return hash_[seed];
}

template <typename TAlgorithm>
TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
    int64_t area, int search_times, int algorithmFlags,
    std::function<TAlgorithm()> gen_func) {
  if (hash_.find(area) != hash_.end()) {
    return hash_[area];
  }
  if (search_times_ < search_times) {
    auto algo = gen_func();
    hash_[area] = algo;
    ++search_times_;
    return algo;
  }
  TAlgorithm algo;
  int64_t min = static_cast<int64_t>(INT_MAX);
  for (const auto& m : hash_) {
    if (m.first < min) {
      min = m.first;
      algo = m.second;
    }
  }
  return algo;
}
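A hedged usage sketch of the area-keyed overload above (hypothetical helper and values; the lambda stands in for the exhaustive search that the conv_fusion kernel passes in). The first `search_times` distinct areas trigger a real search; once the budget is spent, the fallback loop returns the algo cached for the smallest area.

inline cudnnConvolutionFwdAlgo_t ExampleLookup(
    AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* cache) {
  return cache->GetAlgorithm(
      /*area=*/32 * 32, /*search_times=*/1, /*algorithmFlags=*/0, []() {
        // An expensive cudnnFindConvolutionForwardAlgorithmEx search would
        // run here; a fixed algo stands in for the result:
        return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      });
}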
} // namespace operators
} // namespace paddle
@@ -28,6 +28,8 @@ namespace operators {
// x is Input,
// z is ResidualData,
// bias is Bias
// When `split_channels` is set, y will be split into multiple outputs;
// output i has split_channels[i] channels.
class Conv2DFusionOpMaker : public Conv2DOpMaker {
 protected:
  void Apply() override {
@@ -36,8 +38,65 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker {
        "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
        "'relux' , 'tanh', 'band_pass'")
        .SetDefault("relu");
    AddAttr<std::vector<int>>(
        "split_channels",
        "When `split_channels` is set, there will be multiple outputs; the "
        "number of outputs equals the length of `split_channels`.")
        .SetDefault({});
    AddOutput("Outputs",
              "These Outputs are used when `split_channels` is set. "
              "Usually used to fuse convs with the same input and the same "
              "filter size, padding, stride and dilation.")
        .AsDuplicable()
        .AsDispensable();
    AddInput("AlgoCache",
             "The cache of convolution algorithm, a RAW type variable.")
        .AsDispensable();
    AddAttr<int>(
        "search_times",
        "The number of exhaustive search times for convolution algorithm.")
        .SetDefault(-1);
  }
};
class Conv2DFusionOpInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Input"),
                   "Input(Input) of ConvOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Filter"),
                   "Input(Filter) of ConvOp should not be null.");
    auto in_dims = ctx->GetInputDim("Input");
    auto filter_dims = ctx->GetInputDim("Filter");

    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
    std::vector<int> dilations =
        ctx->Attrs().Get<std::vector<int>>("dilations");

    std::vector<int64_t> oshape({in_dims[0], filter_dims[0]});
    for (size_t i = 0; i < strides.size(); ++i) {
      oshape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
                                      dilations[i], paddings[i], strides[i]));
    }
    PADDLE_ENFORCE(ctx->HasOutput("Output"),
                   "Output(Output) of ConvOp should not be null.");
    ctx->SetOutputDim("Output", framework::make_ddim(oshape));
    std::vector<int> channels =
        ctx->Attrs().Get<std::vector<int>>("split_channels");
    if (channels.size()) {
      PADDLE_ENFORCE(ctx->HasOutputs("Outputs"),
                     "Output(Outputs) of ConvOp should not be null.");
      std::vector<framework::DDim> oshapes;
      oshapes.reserve(channels.size());
      for (size_t i = 0; i < channels.size(); ++i) {
        oshapes.push_back({oshape[0], channels[i], oshape[2], oshape[3]});
      }
      ctx->SetOutputsDim("Outputs", oshapes);
    }
  }
};

// TODO(qingqing): add gradient operator for conv2d_fusion

} // namespace operators
@@ -45,4 +104,5 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(conv2d_fusion, ops::ConvOp, ops::Conv2DFusionOpMaker,
                  ops::Conv2DFusionOpInferShape, ops::ConvOpInferVarType,
                  paddle::framework::EmptyGradOpMaker);
@@ -16,8 +16,9 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"

DEFINE_int64(cudnn_exhaustive_search_times, -1,
             "Exhaustive search times for cuDNN convolution; "
             "default is -1, meaning the limit is unset.");

namespace paddle {
namespace operators {
@@ -117,41 +118,60 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
          workspace_size_limit, &algo));
      VLOG(3) << "cuDNN forward algo " << algo;
    } else {
      auto search_func = [&]() {
        int returned_algo_count;
        std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
            fwd_perf_stat;
        auto cudnn_find_func = [&](void* cudnn_workspace) {
          CUDNN_ENFORCE(
              platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
                  handle, cudnn_input_desc, input_data, cudnn_filter_desc,
                  filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
                  kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
                  fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
        };
        workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
        VLOG(3) << "Perf result: (algo: stat, time, memory)";
        for (int i = 0; i < returned_algo_count; ++i) {
          const auto& stat = fwd_perf_stat[i];
          VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time << " "
                  << stat.memory;
        }
        return fwd_perf_stat[0].algo;
      };

      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
      int search_times = ctx.Attr<int>("search_times");
      search_times = std::max(
          static_cast<int>(FLAGS_cudnn_exhaustive_search_times), search_times);
      if (search_times > 0) {
        // The searched algo will be cached for up to `search_times` distinct
        // input dimensions. For other dimensions, the algo of the closest
        // cached area is selected.
        auto var_name = ctx.Inputs("AlgoCache")[0];
        algo_cache =
            ctx.scope()
                .FindVar(var_name)
                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
        algo = algo_cache->GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0,
                                        search_func);
      } else {
        // Cache the searched algo in Var(kCUDNNFwdAlgoCache);
        // all conv ops use the same kCUDNNFwdAlgoCache variable.
        if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
          algo_cache =
              ctx.scope()
                  .FindVar(kCUDNNFwdAlgoCache)
                  ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
        } else {
          // TODO(qingqing) remove const_cast
          algo_cache =
              const_cast<framework::Scope*>(ctx.scope().parent())
                  ->Var(kCUDNNFwdAlgoCache)
                  ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
        }
        algo = algo_cache->GetAlgorithm(x_dims, f_dims, strides, paddings,
                                        dilations, 0, search_func);
      }
      VLOG(3) << "choose algo " << algo;
    }
@@ -195,6 +215,27 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
      };
      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
    }

    std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels");
    if (channels.size()) {
      auto outs = ctx.MultiOutput<framework::Tensor>("Outputs");
      if (x_dims[0] == 1) {
        // share data with Output
        framework::Tensor t;
        t.ShareDataWith(*output);
        auto y_dims = output->dims();
        t.Resize({y_dims[1], y_dims[2], y_dims[3]});
        int s = 0;
        for (size_t i = 0; i < channels.size(); ++i) {
          int e = s + channels[i];
          outs[i]->ShareDataWith(t.Slice(s, e));
          outs[i]->Resize({x_dims[0], channels[i], y_dims[2], y_dims[3]});
          s = e;
        }
      } else {
        // TODO(qingiqng): do copy when batch size is larger than 1
        PADDLE_THROW("Batch size greater than 1 is unsupported");
      }
    }
  }
};
#endif
...
@@ -52,12 +52,12 @@ std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) {
  framework::Scope* scope = new framework::Scope();
  framework::Variable* var = scope->Var("var1");
  auto* slr = var->GetMutable<framework::SelectedRows>();
  slr->set_height(20000);

  auto* tensor = slr->mutable_value();
  auto* rows = slr->mutable_rows();

  tensor->Resize(framework::make_ddim({20000, 1024}));
  tensor->mutable_data<float>(place);

  paddle::operators::math::set_constant(ctx, tensor, 32.7);
@@ -83,6 +83,7 @@ void Gather(const std::vector<distributed::RemoteVar>& vars,
}

TEST(PREFETCH, GPU) {
  setenv("FLAGS_max_body_size", "2147483647", 1);
  platform::CUDAPlace place;
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto& ctx = *pool.Get(place);
...
include(operators)
register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op)
if (WITH_GPU)
  op_library(fusion_transpose_flatten_concat_op)
  op_library(fusion_conv_inception_op)
  file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n")
  file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n")
endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle {
namespace operators {
class ConvInceptionFusionOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    // 1 x
    auto in_dims = ctx->GetInputDim("Input");
    // 4 filters
    auto w_dims = ctx->GetInputsDim("Filter");
    PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv input should be a 4-D tensor.");
    PADDLE_ENFORCE_EQ(w_dims.size(), 4, "There should be 4 filters.");
    PADDLE_ENFORCE_EQ(w_dims[0][1], in_dims[1]);
    PADDLE_ENFORCE_EQ(w_dims[1][1], in_dims[1]);

    int n = in_dims[0];
    // compute output channel
    // 1st channel
    int c = w_dims[0][0];
    // add 2nd channel
    c += (w_dims[1][0] - w_dims[2][1] * 2);
    // add 3rd channel
    c += (w_dims[2][0] - w_dims[3][1]);
    // add 4th channel
    c += w_dims[3][0];

    int h = in_dims[2];
    int w = in_dims[3];
    ctx->SetOutputDim("Output", {n, c, h, w});
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
  }
};
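To make the channel bookkeeping above concrete, here is a hedged numeric check with hypothetical filter shapes (illustrative values, not from the source):

// w0: [64, 192, 1, 1]    w1: [160, 192, 1, 1]
// w2: [128, 32, 3, 3]    w3: [32, 16, 3, 3]
//
// c = 64               // w0[0]
//   + (160 - 32 * 2)   // w1[0] - w2[1] * 2  ->  96
//   + (128 - 16)       // w2[0] - w3[1]      -> 112
//   + 32               // w3[0]
//   = 304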
class ConvInceptionFusionOpMaker : public framework::OpProtoAndCheckerMaker {
 protected:
  void Make() override {
    AddInput("Input", "(Tensor) NCHW layout.");
    AddInput("Filter", "(vector<Tensor>) 4 aggregated filters").AsDuplicable();
    AddInput("Bias", "(vector<Tensor>) its length is equal to Filter's")
        .AsDuplicable();
    AddOutput("Output",
              "(Tensor) The output tensor of convolution operator. "
              "The format of output tensor is also NCHW.");
    AddOutput("TempOutput", "").AsDuplicable();
    AddAttr<std::string>(
        "pooling_type",
        "(string), pooling type, can be \"max\" for max-pooling "
        "and \"avg\" for average-pooling.")
        .InEnum({"max", "avg"});
    AddAttr<bool>(
        "exclusive",
        "(bool, default True) When true, will exclude the zero-padding in the "
        "averaging calculation; otherwise, include the zero-padding. Note, it "
        "is only used when pooling_type is avg. The default is True.")
        .SetDefault(true);
    AddAttr<std::string>(
        "activation",
        "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
        "'relux' , 'tanh', 'band_pass'")
        .SetDefault("relu");
    AddAttr<int>("workspace_size_MB",
                 "Only used in cudnn kernel. Need to set use_cudnn to true. "
                 "workspace size for cudnn, in MB, "
                 "workspace is a section of GPU memory which will be "
                 "allocated/freed each time the operator runs, larger "
                 "workspace size can increase performance but also requires "
                 "better hardware. This size should be chosen carefully.")
        .SetDefault(4096);
    AddComment(R"DOC(
)DOC");
  }
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(conv2d_inception_fusion, ops::ConvInceptionFusionOp,
                  ops::ConvInceptionFusionOpMaker,
                  paddle::framework::EmptyGradOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64(conv_workspace_size_limit);
namespace paddle {
namespace operators {
#if CUDNN_VERSION >= 7001
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using ScopedActivationDescriptor = platform::ScopedActivationDescriptor;
using DataLayout = platform::DataLayout;
using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
using PoolingMode = platform::PoolingMode;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto* input = ctx.Input<Tensor>("Input");
    auto filters = ctx.MultiInput<framework::Tensor>("Filter");
    auto bias = ctx.MultiInput<framework::Tensor>("Bias");

    auto* output = ctx.Output<Tensor>("Output");
    auto temp_outs = ctx.MultiOutput<framework::Tensor>("TempOutput");

    const std::string pool_type = ctx.Attr<std::string>("pooling_type");
    const std::string activation = ctx.Attr<std::string>("activation");
    const bool exclusive = ctx.Attr<bool>("exclusive");

    int64_t user_workspace_size =
        static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));

    const T* input_data = input->data<T>();
    T* output_data = output->mutable_data<T>(ctx.GetPlace());
    T* temp_data = temp_outs[0]->mutable_data<T>(input->dims(), ctx.GetPlace());

    DataLayout layout = DataLayout::kNCHW;
    std::vector<int> in_dim = framework::vectorize2int(input->dims());

    // ------------------- cudnn descriptors ---------------------
    PoolingMode pooling_mode;
    if (pool_type == "max") {
      pooling_mode = PoolingMode::kMaximum;
    } else {
      pooling_mode = exclusive ? PoolingMode::kAverageExclusive
                               : (PoolingMode::kAverageInclusive);
    }
    std::vector<int> k0x0 = {0, 0};
    std::vector<int> k1x1 = {1, 1};
    std::vector<int> k1x1_2 = {1, 1};
    std::vector<int> k3x3 = {3, 3};
    ScopedPoolingDescriptor pool_desc;
    ScopedActivationDescriptor act_desc;
    ScopedTensorDescriptor out_pool_desc;
    ScopedTensorDescriptor input_desc;
    cudnnPoolingDescriptor_t cudnn_pool_desc =
        pool_desc.descriptor(pooling_mode, k3x3, k1x1, k1x1);

    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
        layout, framework::vectorize2int(input->dims()));
    cudnnTensorDescriptor_t pool_out_desc = out_pool_desc.descriptor<T>(
        layout, framework::vectorize2int(input->dims()));

    cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
    cudnnTensorDescriptor_t* out_desc = new cudnnTensorDescriptor_t[4];
    cudnnFilterDescriptor_t* filter_desc = new cudnnFilterDescriptor_t[4];
    cudnnTensorDescriptor_t* bias_desc = new cudnnTensorDescriptor_t[4];
    cudnnTensorDescriptor_t* in_desc = new cudnnTensorDescriptor_t[4];
    cudnnConvolutionDescriptor_t* conv_desc =
        new cudnnConvolutionDescriptor_t[4];
    for (int i = 0; i < 4; ++i) {
      CUDNN_ENFORCE(
          platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i]));
    }

    std::vector<std::vector<int>> filter_dims;
    std::vector<std::vector<int>> bias_dims;
    std::vector<std::vector<int>> in_dims;
    std::vector<std::vector<int>> out_dims;
    std::vector<std::vector<int>> in_strides;
    std::vector<std::vector<int>> out_strides;
    std::vector<std::vector<int>> bias_strides;

    cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
    int n = in_dim[0];
    int h = in_dim[2];
    int w = in_dim[3];
    int oc = output->dims()[1];

    cudnnDataType_t compute_type = (cudnn_dtype == CUDNN_DATA_DOUBLE)
                                       ? CUDNN_DATA_DOUBLE
                                       : CUDNN_DATA_FLOAT;

    for (int i = 0; i < 4; ++i) {
      filter_dims.push_back(framework::vectorize2int(filters[i]->dims()));
      CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
          filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data()));
      bias_dims.push_back({1, filter_dims[i][0], 1, 1});
      bias_strides.push_back({filter_dims[i][0], 1, 1, 1});
      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
          bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(),
          bias_strides[i].data()));
      in_dims.push_back({n, filter_dims[i][1], h, w});
      out_dims.push_back({n, filter_dims[i][0], h, w});
      in_strides.push_back({filter_dims[i][1] * h * w, h * w, w, 1});
      out_strides.push_back({oc * h * w, h * w, w, 1});

      if (i < 2) {
        CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
            conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(),
            CUDNN_CROSS_CORRELATION, compute_type));
      } else {
        CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
            conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(),
            CUDNN_CROSS_CORRELATION, compute_type));
      }
      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
          conv_desc[i], CUDNN_DEFAULT_MATH));
    }
    in_dims[2][1] *= 2;
    in_strides[2][0] = oc * h * w;
    out_strides[2][0] = filter_dims[2][0] * h * w;  // this out is continuous.
    in_strides[3][0] = filter_dims[2][0] * h * w;
    CUDNN_ENFORCE(
        platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2));

    cudnnConvolutionFwdAlgo_t algo[4];
    auto handle = dev_ctx.cudnn_handle();
    size_t workspace_size_in_bytes = 0;  // final workspace to allocate.
    size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
      int64_t max_user_size =
          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
                   user_workspace_size);
      workspace_size_limit = max_user_size * 1024 * 1024;
    }

    for (int i = 0; i < 4; ++i) {
      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
          in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data()));
      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
          out_desc[i], cudnn_dtype, 4, out_dims[i].data(),
          out_strides[i].data()));
      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
          handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit,
          &algo[i]));
      size_t tmp_size = 0;
      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
          handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
          algo[i], &tmp_size));
      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
    }
    cudnnActivationDescriptor_t cudnn_act_desc =
        act_desc.descriptor<T>(activation);

    int oc0 = filter_dims[0][0];
    int oc1 = filter_dims[1][0] - filter_dims[2][1] * 2;
    int oc3 = filter_dims[3][0];
    int oc2 = oc - oc0 - oc1 - oc3;

    // branch1: pool + 1x1 conv
    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
    CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
        handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
        pool_out_desc, temp_data));

    std::vector<const void*> in_datas;
    in_datas.push_back(static_cast<const void*>(temp_data));
    in_datas.push_back(static_cast<const void*>(input_data));
    in_datas.push_back(
        static_cast<const void*>(output_data + (oc0 + oc1) * h * w));
    T* temp2_data = temp_outs[1]->mutable_data<T>(
        framework::make_ddim(out_dims[2]), ctx.GetPlace());
    in_datas.push_back(static_cast<const void*>(temp2_data + oc2 * h * w));

    std::vector<void*> out_datas;
    out_datas.push_back(static_cast<void*>(output_data));
    out_datas.push_back(static_cast<void*>(output_data + oc0 * h * w));
    out_datas.push_back(static_cast<void*>(temp2_data));
    out_datas.push_back(
        static_cast<void*>(output_data + (oc0 + oc1 + oc2) * h * w));

    for (int i = 0; i < 4; ++i) {
      auto func = [&](void* cudnn_workspace) {
        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
            handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
            static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
            algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
            out_desc[i], out_datas[i], bias_desc[i],
            static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
            out_desc[i], out_datas[i]));
      };
      auto workspace_handle = dev_ctx.cudnn_workspace_handle();
      workspace_handle.RunFunc(func, workspace_size_in_bytes);
    }

    cudnnTensorDescriptor_t x_desc;
    cudnnTensorDescriptor_t y_desc;
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&x_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&y_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
        x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data()));
    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
        y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data()));
    CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
        handle, CudnnDataType<T>::kOne(), x_desc,
        static_cast<const void*>(out_datas[2]), CudnnDataType<T>::kZero(),
        y_desc, static_cast<void*>(output_data + (oc0 + oc1) * h * w)));

    for (int i = 0; i < 4; ++i) {
      CUDNN_ENFORCE(
          platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i]));
    }
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(x_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(y_desc));
  }
};
#endif

} // namespace operators
} // namespace paddle

#if CUDNN_VERSION >= 7001
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion,
                        ops::CUDNNConvInceptionFusionOpKernel<float>,
                        ops::CUDNNConvInceptionFusionOpKernel<double>);
#endif
@@ -84,6 +84,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)

cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer)

cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
...
@@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
#endif

#ifdef CUDNN_DNN_ROUTINE_EACH_R6
CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
#endif

#ifdef CUDNN_DNN_ROUTINE_EACH_R7
CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif
...
@@ -53,6 +53,12 @@ namespace platform {
namespace dynload {
static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;

#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll";
static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll";
static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll";
#endif

static inline std::string join(const std::string& part1,
                               const std::string& part2) {
  // directory separator
@@ -165,6 +171,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,

void* GetCublasDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib);
#else
  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
#endif
@@ -173,6 +181,8 @@ void* GetCublasDsoHandle() {

void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib);
#else
  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
#endif
@@ -193,6 +203,8 @@ void* GetCUPTIDsoHandle() {

void* GetCurandDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib);
#else
  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
#endif
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/timer.h"

namespace paddle {
namespace platform {

void Timer::Reset() {
  _start.tv_sec = 0;
  _start.tv_usec = 0;
  _count = 0;
  _elapsed = 0;
  _paused = true;
}

void Timer::Start() {
  Reset();
  Resume();
}

void Timer::Pause() {
  if (_paused) {
    return;
  }
  _elapsed += Tickus();
  ++_count;
  _paused = true;
}

void Timer::Resume() {
  gettimeofday(&_start, NULL);
  _paused = false;
}

int Timer::Count() { return _count; }

double Timer::ElapsedUS() { return static_cast<double>(_elapsed); }

double Timer::ElapsedMS() { return _elapsed / 1000.0; }

double Timer::ElapsedSec() { return _elapsed / 1000000.0; }

int64_t Timer::Tickus() {
  gettimeofday(&_now, NULL);
  return (_now.tv_sec - _start.tv_sec) * 1000 * 1000L +
         (_now.tv_usec - _start.tv_usec);
}

}  // namespace platform
}  // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <stdlib.h>
#include "paddle/fluid/platform/port.h"

#ifdef _WIN32
static unsigned sleep(unsigned seconds) {
  Sleep(seconds * 1000);
  return 0;
}
#endif

namespace paddle {
namespace platform {

// A standard timer implementation for profiling and debugging.
class Timer {
 public:
  // Reset() is called during construction; it zeroes all timing state.
  Timer() { Reset(); }
  void Reset();
  void Start();
  void Pause();
  // Resume() records the current system time as the new start point.
  void Resume();
  int Count();
  // Return elapsed time in us.
  double ElapsedUS();
  // Return elapsed time in ms.
  double ElapsedMS();
  // Return elapsed time in sec.
  double ElapsedSec();

 private:
  struct timeval _start;
  struct timeval _now;
  int _count;
  // Accumulated microseconds; int64_t avoids overflow on long runs.
  int64_t _elapsed;
  bool _paused;

  // Microsecond difference between _start and now.
  int64_t Tickus();
};

}  // namespace platform
}  // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/timer.h"
#include "gtest/gtest.h"
TEST(Timer, Reset) {
paddle::platform::Timer timeline;
timeline.Start();
sleep(3);
timeline.Pause();
timeline.Reset();
}
TEST(Timer, Start) {
paddle::platform::Timer timeline;
timeline.Start();
sleep(3);
timeline.Pause();
}
TEST(Timer, Pause) {
paddle::platform::Timer timeline;
timeline.Start();
sleep(3);
timeline.Pause();
}
TEST(Timer, Resume) {
paddle::platform::Timer timeline;
timeline.Start();
sleep(3);
timeline.Pause();
timeline.Resume();
}
@@ -84,11 +84,15 @@ bool IsCompiledWithCUDA() {
 }

 bool IsCompiledWithBrpc() {
-#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA)
-  return true;
-#else
+#ifndef PADDLE_WITH_DISTRIBUTE
   return false;
 #endif
+
+#ifdef PADDLE_WITH_GRPC
+  return false;
+#endif
+
+  return true;
 }

 bool IsCompiledWithDIST() {
......
@@ -28,20 +28,53 @@ int main(int argc, char** argv) {
   for (int i = 0; i < argc; ++i) {
     new_argv.push_back(argv[i]);
   }
+
+  std::vector<std::string> envs;
+  std::vector<std::string> undefok;
+#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_GRPC)
+  envs.push_back("max_body_size");
+#endif
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  new_argv.push_back(
-      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"));
+  envs.push_back("fraction_of_gpu_memory_to_use");
+  envs.push_back("allocator_strategy");
 #elif __clang__
-  new_argv.push_back(
-      strdup("--tryfromenv=use_mkldnn,initial_cpu_memory_in_"
-             "mb,allocator_strategy"));
-  new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
+  envs.push_back("use_mkldnn");
+  envs.push_back("initial_cpu_memory_in_mb");
+  envs.push_back("allocator_strategy");
+
+  undefok.push_back("use_mkldnn");
+  undefok.push_back("initial_cpu_memory_in_mb");
 #else
-  new_argv.push_back(
-      strdup("--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_"
-             "mb,allocator_strategy"));
-  new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
+  envs.push_back("use_pinned_memory");
+  envs.push_back("use_mkldnn");
+  envs.push_back("initial_cpu_memory_in_mb");
+  envs.push_back("allocator_strategy");
+
+  undefok.push_back("use_mkldnn");
+  undefok.push_back("initial_cpu_memory_in_mb");
 #endif
+
+  if (envs.size() > 0) {
+    std::string env_string = "--tryfromenv=";
+    for (auto t : envs) {
+      env_string += t + ",";
+    }
+    env_string = env_string.substr(0, env_string.length() - 1);
+    new_argv.push_back(strdup(env_string.c_str()));
+    VLOG(1) << "gtest env_string:" << env_string;
+  }
+
+  if (undefok.size() > 0) {
+    std::string undefok_string = "--undefok=";
+    for (auto t : undefok) {
+      undefok_string += t + ",";
+    }
+    undefok_string = undefok_string.substr(0, undefok_string.length() - 1);
+    new_argv.push_back(strdup(undefok_string.c_str()));
+    VLOG(1) << "gtest undefok_string:" << undefok_string;
+  }
+
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
   google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
......
@@ -151,12 +151,21 @@ def __bootstrap__():
         read_env_flags.append('rpc_get_thread_num')
         read_env_flags.append('rpc_prefetch_thread_num')
         read_env_flags.append('rpc_disable_reuse_port')
+        if core.is_compiled_with_brpc():
+            read_env_flags.append('max_body_size')
+            # set brpc max body size
+            os.environ['FLAGS_max_body_size'] = "2147483647"
+
     if core.is_compiled_with_cuda():
         read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
-            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
-            'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus'
+            'fraction_of_gpu_memory_to_use',
+            'cudnn_deterministic',
+            'enable_cublas_tensor_op_math',
+            'conv_workspace_size_limit',
+            'cudnn_exhaustive_search',
+            'memory_optimize_debug',
+            'selected_gpus',
+            'cudnn_exhaustive_search_times',
         ]
     core.init_gflags([sys.argv[0]] +
......
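The flags whitelisted in read_env_flags above are read from the process environment through gflags' --tryfromenv mechanism when paddle.fluid is first imported. A minimal sketch of how a user drives this from Python (the flag value 0.5 is only an illustration):

    import os

    # FLAGS_<name> must be in the environment before paddle.fluid is imported,
    # because __bootstrap__() passes --tryfromenv to gflags at import time.
    os.environ['FLAGS_fraction_of_gpu_memory_to_use'] = '0.5'

    import paddle.fluid as fluid  # the flag takes effect during this import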
@@ -272,8 +272,7 @@ class DataFeeder(object):
             dict: the result of conversion.

         Raises:
-            ValueError: If drop_last is False and the data batch which cannot
-            fit for devices.
+            ValueError: If drop_last is False and the data batch cannot fit evenly across the devices.
         """

         def __reader_creator__():
def __reader_creator__(): def __reader_creator__():
......
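The Raises clause above concerns DataFeeder.decorate_reader: with drop_last=False, a final batch that does not divide evenly across the devices triggers the ValueError. A usage sketch, where x, y, and batch_reader are placeholders:

    import paddle.fluid as fluid

    feeder = fluid.DataFeeder(place=fluid.CPUPlace(), feed_list=[x, y])
    # drop_last=True silently discards the remainder batch instead of raising
    reader = feeder.decorate_reader(batch_reader, multi_devices=True,
                                    drop_last=True)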
@@ -647,20 +647,16 @@ class Operator(object):
                     self.desc.set_input(in_proto.name, [])

         if outputs is not None:
-            given = set()
-            need = set()
-            for n in outputs:
-                given.add(n)
             for m in proto.outputs:
-                need.add(m.name)
-            if not given == need:
-                raise ValueError(("Incorrect setting for output(s) of "
-                                  "operator \"%s\". Need: [%s] Given: [%s]") %
-                                 (type,
-                                  ", ".join(six.binary_type(e) for e in need),
-                                  ", ".join(six.binary_type(e) for e in given)))
+                if (m.name not in outputs) and m.dispensable:
+                    continue
+                if not ((m.name in outputs) or m.dispensable):
+                    raise ValueError(("Incorrect setting for output(s) of "
+                                      "operator \"%s\", should set: [%s].") %
+                                     (type, m.name))
             for out_proto in proto.outputs:
+                if out_proto.name not in outputs:
+                    continue
                 out_args = outputs[out_proto.name]
                 if not isinstance(out_args, list):
                     out_args = [out_args]
@@ -1638,8 +1634,8 @@ class Program(object):
             parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need
             to print.

-        Returns
-            (str): The debug string.
+        Returns:
+            str: The debug string.

         Raises:
             ValueError: If any of required fields is not set and throw_on_error is
......
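The rewritten Operator check above replaces exact set equality with a rule that tolerates missing dispensable outputs. A standalone sketch of that rule, with proto_outputs and outputs as illustrative stand-ins for the real protobuf objects:

    def check_outputs(proto_outputs, outputs):
        # proto_outputs: list of (name, dispensable) pairs standing in for
        # the op proto; outputs: dict mapping output name -> variable(s).
        for name, dispensable in proto_outputs:
            if name not in outputs and dispensable:
                continue  # dispensable outputs may be omitted entirely
            if name not in outputs:
                raise ValueError("Incorrect setting for output(s) of "
                                 "operator, should set: [%s]." % name)

    check_outputs([('Out', False), ('Mask', True)], {'Out': 'out_var'})  # passes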
@@ -1452,6 +1452,7 @@ class DynamicRNN(object):
     def step_input(self, x):
         """
         Mark a sequence as a dynamic RNN input.
+
         Args:
             x(Variable): The input sequence.
@@ -1505,6 +1506,7 @@ class DynamicRNN(object):
         """
         Mark a variable as a RNN input. The input will not be scattered into
         time steps.
+
         Args:
             x(Variable): The input variable.
@@ -1629,13 +1631,11 @@ class DynamicRNN(object):
         Args:
             init(Variable|None): The initialized variable.

-            shape(list|tuple): The memory shape. NOTE the shape does not contain
-            batch_size.
+            shape(list|tuple): The memory shape. NOTE the shape does not contain batch_size.

             value(float): the initialized value.

-            need_reorder(bool): True if the initialized memory depends on the
-            input sample.
+            need_reorder(bool): True if the initialized memory depends on the input sample.

             dtype(str|numpy.dtype): The data type of the initialized memory.
@@ -1714,6 +1714,7 @@ class DynamicRNN(object):
         """
         Update the memory from ex_mem to new_mem. NOTE that the shape and data
         type of :code:`ex_mem` and :code:`new_mem` must be same.
+
         Args:
             ex_mem(Variable): the memory variable.
             new_mem(Variable): the plain variable generated in RNN block.
......
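For context, the DynamicRNN methods documented above compose as follows; a schematic sketch in which sentence is a placeholder LoDTensor variable and the sizes are arbitrary:

    import paddle.fluid as fluid

    drnn = fluid.layers.DynamicRNN()
    with drnn.block():
        word = drnn.step_input(sentence)            # one time step of the sequence
        prev = drnn.memory(shape=[200], value=0.0)  # shape excludes batch_size
        hidden = fluid.layers.fc(input=[word, prev], size=200, act='relu')
        drnn.update_memory(prev, hidden)  # ex_mem and new_mem: same shape/dtype
        drnn.output(hidden)
    last = fluid.layers.sequence_last_step(drnn())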
@@ -65,7 +65,7 @@ def rpn_target_assign(bbox_pred,
                       rpn_negative_overlap=0.3,
                       use_random=True):
     """
-    ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. **
+    **Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection.**

     Given the Intersection-over-Union (IoU) overlap
     between anchors and ground truth boxes, this layer assigns classification and
@@ -135,19 +135,20 @@ def rpn_target_assign(bbox_pred,
     Examples:
         .. code-block:: python

            bbox_pred = layers.data(name='bbox_pred', shape=[100, 4],
                append_batch_size=False, dtype='float32')
            cls_logits = layers.data(name='cls_logits', shape=[100, 1],
                append_batch_size=False, dtype='float32')
            anchor_box = layers.data(name='anchor_box', shape=[20, 4],
                append_batch_size=False, dtype='float32')
            gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
                append_batch_size=False, dtype='float32')
            loc_pred, score_pred, loc_target, score_target, bbox_inside_weight =
                fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
                                               cls_logits=cls_logits,
                                               anchor_box=anchor_box,
                                               gt_boxes=gt_boxes)
     """
     helper = LayerHelper('rpn_target_assign', **locals())
@@ -1519,27 +1520,30 @@ def anchor_generator(input,
     Args:
         input(Variable): The input feature map, the format is NCHW.
         anchor_sizes(list|tuple|float): The anchor sizes of generated anchors,
            given in absolute pixels e.g. [64., 128., 256., 512.].
            For instance, the anchor size of 64 means the area of this anchor equals to 64**2.
         aspect_ratios(list|tuple|float): The height / width ratios of generated
            anchors, e.g. [0.5, 1.0, 2.0].
         variance(list|tuple): The variances to be used in box regression deltas.
            Default:[0.1, 0.1, 0.2, 0.2].
-        stride(list|tuple): The anchors stride across width and height,
-            e.g. [16.0, 16.0]
+        stride(list|tuple): The anchors stride across width and height, e.g. [16.0, 16.0].
         offset(float): Prior boxes center offset. Default: 0.5
         name(str): Name of the prior box op. Default: None.

     Returns:
-        Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4].
-        H is the height of input, W is the width of input,
-        num_anchors is the box count of each position.
-        Each anchor is in (xmin, ymin, xmax, ymax) format, unnormalized.
-        Variances(Variable): The expanded variances of anchors
-        with a layout of [H, W, num_priors, 4].
-        H is the height of input, W is the width of input,
-        num_anchors is the box count of each position.
-        Each variance is in (xcenter, ycenter, w, h) format.
+        Anchors(Variable), Variances(Variable): two variables:
+
+        - Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. \
+          H is the height of input, W is the width of input, \
+          num_anchors is the box count of each position. \
+          Each anchor is in (xmin, ymin, xmax, ymax) format, unnormalized.
+        - Variances(Variable): The expanded variances of anchors \
+          with a layout of [H, W, num_priors, 4]. \
+          H is the height of input, W is the width of input, \
+          num_anchors is the box count of each position. \
+          Each variance is in (xcenter, ycenter, w, h) format.

     Examples:
@@ -1748,35 +1752,35 @@ def generate_proposals(scores,
                        eta=1.0,
                        name=None):
     """
-    ** Generate proposal Faster-RCNN **
+    **Generate proposal Faster-RCNN**

     This operation proposes RoIs according to each box with their probability to be a foreground object and
     the box can be calculated by anchors. Bbox_deltas and scores to be an object are the output of RPN. Final proposals
     could be used to train detection net.

     For generating proposals, this operation performs following steps:

     1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4)
     2. Calculate box locations as proposals candidates.
     3. Clip boxes to image
     4. Remove predicted boxes with small area.
     5. Apply NMS to get final proposals as output.

     Args:
         scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
             N is batch size, A is number of anchors, H and W are height and width of the feature map.
         bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the difference between predicted box location and anchor location.
         im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale
             between origin image size and the size of feature map.
         anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map,
             num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format, unnormalized.
         variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format.
         pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default.
         post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default.
         nms_thresh(float): Threshold in NMS, 0.5 by default.
         min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default.
         eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration.
     """
     helper = LayerHelper('generate_proposals', **locals())
......
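A usage sketch for anchor_generator consistent with the Args documented above, where conv1 is a placeholder NCHW feature map:

    anchor, var = fluid.layers.anchor_generator(
        input=conv1,
        anchor_sizes=[64, 128, 256, 512],
        aspect_ratios=[0.5, 1.0, 2.0],
        variance=[0.1, 0.1, 0.2, 0.2],
        stride=[16.0, 16.0],
        offset=0.5)
    # anchor: [H, W, num_anchors, 4], var: [H, W, num_anchors, 4]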
@@ -949,12 +949,11 @@ def shuffle(reader, buffer_size):
     is determined by argument buf_size.

     Args:
-        param reader: the original reader whose output will be shuffled.
-        type reader: callable
-        param buf_size: shuffle buffer size.
-        type buf_size: int
+        reader(callable): the original reader whose output will be shuffled.
+        buf_size(int): shuffle buffer size.

-    return: the new reader whose output is shuffled.
-    rtype: callable
+    Returns:
+        callable: the new reader whose output is shuffled.
     """
     return __create_unshared_decorated_reader__(
         'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
......
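A sketch of this decorator in use, assuming the open_files-style reader variables this module works with (the file path and shapes are placeholders):

    raw_reader = fluid.layers.open_files(
        filenames=['./data.recordio'],  # placeholder path
        shapes=[[-1, 784], [-1, 1]],
        lod_levels=[0, 0],
        dtypes=['float32', 'int64'])
    # up to buffer_size samples are buffered and emitted in random order
    shuffled_reader = fluid.layers.shuffle(reader=raw_reader, buffer_size=500)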
(This file's diff is collapsed.)
@@ -393,9 +393,6 @@ def fill_constant_batch_size_like(input,
     It also sets *stop_gradient* to True.

-    >>> data = fluid.layers.fill_constant_batch_size_like(
-    >>>     input=like, shape=[1], value=0, dtype='int64')

     Args:
         input(${input_type}): ${input_comment}.
@@ -411,6 +408,14 @@ def fill_constant_batch_size_like(input,
     Returns:
         ${out_comment}.

+    Examples:
+
+        .. code-block:: python
+
+            data = fluid.layers.fill_constant_batch_size_like(
+                input=like, shape=[1], value=0, dtype='int64')
+
     """
     helper = LayerHelper("fill_constant_batch_size_like", **locals())
     out = helper.create_variable_for_type_inference(dtype=dtype)
......
@@ -361,8 +361,8 @@ class ChunkEvaluator(MetricBase):
     Accumulate counter numbers output by chunk_eval from mini-batches and
     compute the precision recall and F1-score using the accumulated counter
     numbers.
+
     For some basics of chunking, please refer to
-    'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
+    `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .

     ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection,
     and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
@@ -391,6 +391,7 @@ class ChunkEvaluator(MetricBase):
     def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
         """
         Update the states based on the layers.chunk_eval() outputs.
+
         Args:
             num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch.
             num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch.
@@ -450,9 +451,9 @@ class EditDistance(MetricBase):
             distance, instance_error = distance_evaluator.eval()

     In the above example:
-    'distance' is the average of the edit distance in a pass.
-    'instance_error' is the instance error rate in a pass.
+
+    - 'distance' is the average of the edit distance in a pass.
+    - 'instance_error' is the instance error rate in a pass.

     """
@@ -567,12 +568,15 @@ class DetectionMAP(object):
     Calculate the detection mean average precision (mAP).

     The general steps are as follows:
+
     1. calculate the true positive and false positive according to the input
        of detection and labels.
     2. calculate mAP value, support two versions: '11 point' and 'integral'.
+
     Please get more information from the following articles:
+
       https://sanchom.wordpress.com/tag/average-precision/
       https://arxiv.org/abs/1512.02325

     Args:
@@ -613,10 +617,12 @@ class DetectionMAP(object):
             for data in batches:
                 loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)

     In the above example:
-    'cur_map_v' is the mAP of current mini-batch.
-    'accum_map_v' is the accumulative mAP of one pass.
+
+    - 'cur_map_v' is the mAP of current mini-batch.
+    - 'accum_map_v' is the accumulative mAP of one pass.
     """

     def __init__(self,
......
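A sketch of the accumulate-then-eval pattern these metrics share, where the fetch targets are placeholders for the three counters returned by layers.chunk_eval:

    metric = fluid.metrics.ChunkEvaluator()
    for data in train_reader():
        # the three counters come from a layers.chunk_eval fetch
        num_infer, num_label, num_correct = exe.run(fetch_list=fetch)
        metric.update(num_infer, num_label, num_correct)
    precision, recall, f1 = metric.eval()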
@@ -32,6 +32,8 @@ class TestConv2dFusionOp(OpTest):
         self.activation = 'relu'
         self.add_bias = True
         self.add_residual_data = True
+        self.channels = None
+        self.outputs = None

         self.init_group()
         self.init_dilation()
@@ -49,8 +51,8 @@ class TestConv2dFusionOp(OpTest):
         input = np.random.random(self.input_size).astype(self.dtype)
         filter = np.random.random(self.filter_size).astype(self.dtype)

-        output = conv2d_forward_naive(input, filter, self.groups,
-                                      conv2d_param).astype(self.dtype)
+        self.output = conv2d_forward_naive(input, filter, self.groups,
+                                           conv2d_param).astype(self.dtype)

         self.inputs = {
             'Input': OpTest.np_dtype_to_fluid_dtype(input),
@@ -58,19 +60,20 @@ class TestConv2dFusionOp(OpTest):
         }

         if self.add_residual_data:
-            residual_data = np.random.random(output.shape).astype(self.dtype)
+            residual_data = np.random.random(self.output.shape).astype(
+                self.dtype)
             self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
                 residual_data)
-            output += residual_data
+            self.output += residual_data

         if self.add_bias:
             bias = np.random.random(self.filter_size[0]).astype(self.dtype)
             self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
-            output = output + bias.reshape((1, bias.size, 1, 1))
+            self.output = self.output + bias.reshape((1, bias.size, 1, 1))

         assert self.activation in ['relu', 'identity']
         if self.activation == 'relu':
-            output = np.maximum(output, 0)
+            self.output = np.maximum(self.output, 0)

         self.attrs = {
             'strides': self.stride,
@@ -79,9 +82,12 @@ class TestConv2dFusionOp(OpTest):
             'dilations': self.dilations,
             'data_format': self.data_format,
             'exhaustive_search': self.exhaustive_search,
-            'activation': self.activation
+            'activation': self.activation,
+            'split_channels': self.channels
         }

-        self.outputs = {'Output': output}
+        self.outputs = {'Output': self.output}
+
+        self.set_outputs()

     def testcuda(self):
         return core.is_compiled_with_cuda()
@@ -117,6 +123,9 @@ class TestConv2dFusionOp(OpTest):
     def set_search_method(self):
         self.exhaustive_search = False

+    def set_outputs(self):
+        pass
+

 class TestWithoutResidual(TestConv2dFusionOp):
     def init_bias_residual(self):
@@ -160,5 +169,21 @@ class TestCUDNNExhaustiveSearch(TestConv2dFusionOp):
         self.exhaustive_search = True


+class TestMultipleOutputs(TestConv2dFusionOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [1, 32, 17, 17]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [126, f_c, 3, 3]
+        self.channels = [84, 42]
+
+    def set_outputs(self):
+        out1 = self.output[:, 0:84, :, :]
+        out2 = self.output[:, 84:126, :, :]
+        self.outputs['Outputs'] = [('out1', out1), ('out2', out2)]
+
+
 if __name__ == '__main__':
     unittest.main()
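The new split_channels attribute asks the fused op to slice its 126 output channels into two tensors of 84 and 42 channels; a numpy equivalent of the set_outputs override above:

    import numpy as np

    output = np.random.random((1, 126, 17, 17)).astype('float32')
    # split along the channel axis at index 84
    out1, out2 = np.split(output, [84], axis=1)
    # out1.shape == (1, 84, 17, 17); out2.shape == (1, 42, 17, 17)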
@@ -243,6 +243,10 @@ class TestBook(unittest.TestCase):
             pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True)
             self.assertIsNotNone(pool)
             self.assertIsNotNone(mask)
+            self.assertIsNotNone(layers.adaptive_pool2d(x, 3, pool_type='avg'))
+            pool, mask = layers.adaptive_pool2d(x, 3, require_index=True)
+            self.assertIsNotNone(pool)
+            self.assertIsNotNone(mask)

     def test_adaptive_pool3d(self):
         program = Program()
@@ -255,6 +259,10 @@ class TestBook(unittest.TestCase):
                 x, [3, 3, 3], require_index=True)
             self.assertIsNotNone(pool)
             self.assertIsNotNone(mask)
+            self.assertIsNotNone(layers.adaptive_pool3d(x, 3, pool_type='avg'))
+            pool, mask = layers.adaptive_pool3d(x, 3, require_index=True)
+            self.assertIsNotNone(pool)
+            self.assertIsNotNone(mask)

     def test_lstm_unit(self):
         program = Program()
......
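The added assertions exercise the scalar pool_size form; a scalar should be broadcast to every spatial dimension, so these two calls should be equivalent (a sketch, with x a placeholder 4-D variable):

    pool_a = fluid.layers.adaptive_pool2d(input=x, pool_size=3, pool_type='avg')
    pool_b = fluid.layers.adaptive_pool2d(input=x, pool_size=[3, 3],
                                          pool_type='avg')
    # both produce a 3x3 spatial output regardless of the input H and W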
@@ -137,9 +137,9 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
     var_dict = {}
     for var_proto in proto_list:
         var_name = str(var_proto.name)
-        if (var_name not in np_list) and var_proto.dispensable:
-            continue
         if is_input:
+            if (var_name not in np_list) and var_proto.dispensable:
+                continue
             assert (var_name in np_list) or (var_proto.dispensable), \
                 "Missing {} as input".format(var_name)
             if var_proto.duplicable:
......
@@ -125,14 +125,23 @@ def slice_variable(var_list, slice_count, min_block_size):
 class DistributeTranspilerConfig(object):
     """
-    Args:
-        slice_var_up (bool): Do Tensor slice for pservers, default is True.
-        split_method (PSDispatcher): RoundRobin or HashName can be used
-            try to choose the best method to balance loads for pservers.
-        min_block_size (int): Minimum splitted element number in block.
-            According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
-    We can use bandwidth effiently when data size is larger than 2MB.If you
-    want to change it, please be sure you see the slice_variable function.
+    .. py:attribute:: slice_var_up (bool)
+
+          Do Tensor slice for pservers, default is True.
+
+    .. py:attribute:: split_method (PSDispatcher)
+
+          RoundRobin or HashName can be used.
+          Try to choose the best method to balance loads for pservers.
+
+    .. py:attribute:: min_block_size (int)
+
+          Minimum number of split elements in a block.
+
+          According to: https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
+          We can use bandwidth efficiently when data size is larger than 2MB. If you
+          want to change it, please be sure you have read the slice_variable function.
     """

     slice_var_up = True
......
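A usage sketch for the attributes documented above (the endpoint strings and trainer count are placeholders):

    config = fluid.DistributeTranspilerConfig()
    config.slice_var_up = True
    config.min_block_size = 8192

    t = fluid.DistributeTranspiler(config=config)
    t.transpile(trainer_id=0,
                pservers='127.0.0.1:6170,127.0.0.1:6171',
                trainers=2)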