Commit f52c4f8b authored by: yaoxuefeng6

fix conflict

@@ -107,6 +107,9 @@ function(select_nvcc_arch_flags out_variable)
   elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
     set(cuda_arch_bin "50")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
+      add_definitions("-DSUPPORTS_CUDA_FP16")
+    endif()
     set(cuda_arch_bin "60 61")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
     if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
...
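For context, a compile definition added this way is consumed in C++ sources through a preprocessor guard. A minimal sketch of such a consumer (the guard name matches the hunk above; the function names are hypothetical, not from this commit):

    #ifdef SUPPORTS_CUDA_FP16
      // fp16 path, available when nvcc >= 10.0 targets Pascal or newer
      ComputeFp16(input, output);  // hypothetical fp16 kernel launcher
    #else
      ComputeFp32(input, output);  // hypothetical fp32 fallback
    #endif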
@@ -527,6 +527,8 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
         VLOG(0) << "error: the number of ids is a negative number: " << num;
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occurred when parsing " << i
+                << "-th slot with total slots number: " << all_slots_.size();
         return false;
       } else if (num == 0) {
         VLOG(0)
@@ -536,42 +538,66 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
            "characters.";
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occurred when parsing " << i
+                << "-th slot with total slots number: " << all_slots_.size();
         return false;
       } else if (errno == ERANGE || num > INT_MAX) {
         VLOG(0) << "error: the number of ids greater than INT_MAX";
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occurred when parsing " << i
+                << "-th slot with total slots number: " << all_slots_.size();
         return false;
       }
       if (all_slots_type_[i] == "float") {
-        for (int i = 0; i < num; ++i) {
+        for (int j = 0; j < num; ++j) {
           strtof(endptr, &endptr);
           if (errno == ERANGE) {
             VLOG(0) << "error: the value is out of the range of "
                        "representable values for float";
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
+            VLOG(0) << "Error occurred when parsing " << i
+                    << "-th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << "-th id with total id number: " << num;
             return false;
           }
-          if (i + 1 != num && endptr - str == len) {
+          if (j + 1 != num && endptr - str == len) {
             VLOG(0) << "error: there is something wrong with the number of ids.";
+            VLOG(0) << "Error occurred when parsing " << i
+                    << "-th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << "-th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
         }
       } else if (all_slots_type_[i] == "uint64") {
-        for (int i = 0; i < num; ++i) {
+        for (int j = 0; j < num; ++j) {
           strtoull(endptr, &endptr, 10);
           if (errno == ERANGE) {
             VLOG(0) << "error: the value is out of the range of "
                        "representable values for uint64_t";
+            VLOG(0) << "Error occurred when parsing " << i
+                    << "-th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << "-th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
-          if (i + 1 != num && endptr - str == len) {
+          if (j + 1 != num && endptr - str == len) {
             VLOG(0) << "error: there is something wrong with the number of ids.";
+            VLOG(0) << "Error occurred when parsing " << i
+                    << "-th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << "-th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
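The checks above lean on the C standard library's parsing contract: strtof and strtoull advance endptr past the consumed token and set errno to ERANGE when a value does not fit the target type. A standalone sketch of that contract (plain C++, not Paddle code):

    #include <cerrno>
    #include <cstdio>
    #include <cstdlib>

    int main() {
      const char* str = "3.5e+60 7";   // the first token overflows float
      char* endptr = nullptr;
      errno = 0;
      float v = strtof(str, &endptr);  // parses "3.5e+60", endptr -> " 7"
      if (errno == ERANGE) {
        std::printf("out of float range, clamped to %g\n", v);  // HUGE_VALF
      }
      return 0;
    }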
@@ -632,8 +658,13 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe(
             "The number of ids can not be zero, you need padding "
             "it in data generator; or if there is something wrong with "
             "the data, please check if the data contains unresolvable "
-            "characters.\nplease check this error line: %s",
-            str));
+            "characters.\nplease check this error line: %s.\nSpecifically, "
+            "something wrong happened (the length of this slot's feasign is 0) "
+            "when we parse the %d-th slot. "
+            "Maybe something is wrong around this slot."
+            "\nWe detect the feasign number of this slot is %d, "
+            "which is illegal.",
+            str, i, num));
       if (idx != -1) {
         (*instance)[idx].Init(all_slots_type_[i]);
         if ((*instance)[idx].GetType()[0] == 'f') {  // float
@@ -683,8 +714,13 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
             "The number of ids can not be zero, you need padding "
             "it in data generator; or if there is something wrong with "
             "the data, please check if the data contains unresolvable "
-            "characters.\nplease check this error line: %s.",
-            str));
+            "characters.\nplease check this error line: %s.\nSpecifically, "
+            "something wrong happened (the length of this slot's feasign is 0) "
+            "when we parse the %d-th slot. "
+            "Maybe something is wrong around this slot."
+            "\nWe detect the feasign number of this slot is %d, "
+            "which is illegal.",
+            str, i, num));
       if (idx != -1) {
         (*instance)[idx].Init(all_slots_type_[i]);
@@ -916,8 +952,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
             "The number of ids can not be zero, you need padding "
             "it in data generator; or if there is something wrong with "
             "the data, please check if the data contains unresolvable "
-            "characters.\nplease check this error line: %s.",
-            str));
+            "characters.\nplease check this error line: %s.\nSpecifically, "
+            "something wrong happened (the length of this slot's feasign is 0) "
+            "when we parse the %d-th slot. "
+            "Maybe something is wrong around this slot."
+            "\nWe detect the feasign number of this slot is %d, "
+            "which is illegal.",
+            str, i, num));
       if (idx != -1) {
         if (all_slots_type_[i][0] == 'f') {  // float
           for (int j = 0; j < num; ++j) {
@@ -982,8 +1023,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) {
             "The number of ids can not be zero, you need padding "
             "it in data generator; or if there is something wrong with "
             "the data, please check if the data contains unresolvable "
-            "characters.\nplease check this error line: %s.",
-            str));
+            "characters.\nplease check this error line: %s.\nSpecifically, "
+            "something wrong happened (the length of this slot's feasign is 0) "
+            "when we parse the %d-th slot. "
+            "Maybe something is wrong around this slot."
+            "\nWe detect the feasign number of this slot is %d, "
+            "which is illegal.",
+            str, i, num));
       if (idx != -1) {
         if (all_slots_type_[i][0] == 'f') {  // float
...
@@ -74,6 +74,7 @@ set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto
         eager_deletion_pass
         buffer_shared_inplace_op_pass
         buffer_shared_cross_op_memory_reuse_pass
+        inplace_addto_op_pass
         set_reader_device_info_utils
         add_reader_dependency_pass)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
...
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+
 #include <algorithm>
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -34,14 +36,24 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<platform::Place> &places,
                                      const platform::NCCLCommunicator *ctxs)
     : NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places)
     : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
 }
 #endif
@@ -60,13 +72,25 @@ void AllReduceOpHandle::AllReduceImpl(
     const std::vector<VarHandle *> &in_var_handles,
     const std::vector<VarHandle *> &out_var_handles) {
   size_t num_places = places_.size();
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), num_places,
-      "The NoDummyInputSize should be equal to the number of places.");
+  PADDLE_ENFORCE_EQ(in_var_handles.size(), num_places,
+                    platform::errors::InvalidArgument(
+                        "The NoDummyInputSize should be equal "
+                        "to the number of places, but got NoDummyInputSize is "
+                        "%d and the number of places is %d.",
+                        in_var_handles.size(), num_places));
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
-  PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places);
+      platform::errors::InvalidArgument(
+          "The NoDummyInputSize and NoDummyOutputSize should be "
+          "equal, but got NoDummyInputSize is %d and NoDummyOutputSize is %d.",
+          in_var_handles.size(), out_var_handles.size()));
+  PADDLE_ENFORCE_EQ(
+      local_exec_scopes_.size(), num_places,
+      platform::errors::InvalidArgument(
+          "The number of local scopes should be equal "
+          "to the number of places, but got the number of local scopes is "
+          "%d and the number of places is %d.",
+          local_exec_scopes_.size(), num_places));

   std::vector<const void *> lod_tensor_data;
   std::vector<platform::Place> places;
@@ -78,23 +102,36 @@ void AllReduceOpHandle::AllReduceImpl(
   for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
     auto &local_scope = local_exec_scopes_[i];
     auto var = local_scope->FindVar(in_var_handles[i]->name());
-    PADDLE_ENFORCE_NOT_NULL(var, "%s is not found int scope.",
-                            in_var_handles[i]->name());
+    PADDLE_ENFORCE_NOT_NULL(var, platform::errors::NotFound(
+                                     "Variable %s is not found in local scope.",
+                                     in_var_handles[i]->name()));
     auto &lod_tensor = var->Get<LoDTensor>();

     if (i == 0) {
       numel = static_cast<int64_t>(lod_tensor.numel());
       // only enforce place0, we will enforce other place numel == place0 numel
       PADDLE_ENFORCE_GT(
-          numel, 0, platform::errors::InvalidArgument(
-                        "The numel of tensos=[%s] must > 0. But now numel=[%d]",
-                        in_var_handles[i]->name(), numel));
+          numel, 0,
+          platform::errors::PreconditionNotMet(
+              "The numel of tensor %s should be > 0, but got numel is %d.",
+              in_var_handles[i]->name(), numel));
       dtype = lod_tensor.type();
       is_gpu_place = platform::is_gpu_place(lod_tensor.place());
     }
-    PADDLE_ENFORCE_EQ(numel, static_cast<int64_t>(lod_tensor.numel()));
-    PADDLE_ENFORCE_EQ(dtype, lod_tensor.type());
-    PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()));
+    PADDLE_ENFORCE_EQ(
+        numel, static_cast<int64_t>(lod_tensor.numel()),
+        platform::errors::PreconditionNotMet(
+            "The size of tensors of the same variable in different local "
+            "scopes should be equal."));
+    PADDLE_ENFORCE_EQ(
+        dtype, lod_tensor.type(),
+        platform::errors::PreconditionNotMet(
+            "The dtype of tensors of the same variable in different local "
+            "scopes should be equal."));
+    PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of tensors of the same variable "
+                          "in different local scopes should be equal."));

     lod_tensor_data.emplace_back(lod_tensor.data<void>());
     places.emplace_back(lod_tensor.place());
@@ -102,8 +139,12 @@ void AllReduceOpHandle::AllReduceImpl(
     VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
              << ", out_name:" << out_var_handles[i]->name();

-    PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
-                      "The name of input and output should be equal.");
+    PADDLE_ENFORCE_EQ(
+        in_var_handles[i]->name(), out_var_handles[i]->name(),
+        platform::errors::InvalidArgument(
+            "The name of input and output of all_reduce op should be equal, "
+            "but got input is %s and output is %s.",
+            in_var_handles[i]->name(), out_var_handles[i]->name()));
   }

   std::vector<std::string> grad_var_names;
@@ -122,7 +163,9 @@ void AllReduceOpHandle::AllReduceFunc(
     const std::vector<std::string> &out_var_names) {
   if (is_gpu_place(places[0])) {
 #if defined(PADDLE_WITH_NCCL)
-    PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+    PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_,
+                            platform::errors::InvalidArgument(
+                                "The nccl context should not be NULL."));
     ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
     std::vector<std::function<void()>> all_reduce_calls;
     for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
@@ -134,7 +177,8 @@ void AllReduceOpHandle::AllReduceFunc(
     }
     NCCLAllReduceFunc(all_reduce_calls);
 #else
-    PADDLE_THROW("Not compiled with CUDA.");
+    PADDLE_THROW(
+        platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
   } else {  // Special handle CPU only Operator's gradient. Like CRF
     auto &trg = *local_exec_scopes_[0]
...
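The change repeated throughout these hunks is mechanical: every bare-message enforce gains a typed error payload from platform::errors. A condensed instance of the pattern, assuming Paddle's enforce.h (the macro and error-class signatures are the ones visible above):

    #include "paddle/fluid/platform/enforce.h"

    // Typed-error form: the comparison macro plus an error class carrying a
    // printf-style message, so a failure reports both category and operands.
    void CheckSizesMatch(size_t num_places, size_t num_scopes) {
      PADDLE_ENFORCE_EQ(num_places, num_scopes,
                        paddle::platform::errors::InvalidArgument(
                            "The number of places and the number of local "
                            "scopes should be equal, but got %d and %d.",
                            num_places, num_scopes));
    }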
@@ -89,8 +89,19 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor(
       places_(std::move(places)),
       graphs_(std::move(graphs)) {
   VLOG(3) << "build AsyncSSAGraphExecutor";
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
-  PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
+  PADDLE_ENFORCE_EQ(
+      local_scopes_.size(), local_exec_scopes_.size(),
+      platform::errors::InvalidArgument(
+          "The number of local scopes and the number of local execution scopes "
+          "should be equal, but got number of local scopes is %d and "
+          "number of local execution scopes is %d.",
+          local_scopes_.size(), local_exec_scopes_.size()));

   // set the correct size of thread pool to each device.
   strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
...
@@ -19,6 +19,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "boost/optional.hpp"
 #include "paddle/fluid/framework/ir/pass_builder.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -119,6 +120,9 @@ struct BuildStrategy {
   // Turn on inplace by default.
   bool enable_inplace_{true};

+  // Turn off inplace addto by default.
+  bool enable_addto_{false};
+
   // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
   // num_trainers is 1, so the current fields of build_strategy doesn't tell if
   // it's distributed model.
...
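A usage sketch for the new flag, assuming the public fields shown above (illustrative, not taken from this commit):

    #include "paddle/fluid/framework/details/build_strategy.h"

    // Opt in to inplace addto; it is off by default. The related
    // inplace_addto_op_pass can then skip producer ops whose outputs are
    // accumulated in place (see the skip_running_ hunks below).
    paddle::framework::details::BuildStrategy strategy;
    strategy.enable_inplace_ = true;  // already the default
    strategy.enable_addto_ = true;    // the flag added in this commit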
@@ -12,12 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
+
 #include <deque>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/fetch_async_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -48,7 +50,9 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
       bootstrap_ops_.emplace_back(op);
     }
   }
-  PADDLE_ENFORCE_GT(op_deps_.size(), 0, "The graph doesn't have operators.");
+  PADDLE_ENFORCE_GT(op_deps_.size(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "The graph doesn't have operators."));
   PrepareAtomicOpDeps();
 }
...
@@ -13,9 +13,11 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
+
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/platform/profiler.h"

 namespace paddle {
@@ -138,8 +140,10 @@ void FetchOpHandle::RunImpl() {
     auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
     auto &scope = scopes.at(var_handle->scope_idx());
     auto *var = scope->FindVar(var_handle->name());
-    PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
-                            var_handle->name());
+    PADDLE_ENFORCE_NOT_NULL(
+        var,
+        platform::errors::NotFound(
+            "Cannot find variable %s in execution scope.", var_handle->name()));

     if (var->IsType<LoDTensor>()) {
       auto &t = var->Get<framework::LoDTensor>();
...
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/op_handle_base.h"
+
 #include <map>
 #include <unordered_set>
@@ -88,6 +89,12 @@ void OpHandleBase::Run(bool use_cuda) {
   PADDLE_ENFORCE(!use_cuda);
 #endif

+  // skip running current op, used with inplace_addto_op_pass
+  if (skip_running_) {
+    VLOG(4) << "skip running: " << Name();
+    return;
+  }
+
   RunImpl();
 }
...
@@ -18,6 +18,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/var_handle.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -52,6 +53,10 @@ class OpHandleBase {
   virtual Priority GetPriority() const { return kNormal; }

+  virtual bool GetSkipRunning() const { return skip_running_; }
+
+  virtual void SetSkipRunning(bool skip_running) { skip_running_ = skip_running; }
+
   virtual std::string Name() const = 0;

   void Run(bool use_cuda);
@@ -131,6 +136,7 @@ class OpHandleBase {
   std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;
   std::vector<Scope *> local_exec_scopes_;
+  bool skip_running_ = false;

 #ifdef PADDLE_WITH_CUDA
   std::unordered_map<int, cudaEvent_t> events_;
...
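How the two hunks above compose: a pass marks an op handle, and OpHandleBase::Run() then returns before reaching RunImpl(). A sketch of the assumed caller (the API is the one declared above; the surrounding pass code is not part of this excerpt):

    // Mark the producer so the scheduler skips its kernel; its output buffer
    // is expected to be filled in place by a downstream addto consumer.
    op_handle->SetSkipRunning(true);

    // Later, inside OpHandleBase::Run(use_cuda):
    //   if (skip_running_) { VLOG(4) << "skip running: " << Name(); return; }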
@@ -13,9 +13,11 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
+
 #include <algorithm>
 #include <memory>
 #include <utility>
+
 #include "paddle/fluid/framework/ir/graph_helper.h"

 namespace paddle {
@@ -104,7 +106,12 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
       places_(places),
       graphs_(std::move(graphs)),
       feed_status_(places.size(), FeedStatus::kNone) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
   PADDLE_ENFORCE_EQ(places_.size(), graphs_.size(),
                     platform::errors::InvalidArgument(
...
@@ -13,10 +13,12 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+
 #include <stdexcept>
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/variable_helper.h"
@@ -37,7 +39,13 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
       var_infos_(std::move(var_infos)),
       places_(std::move(places)),
       scope_monitor_(places_, local_exec_scopes_) {
-  PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
+  PADDLE_ENFORCE_EQ(
+      local_scopes_.size(), local_exec_scopes_.size(),
+      platform::errors::InvalidArgument(
+          "The number of local scopes and the number of local execution scopes "
+          "should be equal, but got number of local scopes is %d and "
+          "number of local execution scopes is %d.",
+          local_scopes_.size(), local_exec_scopes_.size()));
   PrepareLocalExeScopes();
 }
...
@@ -13,9 +13,11 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
+
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -29,7 +31,8 @@ static inline const Tensor &GetTensorFromVar(const Variable *var) {
   if (var->IsType<LoDTensor>()) {
     return var->Get<LoDTensor>();
   } else {
-    PADDLE_THROW("Variable must be type of LoDTensor");
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Variable must be type of LoDTensor."));
   }
 }
@@ -37,20 +40,27 @@ static inline Tensor *GetMutableTensorFromVar(Variable *var) {
   if (var->IsType<LoDTensor>()) {
     return var->GetMutable<LoDTensor>();
   } else {
-    PADDLE_THROW("Variable must be type of LoDTensor");
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Variable must be type of LoDTensor."));
   }
 }

 ShareTensorBufferFunctor::ShareTensorBufferFunctor(
     Scope *scope, size_t scope_idx, const std::string &op_type,
     const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-    const std::vector<std::string> &out_var_names)
+    const std::vector<std::string> &out_var_names, bool share_dims)
     : scope_(scope),
       scope_idx_(scope_idx),
       op_type_(op_type),
       in_var_infos_(in_var_infos),
-      out_var_names_(out_var_names) {
-  PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size());
+      out_var_names_(out_var_names),
+      share_dims_(share_dims) {
+  PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size(),
+                    platform::errors::PreconditionNotMet(
+                        "The number of input variables and output variables "
+                        "should be equal, but got number of input variables is "
+                        "%d and number of output variables is %d.",
+                        in_var_infos_.size(), out_var_names_.size()));
   for (size_t i = 0; i < in_var_infos_.size(); ++i) {
     AddReuseVarPair(in_var_infos_[i], out_var_names_[i]);
   }
@@ -67,32 +77,59 @@ ShareTensorBufferFunctor::ReusedVars() const {
 void ShareTensorBufferFunctor::AddReuseVarPair(
     const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) {
-  PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr");
+  PADDLE_ENFORCE_NOT_NULL(
+      in_var_info,
+      platform::errors::InvalidArgument(
+          "The input variables to be inplaced should not be NULL."));
   PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name,
-                    "in/out cannot have same name: %s", out_var_name);
+                    platform::errors::InvalidArgument(
+                        "The input variable and output variable to be inplaced "
+                        "cannot have the same name: %s.",
+                        out_var_name));
   in_var_infos_.emplace_back(in_var_info);
   out_var_names_.emplace_back(out_var_name);
 }

 void ShareTensorBufferFunctor::CallOnce() {
-  PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here");
+  PADDLE_ENFORCE(in_out_vars_.empty(),
+                 platform::errors::InvalidArgument(
+                     "The input-output variable pairs to be "
+                     "inplaced should be initialized here."));
   for (size_t i = 0; i < in_var_infos_.size(); ++i) {
     auto *in_var = exec_scope_->FindVar(in_var_infos_[i]->Name());
     auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
-    PADDLE_ENFORCE_NOT_NULL(out_var);
-    PADDLE_ENFORCE_NE(in_var, out_var);
+    PADDLE_ENFORCE_NOT_NULL(
+        in_var,
+        platform::errors::NotFound(
+            "The input variable (%s) to be inplaced should not be NULL.",
+            in_var_infos_[i]->Name()));
+    PADDLE_ENFORCE_NOT_NULL(
+        out_var,
+        platform::errors::NotFound(
+            "The output variable (%s) to be inplaced should not be NULL.",
+            out_var_names_[i]));
+    PADDLE_ENFORCE_NE(
+        in_var, out_var,
+        platform::errors::PreconditionNotMet(
+            "The input variable and output variable to be inplaced "
+            "cannot be the same variable (%s).",
+            out_var_names_[i]));
     in_out_vars_.emplace_back(in_var, out_var);
   }
 }

 void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
   if (!exec_scope_) {
-    PADDLE_ENFORCE_NOT_NULL(exec_scope);
+    PADDLE_ENFORCE_NOT_NULL(exec_scope,
+                            platform::errors::InvalidArgument(
+                                "The given execution scope should not be NULL "
+                                "if the cached scope is NULL."));
     exec_scope_ = exec_scope;
     CallOnce();
   } else {
-    PADDLE_ENFORCE(exec_scope_ == exec_scope, "Scope must be the same");
+    PADDLE_ENFORCE_EQ(exec_scope_, exec_scope,
+                      platform::errors::InvalidArgument(
+                          "The given execution scope and the cached execution "
+                          "scope should be the same."));
   }

   for (size_t i = 0; i < in_var_infos_.size(); ++i) {
@@ -115,6 +152,13 @@ void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
     } else {
       out_tensor->ShareBufferWith(in_tensor);

+      // NOTE(zhiqiu): In the case of inplace addto, if the operator of
+      // the in_out_vars is skipped during running, we should set the dims of
+      // the output to be the same as the input.
+      if (share_dims_) {
+        out_tensor->Resize(in_tensor.dims());
+      }
+
       VLOG(2) << "Share tensor buffer when running " << op_type_ << " : "
               << in_var_info->Name() << " -> " << out_var_names_[i];
     }
...
@@ -19,6 +19,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/scope.h"
@@ -40,11 +41,13 @@ class ShareTensorBufferFunctor {
   ShareTensorBufferFunctor(
       Scope *scope, size_t scope_idx, const std::string &op_type,
       const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-      const std::vector<std::string> &out_var_names);
+      const std::vector<std::string> &out_var_names, bool share_dims = false);

   void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
                        const std::string &out_var_name);

+  void SetShareDims(bool share_dims) { share_dims_ = share_dims; }
+
   void operator()(Scope *exec_scope);

   std::unordered_map<std::string, std::string> ReusedVars() const;
@@ -66,6 +69,11 @@ class ShareTensorBufferFunctor {
   std::vector<std::string> out_var_names_;
   std::vector<std::pair<const Variable *, Variable *>> in_out_vars_;

+  // NOTE(zhiqiu): In the case of inplace addto, if the operator of
+  // the in_out_vars is skipped during running, we should set the dims of the
+  // output to be the same as the input.
+  bool share_dims_{false};
 };

 }  // namespace details
...
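The essence of the two NOTEs above, reduced to the Tensor calls the functor makes (a sketch using the names from the hunks; out_tensor and in_tensor are the inplaced pair):

    // Share storage between the inplaced pair; when the producer op may be
    // skipped (inplace addto), also copy the shape so the output tensor is
    // well-formed even though its producer never ran.
    out_tensor->ShareBufferWith(in_tensor);
    if (share_dims) {
      out_tensor->Resize(in_tensor.dims());
    }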
@@ -13,8 +13,10 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
+
 #include <string>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -32,26 +34,35 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle(
     for (ir::Node *pending_op : out_var->outputs) {
       auto &op = pending_op->Wrapper<OpHandleBase>();
       auto *compute_op = dynamic_cast<ComputationOpHandle *>(&op);
-      PADDLE_ENFORCE_NOT_NULL(compute_op);
+      PADDLE_ENFORCE_NOT_NULL(
+          compute_op,
+          platform::errors::PreconditionNotMet(
+              "The pending OpHandle should be ComputationOpHandle."));

       if (result_op == nullptr) {
         result_op = compute_op;
       } else {
-        PADDLE_ENFORCE_EQ(result_op, compute_op);
+        PADDLE_ENFORCE_EQ(
+            result_op, compute_op,
+            platform::errors::PreconditionNotMet(
+                "The pending OpHandle should be the unique one."));
       }
     }
   }

-  PADDLE_ENFORCE_NOT_NULL(result_op);
+  PADDLE_ENFORCE_NOT_NULL(result_op,
+                          platform::errors::PreconditionNotMet(
+                              "The pending OpHandle should not be NULL."));
   return result_op;
 }

 ShareTensorBufferOpHandle::ShareTensorBufferOpHandle(
     ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type,
     const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-    const std::vector<std::string> &out_var_names)
+    const std::vector<std::string> &out_var_names, bool share_dims)
     : OpHandleBase(node),
-      functor_(scope, scope_idx, op_type, in_var_infos, out_var_names) {}
+      functor_(scope, scope_idx, op_type, in_var_infos, out_var_names,
+               share_dims) {}

 std::unordered_map<std::string, std::string>
 ShareTensorBufferOpHandle::ReusedVars() const {
@@ -63,6 +74,10 @@ void ShareTensorBufferOpHandle::AddReuseVarPair(
   functor_.AddReuseVarPair(in_var_info, out_var_name);
 }

+void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
+  functor_.SetShareDims(share_dims);
+}
+
 void ShareTensorBufferOpHandle::InitCUDA() {
 #ifdef PADDLE_WITH_CUDA
   int dev_id =
...
@@ -17,6 +17,7 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
@@ -31,7 +32,7 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
       ir::Node *node, Scope *scope, size_t scope_idx,
       const std::string &op_type,
       const std::vector<const ir::MemOptVarInfo *> &in_vars_infos,
-      const std::vector<std::string> &out_var_names);
+      const std::vector<std::string> &out_var_names, bool share_dims = false);

   std::unordered_map<std::string, std::string> ReusedVars() const;
@@ -42,6 +43,8 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
   void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
                        const std::string &out_var_name);

+  void SetShareDims(bool share_dims);
+
   const ShareTensorBufferFunctor &Functor() const { return functor_; }

  protected:
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
+
 #include "paddle/fluid/framework/details/fetch_async_op_handle.h"

 namespace paddle {
@@ -27,8 +28,9 @@ void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops) {
     PADDLE_ENFORCE_EQ(dynamic_cast<FetchOpHandle*>(op) != nullptr ||
                           dynamic_cast<FetchAsyncOpHandle*>(op) != nullptr,
                       true,
-                      "The input ops of ClearFetchOp function should be "
-                      "FetchOpHandle or FetchAsyncOpHandle.");
+                      platform::errors::PreconditionNotMet(
+                          "The input ops of ClearFetchOp function should be "
+                          "FetchOpHandle or FetchAsyncOpHandle."));
     for (auto& out_var : op->Node()->outputs) {
       graph->RemoveNode(out_var);
     }
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -138,7 +139,10 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl(
       }
     }
   }
-  PADDLE_ENFORCE(ready_ops.empty());
+  PADDLE_ENFORCE_EQ(
+      ready_ops.empty(), true,
+      platform::errors::Fatal("After the execution of computation graph, "
+                              "there are unexecuted operators left."));
 }

 // Wait FetchOps.
@@ -165,9 +169,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
     FetchResultType *fetch_data, bool return_merged) {
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
   std::unordered_set<VarHandleBase *> local_ready_vars;
-  std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
-                                                   fetch_tensors.end());
-  for (auto &fetch_var_name : fetch_tensor_set) {
+
+  for (auto &fetch_var_name : fetch_tensors) {
     for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
@@ -231,7 +234,11 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
       ready_ops->insert(static_cast<OpHandleBase *>(op));
     }
   }
-  PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0);
+  PADDLE_ENFORCE_EQ(
+      local_ready_vars.size(), 0,
+      platform::errors::Fatal(
+          "The number of ready variables should be 0, but got %d.",
+          local_ready_vars.size()));
 }

 void ThreadedSSAGraphExecutor::InsertPendingOp(
@@ -277,7 +284,9 @@ void ThreadedSSAGraphExecutor::PrepareOpDeps() {
     }
   }
   op_deps_->num_ops_ = ready_ops.size() + pending_ops.size();
-  PADDLE_ENFORCE_GT(op_deps_->num_ops_, 0, "The graph doesn't have operators.");
+  PADDLE_ENFORCE_GT(
+      op_deps_->num_ops_, 0,
+      platform::errors::InvalidArgument("The graph doesn't have operators."));

   for (auto ready_var : ready_vars) {
     pending_vars.erase(ready_var);
...
@@ -14,6 +14,8 @@
 #pragma once

+#include <ThreadPool.h>  // ThreadPool in third party
+
 #include <deque>
 #include <functional>
 #include <list>
@@ -24,8 +26,6 @@
 #include <utility>
 #include <vector>

-#include <ThreadPool.h>  // ThreadPool in thrird party
-
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
...
@@ -54,8 +54,10 @@ struct VarHandleBase {
   void AddOutput(OpHandleBase* out, ir::Node* node) {
     if (pending_ops_.find(out) == pending_ops_.end()) {
-      PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr",
-                     this->Node()->Name());
+      PADDLE_ENFORCE_NOT_NULL(out,
+                              platform::errors::InvalidArgument(
+                                  "The output added to VarHandle %s is NULL.",
+                                  this->Node()->Name()));
       pending_ops_.insert(out);
       node_->outputs.push_back(node);
     }
@@ -120,7 +122,10 @@ struct VarHandle : public VarHandleBase {
   bool HasEvent() { return has_event_; }

   const cudaEvent_t& GetEvent() {
-    PADDLE_ENFORCE(HasEvent(), "The event is not set.");
+    PADDLE_ENFORCE_EQ(
+        HasEvent(), true,
+        platform::errors::PreconditionNotMet(
+            "The cuda event is not set, maybe InitCUDA() is not called."));
     return event_;
   }
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/variable_visitor.h"
+
 #include "paddle/fluid/framework/selected_rows.h"

 namespace paddle {
 namespace framework {
@@ -24,7 +25,9 @@ static void VisitVariable(Variable* var, Func* func) {
   } else if (var->IsType<SelectedRows>()) {
     (*func)(var->GetMutable<SelectedRows>());
   } else {
-    PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "VisitVariable is not supported for type %s.",
+        ToTypeName(var->Type())));
   }
 }
@@ -35,7 +38,8 @@ static void VisitVariable(const Variable& var, Func* func) {
   } else if (var.IsType<SelectedRows>()) {
     (*func)(var.Get<SelectedRows>());
   } else {
-    PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "VisitVariable is not supported for type %s.", ToTypeName(var.Type())));
   }
 }
@@ -50,7 +54,8 @@ struct TensorVisitor {
   template <typename T>
   void operator()() {
-    PADDLE_THROW("Not Support to get LoDTensor from %s", typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Getting tensor from type %s is not supported.", typeid(T).name()));
   }
 };
@@ -78,8 +83,8 @@ struct ShareDimsAndLoDVisitor {
   template <typename T>
   void operator()(const T&) {
-    PADDLE_ENFORCE("ShareDimsAndLoD is not supported by type %s",
-                   typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "ShareDimsAndLoD is not supported for type %s.", typeid(T).name()));
   }
 };
@@ -89,42 +94,54 @@ void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) {
 }

 struct EnforceShapeAndDTypeEQVisitor {
-  const Variable* trg_;
+  const Variable* dst_;

   void operator()(const LoDTensor& src) {
-    auto& tensor = trg_->Get<LoDTensor>();
-    PADDLE_ENFORCE_EQ(
-        src.place().which(), tensor.place().which(),
-        "The Places of the two Variable must be all on CPU or all on GPU.");
-    PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
-                      "The dtype of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.dims(), tensor.dims(),
-                      "The dims of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(),
-                      "The lod of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.layout(), tensor.layout(),
-                      "The layout of the two Variable's tensor is not equal.");
+    auto& tensor = dst_->Get<LoDTensor>();
+    PADDLE_ENFORCE_EQ(src.place().which(), tensor.place().which(),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
+                      platform::errors::PreconditionNotMet(
+                          "The dtype of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.dims(), tensor.dims(),
+        platform::errors::PreconditionNotMet(
+            "The dims of the two variables' tensors are not equal."));
+    PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(),
+                      platform::errors::PreconditionNotMet(
+                          "The lod of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.layout(), tensor.layout(),
+        platform::errors::PreconditionNotMet(
+            "The layout of the two variables' tensors is not equal."));
   }

   void operator()(const SelectedRows& src) {
-    auto& selected_rows = trg_->Get<SelectedRows>();
-    PADDLE_ENFORCE_EQ(
-        src.place().which(), selected_rows.place().which(),
-        "The Places of the two Variable must be all on CPU or all on GPU.");
-    PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
-                      "The dtype of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.value().layout(), selected_rows.value().layout(),
-                      "The layout of the two Variable's tensor is not equal.");
+    auto& selected_rows = dst_->Get<SelectedRows>();
+    PADDLE_ENFORCE_EQ(src.place().which(), selected_rows.place().which(),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
+                      platform::errors::PreconditionNotMet(
+                          "The dtype of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.value().layout(), selected_rows.value().layout(),
+        platform::errors::PreconditionNotMet(
+            "The layout of the two variables' tensors is not equal."));
     PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(),
-                      "The height of the two Variable is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The height of the two variables is not equal."));
     PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(),
-                      "The dims of the two Variable is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The dims of the two variables are not equal."));
   }

   template <typename T>
   void operator()(const T&) {
-    PADDLE_ENFORCE("EnforceShapeAndDTypeEQ is not supported by type %s",
-                   typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "EnforceShapeAndDTypeEQ is not supported for type %s.",
+        typeid(T).name()));
   }
 };
...
...@@ -19,6 +19,8 @@ limitations under the License. */ ...@@ -19,6 +19,8 @@ limitations under the License. */
namespace gloo { namespace gloo {
namespace rendezvous { namespace rendezvous {
constexpr int kNodeSize = 136;
HdfsStore::HdfsStore(const std::string& path) { HdfsStore::HdfsStore(const std::string& path) {
path_ = path; path_ = path;
wait_sleep_ms_ = 10000; wait_sleep_ms_ = 10000;
...@@ -213,12 +215,14 @@ void ParallelConnectContext::connectFullMesh( ...@@ -213,12 +215,14 @@ void ParallelConnectContext::connectFullMesh(
storeKey << rank; storeKey << rank;
store.set(storeKey.str(), allBytes); store.set(storeKey.str(), allBytes);
auto total_add_size = kNodeSize * (size - 1);
std::vector<std::shared_ptr<std::thread>> connect_threads(thread_num_); std::vector<std::shared_ptr<std::thread>> connect_threads(thread_num_);
// Connect every pair // Connect every pair
for (uint32_t i = 0; i < connect_threads.size(); ++i) { for (uint32_t i = 0; i < connect_threads.size(); ++i) {
connect_threads[i].reset(new std::thread( connect_threads[i].reset(new std::thread(
[&store, &transportContext, this](size_t thread_idx, [&store, &transportContext, total_add_size, this](
size_t thread_num) -> void { size_t thread_idx, size_t thread_num) -> void {
for (int i = thread_idx; i < size; i += thread_num) { for (int i = thread_idx; i < size; i += thread_num) {
if (i == rank) { if (i == rank) {
continue; continue;
...@@ -226,8 +230,23 @@ void ParallelConnectContext::connectFullMesh( ...@@ -226,8 +230,23 @@ void ParallelConnectContext::connectFullMesh(
// Wait for address of other side of this pair to become available // Wait for address of other side of this pair to become available
std::string key = std::to_string(i); std::string key = std::to_string(i);
store.wait({key}, getTimeout()); store.wait({key}, getTimeout());
std::vector<char> allAddrs;
auto max_retry_times = 5;
// Connect to other side of this pair // Connect to other side of this pair
auto allAddrs = store.get(key);
while (max_retry_times > 0) {
allAddrs = store.get(key);
VLOG(3) << "store get all address size: " << allAddrs.size()
<< " except: " << total_add_size;
if (allAddrs.size() == static_cast<size_t>(total_add_size)) {
break;
}
--max_retry_times;
}
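// Note: if the retries are exhausted, allAddrs may still be shorter than
// total_add_size and extractAddress below will see partial data; the
// polling presumably covers a store that only becomes complete after a
// delay.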
auto addr = extractAddress(allAddrs, i); auto addr = extractAddress(allAddrs, i);
transportContext->getPair(i)->connect(addr); transportContext->getPair(i)->connect(addr);
} }
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -225,3 +226,14 @@ REGISTER_PASS(conv_affine_channel_fuse_pass, ...@@ -225,3 +226,14 @@ REGISTER_PASS(conv_affine_channel_fuse_pass,
paddle::framework::ir::ConvAffineChannelFusePass); paddle::framework::ir::ConvAffineChannelFusePass);
REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("affine_channel", 0));
REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0)
.EQ("affine_channel", 0));
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -372,3 +373,14 @@ REGISTER_PASS(depthwise_conv_bn_fuse_pass, ...@@ -372,3 +373,14 @@ REGISTER_PASS(depthwise_conv_bn_fuse_pass,
paddle::framework::ir::DepthwiseConvBNFusePass); paddle::framework::ir::DepthwiseConvBNFusePass);
REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass, REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass,
paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass); paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass);
REGISTER_PASS_CAPABILITY(conv_bn_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("batch_norm", 0));
REGISTER_PASS_CAPABILITY(conv_eltwiseadd_bn_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0)
.EQ("batch_norm", 0));
...@@ -11,9 +11,9 @@ ...@@ -11,9 +11,9 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" #include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
#include <string> #include <string>
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -116,3 +116,10 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -116,3 +116,10 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
paddle::framework::ir::ConvElementwiseAdd2ActFusePass); paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
REGISTER_PASS_CAPABILITY(conv_elementwise_add2_act_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0)
.EQ("relu", 0)
.EQ("identity", 0));
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h" #include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h"
#include <string> #include <string>
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -102,3 +103,10 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -102,3 +103,10 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(conv_elementwise_add_act_fuse_pass, REGISTER_PASS(conv_elementwise_add_act_fuse_pass,
paddle::framework::ir::ConvElementwiseAddActFusePass); paddle::framework::ir::ConvElementwiseAddActFusePass);
REGISTER_PASS_CAPABILITY(conv_elementwise_add_act_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0)
.EQ("relu", 0)
.EQ("identity", 0));
...@@ -12,10 +12,10 @@ ...@@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <string>
#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h" #include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -89,3 +89,8 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -89,3 +89,8 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(conv_elementwise_add_fuse_pass, REGISTER_PASS(conv_elementwise_add_fuse_pass,
paddle::framework::ir::ConvElementwiseAddFusePass); paddle::framework::ir::ConvElementwiseAddFusePass);
REGISTER_PASS_CAPABILITY(conv_elementwise_add_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0));
...@@ -23,6 +23,8 @@ ...@@ -23,6 +23,8 @@
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
...@@ -34,7 +36,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, ...@@ -34,7 +36,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
// Build pattern // Build pattern
PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x")) PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
->assert_is_op_input("lookup_table") ->assert_is_op_input("lookup_table_v2")
->assert_var_not_persistable(); ->assert_var_not_persistable();
patterns::Embedding embedding_pattern(pattern, name_scope); patterns::Embedding embedding_pattern(pattern, name_scope);
// TODO(jczaja): Intermediate can only be for val that are not used anywhere // TODO(jczaja): Intermediate can only be for val that are not used anywhere
...@@ -256,3 +258,11 @@ void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -256,3 +258,11 @@ void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(embedding_fc_lstm_fuse_pass, REGISTER_PASS(embedding_fc_lstm_fuse_pass,
paddle::framework::ir::EmbeddingFCLSTMFusePass); paddle::framework::ir::EmbeddingFCLSTMFusePass);
REGISTER_PASS_CAPABILITY(embedding_fc_lstm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("lookup_table_v2", 0)
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("lstm", 0)
.EQ("fused_embedding_fc_lstm", 0));
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
...@@ -182,3 +183,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { ...@@ -182,3 +183,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass) REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass)
.RequirePassAttr("use_gpu"); .RequirePassAttr("use_gpu");
REGISTER_PASS_CAPABILITY(fc_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("relu", 0)
.EQ("fc", 0));
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -125,7 +126,6 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, ...@@ -125,7 +126,6 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
auto* x_n = subgraph.at(x); auto* x_n = subgraph.at(x);
GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern); GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern); GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern); GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern);
...@@ -136,10 +136,17 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, ...@@ -136,10 +136,17 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
gru_pattern); gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchHidden, gru_pattern); GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchHidden, gru_pattern);
// TODO(wilber): Support origin_mode=True.
if (gru->Op()->GetAttrIfExists<bool>("origin_mode") == true) {
LOG(INFO) << "fc_gru_fuse_pass not supported when origin_mode=True.";
return;
}
if (with_fc_bias) { if (with_fc_bias) {
GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias); gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias);
// Remove unneeded nodes. // Remove unneeded nodes.
...@@ -188,3 +195,16 @@ void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -188,3 +195,16 @@ void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass); REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass); REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);
REGISTER_PASS_CAPABILITY(mul_gru_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("gru", 0)
.EQ("fusion_gru", 0));
REGISTER_PASS_CAPABILITY(fc_gru_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("gru", 0)
.EQ("fusion_gru", 0));
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -196,3 +197,17 @@ void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -196,3 +197,17 @@ void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass); REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass);
REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass); REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
REGISTER_PASS_CAPABILITY(fc_lstm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("lstm", 0)
.EQ("fusion_lstm", 0));
REGISTER_PASS_CAPABILITY(mul_lstm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("lstm", 0)
.EQ("fusion_lstm", 0));
...@@ -13,4 +13,6 @@ cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handl ...@@ -13,4 +13,6 @@ cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handl
cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass) cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass)
cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass) cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass)
cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass)
cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op) cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op)
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
...@@ -141,11 +142,12 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const { ...@@ -141,11 +142,12 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const {
VLOG(4) << "Inplace performed in op " << op_type << ": " VLOG(4) << "Inplace performed in op " << op_type << ": "
<< in_var_handle_ptr->Name() << " -> " << in_var_handle_ptr->Name() << " -> "
<< out_var_handle_ptr->Name() << out_var_handle_ptr->Name()
<< ". Debug String is: " << op->GetOp()->DebugString(); << ". Debug String is: " << op->GetOp()->DebugString()
<< ". ReuseType: " << ReuseType();
} else { } else {
VLOG(3) << "Inplace failed in op " << op_type << ": " VLOG(3) << "Inplace failed in op " << op_type << ": "
<< in_var_handle_ptr->Name() << " -> " << in_var_handle_ptr->Name() << " -> "
<< out_var_handle_ptr->Name(); << out_var_handle_ptr->Name() << ". ReuseType: " << ReuseType();
} }
} }
} }
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class InplaceAddToOpPass : public MemoryReusePass {
protected:
std::string ReuseType() const override { return "inplace_addto"; }
void Run(Graph *graph) const override;
private:
// 1. Add one last living op of out_var as a last living op of in_var
// 2. Set reference count of in_var to be 2
void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const override {
size_t scope_idx = op->GetScopeIdx();
auto *last_live_ops_of_vars_ =
&Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
auto *var_infos_ = &(Get<MemOptVarInfoMapList>(kMemOptVarInfoMapList));
auto out_var_op_iter =
(*last_live_ops_of_vars_)[scope_idx].find(out_var->Name());
// In Reduce mode, some output variables (gradients of parameters) do not
// have last live ops
details::ComputationOpHandle *last_live_op_of_in_var = nullptr;
if (out_var_op_iter == (*last_live_ops_of_vars_)[scope_idx].end()) {
last_live_op_of_in_var = op;
} else {
PADDLE_ENFORCE_EQ(
out_var_op_iter->second.ops().empty(), false,
platform::errors::InvalidArgument(
"Var(%s)'s last live op should not empty.", out_var->Name()));
last_live_op_of_in_var = *(out_var_op_iter->second.ops().begin());
}
auto *last_live_ops_of_in_var =
(*last_live_ops_of_vars_)[scope_idx][in_var->Name()].mutable_ops();
// last_live_ops_of_in_var->clear();
last_live_ops_of_in_var->insert(last_live_op_of_in_var);
auto in_var_info_iter = (*var_infos_)[scope_idx].find(in_var->Name());
PADDLE_ENFORCE_NE(
in_var_info_iter, (*var_infos_)[scope_idx].end(),
platform::errors::NotFound("Cannot find variable %s.", in_var->Name()));
in_var_info_iter->second->SetRefCnt(2); // before inplace, it is 1
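// Presumably 2 because the buffer gains a second consumer after addto: the
// original reader of in_var plus the op whose output now aliases it.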
}
};
void InplaceAddToOpPass::Run(Graph *graph) const {
const auto &last_live_ops =
Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
bool use_cuda = Get<bool>(kUseCuda);
// Currently, InplaceAddToOpPass is only performed on CUDA places
if (!use_cuda) {
return;
}
// Step 1: Build a reverse map of last_live_ops
// i.e.: op -> vars
std::unordered_map<details::ComputationOpHandle *,
std::unordered_map<std::string, ir::Node *>>
candidate_ops;
for (auto &each_scope_ops : last_live_ops) {
for (auto &pair : each_scope_ops) {
// If a variable has more than one last live op, it cannot
// be inplaced.
if (pair.second.ops().size() != 1) {
continue;
}
auto *op = *(pair.second.ops().begin());
const std::string &op_type = op->GetOp()->Type();
const framework::OpDesc *op_desc = op->Node()->Op();
PADDLE_ENFORCE_NOT_NULL(
op_desc, platform::errors::NotFound("Op(%s) can not find opdesc.",
op->Name()));
// Only the grad_add op should be processed.
if (op_type != "grad_add") {
continue;
}
const std::string &var_name = pair.first;
auto in_nodes = this->FindNodesByName(var_name, op->Node()->inputs);
if (in_nodes.size() == 1) {
candidate_ops[op][var_name] = *in_nodes.begin();
}
VLOG(4) << "Find op " << op_type << " with input(" << var_name
<< ") that can do inplace add to";
}
}
// Step 2: Check which vars can be inplaced indeed
for (auto &op_vars_pair : candidate_ops) {
auto *op = op_vars_pair.first;
// The original gradient accumulation is g = sum(g_0, g_1,..., g_n), and it
// could be changed as follows if inplace addto is enabled:
// g_sum_0 = g_0
// g_sum_1 = grad_add(g_sum_0, g_1)
// g_sum_2 = grad_add(g_sum_1, g_2)
// ...
// g_sum_n = grad_add(g_sum_n-1, g_n)
// Here we add inplace for each grad_add; for example, for the first
// grad_add, g_sum_0 -> g_1 and g_sum_1 -> g_1, and the grad_add is skipped.
const std::string &op_type = op->GetOp()->Type();
PADDLE_ENFORCE_EQ(op->Node()->inputs.size(), 2,
platform::errors::InvalidArgument(
"The size of inputs of %s should be 2, but got %d",
op_type, op->Node()->inputs.size()));
PADDLE_ENFORCE_EQ(op->Node()->outputs.size(), 1,
platform::errors::InvalidArgument(
"The size of outputs of %s should be 1, but got %d",
op_type, op->Node()->outputs.size()));
auto *left_var_ptr = dynamic_cast<details::VarHandle *>(
&(op->Node()->inputs[0]->Wrapper<details::VarHandleBase>()));
auto *right_var_ptr = dynamic_cast<details::VarHandle *>(
&(op->Node()->inputs[1]->Wrapper<details::VarHandleBase>()));
auto *out_var_ptr = dynamic_cast<details::VarHandle *>(
&(op->Node()->outputs[0]->Wrapper<details::VarHandleBase>()));
if (left_var_ptr == nullptr || right_var_ptr == nullptr ||
out_var_ptr == nullptr) {
continue;
}
// auto *left_generated_op = dynamic_cast<details::ComputationOpHandle *>(
// left_var_ptr->GeneratedOp());
auto *right_generated_op = dynamic_cast<details::ComputationOpHandle *>(
right_var_ptr->GeneratedOp());
auto *out_generated_op = dynamic_cast<details::ComputationOpHandle *>(
out_var_ptr->GeneratedOp());
// NOTE(zhiqiu): currently, only conv2d_grad supports addto strategy
if (right_generated_op->Name() != "conv2d_grad") {
continue;
}
// NOTE(zhiqiu): Normally, if we inplace a->b, we should ensure a is
// generated before b. However, for inplace addto we do not care about
// the order, since a+b equals b+a. Is there any exception to that?
// AddDependencyVar(right_generated_op, left_generated_op);
// no need, as discussed above.
// step (a): inplace right_var->left_var of grad_add
this->AddReuseVar(right_generated_op, left_var_ptr, right_var_ptr);
UpdateLastLiveOpOfVar(right_generated_op, left_var_ptr, right_var_ptr);
VLOG(4) << "Inplace performed in op " << right_generated_op->GetOp()->Type()
<< ": " << left_var_ptr->Name() << " -> " << right_var_ptr->Name()
<< ". Debug String is: "
<< right_generated_op->GetOp()->DebugString()
<< ". ReuseType: " << ReuseType();
// step (b): inplace out -> right_var of grad_add
this->AddReuseVar(out_generated_op, right_var_ptr, out_var_ptr, true);
VLOG(4) << "Inplace performed in op " << op_type << ": "
<< left_var_ptr->Name() << " -> " << out_var_ptr->Name()
<< ". Debug String is: " << op->GetOp()->DebugString()
<< ". ReuseType: " << ReuseType();
// step (c): prevent right_var from being inplaced afterwards. This is done
// automatically since CollectReusedVars is called before any reuse.
// step (d): make right_var's generated op use addto
right_generated_op->GetOp()->SetAttr("use_addto", true);
// step (e): make grad_add skip running
op->SetSkipRunning(true);
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(inplace_addto_op_pass, paddle::framework::ir::InplaceAddToOpPass)
.RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList)
.RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars)
.RequirePassAttr(paddle::framework::ir::kUseCuda);
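For context, a minimal sketch of how this pass gets switched on, assuming only the BuildStrategy fields visible in the ParallelExecutor hunk below; everything else here is illustrative:
#include "paddle/fluid/framework/details/build_strategy.h"
// Hedged sketch, not a definitive walkthrough: enable_addto_ gates the
// inplace_addto_op_pass registered above (a no-op without CUDA), while
// enable_inplace_ gates buffer_shared_inplace_pass; both fields are the
// ones checked in ParallelExecutorPrivate::ApplyMemoryOptimizePass.
paddle::framework::details::BuildStrategy MakeAddtoBuildStrategy() {
  paddle::framework::details::BuildStrategy build_strategy;
  build_strategy.enable_addto_ = true;
  build_strategy.enable_inplace_ = true;
  return build_strategy;
}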
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
#include <functional> #include <functional>
#include <map> #include <map>
#include <string> #include <string>
...@@ -73,6 +74,7 @@ bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var, ...@@ -73,6 +74,7 @@ bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var,
out_var->Name())); out_var->Name()));
if (IsVarPairReusable(*in_var, *out_var)) { if (IsVarPairReusable(*in_var, *out_var)) {
AddReuseVar(op, in_var, out_var); AddReuseVar(op, in_var, out_var);
UpdateLastLiveOpOfVar(op, in_var, out_var);
return true; return true;
} else { } else {
return false; return false;
...@@ -324,7 +326,8 @@ bool MemoryReusePass::IsVarPairReusable( ...@@ -324,7 +326,8 @@ bool MemoryReusePass::IsVarPairReusable(
void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
details::VarHandle *in_var, details::VarHandle *in_var,
details::VarHandle *out_var) const { details::VarHandle *out_var,
bool share_dims) const {
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
(*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0, (*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0,
platform::errors::NotFound("Var(%s) does not in mem opt var infos.", platform::errors::NotFound("Var(%s) does not in mem opt var infos.",
...@@ -344,13 +347,15 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, ...@@ -344,13 +347,15 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
share_buffer_op->AddInput(in_var); share_buffer_op->AddInput(in_var);
} }
if (share_dims) {
share_buffer_op->SetShareDims(true);
}
share_buffer_op->AddReuseVarPair( share_buffer_op->AddReuseVarPair(
(*var_infos_)[op->GetScopeIdx()].at(in_var->Name()).get(), (*var_infos_)[op->GetScopeIdx()].at(in_var->Name()).get(),
out_var->Name()); out_var->Name());
reused_in_var_names_[op->GetScopeIdx()].insert(in_var->Name()); reused_in_var_names_[op->GetScopeIdx()].insert(in_var->Name());
reused_out_var_names_[op->GetScopeIdx()].insert(out_var->Name()); reused_out_var_names_[op->GetScopeIdx()].insert(out_var->Name());
UpdateLastLiveOpOfVar(op, in_var, out_var);
} }
// 1. Set last living op of in_var to be any last living op of out_var // 1. Set last living op of in_var to be any last living op of out_var
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
...@@ -92,6 +93,12 @@ class MemoryReusePass : public Pass { ...@@ -92,6 +93,12 @@ class MemoryReusePass : public Pass {
int64_t GetMemorySize(const details::VarHandle &var) const; int64_t GetMemorySize(const details::VarHandle &var) const;
void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var,
details::VarHandle *out_var, bool share_dims = false) const;
virtual void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const;
private: private:
VarDesc *GetVarDesc(const details::VarHandle &var) const; VarDesc *GetVarDesc(const details::VarHandle &var) const;
...@@ -109,13 +116,6 @@ class MemoryReusePass : public Pass { ...@@ -109,13 +116,6 @@ class MemoryReusePass : public Pass {
void CollectReusedVars() const; void CollectReusedVars() const;
void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var,
details::VarHandle *out_var) const;
void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const;
private: private:
mutable Graph *graph_; mutable Graph *graph_;
mutable bool use_cuda_; mutable bool use_cuda_;
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
#define MAX_NUM_FC 10 #define MAX_NUM_FC 10
...@@ -174,6 +175,10 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern, ...@@ -174,6 +175,10 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern,
if (x->outputs.size() <= 0 || x->inputs.size() <= 0U) { if (x->outputs.size() <= 0 || x->inputs.size() <= 0U) {
return false; return false;
} }
if (x->IsVar() && x->Var() && x->Var()->GetShape().size() > 2) {
LOG(WARNING) << "repeated fc relu only supports input dims = 2";
return false;
}
int fc_idx = FindFCIdx(x); int fc_idx = FindFCIdx(x);
if (fc_idx < 0) { if (fc_idx < 0) {
return false; return false;
...@@ -384,3 +389,8 @@ void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -384,3 +389,8 @@ void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(repeated_fc_relu_fuse_pass, REGISTER_PASS(repeated_fc_relu_fuse_pass,
paddle::framework::ir::RepeatedFCReluFusePass); paddle::framework::ir::RepeatedFCReluFusePass);
REGISTER_PASS_CAPABILITY(repeated_fc_relu_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("fc", 0)
.EQ("relu", 0));
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h" #include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -34,6 +35,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const { ...@@ -34,6 +35,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "shufflechannel_pattern"; const std::string pattern_name = "shufflechannel_pattern";
FusePassBase::Init(pattern_name, graph); FusePassBase::Init(pattern_name, graph);
LOG(WARNING) << "There is fluid.layers.shuffle_channel API already, you can "
"use it instead of (reshape + transpose +reshape)";
GraphPatternDetector gpd; GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern() auto* x = gpd.mutable_pattern()
->NewNode("x") ->NewNode("x")
...@@ -93,3 +96,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const { ...@@ -93,3 +96,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(shuffle_channel_detect_pass, REGISTER_PASS(shuffle_channel_detect_pass,
paddle::framework::ir::ShuffleChannelDetectPass); paddle::framework::ir::ShuffleChannelDetectPass);
REGISTER_PASS_CAPABILITY(shuffle_channel_detect_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("reshape2", 0)
.EQ("transpose2", 0));
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -77,7 +78,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, ...@@ -77,7 +78,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
}; };
auto is_fusion_input_var = [=](Node* x, const std::string& arg_name) { auto is_fusion_input_var = [=](Node* x, const std::string& arg_name) {
bool basic = var_is_op_input(x, "matmul", arg_name) && bool basic = (var_is_op_input(x, "matmul_v2", arg_name) ||
var_is_op_input(x, "matmul", arg_name)) &&
var_is_op_input(x, "square", "X"); var_is_op_input(x, "square", "X");
if (!basic) { if (!basic) {
return false; return false;
...@@ -88,7 +90,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, ...@@ -88,7 +90,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
} }
auto* squared_x = squared_x_op->outputs[0]; auto* squared_x = squared_x_op->outputs[0];
bool next_is_matmul_from_arg = bool next_is_matmul_from_arg =
var_is_op_input(squared_x, "matmul", arg_name) && (var_is_op_input(squared_x, "matmul_v2", arg_name) ||
var_is_op_input(squared_x, "matmul", arg_name)) &&
squared_x->outputs.size() == 1 && squared_x->outputs.size() == 1 &&
squared_x->outputs[0]->outputs.size() == 1; squared_x->outputs[0]->outputs.size() == 1;
if (!next_is_matmul_from_arg) { if (!next_is_matmul_from_arg) {
...@@ -103,7 +106,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, ...@@ -103,7 +106,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto is_fusion_first_mul_out = [=](Node* x) -> bool { auto is_fusion_first_mul_out = [=](Node* x) -> bool {
bool input_is_matmul_op = x && x->inputs.size() == 1 && bool input_is_matmul_op = x && x->inputs.size() == 1 &&
x->inputs[0]->IsOp() && x->inputs[0]->IsOp() &&
x->inputs[0]->Op()->Type() == "matmul"; (x->inputs[0]->Op()->Type() == "matmul_v2" ||
x->inputs[0]->Op()->Type() == "matmul");
if (!input_is_matmul_op) { if (!input_is_matmul_op) {
return false; return false;
} }
...@@ -167,7 +171,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, ...@@ -167,7 +171,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto* matmul_xy_op = pattern->NewNode( auto* matmul_xy_op = pattern->NewNode(
[=](Node* x) { [=](Node* x) {
return x && x->IsOp() && x->Op()->Type() == "matmul" && return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
x->Op()->Type() == "matmul") &&
is_fusion_first_mul_out(x->outputs[0]); is_fusion_first_mul_out(x->outputs[0]);
}, },
name_scope + "/matmul_xy_op"); name_scope + "/matmul_xy_op");
...@@ -189,7 +194,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, ...@@ -189,7 +194,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto is_fusion_mat_squared_x_y_op_out = [=](Node* x) -> bool { auto is_fusion_mat_squared_x_y_op_out = [=](Node* x) -> bool {
bool basic = x && x->IsVar() && x->inputs.size() == 1 && bool basic = x && x->IsVar() && x->inputs.size() == 1 &&
x->inputs[0]->IsOp() && x->inputs[0]->Op()->Type() == "matmul"; x->inputs[0]->IsOp() &&
(x->inputs[0]->Op()->Type() == "matmul_v2" ||
x->inputs[0]->Op()->Type() == "matmul");
if (!basic) { if (!basic) {
return false; return false;
} }
...@@ -206,7 +213,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, ...@@ -206,7 +213,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto* matmul_squared_x_y_op = pattern->NewNode( auto* matmul_squared_x_y_op = pattern->NewNode(
[=](Node* x) { [=](Node* x) {
return x && x->IsOp() && x->Op()->Type() == "matmul" && return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
x->Op()->Type() == "matmul") &&
is_fusion_mat_squared_x_y_op_out(x->outputs[0]); is_fusion_mat_squared_x_y_op_out(x->outputs[0]);
}, },
name_scope + "/matmul_squared_x_y_op"); name_scope + "/matmul_squared_x_y_op");
...@@ -378,3 +386,13 @@ void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -378,3 +386,13 @@ void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(squared_mat_sub_fuse_pass, REGISTER_PASS(squared_mat_sub_fuse_pass,
paddle::framework::ir::SquaredMatSubFusePass); paddle::framework::ir::SquaredMatSubFusePass);
REGISTER_PASS_CAPABILITY(squared_mat_sub_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("matmul", 0)
.EQ("matmul_v2", 0)
.EQ("square", 0)
.EQ("elementwise_mul", 0)
.EQ("elementwise_sub", 0)
.EQ("fill_constant", 0)
.EQ("fusion_squared_mat_sub", 0));
...@@ -24,7 +24,7 @@ namespace framework { ...@@ -24,7 +24,7 @@ namespace framework {
namespace ir { namespace ir {
/** /**
* Fuse ( (A.^2 * B.^2) - (A * B).^2 ) .* scalar * Fuse ( (A * B).^2 - (A.^2 * B.^2) ) .* scalar
*/ */
class SquaredMatSubFusePass : public FusePassBase { class SquaredMatSubFusePass : public FusePassBase {
public: public:
......
...@@ -157,6 +157,14 @@ class OperatorBase { ...@@ -157,6 +157,14 @@ class OperatorBase {
platform::errors::NotFound("(%s) is not found in AttributeMap.", name)); platform::errors::NotFound("(%s) is not found in AttributeMap.", name));
return BOOST_GET_CONST(T, attrs_.at(name)); return BOOST_GET_CONST(T, attrs_.at(name));
} }
void SetAttr(const std::string& name, const Attribute& v) {
PADDLE_ENFORCE_EQ(
HasAttr(name), true,
platform::errors::NotFound(
"The attribute %s is not found in operator %s", name, Type()));
attrs_[name] = v;
}
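// Added so passes can mutate attributes of an already-constructed operator;
// the inplace_addto_op_pass above uses exactly this to set
// "use_addto" = true on the conv2d_grad op.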
const AttributeMap& Attrs() const { return attrs_; } const AttributeMap& Attrs() const { return attrs_; }
const VariableNameMap& Inputs() const { return inputs_; } const VariableNameMap& Inputs() const { return inputs_; }
......
...@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and ...@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/details/async_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
...@@ -108,6 +110,11 @@ class ParallelExecutorPrivate { ...@@ -108,6 +110,11 @@ class ParallelExecutorPrivate {
* them. * them.
*/ */
inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) { inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) {
if (mem_opt_var_infos_.size() == 0) {
VLOG(4) << "The mem_opt_var_infos_ is empty, maybe no memory "
"optimization strategy is enabled";
return;
}
auto iter = mem_opt_var_infos_[scope_idx].find(name); auto iter = mem_opt_var_infos_[scope_idx].find(name);
if (iter != mem_opt_var_infos_[scope_idx].end()) { if (iter != mem_opt_var_infos_[scope_idx].end()) {
iter->second->SetSkipMemoryReuse(true); iter->second->SetSkipMemoryReuse(true);
...@@ -308,6 +315,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { ...@@ -308,6 +315,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
} }
bool need_mem_opt = build_strategy_.enable_inplace_ || bool need_mem_opt = build_strategy_.enable_inplace_ ||
build_strategy_.enable_addto_ ||
build_strategy_.memory_optimize_.get() || is_gc_enabled; build_strategy_.memory_optimize_.get() || is_gc_enabled;
if (!need_mem_opt) return graph; if (!need_mem_opt) return graph;
...@@ -320,6 +328,16 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { ...@@ -320,6 +328,16 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
graph = ref_cnt_pass->Apply(graph); graph = ref_cnt_pass->Apply(graph);
VLOG(10) << "ReferenceCountPass Applied"; VLOG(10) << "ReferenceCountPass Applied";
if (build_strategy_.enable_addto_) {
auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass");
addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
addto_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
VLOG(10) << "Start to apply inplace_addto_op_pass";
graph = addto_pass->Apply(graph);
VLOG(10) << "inplace_addto_op_pass Applied";
}
if (build_strategy_.enable_inplace_) { if (build_strategy_.enable_inplace_) {
auto inplace_pass = auto inplace_pass =
ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass"); ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass");
...@@ -1068,3 +1086,4 @@ USE_PASS(reference_count_pass); ...@@ -1068,3 +1086,4 @@ USE_PASS(reference_count_pass);
USE_PASS(eager_deletion_pass); USE_PASS(eager_deletion_pass);
USE_PASS(buffer_shared_inplace_pass); USE_PASS(buffer_shared_inplace_pass);
USE_PASS(buffer_shared_cross_op_memory_reuse_pass); USE_PASS(buffer_shared_cross_op_memory_reuse_pass);
USE_PASS(inplace_addto_op_pass);
...@@ -156,7 +156,8 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { ...@@ -156,7 +156,8 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// "seqpool_concat_fuse_pass", // // "seqpool_concat_fuse_pass", //
"seqpool_cvm_concat_fuse_pass", // "seqpool_cvm_concat_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", // // "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", // // TODO(wilber): fix correctness problem.
// "fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", // "mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", // "fc_gru_fuse_pass", //
"mul_gru_fuse_pass", // "mul_gru_fuse_pass", //
......
...@@ -80,10 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { ...@@ -80,10 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
nvinfer1::ILayer* layer = nullptr; nvinfer1::ILayer* layer = nullptr;
if (engine_->with_dynamic_shape()) { if (engine_->with_dynamic_shape()) {
plugin::DynamicPluginTensorRT* plugin = nullptr; auto use_fp16 = engine_->WithFp16();
plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>( auto plugin = new plugin::EmbEltwiseLayernormPluginDynamic(
input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
eps); eps, use_fp16);
layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
} else { } else {
PADDLE_THROW(platform::errors::Fatal( PADDLE_THROW(platform::errors::Fatal(
......
...@@ -32,13 +32,34 @@ namespace plugin { ...@@ -32,13 +32,34 @@ namespace plugin {
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
template <typename T> template <typename T>
int EmbEltwiseLayernormPluginDynamic<T>::initialize() { EmbEltwiseLayernormPluginDynamicImpl<
T>::~EmbEltwiseLayernormPluginDynamicImpl() {
this->terminate();
}
inline half fp32tofp16(float x) { return static_cast<half>(x); }
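// Used by initialize() below to down-convert the fp32 weight tables
// element-wise when the plugin is instantiated with T = half.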
template <typename T>
int EmbEltwiseLayernormPluginDynamicImpl<T>::initialize() {
embs_gpu_.resize(embs_.size()); embs_gpu_.resize(embs_.size());
for (int i = 0; i < embs_.size(); i++) { for (int i = 0; i < embs_.size(); i++) {
if (embs_[i]) { if (embs_[i]) {
cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]); T *host_ptr;
cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float), auto size = emb_sizes_[i];
if (std::is_same<T, half>::value) {
host_ptr = new T[size];
std::transform(embs_[i], (embs_[i] + size), host_ptr, fp32tofp16);
} else {
host_ptr = reinterpret_cast<T *>(embs_[i]);
}
cudaMalloc(&embs_gpu_[i], sizeof(T) * size);
cudaMemcpy(embs_gpu_[i], host_ptr, size * sizeof(T),
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice);
if (std::is_same<T, half>::value) {
delete[] host_ptr;
}
} }
} }
...@@ -53,11 +74,105 @@ int EmbEltwiseLayernormPluginDynamic<T>::initialize() { ...@@ -53,11 +74,105 @@ int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice);
} }
int input_num = embs_.size();
in_ptr_tensor_.Resize({input_num});
emb_ptr_tensor_.Resize({input_num});
cudaGetDevice(&device_id_);
auto emb_ptr_gpu_d =
emb_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
cudaMemcpy(emb_ptr_gpu_d, embs_gpu_.data(), sizeof(uintptr_t) * input_num,
cudaMemcpyHostToDevice);
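// emb_ptr_tensor_ stores the device addresses of the per-input embedding
// tables as an int64_t array living on the GPU, so the
// EmbEltwiseLayerNormFunctor launched in enqueue can pick the right table
// per input slot.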
return 0; return 0;
} }
template <typename T> template <typename T>
nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions( void EmbEltwiseLayernormPluginDynamicImpl<T>::terminate() {
for (size_t i = 0; i < embs_gpu_.size(); ++i) {
if (embs_gpu_[i]) {
cudaFree(embs_gpu_[i]);
embs_gpu_[i] = nullptr;
}
}
if (bias_gpu_) {
cudaFree(bias_gpu_);
bias_gpu_ = nullptr;
}
if (scale_gpu_) {
cudaFree(scale_gpu_);
scale_gpu_ = nullptr;
}
}
template <typename T>
int EmbEltwiseLayernormPluginDynamicImpl<T>::enqueue(
const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) {
auto id_dims = input_desc[0].dims;
int batch = id_dims.d[0];
int seq_len = id_dims.d[1];
int input_num = embs_.size();
auto in_ptr_gpu_d =
in_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
auto emb_ptr_gpu_d =
emb_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
auto new_input_ptr = reinterpret_cast<uintptr_t>(inputs[0]);
if (old_input_ptr_ != new_input_ptr) {
old_input_ptr_ = new_input_ptr;
cudaMemcpyAsync(in_ptr_gpu_d, reinterpret_cast<const void *>(inputs),
sizeof(uintptr_t) * input_num, cudaMemcpyHostToDevice,
stream);
}
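// Caching old_input_ptr_ skips re-uploading the pointer table when TensorRT
// hands back the same binding addresses between enqueues; presumably the
// common case, saving one small cudaMemcpyAsync per step.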
auto out_type = output_desc[0].type;
if (std::is_same<T, float>::value) {
PADDLE_ENFORCE_EQ(
out_type == nvinfer1::DataType::kFLOAT, true,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only support fp32 input."));
} else if (std::is_same<T, half>::value) {
PADDLE_ENFORCE_EQ(
out_type == nvinfer1::DataType::kHALF, true,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only support fp16 input."));
} else {
PADDLE_THROW(platform::errors::Fatal(
"Unsupport data type, the out type of EmbEltwiseLayernorm should be "
"float or half."));
}
auto *output_d = reinterpret_cast<T *>(outputs[0]);
operators::math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
eps_, input_num, stream);
return cudaGetLastError() != cudaSuccess;
}
template class EmbEltwiseLayernormPluginDynamicImpl<float>;
#ifdef SUPPORTS_CUDA_FP16
template class EmbEltwiseLayernormPluginDynamicImpl<half>;
#endif // SUPPORTS_CUDA_FP16
int EmbEltwiseLayernormPluginDynamic::initialize() {
impl_->initialize();
return 0;
}
void EmbEltwiseLayernormPluginDynamic::terminate() { impl_->terminate(); }
nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
nvinfer1::IExprBuilder &expr_builder) { // NOLINT nvinfer1::IExprBuilder &expr_builder) { // NOLINT
PADDLE_ENFORCE_EQ(output_index, 0, PADDLE_ENFORCE_EQ(output_index, 0,
...@@ -76,18 +191,7 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions( ...@@ -76,18 +191,7 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
return ret; return ret;
} }
template <typename T> bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination(
void EmbEltwiseLayernormPluginDynamic<T>::terminate() {
for (auto ptr : embs_gpu_) {
if (ptr) cudaFree(ptr);
}
if (bias_gpu_) cudaFree(bias_gpu_);
if (scale_gpu_) cudaFree(scale_gpu_);
}
template <typename T>
bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
int nb_outputs) { int nb_outputs) {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
...@@ -98,6 +202,11 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination( ...@@ -98,6 +202,11 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
"The EmbEltwiseLayerNorm's output should be one" "The EmbEltwiseLayerNorm's output should be one"
"but it's (%d) outputs.", "but it's (%d) outputs.",
nb_outputs)); nb_outputs));
PADDLE_ENFORCE_EQ(nb_outputs, 1,
platform::errors::InvalidArgument(
"The EmbEltwiseLayerNorm's output should be one"
"but it's (%d) outputs.",
nb_outputs));
PADDLE_ENFORCE_LT( PADDLE_ENFORCE_LT(
pos, nb_inputs + nb_outputs, pos, nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the " platform::errors::InvalidArgument("The pos(%d) should be less than the "
...@@ -122,7 +231,7 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination( ...@@ -122,7 +231,7 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
} }
if (pos == all_nums - 1) { if (pos == all_nums - 1) {
if (sizeof(T) == sizeof(float)) { if (with_fp16_ == false) {
return desc.type == nvinfer1::DataType::kFLOAT; return desc.type == nvinfer1::DataType::kFLOAT;
} else { } else {
return desc.type == nvinfer1::DataType::kHALF; return desc.type == nvinfer1::DataType::kHALF;
...@@ -131,84 +240,27 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination( ...@@ -131,84 +240,27 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
return false; return false;
} }
template <typename T> nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType(
nvinfer1::DataType EmbEltwiseLayernormPluginDynamic<T>::getOutputDataType(
int index, const nvinfer1::DataType *input_types, int nb_inputs) const { int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
index, 0, platform::errors::InvalidArgument( index, 0, platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only has one input, so the " "The EmbEltwiseLayernorm Plugin only has one input, so the "
"index value should be 0, but get %d.", "index value should be 0, but get %d.",
index)); index));
return nvinfer1::DataType::kFLOAT; if (with_fp16_)
return nvinfer1::DataType::kHALF;
else
return nvinfer1::DataType::kFLOAT;
} }
template <typename T> int EmbEltwiseLayernormPluginDynamic::enqueue(
int EmbEltwiseLayernormPluginDynamic<T>::enqueue(
const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) { void *const *outputs, void *workspace, cudaStream_t stream) {
auto id_dims = input_desc[0].dims; impl_->enqueue(input_desc, output_desc, inputs, outputs, workspace, stream);
int batch = id_dims.d[0];
int seq_len = id_dims.d[1];
int input_num = embs_.size();
framework::Tensor in_ptr_tensor, emb_ptr_tensor;
int device_id;
cudaGetDevice(&device_id);
in_ptr_tensor.Resize({input_num});
emb_ptr_tensor.Resize({input_num});
int64_t *in_ptr_gpu_d =
in_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
int64_t *emb_ptr_gpu_d =
emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
std::vector<uintptr_t> in_ptr, emb_ptr;
for (int i = 0; i < input_num; i++) {
in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
}
cudaMemcpyAsync(in_ptr_gpu_d, in_ptr.data(), sizeof(int64_t) * input_num,
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(emb_ptr_gpu_d, emb_ptr.data(), sizeof(int64_t) * input_num,
cudaMemcpyHostToDevice, stream);
auto out_type = output_desc[0].type;
const unsigned tpb = 256;
const dim3 grid(seq_len, batch, 1);
const dim3 block(tpb, 1, 1);
if (sizeof(T) == sizeof(float)) {
PADDLE_ENFORCE_EQ(
out_type == nvinfer1::DataType::kFLOAT, true,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only support fp32 input."));
} else if (sizeof(T) == sizeof(int16_t)) {
PADDLE_ENFORCE_EQ(
out_type == nvinfer1::DataType::kHALF, true,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only support fp16 input."));
} else {
PADDLE_THROW(platform::errors::Fatal(
"Unsupport data type, the out type of EmbEltwiseLayernorm should be "
"float or half."));
}
T *output_d = static_cast<T *>(outputs[0]);
operators::math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
eps_, input_num, stream);
return cudaGetLastError() != cudaSuccess; return cudaGetLastError() != cudaSuccess;
} }
template class EmbEltwiseLayernormPluginDynamic<float>;
#ifdef SUPPORTS_CUDA_FP16
template class EmbEltwiseLayernormPluginDynamic<half>;
#endif // SUPPORTS_CUDA_FP16
#endif #endif
} // namespace plugin } // namespace plugin
......
...@@ -27,14 +27,76 @@ namespace tensorrt { ...@@ -27,14 +27,76 @@ namespace tensorrt {
namespace plugin { namespace plugin {
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
class EmbEltwiseLayernormPluginDynamicImplBase {
public:
EmbEltwiseLayernormPluginDynamicImplBase() {}
virtual ~EmbEltwiseLayernormPluginDynamicImplBase() {}
virtual int initialize() = 0;
virtual void terminate() = 0;
virtual int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs,
void* workspace, cudaStream_t stream) = 0;
};
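// The non-template plugin below owns one of these Impl objects and selects
// float or half at construction time from with_fp16, which is what lets
// this diff drop the template parameter from the plugin class itself.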
template <typename T> template <typename T>
class EmbEltwiseLayernormPluginDynamicImpl
: public EmbEltwiseLayernormPluginDynamicImplBase {
public:
explicit EmbEltwiseLayernormPluginDynamicImpl(std::vector<float*> input_embs,
float* bias, float* scale,
std::vector<int> emb_sizes,
int bias_size, int scale_size,
int hidden_size, float eps)
: embs_(input_embs),
bias_(bias),
scale_(scale),
emb_sizes_(emb_sizes),
bias_size_(bias_size),
scale_size_(scale_size),
hidden_size_(hidden_size),
eps_(eps) {}
~EmbEltwiseLayernormPluginDynamicImpl();
int initialize();
void terminate();
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream);
private:
std::vector<float*> embs_;
float* bias_{nullptr};
float* scale_{nullptr};
// data on devices
float* bias_gpu_{nullptr};
float* scale_gpu_{nullptr};
std::vector<T*> embs_gpu_;
std::vector<int> emb_sizes_;
int bias_size_;
int scale_size_;
int hidden_size_;
float eps_;
framework::Tensor in_ptr_tensor_, emb_ptr_tensor_;
int device_id_{0};
uintptr_t old_input_ptr_{0};
};
class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
public: public:
explicit EmbEltwiseLayernormPluginDynamic(std::vector<float*> input_embs, explicit EmbEltwiseLayernormPluginDynamic(std::vector<float*> input_embs,
float* bias, float* scale, float* bias, float* scale,
std::vector<int> emb_sizes, std::vector<int> emb_sizes,
int bias_size, int scale_size, int bias_size, int scale_size,
int hidden_size, float eps) int hidden_size, float eps,
bool with_fp16)
: embs_(input_embs), : embs_(input_embs),
bias_(bias), bias_(bias),
scale_(scale), scale_(scale),
...@@ -42,51 +104,81 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { ...@@ -42,51 +104,81 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
bias_size_(bias_size), bias_size_(bias_size),
scale_size_(scale_size), scale_size_(scale_size),
hidden_size_(hidden_size), hidden_size_(hidden_size),
eps_(eps) {} eps_(eps),
with_fp16_(with_fp16),
own_host_buff_(false) {
if (with_fp16) {
#ifdef SUPPORTS_CUDA_FP16
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
#else
PADDLE_THROW(platform::errors::Fatal(
"Unsupported data type, current GPU doesn't support half."));
#endif // SUPPORTS_CUDA_FP16
} else {
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
}
}
EmbEltwiseLayernormPluginDynamic(void const* serial_data, EmbEltwiseLayernormPluginDynamic(void const* serial_data,
size_t serial_length) { size_t serial_length)
: own_host_buff_(true) {
DeserializeValue(&serial_data, &serial_length, &emb_sizes_); DeserializeValue(&serial_data, &serial_length, &emb_sizes_);
embs_gpu_.resize(emb_sizes_.size());
embs_.resize(emb_sizes_.size()); embs_.resize(emb_sizes_.size());
for (size_t i = 0; i < emb_sizes_.size(); i++) { for (size_t i = 0; i < emb_sizes_.size(); i++) {
cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]); auto size = emb_sizes_[i];
cudaMemcpy(embs_gpu_[i], serial_data, emb_sizes_[i] * sizeof(float), auto ptr = new float[size];
cudaMemcpyHostToDevice); memcpy(ptr, serial_data, sizeof(float) * size);
embs_[i] = ptr;
reinterpret_cast<char const*&>(serial_data) += reinterpret_cast<char const*&>(serial_data) +=
emb_sizes_[i] * sizeof(float); emb_sizes_[i] * sizeof(float);
serial_length -= emb_sizes_[i] * sizeof(float); serial_length -= emb_sizes_[i] * sizeof(float);
embs_[i] = nullptr;
} }
DeserializeValue(&serial_data, &serial_length, &bias_size_); DeserializeValue(&serial_data, &serial_length, &bias_size_);
DeserializeValue(&serial_data, &serial_length, &scale_size_); DeserializeValue(&serial_data, &serial_length, &scale_size_);
cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_); if (bias_size_) {
cudaMemcpy(bias_gpu_, serial_data, bias_size_ * sizeof(float), bias_ = new float[bias_size_];
cudaMemcpyHostToDevice); memcpy(bias_, serial_data, sizeof(float) * bias_size_);
bias_ = nullptr; }
reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float); reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float);
serial_length -= bias_size_ * sizeof(float); serial_length -= bias_size_ * sizeof(float);
cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_); if (scale_size_) {
cudaMemcpy(scale_gpu_, serial_data, scale_size_ * sizeof(float), scale_ = new float[scale_size_];
cudaMemcpyHostToDevice); memcpy(scale_, serial_data, sizeof(float) * scale_size_);
scale_ = nullptr; }
reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(float); reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(float);
serial_length -= scale_size_ * sizeof(float); serial_length -= scale_size_ * sizeof(float);
DeserializeValue(&serial_data, &serial_length, &hidden_size_); DeserializeValue(&serial_data, &serial_length, &hidden_size_);
DeserializeValue(&serial_data, &serial_length, &eps_); DeserializeValue(&serial_data, &serial_length, &eps_);
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
if (with_fp16_) {
#ifdef SUPPORTS_CUDA_FP16
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
#else
PADDLE_THROW(platform::errors::Fatal(
"Unsupported data type, current GPU doesn't support half."));
#endif // SUPPORTS_CUDA_FP16
} else {
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
}
} }
nvinfer1::IPluginV2DynamicExt* clone() const override { nvinfer1::IPluginV2DynamicExt* clone() const override {
auto ptr = new EmbEltwiseLayernormPluginDynamic( auto ptr = new EmbEltwiseLayernormPluginDynamic(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_, embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
eps_); eps_, with_fp16_);
ptr->embs_gpu_ = embs_gpu_;
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr; return ptr;
} }
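For reference, the deserializing constructor above and serialize() below must agree on the buffer layout field for field. As read from this diff (a summary, not an official format spec), the serialized plugin is laid out as:

    emb_sizes_                         vector<int>, via SerializeValue
    embs_[0] .. embs_[n-1] raw data    emb_sizes_[i] floats per table, back to back
    bias_size_, scale_size_            two ints
    bias_ data, scale_ data            bias_size_ / scale_size_ floats
    hidden_size_, eps_, with_fp16_     int, float, bool

Note that the constructor now copies everything into host buffers it owns (own_host_buff_ = true) and leaves GPU uploads to the impl object, replacing the old cudaMemcpy-at-deserialize scheme.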
...@@ -95,6 +187,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { ...@@ -95,6 +187,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
} }
int getNbOutputs() const override { return 1; } int getNbOutputs() const override { return 1; }
int initialize() override; int initialize() override;
void terminate() override;
size_t getSerializationSize() const override { size_t getSerializationSize() const override {
int sum_num = 0; int sum_num = 0;
...@@ -110,24 +203,32 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { ...@@ -110,24 +203,32 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
sum_num += (bias_size_ + scale_size_) * sizeof(float); sum_num += (bias_size_ + scale_size_) * sizeof(float);
sum_num += SerializedSize(hidden_size_); sum_num += SerializedSize(hidden_size_);
sum_num += SerializedSize(eps_); sum_num += SerializedSize(eps_);
// sum_num += SerializedSize(with_fp16_); sum_num += SerializedSize(with_fp16_);
return sum_num; return sum_num;
} }
void terminate() override;
void serialize(void* buffer) const override { void serialize(void* buffer) const override {
// SerializeValue(&buffer, with_fp16_);
SerializeValue(&buffer, emb_sizes_); SerializeValue(&buffer, emb_sizes_);
for (size_t i = 0; i < emb_sizes_.size(); i++) { for (size_t i = 0; i < emb_sizes_.size(); i++) {
SerializeCudaPointer(&buffer, embs_gpu_[i], emb_sizes_[i]); auto size = emb_sizes_[i];
for (int j = 0; j < size; ++j) {
SerializeValue(&buffer, embs_[i][j]);
}
} }
SerializeValue(&buffer, bias_size_); SerializeValue(&buffer, bias_size_);
SerializeValue(&buffer, scale_size_); SerializeValue(&buffer, scale_size_);
SerializeCudaPointer(&buffer, bias_gpu_, bias_size_); for (int i = 0; i < bias_size_; ++i) {
SerializeCudaPointer(&buffer, scale_gpu_, scale_size_); SerializeValue(&buffer, bias_[i]);
}
for (int i = 0; i < scale_size_; ++i) {
SerializeValue(&buffer, scale_[i]);
}
SerializeValue(&buffer, hidden_size_); SerializeValue(&buffer, hidden_size_);
SerializeValue(&buffer, eps_); SerializeValue(&buffer, eps_);
SerializeValue(&buffer, with_fp16_);
} }
nvinfer1::DimsExprs getOutputDimensions( nvinfer1::DimsExprs getOutputDimensions(
...@@ -158,23 +259,33 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { ...@@ -158,23 +259,33 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
const nvinfer1::DataType* input_types, const nvinfer1::DataType* input_types,
int nb_inputs) const override; int nb_inputs) const override;
void destroy() override { delete this; } void destroy() override {
if (own_host_buff_) {
for (auto ptr : embs_) {
delete[] ptr;
}
delete[] bias_;
delete[] scale_;
}
delete impl_;
delete this;
}
private: private:
std::vector<float*> embs_; std::vector<float*> embs_;
float* bias_; float* bias_;
float* scale_; float* scale_;
// data on devices
float* bias_gpu_;
float* scale_gpu_;
std::vector<float*> embs_gpu_;
std::vector<int> emb_sizes_; std::vector<int> emb_sizes_;
int bias_size_; int bias_size_;
int scale_size_; int scale_size_;
int hidden_size_; int hidden_size_;
float eps_; float eps_;
bool with_fp16_;
bool own_host_buff_{false};
EmbEltwiseLayernormPluginDynamicImplBase* impl_{nullptr};
}; };
class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
...@@ -198,8 +309,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { ...@@ -198,8 +309,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
nvinfer1::IPluginV2* deserializePlugin(const char* name, nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data, const void* serial_data,
size_t serial_length) override { size_t serial_length) override {
return new EmbEltwiseLayernormPluginDynamic<float>(serial_data, return new EmbEltwiseLayernormPluginDynamic(serial_data, serial_length);
serial_length);
} }
void setPluginNamespace(const char* lib_namespace) override { void setPluginNamespace(const char* lib_namespace) override {
......
...@@ -151,7 +151,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) { ...@@ -151,7 +151,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
run(config, &out_data); // serialize run(config, &out_data); // serialize
run(*config_deser, &out_data); // deserialize run(*config_deser, &out_data); // deserialize
for (size_t i = 0; i < out_data.size(); i++) { for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-6); EXPECT_NEAR(result[i], out_data[i], 1e-2);
} }
} }
...@@ -159,13 +159,11 @@ TEST(AnalysisPredictor, no_fp16) { ...@@ -159,13 +159,11 @@ TEST(AnalysisPredictor, no_fp16) {
std::vector<float> result = {0.597841, 0.219972, 0.182187}; std::vector<float> result = {0.597841, 0.219972, 0.182187};
trt_ernie(false, result); trt_ernie(false, result);
} }
TEST(AnalysisPredictor, fp16) {
#ifdef SUPPORTS_CUDA_FP16 #ifdef SUPPORTS_CUDA_FP16
std::vector<float> result = {0.598336, 0.219558, 0.182106}; TEST(AnalysisPredictor, fp16) {
std::vector<float> result = {0.59923654, 0.21923761, 0.18152587};
trt_ernie(true, result); trt_ernie(true, result);
#endif
} }
#endif // SUPPORTS_CUDA_FP16
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
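The refreshed fp16 reference values and the looser 1e-2 tolerance presumably go together: once the plugin genuinely executes in half precision, the serialize/deserialize runs can no longer be expected to match a float baseline at 1e-6.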
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
...@@ -287,7 +288,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -287,7 +288,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
#endif #endif
// ------------------- cudnn conv forward --------------------- // ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* workspace_ptr) { [&](void* workspace_ptr) {
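For context on the alpha/beta changes in this kernel file: the cuDNN convolution routines compute a blended result

    out = alpha * conv(in, filter) + beta * out

so with alpha fixed at 1, beta = 0 overwrites the output buffer while beta = 1 accumulates the fresh result into whatever the buffer already holds, which is exactly the inplace addto strategy that the new use_addto attribute selects.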
...@@ -609,9 +612,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -609,9 +612,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} }
// ------------------- cudnn conv backward data --------------------- // ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr<bool>("use_addto");
if (input_grad) { if (input_grad) {
// Because beta is zero, it is unnecessary to reset input_grad. // When beta is 0, it is unnecessary to reset input_grad.
// When beta is 1, the output cannot be reset since the addto strategy is used.
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) { [&](void* cudnn_workspace_ptr) {
...@@ -653,6 +660,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -653,6 +660,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
ctx, &transformed_input_grad_channel, input_grad); ctx, &transformed_input_grad_channel, input_grad);
} }
} }
// filter_grad does not use the inplace addto strategy.
ScalingParamType<T> beta_filter = 0.0f;
// ------------------- cudnn conv backward filter --------------------- // ------------------- cudnn conv backward filter ---------------------
if (filter_grad) { if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad. // Because beta is zero, it is unnecessary to reset filter_grad.
...@@ -665,7 +675,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -665,7 +675,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
input_data + i * group_offset_in, args2.odesc.desc(), input_data + i * group_offset_in, args2.odesc.desc(),
output_grad_data + i * group_offset_out, output_grad_data + i * group_offset_out,
args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr, args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
workspace_size, &beta, args2.wdesc.desc(), workspace_size, &beta_filter, args2.wdesc.desc(),
filter_grad_data + i * group_offset_filter)); filter_grad_data + i * group_offset_filter));
}, },
workspace_size); workspace_size);
...@@ -1017,7 +1027,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -1017,7 +1027,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
int group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = W->numel() / groups; int group_offset_filter = W->numel() / groups;
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
// NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
// 0.0f;
// VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); auto wkspace_handle = dev_ctx.cudnn_workspace_handle();
if (ddO) { if (ddO) {
......
...@@ -305,6 +305,11 @@ void Conv2DOpMaker::Make() { ...@@ -305,6 +305,11 @@ void Conv2DOpMaker::Make() {
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel") AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<bool>(
"use_addto",
"(bool, default false) If use addto strategy or not, only used in "
"cudnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_residual_connection", AddAttr<bool>("fuse_residual_connection",
"(bool, default false) Only used in mkldnn kernel. Used " "(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is as an input to residual " "whenever convolution output is as an input to residual "
...@@ -460,6 +465,11 @@ void Conv3DOpMaker::Make() { ...@@ -460,6 +465,11 @@ void Conv3DOpMaker::Make() {
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel") AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<bool>(
"use_addto",
"(bool, default false) If use addto strategy or not, only used in "
"cudnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_residual_connection", AddAttr<bool>("fuse_residual_connection",
"(bool, default false) Only used in mkldnn kernel. Used " "(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is as an input to residual " "whenever convolution output is as an input to residual "
......
...@@ -54,6 +54,8 @@ class ScopedRNNBase { ...@@ -54,6 +54,8 @@ class ScopedRNNBase {
x_descs_.emplace_back(x_desc_.descriptor<T>(dims_x, strides_x)); x_descs_.emplace_back(x_desc_.descriptor<T>(dims_x, strides_x));
y_descs_.emplace_back(y_desc_.descriptor<T>(dims_y, strides_y)); y_descs_.emplace_back(y_desc_.descriptor<T>(dims_y, strides_y));
} }
#if CUDNN_VERSION >= 7201
if (!sequence_length.empty()) { if (!sequence_length.empty()) {
x_seq_desc_.descriptor<T>(seq_length_, batch_size_, input_size_, true, x_seq_desc_.descriptor<T>(seq_length_, batch_size_, input_size_, true,
sequence_length); sequence_length);
...@@ -61,6 +63,7 @@ class ScopedRNNBase { ...@@ -61,6 +63,7 @@ class ScopedRNNBase {
hidden_size_ * numDirections, true, hidden_size_ * numDirections, true,
sequence_length); sequence_length);
} }
#endif
// ------------------- cudnn hx, hy, cx, cy descriptors---------- // ------------------- cudnn hx, hy, cx, cy descriptors----------
std::vector<int> dims_hx = {num_layers_ * numDirections, batch_size_, std::vector<int> dims_hx = {num_layers_ * numDirections, batch_size_,
...@@ -96,10 +99,13 @@ class ScopedRNNBase { ...@@ -96,10 +99,13 @@ class ScopedRNNBase {
is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
cudnn_type)); cudnn_type));
#endif #endif
#if CUDNN_VERSION >= 7201
if (!sequence_length.empty()) { if (!sequence_length.empty()) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode(
rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED)); rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED));
} }
#endif
// ------------------- cudnn weights_size --------------------- // ------------------- cudnn weights_size ---------------------
size_t weights_size_; size_t weights_size_;
...@@ -125,8 +131,10 @@ class ScopedRNNBase { ...@@ -125,8 +131,10 @@ class ScopedRNNBase {
} }
cudnnTensorDescriptor_t* x_descs() { return x_descs_.data(); } cudnnTensorDescriptor_t* x_descs() { return x_descs_.data(); }
cudnnTensorDescriptor_t* y_descs() { return y_descs_.data(); } cudnnTensorDescriptor_t* y_descs() { return y_descs_.data(); }
#if CUDNN_VERSION >= 7201
cudnnRNNDataDescriptor_t x_seq_desc() { return x_seq_desc_.desc(); } cudnnRNNDataDescriptor_t x_seq_desc() { return x_seq_desc_.desc(); }
cudnnRNNDataDescriptor_t y_seq_desc() { return y_seq_desc_.desc(); } cudnnRNNDataDescriptor_t y_seq_desc() { return y_seq_desc_.desc(); }
#endif
cudnnTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); } cudnnTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); }
cudnnTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); } cudnnTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); }
cudnnTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); } cudnnTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); }
...@@ -151,8 +159,10 @@ class ScopedRNNBase { ...@@ -151,8 +159,10 @@ class ScopedRNNBase {
platform::ScopedTensorDescriptor x_desc_; platform::ScopedTensorDescriptor x_desc_;
platform::ScopedTensorDescriptor y_desc_; platform::ScopedTensorDescriptor y_desc_;
#if CUDNN_VERSION >= 7201
platform::ScopedRNNTensorDescriptor x_seq_desc_; platform::ScopedRNNTensorDescriptor x_seq_desc_;
platform::ScopedRNNTensorDescriptor y_seq_desc_; platform::ScopedRNNTensorDescriptor y_seq_desc_;
#endif
platform::ScopedTensorDescriptor init_h_desc_; platform::ScopedTensorDescriptor init_h_desc_;
platform::ScopedTensorDescriptor init_c_desc_; platform::ScopedTensorDescriptor init_c_desc_;
platform::ScopedTensorDescriptor last_h_desc_; platform::ScopedTensorDescriptor last_h_desc_;
......
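A note on the version guards above: 7201 corresponds to cuDNN 7.2.1, the release that introduced cudnnRNNDataDescriptor, cudnnSetRNNPaddingMode and the *Ex forward/backward entry points (see the matching dynload grouping later in this diff). Guarding both the descriptor members and their uses means the variable-length (padded-IO) sequence path simply compiles away on older cuDNN, while the fixed-length path is untouched.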
...@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle { namespace paddle {
...@@ -129,3 +132,18 @@ REGISTER_OP_CPU_KERNEL( ...@@ -129,3 +132,18 @@ REGISTER_OP_CPU_KERNEL(
int>, int>,
ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext, ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
int64_t>); int64_t>);
// A specialized elementwise_add operator, used for gradient accumulation with
// inplace addto.
REGISTER_OPERATOR(
grad_add, paddle::operators::ElementwiseOp,
paddle::operators::ElementwiseAddOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
grad_add,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -111,3 +111,10 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -111,3 +111,10 @@ REGISTER_OP_CUDA_KERNEL(
ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int64_t>, ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext,
plat::float16>); plat::float16>);
REGISTER_OP_CUDA_KERNEL(
grad_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>);
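Both registrations reuse ElementwiseAddKernel, so grad_add(X, Y) computes exactly X + Y; what makes it a separate operator is the pair of EmptyGradOpMaker registrations above, which stop the framework from trying to differentiate through accumulation ops inserted into the graph during gradient aggregation.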
...@@ -174,7 +174,64 @@ struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> { ...@@ -174,7 +174,64 @@ struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
float>; float>;
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
auto* scale_data = scale.data<T>();
auto* in_data = in.data<T>();
auto* out_data = out->mutable_data<T>(ctx.GetPlace());
auto in_dims = in.dims();
const int64_t channel = in_dims[quant_axis];
platform::Transform<platform::CPUDeviceContext> trans;
if (quant_axis == 0) {
const int64_t channel_size = in.numel() / channel;
for (int i = 0; i < channel; i++) {
T s = scale_data[i];
auto* start = in_data + i * channel_size;
auto* end = in_data + (i + 1) * channel_size;
trans(ctx, start, end, out_data + i * channel_size,
ClipFunctor<T>(-s, s));
}
for (int i = 0; i < channel; i++) {
T s = scale_data[i];
T inv_s = inverse(s);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
out_e.device(*ctx.eigen_device()) =
(bin_cnt * inv_s * out_e).round() * s / static_cast<T>(bin_cnt);
}
} else if (quant_axis == 1) {
const int64_t step_i = in.numel() / in_dims[0];
const int64_t step_j = in.numel() / (in_dims[0] * in_dims[1]);
for (int i = 0; i < in_dims[0]; i++) {
for (int j = 0; j < in_dims[1]; j++) {
T s = scale_data[j];
T inv_s = inverse(s);
auto* start = in_data + i * step_i + j * step_j;
auto* end = in_data + i * step_i + (j + 1) * step_j;
auto* cur_out_data = out_data + i * step_i + j * step_j;
trans(ctx, start, end, cur_out_data, ClipFunctor<T>(-s, s));
for (int k = 0; k < step_j; k++) {
cur_out_data[k] = std::round(bin_cnt * inv_s * cur_out_data[k]) *
s / static_cast<T>(bin_cnt);
}
}
}
}
}
};
template struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext,
float>;
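A minimal NumPy sketch of what the functor above computes, useful for sanity-checking the kernels (shapes, names and the example data are illustrative, not taken from the patch; scales are assumed nonzero):

    import numpy as np

    def channel_clip_fake_quant_dequant(x, scale, bin_cnt, quant_axis=0):
        # Broadcast the per-channel scale along quant_axis.
        shape = [1] * x.ndim
        shape[quant_axis] = -1
        s = scale.reshape(shape)
        clipped = np.clip(x, -s, s)                           # clip to [-scale, scale]
        return np.round(bin_cnt * clipped / s) * s / bin_cnt  # quantize, then dequantize

    x = np.random.randn(4, 3, 3).astype(np.float32)   # e.g. (cout, kh, kw) weights
    scale = np.abs(x).reshape(4, -1).max(axis=1)      # per-channel abs-max, shape (4,)
    out = channel_clip_fake_quant_dequant(x, scale, bin_cnt=127)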
template <typename T> template <typename T>
struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> { struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx, void operator()(const platform::CPUDeviceContext& ctx,
...@@ -360,6 +417,75 @@ $$0 \leq c \lt \ the\ channel\ number\ of\ X$$ ...@@ -360,6 +417,75 @@ $$0 \leq c \lt \ the\ channel\ number\ of\ X$$
} }
}; };
class FakeChannelWiseQuantizeDequantizeAbsMaxOp
: public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
"FakeChannelWiseQuantizeDequantizeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
"FakeChannelWiseQuantizeDequantizeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
"FakeChannelWiseQuantizeDequantizeAbsMax");
int quant_axis = ctx->Attrs().Get<int>("quant_axis");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input is float data type.");
AddOutput("Out",
"(Tensor) Output of quantized and dequantized low level tensor, "
"saved as float data type.");
AddOutput("OutScale", "(Tensor) Current channel wise scale");
AddAttr<int>("quant_axis",
"(int, default 0) The axis for quantization. "
"For conv2d, depthwise_conv2d, conv2d_transpose "
"and mul, the quant_axis is equal to the cout axis.")
.SetDefault(0)
.AddCustomChecker([](const int& quant_axis) {
PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument(
"'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
});
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
platform::errors::InvalidArgument(
"'bit_length' should be between 1 and 16, but "
"the received is %d",
bit_length));
});
AddComment(R"DOC(
The scale of the FakeChannelWiseQuantizeDequantizeAbsMax operator is a vector.
In detail, each channel of the input X has a scale value.
$$scale_c = max(abs(X_c))$$
$$range = 2^{bit\_length - 1} - 1$$
$$Out_c = round(\frac{X_c * range} {scale_c}) * \frac{scale_c} {range}$$
In the three formulas above, the channel index c satisfies:
$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
)DOC");
}
};
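A quick numeric check of the DOC formulas above: with bit_length = 8, range = 2^7 - 1 = 127; a value x = 1.3 in a channel whose abs-max is scale_c = 2.0 becomes

    round(1.3 * 127 / 2.0) * 2.0 / 127 = round(82.55) * 2.0 / 127 = 83 * 2.0 / 127 ≈ 1.3071

so Out keeps float storage, but every value snaps onto that channel's 255-level quantization grid.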
class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel { class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
public: public:
FakeQuantizeRangeAbsMaxOp(const std::string& type, FakeQuantizeRangeAbsMaxOp(const std::string& type,
...@@ -666,3 +792,12 @@ REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale, ...@@ -666,3 +792,12 @@ REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale,
REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp); REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp);
REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad, REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad,
ops::FakeQuantDequantGradKernel<CPU, float>); ops::FakeQuantDequantGradKernel<CPU, float>);
REGISTER_OPERATOR(fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker,
ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CPU, float>);
...@@ -417,8 +417,90 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> { ...@@ -417,8 +417,90 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
} }
}; };
template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, // ChannelClipAndQuantDequantKernel for quant_axis is 0
float>; template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis0(
const T* in, const T* scale, const int bin_cnt, const int n, const int c,
T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
// ChannelClipAndQuantDequantKernel for quant_axis is 1
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis1(
const T* in, const T* scale, const int bin_cnt, const int n, const int cin,
const int cout, T* out) {
T s = scale[blockIdx.x % cout];
T inv_s = inverse(s);
int wh_size = n / (cin * cout);
const T* in_c = in + blockIdx.x * wh_size;
T* out_c = out + blockIdx.x * wh_size;
for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
// At present, channel-wise quantization supports conv2d, depthwise_conv2d,
// conv2d_transpose and mul.
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
int num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis0<
T><<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt,
num, in_dims[0], out_data);
} else if (quant_axis == 1) {
int grid = in_dims[0] * in_dims[1];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis1<
T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
}
}
};
template struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext,
float>;
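Launch-geometry note for the two kernels above: with quant_axis == 0 the grid has one block per output channel (grid = in_dims[0]) and each block strides over its channel_size = n / c elements; with quant_axis == 1 the grid covers every (dim0, dim1) pair (grid = in_dims[0] * in_dims[1]) and the per-channel scale is recovered with blockIdx.x % cout.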
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -443,3 +525,6 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -443,3 +525,6 @@ REGISTER_OP_CUDA_KERNEL(
ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float>); ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad, REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad,
ops::FakeQuantDequantGradKernel<CUDA, float>); ops::FakeQuantDequantGradKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(
fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CUDA, float>);
...@@ -72,6 +72,13 @@ struct ChannelClipAndFakeQuantFunctor { ...@@ -72,6 +72,13 @@ struct ChannelClipAndFakeQuantFunctor {
const int quant_axis, framework::Tensor* out); const int quant_axis, framework::Tensor* out);
}; };
template <typename DeviceContext, typename T>
struct ChannelClipFakeQuantDequantFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in,
const framework::Tensor& scale, const int bin_cnt,
const int quant_axis, framework::Tensor* out);
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct FindMovingAverageAbsMaxFunctor { struct FindMovingAverageAbsMaxFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum, void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
...@@ -154,6 +161,30 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> { ...@@ -154,6 +161,30 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
} }
}; };
template <typename DeviceContext, typename T>
class FakeChannelWiseQuantizeDequantizeAbsMaxKernel
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
out->mutable_data<T>(dev_ctx.GetPlace());
int bit_length = context.Attr<int>("bit_length");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
int quant_axis = context.Attr<int>("quant_axis");
FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
out_scale_data);
ChannelClipFakeQuantDequantFunctor<DeviceContext, T>()(
dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out);
}
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> { class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
public: public:
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_gru_op.h" #include "paddle/fluid/operators/fused/fusion_gru_op.h"
#include <cstring> // for memcpy #include <cstring> // for memcpy
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/fc.h"
......
...@@ -143,4 +143,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) ...@@ -143,4 +143,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker); REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>); rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -15,4 +15,5 @@ limitations under the License. */ ...@@ -15,4 +15,5 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>); rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -32,7 +32,6 @@ class TopkV2Op : public framework::OperatorWithKernel { ...@@ -32,7 +32,6 @@ class TopkV2Op : public framework::OperatorWithKernel {
auto input_dims = ctx->GetInputDim("X"); auto input_dims = ctx->GetInputDim("X");
const int& dim_size = input_dims.size(); const int& dim_size = input_dims.size();
const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
int axis = static_cast<int>(ctx->Attrs().Get<int>("axis")); int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true, PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true,
"the axis of topk" "the axis of topk"
...@@ -41,8 +40,18 @@ class TopkV2Op : public framework::OperatorWithKernel { ...@@ -41,8 +40,18 @@ class TopkV2Op : public framework::OperatorWithKernel {
if (axis < 0) axis += dim_size; if (axis < 0) axis += dim_size;
PADDLE_ENFORCE_GE( int k;
k, 1, "the attribute of k in the topk must >= 1, but received %d .", k); auto k_is_tensor = ctx->HasInput("K");
if (k_is_tensor) {
k = -1;
} else {
k = static_cast<int>(ctx->Attrs().Get<int>("k"));
PADDLE_ENFORCE_EQ(k >= 1, true,
"the attribute of k in the topk must >= 1 or be a "
"Tensor, but received %d .",
k);
}
PADDLE_ENFORCE_GE(input_dims.size(), 1, PADDLE_ENFORCE_GE(input_dims.size(), 1,
"input of topk must have >= 1d shape"); "input of topk must have >= 1d shape");
......
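The net effect of this hunk is that K may now arrive as a runtime Tensor input, in which case shape inference uses -1 as a placeholder; a hedged sketch of what that allows from Python (the paddle.topk / paddle.to_tensor spellings are assumed from the 2.0 API and do not appear in this diff):

    import paddle

    paddle.disable_static()                    # dygraph mode
    x = paddle.to_tensor([[1.0, 4.0, 2.0],
                          [3.0, 0.0, 5.0]])
    k = paddle.to_tensor([2])                  # k as a Tensor, not an int attribute
    values, indices = paddle.topk(x, k=k)      # top-2 along the last axis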
...@@ -294,6 +294,7 @@ class ScopedTensorDescriptor { ...@@ -294,6 +294,7 @@ class ScopedTensorDescriptor {
DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor);
}; };
#if CUDNN_VERSION >= 7201
class ScopedRNNTensorDescriptor { class ScopedRNNTensorDescriptor {
public: public:
ScopedRNNTensorDescriptor() { ScopedRNNTensorDescriptor() {
...@@ -337,6 +338,7 @@ class ScopedRNNTensorDescriptor { ...@@ -337,6 +338,7 @@ class ScopedRNNTensorDescriptor {
cudnnRNNDataDescriptor_t desc_; cudnnRNNDataDescriptor_t desc_;
DISABLE_COPY_AND_ASSIGN(ScopedRNNTensorDescriptor); DISABLE_COPY_AND_ASSIGN(ScopedRNNTensorDescriptor);
}; };
#endif
class ScopedDropoutDescriptor { class ScopedDropoutDescriptor {
public: public:
......
...@@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); ...@@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif #endif
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7 #ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif #endif
......
...@@ -101,9 +101,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -101,9 +101,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnDropoutGetStatesSize); \ __macro(cudnnDropoutGetStatesSize); \
__macro(cudnnSetDropoutDescriptor); \ __macro(cudnnSetDropoutDescriptor); \
__macro(cudnnRestoreDropoutDescriptor); \ __macro(cudnnRestoreDropoutDescriptor); \
__macro(cudnnCreateRNNDataDescriptor); \
__macro(cudnnDestroyRNNDataDescriptor); \
__macro(cudnnSetRNNDataDescriptor); \
__macro(cudnnCreateRNNDescriptor); \ __macro(cudnnCreateRNNDescriptor); \
__macro(cudnnGetRNNParamsSize); \ __macro(cudnnGetRNNParamsSize); \
__macro(cudnnGetRNNWorkspaceSize); \ __macro(cudnnGetRNNWorkspaceSize); \
...@@ -112,11 +109,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -112,11 +109,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnRNNBackwardData); \ __macro(cudnnRNNBackwardData); \
__macro(cudnnRNNBackwardWeights); \ __macro(cudnnRNNBackwardWeights); \
__macro(cudnnRNNForwardInference); \ __macro(cudnnRNNForwardInference); \
__macro(cudnnRNNForwardTrainingEx); \
__macro(cudnnSetRNNPaddingMode); \
__macro(cudnnRNNBackwardDataEx); \
__macro(cudnnRNNBackwardWeightsEx); \
__macro(cudnnRNNForwardInferenceEx); \
__macro(cudnnDestroyDropoutDescriptor); \ __macro(cudnnDestroyDropoutDescriptor); \
__macro(cudnnDestroyRNNDescriptor); \ __macro(cudnnDestroyRNNDescriptor); \
__macro(cudnnSetTensorNdDescriptorEx); __macro(cudnnSetTensorNdDescriptorEx);
...@@ -188,6 +180,19 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -188,6 +180,19 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
#if CUDNN_VERSION >= 7201
#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \
__macro(cudnnCreateRNNDataDescriptor); \
__macro(cudnnDestroyRNNDataDescriptor); \
__macro(cudnnSetRNNDataDescriptor); \
__macro(cudnnSetRNNPaddingMode); \
__macro(cudnnRNNForwardTrainingEx); \
__macro(cudnnRNNBackwardDataEx); \
__macro(cudnnRNNBackwardWeightsEx); \
__macro(cudnnRNNForwardInferenceEx);
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7401 #if CUDNN_VERSION >= 7401
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \ #define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \
__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \ __macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
......
...@@ -521,3 +521,18 @@ DEFINE_int32( ...@@ -521,3 +521,18 @@ DEFINE_int32(
DEFINE_bool(sort_sum_gradient, false, DEFINE_bool(sort_sum_gradient, false,
"Sum gradients by the reverse order of " "Sum gradients by the reverse order of "
"the forward execution sequence."); "the forward execution sequence.");
/**
* Performance related FLAG
* Name: max_inplace_grad_add
* Since Version: 2.0.0
* Value Range: int32, default=0
* Example:
* Note: The maximum number of inplace grad_add.
*/
DEFINE_int32(
max_inplace_grad_add, 0,
"The maximum number of inplace grad_add. When doing "
"gradient accumulation, if the number of gradients need to that "
"less FLAGS_max_inplace_grad_add, than it will be use several grad_add"
"instead of sum. Default is 0.");
...@@ -62,6 +62,7 @@ DECLARE_bool(use_system_allocator); ...@@ -62,6 +62,7 @@ DECLARE_bool(use_system_allocator);
// others // others
DECLARE_bool(benchmark); DECLARE_bool(benchmark);
DECLARE_int32(inner_op_parallelism); DECLARE_int32(inner_op_parallelism);
DECLARE_int32(max_inplace_grad_add);
DECLARE_string(tracer_profile_fname); DECLARE_string(tracer_profile_fname);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
// cudnn // cudnn
...@@ -348,7 +349,7 @@ static void RegisterGlobalVarGetterSetter() { ...@@ -348,7 +349,7 @@ static void RegisterGlobalVarGetterSetter() {
FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb, FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory, FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname, FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
FLAGS_paddle_num_threads, FLAGS_use_mkldnn); FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
REGISTER_PUBLIC_GLOBAL_VAR( REGISTER_PUBLIC_GLOBAL_VAR(
......
...@@ -111,6 +111,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = { ...@@ -111,6 +111,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"fake_quantize_dequantize_moving_average_abs_max", {"fake_quantize_dequantize_moving_average_abs_max",
{"Out", "OutScale", "OutAccum", "OutState"}}, {"Out", "OutScale", "OutAccum", "OutState"}},
{"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}}, {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}},
{"fake_channel_wise_quantize_dequantize_abs_max", {"Out", "OutScale"}},
{"check_finite_and_unscale", {"Out", "FoundInfinite"}}, {"check_finite_and_unscale", {"Out", "FoundInfinite"}},
{"update_loss_scaling", {"update_loss_scaling",
{"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}}, {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}},
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <Python.h> #include <Python.h>
#include <algorithm> #include <algorithm>
#include <cstdlib> #include <cstdlib>
#include <map> #include <map>
...@@ -22,6 +23,7 @@ limitations under the License. */ ...@@ -22,6 +23,7 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/feed_fetch_type.h"
...@@ -2528,6 +2530,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2528,6 +2530,10 @@ All parameter, weight, gradient are variables in Paddle.
"enable_inplace", "enable_inplace",
[](const BuildStrategy &self) { return self.enable_inplace_; }, [](const BuildStrategy &self) { return self.enable_inplace_; },
[](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
.def_property(
"enable_addto",
[](const BuildStrategy &self) { return self.enable_addto_; },
[](BuildStrategy &self, bool b) { self.enable_addto_ = b; })
.def_property( .def_property(
"fuse_all_reduce_ops", "fuse_all_reduce_ops",
[](const BuildStrategy &self) { [](const BuildStrategy &self) {
......
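With the pybind property above, the addto pass becomes switchable from Python; a minimal sketch using only names visible in this diff:

    import paddle.fluid as fluid

    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_addto = True    # property bound above; defaults to False
    print(build_strategy.enable_addto)    # True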
...@@ -121,6 +121,18 @@ function cmake_base() { ...@@ -121,6 +121,18 @@ function cmake_base() {
else else
exit 1 exit 1
fi fi
elif [ "$1" == "cp38-cp38" ]; then
if [ -d "/Library/Frameworks/Python.framework/Versions/3.8" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
export PATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.8/include/python3.8/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/libpython3.8.dylib"
pip3.8 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else
exit 1
fi
fi fi
# delete `gym` to avoid modifying requirements.txt in *.whl # delete `gym` to avoid modifying requirements.txt in *.whl
sed -i .bak "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt sed -i .bak "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt
...@@ -176,6 +188,13 @@ function cmake_base() { ...@@ -176,6 +188,13 @@ function cmake_base() {
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so" -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt
elif [ "$1" == "cp38-cp38" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH}
export PATH=/opt/_internal/cpython-3.8.0/bin/:${PATH}
export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.8.0/bin/python3.8
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.8.0/include/python3.8
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.8.0/lib/libpython3.so"
pip3.8 install -r ${PADDLE_ROOT}/python/requirements.txt
fi fi
else else
pip install -r ${PADDLE_ROOT}/python/requirements.txt pip install -r ${PADDLE_ROOT}/python/requirements.txt
...@@ -514,6 +533,8 @@ EOF ...@@ -514,6 +533,8 @@ EOF
pip3.6 uninstall -y paddlepaddle pip3.6 uninstall -y paddlepaddle
elif [ "$1" == "cp37-cp37m" ]; then elif [ "$1" == "cp37-cp37m" ]; then
pip3.7 uninstall -y paddlepaddle pip3.7 uninstall -y paddlepaddle
elif [ "$1" == "cp38-cp38" ]; then
pip3.8 uninstall -y paddlepaddle
fi fi
set -ex set -ex
...@@ -527,6 +548,8 @@ EOF ...@@ -527,6 +548,8 @@ EOF
pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
elif [ "$1" == "cp37-cp37m" ]; then elif [ "$1" == "cp37-cp37m" ]; then
pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
elif [ "$1" == "cp38-cp38" ]; then
pip3.8 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
fi fi
tmpfile_rand=`date +%s%N` tmpfile_rand=`date +%s%N`
tmpfile=$tmp_dir/$tmpfile_rand tmpfile=$tmp_dir/$tmpfile_rand
...@@ -666,7 +689,7 @@ function generate_api_spec() { ...@@ -666,7 +689,7 @@ function generate_api_spec() {
awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc
awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api
if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" ]; then
# Use sed to make python2 and python3 sepc keeps the same # Use sed to make python2 and python3 sepc keeps the same
sed -i 's/arg0: str/arg0: unicode/g' $spec_path sed -i 's/arg0: str/arg0: unicode/g' $spec_path
sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path
...@@ -1244,21 +1267,25 @@ EOF ...@@ -1244,21 +1267,25 @@ EOF
ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl
ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl
ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl
if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then
ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl
ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl
fi fi
#ref_paddle2_mv1="" #ref_paddle2_mv1=""
...@@ -1363,6 +1390,22 @@ EOF ...@@ -1363,6 +1390,22 @@ EOF
apt-get clean -y && \ apt-get clean -y && \
rm -f ${ref_paddle37} && \ rm -f ${ref_paddle37} && \
ldconfig ldconfig
EOF
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
# run paddle version to install python packages first
RUN apt-get update && ${NCCL_DEPS}
RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
xz-utils tk-dev libffi-dev liblzma-dev
RUN wget -q https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tgz && \
tar -xzf Python-3.8.0.tgz && cd Python-3.8.0 && \
CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.8.0.tgz
RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \
pip3.8 install opencv-python && wget ${ref_web}/${ref_paddle38} && pip3.8 install ${ref_paddle38_whl}; apt-get install -f -y && \
apt-get clean -y && \
rm -f ${ref_paddle38} && \
ldconfig
EOF EOF
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
# run paddle version to install python packages first # run paddle version to install python packages first
......
...@@ -42,6 +42,7 @@ server_num = fleet.server_num ...@@ -42,6 +42,7 @@ server_num = fleet.server_num
server_index = fleet.server_index server_index = fleet.server_index
server_endpoints = fleet.server_endpoints server_endpoints = fleet.server_endpoints
is_server = fleet.is_server is_server = fleet.is_server
set_util = fleet.set_util
util = fleet.util util = fleet.util
barrier_worker = fleet.barrier_worker barrier_worker = fleet.barrier_worker
init_worker = fleet.init_worker init_worker = fleet.init_worker
......
...@@ -180,6 +180,8 @@ class Fleet(object): ...@@ -180,6 +180,8 @@ class Fleet(object):
raise ValueError( raise ValueError(
"`role_maker` should be subclass of `RoleMakerBase`, but got {}". "`role_maker` should be subclass of `RoleMakerBase`, but got {}".
format(type(role_maker))) format(type(role_maker)))
self._role_maker._generate_role()
self.strategy_compiler = StrategyCompiler() self.strategy_compiler = StrategyCompiler()
if paddle.fluid.framework.in_dygraph_mode(): if paddle.fluid.framework.in_dygraph_mode():
if parallel_helper._is_parallel_ctx_initialized(): if parallel_helper._is_parallel_ctx_initialized():
...@@ -187,7 +189,6 @@ class Fleet(object): ...@@ -187,7 +189,6 @@ class Fleet(object):
"The dygraph parallel environment has been initialized.") "The dygraph parallel environment has been initialized.")
else: else:
paddle.distributed.init_parallel_env() paddle.distributed.init_parallel_env()
return None
def is_first_worker(self): def is_first_worker(self):
""" """
...@@ -206,7 +207,7 @@ class Fleet(object): ...@@ -206,7 +207,7 @@ class Fleet(object):
fleet.is_first_worker() fleet.is_first_worker()
""" """
return self._role_maker.is_first_worker() return self._role_maker._is_first_worker()
def worker_index(self): def worker_index(self):
""" """
...@@ -223,7 +224,7 @@ class Fleet(object): ...@@ -223,7 +224,7 @@ class Fleet(object):
fleet.worker_index() fleet.worker_index()
""" """
return self._role_maker.worker_index() return self._role_maker._worker_index()
def worker_num(self): def worker_num(self):
""" """
...@@ -240,7 +241,7 @@ class Fleet(object): ...@@ -240,7 +241,7 @@ class Fleet(object):
fleet.worker_num() fleet.worker_num()
""" """
return self._role_maker.worker_num() return self._role_maker._worker_num()
def is_worker(self): def is_worker(self):
""" """
...@@ -258,7 +259,7 @@ class Fleet(object): ...@@ -258,7 +259,7 @@ class Fleet(object):
fleet.is_worker() fleet.is_worker()
""" """
return self._role_maker.is_worker() return self._role_maker._is_worker()
def worker_endpoints(self, to_string=False): def worker_endpoints(self, to_string=False):
""" """
...@@ -275,13 +276,10 @@ class Fleet(object): ...@@ -275,13 +276,10 @@ class Fleet(object):
fleet.worker_endpoints() fleet.worker_endpoints()
""" """
'''
if to_string: if to_string:
return ",".join(self._role_maker.get_trainer_endpoints()) return ",".join(self._role_maker._get_trainer_endpoints())
else: else:
return self._role_maker.get_trainer_endpoints() return self._role_maker._get_trainer_endpoints()
'''
return ["127.0.0.1:1001", "127.0.0.1:1002"]
def server_num(self): def server_num(self):
""" """
...@@ -296,7 +294,7 @@ class Fleet(object): ...@@ -296,7 +294,7 @@ class Fleet(object):
fleet.init() fleet.init()
fleet.server_num() fleet.server_num()
""" """
return len(self._role_maker.get_pserver_endpoints()) return len(self._role_maker._get_pserver_endpoints())
def server_index(self): def server_index(self):
""" """
...@@ -313,7 +311,7 @@ class Fleet(object): ...@@ -313,7 +311,7 @@ class Fleet(object):
fleet.server_index() fleet.server_index()
""" """
return self._role_maker.server_index() return self._role_maker._server_index()
def server_endpoints(self, to_string=False): def server_endpoints(self, to_string=False):
""" """
...@@ -332,9 +330,9 @@ class Fleet(object): ...@@ -332,9 +330,9 @@ class Fleet(object):
""" """
if to_string: if to_string:
return ",".join(self._role_maker.get_pserver_endpoints()) return ",".join(self._role_maker._get_pserver_endpoints())
else: else:
return self._role_maker.get_pserver_endpoints() return self._role_maker._get_pserver_endpoints()
def is_server(self): def is_server(self):
""" """
...@@ -352,10 +350,12 @@ class Fleet(object): ...@@ -352,10 +350,12 @@ class Fleet(object):
fleet.is_server() fleet.is_server()
""" """
return self._role_maker.is_server( return self._role_maker._is_server(
) or self._role_maker._is_heter_worker() ) or self._role_maker._is_heter_worker()
@property def set_util(self, util):
self._util = util
def util(self): def util(self):
""" """
Utility functions that can be used under certain runtime Utility functions that can be used under certain runtime
...@@ -376,16 +376,6 @@ class Fleet(object): ...@@ -376,16 +376,6 @@ class Fleet(object):
""" """
return self._util return self._util
@util.setter
def util(self, util):
"""
Set Utility functions for userd-defined runtime
Returns:
None
"""
self._util = util
def barrier_worker(self): def barrier_worker(self):
""" """
barrier all workers barrier all workers
...@@ -393,7 +383,7 @@ class Fleet(object): ...@@ -393,7 +383,7 @@ class Fleet(object):
Returns: Returns:
None None
""" """
self._role_maker.barrier_worker() self._role_maker._barrier("worker")
@is_non_distributed_check @is_non_distributed_check
@inited_runtime_handler @inited_runtime_handler
......
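The `@util.setter` property above is replaced by an explicit `set_util()` method, and role generation now happens inside `fleet.init()` through `self._role_maker._generate_role()`. A minimal single-process sketch of the resulting public surface (names are taken from the export list above; the exact call sequence is an assumption, not part of this patch):

import paddle.distributed.fleet as fleet

fleet.init()               # _generate_role() is invoked here now
print(fleet.is_server())   # delegates to the private _is_server() role-maker API
print(fleet.worker_num())  # likewise _worker_num(); falls back to 1 when non-distributed
fleet.set_util(None)       # replaces the removed `fleet.util = ...` property setter; any UtilBase instance goes here
util = fleet.util()        # `util` is a plain accessor method after this change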
...@@ -13,18 +13,332 @@ ...@@ -13,18 +13,332 @@
# limitations under the License. # limitations under the License.
"""Defination of Role Makers.""" """Defination of Role Makers."""
import os import os
import time
import numpy as np import numpy as np
import warnings import warnings
from multiprocessing import Process, Manager from multiprocessing import Process, Manager
import paddle.fluid as fluid
#__all__ = ['UserDefinedRoleMaker', 'PaddleCloudRoleMaker'] import paddle.fluid as fluid
class Role: class Role:
WORKER = 1 WORKER = 1
SERVER = 2 SERVER = 2
HETER_WORKER = 3 HETER_WORKER = 3
ALL = 4
class Gloo(object):
"""
Gloo is a universal class for barrier and collective communication
"""
class RENDEZVOUS:
HDFS = 1
FILE = 2
HTTP = 3
def __init__(self):
self._worker_comm = None
self._server_comm = None
self._nodes_comm = None
self._comm_world = ["worker", "server", "all"]
self._err_init = "gloo is not initialized, will not communicator with other nodes"
self._err_type = "gloo initialized error, please check arguments"
self._err_world = "argument error, comm_world must in {}".format(
self._comm_world)
self._is_initialized = False
self._init_timeout_seconds = 3600
self._run_timeout_seconds = 9999999
self._rendezvous = None
self._role = None
self._iface = None
self._role_id = -1
self._worker_num = -1
self._server_num = -1
self._need_init_all = False
def init(self,
rendezvous,
role,
role_id,
worker_num,
server_num,
need_init_all=False,
kwargs=None):
if kwargs is None:  # guard: the default is None but kwargs is read via .get() below
kwargs = {}
self._rendezvous = rendezvous
self._role = role
self._role_id = role_id
self._worker_num = worker_num
self._server_num = server_num
self._need_init_all = need_init_all
self._iface = self.__get_default_iface()
self._prefix = kwargs.get("store.prefix", "")
if self._rendezvous == Gloo.RENDEZVOUS.HDFS:
dfs_name = kwargs.get("dfs.name", "")
dfs_ugi = kwargs.get("dfs.ugi", "")
dfs_path = kwargs.get("dfs.path", "")
if not dfs_name or not dfs_ugi or not dfs_path:
raise ValueError(self._err_type)
self._init_dfs(dfs_name, dfs_ugi, dfs_path, self._prefix)
elif self._rendezvous == Gloo.RENDEZVOUS.FILE:
fs_path = kwargs.get("dfs.path", "")
if not fs_path:
raise ValueError(self._err_type)
self._init_fs(fs_path, self._prefix)
elif self._rendezvous == Gloo.RENDEZVOUS.HTTP:
ip = kwargs.get("http.host", "")
port = kwargs.get("http.port", "")
if not ip or not port:
raise ValueError(self._err_type)
self._init_http(ip, port, self._prefix)
else:
raise ValueError(self._err_type)
self._is_initialized = True
def _init_fs(self, fs_path, prefix):
def init(rank, nodes, role):
gloo = fluid.core.Gloo()
gloo.set_rank(rank)
gloo.set_size(nodes)
gloo.set_prefix(prefix)
gloo.set_iface(self._iface)
gloo.set_timeout_seconds(self._init_timeout_seconds,
self._run_timeout_seconds)
gloo.set_hdfs_store(os.path.join(fs_path, role), "", "")
gloo.init()
return gloo
if self._role == Role.WORKER:
rank, nodes = self._get_rank_nodes(Role.WORKER)
gloo = init(rank, nodes, "WORKER")
self._worker_comm = gloo
else:
rank, nodes = self._get_rank_nodes(Role.SERVER)
gloo = init(rank, nodes, "SERVER")
self._server_comm = gloo
if self._need_init_all:
rank, nodes = self._get_rank_nodes(Role.ALL)
gloo = init(rank, nodes, "ALL")
self._nodes_comm = gloo
def _init_dfs(self, dfs_name, dfs_ugi, dfs_path, prefix):
def init(rank, nodes, role):
gloo = fluid.core.Gloo()
gloo.set_rank(rank)
gloo.set_size(nodes)
gloo.set_prefix(prefix)
gloo.set_iface(self._iface)
gloo.set_timeout_seconds(self._init_timeout_seconds,
self._run_timeout_seconds)
gloo.set_hdfs_store(os.path.join(dfs_path, role), dfs_name, dfs_ugi)
gloo.init()
return gloo
if self._role == Role.WORKER:
rank, nodes = self._get_rank_nodes(Role.WORKER)
gloo = init(rank, nodes, "WORKER")
self._worker_comm = gloo
else:
rank, nodes = self._get_rank_nodes(Role.SERVER)
gloo = init(rank, nodes, "SERVER")
self._server_comm = gloo
if self._need_init_all:
rank, nodes = self._get_rank_nodes(Role.ALL)
gloo = init(rank, nodes, "ALL")
self._nodes_comm = gloo
def _init_http(self, ip, port, prefix):
def __start_kv_server(http_server_d, size_d):
from paddle.distributed.fleet.utils.http_server import KVServer
http_server = KVServer(port, size_d)
http_server.start()
wait_seconds = 5
while http_server_d.get("running",
False) and not http_server.shoud_stop():
time.sleep(wait_seconds)
http_server.stop()
def init_kv_server():
size_d = {
"trainer": self._worker_num,
"pserver": self._server_num,
"all": self._worker_num + self._server_num
}
_http_server_d = {"running": True}
# child process for http server
_http_server = Process(
target=__start_kv_server, args=(_http_server_d, size_d))
_http_server.daemon = True
# set running status to True
# start child process
_http_server.start()
def init(rank, nodes, role):
gloo = fluid.core.Gloo()
gloo.set_rank(rank)
gloo.set_size(nodes)
gloo.set_prefix(prefix)
gloo.set_iface(self._iface)
gloo.set_timeout_seconds(self._init_timeout_seconds,
self._run_timeout_seconds)
gloo.set_http_store(ip, port, role)
gloo.init()
return gloo
port = int(port)
if self._role == Role.SERVER and self._role_id == 0:
init_kv_server()
if self._role == Role.WORKER:
rank, nodes = self._get_rank_nodes(Role.WORKER)
gloo = init(rank, nodes, "WORKER")
self._worker_comm = gloo
else:
rank, nodes = self._get_rank_nodes(Role.SERVER)
gloo = init(rank, nodes, "SERVER")
self._server_comm = gloo
if self._need_init_all:
rank, nodes = self._get_rank_nodes(Role.ALL)
gloo = init(rank, nodes, "ALL")
self._nodes_comm = gloo
def _get_rank_nodes(self, role):
nodes = 0
rank = -1
if role == Role.WORKER:
nodes = self._worker_num
rank = self._role_id
elif role == Role.SERVER:
nodes = self._server_num
rank = self._role_id
elif role == Role.ALL:
nodes = self._worker_num + self._server_num
if self._role == Role.WORKER:
rank = self._role_id
else:
rank = self._worker_num + self._role_id
else:
raise ValueError(self._err_type)
return rank, nodes
def __get_default_iface(self):
"""
get default physical interface
"""
default1 = self.__get_default_iface_from_gateway()
default2 = self.__get_default_iface_from_interfaces()
return default2 if default1 == "lo" else default1
def __get_default_iface_from_gateway(self):
"""
get default physical interface
"""
import netifaces
gateways = netifaces.gateways()
if gateways.get(netifaces.AF_INET) is not None:
gateway = gateways[netifaces.AF_INET]
if len(gateway) > 0 and len(gateway[0]) > 1:
return gateway[0][1]
return "lo"
def __get_default_iface_from_interfaces(self):
"""
get default physical interface
"""
import netifaces
for intf_name in netifaces.interfaces():
addresses = netifaces.ifaddresses(intf_name)
if netifaces.AF_INET in addresses:
ipv4_addresses = addresses[netifaces.AF_INET]
for ipv4_address in ipv4_addresses:
if 'broadcast' in ipv4_address:
return intf_name
return "lo"
def barrier(self, comm_world):
"""
barrier between all nodes of the given comm world
"""
if not self._is_initialized:
warnings.warn(self._err_init)
return
if comm_world not in self._comm_world:
raise ValueError(self._err_world)
if comm_world == "worker":
self._worker_comm.barrier()
elif comm_world == "server":
self._server_comm.barrier()
else:
self._nodes_comm.barrier()
def all_reduce(self, input, mode="sum", comm_world="worker"):
if not self._is_initialized:
warnings.warn(self._err_init)
return input
if comm_world not in self._comm_world:
raise ValueError(self._err_world)
input = np.array(input)
input_shape = input.shape
input_list = input.reshape(-1).tolist()
self.barrier(comm_world)
if comm_world == "worker":
ans = self._worker_comm.all_reduce(input_list, mode)
elif comm_world == "server":
ans = self._server_comm.all_reduce(input_list, mode)
else:
ans = self._nodes_comm.all_reduce(input_list, mode)
output = np.array(ans).reshape(input_shape)
return output
def all_gather(self, input, comm_world="worker"):
"""
all gather `input` between all nodes of the given comm world
Args:
input(any): object to all gather
"""
if not self._is_initialized:
warnings.warn(self._err_init)
return input
if comm_world not in self._comm_world:
raise ValueError(self._err_world)
if comm_world == "worker":
output = self._worker_comm.all_gather(input)
elif comm_world == "server":
output = self._server_comm.all_gather(input)
else:
output = self._nodes_comm.all_gather(input)
return output
class RoleMakerBase(object): class RoleMakerBase(object):
...@@ -47,23 +361,19 @@ class RoleMakerBase(object): ...@@ -47,23 +361,19 @@ class RoleMakerBase(object):
self._heter_trainer_device = "CPU" self._heter_trainer_device = "CPU"
self._is_heter_parameter_server_mode = False self._is_heter_parameter_server_mode = False
self._node_type = None def _is_worker(self):
self._node_type_comm = None
self._all_comm = None
def is_worker(self):
""" """
return is_worker() of current process return is_worker() of current process
""" """
raise NotImplementedError("Please implement this method in child class") raise NotImplementedError("Please implement this method in child class")
def is_server(self): def _is_server(self):
""" """
return is_server() of current process return is_server() of current process
""" """
raise NotImplementedError("Please implement this method in child class") raise NotImplementedError("Please implement this method in child class")
def is_first_worker(self): def _is_first_worker(self):
""" """
Check whether the node is the first instance of worker. Check whether the node is the first instance of worker.
Returns: Returns:
...@@ -72,7 +382,7 @@ class RoleMakerBase(object): ...@@ -72,7 +382,7 @@ class RoleMakerBase(object):
""" """
raise NotImplementedError("Please implement this method in child class") raise NotImplementedError("Please implement this method in child class")
def worker_num(self): def _worker_num(self):
""" """
Get current total worker number. Get current total worker number.
...@@ -81,7 +391,7 @@ class RoleMakerBase(object): ...@@ -81,7 +391,7 @@ class RoleMakerBase(object):
""" """
raise NotImplementedError("Please implement this method in child class") raise NotImplementedError("Please implement this method in child class")
def server_num(self): def _server_num(self):
""" """
Get current total server number. Get current total server number.
...@@ -90,7 +400,7 @@ class RoleMakerBase(object): ...@@ -90,7 +400,7 @@ class RoleMakerBase(object):
""" """
raise NotImplementedError("Please implement this method in child class") raise NotImplementedError("Please implement this method in child class")
def worker_index(self): def _worker_index(self):
""" """
Get current worker id. Get current worker id.
...@@ -99,7 +409,7 @@ class RoleMakerBase(object): ...@@ -99,7 +409,7 @@ class RoleMakerBase(object):
""" """
raise NotImplementedError("Please implement this method in child class") raise NotImplementedError("Please implement this method in child class")
def server_index(self): def _server_index(self):
""" """
Get current server id. Get current server id.
...@@ -108,7 +418,7 @@ class RoleMakerBase(object): ...@@ -108,7 +418,7 @@ class RoleMakerBase(object):
""" """
raise NotImplementedError("Please implement this method in child class") raise NotImplementedError("Please implement this method in child class")
def role_id(self): def _role_id(self):
""" """
Get current id. Get current id.
...@@ -117,7 +427,7 @@ class RoleMakerBase(object): ...@@ -117,7 +427,7 @@ class RoleMakerBase(object):
""" """
raise NotImplementedError("Please implement this method in child class") raise NotImplementedError("Please implement this method in child class")
def node_num(self): def _node_num(self):
""" """
Get the training node number Get the training node number
Returns: Returns:
...@@ -125,13 +435,13 @@ class RoleMakerBase(object): ...@@ -125,13 +435,13 @@ class RoleMakerBase(object):
""" """
raise NotImplementedError("Please implement this method in child class") raise NotImplementedError("Please implement this method in child class")
def get_trainer_endpoints(self): def _get_trainer_endpoints(self):
""" """
return trainer endpoints return trainer endpoints
""" """
return self._worker_endpoints return self._worker_endpoints
def get_pserver_endpoints(self): def _get_pserver_endpoints(self):
""" """
return pserver endpoints return pserver endpoints
""" """
...@@ -142,19 +452,11 @@ class RoleMakerBase(object): ...@@ -142,19 +452,11 @@ class RoleMakerBase(object):
self._role, self._current_id, self._worker_endpoints, self._role, self._current_id, self._worker_endpoints,
self._server_endpoints) self._server_endpoints)
def _all_gather(self, comm_world, input): def _all_gather(self, input, comm_world="worker"):
""" print("warning: RoleMakerBase does not have all gather worker.")
Args:
input(int|float): input value
Returns:
return a list of values
"""
print("warning: RoleMakerBase does not have all gather.")
return None return None
def _all_reduce(self, comm_world, input, mode="sum"): def _all_reduce(self, input, mode="sum", comm_world="worker"):
""" """
Args: Args:
input(list/numpy.array): array of one dim input(list/numpy.array): array of one dim
...@@ -221,158 +523,112 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -221,158 +523,112 @@ class PaddleCloudRoleMaker(RoleMakerBase):
def __init__(self, is_collective=False, **kwargs): def __init__(self, is_collective=False, **kwargs):
super(PaddleCloudRoleMaker, self).__init__() super(PaddleCloudRoleMaker, self).__init__()
self._is_collective = is_collective self._is_collective = is_collective
self._init_gloo = False # default no init gloo
self._kwargs = kwargs
self._non_distributed = False
self._kwargs = kwargs
self._role_is_generated = False self._role_is_generated = False
self._server_endpoints = None self._server_endpoints = None
self._worker_endpoints = None self._worker_endpoints = None
self._node_type_comm = None self._gloo = Gloo() # gloo instance
self._all_comm = None
self._non_distributed = False
if not self._is_collective:
self._hdfs_name = kwargs.get("hdfs_name", "")
self._hdfs_ugi = kwargs.get("hdfs_ugi", "")
self._hdfs_path = kwargs.get("path", "").rstrip("/")
self._init_timeout_seconds = kwargs.get("init_timeout_seconds",
3600)
self._run_timeout_seconds = kwargs.get("run_timeout_seconds",
9999999)
ip_port = kwargs.get("http_ip_port", "")
self._http_ip_port = []
self._http_server = None
# if ip_port is not empty, it will use http instead of hdfs
if ip_port != "":
self._http_ip_port = ip_port.split(":")
# it's for communication between processes
self._manager = Manager()
# global dict to store status
self._http_server_d = self._manager.dict()
# set running status of http server
self._http_server_d["running"] = False
self._iface = self.__get_default_iface()
# this environment variable can be empty
self._prefix = os.getenv("SYS_JOB_ID", "")
def _barrier(self, comm_world): def _barrier(self, comm_world):
if isinstance(comm_world, fluid.core.Gloo): self._gloo.barrier(comm_world)
comm_world.barrier()
else:
print("warning: must init Gloo before using _barrier() function")
def _all_gather(self, comm_world, input): def _all_gather(self, input, comm_world="worker"):
if isinstance(comm_world, fluid.core.Gloo): return self._gloo.all_gather(input, comm_world)
self._barrier(comm_world)
output = comm_world.all_gather(input)
return output
else:
print("warning: must init Gloo before using _all_gather() function")
return None
def _all_reduce(self, comm_world, input, mode="sum"): def _all_reduce(self, input, mode="sum", comm_world="worker"):
if isinstance(comm_world, fluid.core.Gloo): return self._gloo.all_reduce(input, mode, comm_world)
input = np.array(input) def _is_worker(self):
input_shape = input.shape
input_list = input.reshape(-1).tolist()
self._barrier(comm_world)
ans = comm_world.all_reduce(input_list, mode)
output = np.array(ans).reshape(input_shape)
return output
else:
print("warning: must init Gloo before using _all_reduce() function")
return None
def is_worker(self):
""" """
whether current process is worker whether current process is worker
""" """
if not self._role_is_generated: if not self._role_is_generated:
self.generate_role() self._generate_role()
return self._role == Role.WORKER return self._role == Role.WORKER
def is_server(self): def _is_server(self):
""" """
whether current process is server whether current process is server
""" """
if not self._role_is_generated: if not self._role_is_generated:
self.generate_role() self._generate_role()
return self._role == Role.SERVER return self._role == Role.SERVER
def is_first_worker(self): def _is_first_worker(self):
""" """
whether current process is worker of rank 0 whether current process is worker of rank 0
""" """
if not self._role_is_generated: if not self._role_is_generated:
self.generate_role() self._generate_role()
return self._role == Role.WORKER and self._current_id == 0 return self._role == Role.WORKER and self._current_id == 0
def worker_index(self): def _worker_index(self):
""" """
get index of current worker get index of current worker
""" """
if not self._role_is_generated: if not self._role_is_generated:
self.generate_role() self._generate_role()
return self._current_id return self._current_id
def server_index(self): def _server_index(self):
""" """
get index of current server get index of current server
""" """
if not self._role_is_generated: if not self._role_is_generated:
self.generate_role() self._generate_role()
return self._current_id return self._current_id
def role_id(self): def _role_id(self):
""" """
get index of current node get index of current node
""" """
if not self._role_is_generated:
self._generate_role()
return self._current_id return self._current_id
def worker_num(self): def _worker_num(self):
""" """
return the current number of workers return the current number of workers
""" """
if not self._role_is_generated: if not self._role_is_generated:
self.generate_role() self._generate_role()
return self._trainers_num return self._trainers_num
def server_num(self): def _server_num(self):
""" """
return the current number of server return the current number of server
""" """
if not self._role_is_generated: if not self._role_is_generated:
self.generate_role() self._generate_role()
return self._trainers_num return len(self._get_pserver_endpoints())
def node_num(self): def _node_num(self):
""" """
return the training node number return the training node number
""" """
if not self._role_is_generated: if not self._role_is_generated:
self.generate_role() self._generate_role()
return self._node_num return self._nodes_num
def get_trainer_endpoints(self): def _get_trainer_endpoints(self):
""" """
get endpoint of all trainers get endpoint of all trainers
""" """
if not self._role_is_generated: if not self._role_is_generated:
self.generate_role() self._generate_role()
return self._worker_endpoints return self._worker_endpoints
def get_pserver_endpoints(self): def _get_pserver_endpoints(self):
""" """
get endpoint of all pservers get endpoint of all pservers
""" """
if not self._role_is_generated: if not self._role_is_generated:
self.generate_role() self._generate_role()
return self._server_endpoints return self._server_endpoints
def _is_non_distributed(self): def _is_non_distributed(self):
...@@ -381,7 +637,7 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -381,7 +637,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
(use python-run to launch fleet-code directly) (use python-run to launch fleet-code directly)
""" """
if not self._role_is_generated: if not self._role_is_generated:
self.generate_role() self._generate_role()
return self._non_distributed return self._non_distributed
def _heter_worker_num(self): def _heter_worker_num(self):
...@@ -389,7 +645,7 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -389,7 +645,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
get heter worker nums get heter worker nums
""" """
if not self._role_is_generated: if not self._role_is_generated:
self.generate_role() self._generate_role()
return self._heter_trainers_num return self._heter_trainers_num
def _is_heter_worker(self): def _is_heter_worker(self):
...@@ -397,45 +653,35 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -397,45 +653,35 @@ class PaddleCloudRoleMaker(RoleMakerBase):
whether current process is heter worker whether current process is heter worker
""" """
if not self._role_is_generated: if not self._role_is_generated:
self.generate_role() self._generate_role()
return self._role == Role.HETER_WORKER return self._role == Role.HETER_WORKER
def _get_rank(self):
"""
get current rank in all workers and pservers
"""
if not self._role_is_generated:
self.generate_role()
return self._rank
def _get_size(self):
"""
get total num of all workers and pservers
"""
if not self._role_is_generated:
self.generate_role()
return self._size
def _ps_env(self): def _ps_env(self):
try: try:
# Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
# format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002 # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002
self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST") self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST")
self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
"").split(",")
if self._server_endpoints is None: if self._server_endpoints is None:
# back to non_distributed execution. # back to non_distributed execution.
self._server_endpoints = "" self._server_endpoints = ""
self._trainers_num = 1 self._trainers_num = 1
self._role = Role.WORKER self._role = Role.WORKER
self._current_id = 0 self._current_id = 0
self._node_num = 1 self._nodes_num = 1
self._heter_trainers_num = 0 self._heter_trainers_num = 0
self._heter_trainer_endpoints = None self._heter_trainer_endpoints = None
self._non_distributed = True self._non_distributed = True
return return
self._server_endpoints = self._server_endpoints.split(",") self._server_endpoints = self._server_endpoints.split(",")
self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
if self._worker_endpoints:
self._worker_endpoints = self._worker_endpoints.split(",")
else:
self._worker_endpoints = []
trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"]) trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
training_role = os.environ["TRAINING_ROLE"] training_role = os.environ["TRAINING_ROLE"]
...@@ -497,7 +743,7 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -497,7 +743,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
self._trainers_num = trainers_num self._trainers_num = trainers_num
self._role = role self._role = role
self._current_id = current_id self._current_id = current_id
self._node_num = len( self._nodes_num = len(
set([x.split(':')[0] for x in self._worker_endpoints])) set([x.split(':')[0] for x in self._worker_endpoints]))
self._heter_trainers_num = heter_trainers_num self._heter_trainers_num = heter_trainers_num
self._heter_trainer_endpoints = heter_trainer_eplist self._heter_trainer_endpoints = heter_trainer_eplist
...@@ -506,6 +752,7 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -506,6 +752,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
self._training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") self._training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
assert (self._training_role == "TRAINER") assert (self._training_role == "TRAINER")
self._role = Role.WORKER
self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS") self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
if self._worker_endpoints is None: if self._worker_endpoints is None:
...@@ -515,136 +762,79 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -515,136 +762,79 @@ class PaddleCloudRoleMaker(RoleMakerBase):
self._non_distributed = True self._non_distributed = True
self._worker_endpoints = self._worker_endpoints.split(",") self._worker_endpoints = self._worker_endpoints.split(",")
self._trainers_num = len(self._worker_endpoints) self._trainers_num = len(self._worker_endpoints)
self._node_num = len( self._nodes_num = len(
set([x.split(':')[0] for x in self._worker_endpoints])) set([x.split(':')[0] for x in self._worker_endpoints]))
def _init_gloo_env(self): def _gloo_init(self):
def init_gloo_instance(role="trainer"): # PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier
role = role.lower() use_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0"))
assert role in ["trainer", "pserver", "all"] if use_gloo not in [1, 2]:
if role == "trainer": return
all_list = self._worker_endpoints
rank = self._current_id # PADDLE_GLOO_RENDEZVOUS 1: HDFS 2: FILE 3: HTTP
elif role == "pserver": rendezvous_type = int(os.getenv("PADDLE_GLOO_RENDEZVOUS", "0"))
all_list = self._server_endpoints prefix = os.getenv("SYS_JOB_ID", "")
rank = self._current_id if rendezvous_type not in [
else: Gloo.RENDEZVOUS.HDFS, Gloo.RENDEZVOUS.HTTP, Gloo.RENDEZVOUS.FILE
all_list = self._worker_endpoints + self._server_endpoints ]:
rank = all_list.index(self._cur_endpoint) raise ValueError(self._gloo._err_type)
gloo = fluid.core.Gloo()
gloo.set_rank(rank) need_init_all = True if use_gloo == 2 else False
gloo.set_size(len(all_list))
gloo.set_prefix(self._prefix) if rendezvous_type == Gloo.RENDEZVOUS.HDFS:
gloo.set_iface(self._iface) dfs_name = os.getenv("PADDLE_GLOO_FS_NAME", "")
gloo.set_timeout_seconds(self._init_timeout_seconds, dfs_ugi = os.getenv("PADDLE_GLOO_FS_UGI", "")
self._run_timeout_seconds) dfs_path = os.getenv("PADDLE_GLOO_FS_PATH", "")
if len(self._http_ip_port) != 0: kwargs = {
gloo.set_http_store(self._http_ip_port[0], "dfs.name": dfs_name,
int(self._http_ip_port[1]), role) "dfs.ugi": dfs_ugi,
else: "dfs.path": dfs_path,
gloo.set_hdfs_store(self._hdfs_path + "/" + role, "store.prefix": prefix,
self._hdfs_name, self._hdfs_ugi) }
gloo.init() elif rendezvous_type == Gloo.RENDEZVOUS.HTTP:
return gloo ip = os.getenv("PADDLE_GLOO_HTTP_HOST", "")
port = os.getenv("PADDLE_GLOO_HTTP_PORT", "")
# paddlecloud support gloo kwargs = {
if self._role == Role.WORKER: "http.host": ip,
if self._current_id == 0 and len(self._http_ip_port) != 0: "http.port": port,
size_d = { "store.prefix": prefix,
"trainer": len(self._worker_endpoints), }
"pserver": len(self._server_endpoints),
"all":
len(self._worker_endpoints) + len(self._server_endpoints)
}
# child process for http server
self._http_server = Process(
target=self.__start_kv_server,
args=(self._http_server_d, size_d))
self._http_server.daemon = True
# set running status to True
self._http_server_d["running"] = True
# start child process
self._http_server.start()
self._node_type = 1
gloo = init_gloo_instance("trainer")
self._node_type_comm = gloo
else: else:
assert self._role == Role.SERVER dfs_path = os.getenv("PADDLE_GLOO_FS_PATH", "")
self._node_type = 0 kwargs = {
gloo = init_gloo_instance("pserver") "dfs.path": dfs_path,
self._node_type_comm = gloo "store.prefix": prefix,
}
all_list = self._worker_endpoints + self._server_endpoints
self._rank = all_list.index(self._cur_endpoint) if rendezvous_type == Gloo.RENDEZVOUS.HDFS:
self._size = len(all_list) type = "HDFS"
elif rendezvous_type == Gloo.RENDEZVOUS.HTTP:
gloo = init_gloo_instance("all") type = "HTTP"
self._all_comm = gloo else:
type = "FILE"
print("Gloo init with {}: need_init_all: {}, args: {}".format(
type, need_init_all, kwargs))
if self._http_server is not None: self._gloo.init(
# set running status to False rendezvous=rendezvous_type,
self._http_server_d["running"] = False role=self._role,
# wait until child process exits role_id=self._role_id(),
self._http_server.join() worker_num=self._worker_num(),
server_num=self._server_num(),
need_init_all=need_init_all,
kwargs=kwargs)
def generate_role(self): def _generate_role(self):
""" """
generate role for role maker generate role for role maker
""" """
if not self._role_is_generated: if not self._role_is_generated:
if not self._is_collective: if not self._is_collective:
self._ps_env() self._ps_env()
if "PADDLE_WITH_GLOO" in os.environ:
self._init_gloo = bool(os.environ["PADDLE_WITH_GLOO"])
if self._init_gloo:
self._init_gloo_env()
else: else:
self._collective_env() self._collective_env()
self._role_is_generated = True self._role_is_generated = True
self._gloo_init()
def __get_default_iface(self):
"""
get default physical interface
"""
default1 = self.__get_default_iface_from_gateway()
default2 = self.__get_default_iface_from_interfaces()
return default2 if default1 == "lo" else default1
def __get_default_iface_from_gateway(self):
"""
get default physical interface
"""
import netifaces
gateways = netifaces.gateways()
if gateways.get(netifaces.AF_INET) != None:
gateway = gateways[netifaces.AF_INET]
if len(gateway) > 0 and len(gateway[0]) > 1:
return gateway[0][1]
return "lo"
def __get_default_iface_from_interfaces(self):
"""
get default physical interface
"""
import netifaces
for intf_name in netifaces.interfaces():
addresses = netifaces.ifaddresses(intf_name)
if netifaces.AF_INET in addresses:
ipv4_addresses = addresses[netifaces.AF_INET]
for ipv4_address in ipv4_addresses:
if 'broadcast' in ipv4_address:
return intf_name
return "lo"
def __start_kv_server(self, http_server_d, size_d):
from paddle.distributed.fleet.utils.http_server import KVServer
http_server = KVServer(int(self._http_ip_port[1]), size_d)
http_server.start()
wait_seconds = 5
while http_server_d.get("running",
False) and not http_server.shoud_stop():
time.sleep(wait_seconds)
http_server.stop()
class UserDefinedRoleMaker(PaddleCloudRoleMaker): class UserDefinedRoleMaker(PaddleCloudRoleMaker):
...@@ -670,26 +860,24 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker): ...@@ -670,26 +860,24 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker):
self._cur_endpoint = self._worker_endpoints[self._current_id] self._cur_endpoint = self._worker_endpoints[self._current_id]
elif self._role == Role.SERVER: elif self._role == Role.SERVER:
self._cur_endpoint = self._server_endpoints[self._current_id] self._cur_endpoint = self._server_endpoints[self._current_id]
self._node_num = len( self._nodes_num = len(
set([x.split(':')[0] for x in self._worker_endpoints])) set([x.split(':')[0] for x in self._worker_endpoints]))
def _user_defined_collective_env(self): def _user_defined_collective_env(self):
self._worker_endpoints = self._kwargs.get("worker_endpoints") self._worker_endpoints = self._kwargs.get("worker_endpoints")
self._current_id = self._kwargs.get("current_id") self._current_id = self._kwargs.get("current_id")
self._trainers_num = len(self._worker_endpoints) self._trainers_num = len(self._worker_endpoints)
self._training_role = Role.Worker self._training_role = Role.WORKER
self._node_num = len( self._nodes_num = len(
set([x.split(':')[0] for x in self._worker_endpoints])) set([x.split(':')[0] for x in self._worker_endpoints]))
def generate_role(self): def _generate_role(self):
""" """
generate role for role maker generate role for role maker
""" """
if not self._role_is_generated: if not self._role_is_generated:
if not self._is_collective: if not self._is_collective:
self._user_defined_ps_env() self._user_defined_ps_env()
if self._init_gloo:
self._init_gloo_env()
else: else:
self._user_defined_collective_env() self._user_defined_collective_env()
self._role_is_generated = True self._role_is_generated = True
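Taken together, `PaddleCloudRoleMaker._generate_role()` now bootstraps Gloo purely from environment variables instead of constructor kwargs. A minimal sketch, assuming the variable names introduced in this patch and a writable rendezvous path:

import os

os.environ["PADDLE_WITH_GLOO"] = "1"        # 1: trainer barrier only, 2: also init the "all" comm world
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"  # 1: HDFS, 2: FILE, 3: HTTP
os.environ["PADDLE_GLOO_FS_PATH"] = "/tmp/gloo_rendezvous"  # hypothetical path

import paddle.distributed.fleet as fleet

fleet.init()            # _generate_role() -> _gloo_init() reads the variables above
fleet.barrier_worker()  # backed by the Gloo "worker" comm world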
...@@ -57,34 +57,7 @@ class UtilBase(object): ...@@ -57,34 +57,7 @@ class UtilBase(object):
), "fs_client must be the instance of paddle.distributed.fleet.utils.FS" ), "fs_client must be the instance of paddle.distributed.fleet.utils.FS"
self.fs_client = fs_client self.fs_client = fs_client
def __check_comm_world(self, comm_world="worker"): def all_reduce(self, input, mode="sum", comm_world="worker"):
if not self.role_maker._role_is_generated:
self.role_maker.generate_role()
_comm_world = None
comm_world_upper = comm_world.upper()
if comm_world_upper == "WORKER":
if not self.role_maker.is_worker():
print(
"warning: current role is not worker in collective_func(comm_world=\"worker\")"
)
_comm_world = self.role_maker._node_type_comm
elif comm_world_upper == "SERVER":
if not self.role_maker.is_server():
print(
"warning: current role is not server in collective_func(comm_world=\"server\")"
)
_comm_world = self.role_maker._node_type_comm
elif comm_world_upper == "ALL":
_comm_world = self.role_maker._all_comm
else:
raise ValueError(
"not support comm_world, please choose one from [worker, server, all]"
)
return _comm_world
def all_reduce(self, input, mode, comm_world="worker"):
""" """
All reduce `input` between specified collection. This is a distributed API. All reduce `input` between specified collection. This is a distributed API.
...@@ -130,8 +103,7 @@ class UtilBase(object): ...@@ -130,8 +103,7 @@ class UtilBase(object):
if __name__ == "__main__": if __name__ == "__main__":
train() train()
""" """
_comm_world = self.__check_comm_world(comm_world) return self.role_maker._all_reduce(input, mode, comm_world)
return self.role_maker._all_reduce(_comm_world, input, mode)
def barrier(self, comm_world="worker"): def barrier(self, comm_world="worker"):
""" """
...@@ -170,8 +142,7 @@ class UtilBase(object): ...@@ -170,8 +142,7 @@ class UtilBase(object):
if __name__ == "__main__": if __name__ == "__main__":
train() train()
""" """
_comm_world = self.__check_comm_world(comm_world) self.role_maker._barrier(comm_world)
self.role_maker._barrier(_comm_world)
def all_gather(self, input, comm_world="worker"): def all_gather(self, input, comm_world="worker"):
""" """
...@@ -219,8 +190,8 @@ class UtilBase(object): ...@@ -219,8 +190,8 @@ class UtilBase(object):
if __name__ == "__main__": if __name__ == "__main__":
train() train()
""" """
_comm_world = self.__check_comm_world(comm_world)
return self.role_maker._all_gather(_comm_world, input) return self.role_maker._all_gather(input, comm_world)
def _broadcast(self): def _broadcast(self):
pass pass
...@@ -266,8 +237,8 @@ class UtilBase(object): ...@@ -266,8 +237,8 @@ class UtilBase(object):
if not isinstance(files, list): if not isinstance(files, list):
raise TypeError("files should be a list of file need to be read.") raise TypeError("files should be a list of file need to be read.")
trainer_id = self.role_maker.worker_index() trainer_id = self.role_maker._worker_index()
trainers = self.role_maker.worker_num() trainers = self.role_maker._worker_num()
remainder = len(files) % trainers remainder = len(files) % trainers
blocksize = int(len(files) / trainers) blocksize = int(len(files) / trainers)
...@@ -309,7 +280,7 @@ class UtilBase(object): ...@@ -309,7 +280,7 @@ class UtilBase(object):
fleet_util._set_role_maker(role) fleet_util._set_role_maker(role)
fleet_util.print_on_rank("I'm worker 0", 0) fleet_util.print_on_rank("I'm worker 0", 0)
""" """
if self.role_maker.worker_index() != rank_id: if self.role_maker._worker_index() != rank_id:
return return
print(message) print(message)
......
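With `__check_comm_world` removed, the collective helpers now hand `comm_world` ("worker", "server" or "all") straight to the role maker. A hedged usage sketch, assuming a Gloo-enabled fleet configured as above:

import paddle.distributed.fleet as fleet

fleet.init()
util = fleet.util()
total = util.all_reduce(1, mode="sum", comm_world="worker")  # mode now defaults to "sum"
ranks = util.all_gather(fleet.worker_index(), comm_world="worker")
util.barrier(comm_world="worker")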
...@@ -55,7 +55,10 @@ launch a process on each of the given gpu card or cpu machine. ...@@ -55,7 +55,10 @@ launch a process on each of the given gpu card or cpu machine.
""" """
from __future__ import print_function from __future__ import print_function
import shutil
import sys import sys
import tempfile
from sys import version from sys import version
import subprocess import subprocess
import os import os
...@@ -213,12 +216,20 @@ def launch_collective(args): ...@@ -213,12 +216,20 @@ def launch_collective(args):
cluster, pod = get_cluster_from_args(args, gpus) cluster, pod = get_cluster_from_args(args, gpus)
logger.debug("get cluster from args:{}".format(cluster)) logger.debug("get cluster from args:{}".format(cluster))
global_envs = copy.copy(os.environ.copy())
gloo_rendezvous_dir = tempfile.mkdtemp()
# add gloo env
global_envs["PADDLE_WITH_GLOO"] = "1"
global_envs["PADDLE_GLOO_RENDEZVOUS"] = "2"
global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
procs = start_local_trainers( procs = start_local_trainers(
cluster, cluster,
pod, pod,
training_script=args.training_script, training_script=args.training_script,
training_script_args=args.training_script_args, training_script_args=args.training_script_args,
log_dir=args.log_dir) log_dir=args.log_dir,
envs=global_envs)
while True: while True:
alive = watch_local_trainers(procs, cluster.trainers_nranks()) alive = watch_local_trainers(procs, cluster.trainers_nranks())
...@@ -230,6 +241,9 @@ def launch_collective(args): ...@@ -230,6 +241,9 @@ def launch_collective(args):
time.sleep(3) time.sleep(3)
if os.path.exists(gloo_rendezvous_dir):
shutil.rmtree(gloo_rendezvous_dir)
def launch_ps(args): def launch_ps(args):
ports = None ports = None
...@@ -315,6 +329,13 @@ def launch_ps(args): ...@@ -315,6 +329,13 @@ def launch_ps(args):
default_env = os.environ.copy() default_env = os.environ.copy()
current_env = copy.copy(default_env) current_env = copy.copy(default_env)
gloo_rendezvous_dir = tempfile.mkdtemp()
# add gloo env
current_env["PADDLE_WITH_GLOO"] = "1"
current_env["PADDLE_GLOO_RENDEZVOUS"] = "2"
current_env["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
current_env.pop("http_proxy", None) current_env.pop("http_proxy", None)
current_env.pop("https_proxy", None) current_env.pop("https_proxy", None)
procs = [] procs = []
...@@ -419,6 +440,9 @@ def launch_ps(args): ...@@ -419,6 +440,9 @@ def launch_ps(args):
procs[i].proc.terminate() procs[i].proc.terminate()
print("all parameter server are killed", file=sys.stderr) print("all parameter server are killed", file=sys.stderr)
if os.path.exists(gloo_rendezvous_dir):
shutil.rmtree(gloo_rendezvous_dir)
def launch(): def launch():
args = _parse_args() args = _parse_args()
......
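Both launch paths now seed each child process with a FILE-rendezvous Gloo configuration rooted in a throwaway temp directory and delete it on exit; a condensed sketch of that plumbing:

import os
import shutil
import tempfile

gloo_rendezvous_dir = tempfile.mkdtemp()
global_envs = dict(os.environ)
global_envs["PADDLE_WITH_GLOO"] = "1"
global_envs["PADDLE_GLOO_RENDEZVOUS"] = "2"       # 2 == Gloo.RENDEZVOUS.FILE
global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir

# ... spawn trainer/pserver subprocesses with env=global_envs ...

if os.path.exists(gloo_rendezvous_dir):
    shutil.rmtree(gloo_rendezvous_dir)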
...@@ -398,8 +398,14 @@ def start_local_trainers(cluster, ...@@ -398,8 +398,14 @@ def start_local_trainers(cluster,
pod, pod,
training_script, training_script,
training_script_args, training_script_args,
log_dir=None): log_dir=None,
current_env = copy.copy(os.environ.copy()) envs=None):
if envs is None:
current_env = copy.copy(os.environ.copy())
else:
current_env = copy.copy(envs)
#paddle broadcast ncclUniqueId use socket, and #paddle broadcast ncclUniqueId use socket, and
#proxy maybe make trainers unreachable, so delete them. #proxy maybe make trainers unreachable, so delete them.
#if we set them to "", grpc will log error message "bad uri" #if we set them to "", grpc will log error message "bad uri"
......
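A hypothetical call site for the extended signature (import path assumed); passing `envs=None` preserves the old copy-of-`os.environ` behavior:

from paddle.distributed.utils import start_local_trainers  # assumed import path

def run_trainers(cluster, pod, global_envs):
    # global_envs would be the gloo-seeded dict built by launch_collective above
    return start_local_trainers(
        cluster,
        pod,
        training_script="train.py",  # hypothetical script
        training_script_args=[],
        log_dir="log",
        envs=global_envs)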
...@@ -57,12 +57,12 @@ class CollectiveHelper(object): ...@@ -57,12 +57,12 @@ class CollectiveHelper(object):
if startup_program is None: if startup_program is None:
self.startup_program = fluid.default_startup_program() self.startup_program = fluid.default_startup_program()
endpoints = self.role_maker.get_trainer_endpoints() endpoints = self.role_maker._get_trainer_endpoints()
current_endpoint = endpoints[self.role_maker.worker_index()] current_endpoint = endpoints[self.role_maker._worker_index()]
for ring_id in range(self.nrings): for ring_id in range(self.nrings):
self._init_communicator( self._init_communicator(
self.startup_program, current_endpoint, endpoints, self.startup_program, current_endpoint, endpoints,
self.role_maker.worker_index(), ring_id, self.wait_port) self.role_maker._worker_index(), ring_id, self.wait_port)
self._broadcast_params() self._broadcast_params()
def _init_communicator(self, program, current_endpoint, endpoints, rank, def _init_communicator(self, program, current_endpoint, endpoints, rank,
......
...@@ -47,7 +47,7 @@ class DGCOptimizer(MetaOptimizerBase): ...@@ -47,7 +47,7 @@ class DGCOptimizer(MetaOptimizerBase):
sparsity=configs['sparsity'], sparsity=configs['sparsity'],
parameter_list=opt._parameter_list, parameter_list=opt._parameter_list,
use_nesterov=opt._use_nesterov, use_nesterov=opt._use_nesterov,
num_trainers=self.role_maker.worker_num(), num_trainers=self.role_maker._worker_num(),
regularization=opt.regularization, regularization=opt.regularization,
grad_clip=opt._grad_clip, grad_clip=opt._grad_clip,
name=opt._name) name=opt._name)
...@@ -60,7 +60,7 @@ class DGCOptimizer(MetaOptimizerBase): ...@@ -60,7 +60,7 @@ class DGCOptimizer(MetaOptimizerBase):
if not isinstance(self.inner_opt, Momentum): if not isinstance(self.inner_opt, Momentum):
logging.warn("dgc only works on Momentum optimizer") logging.warn("dgc only works on Momentum optimizer")
return False return False
if self.role_maker.worker_num() <= 1: if self.role_maker._worker_num() <= 1:
logging.warn("dgc only works on multi cards") logging.warn("dgc only works on multi cards")
return False return False
......
...@@ -50,12 +50,12 @@ class GraphExecutionOptimizer(MetaOptimizerBase): ...@@ -50,12 +50,12 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
# should fix the variable # should fix the variable
def _setup_nccl_op(self, startup_program, main_program, build_strategy): def _setup_nccl_op(self, startup_program, main_program, build_strategy):
trainer_endpoints = self.role_maker.get_trainer_endpoints() trainer_endpoints = self.role_maker._get_trainer_endpoints()
trainers = trainer_endpoints trainers = trainer_endpoints
trainer_id = self.role_maker.worker_index() trainer_id = self.role_maker._worker_index()
current_endpoint = self.role_maker.get_trainer_endpoints()[trainer_id] current_endpoint = self.role_maker._get_trainer_endpoints()[trainer_id]
trainer_endpoints_env = ",".join(trainer_endpoints) trainer_endpoints_env = ",".join(trainer_endpoints)
trainers_num = self.role_maker.worker_num() trainers_num = self.role_maker._worker_num()
nccl_id_var = startup_program.global_block().create_var( nccl_id_var = startup_program.global_block().create_var(
name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
for i in range(1, build_strategy.nccl_comm_num): for i in range(1, build_strategy.nccl_comm_num):
...@@ -127,8 +127,8 @@ class GraphExecutionOptimizer(MetaOptimizerBase): ...@@ -127,8 +127,8 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
local_build_strategy.enable_sequential_execution = True local_build_strategy.enable_sequential_execution = True
exe_strategy = self.user_defined_strategy.execution_strategy exe_strategy = self.user_defined_strategy.execution_strategy
worker_num = self.role_maker.worker_num() worker_num = self.role_maker._worker_num()
node_num = self.role_maker.node_num() node_num = self.role_maker._node_num()
if self.role_maker._is_collective: if self.role_maker._is_collective:
assert worker_num >= 1, "nccl2 worker_num must >= 1, now:{}" % worker_num assert worker_num >= 1, "nccl2 worker_num must >= 1, now:{}" % worker_num
...@@ -170,9 +170,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase): ...@@ -170,9 +170,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
# TODO(guru4elephant): should be an independent optimizer # TODO(guru4elephant): should be an independent optimizer
self._setup_nccl_op(startup_program, main_program, local_build_strategy) self._setup_nccl_op(startup_program, main_program, local_build_strategy)
local_build_strategy.num_trainers = self.role_maker.worker_num() local_build_strategy.num_trainers = self.role_maker._worker_num()
local_build_strategy.trainer_id = self.role_maker.worker_index() local_build_strategy.trainer_id = self.role_maker._worker_index()
local_build_strategy.trainers_endpoints = self.role_maker.get_trainer_endpoints( local_build_strategy.trainers_endpoints = self.role_maker._get_trainer_endpoints(
) )
local_build_strategy.enable_backward_optimizer_op_deps = True local_build_strategy.enable_backward_optimizer_op_deps = True
......
...@@ -38,7 +38,7 @@ class LocalSGDOptimizer(MetaOptimizerBase): ...@@ -38,7 +38,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
if not self.user_defined_strategy.localsgd: if not self.user_defined_strategy.localsgd:
return False return False
if self.role_maker.worker_num() <= 1: if self.role_maker._worker_num() <= 1:
return False return False
return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \ return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
...@@ -168,7 +168,7 @@ class LocalSGDOptimizer(MetaOptimizerBase): ...@@ -168,7 +168,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
inputs={'X': [param]}, inputs={'X': [param]},
outputs={'Out': [param]}, outputs={'Out': [param]},
attrs={ attrs={
'scale': 1.0 / self.role_maker.worker_num(), 'scale': 1.0 / self.role_maker._worker_num(),
OP_ROLE_KEY: OpRole.Optimize OP_ROLE_KEY: OpRole.Optimize
}) })
sub_block.append_op( sub_block.append_op(
...@@ -208,7 +208,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase): ...@@ -208,7 +208,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
if not self.user_defined_strategy.adaptive_localsgd: if not self.user_defined_strategy.adaptive_localsgd:
return False return False
if self.role_maker.worker_num() <= 1: if self.role_maker._worker_num() <= 1:
return False return False
return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \ return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
...@@ -275,7 +275,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase): ...@@ -275,7 +275,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
inputs={'X': [avg_loss]}, inputs={'X': [avg_loss]},
outputs={'Out': [avg_loss]}, outputs={'Out': [avg_loss]},
attrs={ attrs={
'scale': 1.0 / self.role_maker.worker_num(), 'scale': 1.0 / self.role_maker._worker_num(),
OP_ROLE_KEY: OpRole.Optimize OP_ROLE_KEY: OpRole.Optimize
}) })
...@@ -398,7 +398,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase): ...@@ -398,7 +398,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
inputs={'X': [param]}, inputs={'X': [param]},
outputs={'Out': [param]}, outputs={'Out': [param]},
attrs={ attrs={
'scale': 1.0 / self.role_maker.worker_num(), 'scale': 1.0 / self.role_maker._worker_num(),
OP_ROLE_KEY: OpRole.Optimize OP_ROLE_KEY: OpRole.Optimize
}) })
sub_block.append_op( sub_block.append_op(
......
...@@ -31,7 +31,7 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer): ...@@ -31,7 +31,7 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer):
if k_steps < 0: if k_steps < 0:
return False return False
if self.role_maker.is_server(): if self.role_maker._is_server():
return False return False
if self.role_maker._is_heter_parameter_server_mode: if self.role_maker._is_heter_parameter_server_mode:
......
...@@ -239,10 +239,10 @@ class ParameterServerOptimizer(MetaOptimizerBase): ...@@ -239,10 +239,10 @@ class ParameterServerOptimizer(MetaOptimizerBase):
strategy, self.role_maker) strategy, self.role_maker)
compiled_config.strategy = strategy compiled_config.strategy = strategy
if self.role_maker.is_worker() or self.role_maker._is_heter_worker(): if self.role_maker._is_worker() or self.role_maker._is_heter_worker():
main_program, startup_program = self._build_trainer_programs( main_program, startup_program = self._build_trainer_programs(
compiled_config) compiled_config)
elif self.role_maker.is_server(): elif self.role_maker._is_server():
main_program, startup_program = self._build_pserver_programs( main_program, startup_program = self._build_pserver_programs(
compiled_config) compiled_config)
......
...@@ -126,11 +126,11 @@ class PipelineOptimizer(MetaOptimizerBase): ...@@ -126,11 +126,11 @@ class PipelineOptimizer(MetaOptimizerBase):
optimize_ops, params_grads, prog_list = \ optimize_ops, params_grads, prog_list = \
self.wrapped_opt.minimize(loss, startup_program, self.wrapped_opt.minimize(loss, startup_program,
parameter_list, no_grad_set) parameter_list, no_grad_set)
if self.role_maker.worker_num() == 1: if self.role_maker._worker_num() == 1:
return optimize_ops, params_grads return optimize_ops, params_grads
endpoints = self.role_maker.get_trainer_endpoints() endpoints = self.role_maker._get_trainer_endpoints()
current_endpoint = endpoints[self.role_maker.worker_index()] current_endpoint = endpoints[self.role_maker._worker_index()]
self.startup_program = startup_program self.startup_program = startup_program
if startup_program is None: if startup_program is None:
self.startup_program = fluid.default_startup_program() self.startup_program = fluid.default_startup_program()
...@@ -142,7 +142,7 @@ class PipelineOptimizer(MetaOptimizerBase): ...@@ -142,7 +142,7 @@ class PipelineOptimizer(MetaOptimizerBase):
self.nranks = nranks self.nranks = nranks
self.nrings = len(self.main_program_list) self.nrings = len(self.main_program_list)
self.rank = self.role_maker.worker_index() self.rank = self.role_maker._worker_index()
self.endpoints = endpoints self.endpoints = endpoints
self.current_endpoint = current_endpoint self.current_endpoint = current_endpoint
......
...@@ -104,9 +104,9 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -104,9 +104,9 @@ class ParameterServerRuntime(RuntimeBase):
def _init_worker(self): def _init_worker(self):
def sync_strategy_envs(): def sync_strategy_envs():
kwargs = {} kwargs = {}
kwargs["pserver_endpoints"] = self.role_maker.get_pserver_endpoints( kwargs[
) "pserver_endpoints"] = self.role_maker._get_pserver_endpoints()
kwargs["trainer_id"] = self.role_maker.worker_index() kwargs["trainer_id"] = self.role_maker._worker_index()
return kwargs return kwargs
def geo_strategy_envs(): def geo_strategy_envs():
...@@ -150,7 +150,7 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -150,7 +150,7 @@ class ParameterServerRuntime(RuntimeBase):
return "#".join(init_attrs) return "#".join(init_attrs)
kwargs = {} kwargs = {}
kwargs["trainers"] = self.role_maker.worker_num() kwargs["trainers"] = self.role_maker._worker_num()
kwargs["sparse_attrs"] = get_sparse_attrs() kwargs["sparse_attrs"] = get_sparse_attrs()
return kwargs return kwargs
...@@ -338,7 +338,7 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -338,7 +338,7 @@ class ParameterServerRuntime(RuntimeBase):
block.append_op( block.append_op(
type='recv_save', type='recv_save',
attrs={ attrs={
"trainer_id": self.role_maker.worker_index(), "trainer_id": self.role_maker._worker_index(),
"shape": var.shape, "shape": var.shape,
"slice_shapes": "slice_shapes":
[",".join([str(i) for i in var.shape])], [",".join([str(i) for i in var.shape])],
...@@ -378,14 +378,15 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -378,14 +378,15 @@ class ParameterServerRuntime(RuntimeBase):
block.append_op( block.append_op(
type='recv_save', type='recv_save',
attrs={ attrs={
"trainer_id": self.role_maker.worker_index(), "trainer_id": self.role_maker._worker_index(),
"shape": var.shape, "shape": var.shape,
"slice_shapes": slice_shapes, "slice_shapes": slice_shapes,
"slice_varnames": var_ctx.split_varnames(), "slice_varnames": var_ctx.split_varnames(),
"remote_varnames": var_ctx.split_varnames(), "remote_varnames": var_ctx.split_varnames(),
"is_sparse": True, "is_sparse": True,
"endpoints": var_ctx.split_endpoints(), "endpoints": var_ctx.split_endpoints(),
"pserver_num": len(self.role_maker.get_pserver_endpoints()), "pserver_num":
len(self.role_maker._get_pserver_endpoints()),
"file_path": os.path.join(dirname, var.name) "file_path": os.path.join(dirname, var.name)
}) })
...@@ -403,7 +404,7 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -403,7 +404,7 @@ class ParameterServerRuntime(RuntimeBase):
block.append_op( block.append_op(
type='recv_save', type='recv_save',
attrs={ attrs={
"trainer_id": self.role_maker.worker_index(), "trainer_id": self.role_maker._worker_index(),
"shape": var.shape, "shape": var.shape,
"slice_shapes": slice_shapes, "slice_shapes": slice_shapes,
"slice_varnames": slice_varnames, "slice_varnames": slice_varnames,
...@@ -411,7 +412,7 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -411,7 +412,7 @@ class ParameterServerRuntime(RuntimeBase):
"is_sparse": True, "is_sparse": True,
"endpoints": var_ctx.split_endpoints(), "endpoints": var_ctx.split_endpoints(),
"pserver_num": "pserver_num":
len(self.role_maker.get_pserver_endpoints()), len(self.role_maker._get_pserver_endpoints()),
"file_path": os.path.join(dirname, var.name) "file_path": os.path.join(dirname, var.name)
}) })
...@@ -422,7 +423,7 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -422,7 +423,7 @@ class ParameterServerRuntime(RuntimeBase):
block.append_op( block.append_op(
type='recv_save', type='recv_save',
attrs={ attrs={
"trainer_id": self.role_maker.worker_index(), "trainer_id": self.role_maker._worker_index(),
"shape": var.shape, "shape": var.shape,
"slice_shapes": "slice_shapes":
[",".join([str(i) for i in var.shape])], [",".join([str(i) for i in var.shape])],
......
...@@ -197,6 +197,7 @@ def __bootstrap__(): ...@@ -197,6 +197,7 @@ def __bootstrap__():
'free_when_no_cache_hit', 'free_when_no_cache_hit',
'call_stack_level', 'call_stack_level',
'sort_sum_gradient', 'sort_sum_gradient',
'max_inplace_grad_add',
] ]
if 'Darwin' not in sysstr: if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory') read_env_flags.append('use_pinned_memory')
......
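The new `max_inplace_grad_add` flag is read in backward.py through `core.globals()`. A hedged sketch of setting it; the value 8 is an arbitrary example, and env flags must be exported before paddle initializes:

import os
os.environ["FLAGS_max_inplace_grad_add"] = "8"  # above this many duplicated grads, one sum op is used instead of a grad_add chain

import paddle.fluid as fluid
print(fluid.core.globals()["FLAGS_max_inplace_grad_add"])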
...@@ -251,12 +251,19 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): ...@@ -251,12 +251,19 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
begin_idx = 0 begin_idx = 0
if end_idx is None: if end_idx is None:
end_idx = len(op_descs) end_idx = len(op_descs)
for i in range(begin_idx, end_idx): if isinstance(op_descs, (list, tuple)):
op_desc = op_descs[i] for i in range(begin_idx, end_idx):
if isinstance(op_desc, tuple): op_desc = op_descs[i]
op_desc = op_desc[0] if isinstance(op_desc, tuple):
op_desc._rename_input(old_name, new_name) op_desc = op_desc[0]
op_desc._rename_output(old_name, new_name) op_desc._rename_input(old_name, new_name)
op_desc._rename_output(old_name, new_name)
if isinstance(op_descs, collections.OrderedDict):
for key, value in op_descs.items():
if isinstance(value, (list, tuple)):
for op_desc in value:
op_desc._rename_input(old_name, new_name)
op_desc._rename_output(old_name, new_name)
def _create_op_desc_(op_type, inputs, outputs, attrs): def _create_op_desc_(op_type, inputs, outputs, attrs):
...@@ -369,6 +376,41 @@ def _append_grad_suffix_(name): ...@@ -369,6 +376,41 @@ def _append_grad_suffix_(name):
return cpt.to_text(name) + core.grad_var_suffix() return cpt.to_text(name) + core.grad_var_suffix()
def _accumulate_gradients_by_sum_op_(var_name, renamed_vars, pending_sum_ops,
op_idx):
"""
Use one sum op to accumulate gradients; the accumulated gradients are stored in renamed_vars.
"""
if op_idx not in pending_sum_ops.keys():
pending_sum_ops[op_idx] = []
pending_sum_ops[op_idx].append(
_create_op_desc_("sum", {"X": renamed_vars[var_name]},
{"Out": [var_name]}, {"use_mkldnn": False}))
renamed_vars[var_name] = [var_name]
def _accumulate_gradients_by_add_ops_(var_name, renamed_vars, pending_sum_ops,
op_idx):
"""
Use several inplace add ops to accumulate gradients; the accumulated gradients are stored in renamed_vars.
"""
if op_idx not in pending_sum_ops.keys():
pending_sum_ops[op_idx] = []
out_name = renamed_vars[var_name][0]
for i in range(1, len(renamed_vars[var_name])):
x_name = out_name
y_name = renamed_vars[var_name][i]
if i != len(renamed_vars[var_name]) - 1:
out_name = var_name + '@ADD@' + str(i)
else:
out_name = var_name
pending_sum_ops[op_idx].append(
_create_op_desc_("grad_add", {"X": [x_name],
"Y": [y_name]}, {"Out": [out_name]},
{"use_mkldnn": False}))
renamed_vars[var_name] = [var_name]
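# A minimal standalone sketch (NumPy stand-in for real op descs): one sum_op
# over all duplicate gradients is numerically the same as a chain of pairwise
# grad_add accumulations, which is why the two helpers above are
# interchangeable and differ only in how intermediate memory is reused.
def _sketch_sum_vs_add_ops():
    import numpy as np
    grads = [np.full(4, float(i)) for i in range(3)]
    by_sum = np.sum(grads, axis=0)      # what a single sum_op produces
    acc = grads[0].copy()               # what chained grad_add ops produce
    for g in grads[1:]:
        acc = acc + g
    assert np.allclose(by_sum, acc)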
def _addup_repetitive_outputs_(op_descs, block_idx): def _addup_repetitive_outputs_(op_descs, block_idx):
""" """
In the backward pass, a variable may be the output of more than one op. In the backward pass, a variable may be the output of more than one op.
...@@ -376,7 +418,9 @@ def _addup_repetitive_outputs_(op_descs, block_idx): ...@@ -376,7 +418,9 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
In these cases, the variable should be the accumulation of all the outputs. In these cases, the variable should be the accumulation of all the outputs.
`sum_op`s are added to implement the accumulation. `sum_op`s are added to implement the accumulation.
""" """
pending_sum_ops = [] _MAX_ADD_NUM_ = core.globals()['FLAGS_max_inplace_grad_add']
pending_sum_ops = collections.OrderedDict()
var_rename_count = collections.defaultdict(int) var_rename_count = collections.defaultdict(int)
renamed_vars = collections.defaultdict(list) renamed_vars = collections.defaultdict(list)
renamed_var_start_idx = collections.defaultdict(list) renamed_var_start_idx = collections.defaultdict(list)
...@@ -385,10 +429,13 @@ def _addup_repetitive_outputs_(op_descs, block_idx): ...@@ -385,10 +429,13 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
if "@GRAD" not in var_name: if "@GRAD" not in var_name:
continue continue
if len(renamed_vars[var_name]) > 1: if len(renamed_vars[var_name]) > 1:
pending_sum_ops.append((_create_op_desc_( if len(renamed_vars[var_name]) > _MAX_ADD_NUM_:
"sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]}, _accumulate_gradients_by_sum_op_(var_name, renamed_vars,
{"use_mkldnn": False}), idx)) pending_sum_ops, idx)
renamed_vars[var_name] = [var_name] else:
_accumulate_gradients_by_add_ops_(var_name, renamed_vars,
pending_sum_ops, idx)
for param_idx, param_name in enumerate(op_desc.output_names()): for param_idx, param_name in enumerate(op_desc.output_names()):
arg_names = op_desc.output(param_name) arg_names = op_desc.output(param_name)
for arg_idx, var_name in enumerate(arg_names): for arg_idx, var_name in enumerate(arg_names):
...@@ -440,13 +487,26 @@ def _addup_repetitive_outputs_(op_descs, block_idx): ...@@ -440,13 +487,26 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
renamed_vars[var_name].append(new_name) renamed_vars[var_name].append(new_name)
for var_name, inputs in six.iteritems(renamed_vars): for var_name, inputs in six.iteritems(renamed_vars):
if len(inputs) > 1: if len(renamed_vars[var_name]) > 1:
pending_sum_ops.append( if len(renamed_vars[var_name]) > _MAX_ADD_NUM_:
(_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]}, _accumulate_gradients_by_sum_op_(var_name, renamed_vars,
{"use_mkldnn": False}), len(op_descs))) pending_sum_ops, len(op_descs))
else:
_accumulate_gradients_by_add_ops_(var_name, renamed_vars,
pending_sum_ops,
len(op_descs))
# sum_op descs are sorted according to their insert position # sum_op descs are sorted according to their insert position
for p in reversed(pending_sum_ops): for key, value in collections.OrderedDict(
op_descs.insert(p[1], p[0]) reversed(list(pending_sum_ops.items()))).items():
    # NOTE(zhiqiu): Since the pending ops are inserted in reversed order of idx, every recorded idx of op_descs remains correct at the moment it is used.
    # For example, take [0, 1, 2] with 'a' to be inserted at idx 1 and 'b' at idx 2.
    # Reversed, we first insert 'b' at idx 2, giving [0, 1, 'b', 2], and then 'a' at idx 1, giving [0, 'a', 1, 'b', 2]; each op lands right before the element that originally sat at its idx.
    # Not reversed, inserting 'a' at idx 1 first gives [0, 'a', 1, 2] and shifts the element originally at idx 2, so the later insert of 'b' at idx 2 gives [0, 'a', 'b', 1, 2], one position too early.
idx = key
for i, op in enumerate(value):
op_descs.insert(idx + i, op)
return op_descs return op_descs
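# A standalone sketch of the reversed-insertion trick described in the NOTE
# above, using the same insert-before semantics as `op_descs.insert`:
def _sketch_reversed_insert():
    ops = [0, 1, 2]
    pending = {1: ['a'], 2: ['b']}      # ops to insert, keyed by target idx
    for idx in sorted(pending, reverse=True):
        for i, op in enumerate(pending[idx]):
            ops.insert(idx + i, op)
    # Both inserts land right before the elements originally at idx 1 and 2;
    # processed forward, 'b' would end up one position too early.
    assert ops == [0, 'a', 1, 'b', 2]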
......
...@@ -99,7 +99,12 @@ class ImperativeQuantAware(object): ...@@ -99,7 +99,12 @@ class ImperativeQuantAware(object):
self._activation_bits = activation_bits self._activation_bits = activation_bits
self._moving_rate = moving_rate self._moving_rate = moving_rate
quant_type = {'abs_max', 'moving_average_abs_max'} quant_type = {
'abs_max', 'moving_average_abs_max', 'channel_wise_abs_max'
}
assert activation_quantize_type != 'channel_wise_abs_max', \
"The activation quantization type does not support 'channel_wise_abs_max'."
if activation_quantize_type not in quant_type: if activation_quantize_type not in quant_type:
raise ValueError( raise ValueError(
"Unknown activation_quantize_type : '%s'. It can only be " "Unknown activation_quantize_type : '%s'. It can only be "
...@@ -108,8 +113,8 @@ class ImperativeQuantAware(object): ...@@ -108,8 +113,8 @@ class ImperativeQuantAware(object):
if weight_quantize_type not in quant_type: if weight_quantize_type not in quant_type:
raise ValueError( raise ValueError(
"Unknown weight_quantize_type: '%s'. It can only be " "Unknown weight_quantize_type: '%s'. It can only be "
"'abs_max' or 'moving_average_abs_max' now." % "'abs_max' or 'moving_average_abs_max' or 'channel_wise_abs_max' now."
(str(weight_quantize_type))) % (str(weight_quantize_type)))
self._activation_quantize_type = activation_quantize_type self._activation_quantize_type = activation_quantize_type
self._weight_quantize_type = weight_quantize_type self._weight_quantize_type = weight_quantize_type
......
...@@ -24,7 +24,7 @@ from paddle.fluid.data_feeder import check_variable_and_dtype ...@@ -24,7 +24,7 @@ from paddle.fluid.data_feeder import check_variable_and_dtype
__all__ = [ __all__ = [
'FakeQuantMovingAverage', 'FakeQuantAbsMax', 'QuantizedConv2D', 'FakeQuantMovingAverage', 'FakeQuantAbsMax', 'QuantizedConv2D',
'QuantizedLinear' 'QuantizedLinear', 'FakeChannelWiseQuantDequantAbsMax'
] ]
...@@ -209,6 +209,89 @@ class FakeQuantAbsMax(layers.Layer): ...@@ -209,6 +209,89 @@ class FakeQuantAbsMax(layers.Layer):
return quant_out return quant_out
class FakeChannelWiseQuantDequantAbsMax(layers.Layer):
def __init__(self,
name=None,
channel_num=None,
quant_bits=8,
quant_axis=0,
dtype='float32',
quant_on_weight=False):
        assert quant_on_weight, "Channel-wise quantization can only be applied to weights."
super(FakeChannelWiseQuantDequantAbsMax, self).__init__()
self._quant_bits = quant_bits
self._quant_axis = quant_axis
self._dtype = dtype
self._name = name
self._channel_num = channel_num
scale_prefix = "{}.scale".format(
name) if name else 'quant_dequant.scale'
self._scale_name = unique_name.generate(scale_prefix)
if quant_on_weight:
scale_attr = ParamAttr(
name=self._scale_name,
initializer=Constant(0.0),
trainable=False)
self._scale = self.create_parameter(
shape=[self._channel_num], attr=scale_attr, dtype=self._dtype)
self._scale.stop_gradient = True
else:
self._scale = None
def forward(self, input):
if in_dygraph_mode():
attrs = ('bit_length', self._quant_bits, 'quant_axis',
self._quant_axis)
quant_out = _varbase_creator(
type=input.type,
name="{}.quantized.dequantized".format(input.name),
shape=input.shape,
dtype=input.dtype,
persistable=False)
out_scale = self._scale
if out_scale is None:
out_scale = _varbase_creator(
type=core.VarDesc.VarType.LOD_TENSOR,
name=self._scale_name,
shape=[self._channel_num],
dtype=self._dtype,
persistable=False)
out_scale.stop_gradient = True
out, _, = core.ops.fake_channel_wise_quantize_dequantize_abs_max(
input, quant_out, out_scale, *attrs)
return out
check_variable_and_dtype(input, 'input', ['float32'],
"FakeChannelWiseQuantDequantAbsMax")
attrs = {'bit_length': self._quant_bits, 'quant_axis': self._quant_axis}
inputs = {"X": [input]}
quant_out = self._helper.create_variable(
name="{}.quantized.dequantized".format(input.name),
dtype=input.dtype,
type=core.VarDesc.VarType.LOD_TENSOR,
persistable=False,
stop_gradient=False)
out_scale = self._scale
        if out_scale is None:
out_scale = self._helper.create_variable(
name=self._scale_name,
dtype=self._dtype,
type=core.VarDesc.VarType.LOD_TENSOR,
persistable=False,
stop_gradient=True)
outputs = {"Out": [quant_out], "OutScale": [out_scale]}
self._helper.append_op(
type="fake_channel_wise_quantize_dequantize_abs_max",
inputs=inputs,
outputs=outputs,
attrs=attrs)
return quant_out
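# A NumPy reference sketch of the math this layer delegates to the
# fake_channel_wise_quantize_dequantize_abs_max op (it assumes no channel is
# all zeros, which would make the per-channel scale zero):
def _sketch_channel_wise_quant_dequant(x, bits=8, quant_axis=0):
    import numpy as np
    qmax = (1 << (bits - 1)) - 1                      # 127 for 8 bits
    axes = tuple(i for i in range(x.ndim) if i != quant_axis)
    scale = np.abs(x).max(axis=axes, keepdims=True)   # one abs-max per channel
    return np.round(x * qmax / scale) * scale / qmax, scale.ravel()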
def _get_fake_quant_type(quant_type, **kwargs): def _get_fake_quant_type(quant_type, **kwargs):
call_args = { call_args = {
"name": kwargs.get("name", None), "name": kwargs.get("name", None),
...@@ -220,10 +303,17 @@ def _get_fake_quant_type(quant_type, **kwargs): ...@@ -220,10 +303,17 @@ def _get_fake_quant_type(quant_type, **kwargs):
call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False) call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False)
elif quant_type == 'moving_average_abs_max': elif quant_type == 'moving_average_abs_max':
call_args["moving_rate"] = kwargs.get("moving_rate", 0.9) call_args["moving_rate"] = kwargs.get("moving_rate", 0.9)
elif quant_type == 'channel_wise_abs_max':
call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False)
call_args["channel_num"] = kwargs.get("channel_num", None)
call_args["quant_axis"] = kwargs.get("quant_axis", 0)
assert call_args["channel_num"] is not None, (
"You need to input channel_num"
"when you use channel_wise_abs_max strategy.")
fake_quant_map = { fake_quant_map = {
'abs_max': FakeQuantAbsMax, 'abs_max': FakeQuantAbsMax,
'moving_average_abs_max': FakeQuantMovingAverage 'moving_average_abs_max': FakeQuantMovingAverage,
'channel_wise_abs_max': FakeChannelWiseQuantDequantAbsMax
} }
return fake_quant_map[quant_type](**call_args) return fake_quant_map[quant_type](**call_args)
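# A usage sketch of the extended factory with illustrative argument values;
# channel_num and quant_axis are consumed only by channel_wise_abs_max:
def _sketch_channel_wise_factory():
    return _get_fake_quant_type(
        'channel_wise_abs_max',
        name='conv2d_w_1',
        quant_bits=8,
        dtype='float32',
        quant_on_weight=True,
        channel_num=6,
        quant_axis=0)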
...@@ -255,19 +345,23 @@ class QuantizedConv2D(layers.Layer): ...@@ -255,19 +345,23 @@ class QuantizedConv2D(layers.Layer):
self.weight = getattr(layer, 'weight') self.weight = getattr(layer, 'weight')
self.bias = getattr(layer, 'bias') self.bias = getattr(layer, 'bias')
# For FakeQuant # For FakeQuant
self._conv2d_quant_axis = 0
self._fake_quant_weight = _get_fake_quant_type( self._fake_quant_weight = _get_fake_quant_type(
weight_quantize_type, weight_quantize_type,
name=self.weight.name, name=self.weight.name,
moving_rate=moving_rate, moving_rate=moving_rate,
quant_bits=weight_bits, quant_bits=weight_bits,
dtype=self._dtype, dtype=self._dtype,
quant_on_weight=True) quant_on_weight=True,
channel_num=self.weight.shape[self._conv2d_quant_axis],
quant_axis=self._conv2d_quant_axis)
self._fake_quant_input = _get_fake_quant_type( self._fake_quant_input = _get_fake_quant_type(
activation_quantize_type, activation_quantize_type,
name=layer.full_name(), name=layer.full_name(),
moving_rate=moving_rate, moving_rate=moving_rate,
quant_bits=activation_bits, quant_bits=activation_bits,
dtype=self._dtype) dtype=self._dtype,
quant_on_weight=False)
def forward(self, input): def forward(self, input):
quant_input = self._fake_quant_input(input) quant_input = self._fake_quant_input(input)
...@@ -341,19 +435,23 @@ class QuantizedLinear(layers.Layer): ...@@ -341,19 +435,23 @@ class QuantizedLinear(layers.Layer):
self.weight = getattr(layer, 'weight') self.weight = getattr(layer, 'weight')
self.bias = getattr(layer, 'bias') self.bias = getattr(layer, 'bias')
# For FakeQuant # For FakeQuant
self._linear_quant_axis = 1
self._fake_quant_weight = _get_fake_quant_type( self._fake_quant_weight = _get_fake_quant_type(
weight_quantize_type, weight_quantize_type,
name=self.weight.name, name=self.weight.name,
moving_rate=moving_rate, moving_rate=moving_rate,
quant_bits=weight_bits, quant_bits=weight_bits,
dtype=self._dtype, dtype=self._dtype,
quant_on_weight=True) quant_on_weight=True,
channel_num=self.weight.shape[self._linear_quant_axis],
quant_axis=self._linear_quant_axis)
self._fake_quant_input = _get_fake_quant_type( self._fake_quant_input = _get_fake_quant_type(
activation_quantize_type, activation_quantize_type,
name=layer.full_name(), name=layer.full_name(),
moving_rate=moving_rate, moving_rate=moving_rate,
quant_bits=activation_bits, quant_bits=activation_bits,
dtype=self._dtype) dtype=self._dtype,
quant_on_weight=False)
def forward(self, input): def forward(self, input):
quant_input = self._fake_quant_input(input) quant_input = self._fake_quant_input(input)
......
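The two quantization axes above follow the weight layouts of the wrapped layers: dygraph Conv2D stores its weight as [num_filters, in_channels // groups, kH, kW], so the per-output-channel axis is 0, while Linear stores its weight as [input_dim, output_dim], so the per-output-channel axis is 1. A minimal sketch of how channel_num is derived, assuming those layouts:

conv_weight_shape = (6, 1, 3, 3)      # Conv2D(num_channels=1, num_filters=6, filter_size=3)
linear_weight_shape = (400, 120)      # Linear(input_dim=400, output_dim=120)
assert conv_weight_shape[0] == 6      # channel_num for QuantizedConv2D, quant_axis=0
assert linear_weight_shape[1] == 120  # channel_num for QuantizedLinear, quant_axis=1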
...@@ -181,7 +181,6 @@ class TestImperativeQat(unittest.TestCase): ...@@ -181,7 +181,6 @@ class TestImperativeQat(unittest.TestCase):
img = fluid.dygraph.to_variable(x_data) img = fluid.dygraph.to_variable(x_data)
label = fluid.dygraph.to_variable(y_data) label = fluid.dygraph.to_variable(y_data)
out = lenet(img) out = lenet(img)
acc = fluid.layers.accuracy(out, label) acc = fluid.layers.accuracy(out, label)
loss = fluid.layers.cross_entropy(out, label) loss = fluid.layers.cross_entropy(out, label)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import numpy as np
import random
import unittest
import logging
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.framework import IrGraph
from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
from paddle.fluid.dygraph.container import Sequential
from paddle.fluid.dygraph.nn import Conv2D
from paddle.fluid.dygraph.nn import Pool2D
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.log_helper import get_logger
os.environ["CPU_NUM"] = "1"
if core.is_compiled_with_cuda():
fluid.set_flags({"FLAGS_cudnn_deterministic": True})
_logger = get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
def StaticLenet(data, num_classes=10, classifier_activation='softmax'):
conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2")
fc_w1_attr = fluid.ParamAttr(name="fc_w_1")
fc_w2_attr = fluid.ParamAttr(name="fc_w_2")
fc_w3_attr = fluid.ParamAttr(name="fc_w_3")
conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1")
conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2")
fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
conv1 = fluid.layers.conv2d(
data,
num_filters=6,
filter_size=3,
stride=1,
padding=1,
param_attr=conv2d_w1_attr,
bias_attr=conv2d_b1_attr)
pool1 = fluid.layers.pool2d(
conv1, pool_size=2, pool_type='max', pool_stride=2)
conv2 = fluid.layers.conv2d(
pool1,
num_filters=16,
filter_size=5,
stride=1,
padding=0,
param_attr=conv2d_w2_attr,
bias_attr=conv2d_b2_attr)
pool2 = fluid.layers.pool2d(
conv2, pool_size=2, pool_type='max', pool_stride=2)
fc1 = fluid.layers.fc(input=pool2,
size=120,
param_attr=fc_w1_attr,
bias_attr=fc_b1_attr)
fc2 = fluid.layers.fc(input=fc1,
size=84,
param_attr=fc_w2_attr,
bias_attr=fc_b2_attr)
fc3 = fluid.layers.fc(input=fc2,
size=num_classes,
act=classifier_activation,
param_attr=fc_w3_attr,
bias_attr=fc_b3_attr)
return fc3
class ImperativeLenet(fluid.dygraph.Layer):
def __init__(self, num_classes=10, classifier_activation='softmax'):
super(ImperativeLenet, self).__init__()
conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2")
fc_w1_attr = fluid.ParamAttr(name="fc_w_1")
fc_w2_attr = fluid.ParamAttr(name="fc_w_2")
fc_w3_attr = fluid.ParamAttr(name="fc_w_3")
conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1")
conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2")
fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
self.features = Sequential(
Conv2D(
num_channels=1,
num_filters=6,
filter_size=3,
stride=1,
padding=1,
param_attr=conv2d_w1_attr,
bias_attr=conv2d_b1_attr),
Pool2D(
pool_size=2, pool_type='max', pool_stride=2),
Conv2D(
num_channels=6,
num_filters=16,
filter_size=5,
stride=1,
padding=0,
param_attr=conv2d_w2_attr,
bias_attr=conv2d_b2_attr),
Pool2D(
pool_size=2, pool_type='max', pool_stride=2))
self.fc = Sequential(
Linear(
input_dim=400,
output_dim=120,
param_attr=fc_w1_attr,
bias_attr=fc_b1_attr),
Linear(
input_dim=120,
output_dim=84,
param_attr=fc_w2_attr,
bias_attr=fc_b2_attr),
Linear(
input_dim=84,
output_dim=num_classes,
act=classifier_activation,
param_attr=fc_w3_attr,
bias_attr=fc_b3_attr))
def forward(self, inputs):
x = self.features(inputs)
x = fluid.layers.flatten(x, 1)
x = self.fc(x)
return x
class TestImperativeQat(unittest.TestCase):
"""
QAT = quantization-aware training
"""
def test_qat_save(self):
imperative_qat = ImperativeQuantAware(
weight_quantize_type='channel_wise_abs_max',
activation_quantize_type='moving_average_abs_max')
with fluid.dygraph.guard():
lenet = ImperativeLenet()
imperative_qat.quantize(lenet)
adam = AdamOptimizer(
learning_rate=0.001, parameter_list=lenet.parameters())
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=32, drop_last=True)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=32)
epoch_num = 1
for epoch in range(epoch_num):
lenet.train()
for batch_id, data in enumerate(train_reader()):
x_data = np.array([x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(-1, 1)
img = fluid.dygraph.to_variable(x_data)
label = fluid.dygraph.to_variable(y_data)
out = lenet(img)
acc = fluid.layers.accuracy(out, label)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.mean(loss)
avg_loss.backward()
adam.minimize(avg_loss)
lenet.clear_gradients()
if batch_id % 100 == 0:
_logger.info(
"Train | At epoch {} step {}: loss = {:}, acc= {:}".
format(epoch, batch_id,
avg_loss.numpy(), acc.numpy()))
lenet.eval()
for batch_id, data in enumerate(test_reader()):
x_data = np.array([x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(-1, 1)
img = fluid.dygraph.to_variable(x_data)
label = fluid.dygraph.to_variable(y_data)
out = lenet(img)
acc_top1 = fluid.layers.accuracy(
input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(
input=out, label=label, k=5)
if batch_id % 100 == 0:
_logger.info(
"Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}".
format(epoch, batch_id,
acc_top1.numpy(), acc_top5.numpy()))
# save weights
model_dict = lenet.state_dict()
fluid.save_dygraph(model_dict, "save_temp")
# test the correctness of `paddle.jit.save`
data = next(test_reader())
test_data = np.array([x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
test_img = fluid.dygraph.to_variable(test_data)
lenet.eval()
before_save = lenet(test_img)
# save inference quantized model
path = "./mnist_infer_model"
paddle.jit.save(
layer=lenet,
model_path=path,
input_spec=[
paddle.static.InputSpec(
shape=[None, 1, 28, 28], dtype='float32')
])
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
exe = fluid.Executor(place)
[inference_program, feed_target_names, fetch_targets] = (
fluid.io.load_inference_model(
dirname=path,
executor=exe,
model_filename="__model__",
params_filename="__variables__"))
after_save, = exe.run(inference_program,
feed={feed_target_names[0]: test_data},
fetch_list=fetch_targets)
self.assertTrue(
np.allclose(after_save, before_save.numpy()),
msg='Failed to save the inference quantized model.')
def test_qat_acc(self):
def _build_static_lenet(main, startup, is_test=False, seed=1000):
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
main.random_seed = seed
startup.random_seed = seed
img = fluid.layers.data(
name='image', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
prediction = StaticLenet(img)
if not is_test:
loss = fluid.layers.cross_entropy(
input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)
else:
avg_loss = prediction
return img, label, avg_loss
reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=32, drop_last=True)
weight_quantize_type = 'channel_wise_abs_max'
activation_quant_type = 'moving_average_abs_max'
param_init_map = {}
seed = 1000
lr = 0.1
# imperative train
_logger.info(
"--------------------------dynamic graph qat--------------------------"
)
imperative_qat = ImperativeQuantAware(
weight_quantize_type=weight_quantize_type,
activation_quantize_type=activation_quant_type)
with fluid.dygraph.guard():
np.random.seed(seed)
fluid.default_main_program().random_seed = seed
fluid.default_startup_program().random_seed = seed
lenet = ImperativeLenet()
fixed_state = {}
for name, param in lenet.named_parameters():
p_shape = param.numpy().shape
p_value = param.numpy()
if name.endswith("bias"):
value = np.zeros_like(p_value).astype('float32')
else:
value = np.random.normal(
                        loc=0.0, scale=0.01, size=np.prod(p_shape)).reshape(
p_shape).astype('float32')
fixed_state[name] = value
param_init_map[param.name] = value
lenet.set_dict(fixed_state)
imperative_qat.quantize(lenet)
adam = AdamOptimizer(
learning_rate=lr, parameter_list=lenet.parameters())
dynamic_loss_rec = []
lenet.train()
for batch_id, data in enumerate(reader()):
x_data = np.array([x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(-1, 1)
img = fluid.dygraph.to_variable(x_data)
label = fluid.dygraph.to_variable(y_data)
out = lenet(img)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.mean(loss)
avg_loss.backward()
adam.minimize(avg_loss)
lenet.clear_gradients()
dynamic_loss_rec.append(avg_loss.numpy()[0])
if batch_id % 100 == 0:
_logger.info('{}: {}'.format('loss', avg_loss.numpy()))
paddle.jit.save(
layer=lenet,
model_path="./dynamic_mnist",
input_spec=[
paddle.static.InputSpec(
shape=[None, 1, 28, 28], dtype='float32')
])
# static graph train
_logger.info(
"--------------------------static graph qat--------------------------"
)
static_loss_rec = []
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
exe = fluid.Executor(place)
main = fluid.Program()
infer = fluid.Program()
startup = fluid.Program()
static_img, static_label, static_loss = _build_static_lenet(
main, startup, False, seed)
infer_img, _, infer_pre = _build_static_lenet(infer, startup, True,
seed)
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
opt = AdamOptimizer(learning_rate=lr)
opt.minimize(static_loss)
scope = core.Scope()
with fluid.scope_guard(scope):
exe.run(startup)
for param in main.all_parameters():
param_tensor = scope.var(param.name).get_tensor()
param_tensor.set(param_init_map[param.name], place)
main_graph = IrGraph(core.Graph(main.desc), for_test=False)
infer_graph = IrGraph(core.Graph(infer.desc), for_test=True)
transform_pass = QuantizationTransformPass(
scope=scope,
place=place,
activation_quantize_type=activation_quant_type,
weight_quantize_type=weight_quantize_type,
quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'])
transform_pass.apply(main_graph)
transform_pass.apply(infer_graph)
build_strategy = fluid.BuildStrategy()
build_strategy.fuse_all_reduce_ops = False
binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
loss_name=static_loss.name, build_strategy=build_strategy)
feeder = fluid.DataFeeder(
feed_list=[static_img, static_label], place=place)
with fluid.scope_guard(scope):
for batch_id, data in enumerate(reader()):
loss_v, = exe.run(binary,
feed=feeder.feed(data),
fetch_list=[static_loss])
static_loss_rec.append(loss_v[0])
if batch_id % 100 == 0:
_logger.info('{}: {}'.format('loss', loss_v))
save_program = infer_graph.to_program()
with fluid.scope_guard(scope):
fluid.io.save_inference_model("./static_mnist", [infer_img.name],
[infer_pre], exe, save_program)
rtol = 1e-05
atol = 1e-08
for i, (loss_d,
loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)):
diff = np.abs(loss_d - loss_s)
if diff > (atol + rtol * np.abs(loss_s)):
_logger.info(
"diff({}) at {}, dynamic loss = {}, static loss = {}".
format(diff, i, loss_d, loss_s))
break
self.assertTrue(
np.allclose(
np.array(dynamic_loss_rec),
np.array(static_loss_rec),
rtol=rtol,
atol=atol,
equal_nan=True),
msg='Failed to do the imperative qat.')
if __name__ == '__main__':
unittest.main()
...@@ -170,22 +170,40 @@ class CompileTimeStrategy(object): ...@@ -170,22 +170,40 @@ class CompileTimeStrategy(object):
return trainer.mode == DistributedMode.ASYNC return trainer.mode == DistributedMode.ASYNC
def get_role_id(self): def get_role_id(self):
return self.role_maker.role_id() try:
return self.role_maker._role_id()
except Exception:
return self.role_maker.role_id()
def get_trainers(self): def get_trainers(self):
return self.role_maker.worker_num() try:
return self.role_maker._worker_num()
except Exception:
return self.role_maker.worker_num()
def get_ps_endpoint(self): def get_ps_endpoint(self):
return self.role_maker.get_pserver_endpoints()[self.get_role_id()] try:
return self.role_maker._get_pserver_endpoints()[self.get_role_id()]
except Exception:
return self.role_maker.get_pserver_endpoints()[self.get_role_id()]
def get_ps_endpoints(self): def get_ps_endpoints(self):
return self.role_maker.get_pserver_endpoints() try:
return self.role_maker._get_pserver_endpoints()
except Exception:
return self.role_maker.get_pserver_endpoints()
def get_heter_worker_endpoints(self): def get_heter_worker_endpoints(self):
return self.role_maker._get_heter_worker_endpoints() try:
return self.role_maker._get_heter_worker_endpoints()
except Exception:
return self.role_maker.get_heter_worker_endpoints()
def get_heter_worker_endpoint(self): def get_heter_worker_endpoint(self):
return self.role_maker._get_heter_worker_endpoint() try:
return self.role_maker._get_heter_worker_endpoint()
except Exception:
return self.role_maker.get_heter_worker_endpoint()
def get_origin_programs(self): def get_origin_programs(self):
return self.origin_main_program, self.origin_startup_program return self.origin_main_program, self.origin_startup_program
......
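Each accessor above applies the same compatibility shim: try the renamed private role_maker method first and fall back to the legacy public one. A generic sketch of the pattern, using a hypothetical _compat_call helper that is not part of the codebase:

def _compat_call(obj, new_name, old_name, *args):
    # Prefer the renamed private accessor; fall back to the legacy public one.
    try:
        return getattr(obj, new_name)(*args)
    except Exception:
        return getattr(obj, old_name)(*args)

# e.g. trainers = _compat_call(role_maker, '_worker_num', 'worker_num')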
...@@ -680,8 +680,10 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): ...@@ -680,8 +680,10 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
if not isinstance(value, Variable): if not isinstance(value, Variable):
if dtype in ['int64', 'int32']: if dtype in ['int64', 'int32']:
attrs['str_value'] = str(int(value)) attrs['str_value'] = str(int(value))
attrs['value'] = int(value)
else: else:
attrs['str_value'] = str(float(value)) attrs['str_value'] = str(float(value))
attrs['value'] = float(value)
if in_dygraph_mode(): if in_dygraph_mode():
shape = utils.convert_shape_to_list(shape) shape = utils.convert_shape_to_list(shape)
......
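A short usage sketch of the call path touched above; after this change fill_constant records the numeric value attribute alongside its string form for both integer and float dtypes:

import paddle.fluid as fluid

ones = fluid.layers.fill_constant(shape=[2, 3], dtype='int64', value=1)
half = fluid.layers.fill_constant(shape=[2, 3], dtype='float32', value=0.5)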
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
class ConvAffineChannelFusePassExplicitPaddingTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
conv_out = fluid.layers.conv2d(
input=data,
num_filters=3,
filter_size=3,
groups=3,
padding=[1, 1, 1, 1],
bias_attr=False,
act=None)
input_scale = fluid.layers.create_parameter(
shape=[3], dtype="float32")
input_bias = fluid.layers.create_parameter(
shape=[3], dtype="float32")
ac_out = fluid.layers.affine_channel(
x=conv_out, scale=input_scale, bias=input_bias)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [ac_out]
def test_check_output(self):
self.check_output()
self.assertTrue(
PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass'))
class ConvAffineChannelFusePassValidPaddingTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
conv_out = fluid.layers.conv2d(
input=data,
num_filters=3,
filter_size=3,
groups=3,
padding='VALID',
bias_attr=False,
act=None)
input_scale = fluid.layers.create_parameter(
shape=[3], dtype="float32")
input_bias = fluid.layers.create_parameter(
shape=[3], dtype="float32")
ac_out = fluid.layers.affine_channel(
x=conv_out, scale=input_scale, bias=input_bias)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [ac_out]
def test_check_output(self):
self.check_output()
self.assertTrue(
PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass'))
class ConvAffineChannelFusePassSamePaddingTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
conv_out = fluid.layers.conv2d(
input=data,
num_filters=3,
filter_size=3,
groups=3,
padding='SAME',
bias_attr=False,
act=None)
input_scale = fluid.layers.create_parameter(
shape=[3], dtype="float32")
input_bias = fluid.layers.create_parameter(
shape=[3], dtype="float32")
ac_out = fluid.layers.affine_channel(
x=conv_out, scale=input_scale, bias=input_bias)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [ac_out]
def test_check_output(self):
self.check_output()
self.assertTrue(
PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass'))
class ConvEltwiseAddAffineChannelFusePassExplicitPaddingTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
param_attr = fluid.ParamAttr(
initializer=fluid.initializer.Xavier(uniform=False),
learning_rate=0.001)
conv_out = fluid.layers.conv2d(
input=data,
num_filters=3,
filter_size=3,
groups=3,
padding=[1, 1, 1, 1],
bias_attr=param_attr,
act=None)
input_scale = fluid.layers.create_parameter(
shape=[3], dtype="float32")
input_bias = fluid.layers.create_parameter(
shape=[3], dtype="float32")
ac_out = fluid.layers.affine_channel(
x=conv_out, scale=input_scale, bias=input_bias)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [ac_out]
def test_check_output(self):
self.check_output()
self.assertTrue(
PassVersionChecker.IsCompatible(
'conv_eltwiseadd_affine_channel_fuse_pass'))
class ConvEltwiseAddAffineChannelFusePassValidPaddingTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
param_attr = fluid.ParamAttr(
initializer=fluid.initializer.Xavier(uniform=False),
learning_rate=0.001)
conv_out = fluid.layers.conv2d(
input=data,
num_filters=3,
filter_size=3,
groups=3,
padding='VALID',
bias_attr=param_attr,
act=None)
input_scale = fluid.layers.create_parameter(
shape=[3], dtype="float32")
input_bias = fluid.layers.create_parameter(
shape=[3], dtype="float32")
ac_out = fluid.layers.affine_channel(
x=conv_out, scale=input_scale, bias=input_bias)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [ac_out]
def test_check_output(self):
self.check_output()
self.assertTrue(
PassVersionChecker.IsCompatible(
'conv_eltwiseadd_affine_channel_fuse_pass'))
class ConvEltwiseAddAffineChannelFusePassSamePaddingTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
param_attr = fluid.ParamAttr(
initializer=fluid.initializer.Xavier(uniform=False),
learning_rate=0.001)
conv_out = fluid.layers.conv2d(
input=data,
num_filters=3,
filter_size=3,
groups=3,
                padding='SAME',
bias_attr=param_attr,
act=None)
input_scale = fluid.layers.create_parameter(
shape=[3], dtype="float32")
input_bias = fluid.layers.create_parameter(
shape=[3], dtype="float32")
ac_out = fluid.layers.affine_channel(
x=conv_out, scale=input_scale, bias=input_bias)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [ac_out]
def test_check_output(self):
self.check_output()
self.assertTrue(
PassVersionChecker.IsCompatible(
'conv_eltwiseadd_affine_channel_fuse_pass'))
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
class ConvBnFusePassExplicitPaddingTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
conv_out = fluid.layers.conv2d(
input=data,
num_filters=6,
filter_size=6,
groups=3,
padding=[1, 1, 1, 1],
bias_attr=False,
act=None)
bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [bn_out]
def test_check_output(self):
self.check_output()
self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass'))
class ConvBnFusePassValidPaddingTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
conv_out = fluid.layers.conv2d(
input=data,
num_filters=6,
filter_size=6,
groups=3,
padding='VALID',
bias_attr=False,
act=None)
bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [bn_out]
def test_check_output(self):
self.check_output()
self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass'))
class ConvBnFusePassSamePaddingTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
conv_out = fluid.layers.conv2d(
input=data,
num_filters=6,
filter_size=6,
groups=3,
padding='SAME',
bias_attr=False,
act=None)
bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [bn_out]
def test_check_output(self):
self.check_output()
self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass'))
class ConvEltwiseAddBnFuseExplicitPaddingPass(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
conv_out = fluid.layers.conv2d(
input=data,
num_filters=6,
filter_size=6,
groups=3,
padding=[1, 1, 1, 1],
bias_attr=None,
act=None)
bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [bn_out]
def test_check_output(self):
self.check_output()
self.assertTrue(
PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass'))
class ConvEltwiseAddBnFuseValidPaddingPass(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
conv_out = fluid.layers.conv2d(
input=data,
num_filters=6,
filter_size=6,
groups=3,
padding='VALID',
bias_attr=None,
act=None)
bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [bn_out]
def test_check_output(self):
self.check_output()
self.assertTrue(
PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass'))
class ConvEltwiseAddBnFuseSamePaddingPass(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
conv_out = fluid.layers.conv2d(
input=data,
num_filters=6,
filter_size=6,
groups=3,
padding='SAME',
bias_attr=None,
act=None)
bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [bn_out]
def test_check_output(self):
self.check_output()
self.assertTrue(
PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass'))
if __name__ == "__main__":
unittest.main()
...@@ -19,6 +19,7 @@ import numpy as np ...@@ -19,6 +19,7 @@ import numpy as np
from inference_pass_test import InferencePassTest from inference_pass_test import InferencePassTest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
from paddle.fluid.core import AnalysisConfig from paddle.fluid.core import AnalysisConfig
"""Test for fusion of conv, elementwise_add and 2 act.""" """Test for fusion of conv, elementwise_add and 2 act."""
...@@ -46,6 +47,9 @@ class ConvElementwiseAdd2ActFusePassTest(InferencePassTest): ...@@ -46,6 +47,9 @@ class ConvElementwiseAdd2ActFusePassTest(InferencePassTest):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
use_gpu = True use_gpu = True
self.check_output_with_option(use_gpu) self.check_output_with_option(use_gpu)
self.assertTrue(
PassVersionChecker.IsCompatible(
'conv_elementwise_add2_act_fuse_pass'))
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -19,6 +19,7 @@ import numpy as np ...@@ -19,6 +19,7 @@ import numpy as np
from inference_pass_test import InferencePassTest from inference_pass_test import InferencePassTest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
from paddle.fluid.core import AnalysisConfig from paddle.fluid.core import AnalysisConfig
"""Test for fusion of conv, elementwise_add and act.""" """Test for fusion of conv, elementwise_add and act."""
...@@ -48,6 +49,9 @@ class ConvElementwiseAddActFusePassTest(InferencePassTest): ...@@ -48,6 +49,9 @@ class ConvElementwiseAddActFusePassTest(InferencePassTest):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
use_gpu = True use_gpu = True
self.check_output_with_option(use_gpu) self.check_output_with_option(use_gpu)
self.assertTrue(
PassVersionChecker.IsCompatible(
'conv_elementwise_add_act_fuse_pass'))
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -19,6 +19,7 @@ import numpy as np ...@@ -19,6 +19,7 @@ import numpy as np
from inference_pass_test import InferencePassTest from inference_pass_test import InferencePassTest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
from paddle.fluid.core import AnalysisConfig from paddle.fluid.core import AnalysisConfig
"""Test for fusion of conv and elementwise_add.""" """Test for fusion of conv and elementwise_add."""
...@@ -44,6 +45,8 @@ class ConvElementwiseAddFusePassTest(InferencePassTest): ...@@ -44,6 +45,8 @@ class ConvElementwiseAddFusePassTest(InferencePassTest):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
use_gpu = True use_gpu = True
self.check_output_with_option(use_gpu) self.check_output_with_option(use_gpu)
self.assertTrue(
PassVersionChecker.IsCompatible('conv_elementwise_add_fuse_pass'))
if __name__ == "__main__": if __name__ == "__main__":
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import PassVersionChecker
class FcFusePassTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 128, 768], dtype="float32")
data_y = fluid.data(name="y", shape=[-1, 128, 768], dtype="float32")
fc_out1 = fluid.layers.fc(input=data,
size=3072,
num_flatten_dims=2,
act="relu")
fc_out2 = fluid.layers.fc(input=fc_out1,
size=768,
num_flatten_dims=2)
self.feeds = {"data": np.random.random((4, 128, 768)).astype("float32")}
self.fetch_list = [fc_out2]
def test_check_output(self):
use_gpu = [False]
if core.is_compiled_with_cuda():
use_gpu.append(True)
        for option in use_gpu:
            self.check_output_with_option(option)
self.assertTrue(PassVersionChecker.IsCompatible('fc_fuse_pass'))
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
class FcGruFusePassTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
dict_dim, emb_dim = 128, 64
data = fluid.data(
name='step_data', shape=[None], dtype='int64', lod_level=1)
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim = 512
x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
hidden = fluid.layers.dynamic_gru(
input=x,
size=hidden_dim,
bias_attr=True,
origin_mode=False,
is_reverse=True)
batch = 16
lod_tensor = fluid.LoDTensor()
lod_tensor.set(np.random.randint(
0, dict_dim, size=[batch]).astype("int64"),
fluid.CPUPlace())
lod_tensor.set_lod([[0, batch]])
self.feeds = {"step_data": lod_tensor}
self.fetch_list = [hidden]
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
self.assertTrue(PassVersionChecker.IsCompatible('fc_gru_fuse_pass'))
class MulGruFusePassTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
dict_dim, emb_dim = 128, 64
data = fluid.data(
name='step_data', shape=[None], dtype='int64', lod_level=1)
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim = 512
x = fluid.layers.fc(input=emb, size=hidden_dim * 3, bias_attr=False)
hidden = fluid.layers.dynamic_gru(
input=x,
size=hidden_dim,
bias_attr=True,
origin_mode=False,
is_reverse=True)
batch = 16
lod_tensor = fluid.LoDTensor()
lod_tensor.set(np.random.randint(
0, dict_dim, size=[batch]).astype("int64"),
fluid.CPUPlace())
lod_tensor.set_lod([[0, batch]])
self.feeds = {"step_data": lod_tensor}
self.fetch_list = [hidden]
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
self.assertTrue(PassVersionChecker.IsCompatible('mul_gru_fuse_pass'))
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
class MulLstmFusePassTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
dict_dim, emb_dim = 128, 64
hidden_dim = 512
data = fluid.data(
name='data', shape=[1], dtype='int64', lod_level=1)
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
x = fluid.layers.fc(input=emb, size=hidden_dim * 4, bias_attr=False)
forward, cell = fluid.layers.dynamic_lstm(
input=x, size=hidden_dim * 4)
batch = 16
lod_tensor = fluid.LoDTensor()
lod_tensor.set(np.random.randint(
0, dict_dim, size=[batch]).astype("int64"),
fluid.CPUPlace())
lod_tensor.set_lod([[0, batch]])
self.feeds = {"data": lod_tensor}
self.fetch_list = [forward, cell]
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
self.assertTrue(PassVersionChecker.IsCompatible('mul_lstm_fuse_pass'))
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
class RepeatedFcReluFusePass3Test(InferencePassTest):
def setUp(self):
fc_num = 3
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
param_attr = fluid.ParamAttr(
initializer=fluid.initializer.Xavier(uniform=False),
learning_rate=0.001)
conv_out = fluid.layers.conv2d(
input=data,
num_filters=3,
filter_size=3,
bias_attr=param_attr,
act=None)
fc_outs = []
fc_outs.append(
fluid.layers.fc(input=[conv_out], act="relu", size=1000))
for i in range(1, fc_num):
fc_outs.append(
fluid.layers.fc(
input=[fc_outs[i - 1]], act="relu", size=1000))
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [fc_outs[fc_num - 1]]
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
self.assertTrue(
PassVersionChecker.IsCompatible('repeated_fc_relu_fuse_pass'))
class RepeatedFcReluFusePass9Test(InferencePassTest):
def setUp(self):
fc_num = 9
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
param_attr = fluid.ParamAttr(
initializer=fluid.initializer.Xavier(uniform=False),
learning_rate=0.001)
conv_out = fluid.layers.conv2d(
input=data,
num_filters=3,
filter_size=3,
bias_attr=param_attr,
act=None)
fc_outs = []
fc_outs.append(
fluid.layers.fc(input=[conv_out], act="relu", size=1000))
for i in range(1, fc_num):
fc_outs.append(
fluid.layers.fc(
input=[fc_outs[i - 1]], act="relu", size=1000))
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.fetch_list = [fc_outs[fc_num - 1]]
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
self.assertTrue(
PassVersionChecker.IsCompatible('repeated_fc_relu_fuse_pass'))
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import PassVersionChecker
class SquaredMatSubFusePassTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data_a = fluid.data(name="data_a", shape=[128, 1], dtype="float32")
data_b = fluid.data(name="data_b", shape=[256, 1], dtype="float32")
fc_a = fluid.layers.fc(data_a, size=256)
fc_b = fluid.layers.fc(data_b, size=64)
data_a_square = paddle.square(fc_a)
data_b_square = paddle.square(fc_b)
matmul_ab = paddle.matmul(fc_a, fc_b)
matmul_ab_square = paddle.square(matmul_ab)
matmul_square_ab = paddle.matmul(data_a_square, data_b_square)
scale = paddle.fill_constant(shape=[1], value=0.5, dtype='float32')
sub_val = paddle.elementwise_sub(matmul_ab_square, matmul_square_ab)
squared_mat_sub_out = fluid.layers.elementwise_mul(sub_val, scale)
self.feeds = {
"data_a": np.random.random((128, 1)).astype("float32"),
"data_b": np.random.random((256, 1)).astype("float32")
}
self.fetch_list = [squared_mat_sub_out]
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
self.assertTrue(
PassVersionChecker.IsCompatible('squared_mat_sub_fuse_pass'))
if __name__ == "__main__":
unittest.main()
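For reference, the graph built in the test above is exactly the algebraic pattern the pass targets, restated here in NumPy with illustrative shapes:

import numpy as np

a = np.random.rand(4, 3).astype('float32')
b = np.random.rand(3, 5).astype('float32')
out = 0.5 * (np.square(a @ b) - np.square(a) @ np.square(b))
print(out.shape)    # (4, 5)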
...@@ -75,7 +75,9 @@ class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest): ...@@ -75,7 +75,9 @@ class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest):
use_gpu = True use_gpu = True
self.check_output_with_option(use_gpu) self.check_output_with_option(use_gpu)
PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass') self.assertTrue(
PassVersionChecker.IsCompatible(
'transpose_flatten_concat_fuse_pass'))
if __name__ == "__main__": if __name__ == "__main__":
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
from paddle.fluid.core import AnalysisConfig
class ShuffleChannelFuseTRTPassTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 6, 64, 64], dtype="float32")
reshape1 = fluid.layers.reshape(x=data, shape=[-1, 2, 3, 64, 64])
trans = fluid.layers.transpose(x=reshape1, perm=[0, 2, 1, 3, 4])
reshape2 = fluid.layers.reshape(x=trans, shape=[-1, 6, 64, 64])
out = fluid.layers.batch_norm(reshape2, is_test=True)
self.feeds = {
"data": np.random.random([1, 6, 64, 64]).astype("float32"),
}
self.enable_trt = True
self.trt_parameters = ShuffleChannelFuseTRTPassTest.TensorRTParam(
1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
self.fetch_list = [out]
def test_check_output(self):
self.check_output()
self.assertTrue(
PassVersionChecker.IsCompatible('shuffle_channel_detect_pass'))
if __name__ == "__main__":
unittest.main()
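The reshape-transpose-reshape chain in the test above is a ShuffleNet-style channel shuffle with two groups; the same data movement restated in NumPy:

import numpy as np

x = np.random.rand(1, 6, 64, 64).astype('float32')
shuffled = x.reshape(1, 2, 3, 64, 64).transpose(0, 2, 1, 3, 4).reshape(1, 6, 64, 64)
assert shuffled.shape == x.shape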
...@@ -306,5 +306,70 @@ class TestFakeQuantDequantAbsOp(OpTest): ...@@ -306,5 +306,70 @@ class TestFakeQuantDequantAbsOp(OpTest):
self.check_grad(["X"], "Out", user_defined_grads=gradient) self.check_grad(["X"], "Out", user_defined_grads=gradient)
class TestChannelWiseFakeQuantDequantOp(OpTest):
def setUp(self):
self.set_arg()
assert self.quant_axis in [0, 1], "quant_axis should be 0 or 1."
self.op_type = "fake_channel_wise_quantize_dequantize_abs_max"
self.attrs = {'bit_length': 8, 'quant_axis': self.quant_axis}
scales = []
outputs = self.inputs['X'].copy()
range_v = (1 << (self.attrs['bit_length'] - 1)) - 1
if self.quant_axis == 0:
for i in range(self.inputs['X'].shape[0]):
scale_v = np.max(np.abs(self.inputs['X'][i])).astype("float32")
scales.append(scale_v)
outputs[i] = np.round(outputs[i] * range_v /
scale_v) * scale_v / range_v
elif self.quant_axis == 1:
for i in range(self.inputs['X'].shape[1]):
scale_v = np.max(np.abs(self.inputs['X'][:, i])).astype(
"float32")
scales.append(scale_v)
outputs[:, i] = np.round(outputs[:, i] * range_v /
scale_v) * scale_v / range_v
self.outputs = {
'Out': outputs,
'OutScale': np.array(scales).astype("float32"),
}
def set_arg(self):
self.quant_axis = 0
self.inputs = {
'X': np.random.random((3, 4, 64, 64)).astype("float32"),
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
x = self.inputs["X"]
        gradient = [np.ones(x.shape) / np.prod(x.shape)]
self.check_grad(["X"], "Out", user_defined_grads=gradient)
class TestChannelWiseFakeQuantDequantOp1(TestChannelWiseFakeQuantDequantOp):
def set_arg(self):
self.quant_axis = 1
self.inputs = {
'X': np.random.random((15, 20, 5, 5)).astype("float32"),
}
class TestChannelWiseFakeQuantDequantOp2(TestChannelWiseFakeQuantDequantOp):
def set_arg(self):
self.quant_axis = 0
self.inputs = {'X': np.random.random((30, 15)).astype("float32"), }
class TestChannelWiseFakeQuantDequantOp3(TestChannelWiseFakeQuantDequantOp):
def set_arg(self):
self.quant_axis = 1
self.inputs = {'X': np.random.random((30, 15)).astype("float32"), }
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -24,10 +24,10 @@ import numpy as np ...@@ -24,10 +24,10 @@ import numpy as np
class TestFleetBase(unittest.TestCase): class TestFleetBase(unittest.TestCase):
def setUp(self): def setUp(self):
os.environ["POD_IP"] = "127.0.0.1" os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36000"
os.environ["PADDLE_TRAINERS_NUM"] = "2" os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
"127.0.0.1:36001,127.0.0.2:36001" "127.0.0.1:36001,127.0.0.2:36002"
def test_init(self): def test_init(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True) role = role_maker.PaddleCloudRoleMaker(is_collective=True)
...@@ -58,37 +58,56 @@ class TestFleetBase(unittest.TestCase): ...@@ -58,37 +58,56 @@ class TestFleetBase(unittest.TestCase):
def test_worker_endpoints(self): def test_worker_endpoints(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True) role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role) fleet.init(role)
print(fleet.worker_endpoints(to_string=True)) self.assertEqual(
"127.0.0.1:36000", fleet.worker_endpoints(to_string=True))
self.assertEqual(["127.0.0.1:36000"], fleet.worker_endpoints())
def test_server_num(self): def test_server_num(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True) os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PORT"] = "36001"
os.environ["POD_IP"] = "127.0.0.1"
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role) fleet.init(role)
if fleet.is_server(): os.environ["PADDLE_TRAINERS_NUM"] = "2"
print("fleet server num: {}".format(fleet.server_num())) self.assertEqual(2, fleet.server_num())
def test_server_index(self): def test_server_index(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True) os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PORT"] = "36001"
os.environ["POD_IP"] = "127.0.0.1"
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role) fleet.init(role)
if fleet.is_server(): self.assertEqual(0, fleet.server_index())
print("fleet server index: {}".format(fleet.server_index()))
def test_server_endpoints(self): def test_server_endpoints(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True) os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PORT"] = "36001"
os.environ["POD_IP"] = "127.0.0.1"
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role) fleet.init(role)
if fleet.is_server(): if fleet.is_server():
print("fleet server index: {}".format( self.assertEqual(
fleet.server_endpoints(to_string=True))) "127.0.0.1:36001,127.0.0.2:36002",
fleet.server_endpoints(to_string=True))
self.assertEqual(["127.0.0.1:36001", "127.0.0.2:36002"],
fleet.server_endpoints())
def test_is_server(self): def test_is_server(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True) os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PORT"] = "36001"
os.environ["POD_IP"] = "127.0.0.1"
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role) fleet.init(role)
if fleet.is_server(): self.assertTrue(fleet.is_server())
print("test fleet is server")
def test_util(self): def test_util(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True) role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role) fleet.init(role)
self.assertEqual(fleet.util, None) self.assertEqual(fleet.util(), None)
def test_barrier_worker(self): def test_barrier_worker(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True) role = role_maker.PaddleCloudRoleMaker(is_collective=True)
...@@ -99,20 +118,17 @@ class TestFleetBase(unittest.TestCase): ...@@ -99,20 +118,17 @@ class TestFleetBase(unittest.TestCase):
def test_init_worker(self): def test_init_worker(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True) role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role) fleet.init(role)
if fleet.is_worker():
fleet.init_worker()
def test_run_server(self): with self.assertRaises(ValueError):
role = role_maker.PaddleCloudRoleMaker(is_collective=True) if fleet.is_worker():
fleet.init(role) fleet.init_worker()
if fleet.is_worker():
fleet.run_worker()
def test_stop_worker(self): def test_stop_worker(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True) role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role) fleet.init(role)
if fleet.is_worker(): with self.assertRaises(ValueError):
fleet.stop_worker() if fleet.is_worker():
fleet.stop_worker()
def test_distributed_optimizer(self): def test_distributed_optimizer(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True) role = role_maker.PaddleCloudRoleMaker(is_collective=True)
......
...@@ -87,7 +87,7 @@ class TestCloudRoleMaker2(unittest.TestCase): ...@@ -87,7 +87,7 @@ class TestCloudRoleMaker2(unittest.TestCase):
role2._all_gather(1) role2._all_gather(1)
role2._all_gather(1) role2._all_gather(1)
role2._barrier_server() role2._barrier_server()
role2.all_gather(1) role2._all_gather(1)
role3 = GeneralRoleMaker(path="./test_gloo_3") role3 = GeneralRoleMaker(path="./test_gloo_3")
role3._worker_gather(1) role3._worker_gather(1)
role3._worker_gather(1) role3._worker_gather(1)
......
...@@ -15,7 +15,11 @@ ...@@ -15,7 +15,11 @@
from __future__ import print_function from __future__ import print_function
import os import os
import platform
import shutil
import tempfile
import unittest import unittest
import paddle
import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker
...@@ -26,25 +30,25 @@ class TestRoleMakerBase(unittest.TestCase): ...@@ -26,25 +30,25 @@ class TestRoleMakerBase(unittest.TestCase):
def test_rolemaker_base(self): def test_rolemaker_base(self):
role = role_maker.RoleMakerBase() role = role_maker.RoleMakerBase()
self.assertRaises(Exception, role.is_worker) self.assertRaises(Exception, role._is_worker)
self.assertRaises(Exception, role.is_server) self.assertRaises(Exception, role._is_server)
self.assertRaises(Exception, role.is_first_worker) self.assertRaises(Exception, role._is_first_worker)
self.assertRaises(Exception, role.worker_num) self.assertRaises(Exception, role._worker_num)
self.assertRaises(Exception, role.server_num) self.assertRaises(Exception, role._server_num)
self.assertRaises(Exception, role.worker_index) self.assertRaises(Exception, role._worker_index)
self.assertRaises(Exception, role.server_index) self.assertRaises(Exception, role._server_index)
self.assertRaises(Exception, role.role_id) self.assertRaises(Exception, role._role_id)
self.assertRaises(Exception, role.node_num) self.assertRaises(Exception, role._node_num)
trainer_endpoints = role.get_trainer_endpoints() trainer_endpoints = role._get_trainer_endpoints()
self.assertTrue(len(trainer_endpoints) == 0) self.assertTrue(len(trainer_endpoints) == 0)
pserver_endpoints = role.get_pserver_endpoints() pserver_endpoints = role._get_pserver_endpoints()
self.assertTrue(len(pserver_endpoints) == 0) self.assertTrue(len(pserver_endpoints) == 0)
print(role.to_string()) print(role.to_string())
self.assertTrue(role._all_gather(role._node_type_comm, 1) is None) self.assertTrue(role._all_gather(1, "worker") is None)
self.assertTrue(role._all_reduce(role._node_type_comm, 1) is None) self.assertTrue(role._all_reduce(1, "sum", "worker") is None)
role._barrier(role._node_type_comm) role._barrier("worker")
class TestCloudRoleMaker(unittest.TestCase): class TestCloudRoleMaker(unittest.TestCase):
...@@ -72,21 +76,33 @@ class TestCloudRoleMaker(unittest.TestCase): ...@@ -72,21 +76,33 @@ class TestCloudRoleMaker(unittest.TestCase):
print("warning: no netifaces, skip test_tr_rolemaker") print("warning: no netifaces, skip test_tr_rolemaker")
return return
ro = role_maker.PaddleCloudRoleMaker( ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
is_collective=False, init_gloo=False) self.assertTrue(ro._is_worker())
self.assertTrue(ro.is_worker()) ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
self.assertFalse(ro.is_server()) self.assertFalse(ro._is_server())
self.assertEqual(ro.worker_num(), 2) ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
self.assertTrue(ro.is_first_worker()) self.assertEqual(ro._worker_num(), 2)
worker_endpoints = ro.get_trainer_endpoints() ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
self.assertTrue(ro._is_first_worker())
ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
worker_endpoints = ro._get_trainer_endpoints()
self.assertEqual(worker_endpoints[0], '127.0.0.1:36001') self.assertEqual(worker_endpoints[0], '127.0.0.1:36001')
self.assertEqual(ro.role_id(), 0) ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
self.assertEqual(ro.node_num(), 2) self.assertEqual(ro._role_id(), 0)
ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
self.assertEqual(ro._node_num(), 2)
ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
self.assertFalse(ro._is_non_distributed())
ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
self.assertEqual(ro._heter_worker_num(), 0)
ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
self.assertFalse(ro._is_heter_worker())
def test_tr_rolemaker_collective(self): def test_tr_rolemaker_collective(self):
ro = role_maker.PaddleCloudRoleMaker(is_collective=True) ro = role_maker.PaddleCloudRoleMaker(is_collective=True)
self.assertEqual(ro.worker_num(), 2) self.assertEqual(ro._worker_num(), 2)
self.assertEqual(ro.node_num(), 2) ro = role_maker.PaddleCloudRoleMaker(is_collective=True)
self.assertEqual(ro._node_num(), 2)
def test_ps_rolemaker(self): def test_ps_rolemaker(self):
"""Test ps rolemaker.""" """Test ps rolemaker."""
...@@ -102,14 +118,15 @@ class TestCloudRoleMaker(unittest.TestCase): ...@@ -102,14 +118,15 @@ class TestCloudRoleMaker(unittest.TestCase):
ro = role_maker.PaddleCloudRoleMaker( ro = role_maker.PaddleCloudRoleMaker(
is_collective=False, init_gloo=False) is_collective=False, init_gloo=False)
self.assertEqual(ro.server_index(), 0) self.assertEqual(ro._server_index(), 0)
self.assertFalse(ro.is_worker()) self.assertFalse(ro._is_worker())
self.assertTrue(ro.is_server()) self.assertTrue(ro._is_server())
self.assertEqual(ro.server_num(), 2) self.assertEqual(ro._server_num(), 2)
pserver_endpoints = ro.get_pserver_endpoints() pserver_endpoints = ro._get_pserver_endpoints()
self.assertEqual(pserver_endpoints[0], '127.0.0.1:36001') self.assertEqual(pserver_endpoints[0], '127.0.0.1:36001')
self.assertTrue(ro._all_gather(ro._all_comm, 1) is None)
self.assertTrue(ro._all_reduce(ro._all_comm, 1) is None) self.assertEqual(ro._all_gather(1, "worker"), 1)
self.assertEqual(ro._all_reduce(1, "sum", "worker"), 1)
def test_traing_role(self): def test_traing_role(self):
"""Test training role.""" """Test training role."""
...@@ -121,7 +138,7 @@ class TestCloudRoleMaker(unittest.TestCase): ...@@ -121,7 +138,7 @@ class TestCloudRoleMaker(unittest.TestCase):
return return
ro = role_maker.PaddleCloudRoleMaker(is_collective=False) ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
self.assertRaises(ValueError, ro.generate_role) self.assertRaises(ValueError, ro._generate_role)
class TestUserDefinedRoleMaker(unittest.TestCase): class TestUserDefinedRoleMaker(unittest.TestCase):
...@@ -142,14 +159,14 @@ class TestUserDefinedRoleMaker(unittest.TestCase): ...@@ -142,14 +159,14 @@ class TestUserDefinedRoleMaker(unittest.TestCase):
ro = role_maker.UserDefinedRoleMaker( ro = role_maker.UserDefinedRoleMaker(
is_collective=False, is_collective=False,
init_gloo=False, init_gloo=False,
server_endpoints="127.0.0.1:36001,127.0.0.1:36001", server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"],
role=role_maker.Role.SERVER, role=role_maker.Role.SERVER,
current_id=0, current_id=0,
worker_num=2) worker_num=2)
self.assertEqual(ro.server_num(), 2) self.assertEqual(ro._server_num(), 2)
ro.generate_role() ro._generate_role()
self.assertTrue(ro.is_server()) self.assertTrue(ro._is_server())
self.assertEqual(ro.role_id(), 0) self.assertEqual(ro._role_id(), 0)
def test_tr_rolemaker(self): def test_tr_rolemaker(self):
try: try:
...@@ -161,13 +178,589 @@ class TestUserDefinedRoleMaker(unittest.TestCase): ...@@ -161,13 +178,589 @@ class TestUserDefinedRoleMaker(unittest.TestCase):
ro = role_maker.UserDefinedRoleMaker( ro = role_maker.UserDefinedRoleMaker(
is_collective=False, is_collective=False,
init_gloo=False, init_gloo=False,
server_endpoints="127.0.0.1:36001,127.0.0.1:36001", server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"],
role=role_maker.Role.WORKER, role=role_maker.Role.WORKER,
current_id=0, current_id=0,
worker_num=2) worker_num=2)
self.assertIn("127.0.0.1:36001", ro.get_pserver_endpoints())
self.assertTrue(ro.is_worker()) self.assertIn("127.0.0.1:36001", ro._get_pserver_endpoints())
self.assertEqual(ro.role_id(), 0) self.assertTrue(ro._is_worker())
self.assertEqual(ro._role_id(), 0)
class TestGlooWithCloudRoleMaker(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_TRAINERS_NUM"] = "1"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_TRAINER_ID"] = "0"
def case(self, role, comm_world):
role._barrier(comm_world)
gather = role._all_gather(1, comm_world)
self.assertEqual(gather[0], 1)
all_reduce = role._all_reduce(1, "sum", comm_world)
self.assertEqual(1, all_reduce)
def mkdir(self):
tmp = tempfile.mkdtemp()
return tmp
def clean(self, tmp):
shutil.rmtree(tmp)
def test_hdfs_gloo(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
self.case(role, "worker")
self.clean(tmp)
def test_fs_gloo(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
self.case(role, "worker")
self.clean(tmp)
def test_fs_gloo2(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
self.case(role, "server")
self.clean(tmp)
def test_fs_gloo3(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
self.case(role, "server")
self.clean(tmp)
def test_fs_gloo4(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3"
os.environ["PADDLE_GLOO_HTTP_HOST"] = "127.0.0.1"
os.environ["PADDLE_GLOO_HTTP_PORT"] = "30019"
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
import time
time.sleep(3)
def test_fs_gloo5(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINERS_NUM"] = "0"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "2"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
self.case(role, "server")
self.case(role, "all")
self.clean(tmp)
def test_fs_gloo6(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINERS_NUM"] = "0"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "2"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
self.case(role, "server")
self.case(role, "all")
self.clean(tmp)
def test_fs_gloo7(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINERS_NUM"] = "0"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "5"
role = role_maker.PaddleCloudRoleMaker()
self.assertRaises(ValueError, role._generate_role)
def test_fs_gloo8(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINERS_NUM"] = "0"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "2"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
def net():
x = paddle.fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = paddle.fluid.layers.fc(input=x, size=1, act=None)
y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = paddle.fluid.layers.square_error_cost(
input=y_predict, label=y)
avg_cost = paddle.fluid.layers.mean(cost)
return avg_cost
from paddle.distributed import fleet
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)
avg_cost = net()
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = False
optimizer = paddle.optimizer.SGD(0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
comm_world = "server"
fleet.util().barrier(comm_world)
gather = fleet.util().all_gather(1, comm_world)
self.assertEqual(gather[0], 1)
all_reduce = fleet.util().all_reduce(1, "sum", comm_world)
self.assertEqual(1, all_reduce)
self.clean(tmp)
class TestGlooWithCloudRoleMaker(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_TRAINERS_NUM"] = "1"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_TRAINER_ID"] = "0"
def case(self, role, comm_world):
role._barrier(comm_world)
gather = role._all_gather(1, comm_world)
self.assertEqual(gather[0], 1)
all_reduce = role._all_reduce(1, "sum", comm_world)
self.assertEqual(1, all_reduce)
def mkdir(self):
tmp = tempfile.mkdtemp()
return tmp
def clean(self, tmp):
shutil.rmtree(tmp)
def test_hdfs_gloo(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
self.case(role, "worker")
self.clean(tmp)
def test_fs_gloo(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
self.case(role, "worker")
self.clean(tmp)
def test_fs_gloo2(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
self.case(role, "server")
self.clean(tmp)
def test_fs_gloo3(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
self.case(role, "server")
self.clean(tmp)
def test_fs_gloo4(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3"
os.environ["PADDLE_GLOO_HTTP_HOST"] = "127.0.0.1"
os.environ["PADDLE_GLOO_HTTP_PORT"] = "30019"
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
import time
time.sleep(3)
def test_fs_gloo5(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINERS_NUM"] = "0"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "2"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
self.case(role, "server")
self.case(role, "all")
self.clean(tmp)
def test_fs_gloo6(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINERS_NUM"] = "0"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "2"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
role = role_maker.PaddleCloudRoleMaker()
role._generate_role()
self.case(role, "server")
self.case(role, "all")
self.clean(tmp)
def test_fs_gloo7(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINERS_NUM"] = "0"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "5"
role = role_maker.PaddleCloudRoleMaker()
self.assertRaises(ValueError, role._generate_role)
def test_hdfs_gloo_v2(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
os.environ["PADDLE_GLOO_FS_NAME"] = ""
os.environ["PADDLE_GLOO_FS_UGI"] = ""
os.environ["PADDLE_GLOO_FS_PATH"] = ""
role = role_maker.PaddleCloudRoleMaker()
self.assertRaises(ValueError, role._generate_role)
def test_fs_gloo_v2(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINERS_NUM"] = "0"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
os.environ["PADDLE_GLOO_FS_PATH"] = ""
role = role_maker.PaddleCloudRoleMaker()
self.assertRaises(ValueError, role._generate_role)
def test_http_gloo_v2(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3"
os.environ["PADDLE_GLOO_HTTP_HOST"] = ""
os.environ["PADDLE_GLOO_HTTP_PORT"] = ""
role = role_maker.PaddleCloudRoleMaker()
self.assertRaises(ValueError, role._generate_role)
def test_fs_gloo8(self):
plats = platform.platform()
if 'Linux' not in plats:
print("skip gloo UT on MacOS/Win")
return
tmp = self.mkdir()
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINERS_NUM"] = "0"
os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
os.environ["PADDLE_WITH_GLOO"] = "2"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
os.environ["PADDLE_GLOO_FS_PATH"] = tmp
def net():
x = paddle.fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = paddle.fluid.layers.fc(input=x, size=1, act=None)
y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = paddle.fluid.layers.square_error_cost(
input=y_predict, label=y)
avg_cost = paddle.fluid.layers.mean(cost)
return avg_cost
from paddle.distributed import fleet
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)
avg_cost = net()
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = False
optimizer = paddle.optimizer.SGD(0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
comm_world = "server"
fleet.util().barrier(comm_world)
gather = fleet.util().all_gather(1, comm_world)
self.assertEqual(gather[0], 1)
all_reduce = fleet.util().all_reduce(1, "sum", comm_world)
self.assertEqual(1, all_reduce)
self.clean(tmp)
if __name__ == "__main__": if __name__ == "__main__":
......
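Pulling the gloo cases together: PADDLE_GLOO_RENDEZVOUS selects among three rendezvous back ends, and unsupported values make _generate_role raise ValueError (test_fs_gloo7). A sketch of what each mode expects, inferred from the tests above (host/port/path values are placeholders):
import os
# mode "1" (HDFS) needs PADDLE_GLOO_FS_NAME, PADDLE_GLOO_FS_UGI and
# PADDLE_GLOO_FS_PATH; empty values raise ValueError (test_hdfs_gloo_v2).
# mode "2" (shared filesystem) needs only PADDLE_GLOO_FS_PATH
# (empty raises, test_fs_gloo_v2).
# mode "3" (HTTP) needs host and port (empty raises, test_http_gloo_v2):
os.environ["PADDLE_WITH_GLOO"] = "1"
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3"
os.environ["PADDLE_GLOO_HTTP_HOST"] = "127.0.0.1"
os.environ["PADDLE_GLOO_HTTP_PORT"] = "30019"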
...@@ -59,7 +59,7 @@ class TestFleetUtil(unittest.TestCase): ...@@ -59,7 +59,7 @@ class TestFleetUtil(unittest.TestCase):
import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker
role = role_maker.PaddleCloudRoleMaker(is_collective=True) role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role) fleet.init(role)
default_util = fleet.util default_util = fleet.util()
self.assertEqual(default_util, None) self.assertEqual(default_util, None)
def test_set_user_defined_util(self): def test_set_user_defined_util(self):
...@@ -76,8 +76,8 @@ class TestFleetUtil(unittest.TestCase): ...@@ -76,8 +76,8 @@ class TestFleetUtil(unittest.TestCase):
role = role_maker.PaddleCloudRoleMaker(is_collective=True) role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role) fleet.init(role)
my_util = UserDefinedUtil() my_util = UserDefinedUtil()
fleet.util = my_util fleet.set_util(my_util)
user_id = fleet.util.get_user_id() user_id = fleet.util().get_user_id()
self.assertEqual(user_id, 10) self.assertEqual(user_id, 10)
def test_fs(self): def test_fs(self):
...@@ -88,97 +88,6 @@ class TestFleetUtil(unittest.TestCase): ...@@ -88,97 +88,6 @@ class TestFleetUtil(unittest.TestCase):
self.assertFalse(fs.need_upload_download()) self.assertFalse(fs.need_upload_download())
fleet_util._set_file_system(fs) fleet_util._set_file_system(fs)
def test_barrier(self):
try:
import netifaces
except:
print("warning: no netifaces, skip test_barrier")
return
gloo = fluid.core.Gloo()
gloo.set_rank(0)
gloo.set_size(1)
gloo.set_prefix("123")
gloo.set_iface("lo")
gloo.set_hdfs_store("./tmp_test_fleet_barrier", "", "")
gloo.init()
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.SERVER,
worker_endpoints=["127.0.0.1:6003"],
server_endpoints=["127.0.0.1:6001"])
role._node_type_comm = gloo
role._role_is_generated = True
fleet_util._set_role_maker(role)
fleet_util.barrier("worker")
def test_all_reduce(self):
try:
import netifaces
except:
print("warning: no netifaces, skip test_all_reduce")
return
gloo = fluid.core.Gloo()
gloo.set_rank(0)
gloo.set_size(1)
gloo.set_prefix("123")
gloo.set_iface("lo")
gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
gloo.init()
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.WORKER,
worker_endpoints=["127.0.0.1:6003"],
server_endpoints=["127.0.0.1:6001"])
role._node_type_comm = gloo
role._role_is_generated = True
fleet_util._set_role_maker(role)
output = fleet_util.all_reduce(1, "sum", comm_world="server")
print(output)
# self.assertEqual(output, 1)
def test_all_gather(self):
try:
import netifaces
except:
print("warning: no netifaces, skip test_all_gather")
return
gloo = fluid.core.Gloo()
gloo.set_rank(0)
gloo.set_size(1)
gloo.set_prefix("123")
gloo.set_iface("lo")
gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
gloo.init()
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.SERVER,
worker_endpoints=["127.0.0.1:6003"],
server_endpoints=["127.0.0.1:6001"])
role._node_type_comm = gloo
role._all_comm = gloo
role._role_is_generated = True
fleet_util._set_role_maker(role)
output = fleet_util.all_gather(1, comm_world="all")
print(output)
# self.assertTrue(len(output) == 1 and output[0] == 1)
self.assertRaises(Exception, fleet_util.all_gather, 1, "test")
def download_files(self): def download_files(self):
path = download(self.proto_data_url, self.module_name, path = download(self.proto_data_url, self.module_name,
self.proto_data_md5) self.proto_data_md5)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.backward import calc_gradient
import numpy as np
class ConvBNLayer(fluid.Layer):
def __init__(self,
num_channels,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
use_cudnn=False):
super(ConvBNLayer, self).__init__()
self._conv = fluid.dygraph.Conv2D(
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=False,
use_cudnn=use_cudnn)
self._batch_norm = fluid.dygraph.BatchNorm(num_filters, act=act)
def forward(self, inputs):
y = self._conv(inputs)
y = self._batch_norm(y)
return y
def create_program():
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
x = fluid.data(name='img', shape=[-1, 3, 224, 224])
x.stop_gradient = False
x = fluid.layers.prelu(x, mode="channel")
conv = ConvBNLayer(
num_channels=3,
num_filters=3,
filter_size=1,
act='relu',
use_cudnn=True)
y = conv(x) + x
loss = fluid.layers.reduce_sum(y)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
return loss, main, startup, conv._conv.weight
class TestInplaceAddto(unittest.TestCase):
def test_result(self):
def run_program(enable_addto):
np.random.seed(10)
paddle.manual_seed(10)
paddle.framework.random._manual_program_seed(10)
if fluid.core.is_compiled_with_cuda():
fluid.set_flags({"FLAGS_cudnn_deterministic": True})
fluid.set_flags({"FLAGS_max_inplace_grad_add": 2})
loss, main, startup, w = create_program()
place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
) else fluid.CPUPlace()
exe = fluid.Executor(place)
strategy = fluid.BuildStrategy()
strategy.enable_addto = enable_addto
compiled = fluid.CompiledProgram(main).with_data_parallel(
loss_name=loss.name, build_strategy=strategy)
exe.run(startup)
img = np.random.uniform(-128, 128,
[8, 3, 224, 224]).astype(np.float32)
for i in range(2):
res = exe.run(compiled,
feed={'img': img},
fetch_list=[loss.name, w.name])
return res
res1, w1 = run_program(True)
res2, w2 = run_program(False)
print(res1, res2)
self.assertTrue(np.array_equal(res1, res2))
if __name__ == "__main__":
unittest.main()
...@@ -63,28 +63,28 @@ class TestTopkOp(OpTest): ...@@ -63,28 +63,28 @@ class TestTopkOp(OpTest):
self.check_grad(set(['X']), 'Out') self.check_grad(set(['X']), 'Out')
class TestTopOp1(TestTopkOp): class TestTopkOp1(TestTopkOp):
def init_args(self): def init_args(self):
self.k = 3 self.k = 3
self.axis = 0 self.axis = 0
self.largest = True self.largest = True
class TestTopOp2(TestTopkOp): class TestTopkOp2(TestTopkOp):
def init_args(self): def init_args(self):
self.k = 3 self.k = 3
self.axis = 0 self.axis = 0
self.largest = False self.largest = False
class TestTopOp3(TestTopkOp): class TestTopkOp3(TestTopkOp):
def init_args(self): def init_args(self):
self.k = 4 self.k = 4
self.axis = 0 self.axis = 0
self.largest = False self.largest = False
class TestTopOp4(TestTopkOp): class TestTopkOp4(TestTopkOp):
def init_args(self): def init_args(self):
self.k = 4 self.k = 4
self.axis = 0 self.axis = 0
...@@ -189,6 +189,8 @@ class TestTopKAPI(unittest.TestCase): ...@@ -189,6 +189,8 @@ class TestTopKAPI(unittest.TestCase):
result1 = paddle.topk(input_tensor, k=2) result1 = paddle.topk(input_tensor, k=2)
result2 = paddle.topk(input_tensor, k=2, axis=-1) result2 = paddle.topk(input_tensor, k=2, axis=-1)
result3 = paddle.topk(input_tensor, k=k_tensor, axis=1) result3 = paddle.topk(input_tensor, k=k_tensor, axis=1)
self.assertEqual(result3[0].shape, (6, -1, 8))
self.assertEqual(result3[1].shape, (6, -1, 8))
result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False) result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False)
result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False) result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False)
result6 = paddle.topk(large_input_tensor, k=1, axis=-1) result6 = paddle.topk(large_input_tensor, k=1, axis=-1)
...@@ -239,6 +241,15 @@ class TestTopKAPI(unittest.TestCase): ...@@ -239,6 +241,15 @@ class TestTopKAPI(unittest.TestCase):
self.run_dygraph(place) self.run_dygraph(place)
self.run_static(place) self.run_static(place)
def test_errors(self):
paddle.disable_static()
x = paddle.to_tensor([1, 2, 3])
with self.assertRaises(BaseException):
paddle.topk(x, k=-1)
with self.assertRaises(BaseException):
paddle.topk(x, k=0)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
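For orientation, paddle.topk as exercised above returns a (values, indices) pair. A minimal dygraph usage sketch (the commented results are illustrative):
import paddle
paddle.disable_static()
x = paddle.to_tensor([1.0, 4.0, 5.0, 7.0])
values, indices = paddle.topk(x, k=2)  # largest first by default
# values -> [7., 5.], indices -> [3, 2]
values, indices = paddle.topk(x, k=2, largest=False)
# values -> [1., 4.], indices -> [0, 1]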
...@@ -474,6 +474,141 @@ class TestTransformer(unittest.TestCase): ...@@ -474,6 +474,141 @@ class TestTransformer(unittest.TestCase):
trans_output = transformer(src, tgt, src_mask, tgt_mask, trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask) memory_mask)
def test_transformer_attr_1(self):
batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
mode="decoder_layer")
# batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
with fluid.dygraph.guard(fluid.CPUPlace()):
transformer = Transformer(
d_model,
n_head,
dim_feedforward=dim_feedforward,
dropout=dropout,
weight_attr=[None],
bias_attr=[False])
src = paddle.to_variable(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
def test_transformer_attr_2(self):
batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
mode="decoder_layer")
# batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
with fluid.dygraph.guard(fluid.CPUPlace()):
transformer = Transformer(
d_model,
n_head,
dim_feedforward=dim_feedforward,
dropout=dropout,
weight_attr=[None, None],
bias_attr=[False, False])
src = paddle.to_variable(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
def test_transformer_attr_3(self):
batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
mode="decoder_layer")
# batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
with fluid.dygraph.guard(fluid.CPUPlace()):
transformer = Transformer(
d_model,
n_head,
dim_feedforward=dim_feedforward,
dropout=dropout,
weight_attr=[None, None, None],
bias_attr=[False, False, True])
src = paddle.to_variable(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
def test_transformer_attr_boolean(self):
batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
mode="decoder_layer")
# batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
with fluid.dygraph.guard(fluid.CPUPlace()):
transformer = Transformer(
d_model,
n_head,
dim_feedforward=dim_feedforward,
dropout=dropout,
bias_attr=False)
src = paddle.to_variable(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -53,7 +53,22 @@ def _convert_param_attr_to_list(param_attr, n): ...@@ -53,7 +53,22 @@ def _convert_param_attr_to_list(param_attr, n):
if isinstance(param_attr, (list, tuple)): if isinstance(param_attr, (list, tuple)):
assert len(param_attr) == n, ( assert len(param_attr) == n, (
"length of param_attr should be %d when it is a list/tuple" % n) "length of param_attr should be %d when it is a list/tuple" % n)
param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] param_attrs = []
for attr in param_attr:
if isinstance(attr, bool):
if attr:
param_attrs.append(ParamAttr._to_attr(None))
else:
param_attrs.append(False)
else:
param_attrs.append(ParamAttr._to_attr(attr))
elif isinstance(param_attr, bool):
param_attrs = []
if param_attr:
param_attrs = [ParamAttr._to_attr(None) for i in range(n)]
else:
param_attrs = [False] * n
else: else:
param_attrs = [] param_attrs = []
attr = ParamAttr._to_attr(param_attr) attr = ParamAttr._to_attr(param_attr)
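In effect, the boolean handling added above normalizes attrs like this (hypothetical illustration, not part of the diff):
# _convert_param_attr_to_list(True, 3)           -> [default ParamAttr] * 3
# _convert_param_attr_to_list(False, 3)          -> [False, False, False]
# _convert_param_attr_to_list([True, False], 2)  -> [default ParamAttr, False]
# non-boolean entries still pass through ParamAttr._to_attr as before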
...@@ -417,7 +432,7 @@ class TransformerEncoderLayer(Layer): ...@@ -417,7 +432,7 @@ class TransformerEncoderLayer(Layer):
Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.
Default: None, which means the default weight parameter property is used. Default: None, which means the default weight parameter property is used.
See usage for details in :code:`ParamAttr` . See usage for details in :code:`ParamAttr` .
bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property.
If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.
Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.
...@@ -986,22 +1001,31 @@ class Transformer(Layer): ...@@ -986,22 +1001,31 @@ class Transformer(Layer):
Otherwise, no pre-process and post-process includes dropout, residual Otherwise, no pre-process and post-process includes dropout, residual
connection, layer normalization. Default False connection, layer normalization. Default False
weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property. weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for If it is a tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3,
self attention, `weight_attr[1]` would be used as `weight_attr` for `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]`
cross attention, and `weight_attr[2]` would be used as `weight_attr` would be used as `weight_attr` for cross attention of `TransformerDecoder`,
for linear in FFN. Otherwise, the three sub-layers all uses it as and `weight_attr[2]` would be used as `weight_attr` for linear in FFN.
`weight_attr` to create parameters. Default: None, which means the If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention
default weight parameter property is used. See usage for details and cross attention and `weight_attr[1]` would be used as `weight_attr` for
linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr`
for self attention, cross attention and linear in FFN. Otherwise,
the three sub-layers all use it as `weight_attr` to create parameters.
Default: None, which means the default weight parameter property is used.
See usage for details
in :code:`ParamAttr` . in :code:`ParamAttr` .
bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for If it is a tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3,
self attention, `bias_attr[1]` would be used as `bias_attr` for `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]`
cross attention, and `bias_attr[2]` would be used as `bias_attr` would be used as `bias_attr` for cross attention of `TransformerDecoder`,
for linear in FFN. Otherwise, the three sub-layers all uses it as and `bias_attr[2]` would be used as `bias_attr` for linear in FFN.
`bias_attr` to create parameters. The `False` value means the If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention
corresponding layer would not have trainable bias parameter. See and cross attention and `bias_attr[1]` would be used as `bias_attr` for
usage for details in :code:`ParamAttr` . Default: None,which means linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr`
the default bias parameter property is used. for self attention, cross attention and linear in FFN. Otherwise,
the three sub-layers all use it as `bias_attr` to create parameters.
The `False` value means the corresponding layer would not have trainable
bias parameter. See usage for details in :code:`ParamAttr` .
Default: None, which means the default bias parameter property is used.
custom_encoder (Layer): If custom encoder is provided, use it as the encoder. custom_encoder (Layer): If custom encoder is provided, use it as the encoder.
Default None Default None
custom_decoder (Layer): If custom decoder is provided, use it as the decoder. custom_decoder (Layer): If custom decoder is provided, use it as the decoder.
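To summarize the 1/2/3-element convention documented above (as implemented in __init__ just below; a, b, c stand for arbitrary attr entries):
# attr list given    encoder layers get    decoder layers get
# [a]                [a, a]                [a, a, a]
# [a, b]             [a, b]                [a, a, b]
# [a, b, c]          [a, c]                [a, b, c]
# any other list length fails the assertion in __init__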
...@@ -1049,13 +1073,51 @@ class Transformer(Layer): ...@@ -1049,13 +1073,51 @@ class Transformer(Layer):
custom_decoder=None): custom_decoder=None):
super(Transformer, self).__init__() super(Transformer, self).__init__()
if isinstance(bias_attr, (list, tuple)):
if len(bias_attr) == 1:
encoder_bias_attr = [bias_attr[0]] * 2
decoder_bias_attr = [bias_attr[0]] * 3
elif len(bias_attr) == 2:
encoder_bias_attr = bias_attr
decoder_bias_attr = [bias_attr[0], bias_attr[0], bias_attr[-1]]
elif len(bias_attr) == 3:
encoder_bias_attr = [bias_attr[0], bias_attr[-1]]
decoder_bias_attr = bias_attr
else:
assert False, (
"length of bias_attr should be 1 or 2 or 3 when it is a list/tuple"
)
else:
encoder_bias_attr = bias_attr
decoder_bias_attr = bias_attr
if isinstance(weight_attr, (list, tuple)):
if len(weight_attr) == 1:
encoder_weight_attr = [weight_attr[0]] * 2
decoder_weight_attr = [weight_attr[0]] * 3
elif len(weight_attr) == 2:
encoder_weight_attr = weight_attr
decoder_weight_attr = [
weight_attr[0], weight_attr[0], weight_attr[-1]
]
elif len(weight_attr) == 3:
encoder_weight_attr = [weight_attr[0], weight_attr[-1]]
decoder_weight_attr = weight_attr
else:
assert False, (
"length of weight_attr should be 1 or 2 or 3 when it is a list/tuple"
)
else:
encoder_weight_attr = weight_attr
decoder_weight_attr = weight_attr
if custom_encoder is not None: if custom_encoder is not None:
self.encoder = custom_encoder self.encoder = custom_encoder
else: else:
encoder_layer = TransformerEncoderLayer( encoder_layer = TransformerEncoderLayer(
d_model, nhead, dim_feedforward, dropout, activation, d_model, nhead, dim_feedforward, dropout, activation,
attn_dropout, act_dropout, normalize_before, weight_attr, attn_dropout, act_dropout, normalize_before,
bias_attr) encoder_weight_attr, encoder_bias_attr)
encoder_norm = LayerNorm(d_model) encoder_norm = LayerNorm(d_model)
self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
encoder_norm) encoder_norm)
...@@ -1065,8 +1127,8 @@ class Transformer(Layer): ...@@ -1065,8 +1127,8 @@ class Transformer(Layer):
else: else:
decoder_layer = TransformerDecoderLayer( decoder_layer = TransformerDecoderLayer(
d_model, nhead, dim_feedforward, dropout, activation, d_model, nhead, dim_feedforward, dropout, activation,
attn_dropout, act_dropout, normalize_before, weight_attr, attn_dropout, act_dropout, normalize_before,
bias_attr) decoder_weight_attr, decoder_bias_attr)
decoder_norm = LayerNorm(d_model) decoder_norm = LayerNorm(d_model)
self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers,
decoder_norm) decoder_norm)
......
...@@ -282,14 +282,13 @@ class Adam(Optimizer): ...@@ -282,14 +282,13 @@ class Adam(Optimizer):
for param in self._parameter_list: for param in self._parameter_list:
if not param.trainable: if not param.trainable:
continue continue
if hasattr(
param, "_is_sparse"
) and param._is_sparse and self.regularization is not None:
raise RuntimeError(
"Adam don't support weight_decay with sparse parameters, please set it to None."
)
if param._grad_ivar() is not None: if param._grad_ivar() is not None:
grad_var = param._grad_ivar() grad_var = param._grad_ivar()
if hasattr(grad_var, "_is_sparse") and grad_var._is_sparse(
) and self.regularization is not None:
raise RuntimeError(
"Adam don't support weight_decay with sparse parameters, please set it to None."
)
params_grads.append((param, grad_var)) params_grads.append((param, grad_var))
optimize_ops = self._apply_optimize( optimize_ops = self._apply_optimize(
......
...@@ -42,7 +42,7 @@ import paddle.compat as cpt ...@@ -42,7 +42,7 @@ import paddle.compat as cpt
# For more details, please refer to # For more details, please refer to
# https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
# https://bugs.python.org/issue33725 # https://bugs.python.org/issue33725
if sys.version_info >= (3, 8): if sys.version_info >= (3, 8) and sys.platform == 'darwin':
fork_context = multiprocessing.get_context('fork') fork_context = multiprocessing.get_context('fork')
else: else:
fork_context = multiprocessing fork_context = multiprocessing
......
...@@ -27,8 +27,10 @@ class TestCifar10Train(unittest.TestCase): ...@@ -27,8 +27,10 @@ class TestCifar10Train(unittest.TestCase):
# long time, randomly check 1 sample # long time, randomly check 1 sample
idx = np.random.randint(0, 50000) idx = np.random.randint(0, 50000)
data, label = cifar[idx] data, label = cifar[idx]
self.assertTrue(len(data.shape) == 1) self.assertTrue(len(data.shape) == 3)
self.assertTrue(data.shape[0] == 3072) self.assertTrue(data.shape[0] == 3)
self.assertTrue(data.shape[1] == 32)
self.assertTrue(data.shape[2] == 32)
self.assertTrue(0 <= int(label) <= 9) self.assertTrue(0 <= int(label) <= 9)
...@@ -41,8 +43,10 @@ class TestCifar10Test(unittest.TestCase): ...@@ -41,8 +43,10 @@ class TestCifar10Test(unittest.TestCase):
# long time, randomly check 1 sample # long time, randomly check 1 sample
idx = np.random.randint(0, 10000) idx = np.random.randint(0, 10000)
data, label = cifar[idx] data, label = cifar[idx]
self.assertTrue(len(data.shape) == 1) self.assertTrue(len(data.shape) == 3)
self.assertTrue(data.shape[0] == 3072) self.assertTrue(data.shape[0] == 3)
self.assertTrue(data.shape[1] == 32)
self.assertTrue(data.shape[2] == 32)
self.assertTrue(0 <= int(label) <= 9) self.assertTrue(0 <= int(label) <= 9)
...@@ -55,8 +59,10 @@ class TestCifar100Train(unittest.TestCase): ...@@ -55,8 +59,10 @@ class TestCifar100Train(unittest.TestCase):
# long time, randomly check 1 sample # long time, randomly check 1 sample
idx = np.random.randint(0, 50000) idx = np.random.randint(0, 50000)
data, label = cifar[idx] data, label = cifar[idx]
self.assertTrue(len(data.shape) == 1) self.assertTrue(len(data.shape) == 3)
self.assertTrue(data.shape[0] == 3072) self.assertTrue(data.shape[0] == 3)
self.assertTrue(data.shape[1] == 32)
self.assertTrue(data.shape[2] == 32)
self.assertTrue(0 <= int(label) <= 99) self.assertTrue(0 <= int(label) <= 99)
...@@ -69,8 +75,10 @@ class TestCifar100Test(unittest.TestCase): ...@@ -69,8 +75,10 @@ class TestCifar100Test(unittest.TestCase):
# long time, randomly check 1 sample # long time, randomly check 1 sample
idx = np.random.randint(0, 10000) idx = np.random.randint(0, 10000)
data, label = cifar[idx] data, label = cifar[idx]
self.assertTrue(len(data.shape) == 1) self.assertTrue(len(data.shape) == 3)
self.assertTrue(data.shape[0] == 3072) self.assertTrue(data.shape[0] == 3)
self.assertTrue(data.shape[1] == 32)
self.assertTrue(data.shape[2] == 32)
self.assertTrue(0 <= int(label) <= 99) self.assertTrue(0 <= int(label) <= 99)
......
...@@ -103,12 +103,14 @@ class TestMNISTTest(unittest.TestCase): ...@@ -103,12 +103,14 @@ class TestMNISTTest(unittest.TestCase):
class TestMNISTTrain(unittest.TestCase): class TestMNISTTrain(unittest.TestCase):
def test_main(self): def test_main(self):
mnist = MNIST(mode='train', chw_format=False) mnist = MNIST(mode='train')
self.assertTrue(len(mnist) == 60000) self.assertTrue(len(mnist) == 60000)
for i in range(len(mnist)): for i in range(len(mnist)):
image, label = mnist[i] image, label = mnist[i]
self.assertTrue(image.shape[0] == 784) self.assertTrue(image.shape[0] == 1)
self.assertTrue(image.shape[1] == 28)
self.assertTrue(image.shape[2] == 28)
self.assertTrue(label.shape[0] == 1) self.assertTrue(label.shape[0] == 1)
self.assertTrue(0 <= int(label) <= 9) self.assertTrue(0 <= int(label) <= 9)
......
...@@ -17,6 +17,7 @@ from __future__ import print_function ...@@ -17,6 +17,7 @@ from __future__ import print_function
import six import six
import numpy as np import numpy as np
import paddle
from paddle.io import Dataset from paddle.io import Dataset
from paddle.dataset.common import _check_exists_and_download from paddle.dataset.common import _check_exists_and_download
...@@ -88,6 +89,8 @@ class UCIHousing(Dataset): ...@@ -88,6 +89,8 @@ class UCIHousing(Dataset):
# read dataset into memory # read dataset into memory
self._load_data() self._load_data()
self.dtype = paddle.get_default_dtype()
def _load_data(self, feature_num=14, ratio=0.8): def _load_data(self, feature_num=14, ratio=0.8):
data = np.fromfile(self.data_file, sep=' ') data = np.fromfile(self.data_file, sep=' ')
data = data.reshape(data.shape[0] // feature_num, feature_num) data = data.reshape(data.shape[0] // feature_num, feature_num)
...@@ -103,7 +106,8 @@ class UCIHousing(Dataset): ...@@ -103,7 +106,8 @@ class UCIHousing(Dataset):
def __getitem__(self, idx): def __getitem__(self, idx):
data = self.data[idx] data = self.data[idx]
return np.array(data[:-1]), np.array(data[-1:]) return np.array(data[:-1]).astype(self.dtype), \
np.array(data[-1:]).astype(self.dtype)
def __len__(self): def __len__(self):
return len(self.data) return len(self.data)
...@@ -16,6 +16,7 @@ from .profiler import ProfilerOptions ...@@ -16,6 +16,7 @@ from .profiler import ProfilerOptions
from .profiler import Profiler from .profiler import Profiler
from .profiler import get_profiler from .profiler import get_profiler
from .deprecated import deprecated from .deprecated import deprecated
from .lazy_import import try_import
from ..fluid.framework import unique_name from ..fluid.framework import unique_name
from ..fluid.framework import load_op_library from ..fluid.framework import load_op_library
from ..fluid.framework import require_version from ..fluid.framework import require_version
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lazy imports for heavy dependencies."""
import importlib
def try_import(module_name):
"""Try importing a module, with an informative error message on failure."""
install_name = module_name
if module_name == 'cv2':
install_name = 'opencv-python'
try:
mod = importlib.import_module(module_name)
return mod
except ImportError:
err_msg = (
"Failed importing {}. This likely means that some paddle modules "
"requires additional dependencies that have to be "
"manually installed (usually with `pip install {}`). ").format(
module_name, install_name)
raise ImportError(err_msg)
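try_import returns the module object on success and raises an ImportError naming the pip package otherwise, which lets modules that only sometimes need OpenCV avoid a hard import at load time. Usage sketch, relying on the paddle.utils re-export added in this change:

    from paddle.utils import try_import

    np = try_import('numpy')   # returns the imported module
    cv2 = try_import('cv2')    # raises ImportError suggesting `pip install opencv-python` if missing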
...@@ -139,6 +139,7 @@ class Cifar10(Dataset): ...@@ -139,6 +139,7 @@ class Cifar10(Dataset):
def __getitem__(self, idx): def __getitem__(self, idx):
image, label = self.data[idx] image, label = self.data[idx]
image = np.reshape(image, [3, 32, 32])
if self.transform is not None: if self.transform is not None:
image = self.transform(image) image = self.transform(image)
return image, label return image, label
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
import os import os
import sys import sys
import cv2
from paddle.io import Dataset from paddle.io import Dataset
from paddle.utils import try_import
__all__ = ["DatasetFolder", "ImageFolder"] __all__ = ["DatasetFolder", "ImageFolder"]
...@@ -191,6 +191,7 @@ IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', ...@@ -191,6 +191,7 @@ IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
def cv2_loader(path): def cv2_loader(path):
cv2 = try_import('cv2')
return cv2.imread(path) return cv2.imread(path)
......
...@@ -44,8 +44,6 @@ class MNIST(Dataset): ...@@ -44,8 +44,6 @@ class MNIST(Dataset):
:attr:`download` is True. Default None :attr:`download` is True. Default None
label_path(str): path to label file, can be set None if label_path(str): path to label file, can be set None if
:attr:`download` is True. Default None :attr:`download` is True. Default None
chw_format(bool): If set True, the output shape is [1, 28, 28],
otherwise, output shape is [1, 784]. Default True.
mode(str): 'train' or 'test' mode. Default 'train'. mode(str): 'train' or 'test' mode. Default 'train'.
download(bool): whether to download dataset automatically if download(bool): whether to download dataset automatically if
:attr:`image_path` :attr:`label_path` is not set. Default True :attr:`image_path` :attr:`label_path` is not set. Default True
...@@ -70,14 +68,12 @@ class MNIST(Dataset): ...@@ -70,14 +68,12 @@ class MNIST(Dataset):
def __init__(self, def __init__(self,
image_path=None, image_path=None,
label_path=None, label_path=None,
chw_format=True,
mode='train', mode='train',
transform=None, transform=None,
download=True): download=True):
assert mode.lower() in ['train', 'test'], \ assert mode.lower() in ['train', 'test'], \
"mode should be 'train' or 'test', but got {}".format(mode) "mode should be 'train' or 'test', but got {}".format(mode)
self.mode = mode.lower() self.mode = mode.lower()
self.chw_format = chw_format
self.image_path = image_path self.image_path = image_path
if self.image_path is None: if self.image_path is None:
assert download, "image_path is not set and downloading automatically is disabled" assert download, "image_path is not set and downloading automatically is disabled"
...@@ -139,10 +135,6 @@ class MNIST(Dataset): ...@@ -139,10 +135,6 @@ class MNIST(Dataset):
cols)).astype('float32') cols)).astype('float32')
offset_img += struct.calcsize(fmt_images) offset_img += struct.calcsize(fmt_images)
images = images / 255.0
images = images * 2.0
images = images - 1.0
for i in range(buffer_size): for i in range(buffer_size):
self.images.append(images[i, :]) self.images.append(images[i, :])
self.labels.append( self.labels.append(
...@@ -150,8 +142,7 @@ class MNIST(Dataset): ...@@ -150,8 +142,7 @@ class MNIST(Dataset):
def __getitem__(self, idx): def __getitem__(self, idx):
image, label = self.images[idx], self.labels[idx] image, label = self.images[idx], self.labels[idx]
if self.chw_format: image = np.reshape(image, [1, 28, 28])
image = np.reshape(image, [1, 28, 28])
if self.transform is not None: if self.transform is not None:
image = self.transform(image) image = self.transform(image)
return image, label return image, label
......
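Two behavioral notes fall out of the MNIST hunks above: images are no longer pre-scaled into [-1, 1] (the /255, *2, -1 steps were deleted), and reshaping to [1, 28, 28] is now unconditional. Callers that relied on the old scaling can restore it with a transform; a sketch under the same assumed import path:

    from paddle.vision.datasets import MNIST  # assumed path

    def rescale(img):
        # reproduce the removed normalization: map [0, 255] floats to [-1, 1]
        return img / 255.0 * 2.0 - 1.0

    mnist = MNIST(mode='train', transform=rescale)
    image, _ = mnist[0]
    assert image.min() >= -1.0 and image.max() <= 1.0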
...@@ -18,10 +18,11 @@ import random ...@@ -18,10 +18,11 @@ import random
import math import math
import functools import functools
import cv2
import numbers import numbers
import numpy as np import numpy as np
from paddle.utils import try_import
if sys.version_info < (3, 3): if sys.version_info < (3, 3):
Sequence = collections.Sequence Sequence = collections.Sequence
Iterable = collections.Iterable Iterable = collections.Iterable
...@@ -54,8 +55,8 @@ def flip(image, code): ...@@ -54,8 +55,8 @@ def flip(image, code):
According to the code (the type of flip), flip the input image According to the code (the type of flip), flip the input image
Args: Args:
image: Input image, with (H, W, C) shape image (np.ndarray): Input image, with (H, W, C) shape
code: Code that indicates the type of flip. code (int): Code that indicates the type of flip.
-1 : Flip horizontally and vertically -1 : Flip horizontally and vertically
0 : Flip vertically 0 : Flip vertically
1 : Flip horizontally 1 : Flip horizontally
...@@ -77,18 +78,28 @@ def flip(image, code): ...@@ -77,18 +78,28 @@ def flip(image, code):
# flip horizontally # flip horizontally
F.flip(fake_img, 1) F.flip(fake_img, 1)
""" """
cv2 = try_import('cv2')
return cv2.flip(image, flipCode=code) return cv2.flip(image, flipCode=code)
@keepdims @keepdims
def resize(img, size, interpolation=cv2.INTER_LINEAR): def resize(img, size, interpolation=1):
""" """
resize the input data to the given size resize the input data to the given size
Args: Args:
input: Input data, could be image or masks, with (H, W, C) shape input (np.ndarray): Input data, could be image or masks, with (H, W, C) shape
size: Target size of input data, with (height, width) shape. size (int|list|tuple): Target size of input data, with (height, width) shape.
interpolation: Interpolation method. interpolation (int, optional): Interpolation method.
0 : cv2.INTER_NEAREST
1 : cv2.INTER_LINEAR
2 : cv2.INTER_CUBIC
3 : cv2.INTER_AREA
4 : cv2.INTER_LANCZOS4
5 : cv2.INTER_LINEAR_EXACT
7 : cv2.INTER_MAX
8 : cv2.WARP_FILL_OUTLIERS
16: cv2.WARP_INVERSE_MAP
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -102,7 +113,7 @@ def resize(img, size, interpolation=cv2.INTER_LINEAR): ...@@ -102,7 +113,7 @@ def resize(img, size, interpolation=cv2.INTER_LINEAR):
F.resize(fake_img, (200, 150)) F.resize(fake_img, (200, 150))
""" """
cv2 = try_import('cv2')
if isinstance(interpolation, Sequence): if isinstance(interpolation, Sequence):
interpolation = random.choice(interpolation) interpolation = random.choice(interpolation)
...@@ -179,6 +190,8 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'): ...@@ -179,6 +190,8 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'):
assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'], \ assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'], \
'Expected padding mode to be either constant, edge, reflect or symmetric, but got {}'.format(padding_mode) 'Expected padding mode to be either constant, edge, reflect or symmetric, but got {}'.format(padding_mode)
cv2 = try_import('cv2')
PAD_MOD = { PAD_MOD = {
'constant': cv2.BORDER_CONSTANT, 'constant': cv2.BORDER_CONSTANT,
'edge': cv2.BORDER_REPLICATE, 'edge': cv2.BORDER_REPLICATE,
...@@ -214,18 +227,22 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'): ...@@ -214,18 +227,22 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'):
@keepdims @keepdims
def rotate(img, def rotate(img, angle, interpolation=1, expand=False, center=None):
angle,
interpolation=cv2.INTER_LINEAR,
expand=False,
center=None):
"""Rotates the image by angle. """Rotates the image by angle.
Args: Args:
img (numpy.ndarray): Image to be rotated. img (numpy.ndarray): Image to be rotated.
angle (float|int): In degrees clockwise order. angle (float|int): In degrees clockwise order.
interpolation (int, optional): interpolation (int, optional): Interpolation method. Default: 1.
interpolation: Interpolation method. 0 : cv2.INTER_NEAREST
1 : cv2.INTER_LINEAR
2 : cv2.INTER_CUBIC
3 : cv2.INTER_AREA
4 : cv2.INTER_LANCZOS4
5 : cv2.INTER_LINEAR_EXACT
7 : cv2.INTER_MAX
8 : cv2.WARP_FILL_OUTLIERS
16: cv2.WARP_INVERSE_MAP
expand (bool|optional): Optional expansion flag. expand (bool|optional): Optional expansion flag.
If true, expands the output image to make it large enough to hold the entire rotated image. If true, expands the output image to make it large enough to hold the entire rotated image.
If false or omitted, make the output image the same size as the input image. If false or omitted, make the output image the same size as the input image.
...@@ -250,8 +267,9 @@ def rotate(img, ...@@ -250,8 +267,9 @@ def rotate(img,
fake_img = rotate(fake_img, 10) fake_img = rotate(fake_img, 10)
print(fake_img.shape) print(fake_img.shape)
""" """
dtype = img.dtype cv2 = try_import('cv2')
dtype = img.dtype
h, w, _ = img.shape h, w, _ = img.shape
point = center or (w / 2, h / 2) point = center or (w / 2, h / 2)
M = cv2.getRotationMatrix2D(point, angle=-angle, scale=1) M = cv2.getRotationMatrix2D(point, angle=-angle, scale=1)
...@@ -312,6 +330,7 @@ def to_grayscale(img, num_output_channels=1): ...@@ -312,6 +330,7 @@ def to_grayscale(img, num_output_channels=1):
fake_img = to_grayscale(fake_img) fake_img = to_grayscale(fake_img)
print(fake_img.shape) print(fake_img.shape)
""" """
cv2 = try_import('cv2')
if num_output_channels == 1: if num_output_channels == 1:
img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
......
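Throughout functional.py the cv2 constants in signatures are replaced by their integer values (1 == cv2.INTER_LINEAR, and so on, per the tables above), so merely importing the module no longer requires OpenCV; cv2 is fetched via try_import only inside the functions that use it. A sketch of calling resize without naming any cv2 symbol (module path assumed):

    import numpy as np
    from paddle.vision.transforms import functional as F  # assumed path

    fake_img = np.random.rand(256, 300, 3).astype('float32')
    out = F.resize(fake_img, (200, 150), interpolation=0)  # 0 == cv2.INTER_NEAREST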
...@@ -17,7 +17,6 @@ from __future__ import division ...@@ -17,7 +17,6 @@ from __future__ import division
import math import math
import sys import sys
import random import random
import cv2
import numpy as np import numpy as np
import numbers import numbers
...@@ -26,6 +25,7 @@ import collections ...@@ -26,6 +25,7 @@ import collections
import warnings import warnings
import traceback import traceback
from paddle.utils import try_import
from . import functional as F from . import functional as F
if sys.version_info < (3, 3): if sys.version_info < (3, 3):
...@@ -214,7 +214,16 @@ class Resize(object): ...@@ -214,7 +214,16 @@ class Resize(object):
smaller edge of the image will be matched to this number. smaller edge of the image will be matched to this number.
i.e, if height > width, then image will be rescaled to i.e, if height > width, then image will be rescaled to
(size * height / width, size) (size * height / width, size)
interpolation (int): Interpolation mode of resize. Default: cv2.INTER_LINEAR. interpolation (int, optional): Interpolation mode of resize. Default: 1.
0 : cv2.INTER_NEAREST
1 : cv2.INTER_LINEAR
2 : cv2.INTER_CUBIC
3 : cv2.INTER_AREA
4 : cv2.INTER_LANCZOS4
5 : cv2.INTER_LINEAR_EXACT
7 : cv2.INTER_MAX
8 : cv2.WARP_FILL_OUTLIERS
16: cv2.WARP_INVERSE_MAP
Examples: Examples:
...@@ -232,7 +241,7 @@ class Resize(object): ...@@ -232,7 +241,7 @@ class Resize(object):
print(fake_img.shape) print(fake_img.shape)
""" """
def __init__(self, size, interpolation=cv2.INTER_LINEAR): def __init__(self, size, interpolation=1):
assert isinstance(size, int) or (isinstance(size, Iterable) and assert isinstance(size, int) or (isinstance(size, Iterable) and
len(size) == 2) len(size) == 2)
self.size = size self.size = size
...@@ -252,6 +261,16 @@ class RandomResizedCrop(object): ...@@ -252,6 +261,16 @@ class RandomResizedCrop(object):
output_size (int|list|tuple): Target size of output image, with (height, width) shape. output_size (int|list|tuple): Target size of output image, with (height, width) shape.
scale (list|tuple): Range of size of the origin size cropped. Default: (0.08, 1.0) scale (list|tuple): Range of size of the origin size cropped. Default: (0.08, 1.0)
ratio (list|tuple): Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33) ratio (list|tuple): Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33)
interpolation (int, optional): Interpolation mode of resize. Default: 1.
0 : cv2.INTER_NEAREST
1 : cv2.INTER_LINEAR
2 : cv2.INTER_CUBIC
3 : cv2.INTER_AREA
4 : cv2.INTER_LANCZOS4
5 : cv2.INTER_LINEAR_EXACT
7 : cv2.INTER_MAX
8 : cv2.WARP_FILL_OUTLIERS
16: cv2.WARP_INVERSE_MAP
Examples: Examples:
...@@ -273,7 +292,7 @@ class RandomResizedCrop(object): ...@@ -273,7 +292,7 @@ class RandomResizedCrop(object):
output_size, output_size,
scale=(0.08, 1.0), scale=(0.08, 1.0),
ratio=(3. / 4, 4. / 3), ratio=(3. / 4, 4. / 3),
interpolation=cv2.INTER_LINEAR): interpolation=1):
if isinstance(output_size, int): if isinstance(output_size, int):
self.output_size = (output_size, output_size) self.output_size = (output_size, output_size)
else: else:
...@@ -328,7 +347,16 @@ class CenterCropResize(object): ...@@ -328,7 +347,16 @@ class CenterCropResize(object):
Args: Args:
size (int|list|tuple): Target size of output image, with (height, width) shape. size (int|list|tuple): Target size of output image, with (height, width) shape.
crop_padding (int): Center crop with the padding. Default: 32. crop_padding (int): Center crop with the padding. Default: 32.
interpolation (int): Interpolation mode of resize. Default: cv2.INTER_LINEAR. interpolation (int, optional): Interpolation mode of resize. Default: 1.
0 : cv2.INTER_NEAREST
1 : cv2.INTER_LINEAR
2 : cv2.INTER_CUBIC
3 : cv2.INTER_AREA
4 : cv2.INTER_LANCZOS4
5 : cv2.INTER_LINEAR_EXACT
7 : cv2.INTER_MAX
8 : cv2.WARP_FILL_OUTLIERS
16: cv2.WARP_INVERSE_MAP
Examples: Examples:
...@@ -346,7 +374,7 @@ class CenterCropResize(object): ...@@ -346,7 +374,7 @@ class CenterCropResize(object):
print(fake_img.shape) print(fake_img.shape)
""" """
def __init__(self, size, crop_padding=32, interpolation=cv2.INTER_LINEAR): def __init__(self, size, crop_padding=32, interpolation=1):
if isinstance(size, int): if isinstance(size, int):
self.size = (size, size) self.size = (size, size)
else: else:
...@@ -661,6 +689,7 @@ class ContrastTransform(object): ...@@ -661,6 +689,7 @@ class ContrastTransform(object):
if self.value == 0: if self.value == 0:
return img return img
cv2 = try_import('cv2')
dtype = img.dtype dtype = img.dtype
img = img.astype(np.float32) img = img.astype(np.float32)
alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value)
...@@ -701,6 +730,8 @@ class SaturationTransform(object): ...@@ -701,6 +730,8 @@ class SaturationTransform(object):
if self.value == 0: if self.value == 0:
return img return img
cv2 = try_import('cv2')
dtype = img.dtype dtype = img.dtype
img = img.astype(np.float32) img = img.astype(np.float32)
alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value)
...@@ -742,6 +773,7 @@ class HueTransform(object): ...@@ -742,6 +773,7 @@ class HueTransform(object):
if self.value == 0: if self.value == 0:
return img return img
cv2 = try_import('cv2')
dtype = img.dtype dtype = img.dtype
img = img.astype(np.uint8) img = img.astype(np.uint8)
hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL) hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL)
...@@ -1036,7 +1068,16 @@ class RandomRotate(object): ...@@ -1036,7 +1068,16 @@ class RandomRotate(object):
degrees (sequence or float or int): Range of degrees to select from. degrees (sequence or float or int): Range of degrees to select from.
If degrees is a number instead of sequence like (min, max), the range of degrees If degrees is a number instead of sequence like (min, max), the range of degrees
will be (-degrees, +degrees) clockwise order. will be (-degrees, +degrees) clockwise order.
interpolation (int|optional): Interpolation mode of resize. Default: cv2.INTER_LINEAR. interpolation (int, optional): Interpolation mode of resize. Default: 1.
0 : cv2.INTER_NEAREST
1 : cv2.INTER_LINEAR
2 : cv2.INTER_CUBIC
3 : cv2.INTER_AREA
4 : cv2.INTER_LANCZOS4
5 : cv2.INTER_LINEAR_EXACT
7 : cv2.INTER_MAX
8 : cv2.WARP_FILL_OUTLIERS
16: cv2.WARP_INVERSE_MAP
expand (bool|optional): Optional expansion flag. Default: False. expand (bool|optional): Optional expansion flag. Default: False.
If true, expands the output to make it large enough to hold the entire rotated image. If true, expands the output to make it large enough to hold the entire rotated image.
If false or omitted, make the output image the same size as the input image. If false or omitted, make the output image the same size as the input image.
...@@ -1061,11 +1102,7 @@ class RandomRotate(object): ...@@ -1061,11 +1102,7 @@ class RandomRotate(object):
print(fake_img.shape) print(fake_img.shape)
""" """
def __init__(self, def __init__(self, degrees, interpolation=1, expand=False, center=None):
degrees,
interpolation=cv2.INTER_LINEAR,
expand=False,
center=None):
if isinstance(degrees, numbers.Number): if isinstance(degrees, numbers.Number):
if degrees < 0: if degrees < 0:
raise ValueError( raise ValueError(
......
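The transform classes follow the same pattern: integer interpolation defaults keep cv2 out of class definitions, and try_import defers the dependency to call time. A composition sketch, where the module path and the Compose helper are assumptions about this revision:

    from paddle.vision import transforms  # assumed path

    preprocess = transforms.Compose([          # Compose assumed available here
        transforms.Resize(256, interpolation=1),        # 1 == cv2.INTER_LINEAR
        transforms.RandomRotate(10, interpolation=1),
    ])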
opencv-python<=4.2.0.32
requests>=2.20.0 requests>=2.20.0
numpy>=1.13, <=1.16.4 ; python_version<"3.5" numpy>=1.13, <=1.16.4 ; python_version<"3.5"
numpy>=1.13 ; python_version>="3.5" numpy>=1.13 ; python_version>="3.5"
......
...@@ -238,9 +238,6 @@ if sys.version_info >= (3,7): ...@@ -238,9 +238,6 @@ if sys.version_info >= (3,7):
setup_requires_tmp+=[setup_requires_i] setup_requires_tmp+=[setup_requires_i]
setup_requires = setup_requires_tmp setup_requires = setup_requires_tmp
if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
setup_requires+=['opencv-python']
# the prefix is sys.prefix which should always be usr # the prefix is sys.prefix which should always be usr
paddle_bins = '' paddle_bins = ''
......
...@@ -39,9 +39,9 @@ fi ...@@ -39,9 +39,9 @@ fi
api_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5 ${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5` api_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5 ${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5`
if [ "$api_spec_diff" != "" ]; then if [ "$api_spec_diff" != "" ]; then
echo_line="${echo_line}Related APIs: ${api_spec_diff}\n"
echo_line="You must have one RD (zhiqiu (Recommend) or phlrain) approval for the api change for the opreator-related api without 'core.ops'.\n" echo_line="You must have one RD (zhiqiu (Recommend) or phlrain) approval for the api change for the opreator-related api without 'core.ops'.\n"
echo_line="${echo_line}For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/paddle_api_development_manual.md]\n" echo_line="${echo_line}For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/paddle_api_development_manual.md]\n"
echo_line="${echo_line}Related APIs: ${api_spec_diff}\n"
check_approval 1 6888866 43953930 check_approval 1 6888866 43953930
fi fi
......