Commit f52c4f8b authored by yaoxuefeng6

fix conflict

......@@ -107,6 +107,9 @@ function(select_nvcc_arch_flags out_variable)
elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
set(cuda_arch_bin "50")
elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
add_definitions("-DSUPPORTS_CUDA_FP16")
endif()
set(cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
......
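The CMake hunk above only defines SUPPORTS_CUDA_FP16 when the CUDA compiler version is at least 10.0 and the selected arch is Pascal or newer. A minimal sketch of C++ code keying off that macro (the function below is illustrative, not part of this commit):

    #include <cstdio>

    // Reports at run time whether the fp16 macro from the CMake hunk was defined.
    void ReportCudaFp16Support() {
    #ifdef SUPPORTS_CUDA_FP16
      std::printf("built with SUPPORTS_CUDA_FP16\n");
    #else
      std::printf("built without SUPPORTS_CUDA_FP16\n");
    #endif
    }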
......@@ -527,6 +527,8 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
VLOG(0) << "error: the number of ids is a negative number: " << num;
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
VLOG(0) << "Error occured when parsing " << i
<< " th slot with total slots number: " << all_slots_.size();
return false;
} else if (num == 0) {
VLOG(0)
......@@ -536,42 +538,66 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
"characters.";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
VLOG(0) << "Error occured when parsing " << i
<< " th slot with total slots number: " << all_slots_.size();
return false;
} else if (errno == ERANGE || num > INT_MAX) {
VLOG(0) << "error: the number of ids greater than INT_MAX";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
VLOG(0) << "Error occured when parsing " << i
<< " th slot with total slots number: " << all_slots_.size();
return false;
}
if (all_slots_type_[i] == "float") {
for (int i = 0; i < num; ++i) {
for (int j = 0; j < num; ++j) {
strtof(endptr, &endptr);
if (errno == ERANGE) {
VLOG(0) << "error: the value is out of the range of "
"representable values for float";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
VLOG(0) << "Error occured when parsing " << i
<< " th slot with total slots number: "
<< all_slots_.size();
VLOG(0) << "and in this slot: " << j
<< " th id with total id number: " << num;
return false;
}
if (i + 1 != num && endptr - str == len) {
if (j + 1 != num && endptr - str == len) {
VLOG(0) << "error: there is a wrong with the number of ids.";
VLOG(0) << "Error occured when parsing " << i
<< " th slot with total slots number: "
<< all_slots_.size();
VLOG(0) << "and in this slot: " << j
<< " th id with total id number: " << num;
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
}
} else if (all_slots_type_[i] == "uint64") {
for (int i = 0; i < num; ++i) {
for (int j = 0; j < num; ++j) {
strtoull(endptr, &endptr, 10);
if (errno == ERANGE) {
VLOG(0) << "error: the value is out of the range of "
"representable values for uint64_t";
VLOG(0) << "Error occured when parsing " << i
<< " th slot with total slots number: "
<< all_slots_.size();
VLOG(0) << "and in this slot: " << j
<< " th id with total id number: " << num;
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
if (i + 1 != num && endptr - str == len) {
if (j + 1 != num && endptr - str == len) {
VLOG(0) << "error: there is a wrong with the number of ids.";
VLOG(0) << "Error occured when parsing " << i
<< " th slot with total slots number: "
<< all_slots_.size();
VLOG(0) << "and in this slot: " << j
<< " th id with total id number: " << num;
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
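The CheckFile hunks above rely on the standard strtof/strtoull convention of reporting overflow through errno == ERANGE. A standalone sketch of that check, with a made-up input string:

    #include <cerrno>
    #include <cstdlib>
    #include <iostream>

    // Same overflow check CheckFile uses above: reset errno, parse, test ERANGE.
    int main() {
      const char* text = "18446744073709551616";  // one past UINT64_MAX
      errno = 0;
      char* end = nullptr;
      unsigned long long id = std::strtoull(text, &end, 10);
      if (errno == ERANGE) {
        std::cerr << "error: the value is out of the range of uint64_t\n";
        return 1;
      }
      std::cout << "parsed id: " << id << "\n";
      return 0;
    }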
......@@ -632,8 +658,13 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe(
"The number of ids can not be zero, you need padding "
"it in data generator; or if there is something wrong with "
"the data, please check if the data contains unresolvable "
"characters.\nplease check this error line: %s",
str));
"characters.\nplease check this error line: %s, \n Specifically, "
"something wrong happened(the length of this slot's feasign is 0)"
"when we parse the %d th slots."
"Maybe something wrong around this slot",
"\nWe detect the feasign number of this slot is %d, "
"which is illegal.",
str, i, num));
if (idx != -1) {
(*instance)[idx].Init(all_slots_type_[i]);
if ((*instance)[idx].GetType()[0] == 'f') { // float
......@@ -683,8 +714,13 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
"The number of ids can not be zero, you need padding "
"it in data generator; or if there is something wrong with "
"the data, please check if the data contains unresolvable "
"characters.\nplease check this error line: %s.",
str));
"characters.\nplease check this error line: %s, \n Specifically, "
"something wrong happened(the length of this slot's feasign is 0)"
"when we parse the %d th slots."
"Maybe something wrong around this slot",
"\nWe detect the feasign number of this slot is %d, "
"which is illegal.",
str, i, num));
if (idx != -1) {
(*instance)[idx].Init(all_slots_type_[i]);
......@@ -916,8 +952,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
"The number of ids can not be zero, you need padding "
"it in data generator; or if there is something wrong with "
"the data, please check if the data contains unresolvable "
"characters.\nplease check this error line: %s.",
str));
"characters.\nplease check this error line: %s, \n Specifically, "
"something wrong happened(the length of this slot's feasign is 0)"
"when we parse the %d th slots."
"Maybe something wrong around this slot",
"\nWe detect the feasign number of this slot is %d, "
"which is illegal.",
str, i, num));
if (idx != -1) {
if (all_slots_type_[i][0] == 'f') { // float
for (int j = 0; j < num; ++j) {
......@@ -982,8 +1023,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) {
"The number of ids can not be zero, you need padding "
"it in data generator; or if there is something wrong with "
"the data, please check if the data contains unresolvable "
"characters.\nplease check this error line: %s.",
str));
"characters.\nplease check this error line: %s, \n Specifically, "
"something wrong happened(the length of this slot's feasign is 0)"
"when we parse the %d th slots."
"Maybe something wrong around this slot",
"\nWe detect the feasign number of this slot is %d, "
"which is illegal.",
str, i, num));
if (idx != -1) {
if (all_slots_type_[i][0] == 'f') { // float
......
......@@ -74,6 +74,7 @@ set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto
eager_deletion_pass
buffer_shared_inplace_op_pass
buffer_shared_cross_op_memory_reuse_pass
inplace_addto_op_pass
set_reader_device_info_utils
add_reader_dependency_pass)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
......
......@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include <algorithm>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
......@@ -34,14 +36,24 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<platform::Place> &places,
const platform::NCCLCommunicator *ctxs)
: NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
platform::errors::InvalidArgument(
"The number of places and the number of local scopes "
"should be equal, but got number of places is %d and "
"number of local scopes is %d.",
places_.size(), local_scopes_.size()));
}
#else
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
platform::errors::InvalidArgument(
"The number of places and the number of local scopes "
"should be equal, but got number of places is %d and "
"number of local scopes is %d.",
places_.size(), local_scopes_.size()));
}
#endif
......@@ -60,13 +72,25 @@ void AllReduceOpHandle::AllReduceImpl(
const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles) {
size_t num_places = places_.size();
PADDLE_ENFORCE_EQ(
in_var_handles.size(), num_places,
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(in_var_handles.size(), num_places,
platform::errors::InvalidArgument(
"The NoDummyInputSize should be equal "
"to the number of places, but got NoDummyInputSize is "
"%d and the number of place is %d.",
in_var_handles.size(), num_places));
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places);
platform::errors::InvalidArgument(
"The NoDummyInputSize and NoDummyOutputSize should be "
"equal, but got NoDummyInputSize is %d and NoDummyOutputSize is %d.",
in_var_handles.size(), out_var_handles.size()));
PADDLE_ENFORCE_EQ(
local_exec_scopes_.size(), num_places,
platform::errors::InvalidArgument(
"The number of local scopes should be equal "
"to the number of places, but got the number of local scopes is "
"%d and the number of place is %d.",
in_var_handles.size(), num_places));
std::vector<const void *> lod_tensor_data;
std::vector<platform::Place> places;
......@@ -78,23 +102,36 @@ void AllReduceOpHandle::AllReduceImpl(
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto &local_scope = local_exec_scopes_[i];
auto var = local_scope->FindVar(in_var_handles[i]->name());
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found int scope.",
in_var_handles[i]->name());
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::NotFound(
"Variable %s is not found in local scope.",
in_var_handles[i]->name()));
auto &lod_tensor = var->Get<LoDTensor>();
if (i == 0) {
numel = static_cast<int64_t>(lod_tensor.numel());
// only enforce place0, we will enforce other place numel == place0 numel
PADDLE_ENFORCE_GT(
numel, 0, platform::errors::InvalidArgument(
"The numel of tensos=[%s] must > 0. But now numel=[%d]",
in_var_handles[i]->name(), numel));
numel, 0,
platform::errors::PreconditionNotMet(
"The numel of tensor %s should be > 0, but got numel is %d.",
in_var_handles[i]->name(), numel));
dtype = lod_tensor.type();
is_gpu_place = platform::is_gpu_place(lod_tensor.place());
}
PADDLE_ENFORCE_EQ(numel, static_cast<int64_t>(lod_tensor.numel()));
PADDLE_ENFORCE_EQ(dtype, lod_tensor.type());
PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()));
PADDLE_ENFORCE_EQ(
numel, static_cast<int64_t>(lod_tensor.numel()),
platform::errors::PreconditionNotMet(
"The size of tensors of the same variable in different local "
"scopes should be equal."));
PADDLE_ENFORCE_EQ(
dtype, lod_tensor.type(),
platform::errors::PreconditionNotMet(
"The dtype of tensors of the same variable in different local "
"scopes should be equal."));
PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()),
platform::errors::PreconditionNotMet(
"The place type of tensors of the same variable "
"in different local scopes should be equal."));
lod_tensor_data.emplace_back(lod_tensor.data<void>());
places.emplace_back(lod_tensor.place());
......@@ -102,8 +139,12 @@ void AllReduceOpHandle::AllReduceImpl(
VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
<< ", out_name:" << out_var_handles[i]->name();
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
"The name of input and output should be equal.");
PADDLE_ENFORCE_EQ(
in_var_handles[i]->name(), out_var_handles[i]->name(),
platform::errors::InvalidArgument(
"The name of input and output of all_reduce op should be equal, "
"but got input is %s and output is %s.",
in_var_handles[i]->name(), out_var_handles[i]->name()));
}
std::vector<std::string> grad_var_names;
......@@ -122,7 +163,9 @@ void AllReduceOpHandle::AllReduceFunc(
const std::vector<std::string> &out_var_names) {
if (is_gpu_place(places[0])) {
#if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_,
platform::errors::InvalidArgument(
"The nccl context should not be NULL."));
ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
......@@ -134,7 +177,8 @@ void AllReduceOpHandle::AllReduceFunc(
}
NCCLAllReduceFunc(all_reduce_calls);
#else
PADDLE_THROW("Not compiled with CUDA.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg = *local_exec_scopes_[0]
......
......@@ -89,8 +89,19 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor(
places_(std::move(places)),
graphs_(std::move(graphs)) {
VLOG(3) << "build AsyncSSAGraphExecutor";
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
platform::errors::InvalidArgument(
"The number of places and the number of local scopes "
"should be equal, but got number of places is %d and "
"number of local scopes is %d.",
places_.size(), local_scopes_.size()));
PADDLE_ENFORCE_EQ(
local_scopes_.size(), local_exec_scopes_.size(),
platform::errors::InvalidArgument(
"The number of local scopes and the number of local execution scopes "
"should be equal, but got number of local scopes is %d and "
"number of local execution scopes is %d.",
local_scopes_.size(), local_exec_scopes_.size()));
// set the correct size of thread pool to each device.
strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
......
......@@ -19,6 +19,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "boost/optional.hpp"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h"
......@@ -119,6 +120,9 @@ struct BuildStrategy {
// Turn on inplace by default.
bool enable_inplace_{true};
// Turn off inplace addto by default.
bool enable_addto_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy doesn't tell if
// it's distributed model.
......
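build_strategy.h gains enable_addto_, off by default, next to the existing enable_inplace_ flag. A minimal sketch of opting in from C++, assuming only the public struct members shown in the hunk (how the flag reaches the pass builder is not shown in this diff):

    #include "paddle/fluid/framework/details/build_strategy.h"

    // Opt in to inplace addto; enable_inplace_ is already true by default.
    void EnableAddto(paddle::framework::details::BuildStrategy* strategy) {
      strategy->enable_inplace_ = true;
      strategy->enable_addto_ = true;  // off by default, see the hunk above
    }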
......@@ -12,12 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <deque>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
......@@ -48,7 +50,9 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
bootstrap_ops_.emplace_back(op);
}
}
PADDLE_ENFORCE_GT(op_deps_.size(), 0, "The graph doesn't have operators.");
PADDLE_ENFORCE_GT(op_deps_.size(), 0,
platform::errors::PreconditionNotMet(
"The graph doesn't have operators."));
PrepareAtomicOpDeps();
}
......
......@@ -13,9 +13,11 @@
// limitations under the License.
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
......@@ -138,8 +140,10 @@ void FetchOpHandle::RunImpl() {
auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
auto &scope = scopes.at(var_handle->scope_idx());
auto *var = scope->FindVar(var_handle->name());
PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
var_handle->name());
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::NotFound(
"Cannot find variable %s in execution scope.", var_handle->name()));
if (var->IsType<LoDTensor>()) {
auto &t = var->Get<framework::LoDTensor>();
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/op_handle_base.h"
#include <map>
#include <unordered_set>
......@@ -88,6 +89,12 @@ void OpHandleBase::Run(bool use_cuda) {
PADDLE_ENFORCE(!use_cuda);
#endif
// skip running current op, used with inplace_addto_op_pass
if (skip_running_) {
VLOG(4) << "skip running: " << Name();
return;
}
RunImpl();
}
......
......@@ -18,6 +18,7 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/platform/device_context.h"
......@@ -52,6 +53,10 @@ class OpHandleBase {
virtual Priority GetPriority() const { return kNormal; }
virtual bool GetSkipRunning() const { return skip_running_; }
virtual void SetSkipRunning(bool skip_running) { skip_running_ = skip_running; }
virtual std::string Name() const = 0;
void Run(bool use_cuda);
......@@ -131,6 +136,7 @@ class OpHandleBase {
std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;
std::vector<Scope *> local_exec_scopes_;
bool skip_running_ = false;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<int, cudaEvent_t> events_;
......
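OpHandleBase now carries a skip_running_ flag with GetSkipRunning/SetSkipRunning accessors, and Run() returns early when it is set. A short sketch of the pass-side usage, assuming only the accessors shown above:

    #include "paddle/fluid/framework/details/op_handle_base.h"

    // Mark an op handle so that OpHandleBase::Run() returns before RunImpl().
    void SkipOp(paddle::framework::details::OpHandleBase* op) {
      if (!op->GetSkipRunning()) {
        op->SetSkipRunning(true);
      }
    }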
......@@ -13,9 +13,11 @@
// limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include <algorithm>
#include <memory>
#include <utility>
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
......@@ -104,7 +106,12 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
places_(places),
graphs_(std::move(graphs)),
feed_status_(places.size(), FeedStatus::kNone) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
platform::errors::InvalidArgument(
"The number of places and the number of local scopes "
"should be equal, but got number of places is %d and "
"number of local scopes is %d.",
places_.size(), local_scopes_.size()));
PADDLE_ENFORCE_EQ(places_.size(), graphs_.size(),
platform::errors::InvalidArgument(
......
......@@ -13,10 +13,12 @@
// limitations under the License.
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
......@@ -37,7 +39,13 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
var_infos_(std::move(var_infos)),
places_(std::move(places)),
scope_monitor_(places_, local_exec_scopes_) {
PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
PADDLE_ENFORCE_EQ(
local_scopes_.size(), local_exec_scopes_.size(),
platform::errors::InvalidArgument(
"The number of local scopes and the number of local execution scopes "
"should be equal, but got number of local scopes is %d and "
"number of local execution scopes is %d.",
local_scopes_.size(), local_exec_scopes_.size()));
PrepareLocalExeScopes();
}
......
......@@ -13,9 +13,11 @@
// limitations under the License.
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -29,7 +31,8 @@ static inline const Tensor &GetTensorFromVar(const Variable *var) {
if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>();
} else {
PADDLE_THROW("Variable must be type of LoDTensor");
PADDLE_THROW(platform::errors::InvalidArgument(
"Variable must be type of LoDTensor."));
}
}
......@@ -37,20 +40,27 @@ static inline Tensor *GetMutableTensorFromVar(Variable *var) {
if (var->IsType<LoDTensor>()) {
return var->GetMutable<LoDTensor>();
} else {
PADDLE_THROW("Variable must be type of LoDTensor");
PADDLE_THROW(platform::errors::InvalidArgument(
"Variable must be type of LoDTensor."));
}
}
ShareTensorBufferFunctor::ShareTensorBufferFunctor(
Scope *scope, size_t scope_idx, const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
const std::vector<std::string> &out_var_names)
const std::vector<std::string> &out_var_names, bool share_dims)
: scope_(scope),
scope_idx_(scope_idx),
op_type_(op_type),
in_var_infos_(in_var_infos),
out_var_names_(out_var_names) {
PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size());
out_var_names_(out_var_names),
share_dims_(share_dims) {
PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size(),
platform::errors::PreconditionNotMet(
"The number of input variables and output variables "
"should be equal, but got number of input variables is "
"%d and number of output variables is %d.",
in_var_infos_.size(), out_var_names_.size()));
for (size_t i = 0; i < in_var_infos_.size(); ++i) {
AddReuseVarPair(in_var_infos_[i], out_var_names_[i]);
}
......@@ -67,32 +77,59 @@ ShareTensorBufferFunctor::ReusedVars() const {
void ShareTensorBufferFunctor::AddReuseVarPair(
const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) {
PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr");
PADDLE_ENFORCE_NOT_NULL(
in_var_info,
platform::errors::InvalidArgument(
"The input variables to be inplaced should not be NULL."));
PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name,
"in/out cannot have same name: %s", out_var_name);
platform::errors::InvalidArgument(
"The input variable and output variable to be inplaced "
"cannot have the same name: %s.",
out_var_name));
in_var_infos_.emplace_back(in_var_info);
out_var_names_.emplace_back(out_var_name);
}
void ShareTensorBufferFunctor::CallOnce() {
PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here");
PADDLE_ENFORCE(in_out_vars_.empty(),
platform::errors::InvalidArgument(
"The input-output variable pairs to be "
"inplaced should be initialized here."));
for (size_t i = 0; i < in_var_infos_.size(); ++i) {
auto *in_var = exec_scope_->FindVar(in_var_infos_[i]->Name());
auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NE(in_var, out_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"The input variable(%s)to be inplaced should not be NULL.",
in_var_infos_[i]->Name()));
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound(
"The output variable(%s) to be inplaced should not be NULL.",
out_var_names_[i]));
PADDLE_ENFORCE_NE(
in_var, out_var,
platform::errors::PreconditionNotMet(
"The input variable and output variable to be inplaced "
"cannot be the same variable(%s).",
out_var_names_[i]));
in_out_vars_.emplace_back(in_var, out_var);
}
}
void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
if (!exec_scope_) {
PADDLE_ENFORCE_NOT_NULL(exec_scope);
PADDLE_ENFORCE_NOT_NULL(exec_scope,
platform::errors::InvalidArgument(
"The given execution scope should not be NULL "
"if the cached scope is NULL."));
exec_scope_ = exec_scope;
CallOnce();
} else {
PADDLE_ENFORCE(exec_scope_ == exec_scope, "Scope must be the same");
PADDLE_ENFORCE_EQ(exec_scope_, exec_scope,
platform::errors::InvalidArgument(
"The given execution scope and the cached execution "
"scope should be the same."));
}
for (size_t i = 0; i < in_var_infos_.size(); ++i) {
......@@ -115,6 +152,13 @@ void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
} else {
out_tensor->ShareBufferWith(in_tensor);
// NOTE(zhiqiu): In the case of inplace addto, if the operator of
// the in_out_vars is skipped during running, we should set the dims of
// output as the same as input.
if (share_dims_) {
out_tensor->Resize(in_tensor.dims());
}
VLOG(2) << "Share tensor buffer when running " << op_type_ << " : "
<< in_var_info->Name() << " -> " << out_var_names_[i];
}
......
......@@ -19,6 +19,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/scope.h"
......@@ -40,11 +41,13 @@ class ShareTensorBufferFunctor {
ShareTensorBufferFunctor(
Scope *scope, size_t scope_idx, const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
const std::vector<std::string> &out_var_names);
const std::vector<std::string> &out_var_names, bool share_dims = false);
void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
const std::string &out_var_name);
void SetShareDims(bool share_dims) { share_dims_ = share_dims; }
void operator()(Scope *exec_scope);
std::unordered_map<std::string, std::string> ReusedVars() const;
......@@ -66,6 +69,11 @@ class ShareTensorBufferFunctor {
std::vector<std::string> out_var_names_;
std::vector<std::pair<const Variable *, Variable *>> in_out_vars_;
// NOTE(zhiqiu): In the case of inplace addto, if the operator of
// the in_out_vars is skipped during running, we should set the dims of output
// as the same as input.
bool share_dims_{false};
};
} // namespace details
......
......@@ -13,8 +13,10 @@
// limitations under the License.
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
......@@ -32,26 +34,35 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle(
for (ir::Node *pending_op : out_var->outputs) {
auto &op = pending_op->Wrapper<OpHandleBase>();
auto *compute_op = dynamic_cast<ComputationOpHandle *>(&op);
PADDLE_ENFORCE_NOT_NULL(compute_op);
PADDLE_ENFORCE_NOT_NULL(
compute_op,
platform::errors::PreconditionNotMet(
"The pending OpHandle should be ComputationOpHandle."));
if (result_op == nullptr) {
result_op = compute_op;
} else {
PADDLE_ENFORCE_EQ(result_op, compute_op);
PADDLE_ENFORCE_EQ(
result_op, compute_op,
platform::errors::PreconditionNotMet(
"The pending OpHandle should be the unique one."));
}
}
}
PADDLE_ENFORCE_NOT_NULL(result_op);
PADDLE_ENFORCE_NOT_NULL(result_op,
platform::errors::PreconditionNotMet(
"The pending OpHandle should not be NULL."));
return result_op;
}
ShareTensorBufferOpHandle::ShareTensorBufferOpHandle(
ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
const std::vector<std::string> &out_var_names)
const std::vector<std::string> &out_var_names, bool share_dims)
: OpHandleBase(node),
functor_(scope, scope_idx, op_type, in_var_infos, out_var_names) {}
functor_(scope, scope_idx, op_type, in_var_infos, out_var_names,
share_dims) {}
std::unordered_map<std::string, std::string>
ShareTensorBufferOpHandle::ReusedVars() const {
......@@ -63,6 +74,10 @@ void ShareTensorBufferOpHandle::AddReuseVarPair(
functor_.AddReuseVarPair(in_var_info, out_var_name);
}
void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
functor_.SetShareDims(share_dims);
}
void ShareTensorBufferOpHandle::InitCUDA() {
#ifdef PADDLE_WITH_CUDA
int dev_id =
......
......@@ -17,6 +17,7 @@
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
......@@ -31,7 +32,7 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
ir::Node *node, Scope *scope, size_t scope_idx,
const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_vars_infos,
const std::vector<std::string> &out_var_names);
const std::vector<std::string> &out_var_names, bool share_dims = false);
std::unordered_map<std::string, std::string> ReusedVars() const;
......@@ -42,6 +43,8 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
const std::string &out_var_name);
void SetShareDims(bool share_dims);
const ShareTensorBufferFunctor &Functor() const { return functor_; }
protected:
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
namespace paddle {
......@@ -27,8 +28,9 @@ void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops) {
PADDLE_ENFORCE_EQ(dynamic_cast<FetchOpHandle*>(op) != nullptr ||
dynamic_cast<FetchAsyncOpHandle*>(op) != nullptr,
true,
"The input ops of ClearFetchOp function should be "
"FetchOpHandle or FetchAsyncOpHandle.");
platform::errors::PreconditionNotMet(
"The input ops of ClearFetchOp function should be "
"FetchOpHandle or FetchAsyncOpHandle."));
for (auto& out_var : op->Node()->outputs) {
graph->RemoveNode(out_var);
}
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -138,7 +139,10 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl(
}
}
}
PADDLE_ENFORCE(ready_ops.empty());
PADDLE_ENFORCE_EQ(
ready_ops.empty(), true,
platform::errors::Fatal("After the execution of computation graph, "
"there are unexecuted operators left."));
}
// Wait FetchOps.
......@@ -165,9 +169,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
FetchResultType *fetch_data, bool return_merged) {
std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
std::unordered_set<VarHandleBase *> local_ready_vars;
std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
fetch_tensors.end());
for (auto &fetch_var_name : fetch_tensor_set) {
for (auto &fetch_var_name : fetch_tensors) {
for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) {
......@@ -231,7 +234,11 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
ready_ops->insert(static_cast<OpHandleBase *>(op));
}
}
PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0);
PADDLE_ENFORCE_EQ(
local_ready_vars.size(), 0,
platform::errors::Fatal(
"The number of ready variables should be 0, but got %d.",
local_ready_vars.size()));
}
void ThreadedSSAGraphExecutor::InsertPendingOp(
......@@ -277,7 +284,9 @@ void ThreadedSSAGraphExecutor::PrepareOpDeps() {
}
}
op_deps_->num_ops_ = ready_ops.size() + pending_ops.size();
PADDLE_ENFORCE_GT(op_deps_->num_ops_, 0, "The graph doesn't have operators.");
PADDLE_ENFORCE_GT(
op_deps_->num_ops_, 0,
platform::errors::InvalidArgument("The graph doesn't have operators."));
for (auto ready_var : ready_vars) {
pending_vars.erase(ready_var);
......
......@@ -14,6 +14,8 @@
#pragma once
#include <ThreadPool.h> // ThreadPool in third party
#include <deque>
#include <functional>
#include <list>
......@@ -24,8 +26,6 @@
#include <utility>
#include <vector>
#include <ThreadPool.h> // ThreadPool in third party
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
......
......@@ -54,8 +54,10 @@ struct VarHandleBase {
void AddOutput(OpHandleBase* out, ir::Node* node) {
if (pending_ops_.find(out) == pending_ops_.end()) {
PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr",
this->Node()->Name());
PADDLE_ENFORCE_NOT_NULL(out,
platform::errors::InvalidArgument(
"The output added to VarHandle %s is NULL.",
this->Node()->Name()));
pending_ops_.insert(out);
node_->outputs.push_back(node);
}
......@@ -120,7 +122,10 @@ struct VarHandle : public VarHandleBase {
bool HasEvent() { return has_event_; }
const cudaEvent_t& GetEvent() {
PADDLE_ENFORCE(HasEvent(), "The event is not set.");
PADDLE_ENFORCE_EQ(
HasEvent(), true,
platform::errors::PreconditionNotMet(
"The cuda event is not set, maybe InitCUDA() is not called."));
return event_;
}
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/framework/selected_rows.h"
namespace paddle {
namespace framework {
......@@ -24,7 +25,9 @@ static void VisitVariable(Variable* var, Func* func) {
} else if (var->IsType<SelectedRows>()) {
(*func)(var->GetMutable<SelectedRows>());
} else {
PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
PADDLE_THROW(platform::errors::Unimplemented(
"VisitVariable is not supported for type %s.",
ToTypeName(var->Type())));
}
}
......@@ -35,7 +38,8 @@ static void VisitVariable(const Variable& var, Func* func) {
} else if (var.IsType<SelectedRows>()) {
(*func)(var.Get<SelectedRows>());
} else {
PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
PADDLE_THROW(platform::errors::Unimplemented(
"VisitVariable is not supported for type %s.", ToTypeName(var.Type())));
}
}
......@@ -50,7 +54,8 @@ struct TensorVisitor {
template <typename T>
void operator()() {
PADDLE_THROW("Not Support to get LoDTensor from %s", typeid(T).name());
PADDLE_THROW(platform::errors::Unimplemented(
"Getting tensor from type %s is not supported.", typeid(T).name()));
}
};
......@@ -78,8 +83,8 @@ struct ShareDimsAndLoDVisitor {
template <typename T>
void operator()(const T&) {
PADDLE_ENFORCE("ShareDimsAndLoD is not supported by type %s",
typeid(T).name());
PADDLE_THROW(platform::errors::Unimplemented(
"ShareDimsAndLoD is not supported for type %s.", typeid(T).name()));
}
};
......@@ -89,42 +94,54 @@ void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) {
}
struct EnforceShapeAndDTypeEQVisitor {
const Variable* trg_;
const Variable* dst_;
void operator()(const LoDTensor& src) {
auto& tensor = trg_->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(
src.place().which(), tensor.place().which(),
"The Places of the two Variable must be all on CPU or all on GPU.");
auto& tensor = dst_->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(src.place().which(), tensor.place().which(),
platform::errors::PreconditionNotMet(
"The place type of the two variables is not equal."));
PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
"The dtype of the two Variable is not equal.");
PADDLE_ENFORCE_EQ(src.dims(), tensor.dims(),
"The dims of the two Variable is not equal.");
platform::errors::PreconditionNotMet(
"The dtype of the two variables is not equal."));
PADDLE_ENFORCE_EQ(
src.dims(), tensor.dims(),
platform::errors::PreconditionNotMet(
"The layout of the two variables' tensors is not equal."));
PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(),
"The lod of the two Variable is not equal.");
PADDLE_ENFORCE_EQ(src.layout(), tensor.layout(),
"The layout of the two Variable's tensor is not equal.");
platform::errors::PreconditionNotMet(
"The lod of the two variable is not equal."));
PADDLE_ENFORCE_EQ(
src.layout(), tensor.layout(),
platform::errors::PreconditionNotMet(
"The layout of the two variables' tensors tensor is not equal."));
}
void operator()(const SelectedRows& src) {
auto& selected_rows = trg_->Get<SelectedRows>();
PADDLE_ENFORCE_EQ(
src.place().which(), selected_rows.place().which(),
"The Places of the two Variable must be all on CPU or all on GPU.");
auto& selected_rows = dst_->Get<SelectedRows>();
PADDLE_ENFORCE_EQ(src.place().which(), selected_rows.place().which(),
platform::errors::PreconditionNotMet(
"The place type of the two variables is not equal."));
PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
"The dtype of the two Variable is not equal.");
PADDLE_ENFORCE_EQ(src.value().layout(), selected_rows.value().layout(),
"The layout of the two Variable's tensor is not equal.");
platform::errors::PreconditionNotMet(
"The dtype of the two variables is not equal."));
PADDLE_ENFORCE_EQ(
src.value().layout(), selected_rows.value().layout(),
platform::errors::PreconditionNotMet(
"The layout of the two variables' tensors is not equal."));
PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(),
"The height of the two Variable is not equal.");
platform::errors::PreconditionNotMet(
"The height of the two variables is not equal."));
PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(),
"The dims of the two Variable is not equal.");
platform::errors::PreconditionNotMet(
"The dims of the two variables is not equal."));
}
template <typename T>
void operator()(const T&) {
PADDLE_ENFORCE("EnforceShapeAndDTypeEQ is not supported by type %s",
typeid(T).name());
PADDLE_THROW(platform::errors::Unimplemented(
"EnforceShapeAndDTypeEQ is not supported for type %s.",
typeid(T).name()));
}
};
......
......@@ -19,6 +19,8 @@ limitations under the License. */
namespace gloo {
namespace rendezvous {
constexpr int kNodeSize = 136;
HdfsStore::HdfsStore(const std::string& path) {
path_ = path;
wait_sleep_ms_ = 10000;
......@@ -213,12 +215,14 @@ void ParallelConnectContext::connectFullMesh(
storeKey << rank;
store.set(storeKey.str(), allBytes);
auto total_add_size = kNodeSize * (size - 1);
std::vector<std::shared_ptr<std::thread>> connect_threads(thread_num_);
// Connect every pair
for (uint32_t i = 0; i < connect_threads.size(); ++i) {
connect_threads[i].reset(new std::thread(
[&store, &transportContext, this](size_t thread_idx,
size_t thread_num) -> void {
[&store, &transportContext, total_add_size, this](
size_t thread_idx, size_t thread_num) -> void {
for (int i = thread_idx; i < size; i += thread_num) {
if (i == rank) {
continue;
......@@ -226,8 +230,23 @@ void ParallelConnectContext::connectFullMesh(
// Wait for address of other side of this pair to become available
std::string key = std::to_string(i);
store.wait({key}, getTimeout());
std::vector<char> allAddrs;
auto max_retry_times = 5;
// Connect to other side of this pair
auto allAddrs = store.get(key);
while (max_retry_times > 0) {
allAddrs = store.get(key);
VLOG(3) << "store get all address size: " << allAddrs.size()
<< " except: " << total_add_size;
if (allAddrs.size() == static_cast<size_t>(total_add_size)) {
break;
}
--max_retry_times;
}
auto addr = extractAddress(allAddrs, i);
transportContext->getPair(i)->connect(addr);
}
......
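connectFullMesh now polls store.get(key) up to five times until the returned buffer reaches kNodeSize * (size - 1) bytes. The same bounded-retry shape as a standalone sketch; the generic getter below is hypothetical, not a gloo API:

    #include <cstddef>
    #include <functional>
    #include <vector>

    // Retry a getter until it returns the expected number of bytes or the
    // retry budget runs out; mirrors the loop added to connectFullMesh above.
    std::vector<char> GetWithRetry(
        const std::function<std::vector<char>()>& get_once,
        std::size_t expected_size, int max_retry_times = 5) {
      std::vector<char> bytes;
      while (max_retry_times-- > 0) {
        bytes = get_once();
        if (bytes.size() == expected_size) break;
      }
      return bytes;  // may still be short if every retry came back incomplete
    }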
......@@ -18,6 +18,7 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -225,3 +226,14 @@ REGISTER_PASS(conv_affine_channel_fuse_pass,
paddle::framework::ir::ConvAffineChannelFusePass);
REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("affine_channel", 0));
REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0)
.EQ("affine_channel", 0));
......@@ -18,6 +18,7 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -372,3 +373,14 @@ REGISTER_PASS(depthwise_conv_bn_fuse_pass,
paddle::framework::ir::DepthwiseConvBNFusePass);
REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass,
paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass);
REGISTER_PASS_CAPABILITY(conv_bn_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("batch_norm", 0));
REGISTER_PASS_CAPABILITY(conv_eltwiseadd_bn_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0)
.EQ("batch_norm", 0));
......@@ -11,9 +11,9 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -116,3 +116,10 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
REGISTER_PASS_CAPABILITY(conv_elementwise_add2_act_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0)
.EQ("relu", 0)
.EQ("identity", 0));
......@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -102,3 +103,10 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(conv_elementwise_add_act_fuse_pass,
paddle::framework::ir::ConvElementwiseAddActFusePass);
REGISTER_PASS_CAPABILITY(conv_elementwise_add_act_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0)
.EQ("relu", 0)
.EQ("identity", 0));
......@@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -89,3 +89,8 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(conv_elementwise_add_fuse_pass,
paddle::framework::ir::ConvElementwiseAddFusePass);
REGISTER_PASS_CAPABILITY(conv_elementwise_add_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0));
......@@ -23,6 +23,8 @@
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
namespace ir {
......@@ -34,7 +36,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
// Build pattern
PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
->assert_is_op_input("lookup_table")
->assert_is_op_input("lookup_table_v2")
->assert_var_not_persistable();
patterns::Embedding embedding_pattern(pattern, name_scope);
// TODO(jczaja): Intermediate can only be for val that are not used anywhere
......@@ -256,3 +258,11 @@ void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(embedding_fc_lstm_fuse_pass,
paddle::framework::ir::EmbeddingFCLSTMFusePass);
REGISTER_PASS_CAPABILITY(embedding_fc_lstm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("lookup_table_v2", 0)
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("lstm", 0)
.EQ("fused_embedding_fc_lstm", 0));
......@@ -18,6 +18,7 @@
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -182,3 +183,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass)
.RequirePassAttr("use_gpu");
REGISTER_PASS_CAPABILITY(fc_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("relu", 0)
.EQ("fc", 0));
......@@ -16,6 +16,7 @@
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -125,7 +126,6 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
auto* x_n = subgraph.at(x);
GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern);
......@@ -136,10 +136,17 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchHidden, gru_pattern);
// TODO(wilber): Support origin_mode=True.
if (gru->Op()->GetAttrIfExists<bool>("origin_mode") == true) {
LOG(INFO) << "fc_gru_fuse_pass not supported when origin_mode=True.";
return;
}
if (with_fc_bias) {
GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias);
// Remove unneeded nodes.
......@@ -188,3 +195,16 @@ void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);
REGISTER_PASS_CAPABILITY(mul_gru_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("gru", 0)
.EQ("fusion_gru", 0));
REGISTER_PASS_CAPABILITY(fc_gru_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("gru", 0)
.EQ("fusion_gru", 0));
......@@ -16,6 +16,7 @@
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -196,3 +197,17 @@ void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass);
REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
REGISTER_PASS_CAPABILITY(fc_lstm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("lstm", 0)
.EQ("fusion_lstm", 0));
REGISTER_PASS_CAPABILITY(mul_lstm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("lstm", 0)
.EQ("fusion_lstm", 0));
......@@ -13,4 +13,6 @@ cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handl
cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass)
cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass)
cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass)
cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op)
......@@ -16,6 +16,7 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
......@@ -141,11 +142,12 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const {
VLOG(4) << "Inplace performed in op " << op_type << ": "
<< in_var_handle_ptr->Name() << " -> "
<< out_var_handle_ptr->Name()
<< ". Debug String is: " << op->GetOp()->DebugString();
<< ". Debug String is: " << op->GetOp()->DebugString()
<< ". ReuseType: " << ReuseType();
} else {
VLOG(3) << "Inplace failed in op " << op_type << ": "
<< in_var_handle_ptr->Name() << " -> "
<< out_var_handle_ptr->Name();
<< out_var_handle_ptr->Name() << ". ReuseType: " << ReuseType();
}
}
}
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class InplaceAddToOpPass : public MemoryReusePass {
protected:
std::string ReuseType() const override { return "inplace_addto"; }
void Run(Graph *graph) const override;
private:
// 1. Add any last living op of out_var to the last living ops of in_var
// 2. Set reference count of in_var to be 2
void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const override {
size_t scope_idx = op->GetScopeIdx();
auto *last_live_ops_of_vars_ =
&Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
auto *var_infos_ = &(Get<MemOptVarInfoMapList>(kMemOptVarInfoMapList));
auto out_var_op_iter =
(*last_live_ops_of_vars_)[scope_idx].find(out_var->Name());
// In Reduce mode, some output variable(gradient of parameter) does not have
// last live ops
details::ComputationOpHandle *last_live_op_of_in_var = nullptr;
if (out_var_op_iter == (*last_live_ops_of_vars_)[scope_idx].end()) {
last_live_op_of_in_var = op;
} else {
PADDLE_ENFORCE_EQ(
out_var_op_iter->second.ops().empty(), false,
platform::errors::InvalidArgument(
"Var(%s)'s last live op should not empty.", out_var->Name()));
last_live_op_of_in_var = *(out_var_op_iter->second.ops().begin());
}
auto *last_live_ops_of_in_var =
(*last_live_ops_of_vars_)[scope_idx][in_var->Name()].mutable_ops();
// last_live_ops_of_in_var->clear();
last_live_ops_of_in_var->insert(last_live_op_of_in_var);
auto in_var_info_iter = (*var_infos_)[scope_idx].find(in_var->Name());
PADDLE_ENFORCE_NE(
in_var_info_iter, (*var_infos_)[scope_idx].end(),
platform::errors::NotFound("Cannot find variable %s.", in_var->Name()));
in_var_info_iter->second->SetRefCnt(2); // before inplace, it is 1
}
};
void InplaceAddToOpPass::Run(Graph *graph) const {
const auto &last_live_ops =
Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
bool use_cuda = Get<bool>(kUseCuda);
// Currently, only perform InplaceAddToOpPass on cuda place
if (!use_cuda) {
return;
}
// Step 1: Build a reverse map of last_live_ops
// i.e.: op -> vars
std::unordered_map<details::ComputationOpHandle *,
std::unordered_map<std::string, ir::Node *>>
candidate_ops;
for (auto &each_scope_ops : last_live_ops) {
for (auto &pair : each_scope_ops) {
// If variable has more than 1 last lived ops, this variable cannot
// be inplaced.
if (pair.second.ops().size() != 1) {
continue;
}
auto *op = *(pair.second.ops().begin());
const std::string &op_type = op->GetOp()->Type();
const framework::OpDesc *op_desc = op->Node()->Op();
PADDLE_ENFORCE_NOT_NULL(
op_desc, platform::errors::NotFound("Op(%s) can not find opdesc.",
op->Name()));
// only grad op should be processed.
if (op_type != "grad_add") {
continue;
}
const std::string &var_name = pair.first;
auto in_nodes = this->FindNodesByName(var_name, op->Node()->inputs);
if (in_nodes.size() == 1) {
candidate_ops[op][var_name] = *in_nodes.begin();
}
VLOG(4) << "Find op " << op_type << " with input(" << var_name
<< ") that can do inplace add to";
}
}
// Step 2: Check which vars can be inplaced indeed
for (auto &op_vars_pair : candidate_ops) {
auto *op = op_vars_pair.first;
// The original gradient accumulation is g = sum(g_0, g_1,..., g_n), and it
// could be changed as follows if inplace addto is enabled:
// g_sum_0 = g_0
// g_sum_1 = grad_add(g_sum_0, g_1)
// g_sum_2 = grad_add(g_sum_1, g_2)
// ...
// g_sum_n = grad_add(g_sum_n-1, g_n)
// here we will add inplace for each grad_add, for example, for the first
// grad_add, g_sum_0 -> g1, g_sum_1 -> g1, and set grad_add as skipped.
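// Illustrative walk-through (g_0, g_1, g_sum_* come from the comment above,
// not from variables in this pass): for grad_add(g_sum_0, g_1) -> g_sum_1,
// step (a) below makes g_1 reuse g_sum_0's buffer, step (b) makes g_sum_1
// reuse g_1's buffer, step (d) tells g_1's producer (conv2d_grad) to
// accumulate into that shared buffer via use_addto, and step (e) marks the
// now-redundant grad_add with SetSkipRunning(true).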
const std::string &op_type = op->GetOp()->Type();
PADDLE_ENFORCE_EQ(op->Node()->inputs.size(), 2,
platform::errors::InvalidArgument(
"The size of inputs of %s should be 2, but got %d",
op_type, op->Node()->inputs.size()));
PADDLE_ENFORCE_EQ(op->Node()->outputs.size(), 1,
platform::errors::InvalidArgument(
"The size of outputs of %s should be 1, but got %d",
op_type, op->Node()->outputs.size()));
auto *left_var_ptr = dynamic_cast<details::VarHandle *>(
&(op->Node()->inputs[0]->Wrapper<details::VarHandleBase>()));
auto *right_var_ptr = dynamic_cast<details::VarHandle *>(
&(op->Node()->inputs[1]->Wrapper<details::VarHandleBase>()));
auto *out_var_ptr = dynamic_cast<details::VarHandle *>(
&(op->Node()->outputs[0]->Wrapper<details::VarHandleBase>()));
if (left_var_ptr == nullptr || right_var_ptr == nullptr ||
out_var_ptr == nullptr) {
continue;
}
// auto *left_generated_op = dynamic_cast<details::ComputationOpHandle *>(
// left_var_ptr->GeneratedOp());
auto *right_generated_op = dynamic_cast<details::ComputationOpHandle *>(
right_var_ptr->GeneratedOp());
auto *out_generated_op = dynamic_cast<details::ComputationOpHandle *>(
out_var_ptr->GeneratedOp());
// NOTE(zhiqiu): currently, only conv2d_grad supports addto strategy
if (right_generated_op->Name() != "conv2d_grad") {
continue;
}
// NOTE(zhiqiu): Normally, if we inplace a->b, we should let a generated
// before b. However, in the situation of inplace addto, we do not care
// the order, since a+b is equal to b+a. Is there any exception for that?
// AddDependencyVar(right_generated_op, left_generated_op);
// no need, as discussed above.
// step (a): inplace right_var->left_var of grad_add
this->AddReuseVar(right_generated_op, left_var_ptr, right_var_ptr);
UpdateLastLiveOpOfVar(right_generated_op, left_var_ptr, right_var_ptr);
VLOG(4) << "Inplace performed in op " << right_generated_op->GetOp()->Type()
<< ": " << left_var_ptr->Name() << " -> " << right_var_ptr->Name()
<< ". Debug String is: "
<< right_generated_op->GetOp()->DebugString()
<< ". ReuseType: " << ReuseType();
// step (b): inplace out -> right_var of grad_add
this->AddReuseVar(out_generated_op, right_var_ptr, out_var_ptr, true);
VLOG(4) << "Inplace performed in op " << op_type << ": "
<< left_var_ptr->Name() << " -> " << out_var_ptr->Name()
<< ". Debug String is: " << op->GetOp()->DebugString()
<< ". ReuseType: " << ReuseType();
// step (c): make sure right_var cannot be inplaced afterwards. This can be
// done automatically since CollectReusedVars is called before any reuse.
// step (d): make right_var's generated op use addto
right_generated_op->GetOp()->SetAttr("use_addto", true);
// step (e): make grad_add skip running
op->SetSkipRunning(true);
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(inplace_addto_op_pass, paddle::framework::ir::InplaceAddToOpPass)
.RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList)
.RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars)
.RequirePassAttr(paddle::framework::ir::kUseCuda);
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
#include <functional>
#include <map>
#include <string>
......@@ -73,6 +74,7 @@ bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var,
out_var->Name()));
if (IsVarPairReusable(*in_var, *out_var)) {
AddReuseVar(op, in_var, out_var);
UpdateLastLiveOpOfVar(op, in_var, out_var);
return true;
} else {
return false;
......@@ -324,7 +326,8 @@ bool MemoryReusePass::IsVarPairReusable(
void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const {
details::VarHandle *out_var,
bool share_dims) const {
PADDLE_ENFORCE_GT(
(*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0,
platform::errors::NotFound("Var(%s) does not in mem opt var infos.",
......@@ -344,13 +347,15 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
share_buffer_op->AddInput(in_var);
}
if (share_dims) {
share_buffer_op->SetShareDims(true);
}
share_buffer_op->AddReuseVarPair(
(*var_infos_)[op->GetScopeIdx()].at(in_var->Name()).get(),
out_var->Name());
reused_in_var_names_[op->GetScopeIdx()].insert(in_var->Name());
reused_out_var_names_[op->GetScopeIdx()].insert(out_var->Name());
UpdateLastLiveOpOfVar(op, in_var, out_var);
}
// 1. Set last living op of in_var to be any last living op of out_var
......
......@@ -18,6 +18,7 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
......@@ -92,6 +93,12 @@ class MemoryReusePass : public Pass {
int64_t GetMemorySize(const details::VarHandle &var) const;
void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var,
details::VarHandle *out_var, bool share_dims = false) const;
virtual void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const;
private:
VarDesc *GetVarDesc(const details::VarHandle &var) const;
......@@ -109,13 +116,6 @@ class MemoryReusePass : public Pass {
void CollectReusedVars() const;
void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var,
details::VarHandle *out_var) const;
void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const;
private:
mutable Graph *graph_;
mutable bool use_cuda_;
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
#define MAX_NUM_FC 10
......@@ -174,6 +175,10 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern,
if (x->outputs.size() <= 0 || x->inputs.size() <= 0U) {
return false;
}
if (x->IsVar() && x->Var() && x->Var()->GetShape().size() > 2) {
LOG(WARNING) << "repeated fc relu only supports input dims = 2";
return false;
}
int fc_idx = FindFCIdx(x);
if (fc_idx < 0) {
return false;
......@@ -384,3 +389,8 @@ void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(repeated_fc_relu_fuse_pass,
paddle::framework::ir::RepeatedFCReluFusePass);
REGISTER_PASS_CAPABILITY(repeated_fc_relu_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("fc", 0)
.EQ("relu", 0));
......@@ -16,6 +16,7 @@
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -34,6 +35,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "shufflechannel_pattern";
FusePassBase::Init(pattern_name, graph);
LOG(WARNING) << "There is fluid.layers.shuffle_channel API already, you can "
"use it instead of (reshape + transpose +reshape)";
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
->NewNode("x")
......@@ -93,3 +96,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(shuffle_channel_detect_pass,
paddle::framework::ir::ShuffleChannelDetectPass);
REGISTER_PASS_CAPABILITY(shuffle_channel_detect_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("reshape2", 0)
.EQ("transpose2", 0));
......@@ -17,6 +17,7 @@
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -77,7 +78,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
};
auto is_fusion_input_var = [=](Node* x, const std::string& arg_name) {
bool basic = var_is_op_input(x, "matmul", arg_name) &&
bool basic = (var_is_op_input(x, "matmul_v2", arg_name) ||
var_is_op_input(x, "matmul", arg_name)) &&
var_is_op_input(x, "square", "X");
if (!basic) {
return false;
......@@ -88,7 +90,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
}
auto* squared_x = squared_x_op->outputs[0];
bool next_is_matmul_from_arg =
var_is_op_input(squared_x, "matmul", arg_name) &&
(var_is_op_input(squared_x, "matmul_v2", arg_name) ||
var_is_op_input(squared_x, "matmul", arg_name)) &&
squared_x->outputs.size() == 1 &&
squared_x->outputs[0]->outputs.size() == 1;
if (!next_is_matmul_from_arg) {
......@@ -103,7 +106,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto is_fusion_first_mul_out = [=](Node* x) -> bool {
bool input_is_matmul_op = x && x->inputs.size() == 1 &&
x->inputs[0]->IsOp() &&
x->inputs[0]->Op()->Type() == "matmul";
(x->inputs[0]->Op()->Type() == "matmul_v2" ||
x->inputs[0]->Op()->Type() == "matmul");
if (!input_is_matmul_op) {
return false;
}
......@@ -167,7 +171,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto* matmul_xy_op = pattern->NewNode(
[=](Node* x) {
return x && x->IsOp() && x->Op()->Type() == "matmul" &&
return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
x->Op()->Type() == "matmul") &&
is_fusion_first_mul_out(x->outputs[0]);
},
name_scope + "/matmul_xy_op");
......@@ -189,7 +194,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto is_fusion_mat_squared_x_y_op_out = [=](Node* x) -> bool {
bool basic = x && x->IsVar() && x->inputs.size() == 1 &&
x->inputs[0]->IsOp() && x->inputs[0]->Op()->Type() == "matmul";
x->inputs[0]->IsOp() &&
(x->inputs[0]->Op()->Type() == "matmul_v2" ||
x->inputs[0]->Op()->Type() == "matmul");
if (!basic) {
return false;
}
......@@ -206,7 +213,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto* matmul_squared_x_y_op = pattern->NewNode(
[=](Node* x) {
return x && x->IsOp() && x->Op()->Type() == "matmul" &&
return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
x->Op()->Type() == "matmul") &&
is_fusion_mat_squared_x_y_op_out(x->outputs[0]);
},
name_scope + "/matmul_squared_x_y_op");
......@@ -378,3 +386,13 @@ void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(squared_mat_sub_fuse_pass,
paddle::framework::ir::SquaredMatSubFusePass);
REGISTER_PASS_CAPABILITY(squared_mat_sub_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("matmul", 0)
.EQ("matmul_v2", 0)
.EQ("square", 0)
.EQ("elementwise_mul", 0)
.EQ("elementwise_sub", 0)
.EQ("fill_constant", 0)
.EQ("fusion_squared_mat_sub", 0));
......@@ -24,7 +24,7 @@ namespace framework {
namespace ir {
/**
* Fuse ( (A.^2 * B.^2) - (A * B).^2 ) .* scalar
* Fuse ( (A * B).^2 - (A.^2 * B.^2) ) .* scalar
*/
class SquaredMatSubFusePass : public FusePassBase {
public:
......
......@@ -157,6 +157,14 @@ class OperatorBase {
platform::errors::NotFound("(%s) is not found in AttributeMap.", name));
return BOOST_GET_CONST(T, attrs_.at(name));
}
void SetAttr(const std::string& name, const Attribute& v) {
PADDLE_ENFORCE_EQ(
HasAttr(name), true,
platform::errors::NotFound(
"The attribute %s is not found in operator %s", name, Type()));
attrs_[name] = v;
}
const AttributeMap& Attrs() const { return attrs_; }
const VariableNameMap& Inputs() const { return inputs_; }
......
......@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h"
#include <algorithm>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/async_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
......@@ -108,6 +110,11 @@ class ParallelExecutorPrivate {
* them.
*/
inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) {
if (mem_opt_var_infos_.size() == 0) {
VLOG(4) << "The mem_opt_var_infos_ is empty, maybe no memory "
"optimization strategy is enabled";
return;
}
auto iter = mem_opt_var_infos_[scope_idx].find(name);
if (iter != mem_opt_var_infos_[scope_idx].end()) {
iter->second->SetSkipMemoryReuse(true);
......@@ -308,6 +315,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
}
bool need_mem_opt = build_strategy_.enable_inplace_ ||
build_strategy_.enable_addto_ ||
build_strategy_.memory_optimize_.get() || is_gc_enabled;
if (!need_mem_opt) return graph;
......@@ -320,6 +328,16 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
graph = ref_cnt_pass->Apply(graph);
VLOG(10) << "ReferenceCountPass Applied";
if (build_strategy_.enable_addto_) {
auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass");
addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
addto_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
VLOG(10) << "Start to apply inplace_addto_op_pass";
graph = addto_pass->Apply(graph);
VLOG(10) << "inplace_addto_op_pass Applied";
}
if (build_strategy_.enable_inplace_) {
auto inplace_pass =
ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass");
......@@ -1068,3 +1086,4 @@ USE_PASS(reference_count_pass);
USE_PASS(eager_deletion_pass);
USE_PASS(buffer_shared_inplace_pass);
USE_PASS(buffer_shared_cross_op_memory_reuse_pass);
USE_PASS(inplace_addto_op_pass);
......@@ -156,7 +156,8 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// "seqpool_concat_fuse_pass", //
"seqpool_cvm_concat_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
// TODO(wilber): fix correctness problem.
// "fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", //
"mul_gru_fuse_pass", //
......
......@@ -80,10 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
nvinfer1::ILayer* layer = nullptr;
if (engine_->with_dynamic_shape()) {
plugin::DynamicPluginTensorRT* plugin = nullptr;
plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
auto use_fp16 = engine_->WithFp16();
auto plugin = new plugin::EmbEltwiseLayernormPluginDynamic(
input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
eps);
eps, use_fp16);
layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
} else {
PADDLE_THROW(platform::errors::Fatal(
......
......@@ -32,13 +32,34 @@ namespace plugin {
#if IS_TRT_VERSION_GE(6000)
template <typename T>
int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
EmbEltwiseLayernormPluginDynamicImpl<
T>::~EmbEltwiseLayernormPluginDynamicImpl() {
this->terminate();
}
inline half fp32tofp16(float x) { return static_cast<half>(x); }
template <typename T>
int EmbEltwiseLayernormPluginDynamicImpl<T>::initialize() {
embs_gpu_.resize(embs_.size());
for (int i = 0; i < embs_.size(); i++) {
if (embs_[i]) {
cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float),
T *host_ptr;
auto size = emb_sizes_[i];
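// NOTE: for the fp16 instantiation (T == half) the float weights kept on the
// host are first converted into a temporary half buffer before the device
// copy; for fp32 the original host buffer is copied directly.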
if (std::is_same<T, half>::value) {
host_ptr = new T[size];
std::transform(embs_[i], (embs_[i] + size), host_ptr, fp32tofp16);
} else {
host_ptr = reinterpret_cast<T *>(embs_[i]);
}
cudaMalloc(&embs_gpu_[i], sizeof(T) * size);
cudaMemcpy(embs_gpu_[i], host_ptr, size * sizeof(T),
cudaMemcpyHostToDevice);
if (std::is_same<T, half>::value) {
delete[] host_ptr;
}
}
}
......@@ -53,11 +74,105 @@ int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
cudaMemcpyHostToDevice);
}
int input_num = embs_.size();
in_ptr_tensor_.Resize({input_num});
emb_ptr_tensor_.Resize({input_num});
cudaGetDevice(&device_id_);
auto emb_ptr_gpu_d =
emb_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
cudaMemcpy(emb_ptr_gpu_d, embs_gpu_.data(), sizeof(uintptr_t) * input_num,
cudaMemcpyHostToDevice);
return 0;
}
template <typename T>
nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
void EmbEltwiseLayernormPluginDynamicImpl<T>::terminate() {
for (int i = 0; i < embs_gpu_.size(); ++i) {
if (embs_gpu_[i]) {
cudaFree(embs_gpu_[i]);
embs_gpu_[i] = nullptr;
}
}
if (bias_gpu_) {
cudaFree(bias_gpu_);
bias_gpu_ = nullptr;
}
if (scale_gpu_) {
cudaFree(scale_gpu_);
scale_gpu_ = nullptr;
}
}
template <typename T>
int EmbEltwiseLayernormPluginDynamicImpl<T>::enqueue(
const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) {
auto id_dims = input_desc[0].dims;
int batch = id_dims.d[0];
int seq_len = id_dims.d[1];
int input_num = embs_.size();
auto in_ptr_gpu_d =
in_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
auto emb_ptr_gpu_d =
emb_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
auto new_input_ptr = reinterpret_cast<uintptr_t>(inputs[0]);
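// Only refresh the device-side input pointer table when the input buffer
// address changes between enqueue calls; otherwise reuse the cached copy.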
if (old_input_ptr_ != new_input_ptr) {
old_input_ptr_ = new_input_ptr;
cudaMemcpyAsync(in_ptr_gpu_d, reinterpret_cast<const void *>(inputs),
sizeof(uintptr_t) * input_num, cudaMemcpyHostToDevice,
stream);
}
auto out_type = output_desc[0].type;
if (std::is_same<T, float>::value) {
PADDLE_ENFORCE_EQ(
out_type == nvinfer1::DataType::kFLOAT, true,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only support fp32 input."));
} else if (std::is_same<T, half>::value) {
PADDLE_ENFORCE_EQ(
out_type == nvinfer1::DataType::kHALF, true,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only support fp16 input."));
} else {
PADDLE_THROW(platform::errors::Fatal(
"Unsupport data type, the out type of EmbEltwiseLayernorm should be "
"float or half."));
}
auto *output_d = reinterpret_cast<T *>(outputs[0]);
operators::math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
eps_, input_num, stream);
return cudaGetLastError() != cudaSuccess;
}
template class EmbEltwiseLayernormPluginDynamicImpl<float>;
#ifdef SUPPORTS_CUDA_FP16
template class EmbEltwiseLayernormPluginDynamicImpl<half>;
#endif // SUPPORTS_CUDA_FP16
int EmbEltwiseLayernormPluginDynamic::initialize() {
impl_->initialize();
return 0;
}
void EmbEltwiseLayernormPluginDynamic::terminate() { impl_->terminate(); }
nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
nvinfer1::IExprBuilder &expr_builder) { // NOLINT
PADDLE_ENFORCE_EQ(output_index, 0,
......@@ -76,18 +191,7 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
return ret;
}
template <typename T>
void EmbEltwiseLayernormPluginDynamic<T>::terminate() {
for (auto ptr : embs_gpu_) {
if (ptr) cudaFree(ptr);
}
if (bias_gpu_) cudaFree(bias_gpu_);
if (scale_gpu_) cudaFree(scale_gpu_);
}
template <typename T>
bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
int nb_outputs) {
PADDLE_ENFORCE_NOT_NULL(
......@@ -98,6 +202,11 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
"The EmbEltwiseLayerNorm's output should be one"
"but it's (%d) outputs.",
nb_outputs));
PADDLE_ENFORCE_EQ(nb_outputs, 1,
platform::errors::InvalidArgument(
"The EmbEltwiseLayerNorm's output should be one"
"but it's (%d) outputs.",
nb_outputs));
PADDLE_ENFORCE_LT(
pos, nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the "
......@@ -122,7 +231,7 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
}
if (pos == all_nums - 1) {
if (sizeof(T) == sizeof(float)) {
if (with_fp16_ == false) {
return desc.type == nvinfer1::DataType::kFLOAT;
} else {
return desc.type == nvinfer1::DataType::kHALF;
......@@ -131,84 +240,27 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
return false;
}
template <typename T>
nvinfer1::DataType EmbEltwiseLayernormPluginDynamic<T>::getOutputDataType(
nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
PADDLE_ENFORCE_EQ(
index, 0, platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only has one input, so the "
"index value should be 0, but get %d.",
index));
return nvinfer1::DataType::kFLOAT;
if (with_fp16_)
return nvinfer1::DataType::kHALF;
else
return nvinfer1::DataType::kFLOAT;
}
template <typename T>
int EmbEltwiseLayernormPluginDynamic<T>::enqueue(
int EmbEltwiseLayernormPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) {
auto id_dims = input_desc[0].dims;
int batch = id_dims.d[0];
int seq_len = id_dims.d[1];
int input_num = embs_.size();
framework::Tensor in_ptr_tensor, emb_ptr_tensor;
int device_id;
cudaGetDevice(&device_id);
in_ptr_tensor.Resize({input_num});
emb_ptr_tensor.Resize({input_num});
int64_t *in_ptr_gpu_d =
in_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
int64_t *emb_ptr_gpu_d =
emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
std::vector<uintptr_t> in_ptr, emb_ptr;
for (int i = 0; i < input_num; i++) {
in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
}
cudaMemcpyAsync(in_ptr_gpu_d, in_ptr.data(), sizeof(int64_t) * input_num,
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(emb_ptr_gpu_d, emb_ptr.data(), sizeof(int64_t) * input_num,
cudaMemcpyHostToDevice, stream);
auto out_type = output_desc[0].type;
const unsigned tpb = 256;
const dim3 grid(seq_len, batch, 1);
const dim3 block(tpb, 1, 1);
if (sizeof(T) == sizeof(float)) {
PADDLE_ENFORCE_EQ(
out_type == nvinfer1::DataType::kFLOAT, true,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only support fp32 input."));
} else if (sizeof(T) == sizeof(int16_t)) {
PADDLE_ENFORCE_EQ(
out_type == nvinfer1::DataType::kHALF, true,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only support fp16 input."));
} else {
PADDLE_THROW(platform::errors::Fatal(
"Unsupport data type, the out type of EmbEltwiseLayernorm should be "
"float or half."));
}
T *output_d = static_cast<T *>(outputs[0]);
operators::math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
eps_, input_num, stream);
impl_->enqueue(input_desc, output_desc, inputs, outputs, workspace, stream);
return cudaGetLastError() != cudaSuccess;
}
template class EmbEltwiseLayernormPluginDynamic<float>;
#ifdef SUPPORTS_CUDA_FP16
template class EmbEltwiseLayernormPluginDynamic<half>;
#endif // SUPPORTS_CUDA_FP16
#endif
} // namespace plugin
......
......@@ -27,14 +27,76 @@ namespace tensorrt {
namespace plugin {
#if IS_TRT_VERSION_GE(6000)
class EmbEltwiseLayernormPluginDynamicImplBase {
public:
EmbEltwiseLayernormPluginDynamicImplBase() {}
virtual ~EmbEltwiseLayernormPluginDynamicImplBase() {}
virtual int initialize() = 0;
virtual void terminate() = 0;
virtual int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs,
void* workspace, cudaStream_t stream) = 0;
};
template <typename T>
class EmbEltwiseLayernormPluginDynamicImpl
: public EmbEltwiseLayernormPluginDynamicImplBase {
public:
explicit EmbEltwiseLayernormPluginDynamicImpl(std::vector<float*> input_embs,
float* bias, float* scale,
std::vector<int> emb_sizes,
int bias_size, int scale_size,
int hidden_size, float eps)
: embs_(input_embs),
bias_(bias),
scale_(scale),
emb_sizes_(emb_sizes),
bias_size_(bias_size),
scale_size_(scale_size),
hidden_size_(hidden_size),
eps_(eps) {}
~EmbEltwiseLayernormPluginDynamicImpl();
int initialize();
void terminate();
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream);
private:
std::vector<float*> embs_;
float* bias_{nullptr};
float* scale_{nullptr};
// data on devices
float* bias_gpu_{nullptr};
float* scale_gpu_{nullptr};
std::vector<T*> embs_gpu_;
std::vector<int> emb_sizes_;
int bias_size_;
int scale_size_;
int hidden_size_;
float eps_;
framework::Tensor in_ptr_tensor_, emb_ptr_tensor_;
int device_id_{0};
uintptr_t old_input_ptr_{0};
};
class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
public:
explicit EmbEltwiseLayernormPluginDynamic(std::vector<float*> input_embs,
float* bias, float* scale,
std::vector<int> emb_sizes,
int bias_size, int scale_size,
int hidden_size, float eps)
int hidden_size, float eps,
bool with_fp16)
: embs_(input_embs),
bias_(bias),
scale_(scale),
......@@ -42,51 +104,81 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
bias_size_(bias_size),
scale_size_(scale_size),
hidden_size_(hidden_size),
eps_(eps) {}
eps_(eps),
with_fp16_(with_fp16),
own_host_buff_(false) {
if (with_fp16) {
#ifdef SUPPORTS_CUDA_FP16
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
#else
PADDLE_THROW(platform::errors::Fatal(
"Unsupported data type, current GPU doesn't support half."));
#endif // SUPPORTS_CUDA_FP16
} else {
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
}
}
EmbEltwiseLayernormPluginDynamic(void const* serial_data,
size_t serial_length) {
size_t serial_length)
: own_host_buff_(true) {
DeserializeValue(&serial_data, &serial_length, &emb_sizes_);
embs_gpu_.resize(emb_sizes_.size());
embs_.resize(emb_sizes_.size());
for (size_t i = 0; i < emb_sizes_.size(); i++) {
cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
cudaMemcpy(embs_gpu_[i], serial_data, emb_sizes_[i] * sizeof(float),
cudaMemcpyHostToDevice);
auto size = emb_sizes_[i];
auto ptr = new float[size];
memcpy(ptr, serial_data, sizeof(float) * size);
embs_[i] = ptr;
reinterpret_cast<char const*&>(serial_data) +=
emb_sizes_[i] * sizeof(float);
serial_length -= emb_sizes_[i] * sizeof(float);
embs_[i] = nullptr;
}
DeserializeValue(&serial_data, &serial_length, &bias_size_);
DeserializeValue(&serial_data, &serial_length, &scale_size_);
cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
cudaMemcpy(bias_gpu_, serial_data, bias_size_ * sizeof(float),
cudaMemcpyHostToDevice);
bias_ = nullptr;
if (bias_size_) {
bias_ = new float[bias_size_];
memcpy(bias_, serial_data, sizeof(float) * bias_size_);
}
reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float);
serial_length -= bias_size_ * sizeof(float);
cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
cudaMemcpy(scale_gpu_, serial_data, scale_size_ * sizeof(float),
cudaMemcpyHostToDevice);
scale_ = nullptr;
if (scale_size_) {
scale_ = new float[scale_size_];
memcpy(scale_, serial_data, sizeof(float) * scale_size_);
}
reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(float);
serial_length -= scale_size_ * sizeof(float);
DeserializeValue(&serial_data, &serial_length, &hidden_size_);
DeserializeValue(&serial_data, &serial_length, &eps_);
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
if (with_fp16_) {
#ifdef SUPPORTS_CUDA_FP16
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
#else
PADDLE_THROW(platform::errors::Fatal(
"Unsupported data type, current GPU doesn't support half."));
#endif // SUPPORTS_CUDA_FP16
} else {
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
}
}
nvinfer1::IPluginV2DynamicExt* clone() const override {
auto ptr = new EmbEltwiseLayernormPluginDynamic(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
eps_);
ptr->embs_gpu_ = embs_gpu_;
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
eps_, with_fp16_);
return ptr;
}
......@@ -95,6 +187,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
}
int getNbOutputs() const override { return 1; }
int initialize() override;
void terminate() override;
size_t getSerializationSize() const override {
int sum_num = 0;
......@@ -110,24 +203,32 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
sum_num += (bias_size_ + scale_size_) * sizeof(float);
sum_num += SerializedSize(hidden_size_);
sum_num += SerializedSize(eps_);
// sum_num += SerializedSize(with_fp16_);
sum_num += SerializedSize(with_fp16_);
return sum_num;
}
void terminate() override;
void serialize(void* buffer) const override {
// SerializeValue(&buffer, with_fp16_);
SerializeValue(&buffer, emb_sizes_);
for (size_t i = 0; i < emb_sizes_.size(); i++) {
SerializeCudaPointer(&buffer, embs_gpu_[i], emb_sizes_[i]);
auto size = emb_sizes_[i];
for (int j = 0; j < size; ++j) {
SerializeValue(&buffer, embs_[i][j]);
}
}
SerializeValue(&buffer, bias_size_);
SerializeValue(&buffer, scale_size_);
SerializeCudaPointer(&buffer, bias_gpu_, bias_size_);
SerializeCudaPointer(&buffer, scale_gpu_, scale_size_);
for (int i = 0; i < bias_size_; ++i) {
SerializeValue(&buffer, bias_[i]);
}
for (int i = 0; i < scale_size_; ++i) {
SerializeValue(&buffer, scale_[i]);
}
SerializeValue(&buffer, hidden_size_);
SerializeValue(&buffer, eps_);
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs getOutputDimensions(
......@@ -158,23 +259,33 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
const nvinfer1::DataType* input_types,
int nb_inputs) const override;
void destroy() override { delete this; }
void destroy() override {
if (own_host_buff_) {
for (auto ptr : embs_) {
delete[] ptr;
}
delete[] bias_;
delete[] scale_;
}
delete impl_;
delete this;
}
private:
std::vector<float*> embs_;
float* bias_;
float* scale_;
// data on devices
float* bias_gpu_;
float* scale_gpu_;
std::vector<float*> embs_gpu_;
std::vector<int> emb_sizes_;
int bias_size_;
int scale_size_;
int hidden_size_;
float eps_;
bool with_fp16_;
bool own_host_buff_{false};
EmbEltwiseLayernormPluginDynamicImplBase* impl_{nullptr};
};
class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
......@@ -198,8 +309,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
return new EmbEltwiseLayernormPluginDynamic<float>(serial_data,
serial_length);
return new EmbEltwiseLayernormPluginDynamic(serial_data, serial_length);
}
void setPluginNamespace(const char* lib_namespace) override {
......
......@@ -151,7 +151,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
run(config, &out_data); // serialize
run(*config_deser, &out_data); // deserialize
for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-6);
EXPECT_NEAR(result[i], out_data[i], 1e-2);
}
}
......@@ -159,13 +159,11 @@ TEST(AnalysisPredictor, no_fp16) {
std::vector<float> result = {0.597841, 0.219972, 0.182187};
trt_ernie(false, result);
}
TEST(AnalysisPredictor, fp16) {
#ifdef SUPPORTS_CUDA_FP16
std::vector<float> result = {0.598336, 0.219558, 0.182106};
TEST(AnalysisPredictor, fp16) {
std::vector<float> result = {0.59923654, 0.21923761, 0.18152587};
trt_ernie(true, result);
#endif
}
#endif // SUPPORTS_CUDA_FP16
} // namespace inference
} // namespace paddle
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
......@@ -287,7 +288,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
#endif
// ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
......@@ -609,9 +612,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
// ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr<bool>("use_addto");
if (input_grad) {
// Because beta is zero, it is unnecessary to reset input_grad.
// When beta is 0, it is unnecessary to reset input_grad.
// When beta is 1, the output cannot be reset since the addto strategy is used.
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
......@@ -653,6 +660,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
ctx, &transformed_input_grad_channel, input_grad);
}
}
// filter_grad does not use inplace addto.
ScalingParamType<T> beta_filter = 0.0f;
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad.
......@@ -665,7 +675,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
input_data + i * group_offset_in, args2.odesc.desc(),
output_grad_data + i * group_offset_out,
args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
workspace_size, &beta, args2.wdesc.desc(),
workspace_size, &beta_filter, args2.wdesc.desc(),
filter_grad_data + i * group_offset_filter));
},
workspace_size);
......@@ -1017,7 +1027,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = W->numel() / groups;
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
// NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
// 0.0f;
// VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
auto wkspace_handle = dev_ctx.cudnn_workspace_handle();
if (ddO) {
......
......@@ -305,6 +305,11 @@ void Conv2DOpMaker::Make() {
.SetDefault(0.0f);
AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
.SetDefault(0.0f);
AddAttr<bool>(
"use_addto",
"(bool, default false) If use addto strategy or not, only used in "
"cudnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_residual_connection",
"(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is as an input to residual "
......@@ -460,6 +465,11 @@ void Conv3DOpMaker::Make() {
.SetDefault(0.0f);
AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
.SetDefault(0.0f);
AddAttr<bool>(
"use_addto",
"(bool, default false) If use addto strategy or not, only used in "
"cudnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_residual_connection",
"(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is as an input to residual "
......
......@@ -54,6 +54,8 @@ class ScopedRNNBase {
x_descs_.emplace_back(x_desc_.descriptor<T>(dims_x, strides_x));
y_descs_.emplace_back(y_desc_.descriptor<T>(dims_y, strides_y));
}
#if CUDNN_VERSION >= 7201
if (!sequence_length.empty()) {
x_seq_desc_.descriptor<T>(seq_length_, batch_size_, input_size_, true,
sequence_length);
......@@ -61,6 +63,7 @@ class ScopedRNNBase {
hidden_size_ * numDirections, true,
sequence_length);
}
#endif
// ------------------- cudnn hx, hy, cx, cy descriptors----------
std::vector<int> dims_hx = {num_layers_ * numDirections, batch_size_,
......@@ -96,10 +99,13 @@ class ScopedRNNBase {
is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
cudnn_type));
#endif
#if CUDNN_VERSION >= 7201
if (!sequence_length.empty()) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode(
rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED));
}
#endif
// ------------------- cudnn weights_size ---------------------
size_t weights_size_;
......@@ -125,8 +131,10 @@ class ScopedRNNBase {
}
cudnnTensorDescriptor_t* x_descs() { return x_descs_.data(); }
cudnnTensorDescriptor_t* y_descs() { return y_descs_.data(); }
#if CUDNN_VERSION >= 7201
cudnnRNNDataDescriptor_t x_seq_desc() { return x_seq_desc_.desc(); }
cudnnRNNDataDescriptor_t y_seq_desc() { return y_seq_desc_.desc(); }
#endif
cudnnTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); }
cudnnTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); }
cudnnTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); }
......@@ -151,8 +159,10 @@ class ScopedRNNBase {
platform::ScopedTensorDescriptor x_desc_;
platform::ScopedTensorDescriptor y_desc_;
#if CUDNN_VERSION >= 7201
platform::ScopedRNNTensorDescriptor x_seq_desc_;
platform::ScopedRNNTensorDescriptor y_seq_desc_;
#endif
platform::ScopedTensorDescriptor init_h_desc_;
platform::ScopedTensorDescriptor init_c_desc_;
platform::ScopedTensorDescriptor last_h_desc_;
......
......@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
......@@ -129,3 +132,18 @@ REGISTER_OP_CPU_KERNEL(
int>,
ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
int64_t>);
// A specialized elementwise_add operator, used in gradient accumulation with
// inplace addto.
REGISTER_OPERATOR(
grad_add, paddle::operators::ElementwiseOp,
paddle::operators::ElementwiseAddOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
grad_add,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -111,3 +111,10 @@ REGISTER_OP_CUDA_KERNEL(
ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext,
plat::float16>);
REGISTER_OP_CUDA_KERNEL(
grad_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>);
......@@ -174,7 +174,64 @@ struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
float>;
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
auto* scale_data = scale.data<T>();
auto* in_data = in.data<T>();
auto* out_data = out->mutable_data<T>(ctx.GetPlace());
auto in_dims = in.dims();
const int64_t channel = in_dims[quant_axis];
platform::Transform<platform::CPUDeviceContext> trans;
if (quant_axis == 0) {
const int64_t channel_size = in.numel() / channel;
for (int i = 0; i < channel; i++) {
T s = scale_data[i];
auto* start = in_data + i * channel_size;
auto* end = in_data + (i + 1) * channel_size;
trans(ctx, start, end, out_data + i * channel_size,
ClipFunctor<T>(-s, s));
}
for (int i = 0; i < channel; i++) {
T s = scale_data[i];
T inv_s = inverse(s);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
out_e.device(*ctx.eigen_device()) =
(bin_cnt * inv_s * out_e).round() * s / static_cast<T>(bin_cnt);
}
} else if (quant_axis == 1) {
const int64_t step_i = in.numel() / in_dims[0];
const int64_t step_j = in.numel() / (in_dims[0] * in_dims[1]);
for (int i = 0; i < in_dims[0]; i++) {
for (int j = 0; j < in_dims[1]; j++) {
T s = scale_data[j];
T inv_s = inverse(s);
auto* start = in_data + i * step_i + j * step_j;
auto* end = in_data + i * step_i + (j + 1) * step_j;
auto* cur_out_data = out_data + i * step_i + j * step_j;
trans(ctx, start, end, cur_out_data, ClipFunctor<T>(-s, s));
for (int k = 0; k < step_j; k++) {
cur_out_data[k] = std::round(bin_cnt * inv_s * cur_out_data[k]) *
s / static_cast<T>(bin_cnt);
}
}
}
}
}
};
template struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext,
float>;
template <typename T>
struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
......@@ -360,6 +417,75 @@ $$0 \leq c \lt \ the\ channel\ number\ of\ X$$
}
};
class FakeChannelWiseQuantizeDequantizeAbsMaxOp
: public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
"FakeChannelWiseQuantizeDequantizeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
"FakeChannelWiseQuantizeDequantizeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
"FakeChannelWiseQuantizeDequantizeAbsMax");
int quant_axis = ctx->Attrs().Get<int>("quant_axis");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input is float data type.");
AddOutput("Out",
"(Tensor) Output of quantized and dequantized low level tensor, "
"saved as float data type.");
AddOutput("OutScale", "(Tensor) Current channel wise scale");
AddAttr<int>("quant_axis",
"(int, default 0) The axis for quantization. "
"For conv2d, depthwise_conv2d, conv2d_transpose "
"and mul, the quant_axis is equal to the cout axis.")
.SetDefault(0)
.AddCustomChecker([](const int& quant_axis) {
PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument(
"'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
});
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
platform::errors::InvalidArgument(
"'bit_length' should be between 1 and 16, but "
"the received is %d",
bit_length));
});
AddComment(R"DOC(
The scale of FakeChannelWiseQuantize operator is a vector.
In detail, each channel of the input X has a scale value.
$$scale_c = max(abs(X_c))$$
$$range = 2^{bit\_length - 1} - 1$$
$$Out_c = round(\frac{X_c * range} {scale_c}) * \frac{scale_c} {range}$$
In the above three formulas, the range of c is as follows:
$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
)DOC");
}
};
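For readers, here is a hedged NumPy sketch of the per-channel quantize-dequantize formula documented in the op comment above (quant_axis = 0, channels along axis 0); the function and argument names are illustrative and not part of the operator's interface.

import numpy as np

def channel_wise_quant_dequant(x, bit_length=8):
    # scale_c = max(abs(X_c)) per channel along axis 0 (quant_axis = 0)
    scales = np.abs(x).reshape(x.shape[0], -1).max(axis=1)
    bin_cnt = 2 ** (bit_length - 1) - 1  # "range" in the formulas above
    out = np.empty_like(x)
    for c in range(x.shape[0]):
        s = scales[c]
        if s == 0:
            s = 1.0  # avoid division by zero for an all-zero channel
        clipped = np.clip(x[c], -s, s)
        out[c] = np.round(clipped * bin_cnt / s) * s / bin_cnt
    return out, scales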
class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
public:
FakeQuantizeRangeAbsMaxOp(const std::string& type,
......@@ -666,3 +792,12 @@ REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale,
REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp);
REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad,
ops::FakeQuantDequantGradKernel<CPU, float>);
REGISTER_OPERATOR(fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker,
ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CPU, float>);
......@@ -417,8 +417,90 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
}
};
template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext,
float>;
// ChannelClipAndQuantDequantKernel for quant_axis is 0
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis0(
const T* in, const T* scale, const int bin_cnt, const int n, const int c,
T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
// ChannelClipAndQuantDequantKernel for quant_axis is 1
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis1(
const T* in, const T* scale, const int bin_cnt, const int n, const int cin,
const int cout, T* out) {
T s = scale[blockIdx.x % cout];
T inv_s = inverse(s);
int wh_size = n / (cin * cout);
const T* in_c = in + blockIdx.x * wh_size;
T* out_c = out + blockIdx.x * wh_size;
for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
// At present, channelwise quantization supports conv2d, depthwise_conv2d
// conv2d_transpose and mul
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
int num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis0<
T><<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt,
num, in_dims[0], out_data);
} else if (quant_axis == 1) {
int grid = in_dims[0] * in_dims[1];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis1<
T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
}
}
};
template struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext,
float>;
} // namespace operators
} // namespace paddle
......@@ -443,3 +525,6 @@ REGISTER_OP_CUDA_KERNEL(
ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad,
ops::FakeQuantDequantGradKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(
fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CUDA, float>);
......@@ -72,6 +72,13 @@ struct ChannelClipAndFakeQuantFunctor {
const int quant_axis, framework::Tensor* out);
};
template <typename DeviceContext, typename T>
struct ChannelClipFakeQuantDequantFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in,
const framework::Tensor& scale, const int bin_cnt,
const int quant_axis, framework::Tensor* out);
};
template <typename DeviceContext, typename T>
struct FindMovingAverageAbsMaxFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
......@@ -154,6 +161,30 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
class FakeChannelWiseQuantizeDequantizeAbsMaxKernel
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
out->mutable_data<T>(dev_ctx.GetPlace());
int bit_length = context.Attr<int>("bit_length");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
int quant_axis = context.Attr<int>("quant_axis");
FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
out_scale_data);
ChannelClipFakeQuantDequantFunctor<DeviceContext, T>()(
dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out);
}
};
template <typename DeviceContext, typename T>
class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
public:
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_gru_op.h"
#include <cstring> // for memcpy
#include <string>
#include <vector>
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h"
......
......@@ -143,4 +143,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker);
REGISTER_OP_CPU_KERNEL(
rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>);
rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -15,4 +15,5 @@ limitations under the License. */
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>);
rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -32,7 +32,6 @@ class TopkV2Op : public framework::OperatorWithKernel {
auto input_dims = ctx->GetInputDim("X");
const int& dim_size = input_dims.size();
const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true,
"the axis of topk"
......@@ -41,8 +40,18 @@ class TopkV2Op : public framework::OperatorWithKernel {
if (axis < 0) axis += dim_size;
PADDLE_ENFORCE_GE(
k, 1, "the attribute of k in the topk must >= 1, but received %d .", k);
int k;
auto k_is_tensor = ctx->HasInput("K");
if (k_is_tensor) {
k = -1;
} else {
k = static_cast<int>(ctx->Attrs().Get<int>("k"));
PADDLE_ENFORCE_EQ(k >= 1, true,
"the attribute of k in the topk must >= 1 or be a "
"Tensor, but received %d .",
k);
}
PADDLE_ENFORCE_GE(input_dims.size(), 1,
"input of topk must have >= 1d shape");
......
......@@ -294,6 +294,7 @@ class ScopedTensorDescriptor {
DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor);
};
#if CUDNN_VERSION >= 7201
class ScopedRNNTensorDescriptor {
public:
ScopedRNNTensorDescriptor() {
......@@ -337,6 +338,7 @@ class ScopedRNNTensorDescriptor {
cudnnRNNDataDescriptor_t desc_;
DISABLE_COPY_AND_ASSIGN(ScopedRNNTensorDescriptor);
};
#endif
class ScopedDropoutDescriptor {
public:
......
......@@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif
......
......@@ -101,9 +101,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnDropoutGetStatesSize); \
__macro(cudnnSetDropoutDescriptor); \
__macro(cudnnRestoreDropoutDescriptor); \
__macro(cudnnCreateRNNDataDescriptor); \
__macro(cudnnDestroyRNNDataDescriptor); \
__macro(cudnnSetRNNDataDescriptor); \
__macro(cudnnCreateRNNDescriptor); \
__macro(cudnnGetRNNParamsSize); \
__macro(cudnnGetRNNWorkspaceSize); \
......@@ -112,11 +109,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnRNNBackwardData); \
__macro(cudnnRNNBackwardWeights); \
__macro(cudnnRNNForwardInference); \
__macro(cudnnRNNForwardTrainingEx); \
__macro(cudnnSetRNNPaddingMode); \
__macro(cudnnRNNBackwardDataEx); \
__macro(cudnnRNNBackwardWeightsEx); \
__macro(cudnnRNNForwardInferenceEx); \
__macro(cudnnDestroyDropoutDescriptor); \
__macro(cudnnDestroyRNNDescriptor); \
__macro(cudnnSetTensorNdDescriptorEx);
......@@ -188,6 +180,19 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7201
#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \
__macro(cudnnCreateRNNDataDescriptor); \
__macro(cudnnDestroyRNNDataDescriptor); \
__macro(cudnnSetRNNDataDescriptor); \
__macro(cudnnSetRNNPaddingMode); \
__macro(cudnnRNNForwardTrainingEx); \
__macro(cudnnRNNBackwardDataEx); \
__macro(cudnnRNNBackwardWeightsEx); \
__macro(cudnnRNNForwardInferenceEx);
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7401
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \
__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
......
......@@ -521,3 +521,18 @@ DEFINE_int32(
DEFINE_bool(sort_sum_gradient, false,
"Sum gradients by the reverse order of "
"the forward execution sequence.");
/**
* Performance related FLAG
* Name: max_inplace_grad_add
* Since Version: 2.0.0
* Value Range: int32, default=0
* Example:
* Note: The maximum number of inplace grad_add.
*/
DEFINE_int32(
max_inplace_grad_add, 0,
"The maximum number of inplace grad_add. When doing "
"gradient accumulation, if the number of gradients need to that "
"less FLAGS_max_inplace_grad_add, than it will be use several grad_add"
"instead of sum. Default is 0.");
......@@ -62,6 +62,7 @@ DECLARE_bool(use_system_allocator);
// others
DECLARE_bool(benchmark);
DECLARE_int32(inner_op_parallelism);
DECLARE_int32(max_inplace_grad_add);
DECLARE_string(tracer_profile_fname);
#ifdef PADDLE_WITH_CUDA
// cudnn
......@@ -348,7 +349,7 @@ static void RegisterGlobalVarGetterSetter() {
FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
FLAGS_paddle_num_threads, FLAGS_use_mkldnn);
FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add);
#ifdef PADDLE_WITH_CUDA
REGISTER_PUBLIC_GLOBAL_VAR(
......
......@@ -111,6 +111,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"fake_quantize_dequantize_moving_average_abs_max",
{"Out", "OutScale", "OutAccum", "OutState"}},
{"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}},
{"fake_channel_wise_quantize_dequantize_abs_max", {"Out", "OutScale"}},
{"check_finite_and_unscale", {"Out", "FoundInfinite"}},
{"update_loss_scaling",
{"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}},
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <Python.h>
#include <algorithm>
#include <cstdlib>
#include <map>
......@@ -22,6 +23,7 @@ limitations under the License. */
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
......@@ -2528,6 +2530,10 @@ All parameter, weight, gradient are variables in Paddle.
"enable_inplace",
[](const BuildStrategy &self) { return self.enable_inplace_; },
[](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
.def_property(
"enable_addto",
[](const BuildStrategy &self) { return self.enable_addto_; },
[](BuildStrategy &self, bool b) { self.enable_addto_ = b; })
.def_property(
"fuse_all_reduce_ops",
[](const BuildStrategy &self) {
......
......@@ -121,6 +121,18 @@ function cmake_base() {
else
exit 1
fi
elif [ "$1" == "cp38-cp38" ]; then
if [ -d "/Library/Frameworks/Python.framework/Versions/3.8" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
export PATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.8/include/python3.8/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/libpython3.8.dylib"
pip3.8 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else
exit 1
fi
fi
# delete `gym` to avoid modifying requirements.txt in *.whl
sed -i .bak "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt
......@@ -176,6 +188,13 @@ function cmake_base() {
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt
elif [ "$1" == "cp38-cp38" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH}
export PATH=/opt/_internal/cpython-3.8.0/bin/:${PATH}
export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.8.0/bin/python3.8
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.8.0/include/python3.8
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.8.0/lib/libpython3.so"
pip3.8 install -r ${PADDLE_ROOT}/python/requirements.txt
fi
else
pip install -r ${PADDLE_ROOT}/python/requirements.txt
......@@ -514,6 +533,8 @@ EOF
pip3.6 uninstall -y paddlepaddle
elif [ "$1" == "cp37-cp37m" ]; then
pip3.7 uninstall -y paddlepaddle
elif [ "$1" == "cp38-cp38" ]; then
pip3.8 uninstall -y paddlepaddle
fi
set -ex
......@@ -527,6 +548,8 @@ EOF
pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
elif [ "$1" == "cp37-cp37m" ]; then
pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
elif [ "$1" == "cp38-cp38" ]; then
pip3.8 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
fi
tmpfile_rand=`date +%s%N`
tmpfile=$tmp_dir/$tmpfile_rand
......@@ -666,7 +689,7 @@ function generate_api_spec() {
awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc
awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api
if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" ]; then
# Use sed to make python2 and python3 sepc keeps the same
sed -i 's/arg0: str/arg0: unicode/g' $spec_path
sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path
......@@ -1244,21 +1267,25 @@ EOF
ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl
ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl
ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl
if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then
ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl
ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl
fi
#ref_paddle2_mv1=""
......@@ -1363,6 +1390,22 @@ EOF
apt-get clean -y && \
rm -f ${ref_paddle37} && \
ldconfig
EOF
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
# run paddle version to install python packages first
RUN apt-get update && ${NCCL_DEPS}
RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
xz-utils tk-dev libffi-dev liblzma-dev
RUN wget -q https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tgz && \
tar -xzf Python-3.8.0.tgz && cd Python-3.8.0 && \
CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.8.0.tgz
RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \
pip3.8 install opencv-python && wget ${ref_web}/${ref_paddle38} && pip3.8 install ${ref_paddle38_whl}; apt-get install -f -y && \
apt-get clean -y && \
rm -f ${ref_paddle38} && \
ldconfig
EOF
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
# run paddle version to install python packages first
......
......@@ -42,6 +42,7 @@ server_num = fleet.server_num
server_index = fleet.server_index
server_endpoints = fleet.server_endpoints
is_server = fleet.is_server
set_util = fleet.set_util
util = fleet.util
barrier_worker = fleet.barrier_worker
init_worker = fleet.init_worker
......
......@@ -180,6 +180,8 @@ class Fleet(object):
raise ValueError(
"`role_maker` should be subclass of `RoleMakerBase`, but got {}".
format(type(role_maker)))
self._role_maker._generate_role()
self.strategy_compiler = StrategyCompiler()
if paddle.fluid.framework.in_dygraph_mode():
if parallel_helper._is_parallel_ctx_initialized():
......@@ -187,7 +189,6 @@ class Fleet(object):
"The dygraph parallel environment has been initialized.")
else:
paddle.distributed.init_parallel_env()
return None
def is_first_worker(self):
"""
......@@ -206,7 +207,7 @@ class Fleet(object):
fleet.is_first_worker()
"""
return self._role_maker.is_first_worker()
return self._role_maker._is_first_worker()
def worker_index(self):
"""
......@@ -223,7 +224,7 @@ class Fleet(object):
fleet.worker_index()
"""
return self._role_maker.worker_index()
return self._role_maker._worker_index()
def worker_num(self):
"""
......@@ -240,7 +241,7 @@ class Fleet(object):
fleet.worker_num()
"""
return self._role_maker.worker_num()
return self._role_maker._worker_num()
def is_worker(self):
"""
......@@ -258,7 +259,7 @@ class Fleet(object):
fleet.is_worker()
"""
return self._role_maker.is_worker()
return self._role_maker._is_worker()
def worker_endpoints(self, to_string=False):
"""
......@@ -275,13 +276,10 @@ class Fleet(object):
fleet.worker_endpoints()
"""
'''
if to_string:
return ",".join(self._role_maker.get_trainer_endpoints())
return ",".join(self._role_maker._get_trainer_endpoints())
else:
return self._role_maker.get_trainer_endpoints()
'''
return ["127.0.0.1:1001", "127.0.0.1:1002"]
return self._role_maker._get_trainer_endpoints()
def server_num(self):
"""
......@@ -296,7 +294,7 @@ class Fleet(object):
fleet.init()
fleet.server_num()
"""
return len(self._role_maker.get_pserver_endpoints())
return len(self._role_maker._get_pserver_endpoints())
def server_index(self):
"""
......@@ -313,7 +311,7 @@ class Fleet(object):
fleet.server_index()
"""
return self._role_maker.server_index()
return self._role_maker._server_index()
def server_endpoints(self, to_string=False):
"""
......@@ -332,9 +330,9 @@ class Fleet(object):
"""
if to_string:
return ",".join(self._role_maker.get_pserver_endpoints())
return ",".join(self._role_maker._get_pserver_endpoints())
else:
return self._role_maker.get_pserver_endpoints()
return self._role_maker._get_pserver_endpoints()
def is_server(self):
"""
......@@ -352,10 +350,12 @@ class Fleet(object):
fleet.is_server()
"""
return self._role_maker.is_server(
return self._role_maker._is_server(
) or self._role_maker._is_heter_worker()
@property
def set_util(self, util):
self._util = util
def util(self):
"""
Utility functions that can be used under certain runtime
......@@ -376,16 +376,6 @@ class Fleet(object):
"""
return self._util
@util.setter
def util(self, util):
"""
Set utility functions for user-defined runtime
Returns:
None
"""
self._util = util
def barrier_worker(self):
"""
barrier all workers
......@@ -393,7 +383,7 @@ class Fleet(object):
Returns:
None
"""
self._role_maker.barrier_worker()
self._role_maker._barrier("worker")
@is_non_distributed_check
@inited_runtime_handler
......
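To see how the renamed role-maker hooks surface to users, here is a minimal usage sketch of the public Fleet API touched above (a sketch only, assuming a properly launched fleet environment; the printed endpoints depend on the launcher):

import paddle.distributed.fleet as fleet

fleet.init()                                   # the role is generated internally via _generate_role()
if fleet.is_worker():                          # delegates to role_maker._is_worker()
    print("worker", fleet.worker_index(), "of", fleet.worker_num())
print("trainer endpoints:", fleet.worker_endpoints(to_string=True))
fleet.barrier_worker()                         # now implemented as role_maker._barrier("worker")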
......@@ -57,34 +57,7 @@ class UtilBase(object):
), "fs_client must be the instance of paddle.distributed.fleet.utils.FS"
self.fs_client = fs_client
def __check_comm_world(self, comm_world="worker"):
if not self.role_maker._role_is_generated:
self.role_maker.generate_role()
_comm_world = None
comm_world_upper = comm_world.upper()
if comm_world_upper == "WORKER":
if not self.role_maker.is_worker():
print(
"warning: current role is not worker in collective_func(comm_world=\"worker\")"
)
_comm_world = self.role_maker._node_type_comm
elif comm_world_upper == "SERVER":
if not self.role_maker.is_server():
print(
"warning: current role is not server in collective_func(comm_world=\"server\")"
)
_comm_world = self.role_maker._node_type_comm
elif comm_world_upper == "ALL":
_comm_world = self.role_maker._all_comm
else:
raise ValueError(
"not support comm_world, please choose one from [worker, server, all]"
)
return _comm_world
def all_reduce(self, input, mode, comm_world="worker"):
def all_reduce(self, input, mode="sum", comm_world="worker"):
"""
All-reduce `input` across the specified collection. This is a distributed API.
......@@ -130,8 +103,7 @@ class UtilBase(object):
if __name__ == "__main__":
train()
"""
_comm_world = self.__check_comm_world(comm_world)
return self.role_maker._all_reduce(_comm_world, input, mode)
return self.role_maker._all_reduce(input, mode, comm_world)
def barrier(self, comm_world="worker"):
"""
......@@ -170,8 +142,7 @@ class UtilBase(object):
if __name__ == "__main__":
train()
"""
_comm_world = self.__check_comm_world(comm_world)
self.role_maker._barrier(_comm_world)
self.role_maker._barrier(comm_world)
def all_gather(self, input, comm_world="worker"):
"""
......@@ -219,8 +190,8 @@ class UtilBase(object):
if __name__ == "__main__":
train()
"""
_comm_world = self.__check_comm_world(comm_world)
return self.role_maker._all_gather(_comm_world, input)
return self.role_maker._all_gather(input, comm_world)
def _broadcast(self):
pass
......@@ -266,8 +237,8 @@ class UtilBase(object):
if not isinstance(files, list):
raise TypeError("files should be a list of file need to be read.")
trainer_id = self.role_maker.worker_index()
trainers = self.role_maker.worker_num()
trainer_id = self.role_maker._worker_index()
trainers = self.role_maker._worker_num()
remainder = len(files) % trainers
blocksize = int(len(files) / trainers)
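The remainder/blocksize arithmetic above splits the file list as evenly as possible across trainers, with the first `remainder` trainers taking one extra file each. A standalone sketch of that distribution (a hypothetical helper written only for illustration, not part of the diff):

def split_files_for_trainer(files, trainer_id, trainers):
    remainder = len(files) % trainers
    blocksize = len(files) // trainers
    # the first `remainder` trainers each take one extra file
    begin = trainer_id * blocksize + min(trainer_id, remainder)
    length = blocksize + (1 if trainer_id < remainder else 0)
    return files[begin:begin + length]

# 5 files over 2 trainers: trainer 0 gets 3 files, trainer 1 gets 2
assert split_files_for_trainer(list("abcde"), 0, 2) == ["a", "b", "c"]
assert split_files_for_trainer(list("abcde"), 1, 2) == ["d", "e"]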
......@@ -309,7 +280,7 @@ class UtilBase(object):
fleet_util._set_role_maker(role)
fleet_util.print_on_rank("I'm worker 0", 0)
"""
if self.role_maker.worker_index() != rank_id:
if self.role_maker._worker_index() != rank_id:
return
print(message)
......
......@@ -55,7 +55,10 @@ launch a process on each of the given gpu card or cpu machine.
"""
from __future__ import print_function
import shutil
import sys
import tempfile
from sys import version
import subprocess
import os
......@@ -213,12 +216,20 @@ def launch_collective(args):
cluster, pod = get_cluster_from_args(args, gpus)
logger.debug("get cluster from args:{}".format(cluster))
global_envs = copy.copy(os.environ.copy())
gloo_rendezvous_dir = tempfile.mkdtemp()
# add gloo env
global_envs["PADDLE_WITH_GLOO"] = "1"
global_envs["PADDLE_GLOO_RENDEZVOUS"] = "2"
global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
procs = start_local_trainers(
cluster,
pod,
training_script=args.training_script,
training_script_args=args.training_script_args,
log_dir=args.log_dir)
log_dir=args.log_dir,
envs=global_envs)
while True:
alive = watch_local_trainers(procs, cluster.trainers_nranks())
......@@ -230,6 +241,9 @@ def launch_collective(args):
time.sleep(3)
if os.path.exists(gloo_rendezvous_dir):
shutil.rmtree(gloo_rendezvous_dir)
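The same gloo bootstrap pattern is applied in both launch_collective and launch_ps: create a temporary rendezvous directory, export it to the child processes, and remove it after training ends. A condensed sketch of just that lifecycle (the actual process launching is elided):

import os
import shutil
import tempfile

gloo_rendezvous_dir = tempfile.mkdtemp()
envs = dict(os.environ)
envs["PADDLE_WITH_GLOO"] = "1"
envs["PADDLE_GLOO_RENDEZVOUS"] = "2"
envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
try:
    pass  # start_local_trainers(..., envs=envs) or the parameter-server procs go here
finally:
    if os.path.exists(gloo_rendezvous_dir):
        shutil.rmtree(gloo_rendezvous_dir)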
def launch_ps(args):
ports = None
......@@ -315,6 +329,13 @@ def launch_ps(args):
default_env = os.environ.copy()
current_env = copy.copy(default_env)
gloo_rendezvous_dir = tempfile.mkdtemp()
# add gloo env
current_env["PADDLE_WITH_GLOO"] = "1"
current_env["PADDLE_GLOO_RENDEZVOUS"] = "2"
current_env["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
current_env.pop("http_proxy", None)
current_env.pop("https_proxy", None)
procs = []
......@@ -419,6 +440,9 @@ def launch_ps(args):
procs[i].proc.terminate()
print("all parameter server are killed", file=sys.stderr)
if os.path.exists(gloo_rendezvous_dir):
shutil.rmtree(gloo_rendezvous_dir)
def launch():
args = _parse_args()
......
......@@ -398,8 +398,14 @@ def start_local_trainers(cluster,
pod,
training_script,
training_script_args,
log_dir=None):
current_env = copy.copy(os.environ.copy())
log_dir=None,
envs=None):
if envs is None:
current_env = copy.copy(os.environ.copy())
else:
current_env = copy.copy(envs)
#paddle broadcasts ncclUniqueId over sockets, and
#proxies may make trainers unreachable, so delete the proxy env vars.
#if we set them to "", grpc will log the error message "bad uri"
......
......@@ -57,12 +57,12 @@ class CollectiveHelper(object):
if startup_program is None:
self.startup_program = fluid.default_startup_program()
endpoints = self.role_maker.get_trainer_endpoints()
current_endpoint = endpoints[self.role_maker.worker_index()]
endpoints = self.role_maker._get_trainer_endpoints()
current_endpoint = endpoints[self.role_maker._worker_index()]
for ring_id in range(self.nrings):
self._init_communicator(
self.startup_program, current_endpoint, endpoints,
self.role_maker.worker_index(), ring_id, self.wait_port)
self.role_maker._worker_index(), ring_id, self.wait_port)
self._broadcast_params()
def _init_communicator(self, program, current_endpoint, endpoints, rank,
......
......@@ -47,7 +47,7 @@ class DGCOptimizer(MetaOptimizerBase):
sparsity=configs['sparsity'],
parameter_list=opt._parameter_list,
use_nesterov=opt._use_nesterov,
num_trainers=self.role_maker.worker_num(),
num_trainers=self.role_maker._worker_num(),
regularization=opt.regularization,
grad_clip=opt._grad_clip,
name=opt._name)
......@@ -60,7 +60,7 @@ class DGCOptimizer(MetaOptimizerBase):
if not isinstance(self.inner_opt, Momentum):
logging.warn("dgc only works on Momentum optimizer")
return False
if self.role_maker.worker_num() <= 1:
if self.role_maker._worker_num() <= 1:
logging.warn("dgc only works on multi cards")
return False
......
......@@ -50,12 +50,12 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
# should fix the variable
def _setup_nccl_op(self, startup_program, main_program, build_strategy):
trainer_endpoints = self.role_maker.get_trainer_endpoints()
trainer_endpoints = self.role_maker._get_trainer_endpoints()
trainers = trainer_endpoints
trainer_id = self.role_maker.worker_index()
current_endpoint = self.role_maker.get_trainer_endpoints()[trainer_id]
trainer_id = self.role_maker._worker_index()
current_endpoint = self.role_maker._get_trainer_endpoints()[trainer_id]
trainer_endpoints_env = ",".join(trainer_endpoints)
trainers_num = self.role_maker.worker_num()
trainers_num = self.role_maker._worker_num()
nccl_id_var = startup_program.global_block().create_var(
name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
for i in range(1, build_strategy.nccl_comm_num):
......@@ -127,8 +127,8 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
local_build_strategy.enable_sequential_execution = True
exe_strategy = self.user_defined_strategy.execution_strategy
worker_num = self.role_maker.worker_num()
node_num = self.role_maker.node_num()
worker_num = self.role_maker._worker_num()
node_num = self.role_maker._node_num()
if self.role_maker._is_collective:
assert worker_num >= 1, "nccl2 worker_num must >= 1, now:{}".format(worker_num)
......@@ -170,9 +170,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
# TODO(guru4elephant): should be an independent optimizer
self._setup_nccl_op(startup_program, main_program, local_build_strategy)
local_build_strategy.num_trainers = self.role_maker.worker_num()
local_build_strategy.trainer_id = self.role_maker.worker_index()
local_build_strategy.trainers_endpoints = self.role_maker.get_trainer_endpoints(
local_build_strategy.num_trainers = self.role_maker._worker_num()
local_build_strategy.trainer_id = self.role_maker._worker_index()
local_build_strategy.trainers_endpoints = self.role_maker._get_trainer_endpoints(
)
local_build_strategy.enable_backward_optimizer_op_deps = True
......
......@@ -38,7 +38,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
if not self.user_defined_strategy.localsgd:
return False
if self.role_maker.worker_num() <= 1:
if self.role_maker._worker_num() <= 1:
return False
return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
......@@ -168,7 +168,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
inputs={'X': [param]},
outputs={'Out': [param]},
attrs={
'scale': 1.0 / self.role_maker.worker_num(),
'scale': 1.0 / self.role_maker._worker_num(),
OP_ROLE_KEY: OpRole.Optimize
})
sub_block.append_op(
......@@ -208,7 +208,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
if not self.user_defined_strategy.adaptive_localsgd:
return False
if self.role_maker.worker_num() <= 1:
if self.role_maker._worker_num() <= 1:
return False
return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
......@@ -275,7 +275,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
inputs={'X': [avg_loss]},
outputs={'Out': [avg_loss]},
attrs={
'scale': 1.0 / self.role_maker.worker_num(),
'scale': 1.0 / self.role_maker._worker_num(),
OP_ROLE_KEY: OpRole.Optimize
})
......@@ -398,7 +398,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
inputs={'X': [param]},
outputs={'Out': [param]},
attrs={
'scale': 1.0 / self.role_maker.worker_num(),
'scale': 1.0 / self.role_maker._worker_num(),
OP_ROLE_KEY: OpRole.Optimize
})
sub_block.append_op(
......
......@@ -31,7 +31,7 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer):
if k_steps < 0:
return False
if self.role_maker.is_server():
if self.role_maker._is_server():
return False
if self.role_maker._is_heter_parameter_server_mode:
......
......@@ -239,10 +239,10 @@ class ParameterServerOptimizer(MetaOptimizerBase):
strategy, self.role_maker)
compiled_config.strategy = strategy
if self.role_maker.is_worker() or self.role_maker._is_heter_worker():
if self.role_maker._is_worker() or self.role_maker._is_heter_worker():
main_program, startup_program = self._build_trainer_programs(
compiled_config)
elif self.role_maker.is_server():
elif self.role_maker._is_server():
main_program, startup_program = self._build_pserver_programs(
compiled_config)
......
......@@ -126,11 +126,11 @@ class PipelineOptimizer(MetaOptimizerBase):
optimize_ops, params_grads, prog_list = \
self.wrapped_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
if self.role_maker.worker_num() == 1:
if self.role_maker._worker_num() == 1:
return optimize_ops, params_grads
endpoints = self.role_maker.get_trainer_endpoints()
current_endpoint = endpoints[self.role_maker.worker_index()]
endpoints = self.role_maker._get_trainer_endpoints()
current_endpoint = endpoints[self.role_maker._worker_index()]
self.startup_program = startup_program
if startup_program is None:
self.startup_program = fluid.default_startup_program()
......@@ -142,7 +142,7 @@ class PipelineOptimizer(MetaOptimizerBase):
self.nranks = nranks
self.nrings = len(self.main_program_list)
self.rank = self.role_maker.worker_index()
self.rank = self.role_maker._worker_index()
self.endpoints = endpoints
self.current_endpoint = current_endpoint
......
......@@ -104,9 +104,9 @@ class ParameterServerRuntime(RuntimeBase):
def _init_worker(self):
def sync_strategy_envs():
kwargs = {}
kwargs["pserver_endpoints"] = self.role_maker.get_pserver_endpoints(
)
kwargs["trainer_id"] = self.role_maker.worker_index()
kwargs[
"pserver_endpoints"] = self.role_maker._get_pserver_endpoints()
kwargs["trainer_id"] = self.role_maker._worker_index()
return kwargs
def geo_strategy_envs():
......@@ -150,7 +150,7 @@ class ParameterServerRuntime(RuntimeBase):
return "#".join(init_attrs)
kwargs = {}
kwargs["trainers"] = self.role_maker.worker_num()
kwargs["trainers"] = self.role_maker._worker_num()
kwargs["sparse_attrs"] = get_sparse_attrs()
return kwargs
......@@ -338,7 +338,7 @@ class ParameterServerRuntime(RuntimeBase):
block.append_op(
type='recv_save',
attrs={
"trainer_id": self.role_maker.worker_index(),
"trainer_id": self.role_maker._worker_index(),
"shape": var.shape,
"slice_shapes":
[",".join([str(i) for i in var.shape])],
......@@ -378,14 +378,15 @@ class ParameterServerRuntime(RuntimeBase):
block.append_op(
type='recv_save',
attrs={
"trainer_id": self.role_maker.worker_index(),
"trainer_id": self.role_maker._worker_index(),
"shape": var.shape,
"slice_shapes": slice_shapes,
"slice_varnames": var_ctx.split_varnames(),
"remote_varnames": var_ctx.split_varnames(),
"is_sparse": True,
"endpoints": var_ctx.split_endpoints(),
"pserver_num": len(self.role_maker.get_pserver_endpoints()),
"pserver_num":
len(self.role_maker._get_pserver_endpoints()),
"file_path": os.path.join(dirname, var.name)
})
......@@ -403,7 +404,7 @@ class ParameterServerRuntime(RuntimeBase):
block.append_op(
type='recv_save',
attrs={
"trainer_id": self.role_maker.worker_index(),
"trainer_id": self.role_maker._worker_index(),
"shape": var.shape,
"slice_shapes": slice_shapes,
"slice_varnames": slice_varnames,
......@@ -411,7 +412,7 @@ class ParameterServerRuntime(RuntimeBase):
"is_sparse": True,
"endpoints": var_ctx.split_endpoints(),
"pserver_num":
len(self.role_maker.get_pserver_endpoints()),
len(self.role_maker._get_pserver_endpoints()),
"file_path": os.path.join(dirname, var.name)
})
......@@ -422,7 +423,7 @@ class ParameterServerRuntime(RuntimeBase):
block.append_op(
type='recv_save',
attrs={
"trainer_id": self.role_maker.worker_index(),
"trainer_id": self.role_maker._worker_index(),
"shape": var.shape,
"slice_shapes":
[",".join([str(i) for i in var.shape])],
......
......@@ -197,6 +197,7 @@ def __bootstrap__():
'free_when_no_cache_hit',
'call_stack_level',
'sort_sum_gradient',
'max_inplace_grad_add',
]
if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory')
......
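The newly registered max_inplace_grad_add flag is what _addup_repetitive_outputs_ reads further down via core.globals()['FLAGS_max_inplace_grad_add']. A hedged sketch of setting it through the environment before Paddle is imported (the value 8 is purely illustrative):

import os
os.environ['FLAGS_max_inplace_grad_add'] = '8'   # illustrative threshold

import paddle.fluid as fluid
from paddle.fluid import core
print(core.globals()['FLAGS_max_inplace_grad_add'])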
......@@ -251,12 +251,19 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
begin_idx = 0
if end_idx is None:
end_idx = len(op_descs)
for i in range(begin_idx, end_idx):
op_desc = op_descs[i]
if isinstance(op_desc, tuple):
op_desc = op_desc[0]
op_desc._rename_input(old_name, new_name)
op_desc._rename_output(old_name, new_name)
if isinstance(op_descs, (list, tuple)):
for i in range(begin_idx, end_idx):
op_desc = op_descs[i]
if isinstance(op_desc, tuple):
op_desc = op_desc[0]
op_desc._rename_input(old_name, new_name)
op_desc._rename_output(old_name, new_name)
if isinstance(op_descs, collections.OrderedDict):
for key, value in op_descs.items():
if isinstance(value, (list, tuple)):
for op_desc in value:
op_desc._rename_input(old_name, new_name)
op_desc._rename_output(old_name, new_name)
def _create_op_desc_(op_type, inputs, outputs, attrs):
......@@ -369,6 +376,41 @@ def _append_grad_suffix_(name):
return cpt.to_text(name) + core.grad_var_suffix()
def _accumulate_gradients_by_sum_op_(var_name, renamed_vars, pending_sum_ops,
op_idx):
"""
Use a sum op to accumulate gradients; the accumulated gradients are stored in renamed_vars.
"""
if op_idx not in pending_sum_ops.keys():
pending_sum_ops[op_idx] = []
pending_sum_ops[op_idx].append(
_create_op_desc_("sum", {"X": renamed_vars[var_name]},
{"Out": [var_name]}, {"use_mkldnn": False}))
renamed_vars[var_name] = [var_name]
def _accumulate_gradients_by_add_ops_(var_name, renamed_vars, pending_sum_ops,
op_idx):
"""
Use several in-place add ops (grad_add) to accumulate gradients; the accumulated gradients are stored in renamed_vars.
"""
if op_idx not in pending_sum_ops.keys():
pending_sum_ops[op_idx] = []
out_name = renamed_vars[var_name][0]
for i in range(1, len(renamed_vars[var_name])):
x_name = out_name
y_name = renamed_vars[var_name][i]
if i != len(renamed_vars[var_name]) - 1:
out_name = var_name + '@ADD@' + str(i)
else:
out_name = var_name
pending_sum_ops[op_idx].append(
_create_op_desc_("grad_add", {"X": [x_name],
"Y": [y_name]}, {"Out": [out_name]},
{"use_mkldnn": False}))
renamed_vars[var_name] = [var_name]
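For intuition, here is what the two helpers emit for a gradient that has been renamed three times (the @RENAME@ suffixes below are illustrative; the real names also encode the block index):

# Given renamed_vars["x@GRAD"] == ["x@GRAD", "x@GRAD@RENAME@0", "x@GRAD@RENAME@1"]:
#
# _accumulate_gradients_by_sum_op_ emits one op:
#   sum(X=["x@GRAD", "x@GRAD@RENAME@0", "x@GRAD@RENAME@1"]) -> Out: "x@GRAD"
#
# _accumulate_gradients_by_add_ops_ emits a chain of in-place adds:
#   grad_add(X="x@GRAD",       Y="x@GRAD@RENAME@0") -> Out: "x@GRAD@ADD@1"
#   grad_add(X="x@GRAD@ADD@1", Y="x@GRAD@RENAME@1") -> Out: "x@GRAD"
#
# Both helpers then reset renamed_vars["x@GRAD"] to ["x@GRAD"].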
def _addup_repetitive_outputs_(op_descs, block_idx):
"""
In the backward pass, a variable may be the output of more than one op.
......@@ -376,7 +418,9 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
In these cases, the variable should be the accumulation of all the outputs.
`sum_op`s are added to implement the accumulation.
"""
pending_sum_ops = []
_MAX_ADD_NUM_ = core.globals()['FLAGS_max_inplace_grad_add']
#pending_sum_ops = []
pending_sum_ops = collections.OrderedDict()
var_rename_count = collections.defaultdict(int)
renamed_vars = collections.defaultdict(list)
renamed_var_start_idx = collections.defaultdict(list)
......@@ -385,10 +429,13 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
if "@GRAD" not in var_name:
continue
if len(renamed_vars[var_name]) > 1:
pending_sum_ops.append((_create_op_desc_(
"sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
{"use_mkldnn": False}), idx))
renamed_vars[var_name] = [var_name]
if len(renamed_vars[var_name]) > _MAX_ADD_NUM_:
_accumulate_gradients_by_sum_op_(var_name, renamed_vars,
pending_sum_ops, idx)
else:
_accumulate_gradients_by_add_ops_(var_name, renamed_vars,
pending_sum_ops, idx)
for param_idx, param_name in enumerate(op_desc.output_names()):
arg_names = op_desc.output(param_name)
for arg_idx, var_name in enumerate(arg_names):
......@@ -440,13 +487,26 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
renamed_vars[var_name].append(new_name)
for var_name, inputs in six.iteritems(renamed_vars):
if len(inputs) > 1:
pending_sum_ops.append(
(_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
{"use_mkldnn": False}), len(op_descs)))
if len(renamed_vars[var_name]) > 1:
if len(renamed_vars[var_name]) > _MAX_ADD_NUM_:
_accumulate_gradients_by_sum_op_(var_name, renamed_vars,
pending_sum_ops, len(op_descs))
else:
_accumulate_gradients_by_add_ops_(var_name, renamed_vars,
pending_sum_ops,
len(op_descs))
# sum_op descs are sorted according to their insert position
for p in reversed(pending_sum_ops):
op_descs.insert(p[1], p[0])
for key, value in collections.OrderedDict(
reversed(list(pending_sum_ops.items()))).items():
# NOTE(zhiqiu): Since the items are iterated in reversed order, the index at which each op_desc is inserted remains correct.
# For example, given [0, 1, 2], we want to insert 'a' at idx 1 and 'b' at idx 2; the expected result is [0, 'a', 1, 'b', 2].
# If reversed, we first insert 'b' at idx 2, giving [0, 1, 'b', 2], and then insert 'a' at idx 1, giving [0, 'a', 1, 'b', 2].
# If not reversed, we first insert 'a' at idx 1, giving [0, 'a', 1, 2], and then insert 'b' at idx 2, giving [0, 'a', 'b', 1, 2], which places 'b' before the wrong op.
idx = key
for i, op in enumerate(value):
op_descs.insert(idx + i, op)
return op_descs
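A tiny runnable check of the reversed-insertion argument in the note above, using plain Python lists in place of op descs:

ops = [0, 1, 2]
pending = {1: ['a'], 2: ['b']}           # ops to insert before index 1 and before index 2
for idx in sorted(pending, reverse=True):
    for i, op in enumerate(pending[idx]):
        ops.insert(idx + i, op)
assert ops == [0, 'a', 1, 'b', 2]        # each insert still lands before its original index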
......
......@@ -99,7 +99,12 @@ class ImperativeQuantAware(object):
self._activation_bits = activation_bits
self._moving_rate = moving_rate
quant_type = {'abs_max', 'moving_average_abs_max'}
quant_type = {
'abs_max', 'moving_average_abs_max', 'channel_wise_abs_max'
}
assert activation_quantize_type != 'channel_wise_abs_max', \
"The activation quantization type does not support 'channel_wise_abs_max'."
if activation_quantize_type not in quant_type:
raise ValueError(
"Unknown activation_quantize_type : '%s'. It can only be "
......@@ -108,8 +113,8 @@ class ImperativeQuantAware(object):
if weight_quantize_type not in quant_type:
raise ValueError(
"Unknown weight_quantize_type: '%s'. It can only be "
"'abs_max' or 'moving_average_abs_max' now." %
(str(weight_quantize_type)))
"'abs_max' or 'moving_average_abs_max' or 'channel_wise_abs_max' now."
% (str(weight_quantize_type)))
self._activation_quantize_type = activation_quantize_type
self._weight_quantize_type = weight_quantize_type
......
......@@ -24,7 +24,7 @@ from paddle.fluid.data_feeder import check_variable_and_dtype
__all__ = [
'FakeQuantMovingAverage', 'FakeQuantAbsMax', 'QuantizedConv2D',
'QuantizedLinear'
'QuantizedLinear', 'FakeChannelWiseQuantDequantAbsMax'
]
......@@ -209,6 +209,89 @@ class FakeQuantAbsMax(layers.Layer):
return quant_out
class FakeChannelWiseQuantDequantAbsMax(layers.Layer):
def __init__(self,
name=None,
channel_num=None,
quant_bits=8,
quant_axis=0,
dtype='float32',
quant_on_weight=False):
assert quant_on_weight == True, "channel_wise_abs_max can only be used for weight quantization."
super(FakeChannelWiseQuantDequantAbsMax, self).__init__()
self._quant_bits = quant_bits
self._quant_axis = quant_axis
self._dtype = dtype
self._name = name
self._channel_num = channel_num
scale_prefix = "{}.scale".format(
name) if name else 'quant_dequant.scale'
self._scale_name = unique_name.generate(scale_prefix)
if quant_on_weight:
scale_attr = ParamAttr(
name=self._scale_name,
initializer=Constant(0.0),
trainable=False)
self._scale = self.create_parameter(
shape=[self._channel_num], attr=scale_attr, dtype=self._dtype)
self._scale.stop_gradient = True
else:
self._scale = None
def forward(self, input):
if in_dygraph_mode():
attrs = ('bit_length', self._quant_bits, 'quant_axis',
self._quant_axis)
quant_out = _varbase_creator(
type=input.type,
name="{}.quantized.dequantized".format(input.name),
shape=input.shape,
dtype=input.dtype,
persistable=False)
out_scale = self._scale
if out_scale is None:
out_scale = _varbase_creator(
type=core.VarDesc.VarType.LOD_TENSOR,
name=self._scale_name,
shape=[self._channel_num],
dtype=self._dtype,
persistable=False)
out_scale.stop_gradient = True
out, _, = core.ops.fake_channel_wise_quantize_dequantize_abs_max(
input, quant_out, out_scale, *attrs)
return out
check_variable_and_dtype(input, 'input', ['float32'],
"FakeChannelWiseQuantDequantAbsMax")
attrs = {'bit_length': self._quant_bits, 'quant_axis': self._quant_axis}
inputs = {"X": [input]}
quant_out = self._helper.create_variable(
name="{}.quantized.dequantized".format(input.name),
dtype=input.dtype,
type=core.VarDesc.VarType.LOD_TENSOR,
persistable=False,
stop_gradient=False)
out_scale = self._scale
if not out_scale:
out_scale = self._helper.create_variable(
name=self._scale_name,
dtype=self._dtype,
type=core.VarDesc.VarType.LOD_TENSOR,
persistable=False,
stop_gradient=True)
outputs = {"Out": [quant_out], "OutScale": [out_scale]}
self._helper.append_op(
type="fake_channel_wise_quantize_dequantize_abs_max",
inputs=inputs,
outputs=outputs,
attrs=attrs)
return quant_out
def _get_fake_quant_type(quant_type, **kwargs):
call_args = {
"name": kwargs.get("name", None),
......@@ -220,10 +303,17 @@ def _get_fake_quant_type(quant_type, **kwargs):
call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False)
elif quant_type == 'moving_average_abs_max':
call_args["moving_rate"] = kwargs.get("moving_rate", 0.9)
elif quant_type == 'channel_wise_abs_max':
call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False)
call_args["channel_num"] = kwargs.get("channel_num", None)
call_args["quant_axis"] = kwargs.get("quant_axis", 0)
assert call_args["channel_num"] is not None, (
"You need to input channel_num"
"when you use channel_wise_abs_max strategy.")
fake_quant_map = {
'abs_max': FakeQuantAbsMax,
'moving_average_abs_max': FakeQuantMovingAverage
'moving_average_abs_max': FakeQuantMovingAverage,
'channel_wise_abs_max': FakeChannelWiseQuantDequantAbsMax
}
return fake_quant_map[quant_type](**call_args)
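A hedged sketch of requesting the new channel-wise weight quantizer through _get_fake_quant_type (the weight name and channel count are illustrative; this mirrors the call QuantizedConv2D makes below with quant_axis=0):

fake_quant_weight = _get_fake_quant_type(
    'channel_wise_abs_max',
    name='conv2d_0.w_0',     # illustrative parameter name
    quant_bits=8,
    dtype='float32',
    quant_on_weight=True,
    channel_num=32,          # weight.shape[0] for conv2d
    quant_axis=0)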
......@@ -255,19 +345,23 @@ class QuantizedConv2D(layers.Layer):
self.weight = getattr(layer, 'weight')
self.bias = getattr(layer, 'bias')
# For FakeQuant
self._conv2d_quant_axis = 0
self._fake_quant_weight = _get_fake_quant_type(
weight_quantize_type,
name=self.weight.name,
moving_rate=moving_rate,
quant_bits=weight_bits,
dtype=self._dtype,
quant_on_weight=True)
quant_on_weight=True,
channel_num=self.weight.shape[self._conv2d_quant_axis],
quant_axis=self._conv2d_quant_axis)
self._fake_quant_input = _get_fake_quant_type(
activation_quantize_type,
name=layer.full_name(),
moving_rate=moving_rate,
quant_bits=activation_bits,
dtype=self._dtype)
dtype=self._dtype,
quant_on_weight=False)
def forward(self, input):
quant_input = self._fake_quant_input(input)
......@@ -341,19 +435,23 @@ class QuantizedLinear(layers.Layer):
self.weight = getattr(layer, 'weight')
self.bias = getattr(layer, 'bias')
# For FakeQuant
self._linear_quant_axis = 1
self._fake_quant_weight = _get_fake_quant_type(
weight_quantize_type,
name=self.weight.name,
moving_rate=moving_rate,
quant_bits=weight_bits,
dtype=self._dtype,
quant_on_weight=True)
quant_on_weight=True,
channel_num=self.weight.shape[self._linear_quant_axis],
quant_axis=self._linear_quant_axis)
self._fake_quant_input = _get_fake_quant_type(
activation_quantize_type,
name=layer.full_name(),
moving_rate=moving_rate,
quant_bits=activation_bits,
dtype=self._dtype)
dtype=self._dtype,
quant_on_weight=False)
def forward(self, input):
quant_input = self._fake_quant_input(input)
......
......@@ -181,7 +181,6 @@ class TestImperativeQat(unittest.TestCase):
img = fluid.dygraph.to_variable(x_data)
label = fluid.dygraph.to_variable(y_data)
out = lenet(img)
acc = fluid.layers.accuracy(out, label)
loss = fluid.layers.cross_entropy(out, label)
......
......@@ -170,22 +170,40 @@ class CompileTimeStrategy(object):
return trainer.mode == DistributedMode.ASYNC
def get_role_id(self):
return self.role_maker.role_id()
try:
return self.role_maker._role_id()
except Exception:
return self.role_maker.role_id()
def get_trainers(self):
return self.role_maker.worker_num()
try:
return self.role_maker._worker_num()
except Exception:
return self.role_maker.worker_num()
def get_ps_endpoint(self):
return self.role_maker.get_pserver_endpoints()[self.get_role_id()]
try:
return self.role_maker._get_pserver_endpoints()[self.get_role_id()]
except Exception:
return self.role_maker.get_pserver_endpoints()[self.get_role_id()]
def get_ps_endpoints(self):
return self.role_maker.get_pserver_endpoints()
try:
return self.role_maker._get_pserver_endpoints()
except Exception:
return self.role_maker.get_pserver_endpoints()
def get_heter_worker_endpoints(self):
return self.role_maker._get_heter_worker_endpoints()
try:
return self.role_maker._get_heter_worker_endpoints()
except Exception:
return self.role_maker.get_heter_worker_endpoints()
def get_heter_worker_endpoint(self):
return self.role_maker._get_heter_worker_endpoint()
try:
return self.role_maker._get_heter_worker_endpoint()
except Exception:
return self.role_maker.get_heter_worker_endpoint()
def get_origin_programs(self):
return self.origin_main_program, self.origin_startup_program
......
......@@ -680,8 +680,10 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
if not isinstance(value, Variable):
if dtype in ['int64', 'int32']:
attrs['str_value'] = str(int(value))
attrs['value'] = int(value)
else:
attrs['str_value'] = str(float(value))
attrs['value'] = float(value)
if in_dygraph_mode():
shape = utils.convert_shape_to_list(shape)
......
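With the change above, both str_value and the typed value attribute are now populated when value is not a Variable. A minimal static-graph call that exercises the integer branch:

import paddle.fluid as fluid

x = fluid.layers.fill_constant(shape=[2, 3], dtype='int64', value=5)
exe = fluid.Executor(fluid.CPUPlace())
out, = exe.run(fluid.default_main_program(), fetch_list=[x])
print(out)  # a 2x3 int64 array filled with 5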
......@@ -19,6 +19,7 @@ import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
from paddle.fluid.core import AnalysisConfig
"""Test for fusion of conv, elementwise_add and 2 act."""
......@@ -46,6 +47,9 @@ class ConvElementwiseAdd2ActFusePassTest(InferencePassTest):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(use_gpu)
self.assertTrue(
PassVersionChecker.IsCompatible(
'conv_elementwise_add2_act_fuse_pass'))
if __name__ == "__main__":
......
......@@ -19,6 +19,7 @@ import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
from paddle.fluid.core import AnalysisConfig
"""Test for fusion of conv, elementwise_add and act."""
......@@ -48,6 +49,9 @@ class ConvElementwiseAddActFusePassTest(InferencePassTest):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(use_gpu)
self.assertTrue(
PassVersionChecker.IsCompatible(
'conv_elementwise_add_act_fuse_pass'))
if __name__ == "__main__":
......
......@@ -19,6 +19,7 @@ import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
from paddle.fluid.core import AnalysisConfig
"""Test for fusion of conv and elementwise_add."""
......@@ -44,6 +45,8 @@ class ConvElementwiseAddFusePassTest(InferencePassTest):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(use_gpu)
self.assertTrue(
PassVersionChecker.IsCompatible('conv_elementwise_add_fuse_pass'))
if __name__ == "__main__":
......
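The PassVersionChecker.IsCompatible assertions added to these tests confirm that the installed Paddle build registers a compatible version of each fusion pass; the same check can be run standalone, for example:

from paddle.fluid.core import PassVersionChecker

print(PassVersionChecker.IsCompatible('conv_elementwise_add_fuse_pass'))  # True on a compatible build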
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
class MulLstmFusePassTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
dict_dim, emb_dim = 128, 64
hidden_dim = 512
data = fluid.data(
name='data', shape=[1], dtype='int64', lod_level=1)
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
x = fluid.layers.fc(input=emb, size=hidden_dim * 4, bias_attr=False)
forward, cell = fluid.layers.dynamic_lstm(
input=x, size=hidden_dim * 4)
batch = 16
lod_tensor = fluid.LoDTensor()
lod_tensor.set(np.random.randint(
0, dict_dim, size=[batch]).astype("int64"),
fluid.CPUPlace())
lod_tensor.set_lod([[0, batch]])
self.feeds = {"data": lod_tensor}
self.fetch_list = [forward, cell]
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
self.assertTrue(PassVersionChecker.IsCompatible('mul_lstm_fuse_pass'))
if __name__ == "__main__":
unittest.main()
(7 file diffs are collapsed and not shown.)