Commit 88031aa0 authored by zhhsplendid

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into refine_api2, test=develop

@@ -107,6 +107,9 @@ function(select_nvcc_arch_flags out_variable)
   elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
     set(cuda_arch_bin "50")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
+      add_definitions("-DSUPPORTS_CUDA_FP16")
+    endif()
     set(cuda_arch_bin "60 61")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
     if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
......
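A note on the new SUPPORTS_CUDA_FP16 definition: it is only added when building for Pascal (and, per the Volta branch, newer) architectures with CUDA >= 10.0, so FP16-specific code can be guarded at compile time. A minimal, self-contained sketch of how such a flag is typically consumed; the functions here are hypothetical stand-ins, not Paddle's FP16 kernels:

    #include <cstdio>

    // Hedged sketch: SUPPORTS_CUDA_FP16 is the macro the CMake change above
    // defines for CUDA >= 10.0 builds; both functions are made up for the demo.
    void RunFp32() { std::printf("fp32 path\n"); }
    #ifdef SUPPORTS_CUDA_FP16
    void RunFp16() { std::printf("fp16 path\n"); }
    #endif

    int main() {
    #ifdef SUPPORTS_CUDA_FP16
      RunFp16();  // chosen at compile time when the macro is defined
    #else
      RunFp32();
    #endif
    }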
@@ -527,6 +527,8 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
         VLOG(0) << "error: the number of ids is a negative number: " << num;
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occurred when parsing " << i
+                << " th slot with total slots number: " << all_slots_.size();
         return false;
       } else if (num == 0) {
         VLOG(0)
@@ -536,42 +538,66 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
             "characters.";
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occurred when parsing " << i
+                << " th slot with total slots number: " << all_slots_.size();
         return false;
       } else if (errno == ERANGE || num > INT_MAX) {
         VLOG(0) << "error: the number of ids is greater than INT_MAX";
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occurred when parsing " << i
+                << " th slot with total slots number: " << all_slots_.size();
         return false;
       }
       if (all_slots_type_[i] == "float") {
-        for (int i = 0; i < num; ++i) {
+        for (int j = 0; j < num; ++j) {
           strtof(endptr, &endptr);
           if (errno == ERANGE) {
             VLOG(0) << "error: the value is out of the range of "
                        "representable values for float";
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
+            VLOG(0) << "Error occurred when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             return false;
           }
-          if (i + 1 != num && endptr - str == len) {
+          if (j + 1 != num && endptr - str == len) {
             VLOG(0) << "error: there is something wrong with the number of ids.";
+            VLOG(0) << "Error occurred when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
         }
       } else if (all_slots_type_[i] == "uint64") {
-        for (int i = 0; i < num; ++i) {
+        for (int j = 0; j < num; ++j) {
           strtoull(endptr, &endptr, 10);
           if (errno == ERANGE) {
             VLOG(0) << "error: the value is out of the range of "
                        "representable values for uint64_t";
+            VLOG(0) << "Error occurred when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
-          if (i + 1 != num && endptr - str == len) {
+          if (j + 1 != num && endptr - str == len) {
             VLOG(0) << "error: there is something wrong with the number of ids.";
+            VLOG(0) << "Error occurred when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
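As a standalone illustration of the parsing idiom CheckFile relies on above: errno must be cleared before each strtof/strtoull call and tested for ERANGE afterwards, because the C library only ever sets it and never resets it. Values and names below are illustrative, not from Paddle:

    #include <cerrno>
    #include <cstdio>
    #include <cstdlib>

    int main() {
      const char* str = "3.5 1e40";  // the second value overflows float
      char* endptr = const_cast<char*>(str);
      for (int j = 0; j < 2; ++j) {
        errno = 0;  // must be cleared before the call
        float v = strtof(endptr, &endptr);
        if (errno == ERANGE) {
          std::printf("value %d is out of range for float\n", j);
          return 1;
        }
        std::printf("parsed %f\n", v);  // endptr has advanced past the token
      }
    }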
@@ -632,8 +658,13 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe(
             "The number of ids can not be zero, you need padding "
             "it in data generator; or if there is something wrong with "
             "the data, please check if the data contains unresolvable "
-            "characters.\nplease check this error line: %s",
-            str));
+            "characters.\nplease check this error line: %s.\nSpecifically, "
+            "something wrong happened (the length of this slot's feasign "
+            "is 0) when we parse the %d th slot, so maybe something is "
+            "wrong around this slot.\nWe detect the feasign number of "
+            "this slot is %d, which is illegal.",
+            str, i, num));
         if (idx != -1) {
           (*instance)[idx].Init(all_slots_type_[i]);
           if ((*instance)[idx].GetType()[0] == 'f') {  // float
@@ -683,8 +714,13 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
             "The number of ids can not be zero, you need padding "
             "it in data generator; or if there is something wrong with "
             "the data, please check if the data contains unresolvable "
-            "characters.\nplease check this error line: %s.",
-            str));
+            "characters.\nplease check this error line: %s.\nSpecifically, "
+            "something wrong happened (the length of this slot's feasign "
+            "is 0) when we parse the %d th slot, so maybe something is "
+            "wrong around this slot.\nWe detect the feasign number of "
+            "this slot is %d, which is illegal.",
+            str, i, num));
         if (idx != -1) {
           (*instance)[idx].Init(all_slots_type_[i]);
@@ -916,8 +952,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
             "The number of ids can not be zero, you need padding "
             "it in data generator; or if there is something wrong with "
             "the data, please check if the data contains unresolvable "
-            "characters.\nplease check this error line: %s.",
-            str));
+            "characters.\nplease check this error line: %s.\nSpecifically, "
+            "something wrong happened (the length of this slot's feasign "
+            "is 0) when we parse the %d th slot, so maybe something is "
+            "wrong around this slot.\nWe detect the feasign number of "
+            "this slot is %d, which is illegal.",
+            str, i, num));
         if (idx != -1) {
           if (all_slots_type_[i][0] == 'f') {  // float
             for (int j = 0; j < num; ++j) {
@@ -982,8 +1023,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) {
             "The number of ids can not be zero, you need padding "
             "it in data generator; or if there is something wrong with "
             "the data, please check if the data contains unresolvable "
-            "characters.\nplease check this error line: %s.",
-            str));
+            "characters.\nplease check this error line: %s.\nSpecifically, "
+            "something wrong happened (the length of this slot's feasign "
+            "is 0) when we parse the %d th slot, so maybe something is "
+            "wrong around this slot.\nWe detect the feasign number of "
+            "this slot is %d, which is illegal.",
+            str, i, num));
         if (idx != -1) {
           if (all_slots_type_[i][0] == 'f') {  // float
......
@@ -74,6 +74,7 @@ set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto
         eager_deletion_pass
         buffer_shared_inplace_op_pass
         buffer_shared_cross_op_memory_reuse_pass
+        inplace_addto_op_pass
         set_reader_device_info_utils
         add_reader_dependency_pass)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
......
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+
 #include <algorithm>
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -34,14 +36,24 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<platform::Place> &places,
                                      const platform::NCCLCommunicator *ctxs)
     : NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places)
     : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
 }
 #endif
@@ -60,13 +72,25 @@ void AllReduceOpHandle::AllReduceImpl(
     const std::vector<VarHandle *> &in_var_handles,
     const std::vector<VarHandle *> &out_var_handles) {
   size_t num_places = places_.size();
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), num_places,
-      "The NoDummyInputSize should be equal to the number of places.");
+  PADDLE_ENFORCE_EQ(in_var_handles.size(), num_places,
+                    platform::errors::InvalidArgument(
+                        "The NoDummyInputSize should be equal "
+                        "to the number of places, but got NoDummyInputSize is "
+                        "%d and the number of places is %d.",
+                        in_var_handles.size(), num_places));
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+      platform::errors::InvalidArgument(
+          "The NoDummyInputSize and NoDummyOutputSize should be "
+          "equal, but got NoDummyInputSize is %d and NoDummyOutputSize is %d.",
+          in_var_handles.size(), out_var_handles.size()));
-  PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places);
+  PADDLE_ENFORCE_EQ(
+      local_exec_scopes_.size(), num_places,
+      platform::errors::InvalidArgument(
+          "The number of local scopes should be equal "
+          "to the number of places, but got the number of local scopes is "
+          "%d and the number of places is %d.",
+          local_exec_scopes_.size(), num_places));

   std::vector<const void *> lod_tensor_data;
   std::vector<platform::Place> places;
@@ -78,23 +102,36 @@ void AllReduceOpHandle::AllReduceImpl(
   for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
     auto &local_scope = local_exec_scopes_[i];
     auto var = local_scope->FindVar(in_var_handles[i]->name());
-    PADDLE_ENFORCE_NOT_NULL(var, "%s is not found int scope.",
-                            in_var_handles[i]->name());
+    PADDLE_ENFORCE_NOT_NULL(var, platform::errors::NotFound(
+                                     "Variable %s is not found in local scope.",
+                                     in_var_handles[i]->name()));
     auto &lod_tensor = var->Get<LoDTensor>();

     if (i == 0) {
       numel = static_cast<int64_t>(lod_tensor.numel());
       // only enforce place0, we will enforce other place numel == place0 numel
-      PADDLE_ENFORCE_GT(
-          numel, 0, platform::errors::InvalidArgument(
-                        "The numel of tensos=[%s] must > 0. But now numel=[%d]",
-                        in_var_handles[i]->name(), numel));
+      PADDLE_ENFORCE_GT(
+          numel, 0,
+          platform::errors::PreconditionNotMet(
+              "The numel of tensor %s should be > 0, but got numel is %d.",
+              in_var_handles[i]->name(), numel));
       dtype = lod_tensor.type();
       is_gpu_place = platform::is_gpu_place(lod_tensor.place());
     }
-    PADDLE_ENFORCE_EQ(numel, static_cast<int64_t>(lod_tensor.numel()));
-    PADDLE_ENFORCE_EQ(dtype, lod_tensor.type());
-    PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()));
+    PADDLE_ENFORCE_EQ(
+        numel, static_cast<int64_t>(lod_tensor.numel()),
+        platform::errors::PreconditionNotMet(
+            "The size of tensors of the same variable in different local "
+            "scopes should be equal."));
+    PADDLE_ENFORCE_EQ(
+        dtype, lod_tensor.type(),
+        platform::errors::PreconditionNotMet(
+            "The dtype of tensors of the same variable in different local "
+            "scopes should be equal."));
+    PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of tensors of the same variable "
+                          "in different local scopes should be equal."));

     lod_tensor_data.emplace_back(lod_tensor.data<void>());
     places.emplace_back(lod_tensor.place());
@@ -102,8 +139,12 @@ void AllReduceOpHandle::AllReduceImpl(
     VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
              << ", out_name:" << out_var_handles[i]->name();

-    PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
-                      "The name of input and output should be equal.");
+    PADDLE_ENFORCE_EQ(
+        in_var_handles[i]->name(), out_var_handles[i]->name(),
+        platform::errors::InvalidArgument(
+            "The name of input and output of all_reduce op should be equal, "
+            "but got input is %s and output is %s.",
+            in_var_handles[i]->name(), out_var_handles[i]->name()));
   }

   std::vector<std::string> grad_var_names;
@@ -122,7 +163,9 @@ void AllReduceOpHandle::AllReduceFunc(
     const std::vector<std::string> &out_var_names) {
   if (is_gpu_place(places[0])) {
 #if defined(PADDLE_WITH_NCCL)
-    PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+    PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_,
+                            platform::errors::InvalidArgument(
+                                "The nccl context should not be NULL."));
     ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
     std::vector<std::function<void()>> all_reduce_calls;
     for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
@@ -134,7 +177,8 @@ void AllReduceOpHandle::AllReduceFunc(
     }
     NCCLAllReduceFunc(all_reduce_calls);
 #else
-    PADDLE_THROW("Not compiled with CUDA.");
+    PADDLE_THROW(
+        platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
   } else {  // Special handle CPU only Operator's gradient. Like CRF
     auto &trg = *local_exec_scopes_[0]
......
@@ -89,8 +89,19 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor(
       places_(std::move(places)),
       graphs_(std::move(graphs)) {
   VLOG(3) << "build AsyncSSAGraphExecutor";
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
-  PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
+  PADDLE_ENFORCE_EQ(
+      local_scopes_.size(), local_exec_scopes_.size(),
+      platform::errors::InvalidArgument(
+          "The number of local scopes and the number of local execution "
+          "scopes should be equal, but got number of local scopes is %d and "
+          "number of local execution scopes is %d.",
+          local_scopes_.size(), local_exec_scopes_.size()));

   // set the correct size of thread pool to each device.
   strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
......
@@ -19,6 +19,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "boost/optional.hpp"
 #include "paddle/fluid/framework/ir/pass_builder.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -119,6 +120,9 @@ struct BuildStrategy {
   // Turn on inplace by default.
   bool enable_inplace_{true};

+  // Turn off inplace addto by default.
+  bool enable_addto_{false};
+
   // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
   // num_trainers is 1, so the current fields of build_strategy doesn't tell if
   // it's distributed model.
......
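For context, the new enable_addto_ flag controls the "inplace addto" optimization that the rest of this commit wires up: a gradient-accumulation write reuses its output buffer, so instead of materializing a temporary gradient and then summing, the producer accumulates directly into the reused buffer. A rough plain-C++ sketch of the idea (not Paddle's implementation):

    #include <cstdio>
    #include <vector>

    // Hedged sketch: accumulating into the reused output buffer saves one
    // allocation and one extra pass over memory compared to "tmp then add".
    void AccumulateInto(std::vector<float>* out, const std::vector<float>& grad) {
      for (size_t i = 0; i < out->size(); ++i) (*out)[i] += grad[i];
    }

    int main() {
      std::vector<float> grad_sum(3, 1.0f);  // buffer reused across steps
      AccumulateInto(&grad_sum, {0.5f, 0.5f, 0.5f});
      std::printf("%f\n", grad_sum[0]);  // prints 1.5
    }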
@@ -12,12 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
+
 #include <deque>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/fetch_async_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -48,7 +50,9 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
       bootstrap_ops_.emplace_back(op);
     }
   }
-  PADDLE_ENFORCE_GT(op_deps_.size(), 0, "The graph doesn't have operators.");
+  PADDLE_ENFORCE_GT(op_deps_.size(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "The graph doesn't have operators."));
   PrepareAtomicOpDeps();
 }
......
@@ -13,9 +13,11 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
+
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/platform/profiler.h"

 namespace paddle {
@@ -138,8 +140,10 @@ void FetchOpHandle::RunImpl() {
     auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
     auto &scope = scopes.at(var_handle->scope_idx());
     auto *var = scope->FindVar(var_handle->name());
-    PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
-                            var_handle->name());
+    PADDLE_ENFORCE_NOT_NULL(
+        var,
+        platform::errors::NotFound(
+            "Cannot find variable %s in execution scope.", var_handle->name()));

     if (var->IsType<LoDTensor>()) {
       auto &t = var->Get<framework::LoDTensor>();
......
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/op_handle_base.h"
+
 #include <map>
 #include <unordered_set>
@@ -88,6 +89,12 @@ void OpHandleBase::Run(bool use_cuda) {
   PADDLE_ENFORCE(!use_cuda);
 #endif

+  // skip running current op, used with inplace_addto_op_pass
+  if (skip_running_) {
+    VLOG(4) << "skip running: " << Name();
+    return;
+  }
+
   RunImpl();
 }
......
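To make the skip_running_ contract concrete, here is a self-contained sketch with a stand-in class; in the real code the flag would be set from outside (for example by inplace_addto_op_pass, per the comment above) once the op's work has been folded away:

    #include <cstdio>

    // FakeOpHandle is a stand-in for OpHandleBase, only for this demo.
    class FakeOpHandle {
     public:
      void SetSkipRunning(bool skip) { skip_running_ = skip; }
      void Run() {
        if (skip_running_) {  // mirrors the early return added in this commit
          std::printf("skip running\n");
          return;
        }
        std::printf("run op\n");
      }

     private:
      bool skip_running_ = false;
    };

    int main() {
      FakeOpHandle op;
      op.Run();                 // prints "run op"
      op.SetSkipRunning(true);  // e.g. set by a memory-reuse pass
      op.Run();                 // prints "skip running"
    }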
@@ -18,6 +18,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/var_handle.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -52,6 +53,10 @@ class OpHandleBase {
   virtual Priority GetPriority() const { return kNormal; }

+  virtual bool GetSkipRunning() const { return skip_running_; }
+
+  virtual void SetSkipRunning(bool skip_running) { skip_running_ = skip_running; }
+
   virtual std::string Name() const = 0;

   void Run(bool use_cuda);
@@ -131,6 +136,7 @@ class OpHandleBase {
   std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;
   std::vector<Scope *> local_exec_scopes_;
+  bool skip_running_ = false;

 #ifdef PADDLE_WITH_CUDA
   std::unordered_map<int, cudaEvent_t> events_;
......
@@ -13,9 +13,11 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
+
 #include <algorithm>
 #include <memory>
 #include <utility>
+
 #include "paddle/fluid/framework/ir/graph_helper.h"

 namespace paddle {
@@ -104,7 +106,12 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
       places_(places),
       graphs_(std::move(graphs)),
       feed_status_(places.size(), FeedStatus::kNone) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
   PADDLE_ENFORCE_EQ(places_.size(), graphs_.size(),
                     platform::errors::InvalidArgument(
......
@@ -13,10 +13,12 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+
 #include <stdexcept>
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/variable_helper.h"
@@ -37,7 +39,13 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
       var_infos_(std::move(var_infos)),
       places_(std::move(places)),
       scope_monitor_(places_, local_exec_scopes_) {
-  PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
+  PADDLE_ENFORCE_EQ(
+      local_scopes_.size(), local_exec_scopes_.size(),
+      platform::errors::InvalidArgument(
+          "The number of local scopes and the number of local execution "
+          "scopes should be equal, but got number of local scopes is %d and "
+          "number of local execution scopes is %d.",
+          local_scopes_.size(), local_exec_scopes_.size()));
   PrepareLocalExeScopes();
 }
......
@@ -13,9 +13,11 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
+
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -29,7 +31,8 @@ static inline const Tensor &GetTensorFromVar(const Variable *var) {
   if (var->IsType<LoDTensor>()) {
     return var->Get<LoDTensor>();
   } else {
-    PADDLE_THROW("Variable must be type of LoDTensor");
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Variable must be type of LoDTensor."));
   }
 }
@@ -37,20 +40,27 @@ static inline Tensor *GetMutableTensorFromVar(Variable *var) {
   if (var->IsType<LoDTensor>()) {
     return var->GetMutable<LoDTensor>();
   } else {
-    PADDLE_THROW("Variable must be type of LoDTensor");
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Variable must be type of LoDTensor."));
   }
 }

 ShareTensorBufferFunctor::ShareTensorBufferFunctor(
     Scope *scope, size_t scope_idx, const std::string &op_type,
     const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-    const std::vector<std::string> &out_var_names)
+    const std::vector<std::string> &out_var_names, bool share_dims)
     : scope_(scope),
       scope_idx_(scope_idx),
       op_type_(op_type),
       in_var_infos_(in_var_infos),
-      out_var_names_(out_var_names) {
-  PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size());
+      out_var_names_(out_var_names),
+      share_dims_(share_dims) {
+  PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size(),
+                    platform::errors::PreconditionNotMet(
+                        "The number of input variables and output variables "
+                        "should be equal, but got number of input variables is "
+                        "%d and number of output variables is %d.",
+                        in_var_infos_.size(), out_var_names_.size()));
   for (size_t i = 0; i < in_var_infos_.size(); ++i) {
     AddReuseVarPair(in_var_infos_[i], out_var_names_[i]);
   }
@@ -67,32 +77,59 @@ ShareTensorBufferFunctor::ReusedVars() const {
 void ShareTensorBufferFunctor::AddReuseVarPair(
     const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) {
-  PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr");
+  PADDLE_ENFORCE_NOT_NULL(
+      in_var_info,
+      platform::errors::InvalidArgument(
+          "The input variable to be inplaced should not be NULL."));
   PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name,
-                    "in/out cannot have same name: %s", out_var_name);
+                    platform::errors::InvalidArgument(
+                        "The input variable and output variable to be inplaced "
+                        "cannot have the same name: %s.",
+                        out_var_name));
   in_var_infos_.emplace_back(in_var_info);
   out_var_names_.emplace_back(out_var_name);
 }
 void ShareTensorBufferFunctor::CallOnce() {
-  PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here");
+  PADDLE_ENFORCE(in_out_vars_.empty(),
+                 platform::errors::InvalidArgument(
+                     "The input-output variable pairs to be "
+                     "inplaced should be initialized here."));
   for (size_t i = 0; i < in_var_infos_.size(); ++i) {
     auto *in_var = exec_scope_->FindVar(in_var_infos_[i]->Name());
     auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
-    PADDLE_ENFORCE_NOT_NULL(out_var);
-    PADDLE_ENFORCE_NE(in_var, out_var);
+    PADDLE_ENFORCE_NOT_NULL(
+        in_var,
+        platform::errors::NotFound(
+            "The input variable (%s) to be inplaced should not be NULL.",
+            in_var_infos_[i]->Name()));
+    PADDLE_ENFORCE_NOT_NULL(
+        out_var,
+        platform::errors::NotFound(
+            "The output variable (%s) to be inplaced should not be NULL.",
+            out_var_names_[i]));
+    PADDLE_ENFORCE_NE(
+        in_var, out_var,
+        platform::errors::PreconditionNotMet(
+            "The input variable and output variable to be inplaced "
+            "cannot be the same variable (%s).",
+            out_var_names_[i]));
     in_out_vars_.emplace_back(in_var, out_var);
   }
 }
 void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
   if (!exec_scope_) {
-    PADDLE_ENFORCE_NOT_NULL(exec_scope);
+    PADDLE_ENFORCE_NOT_NULL(exec_scope,
+                            platform::errors::InvalidArgument(
+                                "The given execution scope should not be NULL "
+                                "if the cached scope is NULL."));
     exec_scope_ = exec_scope;
     CallOnce();
   } else {
-    PADDLE_ENFORCE(exec_scope_ == exec_scope, "Scope must be the same");
+    PADDLE_ENFORCE_EQ(exec_scope_, exec_scope,
+                      platform::errors::InvalidArgument(
+                          "The given execution scope and the cached execution "
+                          "scope should be the same."));
   }

   for (size_t i = 0; i < in_var_infos_.size(); ++i) {
@@ -115,6 +152,13 @@ void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
     } else {
       out_tensor->ShareBufferWith(in_tensor);

+      // NOTE(zhiqiu): In the case of inplace addto, if the operator of
+      // the in_out_vars is skipped during running, we should set the dims of
+      // output as the same as input.
+      if (share_dims_) {
+        out_tensor->Resize(in_tensor.dims());
+      }
+
       VLOG(2) << "Share tensor buffer when running " << op_type_ << " : "
               << in_var_info->Name() << " -> " << out_var_names_[i];
     }
......
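The Resize call above matters because sharing a buffer copies no metadata: if the consuming operator is later skipped, nothing else would ever set the output's shape. A self-contained sketch of the same two-step share, with a toy tensor standing in for LoDTensor:

    #include <cassert>
    #include <memory>
    #include <vector>

    // Toy stand-in for LoDTensor: buffer and dims are independent state, so
    // aliasing the buffer alone leaves the shape unset.
    struct ToyTensor {
      std::shared_ptr<std::vector<float>> buf;
      std::vector<int> dims;
      void ShareBufferWith(const ToyTensor& other) { buf = other.buf; }
      void Resize(const std::vector<int>& d) { dims = d; }
    };

    int main() {
      ToyTensor in{std::make_shared<std::vector<float>>(6, 1.f), {2, 3}};
      ToyTensor out;
      out.ShareBufferWith(in);  // aliases the storage only
      out.Resize(in.dims);      // without this, out.dims stays empty
      assert(out.buf == in.buf && out.dims == in.dims);
    }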
@@ -19,6 +19,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/scope.h"
@@ -40,11 +41,13 @@ class ShareTensorBufferFunctor {
   ShareTensorBufferFunctor(
       Scope *scope, size_t scope_idx, const std::string &op_type,
       const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-      const std::vector<std::string> &out_var_names);
+      const std::vector<std::string> &out_var_names, bool share_dims = false);

   void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
                        const std::string &out_var_name);

+  void SetShareDims(bool share_dims) { share_dims_ = share_dims; }
+
   void operator()(Scope *exec_scope);

   std::unordered_map<std::string, std::string> ReusedVars() const;
@@ -66,6 +69,11 @@ class ShareTensorBufferFunctor {
   std::vector<std::string> out_var_names_;
   std::vector<std::pair<const Variable *, Variable *>> in_out_vars_;

+  // NOTE(zhiqiu): In the case of inplace addto, if the operator of
+  // the in_out_vars is skipped during running, we should set the dims of
+  // output as the same as input.
+  bool share_dims_{false};
 };

 }  // namespace details
......
@@ -13,8 +13,10 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
+
 #include <string>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -32,26 +34,35 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle(
     for (ir::Node *pending_op : out_var->outputs) {
       auto &op = pending_op->Wrapper<OpHandleBase>();
       auto *compute_op = dynamic_cast<ComputationOpHandle *>(&op);
-      PADDLE_ENFORCE_NOT_NULL(compute_op);
+      PADDLE_ENFORCE_NOT_NULL(
+          compute_op,
+          platform::errors::PreconditionNotMet(
+              "The pending OpHandle should be ComputationOpHandle."));

       if (result_op == nullptr) {
         result_op = compute_op;
       } else {
-        PADDLE_ENFORCE_EQ(result_op, compute_op);
+        PADDLE_ENFORCE_EQ(
+            result_op, compute_op,
+            platform::errors::PreconditionNotMet(
+                "The pending OpHandle should be the unique one."));
       }
     }
   }

-  PADDLE_ENFORCE_NOT_NULL(result_op);
+  PADDLE_ENFORCE_NOT_NULL(result_op,
+                          platform::errors::PreconditionNotMet(
+                              "The pending OpHandle should not be NULL."));
   return result_op;
 }

 ShareTensorBufferOpHandle::ShareTensorBufferOpHandle(
     ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type,
     const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-    const std::vector<std::string> &out_var_names)
+    const std::vector<std::string> &out_var_names, bool share_dims)
     : OpHandleBase(node),
-      functor_(scope, scope_idx, op_type, in_var_infos, out_var_names) {}
+      functor_(scope, scope_idx, op_type, in_var_infos, out_var_names,
+               share_dims) {}

 std::unordered_map<std::string, std::string>
 ShareTensorBufferOpHandle::ReusedVars() const {
@@ -63,6 +74,10 @@ void ShareTensorBufferOpHandle::AddReuseVarPair(
   functor_.AddReuseVarPair(in_var_info, out_var_name);
 }

+void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
+  functor_.SetShareDims(share_dims);
+}
+
 void ShareTensorBufferOpHandle::InitCUDA() {
 #ifdef PADDLE_WITH_CUDA
   int dev_id =
......
@@ -17,6 +17,7 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
@@ -31,7 +32,7 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
       ir::Node *node, Scope *scope, size_t scope_idx,
       const std::string &op_type,
       const std::vector<const ir::MemOptVarInfo *> &in_vars_infos,
-      const std::vector<std::string> &out_var_names);
+      const std::vector<std::string> &out_var_names, bool share_dims = false);

   std::unordered_map<std::string, std::string> ReusedVars() const;
@@ -42,6 +43,8 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
   void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
                        const std::string &out_var_name);

+  void SetShareDims(bool share_dims);
+
   const ShareTensorBufferFunctor &Functor() const { return functor_; }

  protected:
......
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
+
 #include "paddle/fluid/framework/details/fetch_async_op_handle.h"

 namespace paddle {
@@ -27,8 +28,9 @@ void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops) {
     PADDLE_ENFORCE_EQ(dynamic_cast<FetchOpHandle*>(op) != nullptr ||
                           dynamic_cast<FetchAsyncOpHandle*>(op) != nullptr,
                       true,
-                      "The input ops of ClearFetchOp function should be "
-                      "FetchOpHandle or FetchAsyncOpHandle.");
+                      platform::errors::PreconditionNotMet(
+                          "The input ops of ClearFetchOp function should be "
+                          "FetchOpHandle or FetchAsyncOpHandle."));
     for (auto& out_var : op->Node()->outputs) {
       graph->RemoveNode(out_var);
     }
......
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -138,7 +139,10 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl(
         }
       }
     }
-    PADDLE_ENFORCE(ready_ops.empty());
+    PADDLE_ENFORCE_EQ(
+        ready_ops.empty(), true,
+        platform::errors::Fatal("After the execution of computation graph, "
+                                "there are unexecuted operators left."));
   }

   // Wait FetchOps.
@@ -165,9 +169,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
     FetchResultType *fetch_data, bool return_merged) {
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
   std::unordered_set<VarHandleBase *> local_ready_vars;
-  std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
-                                                   fetch_tensors.end());
-  for (auto &fetch_var_name : fetch_tensor_set) {
+
+  for (auto &fetch_var_name : fetch_tensors) {
     for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
@@ -231,7 +234,11 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
       ready_ops->insert(static_cast<OpHandleBase *>(op));
     }
   }
-  PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0);
+  PADDLE_ENFORCE_EQ(
+      local_ready_vars.size(), 0,
+      platform::errors::Fatal(
+          "The number of ready variables should be 0, but got %d.",
+          local_ready_vars.size()));
 }

 void ThreadedSSAGraphExecutor::InsertPendingOp(
@@ -277,7 +284,9 @@ void ThreadedSSAGraphExecutor::PrepareOpDeps() {
     }
   }
   op_deps_->num_ops_ = ready_ops.size() + pending_ops.size();
-  PADDLE_ENFORCE_GT(op_deps_->num_ops_, 0, "The graph doesn't have operators.");
+  PADDLE_ENFORCE_GT(
+      op_deps_->num_ops_, 0,
+      platform::errors::InvalidArgument("The graph doesn't have operators."));

   for (auto ready_var : ready_vars) {
     pending_vars.erase(ready_var);
......
@@ -14,6 +14,8 @@
 #pragma once

+#include <ThreadPool.h>  // ThreadPool in third party
+
 #include <deque>
 #include <functional>
 #include <list>
@@ -24,8 +26,6 @@
 #include <utility>
 #include <vector>

-#include <ThreadPool.h>  // ThreadPool in third party
-
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
......
@@ -54,8 +54,10 @@ struct VarHandleBase {
   void AddOutput(OpHandleBase* out, ir::Node* node) {
     if (pending_ops_.find(out) == pending_ops_.end()) {
-      PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr",
-                     this->Node()->Name());
+      PADDLE_ENFORCE_NOT_NULL(out,
+                              platform::errors::InvalidArgument(
+                                  "The output added to VarHandle %s is NULL.",
+                                  this->Node()->Name()));
       pending_ops_.insert(out);
       node_->outputs.push_back(node);
     }
@@ -120,7 +122,10 @@ struct VarHandle : public VarHandleBase {
   bool HasEvent() { return has_event_; }

   const cudaEvent_t& GetEvent() {
-    PADDLE_ENFORCE(HasEvent(), "The event is not set.");
+    PADDLE_ENFORCE_EQ(
+        HasEvent(), true,
+        platform::errors::PreconditionNotMet(
+            "The cuda event is not set, maybe InitCUDA() is not called."));
     return event_;
   }
......
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/variable_visitor.h"
+
 #include "paddle/fluid/framework/selected_rows.h"

 namespace paddle {
 namespace framework {
@@ -24,7 +25,9 @@ static void VisitVariable(Variable* var, Func* func) {
   } else if (var->IsType<SelectedRows>()) {
     (*func)(var->GetMutable<SelectedRows>());
   } else {
-    PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "VisitVariable is not supported for type %s.",
+        ToTypeName(var->Type())));
   }
 }

@@ -35,7 +38,8 @@ static void VisitVariable(const Variable& var, Func* func) {
   } else if (var.IsType<SelectedRows>()) {
     (*func)(var.Get<SelectedRows>());
   } else {
-    PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "VisitVariable is not supported for type %s.", ToTypeName(var.Type())));
   }
 }

@@ -50,7 +54,8 @@ struct TensorVisitor {
   template <typename T>
   void operator()() {
-    PADDLE_THROW("Not Support to get LoDTensor from %s", typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Getting tensor from type %s is not supported.", typeid(T).name()));
   }
 };

@@ -78,8 +83,8 @@ struct ShareDimsAndLoDVisitor {
   template <typename T>
   void operator()(const T&) {
-    PADDLE_ENFORCE("ShareDimsAndLoD is not supported by type %s",
-                   typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "ShareDimsAndLoD is not supported for type %s.", typeid(T).name()));
   }
 };
@@ -89,42 +94,54 @@ void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) {
 }

 struct EnforceShapeAndDTypeEQVisitor {
-  const Variable* trg_;
+  const Variable* dst_;

   void operator()(const LoDTensor& src) {
-    auto& tensor = trg_->Get<LoDTensor>();
+    auto& tensor = dst_->Get<LoDTensor>();
-    PADDLE_ENFORCE_EQ(
-        src.place().which(), tensor.place().which(),
-        "The Places of the two Variable must be all on CPU or all on GPU.");
+    PADDLE_ENFORCE_EQ(src.place().which(), tensor.place().which(),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of the two variables is not equal."));
     PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
-                      "The dtype of the two Variable is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The dtype of the two variables is not equal."));
-    PADDLE_ENFORCE_EQ(src.dims(), tensor.dims(),
-                      "The dims of the two Variable is not equal.");
+    PADDLE_ENFORCE_EQ(
+        src.dims(), tensor.dims(),
+        platform::errors::PreconditionNotMet(
+            "The dims of the two variables' tensors is not equal."));
     PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(),
-                      "The lod of the two Variable is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The lod of the two variables is not equal."));
-    PADDLE_ENFORCE_EQ(src.layout(), tensor.layout(),
-                      "The layout of the two Variable's tensor is not equal.");
+    PADDLE_ENFORCE_EQ(
+        src.layout(), tensor.layout(),
+        platform::errors::PreconditionNotMet(
+            "The layout of the two variables' tensors is not equal."));
   }

   void operator()(const SelectedRows& src) {
-    auto& selected_rows = trg_->Get<SelectedRows>();
+    auto& selected_rows = dst_->Get<SelectedRows>();
-    PADDLE_ENFORCE_EQ(
-        src.place().which(), selected_rows.place().which(),
-        "The Places of the two Variable must be all on CPU or all on GPU.");
+    PADDLE_ENFORCE_EQ(src.place().which(), selected_rows.place().which(),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of the two variables is not equal."));
     PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
-                      "The dtype of the two Variable is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The dtype of the two variables is not equal."));
     PADDLE_ENFORCE_EQ(src.value().layout(), selected_rows.value().layout(),
-                      "The layout of the two Variable's tensor is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The layout of the two variables' tensors is not equal."));
     PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(),
-                      "The height of the two Variable is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The height of the two variables is not equal."));
     PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(),
-                      "The dims of the two Variable is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The dims of the two variables is not equal."));
   }

   template <typename T>
   void operator()(const T&) {
-    PADDLE_ENFORCE("EnforceShapeAndDTypeEQ is not supported by type %s",
-                   typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "EnforceShapeAndDTypeEQ is not supported for type %s.",
+        typeid(T).name()));
   }
 };
......
@@ -441,6 +441,7 @@ class SectionWorker : public DeviceWorker {
     skip_vars_ = skip_vars;
   }
   static void ResetBatchId() { batch_id_ = 0; }
+  static void ResetThreadCompletedFlag() { threads_completed = false; }

   static std::atomic<int> cpu_id_;
......
@@ -19,6 +19,8 @@ limitations under the License. */
 namespace gloo {
 namespace rendezvous {

+constexpr int kNodeSize = 136;
+
 HdfsStore::HdfsStore(const std::string& path) {
   path_ = path;
   wait_sleep_ms_ = 10000;
...@@ -213,12 +215,14 @@ void ParallelConnectContext::connectFullMesh( ...@@ -213,12 +215,14 @@ void ParallelConnectContext::connectFullMesh(
storeKey << rank; storeKey << rank;
store.set(storeKey.str(), allBytes); store.set(storeKey.str(), allBytes);
auto total_add_size = kNodeSize * (size - 1);
std::vector<std::shared_ptr<std::thread>> connect_threads(thread_num_); std::vector<std::shared_ptr<std::thread>> connect_threads(thread_num_);
// Connect every pair // Connect every pair
for (uint32_t i = 0; i < connect_threads.size(); ++i) { for (uint32_t i = 0; i < connect_threads.size(); ++i) {
connect_threads[i].reset(new std::thread( connect_threads[i].reset(new std::thread(
[&store, &transportContext, this](size_t thread_idx, [&store, &transportContext, total_add_size, this](
size_t thread_num) -> void { size_t thread_idx, size_t thread_num) -> void {
for (int i = thread_idx; i < size; i += thread_num) { for (int i = thread_idx; i < size; i += thread_num) {
if (i == rank) { if (i == rank) {
continue; continue;
...@@ -226,8 +230,23 @@ void ParallelConnectContext::connectFullMesh( ...@@ -226,8 +230,23 @@ void ParallelConnectContext::connectFullMesh(
// Wait for address of other side of this pair to become available // Wait for address of other side of this pair to become available
std::string key = std::to_string(i); std::string key = std::to_string(i);
store.wait({key}, getTimeout()); store.wait({key}, getTimeout());
std::vector<char> allAddrs;
auto max_retry_times = 5;
// Connect to other side of this pair // Connect to other side of this pair
auto allAddrs = store.get(key);
while (max_retry_times > 0) {
allAddrs = store.get(key);
VLOG(3) << "store get all address size: " << allAddrs.size()
<< " except: " << total_add_size;
if (allAddrs.size() == static_cast<size_t>(total_add_size)) {
break;
}
--max_retry_times;
}
auto addr = extractAddress(allAddrs, i); auto addr = extractAddress(allAddrs, i);
transportContext->getPair(i)->connect(addr); transportContext->getPair(i)->connect(addr);
} }
......
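The retry loop above polls the rendezvous store until a rank's value reaches the expected total size (kNodeSize bytes per peer); if the retries run out, the possibly-short value is used as-is. A minimal sketch of that bounded-retry read, assuming a hypothetical Store type with the same get() semantics:

#include <cstddef>
#include <string>
#include <vector>

// Bounded retry around a key-value store read: other ranks may still be
// writing their addresses, so get() can return a partial value at first.
// Store is a hypothetical stand-in mirroring the loop in connectFullMesh.
template <typename Store>
std::vector<char> GetWithRetry(Store* store, const std::string& key,
                               size_t expected_size, int max_retry_times = 5) {
  std::vector<char> value;
  while (max_retry_times-- > 0) {
    value = store->get(key);
    if (value.size() == expected_size) break;  // all peers have written
  }
  return value;  // may still be short if retries were exhausted
}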
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -225,3 +226,14 @@ REGISTER_PASS(conv_affine_channel_fuse_pass, ...@@ -225,3 +226,14 @@ REGISTER_PASS(conv_affine_channel_fuse_pass,
paddle::framework::ir::ConvAffineChannelFusePass); paddle::framework::ir::ConvAffineChannelFusePass);
REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("affine_channel", 0));
REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0)
.EQ("affine_channel", 0));
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -372,3 +373,14 @@ REGISTER_PASS(depthwise_conv_bn_fuse_pass, ...@@ -372,3 +373,14 @@ REGISTER_PASS(depthwise_conv_bn_fuse_pass,
paddle::framework::ir::DepthwiseConvBNFusePass); paddle::framework::ir::DepthwiseConvBNFusePass);
REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass, REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass,
paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass); paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass);
REGISTER_PASS_CAPABILITY(conv_bn_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("batch_norm", 0));
REGISTER_PASS_CAPABILITY(conv_eltwiseadd_bn_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0)
.EQ("batch_norm", 0));
...@@ -11,9 +11,9 @@ ...@@ -11,9 +11,9 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" #include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
#include <string> #include <string>
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -116,3 +116,10 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -116,3 +116,10 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
paddle::framework::ir::ConvElementwiseAdd2ActFusePass); paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
REGISTER_PASS_CAPABILITY(conv_elementwise_add2_act_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0)
.EQ("relu", 0)
.EQ("identity", 0));
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h" #include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h"
#include <string> #include <string>
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -102,3 +103,10 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -102,3 +103,10 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(conv_elementwise_add_act_fuse_pass, REGISTER_PASS(conv_elementwise_add_act_fuse_pass,
paddle::framework::ir::ConvElementwiseAddActFusePass); paddle::framework::ir::ConvElementwiseAddActFusePass);
REGISTER_PASS_CAPABILITY(conv_elementwise_add_act_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0)
.EQ("relu", 0)
.EQ("identity", 0));
...@@ -12,10 +12,10 @@ ...@@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <string>
#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h" #include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -89,3 +89,8 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -89,3 +89,8 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(conv_elementwise_add_fuse_pass, REGISTER_PASS(conv_elementwise_add_fuse_pass,
paddle::framework::ir::ConvElementwiseAddFusePass); paddle::framework::ir::ConvElementwiseAddFusePass);
REGISTER_PASS_CAPABILITY(conv_elementwise_add_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0));
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <vector> #include <vector>
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -334,3 +335,8 @@ void EmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const { ...@@ -334,3 +335,8 @@ void EmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const {
REGISTER_PASS(embedding_eltwise_layernorm_fuse_pass, REGISTER_PASS(embedding_eltwise_layernorm_fuse_pass,
paddle::framework::ir::EmbeddingEltwiseLayerNormFusePass); paddle::framework::ir::EmbeddingEltwiseLayerNormFusePass);
REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("lookup_table", 0)
.EQ("elementweise_add", 0));
...@@ -16,12 +16,13 @@ limitations under the License. */ ...@@ -16,12 +16,13 @@ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
TEST(SkipLayerNormFusePass, basic) { TEST(EmbeddingElewiseLayernormFusePass, basic) {
// inputs operator output // inputs operator output
// -------------------------------------------------------------------- // --------------------------------------------------------------------
// (x, y) elementwise_add -> elementwise_out // (x, y) elementwise_add -> elementwise_out
...@@ -91,6 +92,12 @@ TEST(SkipLayerNormFusePass, basic) { ...@@ -91,6 +92,12 @@ TEST(SkipLayerNormFusePass, basic) {
"The number of fusion nodes does not meet expectations after fuse")); "The number of fusion nodes does not meet expectations after fuse"));
} }
TEST(EmbeddingElewiseLayernormFusePass, pass_op_version_check) {
ASSERT_TRUE(
paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
.IsPassCompatible("embedding_eltwise_layernorm_fuse_pass"));
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
......
...@@ -23,6 +23,8 @@ ...@@ -23,6 +23,8 @@
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
...@@ -34,7 +36,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, ...@@ -34,7 +36,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
// Build pattern // Build pattern
PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x")) PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
->assert_is_op_input("lookup_table") ->assert_is_op_input("lookup_table_v2")
->assert_var_not_persistable(); ->assert_var_not_persistable();
patterns::Embedding embedding_pattern(pattern, name_scope); patterns::Embedding embedding_pattern(pattern, name_scope);
// TODO(jczaja): Intermediate can only be for val that are not used anywhere // TODO(jczaja): Intermediate can only be for val that are not used anywhere
...@@ -256,3 +258,11 @@ void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -256,3 +258,11 @@ void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(embedding_fc_lstm_fuse_pass, REGISTER_PASS(embedding_fc_lstm_fuse_pass,
paddle::framework::ir::EmbeddingFCLSTMFusePass); paddle::framework::ir::EmbeddingFCLSTMFusePass);
REGISTER_PASS_CAPABILITY(embedding_fc_lstm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("lookup_table_v2", 0)
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("lstm", 0)
.EQ("fused_embedding_fc_lstm", 0));
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
...@@ -182,3 +183,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { ...@@ -182,3 +183,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass) REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass)
.RequirePassAttr("use_gpu"); .RequirePassAttr("use_gpu");
REGISTER_PASS_CAPABILITY(fc_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("relu", 0)
.EQ("fc", 0));
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -125,7 +126,6 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, ...@@ -125,7 +126,6 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
auto* x_n = subgraph.at(x); auto* x_n = subgraph.at(x);
GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern); GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern); GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern); GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern);
...@@ -136,10 +136,17 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, ...@@ -136,10 +136,17 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
gru_pattern); gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchHidden, gru_pattern); GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchHidden, gru_pattern);
// TODO(wilber): Support origin_mode=True.
if (gru->Op()->GetAttrIfExists<bool>("origin_mode") == true) {
LOG(INFO) << "fc_gru_fuse_pass not supported when origin_mode=True.";
return;
}
if (with_fc_bias) { if (with_fc_bias) {
GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias); gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias);
// Remove unneeded nodes. // Remove unneeded nodes.
...@@ -188,3 +195,16 @@ void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -188,3 +195,16 @@ void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass); REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass); REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);
REGISTER_PASS_CAPABILITY(mul_gru_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("gru", 0)
.EQ("fusion_gru", 0));
REGISTER_PASS_CAPABILITY(fc_gru_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("gru", 0)
.EQ("fusion_gru", 0));
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -196,3 +197,17 @@ void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -196,3 +197,17 @@ void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass); REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass);
REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass); REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
REGISTER_PASS_CAPABILITY(fc_lstm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("lstm", 0)
.EQ("fusion_lstm", 0));
REGISTER_PASS_CAPABILITY(mul_lstm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("lstm", 0)
.EQ("fusion_lstm", 0));
...@@ -13,4 +13,6 @@ cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handl ...@@ -13,4 +13,6 @@ cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handl
cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass) cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass)
cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass) cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass)
cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass)
cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op) cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op)
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
...@@ -141,11 +142,12 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const { ...@@ -141,11 +142,12 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const {
VLOG(4) << "Inplace performed in op " << op_type << ": " VLOG(4) << "Inplace performed in op " << op_type << ": "
<< in_var_handle_ptr->Name() << " -> " << in_var_handle_ptr->Name() << " -> "
<< out_var_handle_ptr->Name() << out_var_handle_ptr->Name()
<< ". Debug String is: " << op->GetOp()->DebugString(); << ". Debug String is: " << op->GetOp()->DebugString()
<< ". ReuseType: " << ReuseType();
} else { } else {
VLOG(3) << "Inplace failed in op " << op_type << ": " VLOG(3) << "Inplace failed in op " << op_type << ": "
<< in_var_handle_ptr->Name() << " -> " << in_var_handle_ptr->Name() << " -> "
<< out_var_handle_ptr->Name(); << out_var_handle_ptr->Name() << ". ReuseType: " << ReuseType();
} }
} }
} }
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class InplaceAddToOpPass : public MemoryReusePass {
protected:
std::string ReuseType() const override { return "inplace_addto"; }
void Run(Graph *graph) const override;
private:
// 1. Add any last living op of out_var to the last living ops of in_var
// 2. Set reference count of in_var to be 2
void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const override {
size_t scope_idx = op->GetScopeIdx();
auto *last_live_ops_of_vars_ =
&Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
auto *var_infos_ = &(Get<MemOptVarInfoMapList>(kMemOptVarInfoMapList));
auto out_var_op_iter =
(*last_live_ops_of_vars_)[scope_idx].find(out_var->Name());
// In Reduce mode, some output variables (gradients of parameters) do not
// have last live ops
details::ComputationOpHandle *last_live_op_of_in_var = nullptr;
if (out_var_op_iter == (*last_live_ops_of_vars_)[scope_idx].end()) {
last_live_op_of_in_var = op;
} else {
PADDLE_ENFORCE_EQ(
out_var_op_iter->second.ops().empty(), false,
platform::errors::InvalidArgument(
"Var(%s)'s last live op should not empty.", out_var->Name()));
last_live_op_of_in_var = *(out_var_op_iter->second.ops().begin());
}
auto *last_live_ops_of_in_var =
(*last_live_ops_of_vars_)[scope_idx][in_var->Name()].mutable_ops();
// last_live_ops_of_in_var->clear();
last_live_ops_of_in_var->insert(last_live_op_of_in_var);
auto in_var_info_iter = (*var_infos_)[scope_idx].find(in_var->Name());
PADDLE_ENFORCE_NE(
in_var_info_iter, (*var_infos_)[scope_idx].end(),
platform::errors::NotFound("Cannot find variable %s.", in_var->Name()));
in_var_info_iter->second->SetRefCnt(2); // before inplace, it is 1
}
};
void InplaceAddToOpPass::Run(Graph *graph) const {
const auto &last_live_ops =
Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
bool use_cuda = Get<bool>(kUseCuda);
// Currently, only perform InplaceAddToOpPass on cuda place
if (!use_cuda) {
return;
}
// Step 1: Build a reverse map of last_live_ops
// i.e.: op -> vars
std::unordered_map<details::ComputationOpHandle *,
std::unordered_map<std::string, ir::Node *>>
candidate_ops;
for (auto &each_scope_ops : last_live_ops) {
for (auto &pair : each_scope_ops) {
// If a variable has more than one last lived op, this variable cannot
// be inplaced.
if (pair.second.ops().size() != 1) {
continue;
}
auto *op = *(pair.second.ops().begin());
const std::string &op_type = op->GetOp()->Type();
const framework::OpDesc *op_desc = op->Node()->Op();
PADDLE_ENFORCE_NOT_NULL(
op_desc, platform::errors::NotFound("Cannot find the OpDesc of Op(%s).",
op->Name()));
// Only the grad_add op should be processed.
if (op_type != "grad_add") {
continue;
}
const std::string &var_name = pair.first;
auto in_nodes = this->FindNodesByName(var_name, op->Node()->inputs);
if (in_nodes.size() == 1) {
candidate_ops[op][var_name] = *in_nodes.begin();
}
VLOG(4) << "Find op " << op_type << " with input(" << var_name
<< ") that can do inplace add to";
}
}
// Step 2: Check which vars can be inplaced indeed
for (auto &op_vars_pair : candidate_ops) {
auto *op = op_vars_pair.first;
// The original gradient accumulation is g = sum(g_0, g_1,..., g_n), and it
// could be changed as follows if inplace addto is enabled:
// g_sum_0 = g_0
// g_sum_1 = grad_add(g_sum_0, g_1)
// g_sum_2 = grad_add(g_sum_1, g_2)
// ...
// g_sum_n = grad_add(g_sum_n-1, g_n)
// here we will add inplace for each grad_add, for example, for the first
// grad_add, g_sum_0 -> g1, g_sum_1 -> g1, and set grad_add as skipped.
const std::string &op_type = op->GetOp()->Type();
PADDLE_ENFORCE_EQ(op->Node()->inputs.size(), 2,
platform::errors::InvalidArgument(
"The size of inputs of %s should be 2, but got %d",
op_type, op->Node()->inputs.size()));
PADDLE_ENFORCE_EQ(op->Node()->outputs.size(), 1,
platform::errors::InvalidArgument(
"The size of outputs of %s should be 1, but got %d",
op_type, op->Node()->outputs.size()));
auto *left_var_ptr = dynamic_cast<details::VarHandle *>(
&(op->Node()->inputs[0]->Wrapper<details::VarHandleBase>()));
auto *right_var_ptr = dynamic_cast<details::VarHandle *>(
&(op->Node()->inputs[1]->Wrapper<details::VarHandleBase>()));
auto *out_var_ptr = dynamic_cast<details::VarHandle *>(
&(op->Node()->outputs[0]->Wrapper<details::VarHandleBase>()));
if (left_var_ptr == nullptr || right_var_ptr == nullptr ||
out_var_ptr == nullptr) {
continue;
}
// auto *left_generated_op = dynamic_cast<details::ComputationOpHandle *>(
// left_var_ptr->GeneratedOp());
auto *right_generated_op = dynamic_cast<details::ComputationOpHandle *>(
right_var_ptr->GeneratedOp());
auto *out_generated_op = dynamic_cast<details::ComputationOpHandle *>(
out_var_ptr->GeneratedOp());
// NOTE(zhiqiu): currently, only conv2d_grad supports addto strategy
if (right_generated_op->Name() != "conv2d_grad") {
continue;
}
// NOTE(zhiqiu): Normally, if we inplace a->b, we should let a generated
// before b. However, in the situation of inplace addto, we do not care about
// the order, since a+b is equal to b+a. Is there any exception for that?
// AddDependencyVar(right_generated_op, left_generated_op);
// no need, as discussed above.
// step (a): inplace right_var->left_var of grad_add
this->AddReuseVar(right_generated_op, left_var_ptr, right_var_ptr);
UpdateLastLiveOpOfVar(right_generated_op, left_var_ptr, right_var_ptr);
VLOG(4) << "Inplace performed in op " << right_generated_op->GetOp()->Type()
<< ": " << left_var_ptr->Name() << " -> " << right_var_ptr->Name()
<< ". Debug String is: "
<< right_generated_op->GetOp()->DebugString()
<< ". ReuseType: " << ReuseType();
// step (b): inplace out -> right_var of grad_add
this->AddReuseVar(out_generated_op, right_var_ptr, out_var_ptr, true);
VLOG(4) << "Inplace performed in op " << op_type << ": "
<< left_var_ptr->Name() << " -> " << out_var_ptr->Name()
<< ". Debug String is: " << op->GetOp()->DebugString()
<< ". ReuseType: " << ReuseType();
// step (c): ensure right_var cannot be inplaced afterwards; this is done
// automatically since CollectReusedVars is called before any reuse.
// step (d): make right_var's generated op use addto
right_generated_op->GetOp()->SetAttr("use_addto", true);
// step (e): make grad_add skip running
op->SetSkipRunning(true);
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(inplace_addto_op_pass, paddle::framework::ir::InplaceAddToOpPass)
.RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList)
.RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars)
.RequirePassAttr(paddle::framework::ir::kUseCuda);
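The comment in Run() describes rewriting g = sum(g_0, ..., g_n) into a chain of grad_add ops that are inplaced and skipped, with conv2d_grad accumulating straight into the running sum via use_addto. A runnable toy illustration of why the rewrite preserves the result (plain C++; conv2d_grad is stood in by a vector fill):

#include <cassert>
#include <vector>

// Stand-in for one conv2d_grad output of size n with value v.
static std::vector<float> FakeConvGrad(int n, float v) {
  return std::vector<float>(n, v);
}

int main() {
  const int n = 4;
  // Without addto: grad_add allocates a fresh buffer for the sum.
  std::vector<float> g0 = FakeConvGrad(n, 1.f);
  std::vector<float> g1 = FakeConvGrad(n, 2.f);
  std::vector<float> g_sum(n);
  for (int i = 0; i < n; ++i) g_sum[i] = g0[i] + g1[i];

  // With addto: the second gradient is accumulated into g0's buffer
  // (what use_addto arranges), and the grad_add op is skipped.
  std::vector<float>& g_sum_inplace = g0;  // buffer reuse, no new allocation
  std::vector<float> g1_again = FakeConvGrad(n, 2.f);
  for (int i = 0; i < n; ++i) g_sum_inplace[i] += g1_again[i];

  for (int i = 0; i < n; ++i) assert(g_sum[i] == g_sum_inplace[i]);
  return 0;
}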
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
#include <functional> #include <functional>
#include <map> #include <map>
#include <string> #include <string>
...@@ -73,6 +74,7 @@ bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var, ...@@ -73,6 +74,7 @@ bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var,
out_var->Name())); out_var->Name()));
if (IsVarPairReusable(*in_var, *out_var)) { if (IsVarPairReusable(*in_var, *out_var)) {
AddReuseVar(op, in_var, out_var); AddReuseVar(op, in_var, out_var);
UpdateLastLiveOpOfVar(op, in_var, out_var);
return true; return true;
} else { } else {
return false; return false;
...@@ -324,7 +326,8 @@ bool MemoryReusePass::IsVarPairReusable( ...@@ -324,7 +326,8 @@ bool MemoryReusePass::IsVarPairReusable(
void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
details::VarHandle *in_var, details::VarHandle *in_var,
details::VarHandle *out_var) const { details::VarHandle *out_var,
bool share_dims) const {
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
(*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0, (*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0,
platform::errors::NotFound("Var(%s) does not in mem opt var infos.", platform::errors::NotFound("Var(%s) does not in mem opt var infos.",
...@@ -344,13 +347,15 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, ...@@ -344,13 +347,15 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
share_buffer_op->AddInput(in_var); share_buffer_op->AddInput(in_var);
} }
if (share_dims) {
share_buffer_op->SetShareDims(true);
}
share_buffer_op->AddReuseVarPair( share_buffer_op->AddReuseVarPair(
(*var_infos_)[op->GetScopeIdx()].at(in_var->Name()).get(), (*var_infos_)[op->GetScopeIdx()].at(in_var->Name()).get(),
out_var->Name()); out_var->Name());
reused_in_var_names_[op->GetScopeIdx()].insert(in_var->Name()); reused_in_var_names_[op->GetScopeIdx()].insert(in_var->Name());
reused_out_var_names_[op->GetScopeIdx()].insert(out_var->Name()); reused_out_var_names_[op->GetScopeIdx()].insert(out_var->Name());
UpdateLastLiveOpOfVar(op, in_var, out_var);
} }
// 1. Set last living op of in_var to be any last living op of out_var // 1. Set last living op of in_var to be any last living op of out_var
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
...@@ -92,6 +93,12 @@ class MemoryReusePass : public Pass { ...@@ -92,6 +93,12 @@ class MemoryReusePass : public Pass {
int64_t GetMemorySize(const details::VarHandle &var) const; int64_t GetMemorySize(const details::VarHandle &var) const;
void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var,
details::VarHandle *out_var, bool share_dims = false) const;
virtual void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const;
private: private:
VarDesc *GetVarDesc(const details::VarHandle &var) const; VarDesc *GetVarDesc(const details::VarHandle &var) const;
...@@ -109,13 +116,6 @@ class MemoryReusePass : public Pass { ...@@ -109,13 +116,6 @@ class MemoryReusePass : public Pass {
void CollectReusedVars() const; void CollectReusedVars() const;
void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var,
details::VarHandle *out_var) const;
void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const;
private: private:
mutable Graph *graph_; mutable Graph *graph_;
mutable bool use_cuda_; mutable bool use_cuda_;
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
...@@ -84,6 +85,19 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -84,6 +85,19 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
VLOG(3) << "do not perform " + type() + "+bias fuse"; VLOG(3) << "do not perform " + type() + "+bias fuse";
return; return;
} }
if (conv->Op()->HasAttr("dilations")) {
auto dilations =
BOOST_GET_CONST(std::vector<int>, conv->Op()->GetAttr("dilations"));
for (const auto& d : dilations) {
if (d != 1) {
LOG(WARNING)
<< "dilation conv not supported in MKLDNN, fuse not apply "
<< "and set conv attribute use_mkldnn = false";
conv->Op()->SetAttr("use_mkldnn", false);
return;
}
}
}
auto* eltwise_bias_tensor = auto* eltwise_bias_tensor =
scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>(); scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();
...@@ -151,3 +165,8 @@ REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass, ...@@ -151,3 +165,8 @@ REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass,
paddle::framework::ir::Conv2DTransposeBiasFusePass); paddle::framework::ir::Conv2DTransposeBiasFusePass);
REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass, REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass,
paddle::framework::ir::Conv3DBiasFusePass); paddle::framework::ir::Conv3DBiasFusePass);
REGISTER_PASS_CAPABILITY(conv_bias_mkldnn_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0));
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/type_defs.h"
namespace paddle { namespace paddle {
...@@ -149,6 +150,12 @@ TEST(ConvBiasFusePass, conv2d_transpose) { ...@@ -149,6 +150,12 @@ TEST(ConvBiasFusePass, conv2d_transpose) {
ASSERT_EQ(pass.type(), std::string("conv2d_transpose")); ASSERT_EQ(pass.type(), std::string("conv2d_transpose"));
} }
TEST(ConvBiasFusePass, pass_op_version_check) {
ASSERT_TRUE(
paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
.IsPassCompatible("conv_bias_mkldnn_fuse_pass"));
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <memory> #include <memory>
#include <tuple> #include <tuple>
#include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -341,3 +342,8 @@ void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { ...@@ -341,3 +342,8 @@ void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass, REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass,
paddle::framework::ir::ResidualConnectionMKLDNNFusePass); paddle::framework::ir::ResidualConnectionMKLDNNFusePass);
REGISTER_PASS_CAPABILITY(conv_elementwise_add_mkldnn_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("conv2d", 0)
.EQ("elementwise_add", 0));
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -267,6 +268,12 @@ TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) { ...@@ -267,6 +268,12 @@ TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) {
AssertOpsCount(graph, 2, 1); AssertOpsCount(graph, 2, 1);
} }
TEST(ConvElementwiseAddMKLDNNFusePass, pass_op_version_check) {
ASSERT_TRUE(
paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
.IsPassCompatible("conv_elementwise_add_mkldnn_fuse_pass"));
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" #include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -57,3 +58,7 @@ void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const { ...@@ -57,3 +58,7 @@ void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(depthwise_conv_mkldnn_pass, REGISTER_PASS(depthwise_conv_mkldnn_pass,
paddle::framework::ir::DepthwiseConvMKLDNNPass); paddle::framework::ir::DepthwiseConvMKLDNNPass);
REGISTER_PASS_CAPABILITY(depthwise_conv_mkldnn_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().EQ(
"depthwise_conv2d", 0));
...@@ -16,6 +16,8 @@ ...@@ -16,6 +16,8 @@
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
...@@ -70,6 +72,12 @@ ProgramDesc BuildProgramDesc() { ...@@ -70,6 +72,12 @@ ProgramDesc BuildProgramDesc() {
return prog; return prog;
} }
TEST(DepthwiseConvMKLDNNPass, pass_op_version_check) {
ASSERT_TRUE(
paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
.IsPassCompatible("depthwise_conv_mkldnn_pass"));
}
TEST(DepthwiseConvMKLDNNPass, basic) { TEST(DepthwiseConvMKLDNNPass, basic) {
auto prog = BuildProgramDesc(); auto prog = BuildProgramDesc();
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <vector> #include <vector>
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/errors.h"
namespace paddle { namespace paddle {
...@@ -707,3 +708,13 @@ REGISTER_PASS(multihead_matmul_fuse_pass, ...@@ -707,3 +708,13 @@ REGISTER_PASS(multihead_matmul_fuse_pass,
REGISTER_PASS(multihead_matmul_fuse_pass_v2, REGISTER_PASS(multihead_matmul_fuse_pass_v2,
paddle::framework::ir::MultiHeadMatmulV2FusePass); paddle::framework::ir::MultiHeadMatmulV2FusePass);
REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("reshape2", 0)
.EQ("transpose2", 0)
.EQ("scale", 0)
.EQ("matmul", 0)
.EQ("softmax", 0));
...@@ -12,6 +12,7 @@ limitations under the License. */ ...@@ -12,6 +12,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h" // NOLINT #include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h" // NOLINT
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -133,6 +134,12 @@ TEST(MultiHeadMatmulFusePass, basic) { ...@@ -133,6 +134,12 @@ TEST(MultiHeadMatmulFusePass, basic) {
num_fused_nodes_after)); num_fused_nodes_after));
} }
TEST(MultiHeadMatmulFusePass, pass_op_version_check) {
ASSERT_TRUE(
paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
.IsPassCompatible("multihead_matmul_fuse_pass_v2"));
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
#define MAX_NUM_FC 10 #define MAX_NUM_FC 10
...@@ -174,6 +175,10 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern, ...@@ -174,6 +175,10 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern,
if (x->outputs.size() <= 0 || x->inputs.size() <= 0U) { if (x->outputs.size() <= 0 || x->inputs.size() <= 0U) {
return false; return false;
} }
if (x->IsVar() && x->Var() && x->Var()->GetShape().size() > 2) {
LOG(WARNING) << "repeated fc relu only supports input dims = 2";
return false;
}
int fc_idx = FindFCIdx(x); int fc_idx = FindFCIdx(x);
if (fc_idx < 0) { if (fc_idx < 0) {
return false; return false;
...@@ -384,3 +389,8 @@ void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -384,3 +389,8 @@ void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(repeated_fc_relu_fuse_pass, REGISTER_PASS(repeated_fc_relu_fuse_pass,
paddle::framework::ir::RepeatedFCReluFusePass); paddle::framework::ir::RepeatedFCReluFusePass);
REGISTER_PASS_CAPABILITY(repeated_fc_relu_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("fc", 0)
.EQ("relu", 0));
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -98,3 +99,9 @@ void SeqConvEltAddReluFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -98,3 +99,9 @@ void SeqConvEltAddReluFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(seqconv_eltadd_relu_fuse_pass, REGISTER_PASS(seqconv_eltadd_relu_fuse_pass,
paddle::framework::ir::SeqConvEltAddReluFusePass); paddle::framework::ir::SeqConvEltAddReluFusePass);
REGISTER_PASS_CAPABILITY(seqconv_eltadd_relu_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("sequence_conv", 0)
.EQ("elementwise_add", 0)
.EQ("relu", 0));
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h" #include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -34,6 +35,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const { ...@@ -34,6 +35,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "shufflechannel_pattern"; const std::string pattern_name = "shufflechannel_pattern";
FusePassBase::Init(pattern_name, graph); FusePassBase::Init(pattern_name, graph);
LOG(WARNING) << "There is fluid.layers.shuffle_channel API already, you can "
"use it instead of (reshape + transpose +reshape)";
GraphPatternDetector gpd; GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern() auto* x = gpd.mutable_pattern()
->NewNode("x") ->NewNode("x")
...@@ -93,3 +96,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const { ...@@ -93,3 +96,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(shuffle_channel_detect_pass, REGISTER_PASS(shuffle_channel_detect_pass,
paddle::framework::ir::ShuffleChannelDetectPass); paddle::framework::ir::ShuffleChannelDetectPass);
REGISTER_PASS_CAPABILITY(shuffle_channel_detect_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("reshape2", 0)
.EQ("transpose2", 0));
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -180,3 +181,8 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { ...@@ -180,3 +181,8 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
REGISTER_PASS(skip_layernorm_fuse_pass, REGISTER_PASS(skip_layernorm_fuse_pass,
paddle::framework::ir::SkipLayerNormFusePass); paddle::framework::ir::SkipLayerNormFusePass);
REGISTER_PASS_CAPABILITY(skip_layernorm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("elementwise_add", 0)
.EQ("layer_norm", 0));
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -54,6 +55,12 @@ TEST(SkipLayerNormFusePass, basic) { ...@@ -54,6 +55,12 @@ TEST(SkipLayerNormFusePass, basic) {
"The number of fusion nodes does not meet expectations after fuse")); "The number of fusion nodes does not meet expectations after fuse"));
} }
TEST(SkipLayerNormFusePass, pass_op_version_check) {
ASSERT_TRUE(
paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
.IsPassCompatible("skip_layernorm_fuse_pass"));
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -77,7 +78,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, ...@@ -77,7 +78,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
}; };
auto is_fusion_input_var = [=](Node* x, const std::string& arg_name) { auto is_fusion_input_var = [=](Node* x, const std::string& arg_name) {
bool basic = var_is_op_input(x, "matmul", arg_name) && bool basic = (var_is_op_input(x, "matmul_v2", arg_name) ||
var_is_op_input(x, "matmul", arg_name)) &&
var_is_op_input(x, "square", "X"); var_is_op_input(x, "square", "X");
if (!basic) { if (!basic) {
return false; return false;
...@@ -88,7 +90,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, ...@@ -88,7 +90,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
} }
auto* squared_x = squared_x_op->outputs[0]; auto* squared_x = squared_x_op->outputs[0];
bool next_is_matmul_from_arg = bool next_is_matmul_from_arg =
var_is_op_input(squared_x, "matmul", arg_name) && (var_is_op_input(squared_x, "matmul_v2", arg_name) ||
var_is_op_input(squared_x, "matmul", arg_name)) &&
squared_x->outputs.size() == 1 && squared_x->outputs.size() == 1 &&
squared_x->outputs[0]->outputs.size() == 1; squared_x->outputs[0]->outputs.size() == 1;
if (!next_is_matmul_from_arg) { if (!next_is_matmul_from_arg) {
...@@ -103,7 +106,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, ...@@ -103,7 +106,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto is_fusion_first_mul_out = [=](Node* x) -> bool { auto is_fusion_first_mul_out = [=](Node* x) -> bool {
bool input_is_matmul_op = x && x->inputs.size() == 1 && bool input_is_matmul_op = x && x->inputs.size() == 1 &&
x->inputs[0]->IsOp() && x->inputs[0]->IsOp() &&
x->inputs[0]->Op()->Type() == "matmul"; (x->inputs[0]->Op()->Type() == "matmul_v2" ||
x->inputs[0]->Op()->Type() == "matmul");
if (!input_is_matmul_op) { if (!input_is_matmul_op) {
return false; return false;
} }
...@@ -167,7 +171,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, ...@@ -167,7 +171,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto* matmul_xy_op = pattern->NewNode( auto* matmul_xy_op = pattern->NewNode(
[=](Node* x) { [=](Node* x) {
return x && x->IsOp() && x->Op()->Type() == "matmul" && return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
x->Op()->Type() == "matmul") &&
is_fusion_first_mul_out(x->outputs[0]); is_fusion_first_mul_out(x->outputs[0]);
}, },
name_scope + "/matmul_xy_op"); name_scope + "/matmul_xy_op");
...@@ -189,7 +194,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, ...@@ -189,7 +194,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto is_fusion_mat_squared_x_y_op_out = [=](Node* x) -> bool { auto is_fusion_mat_squared_x_y_op_out = [=](Node* x) -> bool {
bool basic = x && x->IsVar() && x->inputs.size() == 1 && bool basic = x && x->IsVar() && x->inputs.size() == 1 &&
x->inputs[0]->IsOp() && x->inputs[0]->Op()->Type() == "matmul"; x->inputs[0]->IsOp() &&
(x->inputs[0]->Op()->Type() == "matmul_v2" ||
x->inputs[0]->Op()->Type() == "matmul");
if (!basic) { if (!basic) {
return false; return false;
} }
...@@ -206,7 +213,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, ...@@ -206,7 +213,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto* matmul_squared_x_y_op = pattern->NewNode( auto* matmul_squared_x_y_op = pattern->NewNode(
[=](Node* x) { [=](Node* x) {
return x && x->IsOp() && x->Op()->Type() == "matmul" && return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
x->Op()->Type() == "matmul") &&
is_fusion_mat_squared_x_y_op_out(x->outputs[0]); is_fusion_mat_squared_x_y_op_out(x->outputs[0]);
}, },
name_scope + "/matmul_squared_x_y_op"); name_scope + "/matmul_squared_x_y_op");
...@@ -378,3 +386,13 @@ void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const { ...@@ -378,3 +386,13 @@ void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(squared_mat_sub_fuse_pass, REGISTER_PASS(squared_mat_sub_fuse_pass,
paddle::framework::ir::SquaredMatSubFusePass); paddle::framework::ir::SquaredMatSubFusePass);
REGISTER_PASS_CAPABILITY(squared_mat_sub_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("matmul", 0)
.EQ("matmul_v2", 0)
.EQ("square", 0)
.EQ("elementwise_mul", 0)
.EQ("elementwise_sub", 0)
.EQ("fill_constant", 0)
.EQ("fusion_squared_mat_sub", 0));
...@@ -24,7 +24,7 @@ namespace framework { ...@@ -24,7 +24,7 @@ namespace framework {
namespace ir { namespace ir {
/** /**
* Fuse ( (A.^2 * B.^2) - (A * B).^2 ) .* scalar * Fuse ( (A * B).^2 - (A.^2 * B.^2) ) .* scalar
*/ */
class SquaredMatSubFusePass : public FusePassBase { class SquaredMatSubFusePass : public FusePassBase {
public: public:
......
...@@ -157,6 +157,14 @@ class OperatorBase { ...@@ -157,6 +157,14 @@ class OperatorBase {
platform::errors::NotFound("(%s) is not found in AttributeMap.", name)); platform::errors::NotFound("(%s) is not found in AttributeMap.", name));
return BOOST_GET_CONST(T, attrs_.at(name)); return BOOST_GET_CONST(T, attrs_.at(name));
} }
void SetAttr(const std::string& name, const Attribute& v) {
PADDLE_ENFORCE_EQ(
HasAttr(name), true,
platform::errors::NotFound(
"The attribute %s is not found in operator %s", name, Type()));
attrs_[name] = v;
}
const AttributeMap& Attrs() const { return attrs_; } const AttributeMap& Attrs() const { return attrs_; }
const VariableNameMap& Inputs() const { return inputs_; } const VariableNameMap& Inputs() const { return inputs_; }
......
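The new SetAttr above only updates attributes that already exist, so a mistyped name fails loudly instead of silently creating a new attribute. A self-contained model of that contract (plain C++, not the OperatorBase API):

#include <cassert>
#include <map>
#include <string>
#include <variant>

using Attribute = std::variant<bool, int, float, std::string>;

struct MiniOp {
  std::map<std::string, Attribute> attrs;
  bool HasAttr(const std::string& name) const { return attrs.count(name) > 0; }
  void SetAttr(const std::string& name, const Attribute& v) {
    assert(HasAttr(name) && "attribute must already exist on the op");
    attrs[name] = v;
  }
};

int main() {
  MiniOp op;
  op.attrs["use_addto"] = false;  // declared by the op definition
  op.SetAttr("use_addto", true);  // ok: flips the existing attribute,
                                  // as inplace_addto_op_pass does above
  assert(std::get<bool>(op.attrs.at("use_addto")));
  // op.SetAttr("use_add_to", true);  // would assert: mistyped name
  return 0;
}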
...@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and ...@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/details/async_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
...@@ -108,6 +110,11 @@ class ParallelExecutorPrivate { ...@@ -108,6 +110,11 @@ class ParallelExecutorPrivate {
* them. * them.
*/ */
inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) { inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) {
if (mem_opt_var_infos_.size() == 0) {
VLOG(4) << "The mem_opt_var_infos_ is empty, maybe no memory "
"optimization strategy is enabled";
return;
}
auto iter = mem_opt_var_infos_[scope_idx].find(name); auto iter = mem_opt_var_infos_[scope_idx].find(name);
if (iter != mem_opt_var_infos_[scope_idx].end()) { if (iter != mem_opt_var_infos_[scope_idx].end()) {
iter->second->SetSkipMemoryReuse(true); iter->second->SetSkipMemoryReuse(true);
...@@ -308,6 +315,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { ...@@ -308,6 +315,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
} }
bool need_mem_opt = build_strategy_.enable_inplace_ || bool need_mem_opt = build_strategy_.enable_inplace_ ||
build_strategy_.enable_addto_ ||
build_strategy_.memory_optimize_.get() || is_gc_enabled; build_strategy_.memory_optimize_.get() || is_gc_enabled;
if (!need_mem_opt) return graph; if (!need_mem_opt) return graph;
...@@ -320,6 +328,16 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { ...@@ -320,6 +328,16 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
graph = ref_cnt_pass->Apply(graph); graph = ref_cnt_pass->Apply(graph);
VLOG(10) << "ReferenceCountPass Applied"; VLOG(10) << "ReferenceCountPass Applied";
if (build_strategy_.enable_addto_) {
auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass");
addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
addto_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
VLOG(10) << "Start to apply inplace_addto_op_pass";
graph = addto_pass->Apply(graph);
VLOG(10) << "inplace_addto_op_pass Applied";
}
if (build_strategy_.enable_inplace_) { if (build_strategy_.enable_inplace_) {
auto inplace_pass = auto inplace_pass =
ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass"); ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass");
...@@ -1068,3 +1086,4 @@ USE_PASS(reference_count_pass); ...@@ -1068,3 +1086,4 @@ USE_PASS(reference_count_pass);
USE_PASS(eager_deletion_pass); USE_PASS(eager_deletion_pass);
USE_PASS(buffer_shared_inplace_pass); USE_PASS(buffer_shared_inplace_pass);
USE_PASS(buffer_shared_cross_op_memory_reuse_pass); USE_PASS(buffer_shared_cross_op_memory_reuse_pass);
USE_PASS(inplace_addto_op_pass);
...@@ -251,6 +251,7 @@ void PipelineTrainer::Finalize() { ...@@ -251,6 +251,7 @@ void PipelineTrainer::Finalize() {
} }
root_scope_->DropKids(); root_scope_->DropKids();
SectionWorker::ResetBatchId(); SectionWorker::ResetBatchId();
SectionWorker::ResetThreadCompletedFlag();
} }
Scope* PipelineTrainer::GetWorkerScope(int thread_id) { Scope* PipelineTrainer::GetWorkerScope(int thread_id) {
......
...@@ -196,7 +196,6 @@ void SectionWorker::TrainFiles() { ...@@ -196,7 +196,6 @@ void SectionWorker::TrainFiles() {
if (threads_completed) { if (threads_completed) {
VLOG(3) << "thread " << thread_id_ << " completed."; VLOG(3) << "thread " << thread_id_ << " completed.";
lk.unlock(); lk.unlock();
threads_completed = false;
return; return;
} }
lk.unlock(); lk.unlock();
...@@ -459,7 +458,6 @@ void SectionWorker::TrainFilesWithProfiler() { ...@@ -459,7 +458,6 @@ void SectionWorker::TrainFilesWithProfiler() {
<< ", mean_time: " << op_total_time[i] / op_count[i]; << ", mean_time: " << op_total_time[i] / op_count[i];
} }
VLOG(0) << "================================"; VLOG(0) << "================================";
threads_completed = false;
return; return;
} }
lk.unlock(); lk.unlock();
......
...@@ -156,7 +156,8 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { ...@@ -156,7 +156,8 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// "seqpool_concat_fuse_pass", // // "seqpool_concat_fuse_pass", //
"seqpool_cvm_concat_fuse_pass", // "seqpool_cvm_concat_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", // // "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", // // TODO(wilber): fix correctness problem.
// "fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", // "mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", // "fc_gru_fuse_pass", //
"mul_gru_fuse_pass", // "mul_gru_fuse_pass", //
......
...@@ -80,10 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { ...@@ -80,10 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
nvinfer1::ILayer* layer = nullptr; nvinfer1::ILayer* layer = nullptr;
if (engine_->with_dynamic_shape()) { if (engine_->with_dynamic_shape()) {
plugin::DynamicPluginTensorRT* plugin = nullptr; auto use_fp16 = engine_->WithFp16();
plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>( auto plugin = new plugin::EmbEltwiseLayernormPluginDynamic(
input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
eps); eps, use_fp16);
layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
} else { } else {
PADDLE_THROW(platform::errors::Fatal( PADDLE_THROW(platform::errors::Fatal(
......
...@@ -32,13 +32,34 @@ namespace plugin { ...@@ -32,13 +32,34 @@ namespace plugin {
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
template <typename T> template <typename T>
int EmbEltwiseLayernormPluginDynamic<T>::initialize() { EmbEltwiseLayernormPluginDynamicImpl<
T>::~EmbEltwiseLayernormPluginDynamicImpl() {
this->terminate();
}
inline half fp32tofp16(float x) { return static_cast<half>(x); }
template <typename T>
int EmbEltwiseLayernormPluginDynamicImpl<T>::initialize() {
embs_gpu_.resize(embs_.size()); embs_gpu_.resize(embs_.size());
for (int i = 0; i < embs_.size(); i++) { for (int i = 0; i < embs_.size(); i++) {
if (embs_[i]) { if (embs_[i]) {
cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]); T *host_ptr;
cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float), auto size = emb_sizes_[i];
if (std::is_same<T, half>::value) {
host_ptr = new T[size];
std::transform(embs_[i], (embs_[i] + size), host_ptr, fp32tofp16);
} else {
host_ptr = reinterpret_cast<T *>(embs_[i]);
}
cudaMalloc(&embs_gpu_[i], sizeof(T) * size);
cudaMemcpy(embs_gpu_[i], host_ptr, size * sizeof(T),
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice);
if (std::is_same<T, half>::value) {
delete[] host_ptr;
}
} }
} }
...@@ -53,11 +74,105 @@ int EmbEltwiseLayernormPluginDynamic<T>::initialize() { ...@@ -53,11 +74,105 @@ int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice);
} }
int input_num = embs_.size();
in_ptr_tensor_.Resize({input_num});
emb_ptr_tensor_.Resize({input_num});
cudaGetDevice(&device_id_);
auto emb_ptr_gpu_d =
emb_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
cudaMemcpy(emb_ptr_gpu_d, embs_gpu_.data(), sizeof(uintptr_t) * input_num,
cudaMemcpyHostToDevice);
return 0; return 0;
} }
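
Note that initialize() above converts nothing on the GPU: when T is half, the fp32 weights are narrowed on the host (via fp32tofp16, i.e. static_cast<half>) into a temporary buffer, and only that buffer is uploaded. A minimal standalone sketch of the pattern, assuming an nvcc toolchain recent enough for host-side half construction:

#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <algorithm>
#include <vector>

int main() {
  std::vector<float> host_fp32 = {0.5f, -1.25f, 3.0f};
  std::vector<half> host_fp16(host_fp32.size());
  // Same role as fp32tofp16 above: narrow each float to half on the host.
  std::transform(host_fp32.begin(), host_fp32.end(), host_fp16.begin(),
                 [](float x) { return static_cast<half>(x); });
  half* dev_ptr = nullptr;
  cudaMalloc(&dev_ptr, sizeof(half) * host_fp16.size());
  cudaMemcpy(dev_ptr, host_fp16.data(), sizeof(half) * host_fp16.size(),
             cudaMemcpyHostToDevice);
  cudaFree(dev_ptr);
  return 0;
}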
template <typename T> template <typename T>
nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions( void EmbEltwiseLayernormPluginDynamicImpl<T>::terminate() {
for (int i = 0; i < embs_gpu_.size(); ++i) {
if (embs_gpu_[i]) {
cudaFree(embs_gpu_[i]);
embs_gpu_[i] = nullptr;
}
}
if (bias_gpu_) {
cudaFree(bias_gpu_);
bias_gpu_ = nullptr;
}
if (scale_gpu_) {
cudaFree(scale_gpu_);
scale_gpu_ = nullptr;
}
}
template <typename T>
int EmbEltwiseLayernormPluginDynamicImpl<T>::enqueue(
const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) {
auto id_dims = input_desc[0].dims;
int batch = id_dims.d[0];
int seq_len = id_dims.d[1];
int input_num = embs_.size();
auto in_ptr_gpu_d =
in_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
auto emb_ptr_gpu_d =
emb_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
auto new_input_ptr = reinterpret_cast<uintptr_t>(inputs[0]);
if (old_input_ptr_ != new_input_ptr) {
old_input_ptr_ = new_input_ptr;
cudaMemcpyAsync(in_ptr_gpu_d, reinterpret_cast<const void *>(inputs),
sizeof(uintptr_t) * input_num, cudaMemcpyHostToDevice,
stream);
}
auto out_type = output_desc[0].type;
if (std::is_same<T, float>::value) {
PADDLE_ENFORCE_EQ(
out_type == nvinfer1::DataType::kFLOAT, true,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only support fp32 input."));
} else if (std::is_same<T, half>::value) {
PADDLE_ENFORCE_EQ(
out_type == nvinfer1::DataType::kHALF, true,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only support fp16 input."));
} else {
PADDLE_THROW(platform::errors::Fatal(
"Unsupport data type, the out type of EmbEltwiseLayernorm should be "
"float or half."));
}
auto *output_d = reinterpret_cast<T *>(outputs[0]);
operators::math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
eps_, input_num, stream);
return cudaGetLastError() != cudaSuccess;
}
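
enqueue() above caches old_input_ptr_ so the device-side table of input pointers is re-uploaded only when TensorRT rebinds the inputs, rather than on every call. A minimal sketch of that caching, with a hypothetical MaybeRefreshTable helper and CUDA assumed:

#include <cuda_runtime.h>
#include <cstdint>

static uintptr_t old_input_ptr = 0;

// Re-upload the pointer table only when the first binding has moved.
void MaybeRefreshTable(void** dev_table, const void* const* inputs, int num,
                       cudaStream_t stream) {
  auto new_ptr = reinterpret_cast<uintptr_t>(inputs[0]);
  if (old_input_ptr != new_ptr) {
    old_input_ptr = new_ptr;
    cudaMemcpyAsync(dev_table, reinterpret_cast<const void*>(inputs),
                    sizeof(uintptr_t) * num, cudaMemcpyHostToDevice, stream);
  }
}

int main() {
  int a = 0, b = 0;
  const void* inputs[2] = {&a, &b};
  void** dev_table = nullptr;
  cudaMalloc(&dev_table, sizeof(void*) * 2);
  MaybeRefreshTable(dev_table, inputs, 2, 0);  // first call copies the table
  MaybeRefreshTable(dev_table, inputs, 2, 0);  // cached: no copy is issued
  cudaFree(dev_table);
  return 0;
}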
template class EmbEltwiseLayernormPluginDynamicImpl<float>;
#ifdef SUPPORTS_CUDA_FP16
template class EmbEltwiseLayernormPluginDynamicImpl<half>;
#endif // SUPPORTS_CUDA_FP16
int EmbEltwiseLayernormPluginDynamic::initialize() {
impl_->initialize();
return 0;
}
void EmbEltwiseLayernormPluginDynamic::terminate() { impl_->terminate(); }
nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
nvinfer1::IExprBuilder &expr_builder) { // NOLINT nvinfer1::IExprBuilder &expr_builder) { // NOLINT
PADDLE_ENFORCE_EQ(output_index, 0, PADDLE_ENFORCE_EQ(output_index, 0,
...@@ -76,18 +191,7 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions( ...@@ -76,18 +191,7 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
return ret; return ret;
} }
template <typename T> bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination(
void EmbEltwiseLayernormPluginDynamic<T>::terminate() {
for (auto ptr : embs_gpu_) {
if (ptr) cudaFree(ptr);
}
if (bias_gpu_) cudaFree(bias_gpu_);
if (scale_gpu_) cudaFree(scale_gpu_);
}
template <typename T>
bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
int nb_outputs) { int nb_outputs) {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
...@@ -98,6 +202,11 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination( ...@@ -98,6 +202,11 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
"The EmbEltwiseLayerNorm's output should be one" "The EmbEltwiseLayerNorm's output should be one"
"but it's (%d) outputs.", "but it's (%d) outputs.",
nb_outputs)); nb_outputs));
PADDLE_ENFORCE_LT( PADDLE_ENFORCE_LT(
pos, nb_inputs + nb_outputs, pos, nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the " platform::errors::InvalidArgument("The pos(%d) should be less than the "
...@@ -122,7 +231,7 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination( ...@@ -122,7 +231,7 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
} }
if (pos == all_nums - 1) { if (pos == all_nums - 1) {
if (sizeof(T) == sizeof(float)) { if (with_fp16_ == false) {
return desc.type == nvinfer1::DataType::kFLOAT; return desc.type == nvinfer1::DataType::kFLOAT;
} else { } else {
return desc.type == nvinfer1::DataType::kHALF; return desc.type == nvinfer1::DataType::kHALF;
...@@ -131,84 +240,27 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination( ...@@ -131,84 +240,27 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
return false; return false;
} }
template <typename T> nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType(
nvinfer1::DataType EmbEltwiseLayernormPluginDynamic<T>::getOutputDataType(
int index, const nvinfer1::DataType *input_types, int nb_inputs) const { int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
index, 0, platform::errors::InvalidArgument( index, 0, platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only has one input, so the " "The EmbEltwiseLayernorm Plugin only has one input, so the "
"index value should be 0, but get %d.", "index value should be 0, but get %d.",
index)); index));
return nvinfer1::DataType::kFLOAT; if (with_fp16_)
return nvinfer1::DataType::kHALF;
else
return nvinfer1::DataType::kFLOAT;
} }
template <typename T> int EmbEltwiseLayernormPluginDynamic::enqueue(
int EmbEltwiseLayernormPluginDynamic<T>::enqueue(
const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) { void *const *outputs, void *workspace, cudaStream_t stream) {
auto id_dims = input_desc[0].dims; impl_->enqueue(input_desc, output_desc, inputs, outputs, workspace, stream);
int batch = id_dims.d[0];
int seq_len = id_dims.d[1];
int input_num = embs_.size();
framework::Tensor in_ptr_tensor, emb_ptr_tensor;
int device_id;
cudaGetDevice(&device_id);
in_ptr_tensor.Resize({input_num});
emb_ptr_tensor.Resize({input_num});
int64_t *in_ptr_gpu_d =
in_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
int64_t *emb_ptr_gpu_d =
emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
std::vector<uintptr_t> in_ptr, emb_ptr;
for (int i = 0; i < input_num; i++) {
in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
}
cudaMemcpyAsync(in_ptr_gpu_d, in_ptr.data(), sizeof(int64_t) * input_num,
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(emb_ptr_gpu_d, emb_ptr.data(), sizeof(int64_t) * input_num,
cudaMemcpyHostToDevice, stream);
auto out_type = output_desc[0].type;
const unsigned tpb = 256;
const dim3 grid(seq_len, batch, 1);
const dim3 block(tpb, 1, 1);
if (sizeof(T) == sizeof(float)) {
PADDLE_ENFORCE_EQ(
out_type == nvinfer1::DataType::kFLOAT, true,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only support fp32 input."));
} else if (sizeof(T) == sizeof(int16_t)) {
PADDLE_ENFORCE_EQ(
out_type == nvinfer1::DataType::kHALF, true,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only support fp16 input."));
} else {
PADDLE_THROW(platform::errors::Fatal(
"Unsupport data type, the out type of EmbEltwiseLayernorm should be "
"float or half."));
}
T *output_d = static_cast<T *>(outputs[0]);
operators::math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
eps_, input_num, stream);
return cudaGetLastError() != cudaSuccess; return cudaGetLastError() != cudaSuccess;
} }
template class EmbEltwiseLayernormPluginDynamic<float>;
#ifdef SUPPORTS_CUDA_FP16
template class EmbEltwiseLayernormPluginDynamic<half>;
#endif // SUPPORTS_CUDA_FP16
#endif #endif
} // namespace plugin } // namespace plugin
......
...@@ -27,14 +27,76 @@ namespace tensorrt { ...@@ -27,14 +27,76 @@ namespace tensorrt {
namespace plugin { namespace plugin {
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
class EmbEltwiseLayernormPluginDynamicImplBase {
public:
EmbEltwiseLayernormPluginDynamicImplBase() {}
virtual ~EmbEltwiseLayernormPluginDynamicImplBase() {}
virtual int initialize() = 0;
virtual void terminate() = 0;
virtual int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs,
void* workspace, cudaStream_t stream) = 0;
};
template <typename T> template <typename T>
class EmbEltwiseLayernormPluginDynamicImpl
: public EmbEltwiseLayernormPluginDynamicImplBase {
public:
explicit EmbEltwiseLayernormPluginDynamicImpl(std::vector<float*> input_embs,
float* bias, float* scale,
std::vector<int> emb_sizes,
int bias_size, int scale_size,
int hidden_size, float eps)
: embs_(input_embs),
bias_(bias),
scale_(scale),
emb_sizes_(emb_sizes),
bias_size_(bias_size),
scale_size_(scale_size),
hidden_size_(hidden_size),
eps_(eps) {}
~EmbEltwiseLayernormPluginDynamicImpl();
int initialize();
void terminate();
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream);
private:
std::vector<float*> embs_;
float* bias_{nullptr};
float* scale_{nullptr};
// data on devices
float* bias_gpu_{nullptr};
float* scale_gpu_{nullptr};
std::vector<T*> embs_gpu_;
std::vector<int> emb_sizes_;
int bias_size_;
int scale_size_;
int hidden_size_;
float eps_;
framework::Tensor in_ptr_tensor_, emb_ptr_tensor_;
int device_id_{0};
uintptr_t old_input_ptr_{0};
};
class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
public: public:
explicit EmbEltwiseLayernormPluginDynamic(std::vector<float*> input_embs, explicit EmbEltwiseLayernormPluginDynamic(std::vector<float*> input_embs,
float* bias, float* scale, float* bias, float* scale,
std::vector<int> emb_sizes, std::vector<int> emb_sizes,
int bias_size, int scale_size, int bias_size, int scale_size,
int hidden_size, float eps) int hidden_size, float eps,
bool with_fp16)
: embs_(input_embs), : embs_(input_embs),
bias_(bias), bias_(bias),
scale_(scale), scale_(scale),
...@@ -42,51 +104,81 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { ...@@ -42,51 +104,81 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
bias_size_(bias_size), bias_size_(bias_size),
scale_size_(scale_size), scale_size_(scale_size),
hidden_size_(hidden_size), hidden_size_(hidden_size),
eps_(eps) {} eps_(eps),
with_fp16_(with_fp16),
own_host_buff_(false) {
if (with_fp16) {
#ifdef SUPPORTS_CUDA_FP16
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
#else
PADDLE_THROW(platform::errors::Fatal(
"Unsupported data type, current GPU doesn't support half."));
#endif // SUPPORTS_CUDA_FP16
} else {
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
}
}
EmbEltwiseLayernormPluginDynamic(void const* serial_data, EmbEltwiseLayernormPluginDynamic(void const* serial_data,
size_t serial_length) { size_t serial_length)
: own_host_buff_(true) {
DeserializeValue(&serial_data, &serial_length, &emb_sizes_); DeserializeValue(&serial_data, &serial_length, &emb_sizes_);
embs_gpu_.resize(emb_sizes_.size());
embs_.resize(emb_sizes_.size()); embs_.resize(emb_sizes_.size());
for (size_t i = 0; i < emb_sizes_.size(); i++) { for (size_t i = 0; i < emb_sizes_.size(); i++) {
cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]); auto size = emb_sizes_[i];
cudaMemcpy(embs_gpu_[i], serial_data, emb_sizes_[i] * sizeof(float), auto ptr = new float[size];
cudaMemcpyHostToDevice); memcpy(ptr, serial_data, sizeof(float) * size);
embs_[i] = ptr;
reinterpret_cast<char const*&>(serial_data) += reinterpret_cast<char const*&>(serial_data) +=
emb_sizes_[i] * sizeof(float); emb_sizes_[i] * sizeof(float);
serial_length -= emb_sizes_[i] * sizeof(float); serial_length -= emb_sizes_[i] * sizeof(float);
embs_[i] = nullptr;
} }
DeserializeValue(&serial_data, &serial_length, &bias_size_); DeserializeValue(&serial_data, &serial_length, &bias_size_);
DeserializeValue(&serial_data, &serial_length, &scale_size_); DeserializeValue(&serial_data, &serial_length, &scale_size_);
cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_); if (bias_size_) {
cudaMemcpy(bias_gpu_, serial_data, bias_size_ * sizeof(float), bias_ = new float[bias_size_];
cudaMemcpyHostToDevice); memcpy(bias_, serial_data, sizeof(float) * bias_size_);
bias_ = nullptr; }
reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float); reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float);
serial_length -= bias_size_ * sizeof(float); serial_length -= bias_size_ * sizeof(float);
cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_); if (scale_size_) {
cudaMemcpy(scale_gpu_, serial_data, scale_size_ * sizeof(float), scale_ = new float[scale_size_];
cudaMemcpyHostToDevice); memcpy(scale_, serial_data, sizeof(float) * scale_size_);
scale_ = nullptr; }
reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(float); reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(float);
serial_length -= scale_size_ * sizeof(float); serial_length -= scale_size_ * sizeof(float);
DeserializeValue(&serial_data, &serial_length, &hidden_size_); DeserializeValue(&serial_data, &serial_length, &hidden_size_);
DeserializeValue(&serial_data, &serial_length, &eps_); DeserializeValue(&serial_data, &serial_length, &eps_);
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
if (with_fp16_) {
#ifdef SUPPORTS_CUDA_FP16
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
#else
PADDLE_THROW(platform::errors::Fatal(
"Unsupported data type, current GPU doesn't support half."));
#endif // SUPPORTS_CUDA_FP16
} else {
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
hidden_size_, eps_);
}
} }
nvinfer1::IPluginV2DynamicExt* clone() const override { nvinfer1::IPluginV2DynamicExt* clone() const override {
auto ptr = new EmbEltwiseLayernormPluginDynamic( auto ptr = new EmbEltwiseLayernormPluginDynamic(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_, embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
eps_); eps_, with_fp16_);
ptr->embs_gpu_ = embs_gpu_;
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr; return ptr;
} }
...@@ -95,6 +187,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { ...@@ -95,6 +187,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
} }
int getNbOutputs() const override { return 1; } int getNbOutputs() const override { return 1; }
int initialize() override; int initialize() override;
void terminate() override;
size_t getSerializationSize() const override { size_t getSerializationSize() const override {
int sum_num = 0; int sum_num = 0;
...@@ -110,24 +203,32 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { ...@@ -110,24 +203,32 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
sum_num += (bias_size_ + scale_size_) * sizeof(float); sum_num += (bias_size_ + scale_size_) * sizeof(float);
sum_num += SerializedSize(hidden_size_); sum_num += SerializedSize(hidden_size_);
sum_num += SerializedSize(eps_); sum_num += SerializedSize(eps_);
// sum_num += SerializedSize(with_fp16_); sum_num += SerializedSize(with_fp16_);
return sum_num; return sum_num;
} }
void terminate() override;
void serialize(void* buffer) const override { void serialize(void* buffer) const override {
// SerializeValue(&buffer, with_fp16_);
SerializeValue(&buffer, emb_sizes_); SerializeValue(&buffer, emb_sizes_);
for (size_t i = 0; i < emb_sizes_.size(); i++) { for (size_t i = 0; i < emb_sizes_.size(); i++) {
SerializeCudaPointer(&buffer, embs_gpu_[i], emb_sizes_[i]); auto size = emb_sizes_[i];
for (int j = 0; j < size; ++j) {
SerializeValue(&buffer, embs_[i][j]);
}
} }
SerializeValue(&buffer, bias_size_); SerializeValue(&buffer, bias_size_);
SerializeValue(&buffer, scale_size_); SerializeValue(&buffer, scale_size_);
SerializeCudaPointer(&buffer, bias_gpu_, bias_size_); for (int i = 0; i < bias_size_; ++i) {
SerializeCudaPointer(&buffer, scale_gpu_, scale_size_); SerializeValue(&buffer, bias_[i]);
}
for (int i = 0; i < scale_size_; ++i) {
SerializeValue(&buffer, scale_[i]);
}
SerializeValue(&buffer, hidden_size_); SerializeValue(&buffer, hidden_size_);
SerializeValue(&buffer, eps_); SerializeValue(&buffer, eps_);
SerializeValue(&buffer, with_fp16_);
} }
nvinfer1::DimsExprs getOutputDimensions( nvinfer1::DimsExprs getOutputDimensions(
...@@ -158,23 +259,33 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { ...@@ -158,23 +259,33 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
const nvinfer1::DataType* input_types, const nvinfer1::DataType* input_types,
int nb_inputs) const override; int nb_inputs) const override;
void destroy() override { delete this; } void destroy() override {
if (own_host_buff_) {
for (auto ptr : embs_) {
delete[] ptr;
}
delete[] bias_;
delete[] scale_;
}
delete impl_;
delete this;
}
private: private:
std::vector<float*> embs_; std::vector<float*> embs_;
float* bias_; float* bias_;
float* scale_; float* scale_;
// data on devices
float* bias_gpu_;
float* scale_gpu_;
std::vector<float*> embs_gpu_;
std::vector<int> emb_sizes_; std::vector<int> emb_sizes_;
int bias_size_; int bias_size_;
int scale_size_; int scale_size_;
int hidden_size_; int hidden_size_;
float eps_; float eps_;
bool with_fp16_;
bool own_host_buff_{false};
EmbEltwiseLayernormPluginDynamicImplBase* impl_{nullptr};
}; };
class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
...@@ -198,8 +309,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { ...@@ -198,8 +309,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
nvinfer1::IPluginV2* deserializePlugin(const char* name, nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data, const void* serial_data,
size_t serial_length) override { size_t serial_length) override {
return new EmbEltwiseLayernormPluginDynamic<float>(serial_data, return new EmbEltwiseLayernormPluginDynamic(serial_data, serial_length);
serial_length);
} }
void setPluginNamespace(const char* lib_namespace) override { void setPluginNamespace(const char* lib_namespace) override {
......
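
The serialize()/deserialize pair above switches from SerializeCudaPointer (which read weights back from the GPU) to plain host buffers written value by value, so plugins can be rebuilt without touching a device. A standalone sketch of that cursor-advancing layout, with hypothetical SerializePod/DeserializePod helpers in plain C++:

#include <cstdio>
#include <cstring>
#include <vector>

template <typename T>
void SerializePod(char** cursor, const T& value) {
  std::memcpy(*cursor, &value, sizeof(T));  // write and advance, like SerializeValue
  *cursor += sizeof(T);
}

template <typename T>
void DeserializePod(const char** cursor, T* value) {
  std::memcpy(value, *cursor, sizeof(T));
  *cursor += sizeof(T);
}

int main() {
  std::vector<float> emb = {0.1f, 0.2f, 0.3f};
  std::vector<char> buffer(sizeof(float) * emb.size());

  char* w = buffer.data();
  for (float v : emb) SerializePod(&w, v);  // element-wise, like serialize() above

  const char* r = buffer.data();
  float back = 0.f;
  DeserializePod(&r, &back);
  std::printf("%f\n", back);  // 0.1
  return 0;
}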
...@@ -151,7 +151,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) { ...@@ -151,7 +151,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
run(config, &out_data); // serialize run(config, &out_data); // serialize
run(*config_deser, &out_data); // deserialize run(*config_deser, &out_data); // deserialize
for (size_t i = 0; i < out_data.size(); i++) { for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-6); EXPECT_NEAR(result[i], out_data[i], 1e-2);
} }
} }
...@@ -159,13 +159,11 @@ TEST(AnalysisPredictor, no_fp16) { ...@@ -159,13 +159,11 @@ TEST(AnalysisPredictor, no_fp16) {
std::vector<float> result = {0.597841, 0.219972, 0.182187}; std::vector<float> result = {0.597841, 0.219972, 0.182187};
trt_ernie(false, result); trt_ernie(false, result);
} }
TEST(AnalysisPredictor, fp16) {
#ifdef SUPPORTS_CUDA_FP16 #ifdef SUPPORTS_CUDA_FP16
std::vector<float> result = {0.598336, 0.219558, 0.182106}; TEST(AnalysisPredictor, fp16) {
std::vector<float> result = {0.59923654, 0.21923761, 0.18152587};
trt_ernie(true, result); trt_ernie(true, result);
#endif
} }
#endif // SUPPORTS_CUDA_FP16
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -54,9 +54,13 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> { ...@@ -54,9 +54,13 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
float average_window = ctx.Attr<float>("average_window"); float average_window = ctx.Attr<float>("average_window");
int64_t max_average_window = ctx.Attr<int64_t>("max_average_window"); int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
int64_t min_average_window = ctx.Attr<int64_t>("min_average_window"); int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
PADDLE_ENFORCE_LE(min_average_window, max_average_window, PADDLE_ENFORCE_LE(
"min_average_window shouldn't be larger than " min_average_window, max_average_window,
"max_average_window"); platform::errors::InvalidArgument(
"The min_average_window > "
"max_average_window is not right, min_average_window is %ld, "
"max_average_window is %ld.",
min_average_window, max_average_window));
// Get inputs // Get inputs
auto* param = ctx.Input<Tensor>("param"); auto* param = ctx.Input<Tensor>("param");
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
...@@ -287,7 +288,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -287,7 +288,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
#endif #endif
// ------------------- cudnn conv forward --------------------- // ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* workspace_ptr) { [&](void* workspace_ptr) {
...@@ -609,9 +612,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -609,9 +612,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} }
// ------------------- cudnn conv backward data --------------------- // ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr<bool>("use_addto");
if (input_grad) { if (input_grad) {
// Because beta is zero, it is unnecessary to reset input_grad. // When beta is 0, it is unnecessary to reset input_grad.
// When beta is 1, the output cannot be reset since the addto strategy is used.
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) { [&](void* cudnn_workspace_ptr) {
...@@ -653,6 +660,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -653,6 +660,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
ctx, &transformed_input_grad_channel, input_grad); ctx, &transformed_input_grad_channel, input_grad);
} }
} }
// filter_grad does not use inplace addto.
ScalingParamType<T> beta_filter = 0.0f;
// ------------------- cudnn conv backward filter --------------------- // ------------------- cudnn conv backward filter ---------------------
if (filter_grad) { if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad. // Because beta is zero, it is unnecessary to reset filter_grad.
...@@ -665,7 +675,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -665,7 +675,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
input_data + i * group_offset_in, args2.odesc.desc(), input_data + i * group_offset_in, args2.odesc.desc(),
output_grad_data + i * group_offset_out, output_grad_data + i * group_offset_out,
args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr, args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
workspace_size, &beta, args2.wdesc.desc(), workspace_size, &beta_filter, args2.wdesc.desc(),
filter_grad_data + i * group_offset_filter)); filter_grad_data + i * group_offset_filter));
}, },
workspace_size); workspace_size);
...@@ -1017,7 +1027,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -1017,7 +1027,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
int group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = W->numel() / groups; int group_offset_filter = W->numel() / groups;
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
// NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
// 0.0f;
// VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); auto wkspace_handle = dev_ctx.cudnn_workspace_handle();
if (ddO) { if (ddO) {
......
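
The beta changes above are the whole of the cudnn-side addto support: cudnn computes out = alpha * conv(...) + beta * out, so beta == 1 accumulates into an existing gradient buffer while beta == 0 overwrites it. A minimal host-only sketch of that semantics, with a hypothetical AxpyLike helper:

#include <cstdio>
#include <vector>

void AxpyLike(const std::vector<float>& result, float alpha, float beta,
              std::vector<float>* out) {
  // out = alpha * result + beta * out, mirroring the cudnn scaling parameters.
  for (size_t i = 0; i < out->size(); ++i)
    (*out)[i] = alpha * result[i] + beta * (*out)[i];
}

int main() {
  std::vector<float> grad = {1.f, 2.f};   // freshly computed gradient
  std::vector<float> acc = {10.f, 20.f};  // previously accumulated gradient
  AxpyLike(grad, /*alpha=*/1.f, /*beta=*/1.f, &acc);  // addto: acc += grad
  std::printf("%f %f\n", acc[0], acc[1]);             // 11 22
  return 0;
}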
...@@ -305,6 +305,11 @@ void Conv2DOpMaker::Make() { ...@@ -305,6 +305,11 @@ void Conv2DOpMaker::Make() {
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel") AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<bool>(
"use_addto",
"(bool, default false) If use addto strategy or not, only used in "
"cudnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_residual_connection", AddAttr<bool>("fuse_residual_connection",
"(bool, default false) Only used in mkldnn kernel. Used " "(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is as an input to residual " "whenever convolution output is as an input to residual "
...@@ -460,6 +465,11 @@ void Conv3DOpMaker::Make() { ...@@ -460,6 +465,11 @@ void Conv3DOpMaker::Make() {
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel") AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<bool>(
"use_addto",
"(bool, default false) If use addto strategy or not, only used in "
"cudnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_residual_connection", AddAttr<bool>("fuse_residual_connection",
"(bool, default false) Only used in mkldnn kernel. Used " "(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is as an input to residual " "whenever convolution output is as an input to residual "
......
...@@ -43,9 +43,9 @@ class FakeInitOp : public framework::OperatorBase { ...@@ -43,9 +43,9 @@ class FakeInitOp : public framework::OperatorBase {
tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value(); tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape"))); tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
} else { } else {
PADDLE_THROW( PADDLE_THROW(platform::errors::InvalidArgument(
"fake init op's output only" "fake init op's output only"
"supports SelectedRows and LoDTensor"); "supports SelectedRows and LoDTensor"));
} }
} }
}; };
......
...@@ -134,7 +134,10 @@ void ListenAndServOp::RunSyncLoop( ...@@ -134,7 +134,10 @@ void ListenAndServOp::RunSyncLoop(
auto optimize_blocks = auto optimize_blocks =
Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks); Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
PADDLE_ENFORCE_GE(num_blocks, 2, PADDLE_ENFORCE_GE(num_blocks, 2,
"server program should have at least 2 blocks"); platform::errors::PreconditionNotMet(
"Invalid number of blocks in server program. Expected "
"equal or greater than 2. Recieved %zu",
num_blocks));
// Prepare all the server block // Prepare all the server block
std::vector<int> optimize_blocks_list; std::vector<int> optimize_blocks_list;
...@@ -218,7 +221,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, ...@@ -218,7 +221,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
VLOG(3) << "reset sparse var: " << varname; VLOG(3) << "reset sparse var: " << varname;
var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear(); var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
} else { } else {
PADDLE_THROW("The type of sparse var should be SelectedRows"); PADDLE_THROW(platform::errors::PreconditionNotMet(
"The type of sparse var should be SelectedRows"));
} }
} }
if (UNLIKELY(reset_all)) { if (UNLIKELY(reset_all)) {
...@@ -235,7 +239,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, ...@@ -235,7 +239,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
math::set_constant(*dev_ctx, var->GetMutable<framework::Tensor>(), math::set_constant(*dev_ctx, var->GetMutable<framework::Tensor>(),
static_cast<float>(0)); static_cast<float>(0));
} else { } else {
PADDLE_THROW("The type of dense var should be in [LoDTensor, Tensor]"); PADDLE_THROW(platform::errors::PreconditionNotMet(
"The type of dense var should be in [LoDTensor, Tensor]"));
} }
} }
} }
...@@ -254,8 +259,15 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -254,8 +259,15 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
std::vector<std::string> pieces; std::vector<std::string> pieces;
split(grad_and_id, ':', &pieces); split(grad_and_id, ':', &pieces);
VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1]; VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
PADDLE_ENFORCE_EQ(pieces.size(), 2); PADDLE_ENFORCE_EQ(pieces.size(), 2,
PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0); platform::errors::PreconditionNotMet(
"Invalid format of grad_and_id argument. "
"Expected \"grad:block_id\". Recieved %s",
grad_and_id.c_str()));
PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0,
platform::errors::AlreadyExists(
"The gradient name %s has already existed in out_map",
pieces[0].c_str()));
int block_id = std::stoi(pieces[1]); int block_id = std::stoi(pieces[1]);
(*out_map)[pieces[0]] = block_id; (*out_map)[pieces[0]] = block_id;
...@@ -267,7 +279,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -267,7 +279,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
size_t num_blocks = program->Size(); size_t num_blocks = program->Size();
PADDLE_ENFORCE_GE(num_blocks, 2, PADDLE_ENFORCE_GE(num_blocks, 2,
"server program should have at least 2 blocks"); platform::errors::PreconditionNotMet(
"Invalid number of blocks in server program. Expected "
"equal or greater than 2. Recieved %zu",
num_blocks));
std::vector<int> block_list; std::vector<int> block_list;
for (size_t blkid = 1; blkid < num_blocks; ++blkid) { for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
block_list.push_back(blkid); block_list.push_back(blkid);
...@@ -342,9 +357,9 @@ void ListenAndServOp::CacheVarsType(const std::vector<std::string> &varnames, ...@@ -342,9 +357,9 @@ void ListenAndServOp::CacheVarsType(const std::vector<std::string> &varnames,
var->IsType<framework::Tensor>()) { var->IsType<framework::Tensor>()) {
dense_vars_.push_back(varname); dense_vars_.push_back(varname);
} else { } else {
PADDLE_THROW( PADDLE_THROW(platform::errors::PreconditionNotMet(
"The type of received var should be in [SelectedRows, LoDTensor, " "The type of received var should be in [SelectedRows, LoDTensor, "
"Tensor]."); "Tensor]."));
} }
} }
} }
...@@ -450,7 +465,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -450,7 +465,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
split(prefetch_var_name_and_id, ':', &pieces); split(prefetch_var_name_and_id, ':', &pieces);
VLOG(3) << "after split, prefetch_var = " << pieces[0] VLOG(3) << "after split, prefetch_var = " << pieces[0]
<< ", id=" << pieces[1]; << ", id=" << pieces[1];
PADDLE_ENFORCE_EQ(pieces.size(), 2); PADDLE_ENFORCE_EQ(
pieces.size(), 2,
platform::errors::PreconditionNotMet(
"Invalid format of prefetch_var_name_and_id argument. "
"Expected \"xxx:xxx\". Recieved %s",
prefetch_var_name_and_id.c_str()));
int block_id = std::stoi(pieces[1]); int block_id = std::stoi(pieces[1]);
prefetch_block_id_list.push_back(block_id); prefetch_block_id_list.push_back(block_id);
...@@ -476,7 +496,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -476,7 +496,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
sparse_grad_name_to_param_name_str) { sparse_grad_name_to_param_name_str) {
std::vector<std::string> pieces; std::vector<std::string> pieces;
split(sparse_grad_name_and_param_name, ':', &pieces); split(sparse_grad_name_and_param_name, ':', &pieces);
PADDLE_ENFORCE_EQ(pieces.size(), 2); PADDLE_ENFORCE_EQ(
pieces.size(), 2,
platform::errors::PreconditionNotMet(
"Invalid format of sparse_grad_name_and_param_name argument. "
"Expected \"xxx:xxx\". Recieved %s",
sparse_grad_name_and_param_name.c_str()));
VLOG(3) << "after split, sparse_grad_name = " << pieces[0] VLOG(3) << "after split, sparse_grad_name = " << pieces[0]
<< ", param_name = " << pieces[1]; << ", param_name = " << pieces[1];
sparse_grad_name_to_param_name[pieces[0]] = pieces[1]; sparse_grad_name_to_param_name[pieces[0]] = pieces[1];
......
...@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle { namespace paddle {
...@@ -129,3 +132,18 @@ REGISTER_OP_CPU_KERNEL( ...@@ -129,3 +132,18 @@ REGISTER_OP_CPU_KERNEL(
int>, int>,
ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext, ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
int64_t>); int64_t>);
// A specialization of the elementwise_add operator, used in gradient
// accumulation with inplace addto.
REGISTER_OPERATOR(
grad_add, paddle::operators::ElementwiseOp,
paddle::operators::ElementwiseAddOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
grad_add,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -111,3 +111,10 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -111,3 +111,10 @@ REGISTER_OP_CUDA_KERNEL(
ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int64_t>, ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext,
plat::float16>); plat::float16>);
REGISTER_OP_CUDA_KERNEL(
grad_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>);
...@@ -61,8 +61,15 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx, ...@@ -61,8 +61,15 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *x,
const framework::Tensor *y, framework::Tensor *z) { const framework::Tensor *y, framework::Tensor *z) {
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>( auto x_dims = x->dims();
ctx, x, y, axis, FloorDivFunctor<T>(), z); auto y_dims = y->dims();
if (x_dims.size() >= y_dims.size()) {
ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, FloorDivFunctor<T>(), z);
} else {
ElementwiseComputeEx<InverseFloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, InverseFloorDivFunctor<T>(), z);
}
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
......
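
The floor_div dispatch above exists because ElementwiseComputeEx broadcasts its second operand over its first; when y has more dimensions than x the operands are passed swapped, and an inverse functor must divide in the opposite order. A toy sketch over floats (the real functors operate on integers) showing the two orders agree:

#include <cmath>
#include <cstdio>

struct FloorDiv {  // z = floor(a / b)
  float operator()(float a, float b) const { return std::floor(a / b); }
};
struct InverseFloorDiv {  // operands arrive swapped: z = floor(b / a)
  float operator()(float a, float b) const { return std::floor(b / a); }
};

int main() {
  // x = 7, y = 2: both call orders compute floor(7 / 2) = 3.
  std::printf("%f\n", FloorDiv()(7.f, 2.f));
  std::printf("%f\n", InverseFloorDiv()(2.f, 7.f));
  return 0;
}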
...@@ -55,31 +55,38 @@ class EmptyOp : public framework::OperatorWithKernel { ...@@ -55,31 +55,38 @@ class EmptyOp : public framework::OperatorWithKernel {
OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty"); OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty");
if (context->HasInput("ShapeTensor")) { if (context->HasInput("ShapeTensor")) {
auto dims = context->GetInputDim("ShapeTensor"); auto shape_dims = context->GetInputDim("ShapeTensor");
int num_ele = 1; int num_ele = 1;
for (int i = 0; i < dims.size(); ++i) { for (int i = 0; i < shape_dims.size(); ++i) {
num_ele *= dims[i]; num_ele *= shape_dims[i];
} }
auto vec_dims = std::vector<int>(num_ele, -1);
context->SetOutputDim("Out", framework::make_ddim({num_ele})); context->SetOutputDim("Out", framework::make_ddim(vec_dims));
} else if (context->HasInputs("ShapeTensorList")) { } else if (context->HasInputs("ShapeTensorList")) {
std::vector<int> out_dims; std::vector<int> out_dims;
auto dims_list = context->GetInputsDim("ShapeTensorList"); auto dims_list = context->GetInputsDim("ShapeTensorList");
for (size_t i = 0; i < dims_list.size(); ++i) { for (size_t i = 0; i < dims_list.size(); ++i) {
auto& dims = dims_list[i]; auto& dims = dims_list[i];
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(dims, framework::make_ddim({1}),
dims, framework::make_ddim({1}), platform::errors::InvalidArgument(
"ShapeError: The shape of Tensor in list must be [1]. " "The shape of Tensor in list must be [1]. "
"But received the shape " "But received the shape is [%s]",
"is [%s]", dims));
dims);
out_dims.push_back(-1);
out_dims.push_back(dims[0]);
} }
context->SetOutputDim("Out", framework::make_ddim(out_dims)); context->SetOutputDim("Out", framework::make_ddim(out_dims));
} else { } else {
auto& shape = context->Attrs().Get<std::vector<int64_t>>("shape"); auto& shape = context->Attrs().Get<std::vector<int64_t>>("shape");
for (size_t i = 0; i < shape.size(); ++i) {
PADDLE_ENFORCE_GE(
shape[i], 0,
platform::errors::InvalidArgument(
"Each value of attribute 'shape' is expected to be no less "
"than 0. But recieved: shape[%u] = %d; shape = [%s].",
i, shape[i], framework::make_ddim(shape)));
}
context->SetOutputDim("Out", framework::make_ddim(shape)); context->SetOutputDim("Out", framework::make_ddim(shape));
} }
} }
......
...@@ -174,7 +174,64 @@ struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> { ...@@ -174,7 +174,64 @@ struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
float>; float>;
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
auto* scale_data = scale.data<T>();
auto* in_data = in.data<T>();
auto* out_data = out->mutable_data<T>(ctx.GetPlace());
auto in_dims = in.dims();
const int64_t channel = in_dims[quant_axis];
platform::Transform<platform::CPUDeviceContext> trans;
if (quant_axis == 0) {
const int64_t channel_size = in.numel() / channel;
for (int i = 0; i < channel; i++) {
T s = scale_data[i];
auto* start = in_data + i * channel_size;
auto* end = in_data + (i + 1) * channel_size;
trans(ctx, start, end, out_data + i * channel_size,
ClipFunctor<T>(-s, s));
}
for (int i = 0; i < channel; i++) {
T s = scale_data[i];
T inv_s = inverse(s);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
out_e.device(*ctx.eigen_device()) =
(bin_cnt * inv_s * out_e).round() * s / static_cast<T>(bin_cnt);
}
} else if (quant_axis == 1) {
const int64_t step_i = in.numel() / in_dims[0];
const int64_t step_j = in.numel() / (in_dims[0] * in_dims[1]);
for (int i = 0; i < in_dims[0]; i++) {
for (int j = 0; j < in_dims[1]; j++) {
T s = scale_data[j];
T inv_s = inverse(s);
auto* start = in_data + i * step_i + j * step_j;
auto* end = in_data + i * step_i + (j + 1) * step_j;
auto* cur_out_data = out_data + i * step_i + j * step_j;
trans(ctx, start, end, cur_out_data, ClipFunctor<T>(-s, s));
for (int k = 0; k < step_j; k++) {
cur_out_data[k] = std::round(bin_cnt * inv_s * cur_out_data[k]) *
s / static_cast<T>(bin_cnt);
}
}
}
}
}
};
template struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext,
float>;
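
For quant_axis == 0 the CPU functor above clips each channel to [-s, s], quantizes to bin_cnt levels, and immediately dequantizes. A minimal host-only sketch of that round trip, with a hypothetical FakeQuantDequantChannel0 helper:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void FakeQuantDequantChannel0(const std::vector<float>& in,
                              const std::vector<float>& scales, int bin_cnt,
                              int channel_size, std::vector<float>* out) {
  for (size_t c = 0; c < scales.size(); ++c) {
    float s = scales[c];
    for (int i = 0; i < channel_size; ++i) {
      float x = in[c * channel_size + i];
      float v = std::min(std::max(x, -s), s);         // clip to [-s, s]
      v = std::round(bin_cnt * v / s) * s / bin_cnt;  // quantize + dequantize
      (*out)[c * channel_size + i] = v;
    }
  }
}

int main() {
  std::vector<float> in = {0.11f, -0.52f, 0.98f, -1.7f};
  std::vector<float> out(in.size());
  FakeQuantDequantChannel0(in, /*scales=*/{1.0f, 2.0f}, /*bin_cnt=*/127,
                           /*channel_size=*/2, &out);
  for (float v : out) std::printf("%f\n", v);
  return 0;
}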
template <typename T> template <typename T>
struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> { struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx, void operator()(const platform::CPUDeviceContext& ctx,
...@@ -360,6 +417,75 @@ $$0 \leq c \lt \ the\ channel\ number\ of\ X$$ ...@@ -360,6 +417,75 @@ $$0 \leq c \lt \ the\ channel\ number\ of\ X$$
} }
}; };
class FakeChannelWiseQuantizeDequantizeAbsMaxOp
: public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
"FakeChannelWiseQuantizeDequantizeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
"FakeChannelWiseQuantizeDequantizeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
"FakeChannelWiseQuantizeDequantizeAbsMax");
int quant_axis = ctx->Attrs().Get<int>("quant_axis");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input is float data type.");
AddOutput("Out",
"(Tensor) Output of quantized and dequantized low level tensor, "
"saved as float data type.");
AddOutput("OutScale", "(Tensor) Current channel wise scale");
AddAttr<int>("quant_axis",
"(int, default 0) The axis for quantization. "
"For conv2d, depthwise_conv2d, conv2d_transpose "
"and mul, the quant_axis is equal to the cout axis.")
.SetDefault(0)
.AddCustomChecker([](const int& quant_axis) {
PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument(
"'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
});
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
platform::errors::InvalidArgument(
"'bit_length' should be between 1 and 16, but "
"the received is %d",
bit_length));
});
AddComment(R"DOC(
The scale of FakeChannelWiseQuantizeDequantize operator is a vector.
In detail, each channel of the input X has a scale value.
$$scale_c = max(abs(X_c))$$
$$range = 2^{bit\_length - 1} - 1$$
$$Out_c = round(\frac{X_c * range} {scale_c}) * \frac{scale_c} {range}$$
In above three formulas, the range value of c is as follow:
$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
)DOC");
}
};
class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel { class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
public: public:
FakeQuantizeRangeAbsMaxOp(const std::string& type, FakeQuantizeRangeAbsMaxOp(const std::string& type,
...@@ -666,3 +792,12 @@ REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale, ...@@ -666,3 +792,12 @@ REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale,
REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp); REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp);
REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad, REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad,
ops::FakeQuantDequantGradKernel<CPU, float>); ops::FakeQuantDequantGradKernel<CPU, float>);
REGISTER_OPERATOR(fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker,
ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CPU, float>);
...@@ -417,8 +417,90 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> { ...@@ -417,8 +417,90 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
} }
}; };
template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, // ChannelClipAndQuantDequantKernel for quant_axis is 0
float>; template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis0(
const T* in, const T* scale, const int bin_cnt, const int n, const int c,
T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
// ChannelClipAndQuantDequantKernel for quant_axis is 1
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis1(
const T* in, const T* scale, const int bin_cnt, const int n, const int cin,
const int cout, T* out) {
T s = scale[blockIdx.x % cout];
T inv_s = inverse(s);
int wh_size = n / (cin * cout);
const T* in_c = in + blockIdx.x * wh_size;
T* out_c = out + blockIdx.x * wh_size;
for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
// At present, channelwise quantization supports conv2d, depthwise_conv2d
// conv2d_transpose and mul
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
int num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis0<
T><<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt,
num, in_dims[0], out_data);
} else if (quant_axis == 1) {
int grid = in_dims[0] * in_dims[1];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis1<
T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
}
}
};
template struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext,
float>;
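
The CUDA functor above launches one block per channel for quant_axis == 0, with a fixed 1024-thread block and each thread striding over the channel. A standalone sketch of that launch shape, with a simple per-channel scale standing in for clip + quant + dequant:

#include <cuda_runtime.h>
#include <cstdio>

__global__ void PerChannelScale(const float* in, const float* scale, int n,
                                int c, float* out) {
  int channel_size = n / c;
  const float* in_c = in + blockIdx.x * channel_size;  // one block per channel
  float* out_c = out + blockIdx.x * channel_size;
  float s = scale[blockIdx.x];
  for (int i = threadIdx.x; i < channel_size; i += blockDim.x) {
    out_c[i] = in_c[i] / s;  // stand-in for clip + quantize + dequantize
  }
}

int main() {
  const int c = 2, n = 8;
  float h_in[n] = {1, 2, 3, 4, 5, 6, 7, 8}, h_scale[c] = {1.f, 2.f}, h_out[n];
  float *d_in, *d_scale, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_scale, sizeof(h_scale));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  cudaMemcpy(d_scale, h_scale, sizeof(h_scale), cudaMemcpyHostToDevice);
  PerChannelScale<<<c, 1024>>>(d_in, d_scale, n, c, d_out);
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  for (float v : h_out) std::printf("%f ", v);
  cudaFree(d_in); cudaFree(d_scale); cudaFree(d_out);
  return 0;
}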
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -443,3 +525,6 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -443,3 +525,6 @@ REGISTER_OP_CUDA_KERNEL(
ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float>); ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad, REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad,
ops::FakeQuantDequantGradKernel<CUDA, float>); ops::FakeQuantDequantGradKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(
fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CUDA, float>);
...@@ -72,6 +72,13 @@ struct ChannelClipAndFakeQuantFunctor { ...@@ -72,6 +72,13 @@ struct ChannelClipAndFakeQuantFunctor {
const int quant_axis, framework::Tensor* out); const int quant_axis, framework::Tensor* out);
}; };
template <typename DeviceContext, typename T>
struct ChannelClipFakeQuantDequantFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in,
const framework::Tensor& scale, const int bin_cnt,
const int quant_axis, framework::Tensor* out);
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct FindMovingAverageAbsMaxFunctor { struct FindMovingAverageAbsMaxFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum, void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
...@@ -154,6 +161,30 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> { ...@@ -154,6 +161,30 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
} }
}; };
template <typename DeviceContext, typename T>
class FakeChannelWiseQuantizeDequantizeAbsMaxKernel
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
out->mutable_data<T>(dev_ctx.GetPlace());
int bit_length = context.Attr<int>("bit_length");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
int quant_axis = context.Attr<int>("quant_axis");
FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
out_scale_data);
ChannelClipFakeQuantDequantFunctor<DeviceContext, T>()(
dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out);
}
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> { class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
public: public:
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_gru_op.h" #include "paddle/fluid/operators/fused/fusion_gru_op.h"
#include <cstring> // for memcpy #include <cstring> // for memcpy
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/fc.h"
......
...@@ -192,6 +192,9 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> { ...@@ -192,6 +192,9 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
copy_size += src_mat_w_sz; copy_size += src_mat_w_sz;
} }
// fill data // fill data
if (context_start > 0) {
src_data += context_start * src_mat_w;
}
for (int j = 0; j < seq_len - up_pad - down_pad; ++j) { for (int j = 0; j < seq_len - up_pad - down_pad; ++j) {
std::memcpy(dst_data, src_data, copy_size); std::memcpy(dst_data, src_data, copy_size);
dst_data += col_mat_w; dst_data += col_mat_w;
...@@ -201,18 +204,15 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> { ...@@ -201,18 +204,15 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
std::memset(dst_data, 0, down_pad * col_mat_w_sz); std::memset(dst_data, 0, down_pad * col_mat_w_sz);
copy_size -= src_mat_w_sz; copy_size -= src_mat_w_sz;
for (int j = 0; j < down_pad; ++j) { for (int j = 0; j < down_pad; ++j) {
if (copy_size < 0) {
copy_size = 0;
}
std::memcpy(dst_data, src_data, copy_size); std::memcpy(dst_data, src_data, copy_size);
dst_data += col_mat_w; dst_data += col_mat_w;
src_data += src_mat_w; src_data += src_mat_w;
copy_size -= src_mat_w_sz; copy_size -= src_mat_w_sz;
} }
} else { } else {
PADDLE_ENFORCE_GE(context_length, up_pad + down_pad + 1,
platform::errors::InvalidArgument(
"context length must be bigger or equal than "
"up_pad + down_pad + 1, but received context "
"length is: %d, up_pad is: %d, down_pad is: %d.",
context_length, up_pad, down_pad));
std::memset(dst_data, 0, seq_len * col_mat_w_sz); std::memset(dst_data, 0, seq_len * col_mat_w_sz);
dst_data = dst_data + up_pad * src_mat_w; dst_data = dst_data + up_pad * src_mat_w;
int zero_sz = up_pad * src_mat_w_sz; int zero_sz = up_pad * src_mat_w_sz;
...@@ -226,9 +226,15 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> { ...@@ -226,9 +226,15 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
// from bottom // from bottom
dst_data = col_data + ed * col_mat_w; dst_data = col_data + ed * col_mat_w;
src_data = x_data + st * src_mat_w; src_data = x_data + st * src_mat_w;
if (context_start > 0) {
src_data += context_start * src_mat_w;
}
zero_sz = down_pad * src_mat_w_sz; zero_sz = down_pad * src_mat_w_sz;
for (int j = 1; j <= std::min(down_pad, seq_len); ++j) { for (int j = 1; j <= std::min(down_pad, seq_len); ++j) {
int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz); int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz);
if (copy_size < 0) {
copy_size = 0;
}
std::memcpy(dst_data - (zero_sz + copy_size) / sizeof(T), std::memcpy(dst_data - (zero_sz + copy_size) / sizeof(T),
src_data + std::max(seq_len - j - up_pad, 0) * src_mat_w, src_data + std::max(seq_len - j - up_pad, 0) * src_mat_w,
copy_size); copy_size);
......
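The changes above adjust src_data by context_start, add a PADDLE_ENFORCE_GE rejecting context_length < up_pad + down_pad + 1, and clamp copy_size at zero in the bottom-padding path. The clamp matters because copy_size shrinks by src_mat_w_sz per padded row and std::memcpy takes a size_t, so a negative int would wrap to a huge size. A toy trace of that shrinkage (sizes are illustrative):

#include <algorithm>
#include <cstdio>

int main() {
  // Illustrative byte sizes, not the kernel's real shapes.
  const int src_mat_w_sz = static_cast<int>(4 * sizeof(float));   // 16
  const int col_mat_w_sz = static_cast<int>(10 * sizeof(float));  // 40
  const int down_pad = 4;
  int copy_size = col_mat_w_sz;
  for (int j = 0; j < down_pad; ++j) {
    copy_size = std::max(copy_size, 0);  // the clamp added above
    std::printf("padded row %d copies %d bytes\n", j, copy_size);
    copy_size -= src_mat_w_sz;           // would reach -8 without the clamp
  }
  return 0;
}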
...@@ -87,7 +87,10 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> { ...@@ -87,7 +87,10 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
lod[0].assign(high_level.begin(), high_level.end()); lod[0].assign(high_level.begin(), high_level.end());
lod[1].assign(low_level.begin(), low_level.end()); lod[1].assign(low_level.begin(), low_level.end());
if (!framework::CheckLoD(lod)) { if (!framework::CheckLoD(lod)) {
PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); PADDLE_THROW(platform::errors::InvalidArgument(
"lod %s is not right in"
" beam_search, please check your code.",
framework::LoDToString(lod)));
} }
selected_ids->set_lod(lod); selected_ids->set_lod(lod);
selected_scores->set_lod(lod); selected_scores->set_lod(lod);
......
...@@ -400,7 +400,10 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> { ...@@ -400,7 +400,10 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
context.Wait(); context.Wait();
if (!framework::CheckLoD(selected_lod)) { if (!framework::CheckLoD(selected_lod)) {
PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod)); PADDLE_THROW(platform::errors::InvalidArgument(
"lod %s is not right in"
" beam_search, please check your code.",
framework::LoDToString(selected_lod)));
} }
selected_ids->set_lod(selected_lod); selected_ids->set_lod(selected_lod);
......
...@@ -20,7 +20,11 @@ namespace operators { ...@@ -20,7 +20,11 @@ namespace operators {
namespace math { namespace math {
MatDescriptor CreateMatrixDescriptor(const framework::DDim &tensor_dim, MatDescriptor CreateMatrixDescriptor(const framework::DDim &tensor_dim,
int num_flatten_cols, bool trans) { int num_flatten_cols, bool trans) {
PADDLE_ENFORCE_GT(tensor_dim.size(), 1); PADDLE_ENFORCE_GT(
tensor_dim.size(), 1,
platform::errors::InvalidArgument("The tensor dim size should be greater "
"than 1, but reveived dim size is %d",
tensor_dim.size()));
MatDescriptor retv; MatDescriptor retv;
if (num_flatten_cols > 1) { if (num_flatten_cols > 1) {
auto flatten_dim = framework::flatten_to_2d(tensor_dim, num_flatten_cols); auto flatten_dim = framework::flatten_to_2d(tensor_dim, num_flatten_cols);
......
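For context, flatten_to_2d folds the first num_flatten_cols dimensions of a tensor into the row count and the remaining ones into the column count, which is how CreateMatrixDescriptor turns a high-rank tensor into a GEMM-ready matrix. A self-contained sketch of that fold (FlattenTo2d is a stand-in, not the framework:: helper):

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Stand-in for framework::flatten_to_2d: fold the first num_flatten_cols
// dims into rows, the rest into columns.
std::pair<int64_t, int64_t> FlattenTo2d(const std::vector<int64_t>& dims,
                                        int num_flatten_cols) {
  int64_t rows = 1, cols = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    (i < num_flatten_cols ? rows : cols) *= dims[i];
  }
  return {rows, cols};
}

int main() {
  std::vector<int64_t> dims = {2, 3, 4, 5};  // a rank-4 tensor
  auto mat = FlattenTo2d(dims, 2);           // -> {2*3, 4*5} = {6, 20}
  std::printf("%lld x %lld\n", static_cast<long long>(mat.first),
              static_cast<long long>(mat.second));
  return 0;
}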
...@@ -60,7 +60,8 @@ struct CUBlas<float> { ...@@ -60,7 +60,8 @@ struct CUBlas<float> {
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasSgemmStridedBatched(args...)); platform::dynload::cublasSgemmStridedBatched(args...));
#else #else
PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5"); PADDLE_THROW(platform::errors::Unimplemented(
"SgemmStridedBatched is not supported on cuda <= 7.5"));
#endif #endif
} }
...@@ -85,7 +86,8 @@ struct CUBlas<float> { ...@@ -85,7 +86,8 @@ struct CUBlas<float> {
beta, C, Ctype, ldc)); beta, C, Ctype, ldc));
}); });
#else #else
PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); PADDLE_THROW(platform::errors::Unimplemented(
"cublasSgemmEx is not supported on cuda <= 7.5"));
#endif #endif
} }
...@@ -146,13 +148,15 @@ struct CUBlas<double> { ...@@ -146,13 +148,15 @@ struct CUBlas<double> {
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasDgemmStridedBatched(args...)); platform::dynload::cublasDgemmStridedBatched(args...));
#else #else
PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5"); PADDLE_THROW(platform::errors::Unimplemented(
"DgemmStridedBatched is not supported on cuda <= 7.5"));
#endif #endif
} }
template <typename... ARGS> template <typename... ARGS>
static void GEMM_EX(ARGS... args) { static void GEMM_EX(ARGS... args) {
PADDLE_THROW("Currently there are not cublasDgemmEx."); PADDLE_THROW(platform::errors::Unimplemented(
"Currently there are not cublasDgemmEx."));
} }
template <typename... ARGS> template <typename... ARGS>
...@@ -216,7 +220,8 @@ struct CUBlas<platform::float16> { ...@@ -216,7 +220,8 @@ struct CUBlas<platform::float16> {
reinterpret_cast<const __half *>(beta), reinterpret_cast<__half *>(C), reinterpret_cast<const __half *>(beta), reinterpret_cast<__half *>(C),
ldc, strideC, batchCount)); ldc, strideC, batchCount));
#else #else
PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5"); PADDLE_THROW(platform::errors::Unimplemented(
"HgemmStridedBatched is not supported on cuda <= 7.5"));
#endif #endif
} }
...@@ -247,7 +252,8 @@ struct CUBlas<platform::float16> { ...@@ -247,7 +252,8 @@ struct CUBlas<platform::float16> {
beta, C, Ctype, ldc, computeType, algo)); beta, C, Ctype, ldc, computeType, algo));
}); });
#else #else
PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); PADDLE_THROW(platform::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif #endif
} }
}; };
...@@ -302,8 +308,12 @@ inline void Blas<platform::CUDADeviceContext>::GEMM( ...@@ -302,8 +308,12 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case // TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53, PADDLE_ENFORCE_GE(
"cublas fp16 gemm requires GPU compute capability >= 53"); context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha); float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta); float h_beta = static_cast<float>(beta);
......
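The strengthened enforce above keeps the existing rule: fp16 GEMM through cuBLAS needs compute capability >= 53 (sm_53 and newer). A hedged sketch of just that guard, with the capability passed in as a plain int instead of queried from the device context:

#include <cstdio>
#include <stdexcept>

// 'capability' would come from context_.GetComputeCapability() in the real
// code; here it is a plain argument so the sketch stands alone.
void CheckFp16GemmSupported(int capability) {
  if (capability < 53) {
    throw std::invalid_argument(
        "cublas fp16 gemm requires GPU compute capability >= 53");
  }
}

int main() {
  try {
    CheckFp16GemmSupported(35);  // Kepler-era: too old for fp16 cuBLAS GEMM
  } catch (const std::invalid_argument& e) {
    std::printf("%s\n", e.what());
  }
  CheckFp16GemmSupported(70);  // Volta: fine
  return 0;
}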
...@@ -29,7 +29,8 @@ template <> ...@@ -29,7 +29,8 @@ template <>
struct CBlas<int8_t> { struct CBlas<int8_t> {
template <typename... ARGS> template <typename... ARGS>
static void VCOPY(ARGS... args) { static void VCOPY(ARGS... args) {
PADDLE_THROW("Blas VCOPY don't support int8_t"); PADDLE_THROW(platform::errors::Unimplemented(
"Blas VCOPY do not supported on CPU, please check your code"));
} }
}; };
...@@ -347,22 +348,47 @@ struct CBlas<double> { ...@@ -347,22 +348,47 @@ struct CBlas<double> {
template <> template <>
struct CBlas<platform::float16> { struct CBlas<platform::float16> {
static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } static void GEMM(...) {
PADDLE_THROW(platform::errors::Unimplemented(
"float16 GEMM not supported on CPU, please check your code"));
}
static void SMM_GEMM(...) { static void SMM_GEMM(...) {
PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); PADDLE_THROW(platform::errors::Unimplemented(
"float16 SMM_GEMM not supported on CPU, please check your code"));
} }
static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } static void VMUL(...) {
static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } PADDLE_THROW(platform::errors::Unimplemented(
static void VSQUARE(...) { "float16 VMUL not supported on CPU, please check your code"));
PADDLE_THROW("float16 VSQUARE not supported on CPU");
} }
static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } static void VEXP(...) {
static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; PADDLE_THROW(platform::errors::Unimplemented(
static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; "float16 VEXP not supported on CPU, please check your code"));
static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); }; }
static void VSQUARE(...) {
PADDLE_THROW(platform::errors::Unimplemented(
"float16 VSQUARE not supported on CPU, please check your code"));
}
static void VPOW(...) {
PADDLE_THROW(platform::errors::Unimplemented(
"float16 VPOW not supported on CPU, please check your code"));
}
static void DOT(...) {
PADDLE_THROW(platform::errors::Unimplemented(
"float16 DOT not supported on CPU, please check your code"));
};
static void SCAL(...) {
PADDLE_THROW(platform::errors::Unimplemented(
"float16 SCAL not supported on CPU, please check your code"));
};
static void ASUM(...) {
PADDLE_THROW(platform::errors::Unimplemented(
"float16 ASUM not supported on CPU, please check your code"));
};
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
static void GEMM_BATCH(...) { static void GEMM_BATCH(...) {
PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); PADDLE_THROW(platform::errors::Unimplemented(
"float16 GEMM_BATCH not supported on CPU, please check your code"));
} }
#endif #endif
}; };
...@@ -446,11 +472,18 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a, bool trans_a, ...@@ -446,11 +472,18 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a, bool trans_a,
auto dim_a = mat_a.dims(); auto dim_a = mat_a.dims();
auto dim_b = mat_b.dims(); auto dim_b = mat_b.dims();
auto dim_out = mat_out->dims(); auto dim_out = mat_out->dims();
PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, PADDLE_ENFORCE_EQ(
"The input and output of matmul be matrix"); dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, true,
PADDLE_ENFORCE( platform::errors::InvalidArgument(
mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(), "The input and output of matmul should be matrices, the dim size must "
"The places of matrices must be same"); "be 2, "
"but received dim size input_a:%d, input_b:%d, output:%d",
dim_a.size(), dim_b.size(), dim_out.size()));
PADDLE_ENFORCE_EQ(
mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(), true,
platform::errors::InvalidArgument("The places of matrices in the matmul "
"should be same, please check your "
"code."));
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
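After those two checks pass, the GEMM sizes fall straight out of the dims: M and N come from the output and K from the (possibly transposed) first input, per the usual CBLAS convention. A worked example with toy shapes:

#include <cstdio>
#include <vector>

int main() {
  // Illustrative 2-D shapes: A is 6x8, B is 8x4, Out is 6x4.
  std::vector<int> dim_a = {6, 8}, dim_b = {8, 4}, dim_out = {6, 4};
  bool trans_a = false;
  int M = dim_out[0];                      // output rows
  int N = dim_out[1];                      // output cols
  int K = !trans_a ? dim_a[1] : dim_a[0];  // shared inner dimension
  std::printf("GEMM sizes: M=%d N=%d K=%d (B height %d must equal K)\n", M, N,
              K, dim_b[0]);
  return 0;
}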
...@@ -715,7 +748,13 @@ void Blas<platform::CPUDeviceContext>::BatchedGEMMWithHead( ...@@ -715,7 +748,13 @@ void Blas<platform::CPUDeviceContext>::BatchedGEMMWithHead(
} }
} else { } else {
PADDLE_ENFORCE_EQ(W1, H2); PADDLE_ENFORCE_EQ(
W1, H2,
platform::errors::InvalidArgument(
"The fisrt matrix width should be same as second matrix height,"
"but received fisrt matrix width %d"
", second matrix height %d",
W1, H2));
int ldc = W2 * head_number; int ldc = W2 * head_number;
int sub_width = W1 / head_number; int sub_width = W1 / head_number;
...@@ -785,7 +824,14 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a, ...@@ -785,7 +824,14 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
const framework::Tensor &mat_b, const framework::Tensor &mat_b,
const MatDescriptor &dim_b, T alpha, const MatDescriptor &dim_b, T alpha,
framework::Tensor *mat_out, T beta) const { framework::Tensor *mat_out, T beta) const {
PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_); PADDLE_ENFORCE_EQ(
dim_a.width_, dim_b.height_,
platform::errors::InvalidArgument(
"The fisrt matrix width should be same as second matrix height,"
"but received fisrt matrix width %d"
", second matrix height %d",
dim_a.width_, dim_b.height_));
CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
...@@ -793,12 +839,14 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a, ...@@ -793,12 +839,14 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
dim_a.width_, alpha, mat_a.data<T>(), dim_a.width_, alpha, mat_a.data<T>(),
mat_b.data<T>(), beta, mat_out->data<T>()); mat_b.data<T>(), beta, mat_out->data<T>());
} else { } else {
PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || PADDLE_ENFORCE_EQ(
dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0, dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 ||
"dim_a.batch_size should be equal to dim_b.batch_size, or " dim_b.batch_size_ == 0,
"one of dim_a.batch_size and dim_b.batch_size should be 0. " true, platform::errors::InvalidArgument(
"But got dim_a.batch_size = %d, dim_b.batch_size = %d.", "dim_a.batch_size should be equal to dim_b.batch_size, or "
dim_a.batch_size_, dim_b.batch_size_); "one of dim_a.batch_size and dim_b.batch_size should be 0. "
"But got dim_a.batch_size = %d, dim_b.batch_size = %d.",
dim_a.batch_size_, dim_b.batch_size_));
this->template BatchedGEMM<T>( this->template BatchedGEMM<T>(
transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha, transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha,
mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(), mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(),
...@@ -834,15 +882,42 @@ void Blas<DeviceContext>::MatMulWithHead(const framework::Tensor &mat_a, ...@@ -834,15 +882,42 @@ void Blas<DeviceContext>::MatMulWithHead(const framework::Tensor &mat_a,
int head_number, int head_number,
framework::Tensor *mat_out, T beta, framework::Tensor *mat_out, T beta,
bool mat_b_split_vertical) const { bool mat_b_split_vertical) const {
PADDLE_ENFORCE_EQ(dim_a.width_ % head_number, 0); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_GE(head_number, 1); dim_a.width_ % head_number, 0,
PADDLE_ENFORCE_LE(head_number, dim_a.width_); platform::errors::InvalidArgument(
"The first input width must be some times the head number"
"but received first input width %d"
", head_number %d",
dim_a.width_, head_number));
PADDLE_ENFORCE_GE(head_number, 1,
platform::errors::InvalidArgument(
"The head number should be greater equal 1,"
"but received head number %d",
head_number));
PADDLE_ENFORCE_LE(
head_number, dim_a.width_,
platform::errors::InvalidArgument(
"The head number should be less equal first input width,"
"but received first input width %d"
", head_number %d",
dim_a.width_, head_number));
CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
if (mat_b_split_vertical) { if (mat_b_split_vertical) {
PADDLE_ENFORCE_EQ(dim_b.height_, dim_a.width_ / head_number); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(dim_b.width_ % head_number, 0); dim_b.height_, dim_a.width_ / head_number,
platform::errors::InvalidArgument(
"The second input height should be equal than first input width,"
"but received second input height %d, first input width %d",
dim_b.height_, dim_a.width_ / head_number));
PADDLE_ENFORCE_EQ(
dim_b.width_ % head_number, 0,
platform::errors::InvalidArgument(
"The second input width should be a multiple of the head number, "
"but received second input width %d"
", head_number %d",
dim_b.width_, head_number));
} }
if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
...@@ -888,9 +963,16 @@ void Blas<DeviceContext>::MatMulWithHead(const framework::Tensor &mat_a, ...@@ -888,9 +963,16 @@ void Blas<DeviceContext>::MatMulWithHead(const framework::Tensor &mat_a,
mat_out->data<T>() + sub_matC_offset, ldc); mat_out->data<T>() + sub_matC_offset, ldc);
} }
} else { } else {
PADDLE_ENFORCE_EQ((dim_a.batch_size_ == dim_b.batch_size_ || PADDLE_ENFORCE_EQ(
dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0), (dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 ||
true); dim_b.batch_size_ == 0),
true,
platform::errors::InvalidArgument(
"The first input batch size should be equal than second input,"
"either two input batch size is 0, but received first input batch "
"size"
" %d, second input batch size %d",
dim_a.batch_size_, dim_b.batch_size_));
this->template BatchedGEMMWithHead<T>( this->template BatchedGEMMWithHead<T>(
transA, transB, dim_a.width_, dim_a.height_, dim_b.width_, transA, transB, dim_a.width_, dim_a.height_, dim_b.width_,
......
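Taken together, the three enforces above pin down the multi-head contract: head_number is at least 1, at most the first input's width, and divides it evenly, so each head multiplies a sub_width-wide slice. A toy check with illustrative numbers:

#include <cassert>
#include <cstdio>

int main() {
  int width = 64, head_number = 8;  // illustrative sizes
  assert(width % head_number == 0);
  assert(head_number >= 1 && head_number <= width);
  int sub_width = width / head_number;  // each head sees an 8-wide slice
  std::printf("each of %d heads multiplies a %d-wide slice\n", head_number,
              sub_width);
  return 0;
}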
...@@ -22,10 +22,12 @@ limitations under the License. */ ...@@ -22,10 +22,12 @@ limitations under the License. */
#include <cblas.h> #include <cblas.h>
#endif #endif
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -63,6 +65,55 @@ DEFINE_CPU_TRANS(4); ...@@ -63,6 +65,55 @@ DEFINE_CPU_TRANS(4);
DEFINE_CPU_TRANS(5); DEFINE_CPU_TRANS(5);
DEFINE_CPU_TRANS(6); DEFINE_CPU_TRANS(6);
template <typename T>
struct TransposeNormal<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& in, framework::Tensor* out,
const std::vector<int>& axis) {
const int rank = axis.size();
auto in_stride = framework::stride(in.dims());
auto out_stride = framework::stride(out->dims());
const T* in_ptr = in.data<T>();
T* out_ptr = out->data<T>();
auto transpose_helper = [&](int64_t beg, int64_t end) {
for (int64_t out_idx = beg; out_idx < end; ++out_idx) {
int64_t in_idx = 0;
int64_t tmp_idx = out_idx;
// calculate the input index
for (int i = 0; i < rank; ++i) {
const int64_t coordinate = tmp_idx / out_stride[i];
tmp_idx -= coordinate * out_stride[i];
in_idx += coordinate * in_stride[axis[i]];
}
out_ptr[out_idx] = in_ptr[in_idx];
}
};
double cost_per_iteration =
rank * (Eigen::TensorOpCost::DivCost<int64_t>() +
2 * Eigen::TensorOpCost::MulCost<int64_t>() +
2 * Eigen::TensorOpCost::AddCost<int64_t>());
Eigen::TensorOpCost cost(sizeof(T), sizeof(T), cost_per_iteration);
auto* cpu_device = context.eigen_pool_device();
cpu_device->parallelFor(out->numel(), cost, std::move(transpose_helper));
}
};
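The index arithmetic inside transpose_helper is the core of TransposeNormal: decompose each output offset against the output strides, then gather from the input through the permuted strides. A self-contained serial version of the same math (no Eigen, no thread pool; a sketch, not the Paddle code path):

#include <cstdint>
#include <cstdio>
#include <vector>

// Row-major strides: stride[i] = product of dims[i+1..].
std::vector<int64_t> Strides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> s(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i)
    s[i] = s[i + 1] * dims[i + 1];
  return s;
}

int main() {
  std::vector<int64_t> in_dims = {2, 3};   // 2x3 input, row-major
  std::vector<int64_t> out_dims = {3, 2};  // its transpose
  std::vector<int> axis = {1, 0};
  std::vector<float> in = {0, 1, 2, 3, 4, 5}, out(6);
  auto in_stride = Strides(in_dims);
  auto out_stride = Strides(out_dims);
  const int rank = static_cast<int>(axis.size());
  for (int64_t out_idx = 0; out_idx < 6; ++out_idx) {
    int64_t in_idx = 0, tmp_idx = out_idx;
    for (int i = 0; i < rank; ++i) {
      const int64_t coordinate = tmp_idx / out_stride[i];
      tmp_idx -= coordinate * out_stride[i];
      in_idx += coordinate * in_stride[axis[i]];
    }
    out[out_idx] = in[in_idx];
  }
  for (float v : out) std::printf("%g ", v);  // prints 0 3 1 4 2 5
  std::printf("\n");
  return 0;
}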
// define transpose normal
#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<platform::CPUDeviceContext, TYPE>
DEFINE_CPU_TRANS_NORMAL(platform::float16);
DEFINE_CPU_TRANS_NORMAL(platform::bfloat16);
DEFINE_CPU_TRANS_NORMAL(float);
DEFINE_CPU_TRANS_NORMAL(double);
DEFINE_CPU_TRANS_NORMAL(int);
DEFINE_CPU_TRANS_NORMAL(int64_t);
DEFINE_CPU_TRANS_NORMAL(bool);
DEFINE_CPU_TRANS_NORMAL(int16_t);
DEFINE_CPU_TRANS_NORMAL(uint8_t);
DEFINE_CPU_TRANS_NORMAL(int8_t);
struct TensorSetConstantCPU { struct TensorSetConstantCPU {
TensorSetConstantCPU(framework::Tensor* tensor, float value) TensorSetConstantCPU(framework::Tensor* tensor, float value)
: tensor_(tensor), value_(value) {} : tensor_(tensor), value_(value) {}
......
...@@ -11,8 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,8 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/operators/math/math_function_impl.h"
...@@ -23,6 +26,7 @@ namespace operators { ...@@ -23,6 +26,7 @@ namespace operators {
namespace math { namespace math {
using float16 = paddle::platform::float16; using float16 = paddle::platform::float16;
using bfloat16 = paddle::platform::bfloat16;
template struct SetConstant<platform::CUDADeviceContext, platform::float16>; template struct SetConstant<platform::CUDADeviceContext, platform::float16>;
template struct SetConstant<platform::CUDADeviceContext, float>; template struct SetConstant<platform::CUDADeviceContext, float>;
...@@ -31,12 +35,13 @@ template struct SetConstant<platform::CUDADeviceContext, int>; ...@@ -31,12 +35,13 @@ template struct SetConstant<platform::CUDADeviceContext, int>;
template struct SetConstant<platform::CUDADeviceContext, int64_t>; template struct SetConstant<platform::CUDADeviceContext, int64_t>;
template struct SetConstant<platform::CUDADeviceContext, bool>; template struct SetConstant<platform::CUDADeviceContext, bool>;
#define DEFINE_GPU_TRANS(RANK) \ #define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<platform::CUDADeviceContext, float, RANK>; \ template struct Transpose<platform::CUDADeviceContext, float, RANK>; \
template struct Transpose<platform::CUDADeviceContext, double, RANK>; \ template struct Transpose<platform::CUDADeviceContext, double, RANK>; \
template struct Transpose<platform::CUDADeviceContext, float16, RANK>; \ template struct Transpose<platform::CUDADeviceContext, float16, RANK>; \
template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>; \ template struct Transpose<platform::CUDADeviceContext, bfloat16, RANK>; \
template struct Transpose<platform::CUDADeviceContext, int32_t, RANK>; \ template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>; \
template struct Transpose<platform::CUDADeviceContext, int32_t, RANK>; \
template struct Transpose<platform::CUDADeviceContext, int64_t, RANK>; template struct Transpose<platform::CUDADeviceContext, int64_t, RANK>;
DEFINE_GPU_TRANS(1); DEFINE_GPU_TRANS(1);
...@@ -46,6 +51,88 @@ DEFINE_GPU_TRANS(4); ...@@ -46,6 +51,88 @@ DEFINE_GPU_TRANS(4);
DEFINE_GPU_TRANS(5); DEFINE_GPU_TRANS(5);
DEFINE_GPU_TRANS(6); DEFINE_GPU_TRANS(6);
#define REINTERPRET(T, DST_PTR, SRC_PTR) \
T* DST_PTR = reinterpret_cast<T*>(SRC_PTR)
template <typename T>
__global__ void TransposeNormalKernel(const T* in_ptr, T* out_ptr,
int64_t element,
const int64_t* in_stride_ptr,
const int64_t* out_stride_ptr,
const int64_t* axis_ptr, int rank) {
CUDA_KERNEL_LOOP(out_idx, element) {
int64_t in_idx = 0;
int64_t tmp_idx = out_idx;
for (int i = 0; i < rank; ++i) {
const int64_t coordinate = tmp_idx / out_stride_ptr[i];
tmp_idx -= coordinate * out_stride_ptr[i];
in_idx += coordinate * in_stride_ptr[axis_ptr[i]];
}
out_ptr[out_idx] = in_ptr[in_idx];
}
}
template <typename T>
struct TransposeNormal<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& in, framework::Tensor* out,
const std::vector<int>& axis) {
const int rank = axis.size();
auto in_stride = framework::stride(in.dims());
auto out_stride = framework::stride(out->dims());
auto* in_ptr = in.data<T>();
auto* out_ptr = out->data<T>();
// copy in_stride, out_stride, axis to gpu device
const platform::CUDAPlace& cuda_place =
BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace());
platform::CPUPlace cpu_place = platform::CPUPlace();
size_t size = 3 * rank * sizeof(int64_t);
auto cpu_buf_holder = memory::AllocShared(cpu_place, size);
auto cuda_buf_holder = memory::AllocShared(cuda_place, size);
REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
for (int i = 0; i < rank; ++i) {
cpu_buf[i] = in_stride[i];
cpu_buf[rank + i] = out_stride[i];
cpu_buf[2 * rank + i] = axis[i];
}
memory::Copy(cuda_place, cuda_buf, cpu_place, cpu_buf, size,
context.stream());
REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank);
const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock();
const int MAX_GRID_DIM =
context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
int64_t elements = in.numel();
int block_size = (elements >= MAX_BLOCK_DIM)
? MAX_BLOCK_DIM
: (1 << static_cast<int>(std::log2(elements)));
int grid_size = elements / block_size;
grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
TransposeNormalKernel<T><<<grid_size, block_size, 0, context.stream()>>>(
in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr,
rank);
}
};
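The launch sizing above takes the largest power-of-two block that does not exceed the element count (capped at the device maximum) and then caps the grid by the physical thread count. A plain C++ sketch of that computation with made-up device limits:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative device limits, not queried from a real GPU.
  const int kMaxBlockDim = 1024;
  const int kMaxGridDim = 65536 / kMaxBlockDim;  // 64
  int64_t elements = 300;
  int block_size = (elements >= kMaxBlockDim)
                       ? kMaxBlockDim
                       : (1 << static_cast<int>(std::log2(elements)));
  int grid_size = static_cast<int>(elements / block_size);
  grid_size = (grid_size >= kMaxGridDim) ? kMaxGridDim : grid_size;
  std::printf("launch <<<%d, %d>>> for %lld elements\n", grid_size, block_size,
              static_cast<long long>(elements));  // <<<1, 256>>> for 300
  return 0;
}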
// define transpose normal
#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<platform::CUDADeviceContext, TYPE>
DEFINE_GPU_TRANS_NORMAL(float16);
DEFINE_GPU_TRANS_NORMAL(bfloat16);
DEFINE_GPU_TRANS_NORMAL(float);
DEFINE_GPU_TRANS_NORMAL(double);
DEFINE_GPU_TRANS_NORMAL(int);
DEFINE_GPU_TRANS_NORMAL(int64_t);
DEFINE_GPU_TRANS_NORMAL(bool);
DEFINE_GPU_TRANS_NORMAL(int16_t);
DEFINE_GPU_TRANS_NORMAL(uint8_t);
DEFINE_GPU_TRANS_NORMAL(int8_t);
struct TensorSetConstantGPU { struct TensorSetConstantGPU {
TensorSetConstantGPU(const platform::DeviceContext& context, TensorSetConstantGPU(const platform::DeviceContext& context,
framework::Tensor* tensor, float value) framework::Tensor* tensor, float value)
......
...@@ -26,6 +26,14 @@ limitations under the License. */ ...@@ -26,6 +26,14 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename DeviceContext, typename T>
struct TransposeNormal {
// for dims >= 7 situation
void operator()(const DeviceContext& context, const framework::Tensor& in,
framework::Tensor* out, const std::vector<int>& axis);
};
template <typename DeviceContext, typename T, int Rank> template <typename DeviceContext, typename T, int Rank>
struct Transpose { struct Transpose {
void operator()(const DeviceContext& context, const framework::Tensor& in, void operator()(const DeviceContext& context, const framework::Tensor& in,
......
...@@ -143,4 +143,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) ...@@ -143,4 +143,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker); REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>); rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -15,4 +15,5 @@ limitations under the License. */ ...@@ -15,4 +15,5 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>); rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -18,9 +18,10 @@ limitations under the License. */ ...@@ -18,9 +18,10 @@ limitations under the License. */
#include <set> #include <set>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/cast_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
namespace paddle { namespace paddle {
...@@ -34,6 +35,110 @@ namespace operators { ...@@ -34,6 +35,110 @@ namespace operators {
} }
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using DDim = framework::DDim;
inline void GetShuffledDim(const DDim& src_dims, DDim* dst_dims,
const std::vector<int>& reduced_dims,
std::vector<int>* perm_axis) {
// check if it's a reduced dim
std::vector<bool> src_dims_check(src_dims.size(), false);
size_t src_size = src_dims.size();
size_t reduce_size = reduced_dims.size();
for (size_t i = 0; i < reduce_size; ++i) {
dst_dims->at(src_size - reduce_size + i) = src_dims[reduced_dims[i]];
(*perm_axis)[src_size - reduce_size + i] = reduced_dims[i];
src_dims_check[reduced_dims[i]] = true;
}
size_t offset = 0;
for (size_t i = 0; i < src_dims_check.size(); ++i) {
bool is_reduced = src_dims_check[i];
if (!is_reduced) {
(*perm_axis)[offset] = i;
dst_dims->at(offset++) = src_dims[i];
}
}
}
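GetShuffledDim moves every reduced dimension to the tail of the shape and records, in perm_axis, the transpose that realizes the move; HandleLargeDim below then views the shuffled tensor as {unreduced, reduced} and reduces along axis 1. A worked example for a rank-7 tensor (shapes are made up):

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> src = {2, 3, 4, 5, 6, 7, 8};  // rank-7 dims
  std::vector<int> reduced = {1, 4};             // reduce axes 1 and 4
  const size_t n = src.size(), r = reduced.size();
  std::vector<int> dst(n), perm(n);
  std::vector<bool> is_reduced(n, false);
  for (size_t i = 0; i < r; ++i) {
    dst[n - r + i] = src[reduced[i]];  // reduced extents go to the tail
    perm[n - r + i] = reduced[i];
    is_reduced[reduced[i]] = true;
  }
  size_t offset = 0;
  for (size_t i = 0; i < n; ++i) {
    if (!is_reduced[i]) {
      perm[offset] = static_cast<int>(i);
      dst[offset++] = src[i];
    }
  }
  // dst  = {2, 4, 5, 7, 8, 3, 6}  (reduced extents 3 and 6 at the tail)
  // perm = {0, 2, 3, 5, 6, 1, 4}  (the transpose that realizes the move)
  for (int d : dst) std::printf("%d ", d);
  std::printf("\n");
  return 0;
}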
template <typename DeviceContext, typename OutT>
void GetShuffledInput(const framework::ExecutionContext& context,
const Tensor* input, Tensor* shuffled_input,
const std::vector<int>& dims) {
DDim shuffled_dims(input->dims());
std::vector<int> perm_axis(input->dims().size());
GetShuffledDim(input->dims(), &shuffled_dims, dims, &perm_axis);
shuffled_input->Resize(shuffled_dims);
shuffled_input->mutable_data<OutT>(context.GetPlace());
math::TransposeNormal<DeviceContext, OutT> trans;
trans(context.template device_context<DeviceContext>(), *input,
shuffled_input, perm_axis);
}
inline void GetOriginDimFromShuffled(const DDim& src_dim,
const std::vector<int>& dims,
std::vector<int>* origin_dim) {
DDim shuffled_dims(src_dim);
size_t n = src_dim.size();
std::vector<int> perm_axis(n);
GetShuffledDim(src_dim, &shuffled_dims, dims, &perm_axis);
for (size_t i = 0; i < n; ++i) {
(*origin_dim)[perm_axis[i]] = i;
}
}
template <typename DeviceContext, typename OutT, typename Functor>
void HandleLargeDim(const framework::ExecutionContext& context,
const Tensor* input, Tensor* output,
const std::vector<int>& dims, bool keep_dim) {
// shuffle the reduced dim to the end
Tensor shuffled_input;
GetShuffledInput<DeviceContext, OutT>(context, input, &shuffled_input, dims);
// transpose to 2D tensor whose shape is {unreduced, reduced}.
const int64_t unreduced = output->numel();
const int64_t reduced = shuffled_input.numel() / unreduced;
shuffled_input.Resize({unreduced, reduced});
DDim output_dim = output->dims();
output->Resize({unreduced});
ReduceFunctor<DeviceContext, OutT, 2, 1, Functor>(
context.template device_context<DeviceContext>(), shuffled_input, output,
{1}, keep_dim);
output->Resize(output_dim);
}
template <typename DeviceContext, typename T, typename Functor>
void HandleLargeDimGrad(const framework::ExecutionContext& context,
const framework::Tensor* x,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
const std::vector<int>& dims) {
const int64_t unreduced = out->numel();
const int64_t reduced = x->numel() / unreduced;
DDim out_dim(out->dims());
DDim x_dim(x->dims());
// transpose and reshape X
Tensor shuffled_x;
GetShuffledInput<DeviceContext, T>(context, x, &shuffled_x, dims);
DDim shuffled_dim = shuffled_x.dims();
shuffled_x.Resize({unreduced, reduced});
// reshape dX {unreduced, reduced}
dx->Resize({unreduced, reduced});
ReduceGradFunctor<DeviceContext, T, 2, Functor>(
context.template device_context<DeviceContext>(), shuffled_x, *out, *dout,
dx, {1});
// transpose dX
std::vector<int> origin_axis(x_dim.size());
GetOriginDimFromShuffled(x_dim, dims, &origin_axis);
Tensor dx_tmp;
framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp);
dx_tmp.Resize(shuffled_dim);
dx->Resize(x_dim);
math::TransposeNormal<DeviceContext, T> trans;
trans(context.template device_context<DeviceContext>(), dx_tmp, dx,
origin_axis);
}
template <typename DeviceContext, typename T, typename Functor> template <typename DeviceContext, typename T, typename Functor>
struct ReduceKernelFunctor { struct ReduceKernelFunctor {
...@@ -69,22 +174,27 @@ struct ReduceKernelFunctor { ...@@ -69,22 +174,27 @@ struct ReduceKernelFunctor {
} else { } else {
int ndim = input->dims().size(); int ndim = input->dims().size();
int rdim = dims.size(); int rdim = dims.size();
HANDLE_DIM(6, 5); if (ndim > 6) {
HANDLE_DIM(6, 4); HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
HANDLE_DIM(6, 3); dims, keep_dim);
HANDLE_DIM(6, 2); } else {
HANDLE_DIM(6, 1); HANDLE_DIM(6, 5);
HANDLE_DIM(5, 4); HANDLE_DIM(6, 4);
HANDLE_DIM(5, 3); HANDLE_DIM(6, 3);
HANDLE_DIM(5, 2); HANDLE_DIM(6, 2);
HANDLE_DIM(5, 1); HANDLE_DIM(6, 1);
HANDLE_DIM(4, 3); HANDLE_DIM(5, 4);
HANDLE_DIM(4, 2); HANDLE_DIM(5, 3);
HANDLE_DIM(4, 1); HANDLE_DIM(5, 2);
HANDLE_DIM(3, 2); HANDLE_DIM(5, 1);
HANDLE_DIM(3, 1); HANDLE_DIM(4, 3);
HANDLE_DIM(2, 1); HANDLE_DIM(4, 2);
HANDLE_DIM(1, 1); HANDLE_DIM(4, 1);
HANDLE_DIM(3, 2);
HANDLE_DIM(3, 1);
HANDLE_DIM(2, 1);
HANDLE_DIM(1, 1);
}
} }
} }
}; };
...@@ -137,7 +247,6 @@ class ReduceKernel : public framework::OpKernel<T> { ...@@ -137,7 +247,6 @@ class ReduceKernel : public framework::OpKernel<T> {
} }
} }
}; };
template <typename DeviceContext, typename OutT, typename Functor> template <typename DeviceContext, typename OutT, typename Functor>
class BoolReduceKernel : public framework::OpKernel<OutT> { class BoolReduceKernel : public framework::OpKernel<OutT> {
public: public:
...@@ -175,22 +284,27 @@ class BoolReduceKernel : public framework::OpKernel<OutT> { ...@@ -175,22 +284,27 @@ class BoolReduceKernel : public framework::OpKernel<OutT> {
int ndim = input->dims().size(); int ndim = input->dims().size();
int rdim = dims.size(); int rdim = dims.size();
// comments for accelerating compiling temporarily. // comments for accelerating compiling temporarily.
// HANDLE_DIM(6, 5); if (ndim > 6) {
// HANDLE_DIM(6, 4); HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
// HANDLE_DIM(6, 3); dims, keep_dim);
// HANDLE_DIM(6, 2); } else {
// HANDLE_DIM(6, 1); HANDLE_DIM(6, 5);
// HANDLE_DIM(5, 4); HANDLE_DIM(6, 4);
// HANDLE_DIM(5, 3); HANDLE_DIM(6, 3);
// HANDLE_DIM(5, 2); HANDLE_DIM(6, 2);
// HANDLE_DIM(5, 1); HANDLE_DIM(6, 1);
HANDLE_DIM(4, 3); HANDLE_DIM(5, 4);
HANDLE_DIM(4, 2); HANDLE_DIM(5, 3);
HANDLE_DIM(4, 1); HANDLE_DIM(5, 2);
HANDLE_DIM(3, 2); HANDLE_DIM(5, 1);
HANDLE_DIM(3, 1); HANDLE_DIM(4, 3);
HANDLE_DIM(2, 1); HANDLE_DIM(4, 2);
HANDLE_DIM(1, 1); HANDLE_DIM(4, 1);
HANDLE_DIM(3, 2);
HANDLE_DIM(3, 1);
HANDLE_DIM(2, 1);
HANDLE_DIM(1, 1);
}
} }
} }
}; };
...@@ -279,6 +393,10 @@ class ReduceGradKernel : public framework::OpKernel<T> { ...@@ -279,6 +393,10 @@ class ReduceGradKernel : public framework::OpKernel<T> {
context.template device_context<DeviceContext>(), *input0, context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims); *input1, *input2, output, dims);
break; break;
default:
HandleLargeDimGrad<DeviceContext, T, Functor>(context, input0, input1,
input2, output, dims);
break;
} }
} }
} }
...@@ -313,12 +431,6 @@ class ReduceOp : public framework::OperatorWithKernel { ...@@ -313,12 +431,6 @@ class ReduceOp : public framework::OperatorWithKernel {
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceOp"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceOp");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size(); auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 6,
platform::errors::InvalidArgument(
"The input tensor X's dimensions of ReduceOp "
"should be less equal than 6. But received X's "
"dimensions = %d, X's shape = [%s].",
x_rank, x_dims));
auto dims = ctx->Attrs().Get<std::vector<int>>("dim"); auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
PADDLE_ENFORCE_GT(dims.size(), 0, PADDLE_ENFORCE_GT(dims.size(), 0,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
...@@ -402,11 +514,6 @@ class ReduceGradOp : public framework::OperatorWithKernel { ...@@ -402,11 +514,6 @@ class ReduceGradOp : public framework::OperatorWithKernel {
"Out@GRAD", "ReduceOp"); "Out@GRAD", "ReduceOp");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size(); auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 6,
platform::errors::InvalidArgument(
"Tensors with rank at most 6 are supported by "
"ReduceOp. Received tensor with rank %d.",
x_rank));
auto dims = ctx->Attrs().Get<std::vector<int>>("dim"); auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
for (size_t i = 0; i < dims.size(); ++i) { for (size_t i = 0; i < dims.size(); ++i) {
PADDLE_ENFORCE_LT(dims[i], x_rank, PADDLE_ENFORCE_LT(dims[i], x_rank,
......
...@@ -68,6 +68,6 @@ REGISTER_OPERATOR( ...@@ -68,6 +68,6 @@ REGISTER_OPERATOR(
shape, ops::ShapeOp, ops::ShapeOpMaker, shape, ops::ShapeOp, ops::ShapeOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>); paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int32_t>, REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<bool>, ops::ShapeKernel<int>,
ops::ShapeKernel<int64_t>, ops::ShapeKernel<float>, ops::ShapeKernel<int64_t>, ops::ShapeKernel<float>,
ops::ShapeKernel<double>); ops::ShapeKernel<double>);
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/shape_op.h" #include "paddle/fluid/operators/shape_op.h"
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
shape, paddle::operators::ShapeKernel<int>, shape, paddle::operators::ShapeKernel<bool>,
paddle::operators::ShapeKernel<int32_t>, paddle::operators::ShapeKernel<int>,
paddle::operators::ShapeKernel<int64_t>, paddle::operators::ShapeKernel<int64_t>,
paddle::operators::ShapeKernel<float>, paddle::operators::ShapeKernel<float>,
paddle::operators::ShapeKernel<double>, paddle::operators::ShapeKernel<double>,
......
...@@ -32,7 +32,6 @@ class TopkV2Op : public framework::OperatorWithKernel { ...@@ -32,7 +32,6 @@ class TopkV2Op : public framework::OperatorWithKernel {
auto input_dims = ctx->GetInputDim("X"); auto input_dims = ctx->GetInputDim("X");
const int& dim_size = input_dims.size(); const int& dim_size = input_dims.size();
const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
int axis = static_cast<int>(ctx->Attrs().Get<int>("axis")); int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true, PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true,
"the axis of topk" "the axis of topk"
...@@ -41,8 +40,18 @@ class TopkV2Op : public framework::OperatorWithKernel { ...@@ -41,8 +40,18 @@ class TopkV2Op : public framework::OperatorWithKernel {
if (axis < 0) axis += dim_size; if (axis < 0) axis += dim_size;
PADDLE_ENFORCE_GE( int k;
k, 1, "the attribute of k in the topk must >= 1, but received %d .", k); auto k_is_tensor = ctx->HasInput("K");
if (k_is_tensor) {
k = -1;
} else {
k = static_cast<int>(ctx->Attrs().Get<int>("k"));
PADDLE_ENFORCE_EQ(k >= 1, true,
"the attribute of k in the topk must >= 1 or be a "
"Tensor, but received %d .",
k);
}
PADDLE_ENFORCE_GE(input_dims.size(), 1, PADDLE_ENFORCE_GE(input_dims.size(), 1,
"input of topk must have >= 1d shape"); "input of topk must have >= 1d shape");
......
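The revised InferShape treats k as statically unknown whenever K arrives as a tensor, emitting -1 for that dimension and deferring the k >= 1 check to run time; only the attribute path is validated eagerly. A small sketch of that rule, where has_k_tensor stands in for ctx->HasInput("K"):

#include <cstdio>

// Returns the statically known k, or -1 when K is a run-time tensor.
int GetStaticK(bool has_k_tensor, int attr_k) {
  if (has_k_tensor) return -1;  // resolved later, at execution time
  if (attr_k < 1) {
    std::printf("error: k must be >= 1, got %d\n", attr_k);
  }
  return attr_k;
}

int main() {
  std::printf("k=%d\n", GetStaticK(true, 0));   // tensor K -> dynamic (-1)
  std::printf("k=%d\n", GetStaticK(false, 5));  // attribute k -> 5
  return 0;
}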
...@@ -53,10 +53,9 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx, ...@@ -53,10 +53,9 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx,
trans6(dev_ctx, in, out, axis); trans6(dev_ctx, in, out, axis);
break; break;
default: default:
PADDLE_THROW(platform::errors::InvalidArgument( // for dim >= 7 situation
"Tensors with rank at most 6 are supported" math::TransposeNormal<DeviceContext, T> trans_normal;
", but received input tensor's rank is %d,", trans_normal(dev_ctx, in, out, axis);
dim));
} }
} }
......
...@@ -12,6 +12,7 @@ limitations under the License. */ ...@@ -12,6 +12,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include <set> #include <set>
#include <string> #include <string>
#include <thread> //NOLINT
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
...@@ -23,6 +24,7 @@ limitations under the License. */ ...@@ -23,6 +24,7 @@ limitations under the License. */
#endif #endif
#include "glog/logging.h" #include "glog/logging.h"
#include "unsupported/Eigen/CXX11/ThreadPool"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
...@@ -131,16 +133,31 @@ DeviceContextPool::DeviceContextPool( ...@@ -131,16 +133,31 @@ DeviceContextPool::DeviceContextPool(
CPUDeviceContext::CPUDeviceContext() { CPUDeviceContext::CPUDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice()); eigen_device_.reset(new Eigen::DefaultDevice());
InitPoolDevice();
} }
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) { CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) {
eigen_device_.reset(new Eigen::DefaultDevice()); eigen_device_.reset(new Eigen::DefaultDevice());
InitPoolDevice();
}
void CPUDeviceContext::InitPoolDevice() {
using EigenEnv = Eigen::StlThreadEnvironment;
using EigenThreadPool = Eigen::ThreadPoolTempl<EigenEnv>;
int num_threads = std::thread::hardware_concurrency();
eigen_threadpool_.reset(new EigenThreadPool(num_threads));
eigen_pool_device_.reset(
new Eigen::ThreadPoolDevice(eigen_threadpool_.get(), num_threads));
} }
Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
return eigen_device_.get(); return eigen_device_.get();
} }
Eigen::ThreadPoolDevice* CPUDeviceContext::eigen_pool_device() const {
return eigen_pool_device_.get();
}
Place CPUDeviceContext::GetPlace() const { return place_; } Place CPUDeviceContext::GetPlace() const { return place_; }
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
......
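InitPoolDevice gives the CPU context a shared Eigen thread pool sized to the hardware concurrency, which is what TransposeNormal's parallelFor in math_function.cc fans out on. A minimal sketch of the same setup, assuming Eigen's unsupported Tensor module is on the include path:

#define EIGEN_USE_THREADS
#include <algorithm>
#include <thread>
#include "unsupported/Eigen/CXX11/Tensor"

int main() {
  int num_threads =
      std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
  Eigen::ThreadPool pool(num_threads);                 // StlThreadEnvironment pool
  Eigen::ThreadPoolDevice device(&pool, num_threads);  // target for parallelFor
  // device.parallelFor(total_work, cost, functor) now spreads work over the pool.
  return 0;
}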
...@@ -41,6 +41,7 @@ limitations under the License. */ ...@@ -41,6 +41,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/fluid/platform/stream/cuda_stream.h"
#endif #endif
#define EIGEN_USE_THREADS
#include "unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
...@@ -65,11 +66,17 @@ class CPUDeviceContext : public DeviceContext { ...@@ -65,11 +66,17 @@ class CPUDeviceContext : public DeviceContext {
Eigen::DefaultDevice* eigen_device() const; Eigen::DefaultDevice* eigen_device() const;
Eigen::ThreadPoolDevice* eigen_pool_device() const;
Place GetPlace() const override; Place GetPlace() const override;
inline void InitPoolDevice();
private: private:
CPUPlace place_; CPUPlace place_;
std::unique_ptr<Eigen::DefaultDevice> eigen_device_; std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
std::unique_ptr<Eigen::ThreadPoolDevice> eigen_pool_device_;
std::unique_ptr<Eigen::ThreadPool> eigen_threadpool_;
}; };
template <typename Place> template <typename Place>
......
...@@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); ...@@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif #endif
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7 #ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif #endif
......
...@@ -521,3 +521,18 @@ DEFINE_int32( ...@@ -521,3 +521,18 @@ DEFINE_int32(
DEFINE_bool(sort_sum_gradient, false, DEFINE_bool(sort_sum_gradient, false,
"Sum gradients by the reverse order of " "Sum gradients by the reverse order of "
"the forward execution sequence."); "the forward execution sequence.");
/**
* Performance related FLAG
* Name: max_inplace_grad_add
* Since Version: 2.0.0
* Value Range: int32, default=0
* Example:
* Note: The maximum number of inplace grad_add.
*/
DEFINE_int32(
max_inplace_grad_add, 0,
"The maximum number of inplace grad_add. When doing "
"gradient accumulation, if the number of gradients need to that "
"less FLAGS_max_inplace_grad_add, than it will be use several grad_add"
"instead of sum. Default is 0.");
...@@ -62,6 +62,7 @@ DECLARE_bool(use_system_allocator); ...@@ -62,6 +62,7 @@ DECLARE_bool(use_system_allocator);
// others // others
DECLARE_bool(benchmark); DECLARE_bool(benchmark);
DECLARE_int32(inner_op_parallelism); DECLARE_int32(inner_op_parallelism);
DECLARE_int32(max_inplace_grad_add);
DECLARE_string(tracer_profile_fname); DECLARE_string(tracer_profile_fname);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
// cudnn // cudnn
...@@ -348,7 +349,7 @@ static void RegisterGlobalVarGetterSetter() { ...@@ -348,7 +349,7 @@ static void RegisterGlobalVarGetterSetter() {
FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb, FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory, FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname, FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
FLAGS_paddle_num_threads, FLAGS_use_mkldnn); FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
REGISTER_PUBLIC_GLOBAL_VAR( REGISTER_PUBLIC_GLOBAL_VAR(
......
...@@ -111,6 +111,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = { ...@@ -111,6 +111,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"fake_quantize_dequantize_moving_average_abs_max", {"fake_quantize_dequantize_moving_average_abs_max",
{"Out", "OutScale", "OutAccum", "OutState"}}, {"Out", "OutScale", "OutAccum", "OutState"}},
{"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}}, {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}},
{"fake_channel_wise_quantize_dequantize_abs_max", {"Out", "OutScale"}},
{"check_finite_and_unscale", {"Out", "FoundInfinite"}}, {"check_finite_and_unscale", {"Out", "FoundInfinite"}},
{"update_loss_scaling", {"update_loss_scaling",
{"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}}, {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}},
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <Python.h> #include <Python.h>
#include <algorithm> #include <algorithm>
#include <cstdlib> #include <cstdlib>
#include <map> #include <map>
...@@ -22,6 +23,7 @@ limitations under the License. */ ...@@ -22,6 +23,7 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/feed_fetch_type.h"
...@@ -2528,6 +2530,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2528,6 +2530,10 @@ All parameter, weight, gradient are variables in Paddle.
"enable_inplace", "enable_inplace",
[](const BuildStrategy &self) { return self.enable_inplace_; }, [](const BuildStrategy &self) { return self.enable_inplace_; },
[](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
.def_property(
"enable_addto",
[](const BuildStrategy &self) { return self.enable_addto_; },
[](BuildStrategy &self, bool b) { self.enable_addto_ = b; })
.def_property( .def_property(
"fuse_all_reduce_ops", "fuse_all_reduce_ops",
[](const BuildStrategy &self) { [](const BuildStrategy &self) {
......
...@@ -121,6 +121,18 @@ function cmake_base() { ...@@ -121,6 +121,18 @@ function cmake_base() {
else else
exit 1 exit 1
fi fi
elif [ "$1" == "cp38-cp38" ]; then
if [ -d "/Library/Frameworks/Python.framework/Versions/3.8" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
export PATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.8/include/python3.8/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/libpython3.8.dylib"
pip3.8 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else
exit 1
fi
fi fi
# delete `gym` to avoid modifying requirements.txt in *.whl # delete `gym` to avoid modifying requirements.txt in *.whl
sed -i .bak "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt sed -i .bak "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt
...@@ -176,6 +188,13 @@ function cmake_base() { ...@@ -176,6 +188,13 @@ function cmake_base() {
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so" -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt
elif [ "$1" == "cp38-cp38" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH}
export PATH=/opt/_internal/cpython-3.8.0/bin/:${PATH}
export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.8.0/bin/python3.8
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.8.0/include/python3.8
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.8.0/lib/libpython3.so"
pip3.8 install -r ${PADDLE_ROOT}/python/requirements.txt
fi fi
else else
pip install -r ${PADDLE_ROOT}/python/requirements.txt pip install -r ${PADDLE_ROOT}/python/requirements.txt
...@@ -514,6 +533,8 @@ EOF ...@@ -514,6 +533,8 @@ EOF
pip3.6 uninstall -y paddlepaddle pip3.6 uninstall -y paddlepaddle
elif [ "$1" == "cp37-cp37m" ]; then elif [ "$1" == "cp37-cp37m" ]; then
pip3.7 uninstall -y paddlepaddle pip3.7 uninstall -y paddlepaddle
elif [ "$1" == "cp38-cp38" ]; then
pip3.8 uninstall -y paddlepaddle
fi fi
set -ex set -ex
...@@ -527,6 +548,8 @@ EOF ...@@ -527,6 +548,8 @@ EOF
pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
elif [ "$1" == "cp37-cp37m" ]; then elif [ "$1" == "cp37-cp37m" ]; then
pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
elif [ "$1" == "cp38-cp38" ]; then
pip3.8 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
fi fi
tmpfile_rand=`date +%s%N` tmpfile_rand=`date +%s%N`
tmpfile=$tmp_dir/$tmpfile_rand tmpfile=$tmp_dir/$tmpfile_rand
...@@ -621,6 +644,7 @@ function generate_upstream_develop_api_spec() { ...@@ -621,6 +644,7 @@ function generate_upstream_develop_api_spec() {
git checkout -b develop_base_pr upstream/$BRANCH git checkout -b develop_base_pr upstream/$BRANCH
cmake_gen $1 cmake_gen $1
build $2 build $2
cp ${PADDLE_ROOT}/python/requirements.txt /tmp
git checkout $cur_branch git checkout $cur_branch
generate_api_spec "$1" "DEV" generate_api_spec "$1" "DEV"
...@@ -641,7 +665,12 @@ function generate_api_spec() { ...@@ -641,7 +665,12 @@ function generate_api_spec() {
cd ${PADDLE_ROOT}/build/.check_api_workspace cd ${PADDLE_ROOT}/build/.check_api_workspace
virtualenv .${spec_kind}_env virtualenv .${spec_kind}_env
source .${spec_kind}_env/bin/activate source .${spec_kind}_env/bin/activate
pip install -r ${PADDLE_ROOT}/python/requirements.txt
if [ "$spec_kind" == "DEV" ]; then
pip install -r /tmp/requirements.txt
else
pip install -r ${PADDLE_ROOT}/python/requirements.txt
fi
pip --no-cache-dir install ${PADDLE_ROOT}/build/python/dist/*whl pip --no-cache-dir install ${PADDLE_ROOT}/build/python/dist/*whl
spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec
python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path
...@@ -660,7 +689,7 @@ function generate_api_spec() { ...@@ -660,7 +689,7 @@ function generate_api_spec() {
awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc
awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api
if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" ]; then
# Use sed to make python2 and python3 sepc keeps the same # Use sed to make python2 and python3 sepc keeps the same
sed -i 's/arg0: str/arg0: unicode/g' $spec_path sed -i 's/arg0: str/arg0: unicode/g' $spec_path
sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path
...@@ -930,6 +959,10 @@ function parallel_test_base_gpu() { ...@@ -930,6 +959,10 @@ function parallel_test_base_gpu() {
EOF EOF
set +x set +x
precison_cases=""
if [ ${PRECISION_TEST:-OFF} == "ON" ]; then
precision_cases=`python $PADDLE_ROOT/tools/get_pr_ut.py`
fi
EXIT_CODE=0; EXIT_CODE=0;
test_cases=$(ctest -N -V) # get all test cases test_cases=$(ctest -N -V) # get all test cases
exclusive_tests='' # cases list which would be run exclusively exclusive_tests='' # cases list which would be run exclusively
...@@ -959,10 +992,23 @@ set +x ...@@ -959,10 +992,23 @@ set +x
echo $testcase" will only run at night." echo $testcase" will only run at night."
continue continue
fi fi
if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]]; then
will_test="false"
for case in $precision_cases; do
if [[ $testcase == $case ]]; then
will_test="true"
break
fi
done
if [[ $will_test == "false" ]]; then
echo $testcase" won't run in PRECISION_TEST mode."
continue
fi
fi
if [[ "$is_multicard" == "" ]]; then if [[ "$is_multicard" == "" ]]; then
# trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs
read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist") read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_")
fi fi
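Tightening the pattern from "test_dist" to "test_dist_" keeps the multicard heuristic from catching unrelated names that merely share the prefix. For example (the second name is hypothetical):

    echo "test_dist_mnist" | grep -oEi "test_dist_"      # matches -> run on 2 GPUs
    echo "test_distributions" | grep -oEi "test_dist_"   # no match -> normal single-GPU run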
if [[ "$is_exclusive" != "" ]]; then if [[ "$is_exclusive" != "" ]]; then
@@ -1077,8 +1123,6 @@ set +x
done
fi
if [[ "$EXIT_CODE" != "0" ]]; then
    if [[ "$failed_test_lists" == "" ]]; then
        echo "========================================"
@@ -1223,21 +1267,25 @@ EOF
ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl
ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl
ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl
if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then
ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl
ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl
fi
#ref_paddle2_mv1=""
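With the assignments above, a wheel name resolves differently for release and dev builds. Assuming install_gpu expands to _gpu and taking PADDLE_BRANCH=1.8.0, ref_CUDA_MAJOR=10, CUDNN_MAJOR=7 as illustrative values, the cp38 name would come out as:

    # release build (MKL + GPU, non-0.0.0 branch):
    #   paddlepaddle_gpu-1.8.0.post107-cp38-cp38-linux_x86_64.whl
    # dev build (PADDLE_BRANCH=0.0.0) keeps the plain form:
    #   paddlepaddle_gpu-0.0.0-cp38-cp38-linux_x86_64.whl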
@@ -1342,6 +1390,22 @@ EOF
apt-get clean -y && \
rm -f ${ref_paddle37} && \
ldconfig
EOF
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
# run paddle version to install python packages first
RUN apt-get update && ${NCCL_DEPS}
RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
xz-utils tk-dev libffi-dev liblzma-dev
RUN wget -q https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tgz && \
tar -xzf Python-3.8.0.tgz && cd Python-3.8.0 && \
CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.8.0.tgz
RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \
pip3.8 install opencv-python && wget ${ref_web}/${ref_paddle38} && pip3.8 install ${ref_paddle38_whl}; apt-get install -f -y && \
apt-get clean -y && \
rm -f ${ref_paddle38} && \
ldconfig
EOF
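The appended Dockerfile stage builds CPython 3.8.0 from source with --enable-shared and installs it via make altinstall, which leaves the distribution's python3 untouched while providing pip3.8 for the cp38 wheel. A hypothetical smoke test once the Dockerfile has been generated:

    docker build -t paddle:py38 ${PADDLE_ROOT}/build
    docker run --rm paddle:py38 python3.8 -c "import paddle; print(paddle.__version__)"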
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
# run paddle version to install python packages first